Linux-libre 4.5-gnu

author: André Fabian Silva Delgado <emulatorman@parabola.nu> 2016-03-25 03:53:42 -0300
committer: André Fabian Silva Delgado <emulatorman@parabola.nu> 2016-03-25 03:53:42 -0300
commit: 03dd4cb26d967f9588437b0fc9cc0e8353322bb7 (patch)
tree: fa581f6dc1c0596391690d1f67eceef3af8246dc /fs
parent: d4e493caf788ef44982e131ff9c786546904d934 (diff)
600 files changed, 15088 insertions, 39036 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c
index a7e28890f..9da967f38 100644
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -67,8 +67,8 @@ int v9fs_get_acl(struct inode *inode, struct p9_fid *fid)
 		return 0;
 	}
 	/* get the default/access acl values and cache them */
-	dacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_DEFAULT);
-	pacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_ACCESS);
+	dacl = __v9fs_get_acl(fid, XATTR_NAME_POSIX_ACL_DEFAULT);
+	pacl = __v9fs_get_acl(fid, XATTR_NAME_POSIX_ACL_ACCESS);
 
 	if (!IS_ERR(dacl) && !IS_ERR(pacl)) {
 		set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl);
@@ -133,10 +133,10 @@ static int v9fs_set_acl(struct p9_fid *fid, int type, struct posix_acl *acl)
 		goto err_free_out;
 	switch (type) {
 	case ACL_TYPE_ACCESS:
-		name = POSIX_ACL_XATTR_ACCESS;
+		name = XATTR_NAME_POSIX_ACL_ACCESS;
 		break;
 	case ACL_TYPE_DEFAULT:
-		name = POSIX_ACL_XATTR_DEFAULT;
+		name = XATTR_NAME_POSIX_ACL_DEFAULT;
 		break;
 	default:
 		BUG();
@@ -220,15 +220,12 @@ static int v9fs_xattr_get_acl(const struct xattr_handler *handler,
 	struct posix_acl *acl;
 	int error;
 
-	if (strcmp(name, "") != 0)
-		return -EINVAL;
-
 	v9ses = v9fs_dentry2v9ses(dentry);
 	/*
 	 * We allow set/get/list of acl when access=client is not specified
 	 */
 	if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
-		return v9fs_xattr_get(dentry, handler->prefix, buffer, size);
+		return v9fs_xattr_get(dentry, handler->name, buffer, size);
 
 	acl = v9fs_get_cached_acl(d_inode(dentry), handler->flags);
 	if (IS_ERR(acl))
@@ -250,16 +247,13 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
 	struct v9fs_session_info *v9ses;
 	struct inode *inode = d_inode(dentry);
 
-	if (strcmp(name, "") != 0)
-		return -EINVAL;
-
 	v9ses = v9fs_dentry2v9ses(dentry);
 	/*
 	 * set the attribute on the remote. Without even looking at the
 	 * xattr value. We leave it to the server to validate
 	 */
 	if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT)
-		return v9fs_xattr_set(dentry, handler->prefix, value, size,
+		return v9fs_xattr_set(dentry, handler->name, value, size,
 				      flags);
 
 	if (S_ISLNK(inode->i_mode))
@@ -319,7 +313,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler,
 	default:
 		BUG();
 	}
-	retval = v9fs_xattr_set(dentry, handler->prefix, value, size, flags);
+	retval = v9fs_xattr_set(dentry, handler->name, value, size, flags);
 	if (!retval)
 		set_cached_acl(inode, handler->flags, acl);
 err_out:
@@ -328,14 +322,14 @@ err_out:
 }
 
 const struct xattr_handler v9fs_xattr_acl_access_handler = {
-	.prefix	= POSIX_ACL_XATTR_ACCESS,
+	.name	= XATTR_NAME_POSIX_ACL_ACCESS,
 	.flags	= ACL_TYPE_ACCESS,
 	.get	= v9fs_xattr_get_acl,
 	.set	= v9fs_xattr_set_acl,
 };
 
 const struct xattr_handler v9fs_xattr_acl_default_handler = {
-	.prefix	= POSIX_ACL_XATTR_DEFAULT,
+	.name	= XATTR_NAME_POSIX_ACL_DEFAULT,
 	.flags	= ACL_TYPE_DEFAULT,
 	.get	= v9fs_xattr_get_acl,
 	.set	= v9fs_xattr_set_acl,
diff --git a/fs/9p/cache.c b/fs/9p/cache.c
index a69260f27..103ca5e12 100644
--- a/fs/9p/cache.c
+++ b/fs/9p/cache.c
@@ -243,14 +243,14 @@ void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp)
 	if (!v9inode->fscache)
 		return;
 
-	spin_lock(&v9inode->fscache_lock);
+	mutex_lock(&v9inode->fscache_lock);
 
 	if ((filp->f_flags & O_ACCMODE) != O_RDONLY)
 		v9fs_cache_inode_flush_cookie(inode);
 	else
 		v9fs_cache_inode_get_cookie(inode);
 
-	spin_unlock(&v9inode->fscache_lock);
+	mutex_unlock(&v9inode->fscache_lock);
 }
 
 void v9fs_cache_inode_reset_cookie(struct inode *inode)
@@ -264,7 +264,7 @@ void v9fs_cache_inode_reset_cookie(struct inode *inode)
 
 	old = v9inode->fscache;
 
-	spin_lock(&v9inode->fscache_lock);
+	mutex_lock(&v9inode->fscache_lock);
 	fscache_relinquish_cookie(v9inode->fscache, 1);
 
 	v9ses = v9fs_inode2v9ses(inode);
@@ -274,7 +274,7 @@ void v9fs_cache_inode_reset_cookie(struct inode *inode)
 	p9_debug(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p\n",
 		 inode, old, v9inode->fscache);
 
-	spin_unlock(&v9inode->fscache_lock);
+	mutex_unlock(&v9inode->fscache_lock);
 }
 
 int __v9fs_fscache_release_page(struct page *page, gfp_t gfp)
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 6caca0250..072e75995 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -575,7 +575,7 @@ static int v9fs_init_inode_cache(void)
 	v9fs_inode_cache = kmem_cache_create("v9fs_inode_cache",
 					  sizeof(struct v9fs_inode),
 					  0, (SLAB_RECLAIM_ACCOUNT|
-					      SLAB_MEM_SPREAD),
+					      SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					  v9fs_inode_init_once);
 	if (!v9fs_inode_cache)
 		return -ENOMEM;
diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h
index 0923f2cf3..687705038 100644
--- a/fs/9p/v9fs.h
+++ b/fs/9p/v9fs.h
@@ -123,7 +123,7 @@ struct v9fs_session_info {
 
 struct v9fs_inode {
 #ifdef CONFIG_9P_FSCACHE
-	spinlock_t fscache_lock;
+	struct mutex fscache_lock;
 	struct fscache_cookie *fscache;
 #endif
 	struct p9_qid qid;
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 7bf835f85..eadc894fa 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -449,14 +449,14 @@ static int v9fs_file_fsync(struct file *filp, loff_t start, loff_t end,
 	if (retval)
 		return retval;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 	p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync);
 
 	fid = filp->private_data;
 	v9fs_blank_wstat(&wstat);
 
 	retval = p9_client_wstat(fid, &wstat);
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 
 	return retval;
 }
@@ -472,13 +472,13 @@ int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end,
 	if (retval)
 		return retval;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 	p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync);
 
 	fid = filp->private_data;
 
 	retval = p9_client_fsync(fid, datasync);
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 
 	return retval;
 }
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 511078586..3a08b3e6f 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -244,7 +244,7 @@ struct inode *v9fs_alloc_inode(struct super_block *sb)
 		return NULL;
 #ifdef CONFIG_9P_FSCACHE
 	v9inode->fscache = NULL;
-	spin_lock_init(&v9inode->fscache_lock);
+	mutex_init(&v9inode->fscache_lock);
 #endif
 	v9inode->writeback_fid = NULL;
 	v9inode->cache_validity = 0;
@@ -1223,18 +1223,26 @@ ino_t v9fs_qid2ino(struct p9_qid *qid)
 }
 
 /**
- * v9fs_vfs_follow_link - follow a symlink path
+ * v9fs_vfs_get_link - follow a symlink path
  * @dentry: dentry for symlink
- * @cookie: place to pass the data to put_link()
+ * @inode: inode for symlink
+ * @done: delayed call for when we are done with the return value
  */
 
-static const char *v9fs_vfs_follow_link(struct dentry *dentry, void **cookie)
+static const char *v9fs_vfs_get_link(struct dentry *dentry,
+				     struct inode *inode,
+				     struct delayed_call *done)
 {
-	struct v9fs_session_info *v9ses = v9fs_dentry2v9ses(dentry);
-	struct p9_fid *fid = v9fs_fid_lookup(dentry);
+	struct v9fs_session_info *v9ses;
+	struct p9_fid *fid;
 	struct p9_wstat *st;
 	char *res;
 
+	if (!dentry)
+		return ERR_PTR(-ECHILD);
+
+	v9ses = v9fs_dentry2v9ses(dentry);
+	fid = v9fs_fid_lookup(dentry);
 	p9_debug(P9_DEBUG_VFS, "%pd\n", dentry);
 
 	if (IS_ERR(fid))
@@ -1259,7 +1267,8 @@ static const char *v9fs_vfs_follow_link(struct dentry *dentry, void **cookie)
 
 	p9stat_free(st);
 	kfree(st);
-	return *cookie = res;
+	set_delayed_call(done, kfree_link, res);
+	return res;
 }
 
 /**
@@ -1452,8 +1461,7 @@ static const struct inode_operations v9fs_file_inode_operations = {
 
 static const struct inode_operations v9fs_symlink_inode_operations = {
 	.readlink = generic_readlink,
-	.follow_link = v9fs_vfs_follow_link,
-	.put_link = kfree_put_link,
+	.get_link = v9fs_vfs_get_link,
 	.getattr = v9fs_vfs_getattr,
 	.setattr = v9fs_vfs_setattr,
 };
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index cb899af1b..a34702c99 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -899,26 +899,34 @@ error:
 }
 
 /**
- * v9fs_vfs_follow_link_dotl - follow a symlink path
+ * v9fs_vfs_get_link_dotl - follow a symlink path
  * @dentry: dentry for symlink
- * @cookie: place to pass the data to put_link()
+ * @inode: inode for symlink
+ * @done: destructor for return value
  */
 
 static const char *
-v9fs_vfs_follow_link_dotl(struct dentry *dentry, void **cookie)
+v9fs_vfs_get_link_dotl(struct dentry *dentry,
+		       struct inode *inode,
+		       struct delayed_call *done)
 {
-	struct p9_fid *fid = v9fs_fid_lookup(dentry);
+	struct p9_fid *fid;
 	char *target;
 	int retval;
 
+	if (!dentry)
+		return ERR_PTR(-ECHILD);
+
 	p9_debug(P9_DEBUG_VFS, "%pd\n", dentry);
 
+	fid = v9fs_fid_lookup(dentry);
 	if (IS_ERR(fid))
 		return ERR_CAST(fid);
 	retval = p9_client_readlink(fid, &target);
 	if (retval)
 		return ERR_PTR(retval);
-	return *cookie = target;
+	set_delayed_call(done, kfree_link, target);
+	return target;
 }
 
 int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
@@ -984,8 +992,7 @@ const struct inode_operations v9fs_file_inode_operations_dotl = {
 
 const struct inode_operations v9fs_symlink_inode_operations_dotl = {
 	.readlink = generic_readlink,
-	.follow_link = v9fs_vfs_follow_link_dotl,
-	.put_link = kfree_put_link,
+	.get_link = v9fs_vfs_get_link_dotl,
 	.getattr = v9fs_vfs_getattr_dotl,
 	.setattr = v9fs_vfs_setattr_dotl,
 	.setxattr = generic_setxattr,
diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c
index e3d026ac3..9dd9b47a6 100644
--- a/fs/9p/xattr.c
+++ b/fs/9p/xattr.c
@@ -143,8 +143,6 @@ static int v9fs_xattr_handler_get(const struct xattr_handler *handler,
 {
 	const char *full_name = xattr_full_name(handler, name);
 
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
 	return v9fs_xattr_get(dentry, full_name, buffer, size);
 }
 
@@ -154,8 +152,6 @@ static int v9fs_xattr_handler_set(const struct xattr_handler *handler,
 {
 	const char *full_name = xattr_full_name(handler, name);
 
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
 	return v9fs_xattr_set(dentry, full_name, value, size, flags);
 }
 
diff --git a/fs/Kconfig b/fs/Kconfig
index bec7b6beb..dc04844aa 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -50,7 +50,8 @@ config FS_DAX_PMD
 	bool
 	default FS_DAX
 	depends on FS_DAX
-	depends on BROKEN
+	depends on ZONE_DEVICE
+	depends on TRANSPARENT_HUGEPAGE
 
 endif # BLOCK
 
@@ -73,6 +74,16 @@ config FILE_LOCKING
           for filesystems like NFS and for the flock() system
           call. Disabling this option saves about 11k.
 
+config MANDATORY_FILE_LOCKING
+	bool "Enable Mandatory file locking"
+	depends on FILE_LOCKING
+	default y
+	help
+	  This option enables appropriately marked files on appropriately
+	  mounted filesystems to support mandatory locking.
+
+	  To the best of my knowledge this is dead code that no one cares about.
+
 source "fs/notify/Kconfig"
 
 source "fs/quota/Kconfig"
@@ -222,7 +233,6 @@ source "fs/pstore/Kconfig"
 source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
 source "fs/exofs/Kconfig"
-source "fs/aufs/Kconfig"
 
 endif # MISC_FILESYSTEMS
 
diff --git a/fs/Makefile b/fs/Makefile
index 0c61756a2..b8da1c2c3 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -127,4 +127,3 @@ obj-y				+= exofs/ # Multiple modules
 obj-$(CONFIG_CEPH_FS)		+= ceph/
 obj-$(CONFIG_PSTORE)		+= pstore/
 obj-$(CONFIG_EFIVAR_FS)		+= efivarfs/
-obj-$(CONFIG_AUFS_FS)           += aufs/
diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h
index 24575d9d8..fadf408bd 100644
--- a/fs/adfs/adfs.h
+++ b/fs/adfs/adfs.h
@@ -44,24 +44,24 @@ struct adfs_dir_ops;
  */
 struct adfs_sb_info {
 	union { struct {
-		struct adfs_discmap *s_map;	/* bh list containing map	 */
-		struct adfs_dir_ops *s_dir;	/* directory operations		 */
+		struct adfs_discmap *s_map;	/* bh list containing map */
+		const struct adfs_dir_ops *s_dir; /* directory operations */
 		};
-		struct rcu_head rcu;		/* used only at shutdown time	 */
+		struct rcu_head rcu;	/* used only at shutdown time	 */
 	};
-	kuid_t		s_uid;		/* owner uid				 */
-	kgid_t		s_gid;		/* owner gid				 */
-	umode_t		s_owner_mask;	/* ADFS owner perm -> unix perm		 */
-	umode_t		s_other_mask;	/* ADFS other perm -> unix perm		 */
+	kuid_t		s_uid;		/* owner uid */
+	kgid_t		s_gid;		/* owner gid */
+	umode_t		s_owner_mask;	/* ADFS owner perm -> unix perm */
+	umode_t		s_other_mask;	/* ADFS other perm -> unix perm	*/
 	int		s_ftsuffix;	/* ,xyz hex filetype suffix option */
 
-	__u32		s_ids_per_zone;	/* max. no ids in one zone		 */
-	__u32		s_idlen;	/* length of ID in map			 */
-	__u32		s_map_size;	/* sector size of a map			 */
-	unsigned long	s_size;		/* total size (in blocks) of this fs	 */
-	signed int	s_map2blk;	/* shift left by this for map->sector	 */
-	unsigned int	s_log2sharesize;/* log2 share size			 */
-	__le32		s_version;	/* disc format version			 */
+	__u32		s_ids_per_zone;	/* max. no ids in one zone */
+	__u32		s_idlen;	/* length of ID in map */
+	__u32		s_map_size;	/* sector size of a map	*/
+	unsigned long	s_size;		/* total size (in blocks) of this fs */
+	signed int	s_map2blk;	/* shift left by this for map->sector*/
+	unsigned int	s_log2sharesize;/* log2 share size */
+	__le32		s_version;	/* disc format version */
 	unsigned int	s_namelen;	/* maximum number of characters in name	 */
 };
 
@@ -168,8 +168,8 @@ void __adfs_error(struct super_block *sb, const char *function,
 extern const struct inode_operations adfs_dir_inode_operations;
 extern const struct file_operations adfs_dir_operations;
 extern const struct dentry_operations adfs_dentry_operations;
-extern struct adfs_dir_ops adfs_f_dir_ops;
-extern struct adfs_dir_ops adfs_fplus_dir_ops;
+extern const struct adfs_dir_ops adfs_f_dir_ops;
+extern const struct adfs_dir_ops adfs_fplus_dir_ops;
 
 extern int adfs_dir_update(struct super_block *sb, struct object_info *obj,
 			   int wait);
diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c
index 51c279a29..fd4cf2c48 100644
--- a/fs/adfs/dir.c
+++ b/fs/adfs/dir.c
@@ -21,7 +21,7 @@ adfs_readdir(struct file *file, struct dir_context *ctx)
 {
 	struct inode *inode = file_inode(file);
 	struct super_block *sb = inode->i_sb;
-	struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir;
+	const struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir;
 	struct object_info obj;
 	struct adfs_dir dir;
 	int ret = 0;
@@ -69,7 +69,7 @@ adfs_dir_update(struct super_block *sb, struct object_info *obj, int wait)
 {
 	int ret = -EINVAL;
 #ifdef CONFIG_ADFS_FS_RW
-	struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir;
+	const struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir;
 	struct adfs_dir dir;
 
 	printk(KERN_INFO "adfs_dir_update: object %06X in dir %06X\n",
@@ -129,7 +129,7 @@ static int
 adfs_dir_lookup_byname(struct inode *inode, struct qstr *name, struct object_info *obj)
 {
 	struct super_block *sb = inode->i_sb;
-	struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir;
+	const struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir;
 	struct adfs_dir dir;
 	int ret;
 
diff --git a/fs/adfs/dir_f.c b/fs/adfs/dir_f.c
index 4bbe853ee..0fbfd0b04 100644
--- a/fs/adfs/dir_f.c
+++ b/fs/adfs/dir_f.c
@@ -476,7 +476,7 @@ adfs_f_free(struct adfs_dir *dir)
 	dir->sb = NULL;
 }
 
-struct adfs_dir_ops adfs_f_dir_ops = {
+const struct adfs_dir_ops adfs_f_dir_ops = {
 	.read		= adfs_f_read,
 	.setpos		= adfs_f_setpos,
 	.getnext	= adfs_f_getnext,
diff --git a/fs/adfs/dir_fplus.c b/fs/adfs/dir_fplus.c
index 82d14cdf7..c92cfb638 100644
--- a/fs/adfs/dir_fplus.c
+++ b/fs/adfs/dir_fplus.c
@@ -256,7 +256,7 @@ adfs_fplus_free(struct adfs_dir *dir)
 	dir->sb = NULL;
 }
 
-struct adfs_dir_ops adfs_fplus_dir_ops = {
+const struct adfs_dir_ops adfs_fplus_dir_ops = {
 	.read		= adfs_fplus_read,
 	.setpos		= adfs_fplus_setpos,
 	.getnext	= adfs_fplus_getnext,
diff --git a/fs/adfs/super.c b/fs/adfs/super.c
index 4d4a0df83..c9fdfb112 100644
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -271,7 +271,7 @@ static int __init init_inodecache(void)
 	adfs_inode_cachep = kmem_cache_create("adfs_inode_cache",
 					     sizeof(struct adfs_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (adfs_inode_cachep == NULL)
 		return -ENOMEM;
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index c69a87eaf..cc2b2efc9 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -138,7 +138,7 @@ extern int	affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh);
 extern int	affs_remove_header(struct dentry *dentry);
 extern u32	affs_checksum_block(struct super_block *sb, struct buffer_head *bh);
 extern void	affs_fix_checksum(struct super_block *sb, struct buffer_head *bh);
-extern void	secs_to_datestamp(time_t secs, struct affs_date *ds);
+extern void	secs_to_datestamp(time64_t secs, struct affs_date *ds);
 extern umode_t	prot_to_mode(u32 prot);
 extern void	mode_to_prot(struct inode *inode);
 __printf(3, 4)
diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c
index 5fa92bc79..d6c7a51c9 100644
--- a/fs/affs/amigaffs.c
+++ b/fs/affs/amigaffs.c
@@ -8,6 +8,7 @@
  *  Please send bug reports to: hjw@zvw.de
  */
 
+#include <linux/math64.h>
 #include "affs.h"
 
 /*
@@ -366,22 +367,22 @@ affs_fix_checksum(struct super_block *sb, struct buffer_head *bh)
 }
 
 void
-secs_to_datestamp(time_t secs, struct affs_date *ds)
+secs_to_datestamp(time64_t secs, struct affs_date *ds)
 {
 	u32	 days;
 	u32	 minute;
+	s32	 rem;
 
 	secs -= sys_tz.tz_minuteswest * 60 + ((8 * 365 + 2) * 24 * 60 * 60);
 	if (secs < 0)
 		secs = 0;
-	days    = secs / 86400;
-	secs   -= days * 86400;
-	minute  = secs / 60;
-	secs   -= minute * 60;
+	days    = div_s64_rem(secs, 86400, &rem);
+	minute  = rem / 60;
+	rem    -= minute * 60;
 
 	ds->days = cpu_to_be32(days);
 	ds->mins = cpu_to_be32(minute);
-	ds->ticks = cpu_to_be32(secs * 50);
+	ds->ticks = cpu_to_be32(rem * 50);
 }
 
 umode_t
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 659c579c4..22fc7c802 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -33,11 +33,11 @@ affs_file_release(struct inode *inode, struct file *filp)
 		 inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt));
 
 	if (atomic_dec_and_test(&AFFS_I(inode)->i_opencnt)) {
-		mutex_lock(&inode->i_mutex);
+		inode_lock(inode);
 		if (inode->i_size != AFFS_I(inode)->mmu_private)
 			affs_truncate(inode);
 		affs_free_prealloc(inode);
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 	}
 
 	return 0;
@@ -511,8 +511,6 @@ affs_do_readpage_ofs(struct page *page, unsigned to)
 	pr_debug("%s(%lu, %ld, 0, %d)\n", __func__, inode->i_ino,
 		 page->index, to);
 	BUG_ON(to > PAGE_CACHE_SIZE);
-	kmap(page);
-	data = page_address(page);
 	bsize = AFFS_SB(sb)->s_data_blksize;
 	tmp = page->index << PAGE_CACHE_SHIFT;
 	bidx = tmp / bsize;
@@ -524,14 +522,15 @@ affs_do_readpage_ofs(struct page *page, unsigned to)
 			return PTR_ERR(bh);
 		tmp = min(bsize - boff, to - pos);
 		BUG_ON(pos + tmp > to || tmp > bsize);
+		data = kmap_atomic(page);
 		memcpy(data + pos, AFFS_DATA(bh) + boff, tmp);
+		kunmap_atomic(data);
 		affs_brelse(bh);
 		bidx++;
 		pos += tmp;
 		boff = 0;
 	}
 	flush_dcache_page(page);
-	kunmap(page);
 	return 0;
 }
 
@@ -958,12 +957,12 @@ int affs_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
 	if (err)
 		return err;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 	ret = write_inode_now(inode, 0);
 	err = sync_blockdev(inode->i_sb->s_bdev);
 	if (!ret)
 		ret = err;
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	return ret;
 }
 const struct file_operations affs_file_operations = {
diff --git a/fs/affs/inode.c b/fs/affs/inode.c
index 173495005..0fdb0f5b2 100644
--- a/fs/affs/inode.c
+++ b/fs/affs/inode.c
@@ -140,6 +140,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino)
 		break;
 	case ST_SOFTLINK:
 		inode->i_mode |= S_IFLNK;
+		inode_nohighmem(inode);
 		inode->i_op = &affs_symlink_inode_operations;
 		inode->i_data.a_ops = &affs_symlink_aops;
 		break;
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 181e05b46..00d3002a6 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -344,6 +344,7 @@ affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
 		return -ENOSPC;
 
 	inode->i_op = &affs_symlink_inode_operations;
+	inode_nohighmem(inode);
 	inode->i_data.a_ops = &affs_symlink_aops;
 	inode->i_mode = S_IFLNK | 0777;
 	mode_to_prot(inode);
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 5b50c4ca4..2a6713b6b 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -32,7 +32,7 @@ affs_commit_super(struct super_block *sb, int wait)
 	struct affs_root_tail *tail = AFFS_ROOT_TAIL(sb, bh);
 
 	lock_buffer(bh);
-	secs_to_datestamp(get_seconds(), &tail->disk_change);
+	secs_to_datestamp(ktime_get_real_seconds(), &tail->disk_change);
 	affs_fix_checksum(sb, bh);
 	unlock_buffer(bh);
 
@@ -132,7 +132,7 @@ static int __init init_inodecache(void)
 	affs_inode_cachep = kmem_cache_create("affs_inode_cache",
 					     sizeof(struct affs_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (affs_inode_cachep == NULL)
 		return -ENOMEM;
diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c
index ea5b69a18..69b03dbb7 100644
--- a/fs/affs/symlink.c
+++ b/fs/affs/symlink.c
@@ -14,13 +14,13 @@ static int affs_symlink_readpage(struct file *file, struct page *page)
 {
 	struct buffer_head *bh;
 	struct inode *inode = page->mapping->host;
-	char *link = kmap(page);
+	char *link = page_address(page);
 	struct slink_front *lf;
 	int			 i, j;
 	char			 c;
 	char			 lc;
 
-	pr_debug("follow_link(ino=%lu)\n", inode->i_ino);
+	pr_debug("get_link(ino=%lu)\n", inode->i_ino);
 
 	bh = affs_bread(inode->i_sb, inode->i_ino);
 	if (!bh)
@@ -57,12 +57,10 @@ static int affs_symlink_readpage(struct file *file, struct page *page)
 	link[i] = '\0';
 	affs_brelse(bh);
 	SetPageUptodate(page);
-	kunmap(page);
 	unlock_page(page);
 	return 0;
 fail:
 	SetPageError(page);
-	kunmap(page);
 	unlock_page(page);
 	return -EIO;
 }
@@ -73,7 +71,6 @@ const struct address_space_operations affs_symlink_aops = {
 
 const struct inode_operations affs_symlink_inode_operations = {
 	.readlink	= generic_readlink,
-	.follow_link	= page_follow_link_light,
-	.put_link	= page_put_link,
+	.get_link	= page_get_link,
 	.setattr	= affs_notify_change,
 };
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index 4baf1d2b3..d91a9c9cf 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -483,7 +483,7 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl)
 
 	fl->fl_type = F_UNLCK;
 
-	mutex_lock(&vnode->vfs_inode.i_mutex);
+	inode_lock(&vnode->vfs_inode);
 
 	/* check local lock records first */
 	ret = 0;
@@ -505,7 +505,7 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl)
 	}
 
 error:
-	mutex_unlock(&vnode->vfs_inode.i_mutex);
+	inode_unlock(&vnode->vfs_inode);
 	_leave(" = %d [%hd]", ret, fl->fl_type);
 	return ret;
 }
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index e06f5a233..86cc7264c 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -56,6 +56,7 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)
 	case AFS_FTYPE_SYMLINK:
 		inode->i_mode	= S_IFLNK | vnode->status.mode;
 		inode->i_op	= &page_symlink_inode_operations;
+		inode_nohighmem(inode);
 		break;
 	default:
 		printk("kAFS: AFS vnode with undefined type\n");
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 24a905b07..2853b4095 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -230,14 +230,9 @@ static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf,
 	if (size <= 1 || size >= PAGE_SIZE)
 		return -EINVAL;
 
-	kbuf = kmalloc(size + 1, GFP_KERNEL);
-	if (!kbuf)
-		return -ENOMEM;
-
-	ret = -EFAULT;
-	if (copy_from_user(kbuf, buf, size) != 0)
-		goto done;
-	kbuf[size] = 0;
+	kbuf = memdup_user_nul(buf, size);
+	if (IS_ERR(kbuf))
+		return PTR_ERR(kbuf);
 
 	/* trim to first NL */
 	name = memchr(kbuf, '\n', size);
@@ -315,15 +310,9 @@ static ssize_t afs_proc_rootcell_write(struct file *file,
 	if (size <= 1 || size >= PAGE_SIZE)
 		return -EINVAL;
 
-	ret = -ENOMEM;
-	kbuf = kmalloc(size + 1, GFP_KERNEL);
-	if (!kbuf)
-		goto nomem;
-
-	ret = -EFAULT;
-	if (copy_from_user(kbuf, buf, size) != 0)
-		goto infault;
-	kbuf[size] = 0;
+	kbuf = memdup_user_nul(buf, size);
+	if (IS_ERR(kbuf))
+		return PTR_ERR(kbuf);
 
 	/* trim to first NL */
 	s = memchr(kbuf, '\n', size);
@@ -337,9 +326,7 @@ static ssize_t afs_proc_rootcell_write(struct file *file,
 	if (ret >= 0)
 		ret = size;	/* consume everything, always */
 
-infault:
 	kfree(kbuf);
-nomem:
 	_leave(" = %d", ret);
 	return ret;
 }
diff --git a/fs/afs/super.c b/fs/afs/super.c
index 1fb4a5129..81afefe7d 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -91,7 +91,7 @@ int __init afs_fs_init(void)
 	afs_inode_cachep = kmem_cache_create("afs_inode_cache",
 					     sizeof(struct afs_vnode),
 					     0,
-					     SLAB_HWCACHE_ALIGN,
+					     SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT,
 					     afs_i_init_once);
 	if (!afs_inode_cachep) {
 		printk(KERN_NOTICE "kAFS: Failed to allocate inode cache\n");
diff --git a/fs/afs/write.c b/fs/afs/write.c
index 0714abcd7..dfef94f70 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -693,7 +693,7 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
 	if (ret)
 		return ret;
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	/* use a writeback record as a marker in the queue - when this reaches
 	 * the front of the queue, all the outstanding writes are either
@@ -735,7 +735,7 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 	afs_put_writeback(wb);
 	_leave(" = %d", ret);
 out:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	return ret;
 }
 
diff --git a/fs/attr.c b/fs/attr.c
index 6530ced19..25b24d0f6 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -195,7 +195,7 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de
 	struct timespec now;
 	unsigned int ia_valid = attr->ia_valid;
 
-	WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex));
+	WARN_ON_ONCE(!inode_is_locked(inode));
 
 	if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)) {
 		if (IS_IMMUTABLE(inode) || IS_APPEND(inode))
diff --git a/fs/aufs/Kconfig b/fs/aufs/Kconfig
deleted file mode 100644
index 63560ceda..000000000
--- a/fs/aufs/Kconfig
+++ /dev/null
@@ -1,185 +0,0 @@
-config AUFS_FS
-	tristate "Aufs (Advanced multi layered unification filesystem) support"
-	help
-	Aufs is a stackable unification filesystem such as Unionfs,
-	which unifies several directories and provides a merged single
-	directory.
-	In the early days, aufs was entirely re-designed and
-	re-implemented Unionfs Version 1.x series. Introducing many
-	original ideas, approaches and improvements, it becomes totally
-	different from Unionfs while keeping the basic features.
-
-if AUFS_FS
-choice
-	prompt "Maximum number of branches"
-	default AUFS_BRANCH_MAX_127
-	help
-	Specifies the maximum number of branches (or member directories)
-	in a single aufs. The larger value consumes more system
-	resources and has a minor impact to performance.
-config AUFS_BRANCH_MAX_127
-	bool "127"
-	help
-	Specifies the maximum number of branches (or member directories)
-	in a single aufs. The larger value consumes more system
-	resources and has a minor impact to performance.
-config AUFS_BRANCH_MAX_511
-	bool "511"
-	help
-	Specifies the maximum number of branches (or member directories)
-	in a single aufs. The larger value consumes more system
-	resources and has a minor impact to performance.
-config AUFS_BRANCH_MAX_1023
-	bool "1023"
-	help
-	Specifies the maximum number of branches (or member directories)
-	in a single aufs. The larger value consumes more system
-	resources and has a minor impact to performance.
-config AUFS_BRANCH_MAX_32767
-	bool "32767"
-	help
-	Specifies the maximum number of branches (or member directories)
-	in a single aufs. The larger value consumes more system
-	resources and has a minor impact to performance.
-endchoice
-
-config AUFS_SBILIST
-	bool
-	depends on AUFS_MAGIC_SYSRQ || PROC_FS
-	default y
-	help
-	Automatic configuration for internal use.
-	When aufs supports Magic SysRq or /proc, enabled automatically.
-
-config AUFS_HNOTIFY
-	bool "Detect direct branch access (bypassing aufs)"
-	help
-	If you want to modify files on branches directly, eg. bypassing aufs,
-	and want aufs to detect the changes of them fully, then enable this
-	option and use 'udba=notify' mount option.
-	Currently there is only one available configuration, "fsnotify".
-	It will have a negative impact to the performance.
-	See detail in aufs.5.
-
-choice
-	prompt "method" if AUFS_HNOTIFY
-	default AUFS_HFSNOTIFY
-config AUFS_HFSNOTIFY
-	bool "fsnotify"
-	select FSNOTIFY
-endchoice
-
-config AUFS_EXPORT
-	bool "NFS-exportable aufs"
-	depends on EXPORTFS
-	help
-	If you want to export your mounted aufs via NFS, then enable this
-	option. There are several requirements for this configuration.
-	See detail in aufs.5.
-
-config AUFS_INO_T_64
-	bool
-	depends on AUFS_EXPORT
-	depends on 64BIT && !(ALPHA || S390)
-	default y
-	help
-	Automatic configuration for internal use.
-	/* typedef unsigned long/int __kernel_ino_t */
-	/* alpha and s390x are int */
-
-config AUFS_XATTR
-	bool "support for XATTR/EA (including Security Labels)"
-	help
-	If your branch fs supports XATTR/EA and you want to make them
-	available in aufs too, then enable this opsion and specify the
-	branch attributes for EA.
-	See detail in aufs.5.
-
-config AUFS_FHSM
-	bool "File-based Hierarchical Storage Management"
-	help
-	Hierarchical Storage Management (or HSM) is a well-known feature
-	in the storage world. Aufs provides this feature as file-based.
-	with multiple branches.
-	These multiple branches are prioritized, ie. the topmost one
-	should be the fastest drive and be used heavily.
-
-config AUFS_RDU
-	bool "Readdir in userspace"
-	help
-	Aufs has two methods to provide a merged view for a directory,
-	by a user-space library and by kernel-space natively. The latter
-	is always enabled but sometimes large and slow.
-	If you enable this option, install the library in aufs2-util
-	package, and set some environment variables for your readdir(3),
-	then the work will be handled in user-space which generally
-	shows better performance in most cases.
-	See detail in aufs.5.
-
-config AUFS_SHWH
-	bool "Show whiteouts"
-	help
-	If you want to make the whiteouts in aufs visible, then enable
-	this option and specify 'shwh' mount option. Although it may
-	sounds like philosophy or something, but in technically it
-	simply shows the name of whiteout with keeping its behaviour.
-
-config AUFS_BR_RAMFS
-	bool "Ramfs (initramfs/rootfs) as an aufs branch"
-	help
-	If you want to use ramfs as an aufs branch fs, then enable this
-	option. Generally tmpfs is recommended.
-	Aufs prohibited them to be a branch fs by default, because
-	initramfs becomes unusable after switch_root or something
-	generally. If you sets initramfs as an aufs branch and boot your
-	system by switch_root, you will meet a problem easily since the
-	files in initramfs may be inaccessible.
-	Unless you are going to use ramfs as an aufs branch fs without
-	switch_root or something, leave it N.
-
-config AUFS_BR_FUSE
-	bool "Fuse fs as an aufs branch"
-	depends on FUSE_FS
-	select AUFS_POLL
-	help
-	If you want to use fuse-based userspace filesystem as an aufs
-	branch fs, then enable this option.
-	It implements the internal poll(2) operation which is
-	implemented by fuse only (curretnly).
-
-config AUFS_POLL
-	bool
-	help
-	Automatic configuration for internal use.
-
-config AUFS_BR_HFSPLUS
-	bool "Hfsplus as an aufs branch"
-	depends on HFSPLUS_FS
-	default y
-	help
-	If you want to use hfsplus fs as an aufs branch fs, then enable
-	this option. This option introduces a small overhead at
-	copying-up a file on hfsplus.
-
-config AUFS_BDEV_LOOP
-	bool
-	depends on BLK_DEV_LOOP
-	default y
-	help
-	Automatic configuration for internal use.
-	Convert =[ym] into =y.
-
-config AUFS_DEBUG
-	bool "Debug aufs"
-	help
-	Enable this to compile aufs internal debug code.
-	It will have a negative impact to the performance.
-
-config AUFS_MAGIC_SYSRQ
-	bool
-	depends on AUFS_DEBUG && MAGIC_SYSRQ
-	default y
-	help
-	Automatic configuration for internal use.
-	When aufs supports Magic SysRq, enabled automatically.
-endif
diff --git a/fs/aufs/Makefile b/fs/aufs/Makefile
deleted file mode 100644
index c7efb62b5..000000000
--- a/fs/aufs/Makefile
+++ /dev/null
@@ -1,36 +0,0 @@
-
-include ${srctree}/${src}/magic.mk
-
-# cf. include/linux/kernel.h
-# enable pr_debug
-ccflags-y += -DDEBUG
-# sparse requires the full pathname
-ccflags-y += -include ${srctree}/include/uapi/linux/aufs_type.h
-
-obj-$(CONFIG_AUFS_FS) += aufs.o
-aufs-y := module.o sbinfo.o super.o branch.o xino.o sysaufs.o opts.o \
-	wkq.o vfsub.o dcsub.o \
-	cpup.o whout.o wbr_policy.o \
-	dinfo.o dentry.o \
-	dynop.o \
-	finfo.o file.o f_op.o \
-	dir.o vdir.o \
-	iinfo.o inode.o i_op.o i_op_add.o i_op_del.o i_op_ren.o \
-	mvdown.o ioctl.o
-
-# all are boolean
-aufs-$(CONFIG_PROC_FS) += procfs.o plink.o
-aufs-$(CONFIG_SYSFS) += sysfs.o
-aufs-$(CONFIG_DEBUG_FS) += dbgaufs.o
-aufs-$(CONFIG_AUFS_BDEV_LOOP) += loop.o
-aufs-$(CONFIG_AUFS_HNOTIFY) += hnotify.o
-aufs-$(CONFIG_AUFS_HFSNOTIFY) += hfsnotify.o
-aufs-$(CONFIG_AUFS_EXPORT) += export.o
-aufs-$(CONFIG_AUFS_XATTR) += xattr.o
-aufs-$(CONFIG_FS_POSIX_ACL) += posix_acl.o
-aufs-$(CONFIG_AUFS_FHSM) += fhsm.o
-aufs-$(CONFIG_AUFS_POLL) += poll.o
-aufs-$(CONFIG_AUFS_RDU) += rdu.o
-aufs-$(CONFIG_AUFS_BR_HFSPLUS) += hfsplus.o
-aufs-$(CONFIG_AUFS_DEBUG) += debug.o
-aufs-$(CONFIG_AUFS_MAGIC_SYSRQ) += sysrq.o
diff --git a/fs/aufs/aufs.h b/fs/aufs/aufs.h
deleted file mode 100644
index 49f43b433..000000000
--- a/fs/aufs/aufs.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * all header files
- */
-
-#ifndef __AUFS_H__
-#define __AUFS_H__
-
-#ifdef __KERNEL__
-
-#define AuStub(type, name, body, ...) \
-	static inline type name(__VA_ARGS__) { body; }
-
-#define AuStubVoid(name, ...) \
-	AuStub(void, name, , __VA_ARGS__)
-#define AuStubInt0(name, ...) \
-	AuStub(int, name, return 0, __VA_ARGS__)
-
-#include "debug.h"
-
-#include "branch.h"
-#include "cpup.h"
-#include "dcsub.h"
-#include "dbgaufs.h"
-#include "dentry.h"
-#include "dir.h"
-#include "dynop.h"
-#include "file.h"
-#include "fstype.h"
-#include "inode.h"
-#include "loop.h"
-#include "module.h"
-#include "opts.h"
-#include "rwsem.h"
-#include "spl.h"
-#include "super.h"
-#include "sysaufs.h"
-#include "vfsub.h"
-#include "whout.h"
-#include "wkq.h"
-
-#endif /* __KERNEL__ */
-#endif /* __AUFS_H__ */
diff --git a/fs/aufs/branch.c b/fs/aufs/branch.c
deleted file mode 100644
index 1ab5e1f2d..000000000
--- a/fs/aufs/branch.c
+++ /dev/null
@@ -1,1394 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * branch management
- */
-
-#include <linux/compat.h>
-#include <linux/statfs.h>
-#include "aufs.h"
-
-/*
- * free a single branch
- */
-static void au_br_do_free(struct au_branch *br)
-{
-	int i;
-	struct au_wbr *wbr;
-	struct au_dykey **key;
-
-	au_hnotify_fin_br(br);
-
-	if (br->br_xino.xi_file)
-		fput(br->br_xino.xi_file);
-	mutex_destroy(&br->br_xino.xi_nondir_mtx);
-
-	AuDebugOn(atomic_read(&br->br_count));
-
-	wbr = br->br_wbr;
-	if (wbr) {
-		for (i = 0; i < AuBrWh_Last; i++)
-			dput(wbr->wbr_wh[i]);
-		AuDebugOn(atomic_read(&wbr->wbr_wh_running));
-		AuRwDestroy(&wbr->wbr_wh_rwsem);
-	}
-
-	if (br->br_fhsm) {
-		au_br_fhsm_fin(br->br_fhsm);
-		kfree(br->br_fhsm);
-	}
-
-	key = br->br_dykey;
-	for (i = 0; i < AuBrDynOp; i++, key++)
-		if (*key)
-			au_dy_put(*key);
-		else
-			break;
-
-	/* recursive lock, s_umount of branch's */
-	lockdep_off();
-	path_put(&br->br_path);
-	lockdep_on();
-	kfree(wbr);
-	kfree(br);
-}
-
-/*
- * frees all branches
- */
-void au_br_free(struct au_sbinfo *sbinfo)
-{
-	aufs_bindex_t bmax;
-	struct au_branch **br;
-
-	AuRwMustWriteLock(&sbinfo->si_rwsem);
-
-	bmax = sbinfo->si_bend + 1;
-	br = sbinfo->si_branch;
-	while (bmax--)
-		au_br_do_free(*br++);
-}
-
-/*
- * find the index of a branch which is specified by @br_id.
- */
-int au_br_index(struct super_block *sb, aufs_bindex_t br_id)
-{
-	aufs_bindex_t bindex, bend;
-
-	bend = au_sbend(sb);
-	for (bindex = 0; bindex <= bend; bindex++)
-		if (au_sbr_id(sb, bindex) == br_id)
-			return bindex;
-	return -1;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * add a branch
- */
-
-static int test_overlap(struct super_block *sb, struct dentry *h_adding,
-			struct dentry *h_root)
-{
-	if (unlikely(h_adding == h_root
-		     || au_test_loopback_overlap(sb, h_adding)))
-		return 1;
-	if (h_adding->d_sb != h_root->d_sb)
-		return 0;
-	return au_test_subdir(h_adding, h_root)
-		|| au_test_subdir(h_root, h_adding);
-}
-
-/*
- * returns a newly allocated branch. @new_nbranch is a number of branches
- * after adding a branch.
- */
-static struct au_branch *au_br_alloc(struct super_block *sb, int new_nbranch,
-				     int perm)
-{
-	struct au_branch *add_branch;
-	struct dentry *root;
-	struct inode *inode;
-	int err;
-
-	err = -ENOMEM;
-	root = sb->s_root;
-	add_branch = kzalloc(sizeof(*add_branch), GFP_NOFS);
-	if (unlikely(!add_branch))
-		goto out;
-
-	err = au_hnotify_init_br(add_branch, perm);
-	if (unlikely(err))
-		goto out_br;
-
-	if (au_br_writable(perm)) {
-		/* may be freed separately at changing the branch permission */
-		add_branch->br_wbr = kzalloc(sizeof(*add_branch->br_wbr),
-					     GFP_NOFS);
-		if (unlikely(!add_branch->br_wbr))
-			goto out_hnotify;
-	}
-
-	if (au_br_fhsm(perm)) {
-		err = au_fhsm_br_alloc(add_branch);
-		if (unlikely(err))
-			goto out_wbr;
-	}
-
-	err = au_sbr_realloc(au_sbi(sb), new_nbranch);
-	if (!err)
-		err = au_di_realloc(au_di(root), new_nbranch);
-	if (!err) {
-		inode = d_inode(root);
-		err = au_ii_realloc(au_ii(inode), new_nbranch);
-	}
-	if (!err)
-		return add_branch; /* success */
-
-out_wbr:
-	kfree(add_branch->br_wbr);
-out_hnotify:
-	au_hnotify_fin_br(add_branch);
-out_br:
-	kfree(add_branch);
-out:
-	return ERR_PTR(err);
-}
-
-/*
- * test if the branch permission is legal or not.
- */
-static int test_br(struct inode *inode, int brperm, char *path)
-{
-	int err;
-
-	err = (au_br_writable(brperm) && IS_RDONLY(inode));
-	if (!err)
-		goto out;
-
-	err = -EINVAL;
-	pr_err("write permission for readonly mount or inode, %s\n", path);
-
-out:
-	return err;
-}
-
-/*
- * returns:
- * 0: success, the caller will add it
- * plus: success, it is already unified, the caller should ignore it
- * minus: error
- */
-static int test_add(struct super_block *sb, struct au_opt_add *add, int remount)
-{
-	int err;
-	aufs_bindex_t bend, bindex;
-	struct dentry *root, *h_dentry;
-	struct inode *inode, *h_inode;
-
-	root = sb->s_root;
-	bend = au_sbend(sb);
-	if (unlikely(bend >= 0
-		     && au_find_dbindex(root, add->path.dentry) >= 0)) {
-		err = 1;
-		if (!remount) {
-			err = -EINVAL;
-			pr_err("%s duplicated\n", add->pathname);
-		}
-		goto out;
-	}
-
-	err = -ENOSPC; /* -E2BIG; */
-	if (unlikely(AUFS_BRANCH_MAX <= add->bindex
-		     || AUFS_BRANCH_MAX - 1 <= bend)) {
-		pr_err("number of branches exceeded %s\n", add->pathname);
-		goto out;
-	}
-
-	err = -EDOM;
-	if (unlikely(add->bindex < 0 || bend + 1 < add->bindex)) {
-		pr_err("bad index %d\n", add->bindex);
-		goto out;
-	}
-
-	inode = d_inode(add->path.dentry);
-	err = -ENOENT;
-	if (unlikely(!inode->i_nlink)) {
-		pr_err("no existence %s\n", add->pathname);
-		goto out;
-	}
-
-	err = -EINVAL;
-	if (unlikely(inode->i_sb == sb)) {
-		pr_err("%s must be outside\n", add->pathname);
-		goto out;
-	}
-
-	if (unlikely(au_test_fs_unsuppoted(inode->i_sb))) {
-		pr_err("unsupported filesystem, %s (%s)\n",
-		       add->pathname, au_sbtype(inode->i_sb));
-		goto out;
-	}
-
-	if (unlikely(inode->i_sb->s_stack_depth)) {
-		pr_err("already stacked, %s (%s)\n",
-		       add->pathname, au_sbtype(inode->i_sb));
-		goto out;
-	}
-
-	err = test_br(d_inode(add->path.dentry), add->perm, add->pathname);
-	if (unlikely(err))
-		goto out;
-
-	if (bend < 0)
-		return 0; /* success */
-
-	err = -EINVAL;
-	for (bindex = 0; bindex <= bend; bindex++)
-		if (unlikely(test_overlap(sb, add->path.dentry,
-					  au_h_dptr(root, bindex)))) {
-			pr_err("%s is overlapped\n", add->pathname);
-			goto out;
-		}
-
-	err = 0;
-	if (au_opt_test(au_mntflags(sb), WARN_PERM)) {
-		h_dentry = au_h_dptr(root, 0);
-		h_inode = d_inode(h_dentry);
-		if ((h_inode->i_mode & S_IALLUGO) != (inode->i_mode & S_IALLUGO)
-		    || !uid_eq(h_inode->i_uid, inode->i_uid)
-		    || !gid_eq(h_inode->i_gid, inode->i_gid))
-			pr_warn("uid/gid/perm %s %u/%u/0%o, %u/%u/0%o\n",
-				add->pathname,
-				i_uid_read(inode), i_gid_read(inode),
-				(inode->i_mode & S_IALLUGO),
-				i_uid_read(h_inode), i_gid_read(h_inode),
-				(h_inode->i_mode & S_IALLUGO));
-	}
-
-out:
-	return err;
-}
-
-/*
- * initialize or clean the whiteouts for an adding branch
- */
-static int au_br_init_wh(struct super_block *sb, struct au_branch *br,
-			 int new_perm)
-{
-	int err, old_perm;
-	aufs_bindex_t bindex;
-	struct mutex *h_mtx;
-	struct au_wbr *wbr;
-	struct au_hinode *hdir;
-	struct dentry *h_dentry;
-
-	err = vfsub_mnt_want_write(au_br_mnt(br));
-	if (unlikely(err))
-		goto out;
-
-	wbr = br->br_wbr;
-	old_perm = br->br_perm;
-	br->br_perm = new_perm;
-	hdir = NULL;
-	h_mtx = NULL;
-	bindex = au_br_index(sb, br->br_id);
-	if (0 <= bindex) {
-		hdir = au_hi(d_inode(sb->s_root), bindex);
-		au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT);
-	} else {
-		h_dentry = au_br_dentry(br);
-		h_mtx = &d_inode(h_dentry)->i_mutex;
-		mutex_lock_nested(h_mtx, AuLsc_I_PARENT);
-	}
-	if (!wbr)
-		err = au_wh_init(br, sb);
-	else {
-		wbr_wh_write_lock(wbr);
-		err = au_wh_init(br, sb);
-		wbr_wh_write_unlock(wbr);
-	}
-	if (hdir)
-		au_hn_imtx_unlock(hdir);
-	else
-		mutex_unlock(h_mtx);
-	vfsub_mnt_drop_write(au_br_mnt(br));
-	br->br_perm = old_perm;
-
-	if (!err && wbr && !au_br_writable(new_perm)) {
-		kfree(wbr);
-		br->br_wbr = NULL;
-	}
-
-out:
-	return err;
-}
-
-static int au_wbr_init(struct au_branch *br, struct super_block *sb,
-		       int perm)
-{
-	int err;
-	struct kstatfs kst;
-	struct au_wbr *wbr;
-
-	wbr = br->br_wbr;
-	au_rw_init(&wbr->wbr_wh_rwsem);
-	atomic_set(&wbr->wbr_wh_running, 0);
-
-	/*
-	 * a limit for rmdir/rename a dir
-	 * cf. AUFS_MAX_NAMELEN in include/uapi/linux/aufs_type.h
-	 */
-	err = vfs_statfs(&br->br_path, &kst);
-	if (unlikely(err))
-		goto out;
-	err = -EINVAL;
-	if (kst.f_namelen >= NAME_MAX)
-		err = au_br_init_wh(sb, br, perm);
-	else
-		pr_err("%pd(%s), unsupported namelen %ld\n",
-		       au_br_dentry(br),
-		       au_sbtype(au_br_dentry(br)->d_sb), kst.f_namelen);
-
-out:
-	return err;
-}
-
-/* initialize a new branch */
-static int au_br_init(struct au_branch *br, struct super_block *sb,
-		      struct au_opt_add *add)
-{
-	int err;
-	struct inode *h_inode;
-
-	err = 0;
-	mutex_init(&br->br_xino.xi_nondir_mtx);
-	br->br_perm = add->perm;
-	br->br_path = add->path; /* set first, path_get() later */
-	spin_lock_init(&br->br_dykey_lock);
-	atomic_set(&br->br_count, 0);
-	atomic_set(&br->br_xino_running, 0);
-	br->br_id = au_new_br_id(sb);
-	AuDebugOn(br->br_id < 0);
-
-	if (au_br_writable(add->perm)) {
-		err = au_wbr_init(br, sb, add->perm);
-		if (unlikely(err))
-			goto out_err;
-	}
-
-	if (au_opt_test(au_mntflags(sb), XINO)) {
-		h_inode = d_inode(add->path.dentry);
-		err = au_xino_br(sb, br, h_inode->i_ino,
-				 au_sbr(sb, 0)->br_xino.xi_file, /*do_test*/1);
-		if (unlikely(err)) {
-			AuDebugOn(br->br_xino.xi_file);
-			goto out_err;
-		}
-	}
-
-	sysaufs_br_init(br);
-	path_get(&br->br_path);
-	goto out; /* success */
-
-out_err:
-	memset(&br->br_path, 0, sizeof(br->br_path));
-out:
-	return err;
-}
-
-static void au_br_do_add_brp(struct au_sbinfo *sbinfo, aufs_bindex_t bindex,
-			     struct au_branch *br, aufs_bindex_t bend,
-			     aufs_bindex_t amount)
-{
-	struct au_branch **brp;
-
-	AuRwMustWriteLock(&sbinfo->si_rwsem);
-
-	brp = sbinfo->si_branch + bindex;
-	memmove(brp + 1, brp, sizeof(*brp) * amount);
-	*brp = br;
-	sbinfo->si_bend++;
-	if (unlikely(bend < 0))
-		sbinfo->si_bend = 0;
-}
-
-static void au_br_do_add_hdp(struct au_dinfo *dinfo, aufs_bindex_t bindex,
-			     aufs_bindex_t bend, aufs_bindex_t amount)
-{
-	struct au_hdentry *hdp;
-
-	AuRwMustWriteLock(&dinfo->di_rwsem);
-
-	hdp = dinfo->di_hdentry + bindex;
-	memmove(hdp + 1, hdp, sizeof(*hdp) * amount);
-	au_h_dentry_init(hdp);
-	dinfo->di_bend++;
-	if (unlikely(bend < 0))
-		dinfo->di_bstart = 0;
-}
-
-static void au_br_do_add_hip(struct au_iinfo *iinfo, aufs_bindex_t bindex,
-			     aufs_bindex_t bend, aufs_bindex_t amount)
-{
-	struct au_hinode *hip;
-
-	AuRwMustWriteLock(&iinfo->ii_rwsem);
-
-	hip = iinfo->ii_hinode + bindex;
-	memmove(hip + 1, hip, sizeof(*hip) * amount);
-	hip->hi_inode = NULL;
-	au_hn_init(hip);
-	iinfo->ii_bend++;
-	if (unlikely(bend < 0))
-		iinfo->ii_bstart = 0;
-}
-
-static void au_br_do_add(struct super_block *sb, struct au_branch *br,
-			 aufs_bindex_t bindex)
-{
-	struct dentry *root, *h_dentry;
-	struct inode *root_inode, *h_inode;
-	aufs_bindex_t bend, amount;
-
-	root = sb->s_root;
-	root_inode = d_inode(root);
-	bend = au_sbend(sb);
-	amount = bend + 1 - bindex;
-	h_dentry = au_br_dentry(br);
-	au_sbilist_lock();
-	au_br_do_add_brp(au_sbi(sb), bindex, br, bend, amount);
-	au_br_do_add_hdp(au_di(root), bindex, bend, amount);
-	au_br_do_add_hip(au_ii(root_inode), bindex, bend, amount);
-	au_set_h_dptr(root, bindex, dget(h_dentry));
-	h_inode = d_inode(h_dentry);
-	au_set_h_iptr(root_inode, bindex, au_igrab(h_inode), /*flags*/0);
-	au_sbilist_unlock();
-}
-
-int au_br_add(struct super_block *sb, struct au_opt_add *add, int remount)
-{
-	int err;
-	aufs_bindex_t bend, add_bindex;
-	struct dentry *root, *h_dentry;
-	struct inode *root_inode;
-	struct au_branch *add_branch;
-
-	root = sb->s_root;
-	root_inode = d_inode(root);
-	IMustLock(root_inode);
-	err = test_add(sb, add, remount);
-	if (unlikely(err < 0))
-		goto out;
-	if (err) {
-		err = 0;
-		goto out; /* success */
-	}
-
-	bend = au_sbend(sb);
-	add_branch = au_br_alloc(sb, bend + 2, add->perm);
-	err = PTR_ERR(add_branch);
-	if (IS_ERR(add_branch))
-		goto out;
-
-	err = au_br_init(add_branch, sb, add);
-	if (unlikely(err)) {
-		au_br_do_free(add_branch);
-		goto out;
-	}
-
-	add_bindex = add->bindex;
-	if (!remount)
-		au_br_do_add(sb, add_branch, add_bindex);
-	else {
-		sysaufs_brs_del(sb, add_bindex);
-		au_br_do_add(sb, add_branch, add_bindex);
-		sysaufs_brs_add(sb, add_bindex);
-	}
-
-	h_dentry = add->path.dentry;
-	if (!add_bindex) {
-		au_cpup_attr_all(root_inode, /*force*/1);
-		sb->s_maxbytes = h_dentry->d_sb->s_maxbytes;
-	} else
-		au_add_nlink(root_inode, d_inode(h_dentry));
-
-	/*
-	 * this test/set prevents aufs from handling unnecesary notify events
-	 * of xino files, in case of re-adding a writable branch which was
-	 * once detached from aufs.
-	 */
-	if (au_xino_brid(sb) < 0
-	    && au_br_writable(add_branch->br_perm)
-	    && !au_test_fs_bad_xino(h_dentry->d_sb)
-	    && add_branch->br_xino.xi_file
-	    && add_branch->br_xino.xi_file->f_path.dentry->d_parent == h_dentry)
-		au_xino_brid_set(sb, add_branch->br_id);
-
-out:
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static unsigned long long au_farray_cb(struct super_block *sb, void *a,
-				       unsigned long long max __maybe_unused,
-				       void *arg)
-{
-	unsigned long long n;
-	struct file **p, *f;
-	struct au_sphlhead *files;
-	struct au_finfo *finfo;
-
-	n = 0;
-	p = a;
-	files = &au_sbi(sb)->si_files;
-	spin_lock(&files->spin);
-	hlist_for_each_entry(finfo, &files->head, fi_hlist) {
-		f = finfo->fi_file;
-		if (file_count(f)
-		    && !special_file(file_inode(f)->i_mode)) {
-			get_file(f);
-			*p++ = f;
-			n++;
-			AuDebugOn(n > max);
-		}
-	}
-	spin_unlock(&files->spin);
-
-	return n;
-}
-
-static struct file **au_farray_alloc(struct super_block *sb,
-				     unsigned long long *max)
-{
-	*max = atomic_long_read(&au_sbi(sb)->si_nfiles);
-	return au_array_alloc(max, au_farray_cb, sb, /*arg*/NULL);
-}
-
-static void au_farray_free(struct file **a, unsigned long long max)
-{
-	unsigned long long ull;
-
-	for (ull = 0; ull < max; ull++)
-		if (a[ull])
-			fput(a[ull]);
-	kvfree(a);
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * delete a branch
- */
-
-/* to show the line number, do not make it inlined function */
-#define AuVerbose(do_info, fmt, ...) do { \
-	if (do_info) \
-		pr_info(fmt, ##__VA_ARGS__); \
-} while (0)
-
-static int au_test_ibusy(struct inode *inode, aufs_bindex_t bstart,
-			 aufs_bindex_t bend)
-{
-	return (inode && !S_ISDIR(inode->i_mode)) || bstart == bend;
-}
-
-static int au_test_dbusy(struct dentry *dentry, aufs_bindex_t bstart,
-			 aufs_bindex_t bend)
-{
-	return au_test_ibusy(d_inode(dentry), bstart, bend);
-}
-
-/*
- * test if the branch is deletable or not.
- */
-static int test_dentry_busy(struct dentry *root, aufs_bindex_t bindex,
-			    unsigned int sigen, const unsigned int verbose)
-{
-	int err, i, j, ndentry;
-	aufs_bindex_t bstart, bend;
-	struct au_dcsub_pages dpages;
-	struct au_dpage *dpage;
-	struct dentry *d;
-
-	err = au_dpages_init(&dpages, GFP_NOFS);
-	if (unlikely(err))
-		goto out;
-	err = au_dcsub_pages(&dpages, root, NULL, NULL);
-	if (unlikely(err))
-		goto out_dpages;
-
-	for (i = 0; !err && i < dpages.ndpage; i++) {
-		dpage = dpages.dpages + i;
-		ndentry = dpage->ndentry;
-		for (j = 0; !err && j < ndentry; j++) {
-			d = dpage->dentries[j];
-			AuDebugOn(au_dcount(d) <= 0);
-			if (!au_digen_test(d, sigen)) {
-				di_read_lock_child(d, AuLock_IR);
-				if (unlikely(au_dbrange_test(d))) {
-					di_read_unlock(d, AuLock_IR);
-					continue;
-				}
-			} else {
-				di_write_lock_child(d);
-				if (unlikely(au_dbrange_test(d))) {
-					di_write_unlock(d);
-					continue;
-				}
-				err = au_reval_dpath(d, sigen);
-				if (!err)
-					di_downgrade_lock(d, AuLock_IR);
-				else {
-					di_write_unlock(d);
-					break;
-				}
-			}
-
-			/* AuDbgDentry(d); */
-			bstart = au_dbstart(d);
-			bend = au_dbend(d);
-			if (bstart <= bindex
-			    && bindex <= bend
-			    && au_h_dptr(d, bindex)
-			    && au_test_dbusy(d, bstart, bend)) {
-				err = -EBUSY;
-				AuVerbose(verbose, "busy %pd\n", d);
-				AuDbgDentry(d);
-			}
-			di_read_unlock(d, AuLock_IR);
-		}
-	}
-
-out_dpages:
-	au_dpages_free(&dpages);
-out:
-	return err;
-}
-
-static int test_inode_busy(struct super_block *sb, aufs_bindex_t bindex,
-			   unsigned int sigen, const unsigned int verbose)
-{
-	int err;
-	unsigned long long max, ull;
-	struct inode *i, **array;
-	aufs_bindex_t bstart, bend;
-
-	array = au_iarray_alloc(sb, &max);
-	err = PTR_ERR(array);
-	if (IS_ERR(array))
-		goto out;
-
-	err = 0;
-	AuDbg("b%d\n", bindex);
-	for (ull = 0; !err && ull < max; ull++) {
-		i = array[ull];
-		if (unlikely(!i))
-			break;
-		if (i->i_ino == AUFS_ROOT_INO)
-			continue;
-
-		/* AuDbgInode(i); */
-		if (au_iigen(i, NULL) == sigen)
-			ii_read_lock_child(i);
-		else {
-			ii_write_lock_child(i);
-			err = au_refresh_hinode_self(i);
-			au_iigen_dec(i);
-			if (!err)
-				ii_downgrade_lock(i);
-			else {
-				ii_write_unlock(i);
-				break;
-			}
-		}
-
-		bstart = au_ibstart(i);
-		bend = au_ibend(i);
-		if (bstart <= bindex
-		    && bindex <= bend
-		    && au_h_iptr(i, bindex)
-		    && au_test_ibusy(i, bstart, bend)) {
-			err = -EBUSY;
-			AuVerbose(verbose, "busy i%lu\n", i->i_ino);
-			AuDbgInode(i);
-		}
-		ii_read_unlock(i);
-	}
-	au_iarray_free(array, max);
-
-out:
-	return err;
-}
-
-static int test_children_busy(struct dentry *root, aufs_bindex_t bindex,
-			      const unsigned int verbose)
-{
-	int err;
-	unsigned int sigen;
-
-	sigen = au_sigen(root->d_sb);
-	DiMustNoWaiters(root);
-	IiMustNoWaiters(d_inode(root));
-	di_write_unlock(root);
-	err = test_dentry_busy(root, bindex, sigen, verbose);
-	if (!err)
-		err = test_inode_busy(root->d_sb, bindex, sigen, verbose);
-	di_write_lock_child(root); /* aufs_write_lock() calls ..._child() */
-
-	return err;
-}
-
-static int test_dir_busy(struct file *file, aufs_bindex_t br_id,
-			 struct file **to_free, int *idx)
-{
-	int err;
-	unsigned char matched, root;
-	aufs_bindex_t bindex, bend;
-	struct au_fidir *fidir;
-	struct au_hfile *hfile;
-
-	err = 0;
-	root = IS_ROOT(file->f_path.dentry);
-	if (root) {
-		get_file(file);
-		to_free[*idx] = file;
-		(*idx)++;
-		goto out;
-	}
-
-	matched = 0;
-	fidir = au_fi(file)->fi_hdir;
-	AuDebugOn(!fidir);
-	bend = au_fbend_dir(file);
-	for (bindex = au_fbstart(file); bindex <= bend; bindex++) {
-		hfile = fidir->fd_hfile + bindex;
-		if (!hfile->hf_file)
-			continue;
-
-		if (hfile->hf_br->br_id == br_id) {
-			matched = 1;
-			break;
-		}
-	}
-	if (matched)
-		err = -EBUSY;
-
-out:
-	return err;
-}
-
-static int test_file_busy(struct super_block *sb, aufs_bindex_t br_id,
-			  struct file **to_free, int opened)
-{
-	int err, idx;
-	unsigned long long ull, max;
-	aufs_bindex_t bstart;
-	struct file *file, **array;
-	struct dentry *root;
-	struct au_hfile *hfile;
-
-	array = au_farray_alloc(sb, &max);
-	err = PTR_ERR(array);
-	if (IS_ERR(array))
-		goto out;
-
-	err = 0;
-	idx = 0;
-	root = sb->s_root;
-	di_write_unlock(root);
-	for (ull = 0; ull < max; ull++) {
-		file = array[ull];
-		if (unlikely(!file))
-			break;
-
-		/* AuDbg("%pD\n", file); */
-		fi_read_lock(file);
-		bstart = au_fbstart(file);
-		if (!d_is_dir(file->f_path.dentry)) {
-			hfile = &au_fi(file)->fi_htop;
-			if (hfile->hf_br->br_id == br_id)
-				err = -EBUSY;
-		} else
-			err = test_dir_busy(file, br_id, to_free, &idx);
-		fi_read_unlock(file);
-		if (unlikely(err))
-			break;
-	}
-	di_write_lock_child(root);
-	au_farray_free(array, max);
-	AuDebugOn(idx > opened);
-
-out:
-	return err;
-}
-
-static void br_del_file(struct file **to_free, unsigned long long opened,
-			  aufs_bindex_t br_id)
-{
-	unsigned long long ull;
-	aufs_bindex_t bindex, bstart, bend, bfound;
-	struct file *file;
-	struct au_fidir *fidir;
-	struct au_hfile *hfile;
-
-	for (ull = 0; ull < opened; ull++) {
-		file = to_free[ull];
-		if (unlikely(!file))
-			break;
-
-		/* AuDbg("%pD\n", file); */
-		AuDebugOn(!d_is_dir(file->f_path.dentry));
-		bfound = -1;
-		fidir = au_fi(file)->fi_hdir;
-		AuDebugOn(!fidir);
-		fi_write_lock(file);
-		bstart = au_fbstart(file);
-		bend = au_fbend_dir(file);
-		for (bindex = bstart; bindex <= bend; bindex++) {
-			hfile = fidir->fd_hfile + bindex;
-			if (!hfile->hf_file)
-				continue;
-
-			if (hfile->hf_br->br_id == br_id) {
-				bfound = bindex;
-				break;
-			}
-		}
-		AuDebugOn(bfound < 0);
-		au_set_h_fptr(file, bfound, NULL);
-		if (bfound == bstart) {
-			for (bstart++; bstart <= bend; bstart++)
-				if (au_hf_dir(file, bstart)) {
-					au_set_fbstart(file, bstart);
-					break;
-				}
-		}
-		fi_write_unlock(file);
-	}
-}
-
-static void au_br_do_del_brp(struct au_sbinfo *sbinfo,
-			     const aufs_bindex_t bindex,
-			     const aufs_bindex_t bend)
-{
-	struct au_branch **brp, **p;
-
-	AuRwMustWriteLock(&sbinfo->si_rwsem);
-
-	brp = sbinfo->si_branch + bindex;
-	if (bindex < bend)
-		memmove(brp, brp + 1, sizeof(*brp) * (bend - bindex));
-	sbinfo->si_branch[0 + bend] = NULL;
-	sbinfo->si_bend--;
-
-	p = krealloc(sbinfo->si_branch, sizeof(*p) * bend, AuGFP_SBILIST);
-	if (p)
-		sbinfo->si_branch = p;
-	/* harmless error */
-}
-
-static void au_br_do_del_hdp(struct au_dinfo *dinfo, const aufs_bindex_t bindex,
-			     const aufs_bindex_t bend)
-{
-	struct au_hdentry *hdp, *p;
-
-	AuRwMustWriteLock(&dinfo->di_rwsem);
-
-	hdp = dinfo->di_hdentry;
-	if (bindex < bend)
-		memmove(hdp + bindex, hdp + bindex + 1,
-			sizeof(*hdp) * (bend - bindex));
-	hdp[0 + bend].hd_dentry = NULL;
-	dinfo->di_bend--;
-
-	p = krealloc(hdp, sizeof(*p) * bend, AuGFP_SBILIST);
-	if (p)
-		dinfo->di_hdentry = p;
-	/* harmless error */
-}
-
-static void au_br_do_del_hip(struct au_iinfo *iinfo, const aufs_bindex_t bindex,
-			     const aufs_bindex_t bend)
-{
-	struct au_hinode *hip, *p;
-
-	AuRwMustWriteLock(&iinfo->ii_rwsem);
-
-	hip = iinfo->ii_hinode + bindex;
-	if (bindex < bend)
-		memmove(hip, hip + 1, sizeof(*hip) * (bend - bindex));
-	iinfo->ii_hinode[0 + bend].hi_inode = NULL;
-	au_hn_init(iinfo->ii_hinode + bend);
-	iinfo->ii_bend--;
-
-	p = krealloc(iinfo->ii_hinode, sizeof(*p) * bend, AuGFP_SBILIST);
-	if (p)
-		iinfo->ii_hinode = p;
-	/* harmless error */
-}
-
-static void au_br_do_del(struct super_block *sb, aufs_bindex_t bindex,
-			 struct au_branch *br)
-{
-	aufs_bindex_t bend;
-	struct au_sbinfo *sbinfo;
-	struct dentry *root, *h_root;
-	struct inode *inode, *h_inode;
-	struct au_hinode *hinode;
-
-	SiMustWriteLock(sb);
-
-	root = sb->s_root;
-	inode = d_inode(root);
-	sbinfo = au_sbi(sb);
-	bend = sbinfo->si_bend;
-
-	h_root = au_h_dptr(root, bindex);
-	hinode = au_hi(inode, bindex);
-	h_inode = au_igrab(hinode->hi_inode);
-	au_hiput(hinode);
-
-	au_sbilist_lock();
-	au_br_do_del_brp(sbinfo, bindex, bend);
-	au_br_do_del_hdp(au_di(root), bindex, bend);
-	au_br_do_del_hip(au_ii(inode), bindex, bend);
-	au_sbilist_unlock();
-
-	dput(h_root);
-	iput(h_inode);
-	au_br_do_free(br);
-}
-
-static unsigned long long empty_cb(struct super_block *sb, void *array,
-				   unsigned long long max, void *arg)
-{
-	return max;
-}
-
-int au_br_del(struct super_block *sb, struct au_opt_del *del, int remount)
-{
-	int err, rerr, i;
-	unsigned long long opened;
-	unsigned int mnt_flags;
-	aufs_bindex_t bindex, bend, br_id;
-	unsigned char do_wh, verbose;
-	struct au_branch *br;
-	struct au_wbr *wbr;
-	struct dentry *root;
-	struct file **to_free;
-
-	err = 0;
-	opened = 0;
-	to_free = NULL;
-	root = sb->s_root;
-	bindex = au_find_dbindex(root, del->h_path.dentry);
-	if (bindex < 0) {
-		if (remount)
-			goto out; /* success */
-		err = -ENOENT;
-		pr_err("%s no such branch\n", del->pathname);
-		goto out;
-	}
-	AuDbg("bindex b%d\n", bindex);
-
-	err = -EBUSY;
-	mnt_flags = au_mntflags(sb);
-	verbose = !!au_opt_test(mnt_flags, VERBOSE);
-	bend = au_sbend(sb);
-	if (unlikely(!bend)) {
-		AuVerbose(verbose, "no more branches left\n");
-		goto out;
-	}
-	br = au_sbr(sb, bindex);
-	AuDebugOn(!path_equal(&br->br_path, &del->h_path));
-
-	br_id = br->br_id;
-	opened = atomic_read(&br->br_count);
-	if (unlikely(opened)) {
-		to_free = au_array_alloc(&opened, empty_cb, sb, NULL);
-		err = PTR_ERR(to_free);
-		if (IS_ERR(to_free))
-			goto out;
-
-		err = test_file_busy(sb, br_id, to_free, opened);
-		if (unlikely(err)) {
-			AuVerbose(verbose, "%llu file(s) opened\n", opened);
-			goto out;
-		}
-	}
-
-	wbr = br->br_wbr;
-	do_wh = wbr && (wbr->wbr_whbase || wbr->wbr_plink || wbr->wbr_orph);
-	if (do_wh) {
-		/* instead of WbrWhMustWriteLock(wbr) */
-		SiMustWriteLock(sb);
-		for (i = 0; i < AuBrWh_Last; i++) {
-			dput(wbr->wbr_wh[i]);
-			wbr->wbr_wh[i] = NULL;
-		}
-	}
-
-	err = test_children_busy(root, bindex, verbose);
-	if (unlikely(err)) {
-		if (do_wh)
-			goto out_wh;
-		goto out;
-	}
-
-	err = 0;
-	if (to_free) {
-		/*
-		 * now we confirmed the branch is deletable.
-		 * let's free the remaining opened dirs on the branch.
-		 */
-		di_write_unlock(root);
-		br_del_file(to_free, opened, br_id);
-		di_write_lock_child(root);
-	}
-
-	if (!remount)
-		au_br_do_del(sb, bindex, br);
-	else {
-		sysaufs_brs_del(sb, bindex);
-		au_br_do_del(sb, bindex, br);
-		sysaufs_brs_add(sb, bindex);
-	}
-
-	if (!bindex) {
-		au_cpup_attr_all(d_inode(root), /*force*/1);
-		sb->s_maxbytes = au_sbr_sb(sb, 0)->s_maxbytes;
-	} else
-		au_sub_nlink(d_inode(root), d_inode(del->h_path.dentry));
-	if (au_opt_test(mnt_flags, PLINK))
-		au_plink_half_refresh(sb, br_id);
-
-	if (au_xino_brid(sb) == br_id)
-		au_xino_brid_set(sb, -1);
-	goto out; /* success */
-
-out_wh:
-	/* revert */
-	rerr = au_br_init_wh(sb, br, br->br_perm);
-	if (rerr)
-		pr_warn("failed re-creating base whiteout, %s. (%d)\n",
-			del->pathname, rerr);
-out:
-	if (to_free)
-		au_farray_free(to_free, opened);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static int au_ibusy(struct super_block *sb, struct aufs_ibusy __user *arg)
-{
-	int err;
-	aufs_bindex_t bstart, bend;
-	struct aufs_ibusy ibusy;
-	struct inode *inode, *h_inode;
-
-	err = -EPERM;
-	if (unlikely(!capable(CAP_SYS_ADMIN)))
-		goto out;
-
-	err = copy_from_user(&ibusy, arg, sizeof(ibusy));
-	if (!err)
-		err = !access_ok(VERIFY_WRITE, &arg->h_ino, sizeof(arg->h_ino));
-	if (unlikely(err)) {
-		err = -EFAULT;
-		AuTraceErr(err);
-		goto out;
-	}
-
-	err = -EINVAL;
-	si_read_lock(sb, AuLock_FLUSH);
-	if (unlikely(ibusy.bindex < 0 || ibusy.bindex > au_sbend(sb)))
-		goto out_unlock;
-
-	err = 0;
-	ibusy.h_ino = 0; /* invalid */
-	inode = ilookup(sb, ibusy.ino);
-	if (!inode
-	    || inode->i_ino == AUFS_ROOT_INO
-	    || is_bad_inode(inode))
-		goto out_unlock;
-
-	ii_read_lock_child(inode);
-	bstart = au_ibstart(inode);
-	bend = au_ibend(inode);
-	if (bstart <= ibusy.bindex && ibusy.bindex <= bend) {
-		h_inode = au_h_iptr(inode, ibusy.bindex);
-		if (h_inode && au_test_ibusy(inode, bstart, bend))
-			ibusy.h_ino = h_inode->i_ino;
-	}
-	ii_read_unlock(inode);
-	iput(inode);
-
-out_unlock:
-	si_read_unlock(sb);
-	if (!err) {
-		err = __put_user(ibusy.h_ino, &arg->h_ino);
-		if (unlikely(err)) {
-			err = -EFAULT;
-			AuTraceErr(err);
-		}
-	}
-out:
-	return err;
-}
-
-long au_ibusy_ioctl(struct file *file, unsigned long arg)
-{
-	return au_ibusy(file->f_path.dentry->d_sb, (void __user *)arg);
-}
-
-#ifdef CONFIG_COMPAT
-long au_ibusy_compat_ioctl(struct file *file, unsigned long arg)
-{
-	return au_ibusy(file->f_path.dentry->d_sb, compat_ptr(arg));
-}
-#endif
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * change a branch permission
- */
-
-static void au_warn_ima(void)
-{
-#ifdef CONFIG_IMA
-	/* since it doesn't support mark_files_ro() */
-	AuWarn1("RW -> RO makes IMA to produce wrong message\n");
-#endif
-}
-
-static int do_need_sigen_inc(int a, int b)
-{
-	return au_br_whable(a) && !au_br_whable(b);
-}
-
-static int need_sigen_inc(int old, int new)
-{
-	return do_need_sigen_inc(old, new)
-		|| do_need_sigen_inc(new, old);
-}
-
-static int au_br_mod_files_ro(struct super_block *sb, aufs_bindex_t bindex)
-{
-	int err, do_warn;
-	unsigned int mnt_flags;
-	unsigned long long ull, max;
-	aufs_bindex_t br_id;
-	unsigned char verbose, writer;
-	struct file *file, *hf, **array;
-	struct au_hfile *hfile;
-
-	mnt_flags = au_mntflags(sb);
-	verbose = !!au_opt_test(mnt_flags, VERBOSE);
-
-	array = au_farray_alloc(sb, &max);
-	err = PTR_ERR(array);
-	if (IS_ERR(array))
-		goto out;
-
-	do_warn = 0;
-	br_id = au_sbr_id(sb, bindex);
-	for (ull = 0; ull < max; ull++) {
-		file = array[ull];
-		if (unlikely(!file))
-			break;
-
-		/* AuDbg("%pD\n", file); */
-		fi_read_lock(file);
-		if (unlikely(au_test_mmapped(file))) {
-			err = -EBUSY;
-			AuVerbose(verbose, "mmapped %pD\n", file);
-			AuDbgFile(file);
-			FiMustNoWaiters(file);
-			fi_read_unlock(file);
-			goto out_array;
-		}
-
-		hfile = &au_fi(file)->fi_htop;
-		hf = hfile->hf_file;
-		if (!d_is_reg(file->f_path.dentry)
-		    || !(file->f_mode & FMODE_WRITE)
-		    || hfile->hf_br->br_id != br_id
-		    || !(hf->f_mode & FMODE_WRITE))
-			array[ull] = NULL;
-		else {
-			do_warn = 1;
-			get_file(file);
-		}
-
-		FiMustNoWaiters(file);
-		fi_read_unlock(file);
-		fput(file);
-	}
-
-	err = 0;
-	if (do_warn)
-		au_warn_ima();
-
-	for (ull = 0; ull < max; ull++) {
-		file = array[ull];
-		if (!file)
-			continue;
-
-		/* todo: already flushed? */
-		/*
-		 * fs/super.c:mark_files_ro() is gone, but aufs keeps its
-		 * approach which resets f_mode and calls mnt_drop_write() and
-		 * file_release_write() for each file, because the branch
-		 * attribute in aufs world is totally different from the native
-		 * fs rw/ro mode.
-		*/
-		/* fi_read_lock(file); */
-		hfile = &au_fi(file)->fi_htop;
-		hf = hfile->hf_file;
-		/* fi_read_unlock(file); */
-		spin_lock(&hf->f_lock);
-		writer = !!(hf->f_mode & FMODE_WRITER);
-		hf->f_mode &= ~(FMODE_WRITE | FMODE_WRITER);
-		spin_unlock(&hf->f_lock);
-		if (writer) {
-			put_write_access(file_inode(hf));
-			__mnt_drop_write(hf->f_path.mnt);
-		}
-	}
-
-out_array:
-	au_farray_free(array, max);
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-int au_br_mod(struct super_block *sb, struct au_opt_mod *mod, int remount,
-	      int *do_refresh)
-{
-	int err, rerr;
-	aufs_bindex_t bindex;
-	struct dentry *root;
-	struct au_branch *br;
-	struct au_br_fhsm *bf;
-
-	root = sb->s_root;
-	bindex = au_find_dbindex(root, mod->h_root);
-	if (bindex < 0) {
-		if (remount)
-			return 0; /* success */
-		err = -ENOENT;
-		pr_err("%s no such branch\n", mod->path);
-		goto out;
-	}
-	AuDbg("bindex b%d\n", bindex);
-
-	err = test_br(d_inode(mod->h_root), mod->perm, mod->path);
-	if (unlikely(err))
-		goto out;
-
-	br = au_sbr(sb, bindex);
-	AuDebugOn(mod->h_root != au_br_dentry(br));
-	if (br->br_perm == mod->perm)
-		return 0; /* success */
-
-	/* pre-allocate for non-fhsm --> fhsm */
-	bf = NULL;
-	if (!au_br_fhsm(br->br_perm) && au_br_fhsm(mod->perm)) {
-		err = au_fhsm_br_alloc(br);
-		if (unlikely(err))
-			goto out;
-		bf = br->br_fhsm;
-		br->br_fhsm = NULL;
-	}
-
-	if (au_br_writable(br->br_perm)) {
-		/* remove whiteout base */
-		err = au_br_init_wh(sb, br, mod->perm);
-		if (unlikely(err))
-			goto out_bf;
-
-		if (!au_br_writable(mod->perm)) {
-			/* rw --> ro, file might be mmapped */
-			DiMustNoWaiters(root);
-			IiMustNoWaiters(d_inode(root));
-			di_write_unlock(root);
-			err = au_br_mod_files_ro(sb, bindex);
-			/* aufs_write_lock() calls ..._child() */
-			di_write_lock_child(root);
-
-			if (unlikely(err)) {
-				rerr = -ENOMEM;
-				br->br_wbr = kzalloc(sizeof(*br->br_wbr),
-						     GFP_NOFS);
-				if (br->br_wbr)
-					rerr = au_wbr_init(br, sb, br->br_perm);
-				if (unlikely(rerr)) {
-					AuIOErr("nested error %d (%d)\n",
-						rerr, err);
-					br->br_perm = mod->perm;
-				}
-			}
-		}
-	} else if (au_br_writable(mod->perm)) {
-		/* ro --> rw */
-		err = -ENOMEM;
-		br->br_wbr = kzalloc(sizeof(*br->br_wbr), GFP_NOFS);
-		if (br->br_wbr) {
-			err = au_wbr_init(br, sb, mod->perm);
-			if (unlikely(err)) {
-				kfree(br->br_wbr);
-				br->br_wbr = NULL;
-			}
-		}
-	}
-	if (unlikely(err))
-		goto out_bf;
-
-	if (au_br_fhsm(br->br_perm)) {
-		if (!au_br_fhsm(mod->perm)) {
-			/* fhsm --> non-fhsm */
-			au_br_fhsm_fin(br->br_fhsm);
-			kfree(br->br_fhsm);
-			br->br_fhsm = NULL;
-		}
-	} else if (au_br_fhsm(mod->perm))
-		/* non-fhsm --> fhsm */
-		br->br_fhsm = bf;
-
-	*do_refresh |= need_sigen_inc(br->br_perm, mod->perm);
-	br->br_perm = mod->perm;
-	goto out; /* success */
-
-out_bf:
-	kfree(bf);
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int au_br_stfs(struct au_branch *br, struct aufs_stfs *stfs)
-{
-	int err;
-	struct kstatfs kstfs;
-
-	err = vfs_statfs(&br->br_path, &kstfs);
-	if (!err) {
-		stfs->f_blocks = kstfs.f_blocks;
-		stfs->f_bavail = kstfs.f_bavail;
-		stfs->f_files = kstfs.f_files;
-		stfs->f_ffree = kstfs.f_ffree;
-	}
-
-	return err;
-}
diff --git a/fs/aufs/branch.h b/fs/aufs/branch.h
deleted file mode 100644
index 4c52ae166..000000000
--- a/fs/aufs/branch.h
+++ /dev/null
@@ -1,266 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * branch filesystems and xino for them
- */
-
-#ifndef __AUFS_BRANCH_H__
-#define __AUFS_BRANCH_H__
-
-#ifdef __KERNEL__
-
-#include <linux/mount.h>
-#include "dynop.h"
-#include "rwsem.h"
-#include "super.h"
-
-/* ---------------------------------------------------------------------- */
-
-/* a xino file */
-struct au_xino_file {
-	struct file		*xi_file;
-	struct mutex		xi_nondir_mtx;
-
-	/* todo: make xino files an array to support huge inode number */
-
-#ifdef CONFIG_DEBUG_FS
-	struct dentry		 *xi_dbgaufs;
-#endif
-};
-
-/* File-based Hierarchical Storage Management */
-struct au_br_fhsm {
-#ifdef CONFIG_AUFS_FHSM
-	struct mutex		bf_lock;
-	unsigned long		bf_jiffy;
-	struct aufs_stfs	bf_stfs;
-	int			bf_readable;
-#endif
-};
-
-/* members for writable branch only */
-enum {AuBrWh_BASE, AuBrWh_PLINK, AuBrWh_ORPH, AuBrWh_Last};
-struct au_wbr {
-	struct au_rwsem		wbr_wh_rwsem;
-	struct dentry		*wbr_wh[AuBrWh_Last];
-	atomic_t		wbr_wh_running;
-#define wbr_whbase		wbr_wh[AuBrWh_BASE]	/* whiteout base */
-#define wbr_plink		wbr_wh[AuBrWh_PLINK]	/* pseudo-link dir */
-#define wbr_orph		wbr_wh[AuBrWh_ORPH]	/* dir for orphans */
-
-	/* mfs mode */
-	unsigned long long	wbr_bytes;
-};
-
-/* ext2 has 3 types of operations at least, ext3 has 4 */
-#define AuBrDynOp (AuDyLast * 4)
-
-#ifdef CONFIG_AUFS_HFSNOTIFY
-/* support for asynchronous destruction */
-struct au_br_hfsnotify {
-	struct fsnotify_group	*hfsn_group;
-};
-#endif
-
-/* sysfs entries */
-struct au_brsysfs {
-	char			name[16];
-	struct attribute	attr;
-};
-
-enum {
-	AuBrSysfs_BR,
-	AuBrSysfs_BRID,
-	AuBrSysfs_Last
-};
-
-/* protected by superblock rwsem */
-struct au_branch {
-	struct au_xino_file	br_xino;
-
-	aufs_bindex_t		br_id;
-
-	int			br_perm;
-	struct path		br_path;
-	spinlock_t		br_dykey_lock;
-	struct au_dykey		*br_dykey[AuBrDynOp];
-	atomic_t		br_count;
-
-	struct au_wbr		*br_wbr;
-	struct au_br_fhsm	*br_fhsm;
-
-	/* xino truncation */
-	atomic_t		br_xino_running;
-
-#ifdef CONFIG_AUFS_HFSNOTIFY
-	struct au_br_hfsnotify	*br_hfsn;
-#endif
-
-#ifdef CONFIG_SYSFS
-	/* entries under sysfs per mount-point */
-	struct au_brsysfs	br_sysfs[AuBrSysfs_Last];
-#endif
-};
-
-/* ---------------------------------------------------------------------- */
-
-static inline struct vfsmount *au_br_mnt(struct au_branch *br)
-{
-	return br->br_path.mnt;
-}
-
-static inline struct dentry *au_br_dentry(struct au_branch *br)
-{
-	return br->br_path.dentry;
-}
-
-static inline struct super_block *au_br_sb(struct au_branch *br)
-{
-	return au_br_mnt(br)->mnt_sb;
-}
-
-static inline int au_br_rdonly(struct au_branch *br)
-{
-	return ((au_br_sb(br)->s_flags & MS_RDONLY)
-		|| !au_br_writable(br->br_perm))
-		? -EROFS : 0;
-}
-
-static inline int au_br_hnotifyable(int brperm __maybe_unused)
-{
-#ifdef CONFIG_AUFS_HNOTIFY
-	return !(brperm & AuBrPerm_RR);
-#else
-	return 0;
-#endif
-}
-
-static inline int au_br_test_oflag(int oflag, struct au_branch *br)
-{
-	int err, exec_flag;
-
-	err = 0;
-	exec_flag = oflag & __FMODE_EXEC;
-	if (unlikely(exec_flag && (au_br_mnt(br)->mnt_flags & MNT_NOEXEC)))
-		err = -EACCES;
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* branch.c */
-struct au_sbinfo;
-void au_br_free(struct au_sbinfo *sinfo);
-int au_br_index(struct super_block *sb, aufs_bindex_t br_id);
-struct au_opt_add;
-int au_br_add(struct super_block *sb, struct au_opt_add *add, int remount);
-struct au_opt_del;
-int au_br_del(struct super_block *sb, struct au_opt_del *del, int remount);
-long au_ibusy_ioctl(struct file *file, unsigned long arg);
-#ifdef CONFIG_COMPAT
-long au_ibusy_compat_ioctl(struct file *file, unsigned long arg);
-#endif
-struct au_opt_mod;
-int au_br_mod(struct super_block *sb, struct au_opt_mod *mod, int remount,
-	      int *do_refresh);
-struct aufs_stfs;
-int au_br_stfs(struct au_branch *br, struct aufs_stfs *stfs);
-
-/* xino.c */
-static const loff_t au_loff_max = LLONG_MAX;
-
-int au_xib_trunc(struct super_block *sb);
-ssize_t xino_fread(vfs_readf_t func, struct file *file, void *buf, size_t size,
-		   loff_t *pos);
-ssize_t xino_fwrite(vfs_writef_t func, struct file *file, void *buf,
-		    size_t size, loff_t *pos);
-struct file *au_xino_create2(struct file *base_file, struct file *copy_src);
-struct file *au_xino_create(struct super_block *sb, char *fname, int silent);
-ino_t au_xino_new_ino(struct super_block *sb);
-void au_xino_delete_inode(struct inode *inode, const int unlinked);
-int au_xino_write(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino,
-		  ino_t ino);
-int au_xino_read(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino,
-		 ino_t *ino);
-int au_xino_br(struct super_block *sb, struct au_branch *br, ino_t hino,
-	       struct file *base_file, int do_test);
-int au_xino_trunc(struct super_block *sb, aufs_bindex_t bindex);
-
-struct au_opt_xino;
-int au_xino_set(struct super_block *sb, struct au_opt_xino *xino, int remount);
-void au_xino_clr(struct super_block *sb);
-struct file *au_xino_def(struct super_block *sb);
-int au_xino_path(struct seq_file *seq, struct file *file);
-
-/* ---------------------------------------------------------------------- */
-
-/* Superblock to branch */
-static inline
-aufs_bindex_t au_sbr_id(struct super_block *sb, aufs_bindex_t bindex)
-{
-	return au_sbr(sb, bindex)->br_id;
-}
-
-static inline
-struct vfsmount *au_sbr_mnt(struct super_block *sb, aufs_bindex_t bindex)
-{
-	return au_br_mnt(au_sbr(sb, bindex));
-}
-
-static inline
-struct super_block *au_sbr_sb(struct super_block *sb, aufs_bindex_t bindex)
-{
-	return au_br_sb(au_sbr(sb, bindex));
-}
-
-static inline void au_sbr_put(struct super_block *sb, aufs_bindex_t bindex)
-{
-	atomic_dec(&au_sbr(sb, bindex)->br_count);
-}
-
-static inline int au_sbr_perm(struct super_block *sb, aufs_bindex_t bindex)
-{
-	return au_sbr(sb, bindex)->br_perm;
-}
-
-static inline int au_sbr_whable(struct super_block *sb, aufs_bindex_t bindex)
-{
-	return au_br_whable(au_sbr_perm(sb, bindex));
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * wbr_wh_read_lock, wbr_wh_write_lock
- * wbr_wh_read_unlock, wbr_wh_write_unlock, wbr_wh_downgrade_lock
- */
-AuSimpleRwsemFuncs(wbr_wh, struct au_wbr *wbr, &wbr->wbr_wh_rwsem);
-
-#define WbrWhMustNoWaiters(wbr)	AuRwMustNoWaiters(&wbr->wbr_wh_rwsem)
-#define WbrWhMustAnyLock(wbr)	AuRwMustAnyLock(&wbr->wbr_wh_rwsem)
-#define WbrWhMustWriteLock(wbr)	AuRwMustWriteLock(&wbr->wbr_wh_rwsem)
-
-/* ---------------------------------------------------------------------- */
-
-#ifdef CONFIG_AUFS_FHSM
-static inline void au_br_fhsm_init(struct au_br_fhsm *brfhsm)
-{
-	mutex_init(&brfhsm->bf_lock);
-	brfhsm->bf_jiffy = 0;
-	brfhsm->bf_readable = 0;
-}
-
-static inline void au_br_fhsm_fin(struct au_br_fhsm *brfhsm)
-{
-	mutex_destroy(&brfhsm->bf_lock);
-}
-#else
-AuStubVoid(au_br_fhsm_init, struct au_br_fhsm *brfhsm)
-AuStubVoid(au_br_fhsm_fin, struct au_br_fhsm *brfhsm)
-#endif
-
-#endif /* __KERNEL__ */
-#endif /* __AUFS_BRANCH_H__ */
diff --git a/fs/aufs/cpup.c b/fs/aufs/cpup.c
deleted file mode 100644
index cadb3adb7..000000000
--- a/fs/aufs/cpup.c
+++ /dev/null
@@ -1,1366 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * copy-up functions, see wbr_policy.c for copy-down
- */
-
-#include <linux/fs_stack.h>
-#include <linux/mm.h>
-#include <linux/task_work.h>
-#include "aufs.h"
-
-void au_cpup_attr_flags(struct inode *dst, unsigned int iflags)
-{
-	const unsigned int mask = S_DEAD | S_SWAPFILE | S_PRIVATE
-		| S_NOATIME | S_NOCMTIME | S_AUTOMOUNT;
-
-	BUILD_BUG_ON(sizeof(iflags) != sizeof(dst->i_flags));
-
-	dst->i_flags |= iflags & ~mask;
-	if (au_test_fs_notime(dst->i_sb))
-		dst->i_flags |= S_NOATIME | S_NOCMTIME;
-}
-
-void au_cpup_attr_timesizes(struct inode *inode)
-{
-	struct inode *h_inode;
-
-	h_inode = au_h_iptr(inode, au_ibstart(inode));
-	fsstack_copy_attr_times(inode, h_inode);
-	fsstack_copy_inode_size(inode, h_inode);
-}
-
-void au_cpup_attr_nlink(struct inode *inode, int force)
-{
-	struct inode *h_inode;
-	struct super_block *sb;
-	aufs_bindex_t bindex, bend;
-
-	sb = inode->i_sb;
-	bindex = au_ibstart(inode);
-	h_inode = au_h_iptr(inode, bindex);
-	if (!force
-	    && !S_ISDIR(h_inode->i_mode)
-	    && au_opt_test(au_mntflags(sb), PLINK)
-	    && au_plink_test(inode))
-		return;
-
-	/*
-	 * 0 can happen in revalidating.
-	 * h_inode->i_mutex may not be held here, but it is harmless since once
-	 * i_nlink reaches 0, it will never become positive except O_TMPFILE
-	 * case.
-	 * todo: O_TMPFILE+linkat(AT_SYMLINK_FOLLOW) bypassing aufs may cause
-	 *	 the incorrect link count.
-	 */
-	set_nlink(inode, h_inode->i_nlink);
-
-	/*
-	 * fewer nlink makes find(1) noisy, but larger nlink doesn't.
-	 * it may includes whplink directory.
-	 */
-	if (S_ISDIR(h_inode->i_mode)) {
-		bend = au_ibend(inode);
-		for (bindex++; bindex <= bend; bindex++) {
-			h_inode = au_h_iptr(inode, bindex);
-			if (h_inode)
-				au_add_nlink(inode, h_inode);
-		}
-	}
-}
-
-void au_cpup_attr_changeable(struct inode *inode)
-{
-	struct inode *h_inode;
-
-	h_inode = au_h_iptr(inode, au_ibstart(inode));
-	inode->i_mode = h_inode->i_mode;
-	inode->i_uid = h_inode->i_uid;
-	inode->i_gid = h_inode->i_gid;
-	au_cpup_attr_timesizes(inode);
-	au_cpup_attr_flags(inode, h_inode->i_flags);
-}
-
-void au_cpup_igen(struct inode *inode, struct inode *h_inode)
-{
-	struct au_iinfo *iinfo = au_ii(inode);
-
-	IiMustWriteLock(inode);
-
-	iinfo->ii_higen = h_inode->i_generation;
-	iinfo->ii_hsb1 = h_inode->i_sb;
-}
-
-void au_cpup_attr_all(struct inode *inode, int force)
-{
-	struct inode *h_inode;
-
-	h_inode = au_h_iptr(inode, au_ibstart(inode));
-	au_cpup_attr_changeable(inode);
-	if (inode->i_nlink > 0)
-		au_cpup_attr_nlink(inode, force);
-	inode->i_rdev = h_inode->i_rdev;
-	inode->i_blkbits = h_inode->i_blkbits;
-	au_cpup_igen(inode, h_inode);
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* Note: dt_dentry and dt_h_dentry are not dget/dput-ed */
-
-/* keep the timestamps of the parent dir when cpup */
-void au_dtime_store(struct au_dtime *dt, struct dentry *dentry,
-		    struct path *h_path)
-{
-	struct inode *h_inode;
-
-	dt->dt_dentry = dentry;
-	dt->dt_h_path = *h_path;
-	h_inode = d_inode(h_path->dentry);
-	dt->dt_atime = h_inode->i_atime;
-	dt->dt_mtime = h_inode->i_mtime;
-	/* smp_mb(); */
-}
-
-void au_dtime_revert(struct au_dtime *dt)
-{
-	struct iattr attr;
-	int err;
-
-	attr.ia_atime = dt->dt_atime;
-	attr.ia_mtime = dt->dt_mtime;
-	attr.ia_valid = ATTR_FORCE | ATTR_MTIME | ATTR_MTIME_SET
-		| ATTR_ATIME | ATTR_ATIME_SET;
-
-	/* no delegation since this is a directory */
-	err = vfsub_notify_change(&dt->dt_h_path, &attr, /*delegated*/NULL);
-	if (unlikely(err))
-		pr_warn("restoring timestamps failed(%d). ignored\n", err);
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* internal use only */
-struct au_cpup_reg_attr {
-	int		valid;
-	struct kstat	st;
-	unsigned int	iflags; /* inode->i_flags */
-};
-
-static noinline_for_stack
-int cpup_iattr(struct dentry *dst, aufs_bindex_t bindex, struct dentry *h_src,
-	       struct au_cpup_reg_attr *h_src_attr)
-{
-	int err, sbits, icex;
-	unsigned int mnt_flags;
-	unsigned char verbose;
-	struct iattr ia;
-	struct path h_path;
-	struct inode *h_isrc, *h_idst;
-	struct kstat *h_st;
-	struct au_branch *br;
-
-	h_path.dentry = au_h_dptr(dst, bindex);
-	h_idst = d_inode(h_path.dentry);
-	br = au_sbr(dst->d_sb, bindex);
-	h_path.mnt = au_br_mnt(br);
-	h_isrc = d_inode(h_src);
-	ia.ia_valid = ATTR_FORCE | ATTR_UID | ATTR_GID
-		| ATTR_ATIME | ATTR_MTIME
-		| ATTR_ATIME_SET | ATTR_MTIME_SET;
-	if (h_src_attr && h_src_attr->valid) {
-		h_st = &h_src_attr->st;
-		ia.ia_uid = h_st->uid;
-		ia.ia_gid = h_st->gid;
-		ia.ia_atime = h_st->atime;
-		ia.ia_mtime = h_st->mtime;
-		if (h_idst->i_mode != h_st->mode
-		    && !S_ISLNK(h_idst->i_mode)) {
-			ia.ia_valid |= ATTR_MODE;
-			ia.ia_mode = h_st->mode;
-		}
-		sbits = !!(h_st->mode & (S_ISUID | S_ISGID));
-		au_cpup_attr_flags(h_idst, h_src_attr->iflags);
-	} else {
-		ia.ia_uid = h_isrc->i_uid;
-		ia.ia_gid = h_isrc->i_gid;
-		ia.ia_atime = h_isrc->i_atime;
-		ia.ia_mtime = h_isrc->i_mtime;
-		if (h_idst->i_mode != h_isrc->i_mode
-		    && !S_ISLNK(h_idst->i_mode)) {
-			ia.ia_valid |= ATTR_MODE;
-			ia.ia_mode = h_isrc->i_mode;
-		}
-		sbits = !!(h_isrc->i_mode & (S_ISUID | S_ISGID));
-		au_cpup_attr_flags(h_idst, h_isrc->i_flags);
-	}
-	/* no delegation since it is just created */
-	err = vfsub_notify_change(&h_path, &ia, /*delegated*/NULL);
-
-	/* is this nfs only? */
-	if (!err && sbits && au_test_nfs(h_path.dentry->d_sb)) {
-		ia.ia_valid = ATTR_FORCE | ATTR_MODE;
-		ia.ia_mode = h_isrc->i_mode;
-		err = vfsub_notify_change(&h_path, &ia, /*delegated*/NULL);
-	}
-
-	icex = br->br_perm & AuBrAttr_ICEX;
-	if (!err) {
-		mnt_flags = au_mntflags(dst->d_sb);
-		verbose = !!au_opt_test(mnt_flags, VERBOSE);
-		err = au_cpup_xattr(h_path.dentry, h_src, icex, verbose);
-	}
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static int au_do_copy_file(struct file *dst, struct file *src, loff_t len,
-			   char *buf, unsigned long blksize)
-{
-	int err;
-	size_t sz, rbytes, wbytes;
-	unsigned char all_zero;
-	char *p, *zp;
-	struct mutex *h_mtx;
-	/* reduce stack usage */
-	struct iattr *ia;
-
-	zp = page_address(ZERO_PAGE(0));
-	if (unlikely(!zp))
-		return -ENOMEM; /* possible? */
-
-	err = 0;
-	all_zero = 0;
-	while (len) {
-		AuDbg("len %lld\n", len);
-		sz = blksize;
-		if (len < blksize)
-			sz = len;
-
-		rbytes = 0;
-		/* todo: signal_pending? */
-		while (!rbytes || err == -EAGAIN || err == -EINTR) {
-			rbytes = vfsub_read_k(src, buf, sz, &src->f_pos);
-			err = rbytes;
-		}
-		if (unlikely(err < 0))
-			break;
-
-		all_zero = 0;
-		if (len >= rbytes && rbytes == blksize)
-			all_zero = !memcmp(buf, zp, rbytes);
-		if (!all_zero) {
-			wbytes = rbytes;
-			p = buf;
-			while (wbytes) {
-				size_t b;
-
-				b = vfsub_write_k(dst, p, wbytes, &dst->f_pos);
-				err = b;
-				/* todo: signal_pending? */
-				if (unlikely(err == -EAGAIN || err == -EINTR))
-					continue;
-				if (unlikely(err < 0))
-					break;
-				wbytes -= b;
-				p += b;
-			}
-			if (unlikely(err < 0))
-				break;
-		} else {
-			loff_t res;
-
-			AuLabel(hole);
-			res = vfsub_llseek(dst, rbytes, SEEK_CUR);
-			err = res;
-			if (unlikely(res < 0))
-				break;
-		}
-		len -= rbytes;
-		err = 0;
-	}
-
-	/* the last block may be a hole */
-	if (!err && all_zero) {
-		AuLabel(last hole);
-
-		err = 1;
-		if (au_test_nfs(dst->f_path.dentry->d_sb)) {
-			/* nfs requires this step to make last hole */
-			/* is this only nfs? */
-			do {
-				/* todo: signal_pending? */
-				err = vfsub_write_k(dst, "\0", 1, &dst->f_pos);
-			} while (err == -EAGAIN || err == -EINTR);
-			if (err == 1)
-				dst->f_pos--;
-		}
-
-		if (err == 1) {
-			ia = (void *)buf;
-			ia->ia_size = dst->f_pos;
-			ia->ia_valid = ATTR_SIZE | ATTR_FILE;
-			ia->ia_file = dst;
-			h_mtx = &file_inode(dst)->i_mutex;
-			mutex_lock_nested(h_mtx, AuLsc_I_CHILD2);
-			/* no delegation since it is just created */
-			err = vfsub_notify_change(&dst->f_path, ia,
-						  /*delegated*/NULL);
-			mutex_unlock(h_mtx);
-		}
-	}
-
-	return err;
-}
-
-int au_copy_file(struct file *dst, struct file *src, loff_t len)
-{
-	int err;
-	unsigned long blksize;
-	unsigned char do_kfree;
-	char *buf;
-
-	err = -ENOMEM;
-	blksize = dst->f_path.dentry->d_sb->s_blocksize;
-	if (!blksize || PAGE_SIZE < blksize)
-		blksize = PAGE_SIZE;
-	AuDbg("blksize %lu\n", blksize);
-	do_kfree = (blksize != PAGE_SIZE && blksize >= sizeof(struct iattr *));
-	if (do_kfree)
-		buf = kmalloc(blksize, GFP_NOFS);
-	else
-		buf = (void *)__get_free_page(GFP_NOFS);
-	if (unlikely(!buf))
-		goto out;
-
-	if (len > (1 << 22))
-		AuDbg("copying a large file %lld\n", (long long)len);
-
-	src->f_pos = 0;
-	dst->f_pos = 0;
-	err = au_do_copy_file(dst, src, len, buf, blksize);
-	if (do_kfree)
-		kfree(buf);
-	else
-		free_page((unsigned long)buf);
-
-out:
-	return err;
-}
-
-/*
- * to support a sparse file which is opened with O_APPEND,
- * we need to close the file.
- */
-static int au_cp_regular(struct au_cp_generic *cpg)
-{
-	int err, i;
-	enum { SRC, DST };
-	struct {
-		aufs_bindex_t bindex;
-		unsigned int flags;
-		struct dentry *dentry;
-		int force_wr;
-		struct file *file;
-		void *label;
-	} *f, file[] = {
-		{
-			.bindex = cpg->bsrc,
-			.flags = O_RDONLY | O_NOATIME | O_LARGEFILE,
-			.label = &&out
-		},
-		{
-			.bindex = cpg->bdst,
-			.flags = O_WRONLY | O_NOATIME | O_LARGEFILE,
-			.force_wr = !!au_ftest_cpup(cpg->flags, RWDST),
-			.label = &&out_src
-		}
-	};
-	struct super_block *sb;
-	struct task_struct *tsk = current;
-
-	/* bsrc branch can be ro/rw. */
-	sb = cpg->dentry->d_sb;
-	f = file;
-	for (i = 0; i < 2; i++, f++) {
-		f->dentry = au_h_dptr(cpg->dentry, f->bindex);
-		f->file = au_h_open(cpg->dentry, f->bindex, f->flags,
-				    /*file*/NULL, f->force_wr);
-		err = PTR_ERR(f->file);
-		if (IS_ERR(f->file))
-			goto *f->label;
-	}
-
-	/* try stopping to update while we copyup */
-	IMustLock(d_inode(file[SRC].dentry));
-	err = au_copy_file(file[DST].file, file[SRC].file, cpg->len);
-
-	/* i wonder if we had O_NO_DELAY_FPUT flag */
-	if (tsk->flags & PF_KTHREAD)
-		__fput_sync(file[DST].file);
-	else {
-		WARN(1, "%pD\nPlease report this warning to aufs-users ML",
-		     file[DST].file);
-		fput(file[DST].file);
-		/*
-		 * too bad.
-		 * we have to call both since we don't know which place the file
-		 * was added to.
-		 */
-		task_work_run();
-		flush_delayed_fput();
-	}
-	au_sbr_put(sb, file[DST].bindex);
-
-out_src:
-	fput(file[SRC].file);
-	au_sbr_put(sb, file[SRC].bindex);
-out:
-	return err;
-}
-
-static int au_do_cpup_regular(struct au_cp_generic *cpg,
-			      struct au_cpup_reg_attr *h_src_attr)
-{
-	int err, rerr;
-	loff_t l;
-	struct path h_path;
-	struct inode *h_src_inode, *h_dst_inode;
-
-	err = 0;
-	h_src_inode = au_h_iptr(d_inode(cpg->dentry), cpg->bsrc);
-	l = i_size_read(h_src_inode);
-	if (cpg->len == -1 || l < cpg->len)
-		cpg->len = l;
-	if (cpg->len) {
-		/* try stopping to update while we are referencing */
-		mutex_lock_nested(&h_src_inode->i_mutex, AuLsc_I_CHILD);
-		au_pin_hdir_unlock(cpg->pin);
-
-		h_path.dentry = au_h_dptr(cpg->dentry, cpg->bsrc);
-		h_path.mnt = au_sbr_mnt(cpg->dentry->d_sb, cpg->bsrc);
-		h_src_attr->iflags = h_src_inode->i_flags;
-		if (!au_test_nfs(h_src_inode->i_sb))
-			err = vfs_getattr(&h_path, &h_src_attr->st);
-		else {
-			mutex_unlock(&h_src_inode->i_mutex);
-			err = vfs_getattr(&h_path, &h_src_attr->st);
-			mutex_lock_nested(&h_src_inode->i_mutex, AuLsc_I_CHILD);
-		}
-		if (unlikely(err)) {
-			mutex_unlock(&h_src_inode->i_mutex);
-			goto out;
-		}
-		h_src_attr->valid = 1;
-		err = au_cp_regular(cpg);
-		mutex_unlock(&h_src_inode->i_mutex);
-		rerr = au_pin_hdir_relock(cpg->pin);
-		if (!err && rerr)
-			err = rerr;
-	}
-	if (!err && (h_src_inode->i_state & I_LINKABLE)) {
-		h_path.dentry = au_h_dptr(cpg->dentry, cpg->bdst);
-		h_dst_inode = d_inode(h_path.dentry);
-		spin_lock(&h_dst_inode->i_lock);
-		h_dst_inode->i_state |= I_LINKABLE;
-		spin_unlock(&h_dst_inode->i_lock);
-	}
-
-out:
-	return err;
-}
-
-static int au_do_cpup_symlink(struct path *h_path, struct dentry *h_src,
-			      struct inode *h_dir)
-{
-	int err, symlen;
-	mm_segment_t old_fs;
-	union {
-		char *k;
-		char __user *u;
-	} sym;
-	struct inode *h_inode = d_inode(h_src);
-	const struct inode_operations *h_iop = h_inode->i_op;
-
-	err = -ENOSYS;
-	if (unlikely(!h_iop->readlink))
-		goto out;
-
-	err = -ENOMEM;
-	sym.k = (void *)__get_free_page(GFP_NOFS);
-	if (unlikely(!sym.k))
-		goto out;
-
-	/* unnecessary to support mmap_sem since symlink is not mmap-able */
-	old_fs = get_fs();
-	set_fs(KERNEL_DS);
-	symlen = h_iop->readlink(h_src, sym.u, PATH_MAX);
-	err = symlen;
-	set_fs(old_fs);
-
-	if (symlen > 0) {
-		sym.k[symlen] = 0;
-		err = vfsub_symlink(h_dir, h_path, sym.k);
-	}
-	free_page((unsigned long)sym.k);
-
-out:
-	return err;
-}
-
-/*
- * regardless 'acl' option, reset all ACL.
- * All ACL will be copied up later from the original entry on the lower branch.
- */
-static int au_reset_acl(struct inode *h_dir, struct path *h_path, umode_t mode)
-{
-	int err;
-	struct dentry *h_dentry;
-	struct inode *h_inode;
-
-	h_dentry = h_path->dentry;
-	h_inode = d_inode(h_dentry);
-	/* forget_all_cached_acls(h_inode)); */
-	err = vfsub_removexattr(h_dentry, XATTR_NAME_POSIX_ACL_ACCESS);
-	AuTraceErr(err);
-	if (err == -EOPNOTSUPP)
-		err = 0;
-	if (!err)
-		err = vfsub_acl_chmod(h_inode, mode);
-
-	AuTraceErr(err);
-	return err;
-}
-
-static int au_do_cpup_dir(struct au_cp_generic *cpg, struct dentry *dst_parent,
-			  struct inode *h_dir, struct path *h_path)
-{
-	int err;
-	struct inode *dir, *inode;
-
-	err = vfsub_removexattr(h_path->dentry, XATTR_NAME_POSIX_ACL_DEFAULT);
-	AuTraceErr(err);
-	if (err == -EOPNOTSUPP)
-		err = 0;
-	if (unlikely(err))
-		goto out;
-
-	/*
-	 * strange behaviour from the users view,
-	 * particularry setattr case
-	 */
-	dir = d_inode(dst_parent);
-	if (au_ibstart(dir) == cpg->bdst)
-		au_cpup_attr_nlink(dir, /*force*/1);
-	inode = d_inode(cpg->dentry);
-	au_cpup_attr_nlink(inode, /*force*/1);
-
-out:
-	return err;
-}
-
-static noinline_for_stack
-int cpup_entry(struct au_cp_generic *cpg, struct dentry *dst_parent,
-	       struct au_cpup_reg_attr *h_src_attr)
-{
-	int err;
-	umode_t mode;
-	unsigned int mnt_flags;
-	unsigned char isdir, isreg, force;
-	const unsigned char do_dt = !!au_ftest_cpup(cpg->flags, DTIME);
-	struct au_dtime dt;
-	struct path h_path;
-	struct dentry *h_src, *h_dst, *h_parent;
-	struct inode *h_inode, *h_dir;
-	struct super_block *sb;
-
-	/* bsrc branch can be ro/rw. */
-	h_src = au_h_dptr(cpg->dentry, cpg->bsrc);
-	h_inode = d_inode(h_src);
-	AuDebugOn(h_inode != au_h_iptr(d_inode(cpg->dentry), cpg->bsrc));
-
-	/* try stopping to be referenced while we are creating */
-	h_dst = au_h_dptr(cpg->dentry, cpg->bdst);
-	if (au_ftest_cpup(cpg->flags, RENAME))
-		AuDebugOn(strncmp(h_dst->d_name.name, AUFS_WH_PFX,
-				  AUFS_WH_PFX_LEN));
-	h_parent = h_dst->d_parent; /* dir inode is locked */
-	h_dir = d_inode(h_parent);
-	IMustLock(h_dir);
-	AuDebugOn(h_parent != h_dst->d_parent);
-
-	sb = cpg->dentry->d_sb;
-	h_path.mnt = au_sbr_mnt(sb, cpg->bdst);
-	if (do_dt) {
-		h_path.dentry = h_parent;
-		au_dtime_store(&dt, dst_parent, &h_path);
-	}
-	h_path.dentry = h_dst;
-
-	isreg = 0;
-	isdir = 0;
-	mode = h_inode->i_mode;
-	switch (mode & S_IFMT) {
-	case S_IFREG:
-		isreg = 1;
-		err = vfsub_create(h_dir, &h_path, S_IRUSR | S_IWUSR,
-				   /*want_excl*/true);
-		if (!err)
-			err = au_do_cpup_regular(cpg, h_src_attr);
-		break;
-	case S_IFDIR:
-		isdir = 1;
-		err = vfsub_mkdir(h_dir, &h_path, mode);
-		if (!err)
-			err = au_do_cpup_dir(cpg, dst_parent, h_dir, &h_path);
-		break;
-	case S_IFLNK:
-		err = au_do_cpup_symlink(&h_path, h_src, h_dir);
-		break;
-	case S_IFCHR:
-	case S_IFBLK:
-		AuDebugOn(!capable(CAP_MKNOD));
-		/*FALLTHROUGH*/
-	case S_IFIFO:
-	case S_IFSOCK:
-		err = vfsub_mknod(h_dir, &h_path, mode, h_inode->i_rdev);
-		break;
-	default:
-		AuIOErr("Unknown inode type 0%o\n", mode);
-		err = -EIO;
-	}
-	if (!err)
-		err = au_reset_acl(h_dir, &h_path, mode);
-
-	mnt_flags = au_mntflags(sb);
-	if (!au_opt_test(mnt_flags, UDBA_NONE)
-	    && !isdir
-	    && au_opt_test(mnt_flags, XINO)
-	    && (h_inode->i_nlink == 1
-		|| (h_inode->i_state & I_LINKABLE))
-	    /* todo: unnecessary? */
-	    /* && d_inode(cpg->dentry)->i_nlink == 1 */
-	    && cpg->bdst < cpg->bsrc
-	    && !au_ftest_cpup(cpg->flags, KEEPLINO))
-		au_xino_write(sb, cpg->bsrc, h_inode->i_ino, /*ino*/0);
-		/* ignore this error */
-
-	if (!err) {
-		force = 0;
-		if (isreg) {
-			force = !!cpg->len;
-			if (cpg->len == -1)
-				force = !!i_size_read(h_inode);
-		}
-		au_fhsm_wrote(sb, cpg->bdst, force);
-	}
-
-	if (do_dt)
-		au_dtime_revert(&dt);
-	return err;
-}
-
-static int au_do_ren_after_cpup(struct au_cp_generic *cpg, struct path *h_path)
-{
-	int err;
-	struct dentry *dentry, *h_dentry, *h_parent, *parent;
-	struct inode *h_dir;
-	aufs_bindex_t bdst;
-
-	dentry = cpg->dentry;
-	bdst = cpg->bdst;
-	h_dentry = au_h_dptr(dentry, bdst);
-	if (!au_ftest_cpup(cpg->flags, OVERWRITE)) {
-		dget(h_dentry);
-		au_set_h_dptr(dentry, bdst, NULL);
-		err = au_lkup_neg(dentry, bdst, /*wh*/0);
-		if (!err)
-			h_path->dentry = dget(au_h_dptr(dentry, bdst));
-		au_set_h_dptr(dentry, bdst, h_dentry);
-	} else {
-		err = 0;
-		parent = dget_parent(dentry);
-		h_parent = au_h_dptr(parent, bdst);
-		dput(parent);
-		h_path->dentry = vfsub_lkup_one(&dentry->d_name, h_parent);
-		if (IS_ERR(h_path->dentry))
-			err = PTR_ERR(h_path->dentry);
-	}
-	if (unlikely(err))
-		goto out;
-
-	h_parent = h_dentry->d_parent; /* dir inode is locked */
-	h_dir = d_inode(h_parent);
-	IMustLock(h_dir);
-	AuDbg("%pd %pd\n", h_dentry, h_path->dentry);
-	/* no delegation since it is just created */
-	err = vfsub_rename(h_dir, h_dentry, h_dir, h_path, /*delegated*/NULL);
-	dput(h_path->dentry);
-
-out:
-	return err;
-}
-
-/*
- * copyup the @dentry from @bsrc to @bdst.
- * the caller must set the both of lower dentries.
- * @len is for truncating when it is -1 copyup the entire file.
- * in link/rename cases, @dst_parent may be different from the real one.
- * basic->bsrc can be larger than basic->bdst.
- */
-static int au_cpup_single(struct au_cp_generic *cpg, struct dentry *dst_parent)
-{
-	int err, rerr;
-	aufs_bindex_t old_ibstart;
-	unsigned char isdir, plink;
-	struct dentry *h_src, *h_dst, *h_parent;
-	struct inode *dst_inode, *h_dir, *inode, *delegated, *src_inode;
-	struct super_block *sb;
-	struct au_branch *br;
-	/* to reuduce stack size */
-	struct {
-		struct au_dtime dt;
-		struct path h_path;
-		struct au_cpup_reg_attr h_src_attr;
-	} *a;
-
-	err = -ENOMEM;
-	a = kmalloc(sizeof(*a), GFP_NOFS);
-	if (unlikely(!a))
-		goto out;
-	a->h_src_attr.valid = 0;
-
-	sb = cpg->dentry->d_sb;
-	br = au_sbr(sb, cpg->bdst);
-	a->h_path.mnt = au_br_mnt(br);
-	h_dst = au_h_dptr(cpg->dentry, cpg->bdst);
-	h_parent = h_dst->d_parent; /* dir inode is locked */
-	h_dir = d_inode(h_parent);
-	IMustLock(h_dir);
-
-	h_src = au_h_dptr(cpg->dentry, cpg->bsrc);
-	inode = d_inode(cpg->dentry);
-
-	if (!dst_parent)
-		dst_parent = dget_parent(cpg->dentry);
-	else
-		dget(dst_parent);
-
-	plink = !!au_opt_test(au_mntflags(sb), PLINK);
-	dst_inode = au_h_iptr(inode, cpg->bdst);
-	if (dst_inode) {
-		if (unlikely(!plink)) {
-			err = -EIO;
-			AuIOErr("hi%lu(i%lu) exists on b%d "
-				"but plink is disabled\n",
-				dst_inode->i_ino, inode->i_ino, cpg->bdst);
-			goto out_parent;
-		}
-
-		if (dst_inode->i_nlink) {
-			const int do_dt = au_ftest_cpup(cpg->flags, DTIME);
-
-			h_src = au_plink_lkup(inode, cpg->bdst);
-			err = PTR_ERR(h_src);
-			if (IS_ERR(h_src))
-				goto out_parent;
-			if (unlikely(d_is_negative(h_src))) {
-				err = -EIO;
-				AuIOErr("i%lu exists on b%d "
-					"but not pseudo-linked\n",
-					inode->i_ino, cpg->bdst);
-				dput(h_src);
-				goto out_parent;
-			}
-
-			if (do_dt) {
-				a->h_path.dentry = h_parent;
-				au_dtime_store(&a->dt, dst_parent, &a->h_path);
-			}
-
-			a->h_path.dentry = h_dst;
-			delegated = NULL;
-			err = vfsub_link(h_src, h_dir, &a->h_path, &delegated);
-			if (!err && au_ftest_cpup(cpg->flags, RENAME))
-				err = au_do_ren_after_cpup(cpg, &a->h_path);
-			if (do_dt)
-				au_dtime_revert(&a->dt);
-			if (unlikely(err == -EWOULDBLOCK)) {
-				pr_warn("cannot retry for NFSv4 delegation"
-					" for an internal link\n");
-				iput(delegated);
-			}
-			dput(h_src);
-			goto out_parent;
-		} else
-			/* todo: cpup_wh_file? */
-			/* udba work */
-			au_update_ibrange(inode, /*do_put_zero*/1);
-	}
-
-	isdir = S_ISDIR(inode->i_mode);
-	old_ibstart = au_ibstart(inode);
-	err = cpup_entry(cpg, dst_parent, &a->h_src_attr);
-	if (unlikely(err))
-		goto out_rev;
-	dst_inode = d_inode(h_dst);
-	mutex_lock_nested(&dst_inode->i_mutex, AuLsc_I_CHILD2);
-	/* todo: necessary? */
-	/* au_pin_hdir_unlock(cpg->pin); */
-
-	err = cpup_iattr(cpg->dentry, cpg->bdst, h_src, &a->h_src_attr);
-	if (unlikely(err)) {
-		/* todo: necessary? */
-		/* au_pin_hdir_relock(cpg->pin); */ /* ignore an error */
-		mutex_unlock(&dst_inode->i_mutex);
-		goto out_rev;
-	}
-
-	if (cpg->bdst < old_ibstart) {
-		if (S_ISREG(inode->i_mode)) {
-			err = au_dy_iaop(inode, cpg->bdst, dst_inode);
-			if (unlikely(err)) {
-				/* ignore an error */
-				/* au_pin_hdir_relock(cpg->pin); */
-				mutex_unlock(&dst_inode->i_mutex);
-				goto out_rev;
-			}
-		}
-		au_set_ibstart(inode, cpg->bdst);
-	} else
-		au_set_ibend(inode, cpg->bdst);
-	au_set_h_iptr(inode, cpg->bdst, au_igrab(dst_inode),
-		      au_hi_flags(inode, isdir));
-
-	/* todo: necessary? */
-	/* err = au_pin_hdir_relock(cpg->pin); */
-	mutex_unlock(&dst_inode->i_mutex);
-	if (unlikely(err))
-		goto out_rev;
-
-	src_inode = d_inode(h_src);
-	if (!isdir
-	    && (src_inode->i_nlink > 1
-		|| src_inode->i_state & I_LINKABLE)
-	    && plink)
-		au_plink_append(inode, cpg->bdst, h_dst);
-
-	if (au_ftest_cpup(cpg->flags, RENAME)) {
-		a->h_path.dentry = h_dst;
-		err = au_do_ren_after_cpup(cpg, &a->h_path);
-	}
-	if (!err)
-		goto out_parent; /* success */
-
-	/* revert */
-out_rev:
-	a->h_path.dentry = h_parent;
-	au_dtime_store(&a->dt, dst_parent, &a->h_path);
-	a->h_path.dentry = h_dst;
-	rerr = 0;
-	if (d_is_positive(h_dst)) {
-		if (!isdir) {
-			/* no delegation since it is just created */
-			rerr = vfsub_unlink(h_dir, &a->h_path,
-					    /*delegated*/NULL, /*force*/0);
-		} else
-			rerr = vfsub_rmdir(h_dir, &a->h_path);
-	}
-	au_dtime_revert(&a->dt);
-	if (rerr) {
-		AuIOErr("failed removing broken entry(%d, %d)\n", err, rerr);
-		err = -EIO;
-	}
-out_parent:
-	dput(dst_parent);
-	kfree(a);
-out:
-	return err;
-}
-
-#if 0 /* reserved */
-struct au_cpup_single_args {
-	int *errp;
-	struct au_cp_generic *cpg;
-	struct dentry *dst_parent;
-};
-
-static void au_call_cpup_single(void *args)
-{
-	struct au_cpup_single_args *a = args;
-
-	au_pin_hdir_acquire_nest(a->cpg->pin);
-	*a->errp = au_cpup_single(a->cpg, a->dst_parent);
-	au_pin_hdir_release(a->cpg->pin);
-}
-#endif
-
-/*
- * prevent SIGXFSZ in copy-up.
- * testing CAP_MKNOD is for generic fs,
- * but CAP_FSETID is for xfs only, currently.
- */
-static int au_cpup_sio_test(struct au_pin *pin, umode_t mode)
-{
-	int do_sio;
-	struct super_block *sb;
-	struct inode *h_dir;
-
-	do_sio = 0;
-	sb = au_pinned_parent(pin)->d_sb;
-	if (!au_wkq_test()
-	    && (!au_sbi(sb)->si_plink_maint_pid
-		|| au_plink_maint(sb, AuLock_NOPLM))) {
-		switch (mode & S_IFMT) {
-		case S_IFREG:
-			/* no condition about RLIMIT_FSIZE and the file size */
-			do_sio = 1;
-			break;
-		case S_IFCHR:
-		case S_IFBLK:
-			do_sio = !capable(CAP_MKNOD);
-			break;
-		}
-		if (!do_sio)
-			do_sio = ((mode & (S_ISUID | S_ISGID))
-				  && !capable(CAP_FSETID));
-		/* this workaround may be removed in the future */
-		if (!do_sio) {
-			h_dir = au_pinned_h_dir(pin);
-			do_sio = h_dir->i_mode & S_ISVTX;
-		}
-	}
-
-	return do_sio;
-}
-
-#if 0 /* reserved */
-int au_sio_cpup_single(struct au_cp_generic *cpg, struct dentry *dst_parent)
-{
-	int err, wkq_err;
-	struct dentry *h_dentry;
-
-	h_dentry = au_h_dptr(cpg->dentry, cpg->bsrc);
-	if (!au_cpup_sio_test(pin, d_inode(h_dentry)->i_mode))
-		err = au_cpup_single(cpg, dst_parent);
-	else {
-		struct au_cpup_single_args args = {
-			.errp		= &err,
-			.cpg		= cpg,
-			.dst_parent	= dst_parent
-		};
-		wkq_err = au_wkq_wait(au_call_cpup_single, &args);
-		if (unlikely(wkq_err))
-			err = wkq_err;
-	}
-
-	return err;
-}
-#endif
-
-/*
- * copyup the @dentry from the first active lower branch to @bdst,
- * using au_cpup_single().
- */
-static int au_cpup_simple(struct au_cp_generic *cpg)
-{
-	int err;
-	unsigned int flags_orig;
-	struct dentry *dentry;
-
-	AuDebugOn(cpg->bsrc < 0);
-
-	dentry = cpg->dentry;
-	DiMustWriteLock(dentry);
-
-	err = au_lkup_neg(dentry, cpg->bdst, /*wh*/1);
-	if (!err) {
-		flags_orig = cpg->flags;
-		au_fset_cpup(cpg->flags, RENAME);
-		err = au_cpup_single(cpg, NULL);
-		cpg->flags = flags_orig;
-		if (!err)
-			return 0; /* success */
-
-		/* revert */
-		au_set_h_dptr(dentry, cpg->bdst, NULL);
-		au_set_dbstart(dentry, cpg->bsrc);
-	}
-
-	return err;
-}
-
-struct au_cpup_simple_args {
-	int *errp;
-	struct au_cp_generic *cpg;
-};
-
-static void au_call_cpup_simple(void *args)
-{
-	struct au_cpup_simple_args *a = args;
-
-	au_pin_hdir_acquire_nest(a->cpg->pin);
-	*a->errp = au_cpup_simple(a->cpg);
-	au_pin_hdir_release(a->cpg->pin);
-}
-
-static int au_do_sio_cpup_simple(struct au_cp_generic *cpg)
-{
-	int err, wkq_err;
-	struct dentry *dentry, *parent;
-	struct file *h_file;
-	struct inode *h_dir;
-
-	dentry = cpg->dentry;
-	h_file = NULL;
-	if (au_ftest_cpup(cpg->flags, HOPEN)) {
-		AuDebugOn(cpg->bsrc < 0);
-		h_file = au_h_open_pre(dentry, cpg->bsrc, /*force_wr*/0);
-		err = PTR_ERR(h_file);
-		if (IS_ERR(h_file))
-			goto out;
-	}
-
-	parent = dget_parent(dentry);
-	h_dir = au_h_iptr(d_inode(parent), cpg->bdst);
-	if (!au_test_h_perm_sio(h_dir, MAY_EXEC | MAY_WRITE)
-	    && !au_cpup_sio_test(cpg->pin, d_inode(dentry)->i_mode))
-		err = au_cpup_simple(cpg);
-	else {
-		struct au_cpup_simple_args args = {
-			.errp		= &err,
-			.cpg		= cpg
-		};
-		wkq_err = au_wkq_wait(au_call_cpup_simple, &args);
-		if (unlikely(wkq_err))
-			err = wkq_err;
-	}
-
-	dput(parent);
-	if (h_file)
-		au_h_open_post(dentry, cpg->bsrc, h_file);
-
-out:
-	return err;
-}
-
-int au_sio_cpup_simple(struct au_cp_generic *cpg)
-{
-	aufs_bindex_t bsrc, bend;
-	struct dentry *dentry, *h_dentry;
-
-	if (cpg->bsrc < 0) {
-		dentry = cpg->dentry;
-		bend = au_dbend(dentry);
-		for (bsrc = cpg->bdst + 1; bsrc <= bend; bsrc++) {
-			h_dentry = au_h_dptr(dentry, bsrc);
-			if (h_dentry) {
-				AuDebugOn(d_is_negative(h_dentry));
-				break;
-			}
-		}
-		AuDebugOn(bsrc > bend);
-		cpg->bsrc = bsrc;
-	}
-	AuDebugOn(cpg->bsrc <= cpg->bdst);
-	return au_do_sio_cpup_simple(cpg);
-}
-
-int au_sio_cpdown_simple(struct au_cp_generic *cpg)
-{
-	AuDebugOn(cpg->bdst <= cpg->bsrc);
-	return au_do_sio_cpup_simple(cpg);
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * copyup the deleted file for writing.
- */
-static int au_do_cpup_wh(struct au_cp_generic *cpg, struct dentry *wh_dentry,
-			 struct file *file)
-{
-	int err;
-	unsigned int flags_orig;
-	aufs_bindex_t bsrc_orig;
-	struct dentry *h_d_dst, *h_d_start;
-	struct au_dinfo *dinfo;
-	struct au_hdentry *hdp;
-
-	dinfo = au_di(cpg->dentry);
-	AuRwMustWriteLock(&dinfo->di_rwsem);
-
-	bsrc_orig = cpg->bsrc;
-	cpg->bsrc = dinfo->di_bstart;
-	hdp = dinfo->di_hdentry;
-	h_d_dst = hdp[0 + cpg->bdst].hd_dentry;
-	dinfo->di_bstart = cpg->bdst;
-	hdp[0 + cpg->bdst].hd_dentry = wh_dentry;
-	h_d_start = NULL;
-	if (file) {
-		h_d_start = hdp[0 + cpg->bsrc].hd_dentry;
-		hdp[0 + cpg->bsrc].hd_dentry = au_hf_top(file)->f_path.dentry;
-	}
-	flags_orig = cpg->flags;
-	cpg->flags = !AuCpup_DTIME;
-	err = au_cpup_single(cpg, /*h_parent*/NULL);
-	cpg->flags = flags_orig;
-	if (file) {
-		if (!err)
-			err = au_reopen_nondir(file);
-		hdp[0 + cpg->bsrc].hd_dentry = h_d_start;
-	}
-	hdp[0 + cpg->bdst].hd_dentry = h_d_dst;
-	dinfo->di_bstart = cpg->bsrc;
-	cpg->bsrc = bsrc_orig;
-
-	return err;
-}
-
-static int au_cpup_wh(struct au_cp_generic *cpg, struct file *file)
-{
-	int err;
-	aufs_bindex_t bdst;
-	struct au_dtime dt;
-	struct dentry *dentry, *parent, *h_parent, *wh_dentry;
-	struct au_branch *br;
-	struct path h_path;
-
-	dentry = cpg->dentry;
-	bdst = cpg->bdst;
-	br = au_sbr(dentry->d_sb, bdst);
-	parent = dget_parent(dentry);
-	h_parent = au_h_dptr(parent, bdst);
-	wh_dentry = au_whtmp_lkup(h_parent, br, &dentry->d_name);
-	err = PTR_ERR(wh_dentry);
-	if (IS_ERR(wh_dentry))
-		goto out;
-
-	h_path.dentry = h_parent;
-	h_path.mnt = au_br_mnt(br);
-	au_dtime_store(&dt, parent, &h_path);
-	err = au_do_cpup_wh(cpg, wh_dentry, file);
-	if (unlikely(err))
-		goto out_wh;
-
-	dget(wh_dentry);
-	h_path.dentry = wh_dentry;
-	if (!d_is_dir(wh_dentry)) {
-		/* no delegation since it is just created */
-		err = vfsub_unlink(d_inode(h_parent), &h_path,
-				   /*delegated*/NULL, /*force*/0);
-	} else
-		err = vfsub_rmdir(d_inode(h_parent), &h_path);
-	if (unlikely(err)) {
-		AuIOErr("failed remove copied-up tmp file %pd(%d)\n",
-			wh_dentry, err);
-		err = -EIO;
-	}
-	au_dtime_revert(&dt);
-	au_set_hi_wh(d_inode(dentry), bdst, wh_dentry);
-
-out_wh:
-	dput(wh_dentry);
-out:
-	dput(parent);
-	return err;
-}
-
-struct au_cpup_wh_args {
-	int *errp;
-	struct au_cp_generic *cpg;
-	struct file *file;
-};
-
-static void au_call_cpup_wh(void *args)
-{
-	struct au_cpup_wh_args *a = args;
-
-	au_pin_hdir_acquire_nest(a->cpg->pin);
-	*a->errp = au_cpup_wh(a->cpg, a->file);
-	au_pin_hdir_release(a->cpg->pin);
-}
-
-int au_sio_cpup_wh(struct au_cp_generic *cpg, struct file *file)
-{
-	int err, wkq_err;
-	aufs_bindex_t bdst;
-	struct dentry *dentry, *parent, *h_orph, *h_parent;
-	struct inode *dir, *h_dir, *h_tmpdir;
-	struct au_wbr *wbr;
-	struct au_pin wh_pin, *pin_orig;
-
-	dentry = cpg->dentry;
-	bdst = cpg->bdst;
-	parent = dget_parent(dentry);
-	dir = d_inode(parent);
-	h_orph = NULL;
-	h_parent = NULL;
-	h_dir = au_igrab(au_h_iptr(dir, bdst));
-	h_tmpdir = h_dir;
-	pin_orig = NULL;
-	if (!h_dir->i_nlink) {
-		wbr = au_sbr(dentry->d_sb, bdst)->br_wbr;
-		h_orph = wbr->wbr_orph;
-
-		h_parent = dget(au_h_dptr(parent, bdst));
-		au_set_h_dptr(parent, bdst, dget(h_orph));
-		h_tmpdir = d_inode(h_orph);
-		au_set_h_iptr(dir, bdst, au_igrab(h_tmpdir), /*flags*/0);
-
-		mutex_lock_nested(&h_tmpdir->i_mutex, AuLsc_I_PARENT3);
-		/* todo: au_h_open_pre()? */
-
-		pin_orig = cpg->pin;
-		au_pin_init(&wh_pin, dentry, bdst, AuLsc_DI_PARENT,
-			    AuLsc_I_PARENT3, cpg->pin->udba, AuPin_DI_LOCKED);
-		cpg->pin = &wh_pin;
-	}
-
-	if (!au_test_h_perm_sio(h_tmpdir, MAY_EXEC | MAY_WRITE)
-	    && !au_cpup_sio_test(cpg->pin, d_inode(dentry)->i_mode))
-		err = au_cpup_wh(cpg, file);
-	else {
-		struct au_cpup_wh_args args = {
-			.errp	= &err,
-			.cpg	= cpg,
-			.file	= file
-		};
-		wkq_err = au_wkq_wait(au_call_cpup_wh, &args);
-		if (unlikely(wkq_err))
-			err = wkq_err;
-	}
-
-	if (h_orph) {
-		mutex_unlock(&h_tmpdir->i_mutex);
-		/* todo: au_h_open_post()? */
-		au_set_h_iptr(dir, bdst, au_igrab(h_dir), /*flags*/0);
-		au_set_h_dptr(parent, bdst, h_parent);
-		AuDebugOn(!pin_orig);
-		cpg->pin = pin_orig;
-	}
-	iput(h_dir);
-	dput(parent);
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * generic routine for both of copy-up and copy-down.
- */
-/* cf. revalidate function in file.c */
-int au_cp_dirs(struct dentry *dentry, aufs_bindex_t bdst,
-	       int (*cp)(struct dentry *dentry, aufs_bindex_t bdst,
-			 struct au_pin *pin,
-			 struct dentry *h_parent, void *arg),
-	       void *arg)
-{
-	int err;
-	struct au_pin pin;
-	struct dentry *d, *parent, *h_parent, *real_parent, *h_dentry;
-
-	err = 0;
-	parent = dget_parent(dentry);
-	if (IS_ROOT(parent))
-		goto out;
-
-	au_pin_init(&pin, dentry, bdst, AuLsc_DI_PARENT2, AuLsc_I_PARENT2,
-		    au_opt_udba(dentry->d_sb), AuPin_MNT_WRITE);
-
-	/* do not use au_dpage */
-	real_parent = parent;
-	while (1) {
-		dput(parent);
-		parent = dget_parent(dentry);
-		h_parent = au_h_dptr(parent, bdst);
-		if (h_parent)
-			goto out; /* success */
-
-		/* find top dir which is necessary to cpup */
-		do {
-			d = parent;
-			dput(parent);
-			parent = dget_parent(d);
-			di_read_lock_parent3(parent, !AuLock_IR);
-			h_parent = au_h_dptr(parent, bdst);
-			di_read_unlock(parent, !AuLock_IR);
-		} while (!h_parent);
-
-		if (d != real_parent)
-			di_write_lock_child3(d);
-
-		/* somebody else might create while we were sleeping */
-		h_dentry = au_h_dptr(d, bdst);
-		if (!h_dentry || d_is_negative(h_dentry)) {
-			if (h_dentry)
-				au_update_dbstart(d);
-
-			au_pin_set_dentry(&pin, d);
-			err = au_do_pin(&pin);
-			if (!err) {
-				err = cp(d, bdst, &pin, h_parent, arg);
-				au_unpin(&pin);
-			}
-		}
-
-		if (d != real_parent)
-			di_write_unlock(d);
-		if (unlikely(err))
-			break;
-	}
-
-out:
-	dput(parent);
-	return err;
-}
-
-static int au_cpup_dir(struct dentry *dentry, aufs_bindex_t bdst,
-		       struct au_pin *pin,
-		       struct dentry *h_parent __maybe_unused,
-		       void *arg __maybe_unused)
-{
-	struct au_cp_generic cpg = {
-		.dentry	= dentry,
-		.bdst	= bdst,
-		.bsrc	= -1,
-		.len	= 0,
-		.pin	= pin,
-		.flags	= AuCpup_DTIME
-	};
-	return au_sio_cpup_simple(&cpg);
-}
-
-int au_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst)
-{
-	return au_cp_dirs(dentry, bdst, au_cpup_dir, NULL);
-}
-
-int au_test_and_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst)
-{
-	int err;
-	struct dentry *parent;
-	struct inode *dir;
-
-	parent = dget_parent(dentry);
-	dir = d_inode(parent);
-	err = 0;
-	if (au_h_iptr(dir, bdst))
-		goto out;
-
-	di_read_unlock(parent, AuLock_IR);
-	di_write_lock_parent(parent);
-	/* someone else might change our inode while we were sleeping */
-	if (!au_h_iptr(dir, bdst))
-		err = au_cpup_dirs(dentry, bdst);
-	di_downgrade_lock(parent, AuLock_IR);
-
-out:
-	dput(parent);
-	return err;
-}
diff --git a/fs/aufs/cpup.h b/fs/aufs/cpup.h
deleted file mode 100644
index ccba2c427..000000000
--- a/fs/aufs/cpup.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * copy-up/down functions
- */
-
-#ifndef __AUFS_CPUP_H__
-#define __AUFS_CPUP_H__
-
-#ifdef __KERNEL__
-
-#include <linux/path.h>
-
-struct inode;
-struct file;
-struct au_pin;
-
-void au_cpup_attr_flags(struct inode *dst, unsigned int iflags);
-void au_cpup_attr_timesizes(struct inode *inode);
-void au_cpup_attr_nlink(struct inode *inode, int force);
-void au_cpup_attr_changeable(struct inode *inode);
-void au_cpup_igen(struct inode *inode, struct inode *h_inode);
-void au_cpup_attr_all(struct inode *inode, int force);
-
-/* ---------------------------------------------------------------------- */
-
-struct au_cp_generic {
-	struct dentry	*dentry;
-	aufs_bindex_t	bdst, bsrc;
-	loff_t		len;
-	struct au_pin	*pin;
-	unsigned int	flags;
-};
-
-/* cpup flags */
-#define AuCpup_DTIME		1		/* do dtime_store/revert */
-#define AuCpup_KEEPLINO		(1 << 1)	/* do not clear the lower xino,
-						   for link(2) */
-#define AuCpup_RENAME		(1 << 2)	/* rename after cpup */
-#define AuCpup_HOPEN		(1 << 3)	/* call h_open_pre/post() in
-						   cpup */
-#define AuCpup_OVERWRITE	(1 << 4)	/* allow overwriting the
-						   existing entry */
-#define AuCpup_RWDST		(1 << 5)	/* force write target even if
-						   the branch is marked as RO */
-
-#define au_ftest_cpup(flags, name)	((flags) & AuCpup_##name)
-#define au_fset_cpup(flags, name) \
-	do { (flags) |= AuCpup_##name; } while (0)
-#define au_fclr_cpup(flags, name) \
-	do { (flags) &= ~AuCpup_##name; } while (0)
-
-int au_copy_file(struct file *dst, struct file *src, loff_t len);
-int au_sio_cpup_simple(struct au_cp_generic *cpg);
-int au_sio_cpdown_simple(struct au_cp_generic *cpg);
-int au_sio_cpup_wh(struct au_cp_generic *cpg, struct file *file);
-
-int au_cp_dirs(struct dentry *dentry, aufs_bindex_t bdst,
-	       int (*cp)(struct dentry *dentry, aufs_bindex_t bdst,
-			 struct au_pin *pin,
-			 struct dentry *h_parent, void *arg),
-	       void *arg);
-int au_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst);
-int au_test_and_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst);
-
-/* ---------------------------------------------------------------------- */
-
-/* keep timestamps when copyup */
-struct au_dtime {
-	struct dentry *dt_dentry;
-	struct path dt_h_path;
-	struct timespec dt_atime, dt_mtime;
-};
-void au_dtime_store(struct au_dtime *dt, struct dentry *dentry,
-		    struct path *h_path);
-void au_dtime_revert(struct au_dtime *dt);
-
-#endif /* __KERNEL__ */
-#endif /* __AUFS_CPUP_H__ */
diff --git a/fs/aufs/dbgaufs.c b/fs/aufs/dbgaufs.c
deleted file mode 100644
index 0aefb5ed8..000000000
--- a/fs/aufs/dbgaufs.c
+++ /dev/null
@@ -1,419 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * debugfs interface
- */
-
-#include <linux/debugfs.h>
-#include "aufs.h"
-
-#ifndef CONFIG_SYSFS
-#error DEBUG_FS depends upon SYSFS
-#endif
-
-static struct dentry *dbgaufs;
-static const mode_t dbgaufs_mode = S_IRUSR | S_IRGRP | S_IROTH;
-
-/* 20 is max digits length of ulong 64 */
-struct dbgaufs_arg {
-	int n;
-	char a[20 * 4];
-};
-
-/*
- * common function for all XINO files
- */
-static int dbgaufs_xi_release(struct inode *inode __maybe_unused,
-			      struct file *file)
-{
-	kfree(file->private_data);
-	return 0;
-}
-
-static int dbgaufs_xi_open(struct file *xf, struct file *file, int do_fcnt)
-{
-	int err;
-	struct kstat st;
-	struct dbgaufs_arg *p;
-
-	err = -ENOMEM;
-	p = kmalloc(sizeof(*p), GFP_NOFS);
-	if (unlikely(!p))
-		goto out;
-
-	err = 0;
-	p->n = 0;
-	file->private_data = p;
-	if (!xf)
-		goto out;
-
-	err = vfs_getattr(&xf->f_path, &st);
-	if (!err) {
-		if (do_fcnt)
-			p->n = snprintf
-				(p->a, sizeof(p->a), "%ld, %llux%lu %lld\n",
-				 (long)file_count(xf), st.blocks, st.blksize,
-				 (long long)st.size);
-		else
-			p->n = snprintf(p->a, sizeof(p->a), "%llux%lu %lld\n",
-					st.blocks, st.blksize,
-					(long long)st.size);
-		AuDebugOn(p->n >= sizeof(p->a));
-	} else {
-		p->n = snprintf(p->a, sizeof(p->a), "err %d\n", err);
-		err = 0;
-	}
-
-out:
-	return err;
-
-}
-
-static ssize_t dbgaufs_xi_read(struct file *file, char __user *buf,
-			       size_t count, loff_t *ppos)
-{
-	struct dbgaufs_arg *p;
-
-	p = file->private_data;
-	return simple_read_from_buffer(buf, count, ppos, p->a, p->n);
-}
-
-/* ---------------------------------------------------------------------- */
-
-struct dbgaufs_plink_arg {
-	int n;
-	char a[];
-};
-
-static int dbgaufs_plink_release(struct inode *inode __maybe_unused,
-				 struct file *file)
-{
-	free_page((unsigned long)file->private_data);
-	return 0;
-}
-
-static int dbgaufs_plink_open(struct inode *inode, struct file *file)
-{
-	int err, i, limit;
-	unsigned long n, sum;
-	struct dbgaufs_plink_arg *p;
-	struct au_sbinfo *sbinfo;
-	struct super_block *sb;
-	struct au_sphlhead *sphl;
-
-	err = -ENOMEM;
-	p = (void *)get_zeroed_page(GFP_NOFS);
-	if (unlikely(!p))
-		goto out;
-
-	err = -EFBIG;
-	sbinfo = inode->i_private;
-	sb = sbinfo->si_sb;
-	si_noflush_read_lock(sb);
-	if (au_opt_test(au_mntflags(sb), PLINK)) {
-		limit = PAGE_SIZE - sizeof(p->n);
-
-		/* the number of buckets */
-		n = snprintf(p->a + p->n, limit, "%d\n", AuPlink_NHASH);
-		p->n += n;
-		limit -= n;
-
-		sum = 0;
-		for (i = 0, sphl = sbinfo->si_plink;
-		     i < AuPlink_NHASH;
-		     i++, sphl++) {
-			n = au_sphl_count(sphl);
-			sum += n;
-
-			n = snprintf(p->a + p->n, limit, "%lu ", n);
-			p->n += n;
-			limit -= n;
-			if (unlikely(limit <= 0))
-				goto out_free;
-		}
-		p->a[p->n - 1] = '\n';
-
-		/* the sum of plinks */
-		n = snprintf(p->a + p->n, limit, "%lu\n", sum);
-		p->n += n;
-		limit -= n;
-		if (unlikely(limit <= 0))
-			goto out_free;
-	} else {
-#define str "1\n0\n0\n"
-		p->n = sizeof(str) - 1;
-		strcpy(p->a, str);
-#undef str
-	}
-	si_read_unlock(sb);
-
-	err = 0;
-	file->private_data = p;
-	goto out; /* success */
-
-out_free:
-	free_page((unsigned long)p);
-out:
-	return err;
-}
-
-static ssize_t dbgaufs_plink_read(struct file *file, char __user *buf,
-				  size_t count, loff_t *ppos)
-{
-	struct dbgaufs_plink_arg *p;
-
-	p = file->private_data;
-	return simple_read_from_buffer(buf, count, ppos, p->a, p->n);
-}
-
-static const struct file_operations dbgaufs_plink_fop = {
-	.owner		= THIS_MODULE,
-	.open		= dbgaufs_plink_open,
-	.release	= dbgaufs_plink_release,
-	.read		= dbgaufs_plink_read
-};
-
-/* ---------------------------------------------------------------------- */
-
-static int dbgaufs_xib_open(struct inode *inode, struct file *file)
-{
-	int err;
-	struct au_sbinfo *sbinfo;
-	struct super_block *sb;
-
-	sbinfo = inode->i_private;
-	sb = sbinfo->si_sb;
-	si_noflush_read_lock(sb);
-	err = dbgaufs_xi_open(sbinfo->si_xib, file, /*do_fcnt*/0);
-	si_read_unlock(sb);
-	return err;
-}
-
-static const struct file_operations dbgaufs_xib_fop = {
-	.owner		= THIS_MODULE,
-	.open		= dbgaufs_xib_open,
-	.release	= dbgaufs_xi_release,
-	.read		= dbgaufs_xi_read
-};
-
-/* ---------------------------------------------------------------------- */
-
-#define DbgaufsXi_PREFIX "xi"
-
-static int dbgaufs_xino_open(struct inode *inode, struct file *file)
-{
-	int err;
-	long l;
-	struct au_sbinfo *sbinfo;
-	struct super_block *sb;
-	struct file *xf;
-	struct qstr *name;
-
-	err = -ENOENT;
-	xf = NULL;
-	name = &file->f_path.dentry->d_name;
-	if (unlikely(name->len < sizeof(DbgaufsXi_PREFIX)
-		     || memcmp(name->name, DbgaufsXi_PREFIX,
-			       sizeof(DbgaufsXi_PREFIX) - 1)))
-		goto out;
-	err = kstrtol(name->name + sizeof(DbgaufsXi_PREFIX) - 1, 10, &l);
-	if (unlikely(err))
-		goto out;
-
-	sbinfo = inode->i_private;
-	sb = sbinfo->si_sb;
-	si_noflush_read_lock(sb);
-	if (l <= au_sbend(sb)) {
-		xf = au_sbr(sb, (aufs_bindex_t)l)->br_xino.xi_file;
-		err = dbgaufs_xi_open(xf, file, /*do_fcnt*/1);
-	} else
-		err = -ENOENT;
-	si_read_unlock(sb);
-
-out:
-	return err;
-}
-
-static const struct file_operations dbgaufs_xino_fop = {
-	.owner		= THIS_MODULE,
-	.open		= dbgaufs_xino_open,
-	.release	= dbgaufs_xi_release,
-	.read		= dbgaufs_xi_read
-};
-
-void dbgaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex)
-{
-	aufs_bindex_t bend;
-	struct au_branch *br;
-	struct au_xino_file *xi;
-
-	if (!au_sbi(sb)->si_dbgaufs)
-		return;
-
-	bend = au_sbend(sb);
-	for (; bindex <= bend; bindex++) {
-		br = au_sbr(sb, bindex);
-		xi = &br->br_xino;
-		debugfs_remove(xi->xi_dbgaufs);
-		xi->xi_dbgaufs = NULL;
-	}
-}
-
-void dbgaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex)
-{
-	struct au_sbinfo *sbinfo;
-	struct dentry *parent;
-	struct au_branch *br;
-	struct au_xino_file *xi;
-	aufs_bindex_t bend;
-	char name[sizeof(DbgaufsXi_PREFIX) + 5]; /* "xi" bindex NULL */
-
-	sbinfo = au_sbi(sb);
-	parent = sbinfo->si_dbgaufs;
-	if (!parent)
-		return;
-
-	bend = au_sbend(sb);
-	for (; bindex <= bend; bindex++) {
-		snprintf(name, sizeof(name), DbgaufsXi_PREFIX "%d", bindex);
-		br = au_sbr(sb, bindex);
-		xi = &br->br_xino;
-		AuDebugOn(xi->xi_dbgaufs);
-		xi->xi_dbgaufs = debugfs_create_file(name, dbgaufs_mode, parent,
-						     sbinfo, &dbgaufs_xino_fop);
-		/* ignore an error */
-		if (unlikely(!xi->xi_dbgaufs))
-			AuWarn1("failed %s under debugfs\n", name);
-	}
-}
-
-/* ---------------------------------------------------------------------- */
-
-#ifdef CONFIG_AUFS_EXPORT
-static int dbgaufs_xigen_open(struct inode *inode, struct file *file)
-{
-	int err;
-	struct au_sbinfo *sbinfo;
-	struct super_block *sb;
-
-	sbinfo = inode->i_private;
-	sb = sbinfo->si_sb;
-	si_noflush_read_lock(sb);
-	err = dbgaufs_xi_open(sbinfo->si_xigen, file, /*do_fcnt*/0);
-	si_read_unlock(sb);
-	return err;
-}
-
-static const struct file_operations dbgaufs_xigen_fop = {
-	.owner		= THIS_MODULE,
-	.open		= dbgaufs_xigen_open,
-	.release	= dbgaufs_xi_release,
-	.read		= dbgaufs_xi_read
-};
-
-static int dbgaufs_xigen_init(struct au_sbinfo *sbinfo)
-{
-	int err;
-
-	/*
-	 * This function is a dynamic '__init' function actually,
-	 * so the tiny check for si_rwsem is unnecessary.
-	 */
-	/* AuRwMustWriteLock(&sbinfo->si_rwsem); */
-
-	err = -EIO;
-	sbinfo->si_dbgaufs_xigen = debugfs_create_file
-		("xigen", dbgaufs_mode, sbinfo->si_dbgaufs, sbinfo,
-		 &dbgaufs_xigen_fop);
-	if (sbinfo->si_dbgaufs_xigen)
-		err = 0;
-
-	return err;
-}
-#else
-static int dbgaufs_xigen_init(struct au_sbinfo *sbinfo)
-{
-	return 0;
-}
-#endif /* CONFIG_AUFS_EXPORT */
-
-/* ---------------------------------------------------------------------- */
-
-void dbgaufs_si_fin(struct au_sbinfo *sbinfo)
-{
-	/*
-	 * This function is a dynamic '__fin' function actually,
-	 * so the tiny check for si_rwsem is unnecessary.
-	 */
-	/* AuRwMustWriteLock(&sbinfo->si_rwsem); */
-
-	debugfs_remove_recursive(sbinfo->si_dbgaufs);
-	sbinfo->si_dbgaufs = NULL;
-	kobject_put(&sbinfo->si_kobj);
-}
-
-int dbgaufs_si_init(struct au_sbinfo *sbinfo)
-{
-	int err;
-	char name[SysaufsSiNameLen];
-
-	/*
-	 * This function is a dynamic '__init' function actually,
-	 * so the tiny check for si_rwsem is unnecessary.
-	 */
-	/* AuRwMustWriteLock(&sbinfo->si_rwsem); */
-
-	err = -ENOENT;
-	if (!dbgaufs) {
-		AuErr1("/debug/aufs is uninitialized\n");
-		goto out;
-	}
-
-	err = -EIO;
-	sysaufs_name(sbinfo, name);
-	sbinfo->si_dbgaufs = debugfs_create_dir(name, dbgaufs);
-	if (unlikely(!sbinfo->si_dbgaufs))
-		goto out;
-	kobject_get(&sbinfo->si_kobj);
-
-	sbinfo->si_dbgaufs_xib = debugfs_create_file
-		("xib", dbgaufs_mode, sbinfo->si_dbgaufs, sbinfo,
-		 &dbgaufs_xib_fop);
-	if (unlikely(!sbinfo->si_dbgaufs_xib))
-		goto out_dir;
-
-	sbinfo->si_dbgaufs_plink = debugfs_create_file
-		("plink", dbgaufs_mode, sbinfo->si_dbgaufs, sbinfo,
-		 &dbgaufs_plink_fop);
-	if (unlikely(!sbinfo->si_dbgaufs_plink))
-		goto out_dir;
-
-	err = dbgaufs_xigen_init(sbinfo);
-	if (!err)
-		goto out; /* success */
-
-out_dir:
-	dbgaufs_si_fin(sbinfo);
-out:
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void dbgaufs_fin(void)
-{
-	debugfs_remove(dbgaufs);
-}
-
-int __init dbgaufs_init(void)
-{
-	int err;
-
-	err = -EIO;
-	dbgaufs = debugfs_create_dir(AUFS_NAME, NULL);
-	if (dbgaufs)
-		err = 0;
-	return err;
-}
diff --git a/fs/aufs/dbgaufs.h b/fs/aufs/dbgaufs.h
deleted file mode 100644
index 81f272e42..000000000
--- a/fs/aufs/dbgaufs.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * debugfs interface
- */
-
-#ifndef __DBGAUFS_H__
-#define __DBGAUFS_H__
-
-#ifdef __KERNEL__
-
-struct super_block;
-struct au_sbinfo;
-
-#ifdef CONFIG_DEBUG_FS
-/* dbgaufs.c */
-void dbgaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex);
-void dbgaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex);
-void dbgaufs_si_fin(struct au_sbinfo *sbinfo);
-int dbgaufs_si_init(struct au_sbinfo *sbinfo);
-void dbgaufs_fin(void);
-int __init dbgaufs_init(void);
-#else
-AuStubVoid(dbgaufs_brs_del, struct super_block *sb, aufs_bindex_t bindex)
-AuStubVoid(dbgaufs_brs_add, struct super_block *sb, aufs_bindex_t bindex)
-AuStubVoid(dbgaufs_si_fin, struct au_sbinfo *sbinfo)
-AuStubInt0(dbgaufs_si_init, struct au_sbinfo *sbinfo)
-AuStubVoid(dbgaufs_fin, void)
-AuStubInt0(__init dbgaufs_init, void)
-#endif /* CONFIG_DEBUG_FS */
-
-#endif /* __KERNEL__ */
-#endif /* __DBGAUFS_H__ */
diff --git a/fs/aufs/dcsub.c b/fs/aufs/dcsub.c
deleted file mode 100644
index e72accebb..000000000
--- a/fs/aufs/dcsub.c
+++ /dev/null
@@ -1,211 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * sub-routines for dentry cache
- */
-
-#include "aufs.h"
-
-static void au_dpage_free(struct au_dpage *dpage)
-{
-	int i;
-	struct dentry **p;
-
-	p = dpage->dentries;
-	for (i = 0; i < dpage->ndentry; i++)
-		dput(*p++);
-	free_page((unsigned long)dpage->dentries);
-}
-
-int au_dpages_init(struct au_dcsub_pages *dpages, gfp_t gfp)
-{
-	int err;
-	void *p;
-
-	err = -ENOMEM;
-	dpages->dpages = kmalloc(sizeof(*dpages->dpages), gfp);
-	if (unlikely(!dpages->dpages))
-		goto out;
-
-	p = (void *)__get_free_page(gfp);
-	if (unlikely(!p))
-		goto out_dpages;
-
-	dpages->dpages[0].ndentry = 0;
-	dpages->dpages[0].dentries = p;
-	dpages->ndpage = 1;
-	return 0; /* success */
-
-out_dpages:
-	kfree(dpages->dpages);
-out:
-	return err;
-}
-
-void au_dpages_free(struct au_dcsub_pages *dpages)
-{
-	int i;
-	struct au_dpage *p;
-
-	p = dpages->dpages;
-	for (i = 0; i < dpages->ndpage; i++)
-		au_dpage_free(p++);
-	kfree(dpages->dpages);
-}
-
-static int au_dpages_append(struct au_dcsub_pages *dpages,
-			    struct dentry *dentry, gfp_t gfp)
-{
-	int err, sz;
-	struct au_dpage *dpage;
-	void *p;
-
-	dpage = dpages->dpages + dpages->ndpage - 1;
-	sz = PAGE_SIZE / sizeof(dentry);
-	if (unlikely(dpage->ndentry >= sz)) {
-		AuLabel(new dpage);
-		err = -ENOMEM;
-		sz = dpages->ndpage * sizeof(*dpages->dpages);
-		p = au_kzrealloc(dpages->dpages, sz,
-				 sz + sizeof(*dpages->dpages), gfp);
-		if (unlikely(!p))
-			goto out;
-
-		dpages->dpages = p;
-		dpage = dpages->dpages + dpages->ndpage;
-		p = (void *)__get_free_page(gfp);
-		if (unlikely(!p))
-			goto out;
-
-		dpage->ndentry = 0;
-		dpage->dentries = p;
-		dpages->ndpage++;
-	}
-
-	AuDebugOn(au_dcount(dentry) <= 0);
-	dpage->dentries[dpage->ndentry++] = dget_dlock(dentry);
-	return 0; /* success */
-
-out:
-	return err;
-}
-
-/* todo: BAD approach */
-/* copied from linux/fs/dcache.c */
-enum d_walk_ret {
-	D_WALK_CONTINUE,
-	D_WALK_QUIT,
-	D_WALK_NORETRY,
-	D_WALK_SKIP,
-};
-
-extern void d_walk(struct dentry *parent, void *data,
-		   enum d_walk_ret (*enter)(void *, struct dentry *),
-		   void (*finish)(void *));
-
-struct ac_dpages_arg {
-	int err;
-	struct au_dcsub_pages *dpages;
-	struct super_block *sb;
-	au_dpages_test test;
-	void *arg;
-};
-
-static enum d_walk_ret au_call_dpages_append(void *_arg, struct dentry *dentry)
-{
-	enum d_walk_ret ret;
-	struct ac_dpages_arg *arg = _arg;
-
-	ret = D_WALK_CONTINUE;
-	if (dentry->d_sb == arg->sb
-	    && !IS_ROOT(dentry)
-	    && au_dcount(dentry) > 0
-	    && au_di(dentry)
-	    && (!arg->test || arg->test(dentry, arg->arg))) {
-		arg->err = au_dpages_append(arg->dpages, dentry, GFP_ATOMIC);
-		if (unlikely(arg->err))
-			ret = D_WALK_QUIT;
-	}
-
-	return ret;
-}
-
-int au_dcsub_pages(struct au_dcsub_pages *dpages, struct dentry *root,
-		   au_dpages_test test, void *arg)
-{
-	struct ac_dpages_arg args = {
-		.err	= 0,
-		.dpages	= dpages,
-		.sb	= root->d_sb,
-		.test	= test,
-		.arg	= arg
-	};
-
-	d_walk(root, &args, au_call_dpages_append, NULL);
-
-	return args.err;
-}
-
-int au_dcsub_pages_rev(struct au_dcsub_pages *dpages, struct dentry *dentry,
-		       int do_include, au_dpages_test test, void *arg)
-{
-	int err;
-
-	err = 0;
-	write_seqlock(&rename_lock);
-	spin_lock(&dentry->d_lock);
-	if (do_include
-	    && au_dcount(dentry) > 0
-	    && (!test || test(dentry, arg)))
-		err = au_dpages_append(dpages, dentry, GFP_ATOMIC);
-	spin_unlock(&dentry->d_lock);
-	if (unlikely(err))
-		goto out;
-
-	/*
-	 * RCU for vfsmount is unnecessary since this is a traverse in a single
-	 * mount
-	 */
-	while (!IS_ROOT(dentry)) {
-		dentry = dentry->d_parent; /* rename_lock is locked */
-		spin_lock(&dentry->d_lock);
-		if (au_dcount(dentry) > 0
-		    && (!test || test(dentry, arg)))
-			err = au_dpages_append(dpages, dentry, GFP_ATOMIC);
-		spin_unlock(&dentry->d_lock);
-		if (unlikely(err))
-			break;
-	}
-
-out:
-	write_sequnlock(&rename_lock);
-	return err;
-}
-
-static inline int au_dcsub_dpages_aufs(struct dentry *dentry, void *arg)
-{
-	return au_di(dentry) && dentry->d_sb == arg;
-}
-
-int au_dcsub_pages_rev_aufs(struct au_dcsub_pages *dpages,
-			    struct dentry *dentry, int do_include)
-{
-	return au_dcsub_pages_rev(dpages, dentry, do_include,
-				  au_dcsub_dpages_aufs, dentry->d_sb);
-}
-
-int au_test_subdir(struct dentry *d1, struct dentry *d2)
-{
-	struct path path[2] = {
-		{
-			.dentry = d1
-		},
-		{
-			.dentry = d2
-		}
-	};
-
-	return path_is_under(path + 0, path + 1);
-}
diff --git a/fs/aufs/dcsub.h b/fs/aufs/dcsub.h
deleted file mode 100644
index 5d2cf661d..000000000
--- a/fs/aufs/dcsub.h
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * sub-routines for dentry cache
- */
-
-#ifndef __AUFS_DCSUB_H__
-#define __AUFS_DCSUB_H__
-
-#ifdef __KERNEL__
-
-#include <linux/dcache.h>
-#include <linux/fs.h>
-
-struct au_dpage {
-	int ndentry;
-	struct dentry **dentries;
-};
-
-struct au_dcsub_pages {
-	int ndpage;
-	struct au_dpage *dpages;
-};
-
-/* ---------------------------------------------------------------------- */
-
-/* dcsub.c */
-int au_dpages_init(struct au_dcsub_pages *dpages, gfp_t gfp);
-void au_dpages_free(struct au_dcsub_pages *dpages);
-typedef int (*au_dpages_test)(struct dentry *dentry, void *arg);
-int au_dcsub_pages(struct au_dcsub_pages *dpages, struct dentry *root,
-		   au_dpages_test test, void *arg);
-int au_dcsub_pages_rev(struct au_dcsub_pages *dpages, struct dentry *dentry,
-		       int do_include, au_dpages_test test, void *arg);
-int au_dcsub_pages_rev_aufs(struct au_dcsub_pages *dpages,
-			    struct dentry *dentry, int do_include);
-int au_test_subdir(struct dentry *d1, struct dentry *d2);
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * todo: in linux-3.13, several similar (but faster) helpers are added to
- * include/linux/dcache.h. Try them (in the future).
- */
-
-static inline int au_d_hashed_positive(struct dentry *d)
-{
-	int err;
-	struct inode *inode = d_inode(d);
-
-	err = 0;
-	if (unlikely(d_unhashed(d)
-		     || d_is_negative(d)
-		     || !inode->i_nlink))
-		err = -ENOENT;
-	return err;
-}
-
-static inline int au_d_linkable(struct dentry *d)
-{
-	int err;
-	struct inode *inode = d_inode(d);
-
-	err = au_d_hashed_positive(d);
-	if (err
-	    && d_is_positive(d)
-	    && (inode->i_state & I_LINKABLE))
-		err = 0;
-	return err;
-}
-
-static inline int au_d_alive(struct dentry *d)
-{
-	int err;
-	struct inode *inode;
-
-	err = 0;
-	if (!IS_ROOT(d))
-		err = au_d_hashed_positive(d);
-	else {
-		inode = d_inode(d);
-		if (unlikely(d_unlinked(d)
-			     || d_is_negative(d)
-			     || !inode->i_nlink))
-			err = -ENOENT;
-	}
-	return err;
-}
-
-static inline int au_alive_dir(struct dentry *d)
-{
-	int err;
-
-	err = au_d_alive(d);
-	if (unlikely(err || IS_DEADDIR(d_inode(d))))
-		err = -ENOENT;
-	return err;
-}
-
-static inline int au_qstreq(struct qstr *a, struct qstr *b)
-{
-	return a->len == b->len
-		&& !memcmp(a->name, b->name, a->len);
-}
-
-/*
- * by the commit
- * 360f547 2015-01-25 dcache: let the dentry count go down to zero without
- *			taking d_lock
- * the type of d_lockref.count became int, but the inlined function d_count()
- * still returns unsigned int.
- * I don't know why. Maybe it is for every d_count() users?
- * Anyway au_dcount() lives on.
- */
-static inline int au_dcount(struct dentry *d)
-{
-	return (int)d_count(d);
-}
-
-#endif /* __KERNEL__ */
-#endif /* __AUFS_DCSUB_H__ */
diff --git a/fs/aufs/debug.c b/fs/aufs/debug.c
deleted file mode 100644
index 4529831a9..000000000
--- a/fs/aufs/debug.c
+++ /dev/null
@@ -1,425 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * debug print functions
- */
-
-#include "aufs.h"
-
-/* Returns 0, or -errno.  arg is in kp->arg. */
-static int param_atomic_t_set(const char *val, const struct kernel_param *kp)
-{
-	int err, n;
-
-	err = kstrtoint(val, 0, &n);
-	if (!err) {
-		if (n > 0)
-			au_debug_on();
-		else
-			au_debug_off();
-	}
-	return err;
-}
-
-/* Returns length written or -errno.  Buffer is 4k (ie. be short!) */
-static int param_atomic_t_get(char *buffer, const struct kernel_param *kp)
-{
-	atomic_t *a;
-
-	a = kp->arg;
-	return sprintf(buffer, "%d", atomic_read(a));
-}
-
-static struct kernel_param_ops param_ops_atomic_t = {
-	.set = param_atomic_t_set,
-	.get = param_atomic_t_get
-	/* void (*free)(void *arg) */
-};
-
-atomic_t aufs_debug = ATOMIC_INIT(0);
-MODULE_PARM_DESC(debug, "debug print");
-module_param_named(debug, aufs_debug, atomic_t, S_IRUGO | S_IWUSR | S_IWGRP);
-
-DEFINE_MUTEX(au_dbg_mtx);	/* just to serialize the dbg msgs */
-char *au_plevel = KERN_DEBUG;
-#define dpri(fmt, ...) do {					\
-	if ((au_plevel						\
-	     && strcmp(au_plevel, KERN_DEBUG))			\
-	    || au_debug_test())					\
-		printk("%s" fmt, au_plevel, ##__VA_ARGS__);	\
-} while (0)
-
-/* ---------------------------------------------------------------------- */
-
-void au_dpri_whlist(struct au_nhash *whlist)
-{
-	unsigned long ul, n;
-	struct hlist_head *head;
-	struct au_vdir_wh *pos;
-
-	n = whlist->nh_num;
-	head = whlist->nh_head;
-	for (ul = 0; ul < n; ul++) {
-		hlist_for_each_entry(pos, head, wh_hash)
-			dpri("b%d, %.*s, %d\n",
-			     pos->wh_bindex,
-			     pos->wh_str.len, pos->wh_str.name,
-			     pos->wh_str.len);
-		head++;
-	}
-}
-
-void au_dpri_vdir(struct au_vdir *vdir)
-{
-	unsigned long ul;
-	union au_vdir_deblk_p p;
-	unsigned char *o;
-
-	if (!vdir || IS_ERR(vdir)) {
-		dpri("err %ld\n", PTR_ERR(vdir));
-		return;
-	}
-
-	dpri("deblk %u, nblk %lu, deblk %p, last{%lu, %p}, ver %lu\n",
-	     vdir->vd_deblk_sz, vdir->vd_nblk, vdir->vd_deblk,
-	     vdir->vd_last.ul, vdir->vd_last.p.deblk, vdir->vd_version);
-	for (ul = 0; ul < vdir->vd_nblk; ul++) {
-		p.deblk = vdir->vd_deblk[ul];
-		o = p.deblk;
-		dpri("[%lu]: %p\n", ul, o);
-	}
-}
-
-static int do_pri_inode(aufs_bindex_t bindex, struct inode *inode, int hn,
-			struct dentry *wh)
-{
-	char *n = NULL;
-	int l = 0;
-
-	if (!inode || IS_ERR(inode)) {
-		dpri("i%d: err %ld\n", bindex, PTR_ERR(inode));
-		return -1;
-	}
-
-	/* the type of i_blocks depends upon CONFIG_LBDAF */
-	BUILD_BUG_ON(sizeof(inode->i_blocks) != sizeof(unsigned long)
-		     && sizeof(inode->i_blocks) != sizeof(u64));
-	if (wh) {
-		n = (void *)wh->d_name.name;
-		l = wh->d_name.len;
-	}
-
-	dpri("i%d: %p, i%lu, %s, cnt %d, nl %u, 0%o, sz %llu, blk %llu,"
-	     " hn %d, ct %lld, np %lu, st 0x%lx, f 0x%x, v %llu, g %x%s%.*s\n",
-	     bindex, inode,
-	     inode->i_ino, inode->i_sb ? au_sbtype(inode->i_sb) : "??",
-	     atomic_read(&inode->i_count), inode->i_nlink, inode->i_mode,
-	     i_size_read(inode), (unsigned long long)inode->i_blocks,
-	     hn, (long long)timespec_to_ns(&inode->i_ctime) & 0x0ffff,
-	     inode->i_mapping ? inode->i_mapping->nrpages : 0,
-	     inode->i_state, inode->i_flags, inode->i_version,
-	     inode->i_generation,
-	     l ? ", wh " : "", l, n);
-	return 0;
-}
-
-void au_dpri_inode(struct inode *inode)
-{
-	struct au_iinfo *iinfo;
-	aufs_bindex_t bindex;
-	int err, hn;
-
-	err = do_pri_inode(-1, inode, -1, NULL);
-	if (err || !au_test_aufs(inode->i_sb))
-		return;
-
-	iinfo = au_ii(inode);
-	if (!iinfo)
-		return;
-	dpri("i-1: bstart %d, bend %d, gen %d\n",
-	     iinfo->ii_bstart, iinfo->ii_bend, au_iigen(inode, NULL));
-	if (iinfo->ii_bstart < 0)
-		return;
-	hn = 0;
-	for (bindex = iinfo->ii_bstart; bindex <= iinfo->ii_bend; bindex++) {
-		hn = !!au_hn(iinfo->ii_hinode + bindex);
-		do_pri_inode(bindex, iinfo->ii_hinode[0 + bindex].hi_inode, hn,
-			     iinfo->ii_hinode[0 + bindex].hi_whdentry);
-	}
-}
-
-void au_dpri_dalias(struct inode *inode)
-{
-	struct dentry *d;
-
-	spin_lock(&inode->i_lock);
-	hlist_for_each_entry(d, &inode->i_dentry, d_u.d_alias)
-		au_dpri_dentry(d);
-	spin_unlock(&inode->i_lock);
-}
-
-static int do_pri_dentry(aufs_bindex_t bindex, struct dentry *dentry)
-{
-	struct dentry *wh = NULL;
-	int hn;
-	struct au_iinfo *iinfo;
-
-	if (!dentry || IS_ERR(dentry)) {
-		dpri("d%d: err %ld\n", bindex, PTR_ERR(dentry));
-		return -1;
-	}
-	/* do not call dget_parent() here */
-	/* note: access d_xxx without d_lock */
-	dpri("d%d: %p, %pd2?, %s, cnt %d, flags 0x%x, %shashed\n",
-	     bindex, dentry, dentry,
-	     dentry->d_sb ? au_sbtype(dentry->d_sb) : "??",
-	     au_dcount(dentry), dentry->d_flags,
-	     d_unhashed(dentry) ? "un" : "");
-	hn = -1;
-	if (bindex >= 0
-	    && d_is_positive(dentry)
-	    && au_test_aufs(dentry->d_sb)) {
-		iinfo = au_ii(d_inode(dentry));
-		if (iinfo) {
-			hn = !!au_hn(iinfo->ii_hinode + bindex);
-			wh = iinfo->ii_hinode[0 + bindex].hi_whdentry;
-		}
-	}
-	do_pri_inode(bindex, d_inode(dentry), hn, wh);
-	return 0;
-}
-
-void au_dpri_dentry(struct dentry *dentry)
-{
-	struct au_dinfo *dinfo;
-	aufs_bindex_t bindex;
-	int err;
-	struct au_hdentry *hdp;
-
-	err = do_pri_dentry(-1, dentry);
-	if (err || !au_test_aufs(dentry->d_sb))
-		return;
-
-	dinfo = au_di(dentry);
-	if (!dinfo)
-		return;
-	dpri("d-1: bstart %d, bend %d, bwh %d, bdiropq %d, gen %d, tmp %d\n",
-	     dinfo->di_bstart, dinfo->di_bend,
-	     dinfo->di_bwh, dinfo->di_bdiropq, au_digen(dentry),
-	     dinfo->di_tmpfile);
-	if (dinfo->di_bstart < 0)
-		return;
-	hdp = dinfo->di_hdentry;
-	for (bindex = dinfo->di_bstart; bindex <= dinfo->di_bend; bindex++)
-		do_pri_dentry(bindex, hdp[0 + bindex].hd_dentry);
-}
-
-static int do_pri_file(aufs_bindex_t bindex, struct file *file)
-{
-	char a[32];
-
-	if (!file || IS_ERR(file)) {
-		dpri("f%d: err %ld\n", bindex, PTR_ERR(file));
-		return -1;
-	}
-	a[0] = 0;
-	if (bindex < 0
-	    && !IS_ERR_OR_NULL(file->f_path.dentry)
-	    && au_test_aufs(file->f_path.dentry->d_sb)
-	    && au_fi(file))
-		snprintf(a, sizeof(a), ", gen %d, mmapped %d",
-			 au_figen(file), atomic_read(&au_fi(file)->fi_mmapped));
-	dpri("f%d: mode 0x%x, flags 0%o, cnt %ld, v %llu, pos %llu%s\n",
-	     bindex, file->f_mode, file->f_flags, (long)file_count(file),
-	     file->f_version, file->f_pos, a);
-	if (!IS_ERR_OR_NULL(file->f_path.dentry))
-		do_pri_dentry(bindex, file->f_path.dentry);
-	return 0;
-}
-
-void au_dpri_file(struct file *file)
-{
-	struct au_finfo *finfo;
-	struct au_fidir *fidir;
-	struct au_hfile *hfile;
-	aufs_bindex_t bindex;
-	int err;
-
-	err = do_pri_file(-1, file);
-	if (err
-	    || IS_ERR_OR_NULL(file->f_path.dentry)
-	    || !au_test_aufs(file->f_path.dentry->d_sb))
-		return;
-
-	finfo = au_fi(file);
-	if (!finfo)
-		return;
-	if (finfo->fi_btop < 0)
-		return;
-	fidir = finfo->fi_hdir;
-	if (!fidir)
-		do_pri_file(finfo->fi_btop, finfo->fi_htop.hf_file);
-	else
-		for (bindex = finfo->fi_btop;
-		     bindex >= 0 && bindex <= fidir->fd_bbot;
-		     bindex++) {
-			hfile = fidir->fd_hfile + bindex;
-			do_pri_file(bindex, hfile ? hfile->hf_file : NULL);
-		}
-}
-
-static int do_pri_br(aufs_bindex_t bindex, struct au_branch *br)
-{
-	struct vfsmount *mnt;
-	struct super_block *sb;
-
-	if (!br || IS_ERR(br))
-		goto out;
-	mnt = au_br_mnt(br);
-	if (!mnt || IS_ERR(mnt))
-		goto out;
-	sb = mnt->mnt_sb;
-	if (!sb || IS_ERR(sb))
-		goto out;
-
-	dpri("s%d: {perm 0x%x, id %d, cnt %d, wbr %p}, "
-	     "%s, dev 0x%02x%02x, flags 0x%lx, cnt %d, active %d, "
-	     "xino %d\n",
-	     bindex, br->br_perm, br->br_id, atomic_read(&br->br_count),
-	     br->br_wbr, au_sbtype(sb), MAJOR(sb->s_dev), MINOR(sb->s_dev),
-	     sb->s_flags, sb->s_count,
-	     atomic_read(&sb->s_active), !!br->br_xino.xi_file);
-	return 0;
-
-out:
-	dpri("s%d: err %ld\n", bindex, PTR_ERR(br));
-	return -1;
-}
-
-void au_dpri_sb(struct super_block *sb)
-{
-	struct au_sbinfo *sbinfo;
-	aufs_bindex_t bindex;
-	int err;
-	/* to reuduce stack size */
-	struct {
-		struct vfsmount mnt;
-		struct au_branch fake;
-	} *a;
-
-	/* this function can be called from magic sysrq */
-	a = kzalloc(sizeof(*a), GFP_ATOMIC);
-	if (unlikely(!a)) {
-		dpri("no memory\n");
-		return;
-	}
-
-	a->mnt.mnt_sb = sb;
-	a->fake.br_path.mnt = &a->mnt;
-	atomic_set(&a->fake.br_count, 0);
-	smp_mb(); /* atomic_set */
-	err = do_pri_br(-1, &a->fake);
-	kfree(a);
-	dpri("dev 0x%x\n", sb->s_dev);
-	if (err || !au_test_aufs(sb))
-		return;
-
-	sbinfo = au_sbi(sb);
-	if (!sbinfo)
-		return;
-	dpri("nw %d, gen %u, kobj %d\n",
-	     atomic_read(&sbinfo->si_nowait.nw_len), sbinfo->si_generation,
-	     atomic_read(&sbinfo->si_kobj.kref.refcount));
-	for (bindex = 0; bindex <= sbinfo->si_bend; bindex++)
-		do_pri_br(bindex, sbinfo->si_branch[0 + bindex]);
-}
-
-/* ---------------------------------------------------------------------- */
-
-void __au_dbg_verify_dinode(struct dentry *dentry, const char *func, int line)
-{
-	struct inode *h_inode, *inode = d_inode(dentry);
-	struct dentry *h_dentry;
-	aufs_bindex_t bindex, bend, bi;
-
-	if (!inode /* || au_di(dentry)->di_lsc == AuLsc_DI_TMP */)
-		return;
-
-	bend = au_dbend(dentry);
-	bi = au_ibend(inode);
-	if (bi < bend)
-		bend = bi;
-	bindex = au_dbstart(dentry);
-	bi = au_ibstart(inode);
-	if (bi > bindex)
-		bindex = bi;
-
-	for (; bindex <= bend; bindex++) {
-		h_dentry = au_h_dptr(dentry, bindex);
-		if (!h_dentry)
-			continue;
-		h_inode = au_h_iptr(inode, bindex);
-		if (unlikely(h_inode != d_inode(h_dentry))) {
-			au_debug_on();
-			AuDbg("b%d, %s:%d\n", bindex, func, line);
-			AuDbgDentry(dentry);
-			AuDbgInode(inode);
-			au_debug_off();
-			BUG();
-		}
-	}
-}
-
-void au_dbg_verify_gen(struct dentry *parent, unsigned int sigen)
-{
-	int err, i, j;
-	struct au_dcsub_pages dpages;
-	struct au_dpage *dpage;
-	struct dentry **dentries;
-
-	err = au_dpages_init(&dpages, GFP_NOFS);
-	AuDebugOn(err);
-	err = au_dcsub_pages_rev_aufs(&dpages, parent, /*do_include*/1);
-	AuDebugOn(err);
-	for (i = dpages.ndpage - 1; !err && i >= 0; i--) {
-		dpage = dpages.dpages + i;
-		dentries = dpage->dentries;
-		for (j = dpage->ndentry - 1; !err && j >= 0; j--)
-			AuDebugOn(au_digen_test(dentries[j], sigen));
-	}
-	au_dpages_free(&dpages);
-}
-
-void au_dbg_verify_kthread(void)
-{
-	if (au_wkq_test()) {
-		au_dbg_blocked();
-		/*
-		 * It may be recursive, but udba=notify between two aufs mounts,
-		 * where a single ro branch is shared, is not a problem.
-		 */
-		/* WARN_ON(1); */
-	}
-}
-
-/* ---------------------------------------------------------------------- */
-
-int __init au_debug_init(void)
-{
-	aufs_bindex_t bindex;
-	struct au_vdir_destr destr;
-
-	bindex = -1;
-	AuDebugOn(bindex >= 0);
-
-	destr.len = -1;
-	AuDebugOn(destr.len < NAME_MAX);
-
-#ifdef CONFIG_4KSTACKS
-	pr_warn("CONFIG_4KSTACKS is defined.\n");
-#endif
-
-	return 0;
-}
diff --git a/fs/aufs/debug.h b/fs/aufs/debug.h
deleted file mode 100644
index 0567f31d0..000000000
--- a/fs/aufs/debug.h
+++ /dev/null
@@ -1,212 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * debug print functions
- */
-
-#ifndef __AUFS_DEBUG_H__
-#define __AUFS_DEBUG_H__
-
-#ifdef __KERNEL__
-
-#include <linux/atomic.h>
-#include <linux/module.h>
-#include <linux/kallsyms.h>
-#include <linux/sysrq.h>
-
-#ifdef CONFIG_AUFS_DEBUG
-#define AuDebugOn(a)		BUG_ON(a)
-
-/* module parameter */
-extern atomic_t aufs_debug;
-static inline void au_debug_on(void)
-{
-	atomic_inc(&aufs_debug);
-}
-static inline void au_debug_off(void)
-{
-	atomic_dec_if_positive(&aufs_debug);
-}
-
-static inline int au_debug_test(void)
-{
-	return atomic_read(&aufs_debug) > 0;
-}
-#else
-#define AuDebugOn(a)		do {} while (0)
-AuStubVoid(au_debug_on, void)
-AuStubVoid(au_debug_off, void)
-AuStubInt0(au_debug_test, void)
-#endif /* CONFIG_AUFS_DEBUG */
-
-#define param_check_atomic_t(name, p) __param_check(name, p, atomic_t)
-
-/* ---------------------------------------------------------------------- */
-
-/* debug print */
-
-#define AuDbg(fmt, ...) do { \
-	if (au_debug_test()) \
-		pr_debug("DEBUG: " fmt, ##__VA_ARGS__); \
-} while (0)
-#define AuLabel(l)		AuDbg(#l "\n")
-#define AuIOErr(fmt, ...)	pr_err("I/O Error, " fmt, ##__VA_ARGS__)
-#define AuWarn1(fmt, ...) do { \
-	static unsigned char _c; \
-	if (!_c++) \
-		pr_warn(fmt, ##__VA_ARGS__); \
-} while (0)
-
-#define AuErr1(fmt, ...) do { \
-	static unsigned char _c; \
-	if (!_c++) \
-		pr_err(fmt, ##__VA_ARGS__); \
-} while (0)
-
-#define AuIOErr1(fmt, ...) do { \
-	static unsigned char _c; \
-	if (!_c++) \
-		AuIOErr(fmt, ##__VA_ARGS__); \
-} while (0)
-
-#define AuUnsupportMsg	"This operation is not supported." \
-			" Please report this application to aufs-users ML."
-#define AuUnsupport(fmt, ...) do { \
-	pr_err(AuUnsupportMsg "\n" fmt, ##__VA_ARGS__); \
-	dump_stack(); \
-} while (0)
-
-#define AuTraceErr(e) do { \
-	if (unlikely((e) < 0)) \
-		AuDbg("err %d\n", (int)(e)); \
-} while (0)
-
-#define AuTraceErrPtr(p) do { \
-	if (IS_ERR(p)) \
-		AuDbg("err %ld\n", PTR_ERR(p)); \
-} while (0)
-
-/* dirty macros for debug print, use with "%.*s" and caution */
-#define AuLNPair(qstr)		(qstr)->len, (qstr)->name
-
-/* ---------------------------------------------------------------------- */
-
-struct dentry;
-#ifdef CONFIG_AUFS_DEBUG
-extern struct mutex au_dbg_mtx;
-extern char *au_plevel;
-struct au_nhash;
-void au_dpri_whlist(struct au_nhash *whlist);
-struct au_vdir;
-void au_dpri_vdir(struct au_vdir *vdir);
-struct inode;
-void au_dpri_inode(struct inode *inode);
-void au_dpri_dalias(struct inode *inode);
-void au_dpri_dentry(struct dentry *dentry);
-struct file;
-void au_dpri_file(struct file *filp);
-struct super_block;
-void au_dpri_sb(struct super_block *sb);
-
-#define au_dbg_verify_dinode(d) __au_dbg_verify_dinode(d, __func__, __LINE__)
-void __au_dbg_verify_dinode(struct dentry *dentry, const char *func, int line);
-void au_dbg_verify_gen(struct dentry *parent, unsigned int sigen);
-void au_dbg_verify_kthread(void);
-
-int __init au_debug_init(void);
-
-#define AuDbgWhlist(w) do { \
-	mutex_lock(&au_dbg_mtx); \
-	AuDbg(#w "\n"); \
-	au_dpri_whlist(w); \
-	mutex_unlock(&au_dbg_mtx); \
-} while (0)
-
-#define AuDbgVdir(v) do { \
-	mutex_lock(&au_dbg_mtx); \
-	AuDbg(#v "\n"); \
-	au_dpri_vdir(v); \
-	mutex_unlock(&au_dbg_mtx); \
-} while (0)
-
-#define AuDbgInode(i) do { \
-	mutex_lock(&au_dbg_mtx); \
-	AuDbg(#i "\n"); \
-	au_dpri_inode(i); \
-	mutex_unlock(&au_dbg_mtx); \
-} while (0)
-
-#define AuDbgDAlias(i) do { \
-	mutex_lock(&au_dbg_mtx); \
-	AuDbg(#i "\n"); \
-	au_dpri_dalias(i); \
-	mutex_unlock(&au_dbg_mtx); \
-} while (0)
-
-#define AuDbgDentry(d) do { \
-	mutex_lock(&au_dbg_mtx); \
-	AuDbg(#d "\n"); \
-	au_dpri_dentry(d); \
-	mutex_unlock(&au_dbg_mtx); \
-} while (0)
-
-#define AuDbgFile(f) do { \
-	mutex_lock(&au_dbg_mtx); \
-	AuDbg(#f "\n"); \
-	au_dpri_file(f); \
-	mutex_unlock(&au_dbg_mtx); \
-} while (0)
-
-#define AuDbgSb(sb) do { \
-	mutex_lock(&au_dbg_mtx); \
-	AuDbg(#sb "\n"); \
-	au_dpri_sb(sb); \
-	mutex_unlock(&au_dbg_mtx); \
-} while (0)
-
-#define AuDbgSym(addr) do {				\
-	char sym[KSYM_SYMBOL_LEN];			\
-	sprint_symbol(sym, (unsigned long)addr);	\
-	AuDbg("%s\n", sym);				\
-} while (0)
-#else
-AuStubVoid(au_dbg_verify_dinode, struct dentry *dentry)
-AuStubVoid(au_dbg_verify_gen, struct dentry *parent, unsigned int sigen)
-AuStubVoid(au_dbg_verify_kthread, void)
-AuStubInt0(__init au_debug_init, void)
-
-#define AuDbgWhlist(w)		do {} while (0)
-#define AuDbgVdir(v)		do {} while (0)
-#define AuDbgInode(i)		do {} while (0)
-#define AuDbgDAlias(i)		do {} while (0)
-#define AuDbgDentry(d)		do {} while (0)
-#define AuDbgFile(f)		do {} while (0)
-#define AuDbgSb(sb)		do {} while (0)
-#define AuDbgSym(addr)		do {} while (0)
-#endif /* CONFIG_AUFS_DEBUG */
-
-/* ---------------------------------------------------------------------- */
-
-#ifdef CONFIG_AUFS_MAGIC_SYSRQ
-int __init au_sysrq_init(void);
-void au_sysrq_fin(void);
-
-#ifdef CONFIG_HW_CONSOLE
-#define au_dbg_blocked() do { \
-	WARN_ON(1); \
-	handle_sysrq('w'); \
-} while (0)
-#else
-AuStubVoid(au_dbg_blocked, void)
-#endif
-
-#else
-AuStubInt0(__init au_sysrq_init, void)
-AuStubVoid(au_sysrq_fin, void)
-AuStubVoid(au_dbg_blocked, void)
-#endif /* CONFIG_AUFS_MAGIC_SYSRQ */
-
-#endif /* __KERNEL__ */
-#endif /* __AUFS_DEBUG_H__ */
diff --git a/fs/aufs/dentry.c b/fs/aufs/dentry.c
deleted file mode 100644
index e47a7e6c4..000000000
--- a/fs/aufs/dentry.c
+++ /dev/null
@@ -1,1123 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * lookup and dentry operations
- */
-
-#include <linux/namei.h>
-#include "aufs.h"
-
-#define AuLkup_ALLOW_NEG	1
-#define AuLkup_IGNORE_PERM	(1 << 1)
-#define au_ftest_lkup(flags, name)	((flags) & AuLkup_##name)
-#define au_fset_lkup(flags, name) \
-	do { (flags) |= AuLkup_##name; } while (0)
-#define au_fclr_lkup(flags, name) \
-	do { (flags) &= ~AuLkup_##name; } while (0)
-
-struct au_do_lookup_args {
-	unsigned int		flags;
-	mode_t			type;
-};
-
-/*
- * returns positive/negative dentry, NULL or an error.
- * NULL means whiteout-ed or not-found.
- */
-static struct dentry*
-au_do_lookup(struct dentry *h_parent, struct dentry *dentry,
-	     aufs_bindex_t bindex, struct qstr *wh_name,
-	     struct au_do_lookup_args *args)
-{
-	struct dentry *h_dentry;
-	struct inode *h_inode;
-	struct au_branch *br;
-	int wh_found, opq;
-	unsigned char wh_able;
-	const unsigned char allow_neg = !!au_ftest_lkup(args->flags, ALLOW_NEG);
-	const unsigned char ignore_perm = !!au_ftest_lkup(args->flags,
-							  IGNORE_PERM);
-
-	wh_found = 0;
-	br = au_sbr(dentry->d_sb, bindex);
-	wh_able = !!au_br_whable(br->br_perm);
-	if (wh_able)
-		wh_found = au_wh_test(h_parent, wh_name, /*try_sio*/0);
-	h_dentry = ERR_PTR(wh_found);
-	if (!wh_found)
-		goto real_lookup;
-	if (unlikely(wh_found < 0))
-		goto out;
-
-	/* We found a whiteout */
-	/* au_set_dbend(dentry, bindex); */
-	au_set_dbwh(dentry, bindex);
-	if (!allow_neg)
-		return NULL; /* success */
-
-real_lookup:
-	if (!ignore_perm)
-		h_dentry = vfsub_lkup_one(&dentry->d_name, h_parent);
-	else
-		h_dentry = au_sio_lkup_one(&dentry->d_name, h_parent);
-	if (IS_ERR(h_dentry)) {
-		if (PTR_ERR(h_dentry) == -ENAMETOOLONG
-		    && !allow_neg)
-			h_dentry = NULL;
-		goto out;
-	}
-
-	h_inode = d_inode(h_dentry);
-	if (d_is_negative(h_dentry)) {
-		if (!allow_neg)
-			goto out_neg;
-	} else if (wh_found
-		   || (args->type && args->type != (h_inode->i_mode & S_IFMT)))
-		goto out_neg;
-
-	if (au_dbend(dentry) <= bindex)
-		au_set_dbend(dentry, bindex);
-	if (au_dbstart(dentry) < 0 || bindex < au_dbstart(dentry))
-		au_set_dbstart(dentry, bindex);
-	au_set_h_dptr(dentry, bindex, h_dentry);
-
-	if (!d_is_dir(h_dentry)
-	    || !wh_able
-	    || (d_really_is_positive(dentry) && !d_is_dir(dentry)))
-		goto out; /* success */
-
-	mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD);
-	opq = au_diropq_test(h_dentry);
-	mutex_unlock(&h_inode->i_mutex);
-	if (opq > 0)
-		au_set_dbdiropq(dentry, bindex);
-	else if (unlikely(opq < 0)) {
-		au_set_h_dptr(dentry, bindex, NULL);
-		h_dentry = ERR_PTR(opq);
-	}
-	goto out;
-
-out_neg:
-	dput(h_dentry);
-	h_dentry = NULL;
-out:
-	return h_dentry;
-}
-
-static int au_test_shwh(struct super_block *sb, const struct qstr *name)
-{
-	if (unlikely(!au_opt_test(au_mntflags(sb), SHWH)
-		     && !strncmp(name->name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)))
-		return -EPERM;
-	return 0;
-}
-
-/*
- * returns the number of lower positive dentries,
- * otherwise an error.
- * can be called at unlinking with @type is zero.
- */
-int au_lkup_dentry(struct dentry *dentry, aufs_bindex_t bstart, mode_t type)
-{
-	int npositive, err;
-	aufs_bindex_t bindex, btail, bdiropq;
-	unsigned char isdir, dirperm1;
-	struct qstr whname;
-	struct au_do_lookup_args args = {
-		.flags		= 0,
-		.type		= type
-	};
-	const struct qstr *name = &dentry->d_name;
-	struct dentry *parent;
-	struct super_block *sb;
-
-	sb = dentry->d_sb;
-	err = au_test_shwh(sb, name);
-	if (unlikely(err))
-		goto out;
-
-	err = au_wh_name_alloc(&whname, name);
-	if (unlikely(err))
-		goto out;
-
-	isdir = !!d_is_dir(dentry);
-	if (!type)
-		au_fset_lkup(args.flags, ALLOW_NEG);
-	dirperm1 = !!au_opt_test(au_mntflags(sb), DIRPERM1);
-
-	npositive = 0;
-	parent = dget_parent(dentry);
-	btail = au_dbtaildir(parent);
-	for (bindex = bstart; bindex <= btail; bindex++) {
-		struct dentry *h_parent, *h_dentry;
-		struct inode *h_inode, *h_dir;
-
-		h_dentry = au_h_dptr(dentry, bindex);
-		if (h_dentry) {
-			if (d_is_positive(h_dentry))
-				npositive++;
-			if (type != S_IFDIR)
-				break;
-			continue;
-		}
-		h_parent = au_h_dptr(parent, bindex);
-		if (!h_parent || !d_is_dir(h_parent))
-			continue;
-
-		h_dir = d_inode(h_parent);
-		mutex_lock_nested(&h_dir->i_mutex, AuLsc_I_PARENT);
-		h_dentry = au_do_lookup(h_parent, dentry, bindex, &whname,
-					&args);
-		mutex_unlock(&h_dir->i_mutex);
-		err = PTR_ERR(h_dentry);
-		if (IS_ERR(h_dentry))
-			goto out_parent;
-		if (h_dentry)
-			au_fclr_lkup(args.flags, ALLOW_NEG);
-		if (dirperm1)
-			au_fset_lkup(args.flags, IGNORE_PERM);
-
-		if (au_dbwh(dentry) == bindex)
-			break;
-		if (!h_dentry)
-			continue;
-		if (d_is_negative(h_dentry))
-			continue;
-		h_inode = d_inode(h_dentry);
-		npositive++;
-		if (!args.type)
-			args.type = h_inode->i_mode & S_IFMT;
-		if (args.type != S_IFDIR)
-			break;
-		else if (isdir) {
-			/* the type of lower may be different */
-			bdiropq = au_dbdiropq(dentry);
-			if (bdiropq >= 0 && bdiropq <= bindex)
-				break;
-		}
-	}
-
-	if (npositive) {
-		AuLabel(positive);
-		au_update_dbstart(dentry);
-	}
-	err = npositive;
-	if (unlikely(!au_opt_test(au_mntflags(sb), UDBA_NONE)
-		     && au_dbstart(dentry) < 0)) {
-		err = -EIO;
-		AuIOErr("both of real entry and whiteout found, %pd, err %d\n",
-			dentry, err);
-	}
-
-out_parent:
-	dput(parent);
-	kfree(whname.name);
-out:
-	return err;
-}
-
-struct dentry *au_sio_lkup_one(struct qstr *name, struct dentry *parent)
-{
-	struct dentry *dentry;
-	int wkq_err;
-
-	if (!au_test_h_perm_sio(d_inode(parent), MAY_EXEC))
-		dentry = vfsub_lkup_one(name, parent);
-	else {
-		struct vfsub_lkup_one_args args = {
-			.errp	= &dentry,
-			.name	= name,
-			.parent	= parent
-		};
-
-		wkq_err = au_wkq_wait(vfsub_call_lkup_one, &args);
-		if (unlikely(wkq_err))
-			dentry = ERR_PTR(wkq_err);
-	}
-
-	return dentry;
-}
-
-/*
- * lookup @dentry on @bindex which should be negative.
- */
-int au_lkup_neg(struct dentry *dentry, aufs_bindex_t bindex, int wh)
-{
-	int err;
-	struct dentry *parent, *h_parent, *h_dentry;
-	struct au_branch *br;
-
-	parent = dget_parent(dentry);
-	h_parent = au_h_dptr(parent, bindex);
-	br = au_sbr(dentry->d_sb, bindex);
-	if (wh)
-		h_dentry = au_whtmp_lkup(h_parent, br, &dentry->d_name);
-	else
-		h_dentry = au_sio_lkup_one(&dentry->d_name, h_parent);
-	err = PTR_ERR(h_dentry);
-	if (IS_ERR(h_dentry))
-		goto out;
-	if (unlikely(d_is_positive(h_dentry))) {
-		err = -EIO;
-		AuIOErr("%pd should be negative on b%d.\n", h_dentry, bindex);
-		dput(h_dentry);
-		goto out;
-	}
-
-	err = 0;
-	if (bindex < au_dbstart(dentry))
-		au_set_dbstart(dentry, bindex);
-	if (au_dbend(dentry) < bindex)
-		au_set_dbend(dentry, bindex);
-	au_set_h_dptr(dentry, bindex, h_dentry);
-
-out:
-	dput(parent);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* subset of struct inode */
-struct au_iattr {
-	unsigned long		i_ino;
-	/* unsigned int		i_nlink; */
-	kuid_t			i_uid;
-	kgid_t			i_gid;
-	u64			i_version;
-/*
-	loff_t			i_size;
-	blkcnt_t		i_blocks;
-*/
-	umode_t			i_mode;
-};
-
-static void au_iattr_save(struct au_iattr *ia, struct inode *h_inode)
-{
-	ia->i_ino = h_inode->i_ino;
-	/* ia->i_nlink = h_inode->i_nlink; */
-	ia->i_uid = h_inode->i_uid;
-	ia->i_gid = h_inode->i_gid;
-	ia->i_version = h_inode->i_version;
-/*
-	ia->i_size = h_inode->i_size;
-	ia->i_blocks = h_inode->i_blocks;
-*/
-	ia->i_mode = (h_inode->i_mode & S_IFMT);
-}
-
-static int au_iattr_test(struct au_iattr *ia, struct inode *h_inode)
-{
-	return ia->i_ino != h_inode->i_ino
-		/* || ia->i_nlink != h_inode->i_nlink */
-		|| !uid_eq(ia->i_uid, h_inode->i_uid)
-		|| !gid_eq(ia->i_gid, h_inode->i_gid)
-		|| ia->i_version != h_inode->i_version
-/*
-		|| ia->i_size != h_inode->i_size
-		|| ia->i_blocks != h_inode->i_blocks
-*/
-		|| ia->i_mode != (h_inode->i_mode & S_IFMT);
-}
-
-static int au_h_verify_dentry(struct dentry *h_dentry, struct dentry *h_parent,
-			      struct au_branch *br)
-{
-	int err;
-	struct au_iattr ia;
-	struct inode *h_inode;
-	struct dentry *h_d;
-	struct super_block *h_sb;
-
-	err = 0;
-	memset(&ia, -1, sizeof(ia));
-	h_sb = h_dentry->d_sb;
-	h_inode = NULL;
-	if (d_is_positive(h_dentry)) {
-		h_inode = d_inode(h_dentry);
-		au_iattr_save(&ia, h_inode);
-	} else if (au_test_nfs(h_sb) || au_test_fuse(h_sb))
-		/* nfs d_revalidate may return 0 for negative dentry */
-		/* fuse d_revalidate always return 0 for negative dentry */
-		goto out;
-
-	/* main purpose is namei.c:cached_lookup() and d_revalidate */
-	h_d = vfsub_lkup_one(&h_dentry->d_name, h_parent);
-	err = PTR_ERR(h_d);
-	if (IS_ERR(h_d))
-		goto out;
-
-	err = 0;
-	if (unlikely(h_d != h_dentry
-		     || d_inode(h_d) != h_inode
-		     || (h_inode && au_iattr_test(&ia, h_inode))))
-		err = au_busy_or_stale();
-	dput(h_d);
-
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-int au_h_verify(struct dentry *h_dentry, unsigned int udba, struct inode *h_dir,
-		struct dentry *h_parent, struct au_branch *br)
-{
-	int err;
-
-	err = 0;
-	if (udba == AuOpt_UDBA_REVAL
-	    && !au_test_fs_remote(h_dentry->d_sb)) {
-		IMustLock(h_dir);
-		err = (d_inode(h_dentry->d_parent) != h_dir);
-	} else if (udba != AuOpt_UDBA_NONE)
-		err = au_h_verify_dentry(h_dentry, h_parent, br);
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static int au_do_refresh_hdentry(struct dentry *dentry, struct dentry *parent)
-{
-	int err;
-	aufs_bindex_t new_bindex, bindex, bend, bwh, bdiropq;
-	struct au_hdentry tmp, *p, *q;
-	struct au_dinfo *dinfo;
-	struct super_block *sb;
-
-	DiMustWriteLock(dentry);
-
-	sb = dentry->d_sb;
-	dinfo = au_di(dentry);
-	bend = dinfo->di_bend;
-	bwh = dinfo->di_bwh;
-	bdiropq = dinfo->di_bdiropq;
-	p = dinfo->di_hdentry + dinfo->di_bstart;
-	for (bindex = dinfo->di_bstart; bindex <= bend; bindex++, p++) {
-		if (!p->hd_dentry)
-			continue;
-
-		new_bindex = au_br_index(sb, p->hd_id);
-		if (new_bindex == bindex)
-			continue;
-
-		if (dinfo->di_bwh == bindex)
-			bwh = new_bindex;
-		if (dinfo->di_bdiropq == bindex)
-			bdiropq = new_bindex;
-		if (new_bindex < 0) {
-			au_hdput(p);
-			p->hd_dentry = NULL;
-			continue;
-		}
-
-		/* swap two lower dentries, and loop again */
-		q = dinfo->di_hdentry + new_bindex;
-		tmp = *q;
-		*q = *p;
-		*p = tmp;
-		if (tmp.hd_dentry) {
-			bindex--;
-			p--;
-		}
-	}
-
-	dinfo->di_bwh = -1;
-	if (bwh >= 0 && bwh <= au_sbend(sb) && au_sbr_whable(sb, bwh))
-		dinfo->di_bwh = bwh;
-
-	dinfo->di_bdiropq = -1;
-	if (bdiropq >= 0
-	    && bdiropq <= au_sbend(sb)
-	    && au_sbr_whable(sb, bdiropq))
-		dinfo->di_bdiropq = bdiropq;
-
-	err = -EIO;
-	dinfo->di_bstart = -1;
-	dinfo->di_bend = -1;
-	bend = au_dbend(parent);
-	p = dinfo->di_hdentry;
-	for (bindex = 0; bindex <= bend; bindex++, p++)
-		if (p->hd_dentry) {
-			dinfo->di_bstart = bindex;
-			break;
-		}
-
-	if (dinfo->di_bstart >= 0) {
-		p = dinfo->di_hdentry + bend;
-		for (bindex = bend; bindex >= 0; bindex--, p--)
-			if (p->hd_dentry) {
-				dinfo->di_bend = bindex;
-				err = 0;
-				break;
-			}
-	}
-
-	return err;
-}
-
-static void au_do_hide(struct dentry *dentry)
-{
-	struct inode *inode;
-
-	if (d_really_is_positive(dentry)) {
-		inode = d_inode(dentry);
-		if (!d_is_dir(dentry)) {
-			if (inode->i_nlink && !d_unhashed(dentry))
-				drop_nlink(inode);
-		} else {
-			clear_nlink(inode);
-			/* stop next lookup */
-			inode->i_flags |= S_DEAD;
-		}
-		smp_mb(); /* necessary? */
-	}
-	d_drop(dentry);
-}
-
-static int au_hide_children(struct dentry *parent)
-{
-	int err, i, j, ndentry;
-	struct au_dcsub_pages dpages;
-	struct au_dpage *dpage;
-	struct dentry *dentry;
-
-	err = au_dpages_init(&dpages, GFP_NOFS);
-	if (unlikely(err))
-		goto out;
-	err = au_dcsub_pages(&dpages, parent, NULL, NULL);
-	if (unlikely(err))
-		goto out_dpages;
-
-	/* in reverse order */
-	for (i = dpages.ndpage - 1; i >= 0; i--) {
-		dpage = dpages.dpages + i;
-		ndentry = dpage->ndentry;
-		for (j = ndentry - 1; j >= 0; j--) {
-			dentry = dpage->dentries[j];
-			if (dentry != parent)
-				au_do_hide(dentry);
-		}
-	}
-
-out_dpages:
-	au_dpages_free(&dpages);
-out:
-	return err;
-}
-
-static void au_hide(struct dentry *dentry)
-{
-	int err;
-
-	AuDbgDentry(dentry);
-	if (d_is_dir(dentry)) {
-		/* shrink_dcache_parent(dentry); */
-		err = au_hide_children(dentry);
-		if (unlikely(err))
-			AuIOErr("%pd, failed hiding children, ignored %d\n",
-				dentry, err);
-	}
-	au_do_hide(dentry);
-}
-
-/*
- * By adding a dirty branch, a cached dentry may be affected in various ways.
- *
- * a dirty branch is added
- * - on the top of layers
- * - in the middle of layers
- * - to the bottom of layers
- *
- * on the added branch there exists
- * - a whiteout
- * - a diropq
- * - a same named entry
- *   + exist
- *     * negative --> positive
- *     * positive --> positive
- *	 - type is unchanged
- *	 - type is changed
- *   + doesn't exist
- *     * negative --> negative
- *     * positive --> negative (rejected by au_br_del() for non-dir case)
- * - none
- */
-static int au_refresh_by_dinfo(struct dentry *dentry, struct au_dinfo *dinfo,
-			       struct au_dinfo *tmp)
-{
-	int err;
-	aufs_bindex_t bindex, bend;
-	struct {
-		struct dentry *dentry;
-		struct inode *inode;
-		mode_t mode;
-	} orig_h, tmp_h = {
-		.dentry = NULL
-	};
-	struct au_hdentry *hd;
-	struct inode *inode, *h_inode;
-	struct dentry *h_dentry;
-
-	err = 0;
-	AuDebugOn(dinfo->di_bstart < 0);
-	orig_h.mode = 0;
-	orig_h.dentry = dinfo->di_hdentry[dinfo->di_bstart].hd_dentry;
-	orig_h.inode = NULL;
-	if (d_is_positive(orig_h.dentry)) {
-		orig_h.inode = d_inode(orig_h.dentry);
-		orig_h.mode = orig_h.inode->i_mode & S_IFMT;
-	}
-	if (tmp->di_bstart >= 0) {
-		tmp_h.dentry = tmp->di_hdentry[tmp->di_bstart].hd_dentry;
-		if (d_is_positive(tmp_h.dentry)) {
-			tmp_h.inode = d_inode(tmp_h.dentry);
-			tmp_h.mode = tmp_h.inode->i_mode & S_IFMT;
-		}
-	}
-
-	inode = NULL;
-	if (d_really_is_positive(dentry))
-		inode = d_inode(dentry);
-	if (!orig_h.inode) {
-		AuDbg("nagative originally\n");
-		if (inode) {
-			au_hide(dentry);
-			goto out;
-		}
-		AuDebugOn(inode);
-		AuDebugOn(dinfo->di_bstart != dinfo->di_bend);
-		AuDebugOn(dinfo->di_bdiropq != -1);
-
-		if (!tmp_h.inode) {
-			AuDbg("negative --> negative\n");
-			/* should have only one negative lower */
-			if (tmp->di_bstart >= 0
-			    && tmp->di_bstart < dinfo->di_bstart) {
-				AuDebugOn(tmp->di_bstart != tmp->di_bend);
-				AuDebugOn(dinfo->di_bstart != dinfo->di_bend);
-				au_set_h_dptr(dentry, dinfo->di_bstart, NULL);
-				au_di_cp(dinfo, tmp);
-				hd = tmp->di_hdentry + tmp->di_bstart;
-				au_set_h_dptr(dentry, tmp->di_bstart,
-					      dget(hd->hd_dentry));
-			}
-			au_dbg_verify_dinode(dentry);
-		} else {
-			AuDbg("negative --> positive\n");
-			/*
-			 * similar to the behaviour of creating with bypassing
-			 * aufs.
-			 * unhash it in order to force an error in the
-			 * succeeding create operation.
-			 * we should not set S_DEAD here.
-			 */
-			d_drop(dentry);
-			/* au_di_swap(tmp, dinfo); */
-			au_dbg_verify_dinode(dentry);
-		}
-	} else {
-		AuDbg("positive originally\n");
-		/* inode may be NULL */
-		AuDebugOn(inode && (inode->i_mode & S_IFMT) != orig_h.mode);
-		if (!tmp_h.inode) {
-			AuDbg("positive --> negative\n");
-			/* or bypassing aufs */
-			au_hide(dentry);
-			if (tmp->di_bwh >= 0 && tmp->di_bwh <= dinfo->di_bstart)
-				dinfo->di_bwh = tmp->di_bwh;
-			if (inode)
-				err = au_refresh_hinode_self(inode);
-			au_dbg_verify_dinode(dentry);
-		} else if (orig_h.mode == tmp_h.mode) {
-			AuDbg("positive --> positive, same type\n");
-			if (!S_ISDIR(orig_h.mode)
-			    && dinfo->di_bstart > tmp->di_bstart) {
-				/*
-				 * similar to the behaviour of removing and
-				 * creating.
-				 */
-				au_hide(dentry);
-				if (inode)
-					err = au_refresh_hinode_self(inode);
-				au_dbg_verify_dinode(dentry);
-			} else {
-				/* fill empty slots */
-				if (dinfo->di_bstart > tmp->di_bstart)
-					dinfo->di_bstart = tmp->di_bstart;
-				if (dinfo->di_bend < tmp->di_bend)
-					dinfo->di_bend = tmp->di_bend;
-				dinfo->di_bwh = tmp->di_bwh;
-				dinfo->di_bdiropq = tmp->di_bdiropq;
-				hd = tmp->di_hdentry;
-				bend = dinfo->di_bend;
-				for (bindex = tmp->di_bstart; bindex <= bend;
-				     bindex++) {
-					if (au_h_dptr(dentry, bindex))
-						continue;
-					h_dentry = hd[bindex].hd_dentry;
-					if (!h_dentry)
-						continue;
-					AuDebugOn(d_is_negative(h_dentry));
-					h_inode = d_inode(h_dentry);
-					AuDebugOn(orig_h.mode
-						  != (h_inode->i_mode
-						      & S_IFMT));
-					au_set_h_dptr(dentry, bindex,
-						      dget(h_dentry));
-				}
-				err = au_refresh_hinode(inode, dentry);
-				au_dbg_verify_dinode(dentry);
-			}
-		} else {
-			AuDbg("positive --> positive, different type\n");
-			/* similar to the behaviour of removing and creating */
-			au_hide(dentry);
-			if (inode)
-				err = au_refresh_hinode_self(inode);
-			au_dbg_verify_dinode(dentry);
-		}
-	}
-
-out:
-	return err;
-}
-
-void au_refresh_dop(struct dentry *dentry, int force_reval)
-{
-	const struct dentry_operations *dop
-		= force_reval ? &aufs_dop : dentry->d_sb->s_d_op;
-	static const unsigned int mask
-		= DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE;
-
-	BUILD_BUG_ON(sizeof(mask) != sizeof(dentry->d_flags));
-
-	if (dentry->d_op == dop)
-		return;
-
-	AuDbg("%pd\n", dentry);
-	spin_lock(&dentry->d_lock);
-	if (dop == &aufs_dop)
-		dentry->d_flags |= mask;
-	else
-		dentry->d_flags &= ~mask;
-	dentry->d_op = dop;
-	spin_unlock(&dentry->d_lock);
-}
-
-int au_refresh_dentry(struct dentry *dentry, struct dentry *parent)
-{
-	int err, ebrange;
-	unsigned int sigen;
-	struct au_dinfo *dinfo, *tmp;
-	struct super_block *sb;
-	struct inode *inode;
-
-	DiMustWriteLock(dentry);
-	AuDebugOn(IS_ROOT(dentry));
-	AuDebugOn(d_really_is_negative(parent));
-
-	sb = dentry->d_sb;
-	sigen = au_sigen(sb);
-	err = au_digen_test(parent, sigen);
-	if (unlikely(err))
-		goto out;
-
-	dinfo = au_di(dentry);
-	err = au_di_realloc(dinfo, au_sbend(sb) + 1);
-	if (unlikely(err))
-		goto out;
-	ebrange = au_dbrange_test(dentry);
-	if (!ebrange)
-		ebrange = au_do_refresh_hdentry(dentry, parent);
-
-	if (d_unhashed(dentry) || ebrange /* || dinfo->di_tmpfile */) {
-		AuDebugOn(au_dbstart(dentry) < 0 && au_dbend(dentry) >= 0);
-		if (d_really_is_positive(dentry)) {
-			inode = d_inode(dentry);
-			err = au_refresh_hinode_self(inode);
-		}
-		au_dbg_verify_dinode(dentry);
-		if (!err)
-			goto out_dgen; /* success */
-		goto out;
-	}
-
-	/* temporary dinfo */
-	AuDbgDentry(dentry);
-	err = -ENOMEM;
-	tmp = au_di_alloc(sb, AuLsc_DI_TMP);
-	if (unlikely(!tmp))
-		goto out;
-	au_di_swap(tmp, dinfo);
-	/* returns the number of positive dentries */
-	/*
-	 * if current working dir is removed, it returns an error.
-	 * but the dentry is legal.
-	 */
-	err = au_lkup_dentry(dentry, /*bstart*/0, /*type*/0);
-	AuDbgDentry(dentry);
-	au_di_swap(tmp, dinfo);
-	if (err == -ENOENT)
-		err = 0;
-	if (err >= 0) {
-		/* compare/refresh by dinfo */
-		AuDbgDentry(dentry);
-		err = au_refresh_by_dinfo(dentry, dinfo, tmp);
-		au_dbg_verify_dinode(dentry);
-		AuTraceErr(err);
-	}
-	au_rw_write_unlock(&tmp->di_rwsem);
-	au_di_free(tmp);
-	if (unlikely(err))
-		goto out;
-
-out_dgen:
-	au_update_digen(dentry);
-out:
-	if (unlikely(err && !(dentry->d_flags & DCACHE_NFSFS_RENAMED))) {
-		AuIOErr("failed refreshing %pd, %d\n", dentry, err);
-		AuDbgDentry(dentry);
-	}
-	AuTraceErr(err);
-	return err;
-}
-
-static int au_do_h_d_reval(struct dentry *h_dentry, unsigned int flags,
-			   struct dentry *dentry, aufs_bindex_t bindex)
-{
-	int err, valid;
-
-	err = 0;
-	if (!(h_dentry->d_flags & DCACHE_OP_REVALIDATE))
-		goto out;
-
-	AuDbg("b%d\n", bindex);
-	/*
-	 * gave up supporting LOOKUP_CREATE/OPEN for lower fs,
-	 * due to whiteout and branch permission.
-	 */
-	flags &= ~(/*LOOKUP_PARENT |*/ LOOKUP_OPEN | LOOKUP_CREATE
-		   | LOOKUP_FOLLOW | LOOKUP_EXCL);
-	/* it may return tri-state */
-	valid = h_dentry->d_op->d_revalidate(h_dentry, flags);
-
-	if (unlikely(valid < 0))
-		err = valid;
-	else if (!valid)
-		err = -EINVAL;
-
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-/* todo: remove this */
-static int h_d_revalidate(struct dentry *dentry, struct inode *inode,
-			  unsigned int flags, int do_udba)
-{
-	int err;
-	umode_t mode, h_mode;
-	aufs_bindex_t bindex, btail, bstart, ibs, ibe;
-	unsigned char plus, unhashed, is_root, h_plus, h_nfs, tmpfile;
-	struct inode *h_inode, *h_cached_inode;
-	struct dentry *h_dentry;
-	struct qstr *name, *h_name;
-
-	err = 0;
-	plus = 0;
-	mode = 0;
-	ibs = -1;
-	ibe = -1;
-	unhashed = !!d_unhashed(dentry);
-	is_root = !!IS_ROOT(dentry);
-	name = &dentry->d_name;
-	tmpfile = au_di(dentry)->di_tmpfile;
-
-	/*
-	 * Theoretically, REVAL test should be unnecessary in case of
-	 * {FS,I}NOTIFY.
-	 * But {fs,i}notify doesn't fire some necessary events,
-	 *	IN_ATTRIB for atime/nlink/pageio
-	 * Let's do REVAL test too.
-	 */
-	if (do_udba && inode) {
-		mode = (inode->i_mode & S_IFMT);
-		plus = (inode->i_nlink > 0);
-		ibs = au_ibstart(inode);
-		ibe = au_ibend(inode);
-	}
-
-	bstart = au_dbstart(dentry);
-	btail = bstart;
-	if (inode && S_ISDIR(inode->i_mode))
-		btail = au_dbtaildir(dentry);
-	for (bindex = bstart; bindex <= btail; bindex++) {
-		h_dentry = au_h_dptr(dentry, bindex);
-		if (!h_dentry)
-			continue;
-
-		AuDbg("b%d, %pd\n", bindex, h_dentry);
-		h_nfs = !!au_test_nfs(h_dentry->d_sb);
-		spin_lock(&h_dentry->d_lock);
-		h_name = &h_dentry->d_name;
-		if (unlikely(do_udba
-			     && !is_root
-			     && ((!h_nfs
-				  && (unhashed != !!d_unhashed(h_dentry)
-				      || (!tmpfile
-					  && !au_qstreq(name, h_name))
-					  ))
-				 || (h_nfs
-				     && !(flags & LOOKUP_OPEN)
-				     && (h_dentry->d_flags
-					 & DCACHE_NFSFS_RENAMED)))
-			    )) {
-			int h_unhashed;
-
-			h_unhashed = d_unhashed(h_dentry);
-			spin_unlock(&h_dentry->d_lock);
-			AuDbg("unhash 0x%x 0x%x, %pd %pd\n",
-			      unhashed, h_unhashed, dentry, h_dentry);
-			goto err;
-		}
-		spin_unlock(&h_dentry->d_lock);
-
-		err = au_do_h_d_reval(h_dentry, flags, dentry, bindex);
-		if (unlikely(err))
-			/* do not goto err, to keep the errno */
-			break;
-
-		/* todo: plink too? */
-		if (!do_udba)
-			continue;
-
-		/* UDBA tests */
-		if (unlikely(!!inode != d_is_positive(h_dentry)))
-			goto err;
-
-		h_inode = NULL;
-		if (d_is_positive(h_dentry))
-			h_inode = d_inode(h_dentry);
-		h_plus = plus;
-		h_mode = mode;
-		h_cached_inode = h_inode;
-		if (h_inode) {
-			h_mode = (h_inode->i_mode & S_IFMT);
-			h_plus = (h_inode->i_nlink > 0);
-		}
-		if (inode && ibs <= bindex && bindex <= ibe)
-			h_cached_inode = au_h_iptr(inode, bindex);
-
-		if (!h_nfs) {
-			if (unlikely(plus != h_plus && !tmpfile))
-				goto err;
-		} else {
-			if (unlikely(!(h_dentry->d_flags & DCACHE_NFSFS_RENAMED)
-				     && !is_root
-				     && !IS_ROOT(h_dentry)
-				     && unhashed != d_unhashed(h_dentry)))
-				goto err;
-		}
-		if (unlikely(mode != h_mode
-			     || h_cached_inode != h_inode))
-			goto err;
-		continue;
-
-err:
-		err = -EINVAL;
-		break;
-	}
-
-	AuTraceErr(err);
-	return err;
-}
-
-/* todo: consolidate with do_refresh() and au_reval_for_attr() */
-static int simple_reval_dpath(struct dentry *dentry, unsigned int sigen)
-{
-	int err;
-	struct dentry *parent;
-
-	if (!au_digen_test(dentry, sigen))
-		return 0;
-
-	parent = dget_parent(dentry);
-	di_read_lock_parent(parent, AuLock_IR);
-	AuDebugOn(au_digen_test(parent, sigen));
-	au_dbg_verify_gen(parent, sigen);
-	err = au_refresh_dentry(dentry, parent);
-	di_read_unlock(parent, AuLock_IR);
-	dput(parent);
-	AuTraceErr(err);
-	return err;
-}
-
-int au_reval_dpath(struct dentry *dentry, unsigned int sigen)
-{
-	int err;
-	struct dentry *d, *parent;
-
-	if (!au_ftest_si(au_sbi(dentry->d_sb), FAILED_REFRESH_DIR))
-		return simple_reval_dpath(dentry, sigen);
-
-	/* slow loop, keep it simple and stupid */
-	/* cf: au_cpup_dirs() */
-	err = 0;
-	parent = NULL;
-	while (au_digen_test(dentry, sigen)) {
-		d = dentry;
-		while (1) {
-			dput(parent);
-			parent = dget_parent(d);
-			if (!au_digen_test(parent, sigen))
-				break;
-			d = parent;
-		}
-
-		if (d != dentry)
-			di_write_lock_child2(d);
-
-		/* someone might update our dentry while we were sleeping */
-		if (au_digen_test(d, sigen)) {
-			/*
-			 * todo: consolidate with simple_reval_dpath(),
-			 * do_refresh() and au_reval_for_attr().
-			 */
-			di_read_lock_parent(parent, AuLock_IR);
-			err = au_refresh_dentry(d, parent);
-			di_read_unlock(parent, AuLock_IR);
-		}
-
-		if (d != dentry)
-			di_write_unlock(d);
-		dput(parent);
-		if (unlikely(err))
-			break;
-	}
-
-	return err;
-}
-
-/*
- * if valid returns 1, otherwise 0.
- */
-static int aufs_d_revalidate(struct dentry *dentry, unsigned int flags)
-{
-	int valid, err;
-	unsigned int sigen;
-	unsigned char do_udba;
-	struct super_block *sb;
-	struct inode *inode;
-
-	/* todo: support rcu-walk? */
-	if (flags & LOOKUP_RCU)
-		return -ECHILD;
-
-	valid = 0;
-	if (unlikely(!au_di(dentry)))
-		goto out;
-
-	valid = 1;
-	sb = dentry->d_sb;
-	/*
-	 * todo: very ugly
-	 * i_mutex of parent dir may be held,
-	 * but we should not return 'invalid' due to busy.
-	 */
-	err = aufs_read_lock(dentry, AuLock_FLUSH | AuLock_DW | AuLock_NOPLM);
-	if (unlikely(err)) {
-		valid = err;
-		AuTraceErr(err);
-		goto out;
-	}
-	inode = NULL;
-	if (d_really_is_positive(dentry))
-		inode = d_inode(dentry);
-	if (unlikely(inode && is_bad_inode(inode))) {
-		err = -EINVAL;
-		AuTraceErr(err);
-		goto out_dgrade;
-	}
-	if (unlikely(au_dbrange_test(dentry))) {
-		err = -EINVAL;
-		AuTraceErr(err);
-		goto out_dgrade;
-	}
-
-	sigen = au_sigen(sb);
-	if (au_digen_test(dentry, sigen)) {
-		AuDebugOn(IS_ROOT(dentry));
-		err = au_reval_dpath(dentry, sigen);
-		if (unlikely(err)) {
-			AuTraceErr(err);
-			goto out_dgrade;
-		}
-	}
-	di_downgrade_lock(dentry, AuLock_IR);
-
-	err = -EINVAL;
-	if (!(flags & (LOOKUP_OPEN | LOOKUP_EMPTY))
-	    && inode
-	    && !(inode->i_state && I_LINKABLE)
-	    && (IS_DEADDIR(inode) || !inode->i_nlink)) {
-		AuTraceErr(err);
-		goto out_inval;
-	}
-
-	do_udba = !au_opt_test(au_mntflags(sb), UDBA_NONE);
-	if (do_udba && inode) {
-		aufs_bindex_t bstart = au_ibstart(inode);
-		struct inode *h_inode;
-
-		if (bstart >= 0) {
-			h_inode = au_h_iptr(inode, bstart);
-			if (h_inode && au_test_higen(inode, h_inode)) {
-				AuTraceErr(err);
-				goto out_inval;
-			}
-		}
-	}
-
-	err = h_d_revalidate(dentry, inode, flags, do_udba);
-	if (unlikely(!err && do_udba && au_dbstart(dentry) < 0)) {
-		err = -EIO;
-		AuDbg("both of real entry and whiteout found, %p, err %d\n",
-		      dentry, err);
-	}
-	goto out_inval;
-
-out_dgrade:
-	di_downgrade_lock(dentry, AuLock_IR);
-out_inval:
-	aufs_read_unlock(dentry, AuLock_IR);
-	AuTraceErr(err);
-	valid = !err;
-out:
-	if (!valid) {
-		AuDbg("%pd invalid, %d\n", dentry, valid);
-		d_drop(dentry);
-	}
-	return valid;
-}
-
-static void aufs_d_release(struct dentry *dentry)
-{
-	if (au_di(dentry)) {
-		au_di_fin(dentry);
-		au_hn_di_reinit(dentry);
-	}
-}
-
-const struct dentry_operations aufs_dop = {
-	.d_revalidate		= aufs_d_revalidate,
-	.d_weak_revalidate	= aufs_d_revalidate,
-	.d_release		= aufs_d_release
-};
-
-/* aufs_dop without d_revalidate */
-const struct dentry_operations aufs_dop_noreval = {
-	.d_release		= aufs_d_release
-};
diff --git a/fs/aufs/dentry.h b/fs/aufs/dentry.h
deleted file mode 100644
index c794adf59..000000000
--- a/fs/aufs/dentry.h
+++ /dev/null
@@ -1,221 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * lookup and dentry operations
- */
-
-#ifndef __AUFS_DENTRY_H__
-#define __AUFS_DENTRY_H__
-
-#ifdef __KERNEL__
-
-#include <linux/dcache.h>
-#include "rwsem.h"
-
-struct au_hdentry {
-	struct dentry		*hd_dentry;
-	aufs_bindex_t		hd_id;
-};
-
-struct au_dinfo {
-	atomic_t		di_generation;
-
-	struct au_rwsem		di_rwsem;
-	aufs_bindex_t		di_bstart, di_bend, di_bwh, di_bdiropq;
-	unsigned char		di_tmpfile; /* to allow the different name */
-	struct au_hdentry	*di_hdentry;
-} ____cacheline_aligned_in_smp;
-
-/* ---------------------------------------------------------------------- */
-
-/* dentry.c */
-extern const struct dentry_operations aufs_dop, aufs_dop_noreval;
-struct au_branch;
-struct dentry *au_sio_lkup_one(struct qstr *name, struct dentry *parent);
-int au_h_verify(struct dentry *h_dentry, unsigned int udba, struct inode *h_dir,
-		struct dentry *h_parent, struct au_branch *br);
-
-int au_lkup_dentry(struct dentry *dentry, aufs_bindex_t bstart, mode_t type);
-int au_lkup_neg(struct dentry *dentry, aufs_bindex_t bindex, int wh);
-int au_refresh_dentry(struct dentry *dentry, struct dentry *parent);
-int au_reval_dpath(struct dentry *dentry, unsigned int sigen);
-void au_refresh_dop(struct dentry *dentry, int force_reval);
-
-/* dinfo.c */
-void au_di_init_once(void *_di);
-struct au_dinfo *au_di_alloc(struct super_block *sb, unsigned int lsc);
-void au_di_free(struct au_dinfo *dinfo);
-void au_di_swap(struct au_dinfo *a, struct au_dinfo *b);
-void au_di_cp(struct au_dinfo *dst, struct au_dinfo *src);
-int au_di_init(struct dentry *dentry);
-void au_di_fin(struct dentry *dentry);
-int au_di_realloc(struct au_dinfo *dinfo, int nbr);
-
-void di_read_lock(struct dentry *d, int flags, unsigned int lsc);
-void di_read_unlock(struct dentry *d, int flags);
-void di_downgrade_lock(struct dentry *d, int flags);
-void di_write_lock(struct dentry *d, unsigned int lsc);
-void di_write_unlock(struct dentry *d);
-void di_write_lock2_child(struct dentry *d1, struct dentry *d2, int isdir);
-void di_write_lock2_parent(struct dentry *d1, struct dentry *d2, int isdir);
-void di_write_unlock2(struct dentry *d1, struct dentry *d2);
-
-struct dentry *au_h_dptr(struct dentry *dentry, aufs_bindex_t bindex);
-struct dentry *au_h_d_alias(struct dentry *dentry, aufs_bindex_t bindex);
-aufs_bindex_t au_dbtail(struct dentry *dentry);
-aufs_bindex_t au_dbtaildir(struct dentry *dentry);
-
-void au_set_h_dptr(struct dentry *dentry, aufs_bindex_t bindex,
-		   struct dentry *h_dentry);
-int au_digen_test(struct dentry *dentry, unsigned int sigen);
-int au_dbrange_test(struct dentry *dentry);
-void au_update_digen(struct dentry *dentry);
-void au_update_dbrange(struct dentry *dentry, int do_put_zero);
-void au_update_dbstart(struct dentry *dentry);
-void au_update_dbend(struct dentry *dentry);
-int au_find_dbindex(struct dentry *dentry, struct dentry *h_dentry);
-
-/* ---------------------------------------------------------------------- */
-
-static inline struct au_dinfo *au_di(struct dentry *dentry)
-{
-	return dentry->d_fsdata;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* lock subclass for dinfo */
-enum {
-	AuLsc_DI_CHILD,		/* child first */
-	AuLsc_DI_CHILD2,	/* rename(2), link(2), and cpup at hnotify */
-	AuLsc_DI_CHILD3,	/* copyup dirs */
-	AuLsc_DI_PARENT,
-	AuLsc_DI_PARENT2,
-	AuLsc_DI_PARENT3,
-	AuLsc_DI_TMP		/* temp for replacing dinfo */
-};
-
-/*
- * di_read_lock_child, di_write_lock_child,
- * di_read_lock_child2, di_write_lock_child2,
- * di_read_lock_child3, di_write_lock_child3,
- * di_read_lock_parent, di_write_lock_parent,
- * di_read_lock_parent2, di_write_lock_parent2,
- * di_read_lock_parent3, di_write_lock_parent3,
- */
-#define AuReadLockFunc(name, lsc) \
-static inline void di_read_lock_##name(struct dentry *d, int flags) \
-{ di_read_lock(d, flags, AuLsc_DI_##lsc); }
-
-#define AuWriteLockFunc(name, lsc) \
-static inline void di_write_lock_##name(struct dentry *d) \
-{ di_write_lock(d, AuLsc_DI_##lsc); }
-
-#define AuRWLockFuncs(name, lsc) \
-	AuReadLockFunc(name, lsc) \
-	AuWriteLockFunc(name, lsc)
-
-AuRWLockFuncs(child, CHILD);
-AuRWLockFuncs(child2, CHILD2);
-AuRWLockFuncs(child3, CHILD3);
-AuRWLockFuncs(parent, PARENT);
-AuRWLockFuncs(parent2, PARENT2);
-AuRWLockFuncs(parent3, PARENT3);
-
-#undef AuReadLockFunc
-#undef AuWriteLockFunc
-#undef AuRWLockFuncs
-
-#define DiMustNoWaiters(d)	AuRwMustNoWaiters(&au_di(d)->di_rwsem)
-#define DiMustAnyLock(d)	AuRwMustAnyLock(&au_di(d)->di_rwsem)
-#define DiMustWriteLock(d)	AuRwMustWriteLock(&au_di(d)->di_rwsem)
-
-/* ---------------------------------------------------------------------- */
-
-/* todo: memory barrier? */
-static inline unsigned int au_digen(struct dentry *d)
-{
-	return atomic_read(&au_di(d)->di_generation);
-}
-
-static inline void au_h_dentry_init(struct au_hdentry *hdentry)
-{
-	hdentry->hd_dentry = NULL;
-}
-
-static inline void au_hdput(struct au_hdentry *hd)
-{
-	if (hd)
-		dput(hd->hd_dentry);
-}
-
-static inline aufs_bindex_t au_dbstart(struct dentry *dentry)
-{
-	DiMustAnyLock(dentry);
-	return au_di(dentry)->di_bstart;
-}
-
-static inline aufs_bindex_t au_dbend(struct dentry *dentry)
-{
-	DiMustAnyLock(dentry);
-	return au_di(dentry)->di_bend;
-}
-
-static inline aufs_bindex_t au_dbwh(struct dentry *dentry)
-{
-	DiMustAnyLock(dentry);
-	return au_di(dentry)->di_bwh;
-}
-
-static inline aufs_bindex_t au_dbdiropq(struct dentry *dentry)
-{
-	DiMustAnyLock(dentry);
-	return au_di(dentry)->di_bdiropq;
-}
-
-/* todo: hard/soft set? */
-static inline void au_set_dbstart(struct dentry *dentry, aufs_bindex_t bindex)
-{
-	DiMustWriteLock(dentry);
-	au_di(dentry)->di_bstart = bindex;
-}
-
-static inline void au_set_dbend(struct dentry *dentry, aufs_bindex_t bindex)
-{
-	DiMustWriteLock(dentry);
-	au_di(dentry)->di_bend = bindex;
-}
-
-static inline void au_set_dbwh(struct dentry *dentry, aufs_bindex_t bindex)
-{
-	DiMustWriteLock(dentry);
-	/* dbwh can be outside of bstart - bend range */
-	au_di(dentry)->di_bwh = bindex;
-}
-
-static inline void au_set_dbdiropq(struct dentry *dentry, aufs_bindex_t bindex)
-{
-	DiMustWriteLock(dentry);
-	au_di(dentry)->di_bdiropq = bindex;
-}
-
-/* ---------------------------------------------------------------------- */
-
-#ifdef CONFIG_AUFS_HNOTIFY
-static inline void au_digen_dec(struct dentry *d)
-{
-	atomic_dec(&au_di(d)->di_generation);
-}
-
-static inline void au_hn_di_reinit(struct dentry *dentry)
-{
-	dentry->d_fsdata = NULL;
-}
-#else
-AuStubVoid(au_hn_di_reinit, struct dentry *dentry __maybe_unused)
-#endif /* CONFIG_AUFS_HNOTIFY */
-
-#endif /* __KERNEL__ */
-#endif /* __AUFS_DENTRY_H__ */
diff --git a/fs/aufs/dinfo.c b/fs/aufs/dinfo.c
deleted file mode 100644
index ad6d045c4..000000000
--- a/fs/aufs/dinfo.c
+++ /dev/null
@@ -1,537 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * dentry private data
- */
-
-#include "aufs.h"
-
-void au_di_init_once(void *_dinfo)
-{
-	struct au_dinfo *dinfo = _dinfo;
-	static struct lock_class_key aufs_di;
-
-	au_rw_init(&dinfo->di_rwsem);
-	au_rw_class(&dinfo->di_rwsem, &aufs_di);
-}
-
-struct au_dinfo *au_di_alloc(struct super_block *sb, unsigned int lsc)
-{
-	struct au_dinfo *dinfo;
-	int nbr, i;
-
-	dinfo = au_cache_alloc_dinfo();
-	if (unlikely(!dinfo))
-		goto out;
-
-	nbr = au_sbend(sb) + 1;
-	if (nbr <= 0)
-		nbr = 1;
-	dinfo->di_hdentry = kcalloc(nbr, sizeof(*dinfo->di_hdentry), GFP_NOFS);
-	if (dinfo->di_hdentry) {
-		au_rw_write_lock_nested(&dinfo->di_rwsem, lsc);
-		dinfo->di_bstart = -1;
-		dinfo->di_bend = -1;
-		dinfo->di_bwh = -1;
-		dinfo->di_bdiropq = -1;
-		dinfo->di_tmpfile = 0;
-		for (i = 0; i < nbr; i++)
-			dinfo->di_hdentry[i].hd_id = -1;
-		goto out;
-	}
-
-	au_cache_free_dinfo(dinfo);
-	dinfo = NULL;
-
-out:
-	return dinfo;
-}
-
-void au_di_free(struct au_dinfo *dinfo)
-{
-	struct au_hdentry *p;
-	aufs_bindex_t bend, bindex;
-
-	/* dentry may not be revalidated */
-	bindex = dinfo->di_bstart;
-	if (bindex >= 0) {
-		bend = dinfo->di_bend;
-		p = dinfo->di_hdentry + bindex;
-		while (bindex++ <= bend)
-			au_hdput(p++);
-	}
-	kfree(dinfo->di_hdentry);
-	au_cache_free_dinfo(dinfo);
-}
-
-void au_di_swap(struct au_dinfo *a, struct au_dinfo *b)
-{
-	struct au_hdentry *p;
-	aufs_bindex_t bi;
-
-	AuRwMustWriteLock(&a->di_rwsem);
-	AuRwMustWriteLock(&b->di_rwsem);
-
-#define DiSwap(v, name)				\
-	do {					\
-		v = a->di_##name;		\
-		a->di_##name = b->di_##name;	\
-		b->di_##name = v;		\
-	} while (0)
-
-	DiSwap(p, hdentry);
-	DiSwap(bi, bstart);
-	DiSwap(bi, bend);
-	DiSwap(bi, bwh);
-	DiSwap(bi, bdiropq);
-	/* smp_mb(); */
-
-#undef DiSwap
-}
-
-void au_di_cp(struct au_dinfo *dst, struct au_dinfo *src)
-{
-	AuRwMustWriteLock(&dst->di_rwsem);
-	AuRwMustWriteLock(&src->di_rwsem);
-
-	dst->di_bstart = src->di_bstart;
-	dst->di_bend = src->di_bend;
-	dst->di_bwh = src->di_bwh;
-	dst->di_bdiropq = src->di_bdiropq;
-	/* smp_mb(); */
-}
-
-int au_di_init(struct dentry *dentry)
-{
-	int err;
-	struct super_block *sb;
-	struct au_dinfo *dinfo;
-
-	err = 0;
-	sb = dentry->d_sb;
-	dinfo = au_di_alloc(sb, AuLsc_DI_CHILD);
-	if (dinfo) {
-		atomic_set(&dinfo->di_generation, au_sigen(sb));
-		/* smp_mb(); */ /* atomic_set */
-		dentry->d_fsdata = dinfo;
-	} else
-		err = -ENOMEM;
-
-	return err;
-}
-
-void au_di_fin(struct dentry *dentry)
-{
-	struct au_dinfo *dinfo;
-
-	dinfo = au_di(dentry);
-	AuRwDestroy(&dinfo->di_rwsem);
-	au_di_free(dinfo);
-}
-
-int au_di_realloc(struct au_dinfo *dinfo, int nbr)
-{
-	int err, sz;
-	struct au_hdentry *hdp;
-
-	AuRwMustWriteLock(&dinfo->di_rwsem);
-
-	err = -ENOMEM;
-	sz = sizeof(*hdp) * (dinfo->di_bend + 1);
-	if (!sz)
-		sz = sizeof(*hdp);
-	hdp = au_kzrealloc(dinfo->di_hdentry, sz, sizeof(*hdp) * nbr, GFP_NOFS);
-	if (hdp) {
-		dinfo->di_hdentry = hdp;
-		err = 0;
-	}
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static void do_ii_write_lock(struct inode *inode, unsigned int lsc)
-{
-	switch (lsc) {
-	case AuLsc_DI_CHILD:
-		ii_write_lock_child(inode);
-		break;
-	case AuLsc_DI_CHILD2:
-		ii_write_lock_child2(inode);
-		break;
-	case AuLsc_DI_CHILD3:
-		ii_write_lock_child3(inode);
-		break;
-	case AuLsc_DI_PARENT:
-		ii_write_lock_parent(inode);
-		break;
-	case AuLsc_DI_PARENT2:
-		ii_write_lock_parent2(inode);
-		break;
-	case AuLsc_DI_PARENT3:
-		ii_write_lock_parent3(inode);
-		break;
-	default:
-		BUG();
-	}
-}
-
-static void do_ii_read_lock(struct inode *inode, unsigned int lsc)
-{
-	switch (lsc) {
-	case AuLsc_DI_CHILD:
-		ii_read_lock_child(inode);
-		break;
-	case AuLsc_DI_CHILD2:
-		ii_read_lock_child2(inode);
-		break;
-	case AuLsc_DI_CHILD3:
-		ii_read_lock_child3(inode);
-		break;
-	case AuLsc_DI_PARENT:
-		ii_read_lock_parent(inode);
-		break;
-	case AuLsc_DI_PARENT2:
-		ii_read_lock_parent2(inode);
-		break;
-	case AuLsc_DI_PARENT3:
-		ii_read_lock_parent3(inode);
-		break;
-	default:
-		BUG();
-	}
-}
-
-void di_read_lock(struct dentry *d, int flags, unsigned int lsc)
-{
-	struct inode *inode;
-
-	au_rw_read_lock_nested(&au_di(d)->di_rwsem, lsc);
-	if (d_really_is_positive(d)) {
-		inode = d_inode(d);
-		if (au_ftest_lock(flags, IW))
-			do_ii_write_lock(inode, lsc);
-		else if (au_ftest_lock(flags, IR))
-			do_ii_read_lock(inode, lsc);
-	}
-}
-
-void di_read_unlock(struct dentry *d, int flags)
-{
-	struct inode *inode;
-
-	if (d_really_is_positive(d)) {
-		inode = d_inode(d);
-		if (au_ftest_lock(flags, IW)) {
-			au_dbg_verify_dinode(d);
-			ii_write_unlock(inode);
-		} else if (au_ftest_lock(flags, IR)) {
-			au_dbg_verify_dinode(d);
-			ii_read_unlock(inode);
-		}
-	}
-	au_rw_read_unlock(&au_di(d)->di_rwsem);
-}
-
-void di_downgrade_lock(struct dentry *d, int flags)
-{
-	if (d_really_is_positive(d) && au_ftest_lock(flags, IR))
-		ii_downgrade_lock(d_inode(d));
-	au_rw_dgrade_lock(&au_di(d)->di_rwsem);
-}
-
-void di_write_lock(struct dentry *d, unsigned int lsc)
-{
-	au_rw_write_lock_nested(&au_di(d)->di_rwsem, lsc);
-	if (d_really_is_positive(d))
-		do_ii_write_lock(d_inode(d), lsc);
-}
-
-void di_write_unlock(struct dentry *d)
-{
-	au_dbg_verify_dinode(d);
-	if (d_really_is_positive(d))
-		ii_write_unlock(d_inode(d));
-	au_rw_write_unlock(&au_di(d)->di_rwsem);
-}
-
-void di_write_lock2_child(struct dentry *d1, struct dentry *d2, int isdir)
-{
-	AuDebugOn(d1 == d2
-		  || d_inode(d1) == d_inode(d2)
-		  || d1->d_sb != d2->d_sb);
-
-	if (isdir && au_test_subdir(d1, d2)) {
-		di_write_lock_child(d1);
-		di_write_lock_child2(d2);
-	} else {
-		/* there should be no races */
-		di_write_lock_child(d2);
-		di_write_lock_child2(d1);
-	}
-}
-
-void di_write_lock2_parent(struct dentry *d1, struct dentry *d2, int isdir)
-{
-	AuDebugOn(d1 == d2
-		  || d_inode(d1) == d_inode(d2)
-		  || d1->d_sb != d2->d_sb);
-
-	if (isdir && au_test_subdir(d1, d2)) {
-		di_write_lock_parent(d1);
-		di_write_lock_parent2(d2);
-	} else {
-		/* there should be no races */
-		di_write_lock_parent(d2);
-		di_write_lock_parent2(d1);
-	}
-}
-
-void di_write_unlock2(struct dentry *d1, struct dentry *d2)
-{
-	di_write_unlock(d1);
-	if (d_inode(d1) == d_inode(d2))
-		au_rw_write_unlock(&au_di(d2)->di_rwsem);
-	else
-		di_write_unlock(d2);
-}
-
-/* ---------------------------------------------------------------------- */
-
-struct dentry *au_h_dptr(struct dentry *dentry, aufs_bindex_t bindex)
-{
-	struct dentry *d;
-
-	DiMustAnyLock(dentry);
-
-	if (au_dbstart(dentry) < 0 || bindex < au_dbstart(dentry))
-		return NULL;
-	AuDebugOn(bindex < 0);
-	d = au_di(dentry)->di_hdentry[0 + bindex].hd_dentry;
-	AuDebugOn(d && au_dcount(d) <= 0);
-	return d;
-}
-
-/*
- * extended version of au_h_dptr().
- * returns a hashed and positive (or linkable) h_dentry in bindex, NULL, or
- * error.
- */
-struct dentry *au_h_d_alias(struct dentry *dentry, aufs_bindex_t bindex)
-{
-	struct dentry *h_dentry;
-	struct inode *inode, *h_inode;
-
-	AuDebugOn(d_really_is_negative(dentry));
-
-	h_dentry = NULL;
-	if (au_dbstart(dentry) <= bindex
-	    && bindex <= au_dbend(dentry))
-		h_dentry = au_h_dptr(dentry, bindex);
-	if (h_dentry && !au_d_linkable(h_dentry)) {
-		dget(h_dentry);
-		goto out; /* success */
-	}
-
-	inode = d_inode(dentry);
-	AuDebugOn(bindex < au_ibstart(inode));
-	AuDebugOn(au_ibend(inode) < bindex);
-	h_inode = au_h_iptr(inode, bindex);
-	h_dentry = d_find_alias(h_inode);
-	if (h_dentry) {
-		if (!IS_ERR(h_dentry)) {
-			if (!au_d_linkable(h_dentry))
-				goto out; /* success */
-			dput(h_dentry);
-		} else
-			goto out;
-	}
-
-	if (au_opt_test(au_mntflags(dentry->d_sb), PLINK)) {
-		h_dentry = au_plink_lkup(inode, bindex);
-		AuDebugOn(!h_dentry);
-		if (!IS_ERR(h_dentry)) {
-			if (!au_d_hashed_positive(h_dentry))
-				goto out; /* success */
-			dput(h_dentry);
-			h_dentry = NULL;
-		}
-	}
-
-out:
-	AuDbgDentry(h_dentry);
-	return h_dentry;
-}
-
-aufs_bindex_t au_dbtail(struct dentry *dentry)
-{
-	aufs_bindex_t bend, bwh;
-
-	bend = au_dbend(dentry);
-	if (0 <= bend) {
-		bwh = au_dbwh(dentry);
-		if (!bwh)
-			return bwh;
-		if (0 < bwh && bwh < bend)
-			return bwh - 1;
-	}
-	return bend;
-}
-
-aufs_bindex_t au_dbtaildir(struct dentry *dentry)
-{
-	aufs_bindex_t bend, bopq;
-
-	bend = au_dbtail(dentry);
-	if (0 <= bend) {
-		bopq = au_dbdiropq(dentry);
-		if (0 <= bopq && bopq < bend)
-			bend = bopq;
-	}
-	return bend;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void au_set_h_dptr(struct dentry *dentry, aufs_bindex_t bindex,
-		   struct dentry *h_dentry)
-{
-	struct au_hdentry *hd = au_di(dentry)->di_hdentry + bindex;
-	struct au_branch *br;
-
-	DiMustWriteLock(dentry);
-
-	au_hdput(hd);
-	hd->hd_dentry = h_dentry;
-	if (h_dentry) {
-		br = au_sbr(dentry->d_sb, bindex);
-		hd->hd_id = br->br_id;
-	}
-}
-
-int au_dbrange_test(struct dentry *dentry)
-{
-	int err;
-	aufs_bindex_t bstart, bend;
-
-	err = 0;
-	bstart = au_dbstart(dentry);
-	bend = au_dbend(dentry);
-	if (bstart >= 0)
-		AuDebugOn(bend < 0 && bstart > bend);
-	else {
-		err = -EIO;
-		AuDebugOn(bend >= 0);
-	}
-
-	return err;
-}
-
-int au_digen_test(struct dentry *dentry, unsigned int sigen)
-{
-	int err;
-
-	err = 0;
-	if (unlikely(au_digen(dentry) != sigen
-		     || au_iigen_test(d_inode(dentry), sigen)))
-		err = -EIO;
-
-	return err;
-}
-
-void au_update_digen(struct dentry *dentry)
-{
-	atomic_set(&au_di(dentry)->di_generation, au_sigen(dentry->d_sb));
-	/* smp_mb(); */ /* atomic_set */
-}
-
-void au_update_dbrange(struct dentry *dentry, int do_put_zero)
-{
-	struct au_dinfo *dinfo;
-	struct dentry *h_d;
-	struct au_hdentry *hdp;
-
-	DiMustWriteLock(dentry);
-
-	dinfo = au_di(dentry);
-	if (!dinfo || dinfo->di_bstart < 0)
-		return;
-
-	hdp = dinfo->di_hdentry;
-	if (do_put_zero) {
-		aufs_bindex_t bindex, bend;
-
-		bend = dinfo->di_bend;
-		for (bindex = dinfo->di_bstart; bindex <= bend; bindex++) {
-			h_d = hdp[0 + bindex].hd_dentry;
-			if (h_d && d_is_negative(h_d))
-				au_set_h_dptr(dentry, bindex, NULL);
-		}
-	}
-
-	dinfo->di_bstart = -1;
-	while (++dinfo->di_bstart <= dinfo->di_bend)
-		if (hdp[0 + dinfo->di_bstart].hd_dentry)
-			break;
-	if (dinfo->di_bstart > dinfo->di_bend) {
-		dinfo->di_bstart = -1;
-		dinfo->di_bend = -1;
-		return;
-	}
-
-	dinfo->di_bend++;
-	while (0 <= --dinfo->di_bend)
-		if (hdp[0 + dinfo->di_bend].hd_dentry)
-			break;
-	AuDebugOn(dinfo->di_bstart > dinfo->di_bend || dinfo->di_bend < 0);
-}
-
-void au_update_dbstart(struct dentry *dentry)
-{
-	aufs_bindex_t bindex, bend;
-	struct dentry *h_dentry;
-
-	bend = au_dbend(dentry);
-	for (bindex = au_dbstart(dentry); bindex <= bend; bindex++) {
-		h_dentry = au_h_dptr(dentry, bindex);
-		if (!h_dentry)
-			continue;
-		if (d_is_positive(h_dentry)) {
-			au_set_dbstart(dentry, bindex);
-			return;
-		}
-		au_set_h_dptr(dentry, bindex, NULL);
-	}
-}
-
-void au_update_dbend(struct dentry *dentry)
-{
-	aufs_bindex_t bindex, bstart;
-	struct dentry *h_dentry;
-
-	bstart = au_dbstart(dentry);
-	for (bindex = au_dbend(dentry); bindex >= bstart; bindex--) {
-		h_dentry = au_h_dptr(dentry, bindex);
-		if (!h_dentry)
-			continue;
-		if (d_is_positive(h_dentry)) {
-			au_set_dbend(dentry, bindex);
-			return;
-		}
-		au_set_h_dptr(dentry, bindex, NULL);
-	}
-}
-
-int au_find_dbindex(struct dentry *dentry, struct dentry *h_dentry)
-{
-	aufs_bindex_t bindex, bend;
-
-	bend = au_dbend(dentry);
-	for (bindex = au_dbstart(dentry); bindex <= bend; bindex++)
-		if (au_h_dptr(dentry, bindex) == h_dentry)
-			return bindex;
-	return -1;
-}
diff --git a/fs/aufs/dir.c b/fs/aufs/dir.c
deleted file mode 100644
index a994e0862..000000000
--- a/fs/aufs/dir.c
+++ /dev/null
@@ -1,745 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * directory operations
- */
-
-#include <linux/fs_stack.h>
-#include "aufs.h"
-
-void au_add_nlink(struct inode *dir, struct inode *h_dir)
-{
-	unsigned int nlink;
-
-	AuDebugOn(!S_ISDIR(dir->i_mode) || !S_ISDIR(h_dir->i_mode));
-
-	nlink = dir->i_nlink;
-	nlink += h_dir->i_nlink - 2;
-	if (h_dir->i_nlink < 2)
-		nlink += 2;
-	smp_mb(); /* for i_nlink */
-	/* 0 can happen in revaliding */
-	set_nlink(dir, nlink);
-}
-
-void au_sub_nlink(struct inode *dir, struct inode *h_dir)
-{
-	unsigned int nlink;
-
-	AuDebugOn(!S_ISDIR(dir->i_mode) || !S_ISDIR(h_dir->i_mode));
-
-	nlink = dir->i_nlink;
-	nlink -= h_dir->i_nlink - 2;
-	if (h_dir->i_nlink < 2)
-		nlink -= 2;
-	smp_mb(); /* for i_nlink */
-	/* nlink == 0 means the branch-fs is broken */
-	set_nlink(dir, nlink);
-}
-
-loff_t au_dir_size(struct file *file, struct dentry *dentry)
-{
-	loff_t sz;
-	aufs_bindex_t bindex, bend;
-	struct file *h_file;
-	struct dentry *h_dentry;
-
-	sz = 0;
-	if (file) {
-		AuDebugOn(!d_is_dir(file->f_path.dentry));
-
-		bend = au_fbend_dir(file);
-		for (bindex = au_fbstart(file);
-		     bindex <= bend && sz < KMALLOC_MAX_SIZE;
-		     bindex++) {
-			h_file = au_hf_dir(file, bindex);
-			if (h_file && file_inode(h_file))
-				sz += vfsub_f_size_read(h_file);
-		}
-	} else {
-		AuDebugOn(!dentry);
-		AuDebugOn(!d_is_dir(dentry));
-
-		bend = au_dbtaildir(dentry);
-		for (bindex = au_dbstart(dentry);
-		     bindex <= bend && sz < KMALLOC_MAX_SIZE;
-		     bindex++) {
-			h_dentry = au_h_dptr(dentry, bindex);
-			if (h_dentry && d_is_positive(h_dentry))
-				sz += i_size_read(d_inode(h_dentry));
-		}
-	}
-	if (sz < KMALLOC_MAX_SIZE)
-		sz = roundup_pow_of_two(sz);
-	if (sz > KMALLOC_MAX_SIZE)
-		sz = KMALLOC_MAX_SIZE;
-	else if (sz < NAME_MAX) {
-		BUILD_BUG_ON(AUFS_RDBLK_DEF < NAME_MAX);
-		sz = AUFS_RDBLK_DEF;
-	}
-	return sz;
-}
-
-struct au_dir_ts_arg {
-	struct dentry *dentry;
-	aufs_bindex_t brid;
-};
-
-static void au_do_dir_ts(void *arg)
-{
-	struct au_dir_ts_arg *a = arg;
-	struct au_dtime dt;
-	struct path h_path;
-	struct inode *dir, *h_dir;
-	struct super_block *sb;
-	struct au_branch *br;
-	struct au_hinode *hdir;
-	int err;
-	aufs_bindex_t bstart, bindex;
-
-	sb = a->dentry->d_sb;
-	if (d_really_is_negative(a->dentry))
-		goto out;
-	/* no dir->i_mutex lock */
-	aufs_read_lock(a->dentry, AuLock_DW); /* noflush */
-
-	dir = d_inode(a->dentry);
-	bstart = au_ibstart(dir);
-	bindex = au_br_index(sb, a->brid);
-	if (bindex < bstart)
-		goto out_unlock;
-
-	br = au_sbr(sb, bindex);
-	h_path.dentry = au_h_dptr(a->dentry, bindex);
-	if (!h_path.dentry)
-		goto out_unlock;
-	h_path.mnt = au_br_mnt(br);
-	au_dtime_store(&dt, a->dentry, &h_path);
-
-	br = au_sbr(sb, bstart);
-	if (!au_br_writable(br->br_perm))
-		goto out_unlock;
-	h_path.dentry = au_h_dptr(a->dentry, bstart);
-	h_path.mnt = au_br_mnt(br);
-	err = vfsub_mnt_want_write(h_path.mnt);
-	if (err)
-		goto out_unlock;
-	hdir = au_hi(dir, bstart);
-	au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT);
-	h_dir = au_h_iptr(dir, bstart);
-	if (h_dir->i_nlink
-	    && timespec_compare(&h_dir->i_mtime, &dt.dt_mtime) < 0) {
-		dt.dt_h_path = h_path;
-		au_dtime_revert(&dt);
-	}
-	au_hn_imtx_unlock(hdir);
-	vfsub_mnt_drop_write(h_path.mnt);
-	au_cpup_attr_timesizes(dir);
-
-out_unlock:
-	aufs_read_unlock(a->dentry, AuLock_DW);
-out:
-	dput(a->dentry);
-	au_nwt_done(&au_sbi(sb)->si_nowait);
-	kfree(arg);
-}
-
-void au_dir_ts(struct inode *dir, aufs_bindex_t bindex)
-{
-	int perm, wkq_err;
-	aufs_bindex_t bstart;
-	struct au_dir_ts_arg *arg;
-	struct dentry *dentry;
-	struct super_block *sb;
-
-	IMustLock(dir);
-
-	dentry = d_find_any_alias(dir);
-	AuDebugOn(!dentry);
-	sb = dentry->d_sb;
-	bstart = au_ibstart(dir);
-	if (bstart == bindex) {
-		au_cpup_attr_timesizes(dir);
-		goto out;
-	}
-
-	perm = au_sbr_perm(sb, bstart);
-	if (!au_br_writable(perm))
-		goto out;
-
-	arg = kmalloc(sizeof(*arg), GFP_NOFS);
-	if (!arg)
-		goto out;
-
-	arg->dentry = dget(dentry); /* will be dput-ted by au_do_dir_ts() */
-	arg->brid = au_sbr_id(sb, bindex);
-	wkq_err = au_wkq_nowait(au_do_dir_ts, arg, sb, /*flags*/0);
-	if (unlikely(wkq_err)) {
-		pr_err("wkq %d\n", wkq_err);
-		dput(dentry);
-		kfree(arg);
-	}
-
-out:
-	dput(dentry);
-}
-
-/* ---------------------------------------------------------------------- */
-
-static int reopen_dir(struct file *file)
-{
-	int err;
-	unsigned int flags;
-	aufs_bindex_t bindex, btail, bstart;
-	struct dentry *dentry, *h_dentry;
-	struct file *h_file;
-
-	/* open all lower dirs */
-	dentry = file->f_path.dentry;
-	bstart = au_dbstart(dentry);
-	for (bindex = au_fbstart(file); bindex < bstart; bindex++)
-		au_set_h_fptr(file, bindex, NULL);
-	au_set_fbstart(file, bstart);
-
-	btail = au_dbtaildir(dentry);
-	for (bindex = au_fbend_dir(file); btail < bindex; bindex--)
-		au_set_h_fptr(file, bindex, NULL);
-	au_set_fbend_dir(file, btail);
-
-	flags = vfsub_file_flags(file);
-	for (bindex = bstart; bindex <= btail; bindex++) {
-		h_dentry = au_h_dptr(dentry, bindex);
-		if (!h_dentry)
-			continue;
-		h_file = au_hf_dir(file, bindex);
-		if (h_file)
-			continue;
-
-		h_file = au_h_open(dentry, bindex, flags, file, /*force_wr*/0);
-		err = PTR_ERR(h_file);
-		if (IS_ERR(h_file))
-			goto out; /* close all? */
-		au_set_h_fptr(file, bindex, h_file);
-	}
-	au_update_figen(file);
-	/* todo: necessary? */
-	/* file->f_ra = h_file->f_ra; */
-	err = 0;
-
-out:
-	return err;
-}
-
-static int do_open_dir(struct file *file, int flags, struct file *h_file)
-{
-	int err;
-	aufs_bindex_t bindex, btail;
-	struct dentry *dentry, *h_dentry;
-	struct vfsmount *mnt;
-
-	FiMustWriteLock(file);
-	AuDebugOn(h_file);
-
-	err = 0;
-	mnt = file->f_path.mnt;
-	dentry = file->f_path.dentry;
-	file->f_version = d_inode(dentry)->i_version;
-	bindex = au_dbstart(dentry);
-	au_set_fbstart(file, bindex);
-	btail = au_dbtaildir(dentry);
-	au_set_fbend_dir(file, btail);
-	for (; !err && bindex <= btail; bindex++) {
-		h_dentry = au_h_dptr(dentry, bindex);
-		if (!h_dentry)
-			continue;
-
-		err = vfsub_test_mntns(mnt, h_dentry->d_sb);
-		if (unlikely(err))
-			break;
-		h_file = au_h_open(dentry, bindex, flags, file, /*force_wr*/0);
-		if (IS_ERR(h_file)) {
-			err = PTR_ERR(h_file);
-			break;
-		}
-		au_set_h_fptr(file, bindex, h_file);
-	}
-	au_update_figen(file);
-	/* todo: necessary? */
-	/* file->f_ra = h_file->f_ra; */
-	if (!err)
-		return 0; /* success */
-
-	/* close all */
-	for (bindex = au_fbstart(file); bindex <= btail; bindex++)
-		au_set_h_fptr(file, bindex, NULL);
-	au_set_fbstart(file, -1);
-	au_set_fbend_dir(file, -1);
-
-	return err;
-}
-
-static int aufs_open_dir(struct inode *inode __maybe_unused,
-			 struct file *file)
-{
-	int err;
-	struct super_block *sb;
-	struct au_fidir *fidir;
-
-	err = -ENOMEM;
-	sb = file->f_path.dentry->d_sb;
-	si_read_lock(sb, AuLock_FLUSH);
-	fidir = au_fidir_alloc(sb);
-	if (fidir) {
-		struct au_do_open_args args = {
-			.open	= do_open_dir,
-			.fidir	= fidir
-		};
-		err = au_do_open(file, &args);
-		if (unlikely(err))
-			kfree(fidir);
-	}
-	si_read_unlock(sb);
-	return err;
-}
-
-static int aufs_release_dir(struct inode *inode __maybe_unused,
-			    struct file *file)
-{
-	struct au_vdir *vdir_cache;
-	struct au_finfo *finfo;
-	struct au_fidir *fidir;
-	aufs_bindex_t bindex, bend;
-
-	finfo = au_fi(file);
-	fidir = finfo->fi_hdir;
-	if (fidir) {
-		au_sphl_del(&finfo->fi_hlist,
-			    &au_sbi(file->f_path.dentry->d_sb)->si_files);
-		vdir_cache = fidir->fd_vdir_cache; /* lock-free */
-		if (vdir_cache)
-			au_vdir_free(vdir_cache);
-
-		bindex = finfo->fi_btop;
-		if (bindex >= 0) {
-			/*
-			 * calls fput() instead of filp_close(),
-			 * since no dnotify or lock for the lower file.
-			 */
-			bend = fidir->fd_bbot;
-			for (; bindex <= bend; bindex++)
-				au_set_h_fptr(file, bindex, NULL);
-		}
-		kfree(fidir);
-		finfo->fi_hdir = NULL;
-	}
-	au_finfo_fin(file);
-	return 0;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static int au_do_flush_dir(struct file *file, fl_owner_t id)
-{
-	int err;
-	aufs_bindex_t bindex, bend;
-	struct file *h_file;
-
-	err = 0;
-	bend = au_fbend_dir(file);
-	for (bindex = au_fbstart(file); !err && bindex <= bend; bindex++) {
-		h_file = au_hf_dir(file, bindex);
-		if (h_file)
-			err = vfsub_flush(h_file, id);
-	}
-	return err;
-}
-
-static int aufs_flush_dir(struct file *file, fl_owner_t id)
-{
-	return au_do_flush(file, id, au_do_flush_dir);
-}
-
-/* ---------------------------------------------------------------------- */
-
-static int au_do_fsync_dir_no_file(struct dentry *dentry, int datasync)
-{
-	int err;
-	aufs_bindex_t bend, bindex;
-	struct inode *inode;
-	struct super_block *sb;
-
-	err = 0;
-	sb = dentry->d_sb;
-	inode = d_inode(dentry);
-	IMustLock(inode);
-	bend = au_dbend(dentry);
-	for (bindex = au_dbstart(dentry); !err && bindex <= bend; bindex++) {
-		struct path h_path;
-
-		if (au_test_ro(sb, bindex, inode))
-			continue;
-		h_path.dentry = au_h_dptr(dentry, bindex);
-		if (!h_path.dentry)
-			continue;
-
-		h_path.mnt = au_sbr_mnt(sb, bindex);
-		err = vfsub_fsync(NULL, &h_path, datasync);
-	}
-
-	return err;
-}
-
-static int au_do_fsync_dir(struct file *file, int datasync)
-{
-	int err;
-	aufs_bindex_t bend, bindex;
-	struct file *h_file;
-	struct super_block *sb;
-	struct inode *inode;
-
-	err = au_reval_and_lock_fdi(file, reopen_dir, /*wlock*/1);
-	if (unlikely(err))
-		goto out;
-
-	inode = file_inode(file);
-	sb = inode->i_sb;
-	bend = au_fbend_dir(file);
-	for (bindex = au_fbstart(file); !err && bindex <= bend; bindex++) {
-		h_file = au_hf_dir(file, bindex);
-		if (!h_file || au_test_ro(sb, bindex, inode))
-			continue;
-
-		err = vfsub_fsync(h_file, &h_file->f_path, datasync);
-	}
-
-out:
-	return err;
-}
-
-/*
- * @file may be NULL
- */
-static int aufs_fsync_dir(struct file *file, loff_t start, loff_t end,
-			  int datasync)
-{
-	int err;
-	struct dentry *dentry;
-	struct inode *inode;
-	struct super_block *sb;
-	struct mutex *mtx;
-
-	err = 0;
-	dentry = file->f_path.dentry;
-	inode = d_inode(dentry);
-	mtx = &inode->i_mutex;
-	mutex_lock(mtx);
-	sb = dentry->d_sb;
-	si_noflush_read_lock(sb);
-	if (file)
-		err = au_do_fsync_dir(file, datasync);
-	else {
-		di_write_lock_child(dentry);
-		err = au_do_fsync_dir_no_file(dentry, datasync);
-	}
-	au_cpup_attr_timesizes(inode);
-	di_write_unlock(dentry);
-	if (file)
-		fi_write_unlock(file);
-
-	si_read_unlock(sb);
-	mutex_unlock(mtx);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static int aufs_iterate(struct file *file, struct dir_context *ctx)
-{
-	int err;
-	struct dentry *dentry;
-	struct inode *inode, *h_inode;
-	struct super_block *sb;
-
-	AuDbg("%pD, ctx{%pf, %llu}\n", file, ctx->actor, ctx->pos);
-
-	dentry = file->f_path.dentry;
-	inode = d_inode(dentry);
-	IMustLock(inode);
-
-	sb = dentry->d_sb;
-	si_read_lock(sb, AuLock_FLUSH);
-	err = au_reval_and_lock_fdi(file, reopen_dir, /*wlock*/1);
-	if (unlikely(err))
-		goto out;
-	err = au_alive_dir(dentry);
-	if (!err)
-		err = au_vdir_init(file);
-	di_downgrade_lock(dentry, AuLock_IR);
-	if (unlikely(err))
-		goto out_unlock;
-
-	h_inode = au_h_iptr(inode, au_ibstart(inode));
-	if (!au_test_nfsd()) {
-		err = au_vdir_fill_de(file, ctx);
-		fsstack_copy_attr_atime(inode, h_inode);
-	} else {
-		/*
-		 * nfsd filldir may call lookup_one_len(), vfs_getattr(),
-		 * encode_fh() and others.
-		 */
-		atomic_inc(&h_inode->i_count);
-		di_read_unlock(dentry, AuLock_IR);
-		si_read_unlock(sb);
-		err = au_vdir_fill_de(file, ctx);
-		fsstack_copy_attr_atime(inode, h_inode);
-		fi_write_unlock(file);
-		iput(h_inode);
-
-		AuTraceErr(err);
-		return err;
-	}
-
-out_unlock:
-	di_read_unlock(dentry, AuLock_IR);
-	fi_write_unlock(file);
-out:
-	si_read_unlock(sb);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-#define AuTestEmpty_WHONLY	1
-#define AuTestEmpty_CALLED	(1 << 1)
-#define AuTestEmpty_SHWH	(1 << 2)
-#define au_ftest_testempty(flags, name)	((flags) & AuTestEmpty_##name)
-#define au_fset_testempty(flags, name) \
-	do { (flags) |= AuTestEmpty_##name; } while (0)
-#define au_fclr_testempty(flags, name) \
-	do { (flags) &= ~AuTestEmpty_##name; } while (0)
-
-#ifndef CONFIG_AUFS_SHWH
-#undef AuTestEmpty_SHWH
-#define AuTestEmpty_SHWH	0
-#endif
-
-struct test_empty_arg {
-	struct dir_context ctx;
-	struct au_nhash *whlist;
-	unsigned int flags;
-	int err;
-	aufs_bindex_t bindex;
-};
-
-static int test_empty_cb(struct dir_context *ctx, const char *__name,
-			 int namelen, loff_t offset __maybe_unused, u64 ino,
-			 unsigned int d_type)
-{
-	struct test_empty_arg *arg = container_of(ctx, struct test_empty_arg,
-						  ctx);
-	char *name = (void *)__name;
-
-	arg->err = 0;
-	au_fset_testempty(arg->flags, CALLED);
-	/* smp_mb(); */
-	if (name[0] == '.'
-	    && (namelen == 1 || (name[1] == '.' && namelen == 2)))
-		goto out; /* success */
-
-	if (namelen <= AUFS_WH_PFX_LEN
-	    || memcmp(name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) {
-		if (au_ftest_testempty(arg->flags, WHONLY)
-		    && !au_nhash_test_known_wh(arg->whlist, name, namelen))
-			arg->err = -ENOTEMPTY;
-		goto out;
-	}
-
-	name += AUFS_WH_PFX_LEN;
-	namelen -= AUFS_WH_PFX_LEN;
-	if (!au_nhash_test_known_wh(arg->whlist, name, namelen))
-		arg->err = au_nhash_append_wh
-			(arg->whlist, name, namelen, ino, d_type, arg->bindex,
-			 au_ftest_testempty(arg->flags, SHWH));
-
-out:
-	/* smp_mb(); */
-	AuTraceErr(arg->err);
-	return arg->err;
-}
-
-static int do_test_empty(struct dentry *dentry, struct test_empty_arg *arg)
-{
-	int err;
-	struct file *h_file;
-
-	h_file = au_h_open(dentry, arg->bindex,
-			   O_RDONLY | O_NONBLOCK | O_DIRECTORY | O_LARGEFILE,
-			   /*file*/NULL, /*force_wr*/0);
-	err = PTR_ERR(h_file);
-	if (IS_ERR(h_file))
-		goto out;
-
-	err = 0;
-	if (!au_opt_test(au_mntflags(dentry->d_sb), UDBA_NONE)
-	    && !file_inode(h_file)->i_nlink)
-		goto out_put;
-
-	do {
-		arg->err = 0;
-		au_fclr_testempty(arg->flags, CALLED);
-		/* smp_mb(); */
-		err = vfsub_iterate_dir(h_file, &arg->ctx);
-		if (err >= 0)
-			err = arg->err;
-	} while (!err && au_ftest_testempty(arg->flags, CALLED));
-
-out_put:
-	fput(h_file);
-	au_sbr_put(dentry->d_sb, arg->bindex);
-out:
-	return err;
-}
-
-struct do_test_empty_args {
-	int *errp;
-	struct dentry *dentry;
-	struct test_empty_arg *arg;
-};
-
-static void call_do_test_empty(void *args)
-{
-	struct do_test_empty_args *a = args;
-	*a->errp = do_test_empty(a->dentry, a->arg);
-}
-
-static int sio_test_empty(struct dentry *dentry, struct test_empty_arg *arg)
-{
-	int err, wkq_err;
-	struct dentry *h_dentry;
-	struct inode *h_inode;
-
-	h_dentry = au_h_dptr(dentry, arg->bindex);
-	h_inode = d_inode(h_dentry);
-	/* todo: i_mode changes anytime? */
-	mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD);
-	err = au_test_h_perm_sio(h_inode, MAY_EXEC | MAY_READ);
-	mutex_unlock(&h_inode->i_mutex);
-	if (!err)
-		err = do_test_empty(dentry, arg);
-	else {
-		struct do_test_empty_args args = {
-			.errp	= &err,
-			.dentry	= dentry,
-			.arg	= arg
-		};
-		unsigned int flags = arg->flags;
-
-		wkq_err = au_wkq_wait(call_do_test_empty, &args);
-		if (unlikely(wkq_err))
-			err = wkq_err;
-		arg->flags = flags;
-	}
-
-	return err;
-}
-
-int au_test_empty_lower(struct dentry *dentry)
-{
-	int err;
-	unsigned int rdhash;
-	aufs_bindex_t bindex, bstart, btail;
-	struct au_nhash whlist;
-	struct test_empty_arg arg = {
-		.ctx = {
-			.actor = test_empty_cb
-		}
-	};
-	int (*test_empty)(struct dentry *dentry, struct test_empty_arg *arg);
-
-	SiMustAnyLock(dentry->d_sb);
-
-	rdhash = au_sbi(dentry->d_sb)->si_rdhash;
-	if (!rdhash)
-		rdhash = au_rdhash_est(au_dir_size(/*file*/NULL, dentry));
-	err = au_nhash_alloc(&whlist, rdhash, GFP_NOFS);
-	if (unlikely(err))
-		goto out;
-
-	arg.flags = 0;
-	arg.whlist = &whlist;
-	bstart = au_dbstart(dentry);
-	if (au_opt_test(au_mntflags(dentry->d_sb), SHWH))
-		au_fset_testempty(arg.flags, SHWH);
-	test_empty = do_test_empty;
-	if (au_opt_test(au_mntflags(dentry->d_sb), DIRPERM1))
-		test_empty = sio_test_empty;
-	arg.bindex = bstart;
-	err = test_empty(dentry, &arg);
-	if (unlikely(err))
-		goto out_whlist;
-
-	au_fset_testempty(arg.flags, WHONLY);
-	btail = au_dbtaildir(dentry);
-	for (bindex = bstart + 1; !err && bindex <= btail; bindex++) {
-		struct dentry *h_dentry;
-
-		h_dentry = au_h_dptr(dentry, bindex);
-		if (h_dentry && d_is_positive(h_dentry)) {
-			arg.bindex = bindex;
-			err = test_empty(dentry, &arg);
-		}
-	}
-
-out_whlist:
-	au_nhash_wh_free(&whlist);
-out:
-	return err;
-}
-
-int au_test_empty(struct dentry *dentry, struct au_nhash *whlist)
-{
-	int err;
-	struct test_empty_arg arg = {
-		.ctx = {
-			.actor = test_empty_cb
-		}
-	};
-	aufs_bindex_t bindex, btail;
-
-	err = 0;
-	arg.whlist = whlist;
-	arg.flags = AuTestEmpty_WHONLY;
-	if (au_opt_test(au_mntflags(dentry->d_sb), SHWH))
-		au_fset_testempty(arg.flags, SHWH);
-	btail = au_dbtaildir(dentry);
-	for (bindex = au_dbstart(dentry); !err && bindex <= btail; bindex++) {
-		struct dentry *h_dentry;
-
-		h_dentry = au_h_dptr(dentry, bindex);
-		if (h_dentry && d_is_positive(h_dentry)) {
-			arg.bindex = bindex;
-			err = sio_test_empty(dentry, &arg);
-		}
-	}
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-const struct file_operations aufs_dir_fop = {
-	.owner		= THIS_MODULE,
-	.llseek		= default_llseek,
-	.read		= generic_read_dir,
-	.iterate	= aufs_iterate,
-	.unlocked_ioctl	= aufs_ioctl_dir,
-#ifdef CONFIG_COMPAT
-	.compat_ioctl	= aufs_compat_ioctl_dir,
-#endif
-	.open		= aufs_open_dir,
-	.release	= aufs_release_dir,
-	.flush		= aufs_flush_dir,
-	.fsync		= aufs_fsync_dir
-};
diff --git a/fs/aufs/dir.h b/fs/aufs/dir.h
deleted file mode 100644
index b0a79d722..000000000
--- a/fs/aufs/dir.h
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * directory operations
- */
-
-#ifndef __AUFS_DIR_H__
-#define __AUFS_DIR_H__
-
-#ifdef __KERNEL__
-
-#include <linux/fs.h>
-
-/* ---------------------------------------------------------------------- */
-
-/* need to be faster and smaller */
-
-struct au_nhash {
-	unsigned int		nh_num;
-	struct hlist_head	*nh_head;
-};
-
-struct au_vdir_destr {
-	unsigned char	len;
-	unsigned char	name[0];
-} __packed;
-
-struct au_vdir_dehstr {
-	struct hlist_node	hash;
-	struct au_vdir_destr	*str;
-} ____cacheline_aligned_in_smp;
-
-struct au_vdir_de {
-	ino_t			de_ino;
-	unsigned char		de_type;
-	/* caution: packed */
-	struct au_vdir_destr	de_str;
-} __packed;
-
-struct au_vdir_wh {
-	struct hlist_node	wh_hash;
-#ifdef CONFIG_AUFS_SHWH
-	ino_t			wh_ino;
-	aufs_bindex_t		wh_bindex;
-	unsigned char		wh_type;
-#else
-	aufs_bindex_t		wh_bindex;
-#endif
-	/* caution: packed */
-	struct au_vdir_destr	wh_str;
-} __packed;
-
-union au_vdir_deblk_p {
-	unsigned char		*deblk;
-	struct au_vdir_de	*de;
-};
-
-struct au_vdir {
-	unsigned char	**vd_deblk;
-	unsigned long	vd_nblk;
-	struct {
-		unsigned long		ul;
-		union au_vdir_deblk_p	p;
-	} vd_last;
-
-	unsigned long	vd_version;
-	unsigned int	vd_deblk_sz;
-	unsigned long	vd_jiffy;
-} ____cacheline_aligned_in_smp;
-
-/* ---------------------------------------------------------------------- */
-
-/* dir.c */
-extern const struct file_operations aufs_dir_fop;
-void au_add_nlink(struct inode *dir, struct inode *h_dir);
-void au_sub_nlink(struct inode *dir, struct inode *h_dir);
-loff_t au_dir_size(struct file *file, struct dentry *dentry);
-void au_dir_ts(struct inode *dir, aufs_bindex_t bsrc);
-int au_test_empty_lower(struct dentry *dentry);
-int au_test_empty(struct dentry *dentry, struct au_nhash *whlist);
-
-/* vdir.c */
-unsigned int au_rdhash_est(loff_t sz);
-int au_nhash_alloc(struct au_nhash *nhash, unsigned int num_hash, gfp_t gfp);
-void au_nhash_wh_free(struct au_nhash *whlist);
-int au_nhash_test_longer_wh(struct au_nhash *whlist, aufs_bindex_t btgt,
-			    int limit);
-int au_nhash_test_known_wh(struct au_nhash *whlist, char *name, int nlen);
-int au_nhash_append_wh(struct au_nhash *whlist, char *name, int nlen, ino_t ino,
-		       unsigned int d_type, aufs_bindex_t bindex,
-		       unsigned char shwh);
-void au_vdir_free(struct au_vdir *vdir);
-int au_vdir_init(struct file *file);
-int au_vdir_fill_de(struct file *file, struct dir_context *ctx);
-
-/* ioctl.c */
-long aufs_ioctl_dir(struct file *file, unsigned int cmd, unsigned long arg);
-
-#ifdef CONFIG_AUFS_RDU
-/* rdu.c */
-long au_rdu_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
-#ifdef CONFIG_COMPAT
-long au_rdu_compat_ioctl(struct file *file, unsigned int cmd,
-			 unsigned long arg);
-#endif
-#else
-AuStub(long, au_rdu_ioctl, return -EINVAL, struct file *file,
-       unsigned int cmd, unsigned long arg)
-#ifdef CONFIG_COMPAT
-AuStub(long, au_rdu_compat_ioctl, return -EINVAL, struct file *file,
-       unsigned int cmd, unsigned long arg)
-#endif
-#endif
-
-#endif /* __KERNEL__ */
-#endif /* __AUFS_DIR_H__ */
diff --git a/fs/aufs/dynop.c b/fs/aufs/dynop.c
deleted file mode 100644
index 53a8b55d8..000000000
--- a/fs/aufs/dynop.c
+++ /dev/null
@@ -1,356 +0,0 @@
-/*
- * Copyright (C) 2010-2016 Junjiro R. Okajima
- */
-
-/*
- * dynamically customizable operations for regular files
- */
-
-#include "aufs.h"
-
-#define DyPrSym(key)	AuDbgSym(key->dk_op.dy_hop)
-
-/*
- * How large will these lists be?
- * Usually just a few elements, 20-30 at most for each, I guess.
- */
-static struct au_splhead dynop[AuDyLast];
-
-static struct au_dykey *dy_gfind_get(struct au_splhead *spl, const void *h_op)
-{
-	struct au_dykey *key, *tmp;
-	struct list_head *head;
-
-	key = NULL;
-	head = &spl->head;
-	rcu_read_lock();
-	list_for_each_entry_rcu(tmp, head, dk_list)
-		if (tmp->dk_op.dy_hop == h_op) {
-			key = tmp;
-			kref_get(&key->dk_kref);
-			break;
-		}
-	rcu_read_unlock();
-
-	return key;
-}
-
-static struct au_dykey *dy_bradd(struct au_branch *br, struct au_dykey *key)
-{
-	struct au_dykey **k, *found;
-	const void *h_op = key->dk_op.dy_hop;
-	int i;
-
-	found = NULL;
-	k = br->br_dykey;
-	for (i = 0; i < AuBrDynOp; i++)
-		if (k[i]) {
-			if (k[i]->dk_op.dy_hop == h_op) {
-				found = k[i];
-				break;
-			}
-		} else
-			break;
-	if (!found) {
-		spin_lock(&br->br_dykey_lock);
-		for (; i < AuBrDynOp; i++)
-			if (k[i]) {
-				if (k[i]->dk_op.dy_hop == h_op) {
-					found = k[i];
-					break;
-				}
-			} else {
-				k[i] = key;
-				break;
-			}
-		spin_unlock(&br->br_dykey_lock);
-		BUG_ON(i == AuBrDynOp); /* expand the array */
-	}
-
-	return found;
-}
-
-/* kref_get() if @key is already added */
-static struct au_dykey *dy_gadd(struct au_splhead *spl, struct au_dykey *key)
-{
-	struct au_dykey *tmp, *found;
-	struct list_head *head;
-	const void *h_op = key->dk_op.dy_hop;
-
-	found = NULL;
-	head = &spl->head;
-	spin_lock(&spl->spin);
-	list_for_each_entry(tmp, head, dk_list)
-		if (tmp->dk_op.dy_hop == h_op) {
-			kref_get(&tmp->dk_kref);
-			found = tmp;
-			break;
-		}
-	if (!found)
-		list_add_rcu(&key->dk_list, head);
-	spin_unlock(&spl->spin);
-
-	if (!found)
-		DyPrSym(key);
-	return found;
-}
-
-static void dy_free_rcu(struct rcu_head *rcu)
-{
-	struct au_dykey *key;
-
-	key = container_of(rcu, struct au_dykey, dk_rcu);
-	DyPrSym(key);
-	kfree(key);
-}
-
-static void dy_free(struct kref *kref)
-{
-	struct au_dykey *key;
-	struct au_splhead *spl;
-
-	key = container_of(kref, struct au_dykey, dk_kref);
-	spl = dynop + key->dk_op.dy_type;
-	au_spl_del_rcu(&key->dk_list, spl);
-	call_rcu(&key->dk_rcu, dy_free_rcu);
-}
-
-void au_dy_put(struct au_dykey *key)
-{
-	kref_put(&key->dk_kref, dy_free);
-}
-
-/* ---------------------------------------------------------------------- */
-
-#define DyDbgSize(cnt, op)	AuDebugOn(cnt != sizeof(op)/sizeof(void *))
-
-#ifdef CONFIG_AUFS_DEBUG
-#define DyDbgDeclare(cnt)	unsigned int cnt = 0
-#define DyDbgInc(cnt)		do { cnt++; } while (0)
-#else
-#define DyDbgDeclare(cnt)	do {} while (0)
-#define DyDbgInc(cnt)		do {} while (0)
-#endif
-
-#define DySet(func, dst, src, h_op, h_sb) do {				\
-	DyDbgInc(cnt);							\
-	if (h_op->func) {						\
-		if (src.func)						\
-			dst.func = src.func;				\
-		else							\
-			AuDbg("%s %s\n", au_sbtype(h_sb), #func);	\
-	}								\
-} while (0)
-
-#define DySetForce(func, dst, src) do {		\
-	AuDebugOn(!src.func);			\
-	DyDbgInc(cnt);				\
-	dst.func = src.func;			\
-} while (0)
-
-#define DySetAop(func) \
-	DySet(func, dyaop->da_op, aufs_aop, h_aop, h_sb)
-#define DySetAopForce(func) \
-	DySetForce(func, dyaop->da_op, aufs_aop)
-
-static void dy_aop(struct au_dykey *key, const void *h_op,
-		   struct super_block *h_sb __maybe_unused)
-{
-	struct au_dyaop *dyaop = (void *)key;
-	const struct address_space_operations *h_aop = h_op;
-	DyDbgDeclare(cnt);
-
-	AuDbg("%s\n", au_sbtype(h_sb));
-
-	DySetAop(writepage);
-	DySetAopForce(readpage);	/* force */
-	DySetAop(writepages);
-	DySetAop(set_page_dirty);
-	DySetAop(readpages);
-	DySetAop(write_begin);
-	DySetAop(write_end);
-	DySetAop(bmap);
-	DySetAop(invalidatepage);
-	DySetAop(releasepage);
-	DySetAop(freepage);
-	/* this one will be changed according to an aufs mount option */
-	DySetAop(direct_IO);
-	DySetAop(migratepage);
-	DySetAop(launder_page);
-	DySetAop(is_partially_uptodate);
-	DySetAop(is_dirty_writeback);
-	DySetAop(error_remove_page);
-	DySetAop(swap_activate);
-	DySetAop(swap_deactivate);
-
-	DyDbgSize(cnt, *h_aop);
-}
-
-/* ---------------------------------------------------------------------- */
-
-static void dy_bug(struct kref *kref)
-{
-	BUG();
-}
-
-static struct au_dykey *dy_get(struct au_dynop *op, struct au_branch *br)
-{
-	struct au_dykey *key, *old;
-	struct au_splhead *spl;
-	struct op {
-		unsigned int sz;
-		void (*set)(struct au_dykey *key, const void *h_op,
-			    struct super_block *h_sb __maybe_unused);
-	};
-	static const struct op a[] = {
-		[AuDy_AOP] = {
-			.sz	= sizeof(struct au_dyaop),
-			.set	= dy_aop
-		}
-	};
-	const struct op *p;
-
-	spl = dynop + op->dy_type;
-	key = dy_gfind_get(spl, op->dy_hop);
-	if (key)
-		goto out_add; /* success */
-
-	p = a + op->dy_type;
-	key = kzalloc(p->sz, GFP_NOFS);
-	if (unlikely(!key)) {
-		key = ERR_PTR(-ENOMEM);
-		goto out;
-	}
-
-	key->dk_op.dy_hop = op->dy_hop;
-	kref_init(&key->dk_kref);
-	p->set(key, op->dy_hop, au_br_sb(br));
-	old = dy_gadd(spl, key);
-	if (old) {
-		kfree(key);
-		key = old;
-	}
-
-out_add:
-	old = dy_bradd(br, key);
-	if (old)
-		/* its ref-count should never be zero here */
-		kref_put(&key->dk_kref, dy_bug);
-out:
-	return key;
-}
-
-/* ---------------------------------------------------------------------- */
-/*
- * Aufs prohibits O_DIRECT by defaut even if the branch supports it.
- * This behaviour is necessary to return an error from open(O_DIRECT) instead
- * of the succeeding I/O. The dio mount option enables O_DIRECT and makes
- * open(O_DIRECT) always succeed, but the succeeding I/O may return an error.
- * See the aufs manual in detail.
- */
-static void dy_adx(struct au_dyaop *dyaop, int do_dx)
-{
-	if (!do_dx)
-		dyaop->da_op.direct_IO = NULL;
-	else
-		dyaop->da_op.direct_IO = aufs_aop.direct_IO;
-}
-
-static struct au_dyaop *dy_aget(struct au_branch *br,
-				const struct address_space_operations *h_aop,
-				int do_dx)
-{
-	struct au_dyaop *dyaop;
-	struct au_dynop op;
-
-	op.dy_type = AuDy_AOP;
-	op.dy_haop = h_aop;
-	dyaop = (void *)dy_get(&op, br);
-	if (IS_ERR(dyaop))
-		goto out;
-	dy_adx(dyaop, do_dx);
-
-out:
-	return dyaop;
-}
-
-int au_dy_iaop(struct inode *inode, aufs_bindex_t bindex,
-		struct inode *h_inode)
-{
-	int err, do_dx;
-	struct super_block *sb;
-	struct au_branch *br;
-	struct au_dyaop *dyaop;
-
-	AuDebugOn(!S_ISREG(h_inode->i_mode));
-	IiMustWriteLock(inode);
-
-	sb = inode->i_sb;
-	br = au_sbr(sb, bindex);
-	do_dx = !!au_opt_test(au_mntflags(sb), DIO);
-	dyaop = dy_aget(br, h_inode->i_mapping->a_ops, do_dx);
-	err = PTR_ERR(dyaop);
-	if (IS_ERR(dyaop))
-		/* unnecessary to call dy_fput() */
-		goto out;
-
-	err = 0;
-	inode->i_mapping->a_ops = &dyaop->da_op;
-
-out:
-	return err;
-}
-
-/*
- * Is it safe to replace a_ops during the inode/file is in operation?
- * Yes, I hope so.
- */
-int au_dy_irefresh(struct inode *inode)
-{
-	int err;
-	aufs_bindex_t bstart;
-	struct inode *h_inode;
-
-	err = 0;
-	if (S_ISREG(inode->i_mode)) {
-		bstart = au_ibstart(inode);
-		h_inode = au_h_iptr(inode, bstart);
-		err = au_dy_iaop(inode, bstart, h_inode);
-	}
-	return err;
-}
-
-void au_dy_arefresh(int do_dx)
-{
-	struct au_splhead *spl;
-	struct list_head *head;
-	struct au_dykey *key;
-
-	spl = dynop + AuDy_AOP;
-	head = &spl->head;
-	spin_lock(&spl->spin);
-	list_for_each_entry(key, head, dk_list)
-		dy_adx((void *)key, do_dx);
-	spin_unlock(&spl->spin);
-}
-
-/* ---------------------------------------------------------------------- */
-
-void __init au_dy_init(void)
-{
-	int i;
-
-	/* make sure that 'struct au_dykey *' can be any type */
-	BUILD_BUG_ON(offsetof(struct au_dyaop, da_key));
-
-	for (i = 0; i < AuDyLast; i++)
-		au_spl_init(dynop + i);
-}
-
-void au_dy_fin(void)
-{
-	int i;
-
-	for (i = 0; i < AuDyLast; i++)
-		WARN_ON(!list_empty(&dynop[i].head));
-}
diff --git a/fs/aufs/dynop.h b/fs/aufs/dynop.h
deleted file mode 100644
index 8680bfc53..000000000
--- a/fs/aufs/dynop.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * Copyright (C) 2010-2016 Junjiro R. Okajima
- */
-
-/*
- * dynamically customizable operations (for regular files only)
- */
-
-#ifndef __AUFS_DYNOP_H__
-#define __AUFS_DYNOP_H__
-
-#ifdef __KERNEL__
-
-#include <linux/fs.h>
-#include <linux/kref.h>
-
-enum {AuDy_AOP, AuDyLast};
-
-struct au_dynop {
-	int						dy_type;
-	union {
-		const void				*dy_hop;
-		const struct address_space_operations	*dy_haop;
-	};
-};
-
-struct au_dykey {
-	union {
-		struct list_head	dk_list;
-		struct rcu_head		dk_rcu;
-	};
-	struct au_dynop		dk_op;
-
-	/*
-	 * during I am in the branch local array, kref is gotten. when the
-	 * branch is removed, kref is put.
-	 */
-	struct kref		dk_kref;
-};
-
-/* stop unioning since their sizes are very different from each other */
-struct au_dyaop {
-	struct au_dykey			da_key;
-	struct address_space_operations	da_op; /* not const */
-};
-
-/* ---------------------------------------------------------------------- */
-
-/* dynop.c */
-struct au_branch;
-void au_dy_put(struct au_dykey *key);
-int au_dy_iaop(struct inode *inode, aufs_bindex_t bindex,
-		struct inode *h_inode);
-int au_dy_irefresh(struct inode *inode);
-void au_dy_arefresh(int do_dio);
-
-void __init au_dy_init(void);
-void au_dy_fin(void);
-
-#endif /* __KERNEL__ */
-#endif /* __AUFS_DYNOP_H__ */
diff --git a/fs/aufs/export.c b/fs/aufs/export.c
deleted file mode 100644
index 7f6fec61f..000000000
--- a/fs/aufs/export.c
+++ /dev/null
@@ -1,819 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * export via nfs
- */
-
-#include <linux/exportfs.h>
-#include <linux/fs_struct.h>
-#include <linux/namei.h>
-#include <linux/nsproxy.h>
-#include <linux/random.h>
-#include <linux/writeback.h>
-#include "../fs/mount.h"
-#include "aufs.h"
-
-union conv {
-#ifdef CONFIG_AUFS_INO_T_64
-	__u32 a[2];
-#else
-	__u32 a[1];
-#endif
-	ino_t ino;
-};
-
-static ino_t decode_ino(__u32 *a)
-{
-	union conv u;
-
-	BUILD_BUG_ON(sizeof(u.ino) != sizeof(u.a));
-	u.a[0] = a[0];
-#ifdef CONFIG_AUFS_INO_T_64
-	u.a[1] = a[1];
-#endif
-	return u.ino;
-}
-
-static void encode_ino(__u32 *a, ino_t ino)
-{
-	union conv u;
-
-	u.ino = ino;
-	a[0] = u.a[0];
-#ifdef CONFIG_AUFS_INO_T_64
-	a[1] = u.a[1];
-#endif
-}
-
-/* NFS file handle */
-enum {
-	Fh_br_id,
-	Fh_sigen,
-#ifdef CONFIG_AUFS_INO_T_64
-	/* support 64bit inode number */
-	Fh_ino1,
-	Fh_ino2,
-	Fh_dir_ino1,
-	Fh_dir_ino2,
-#else
-	Fh_ino1,
-	Fh_dir_ino1,
-#endif
-	Fh_igen,
-	Fh_h_type,
-	Fh_tail,
-
-	Fh_ino = Fh_ino1,
-	Fh_dir_ino = Fh_dir_ino1
-};
-
-static int au_test_anon(struct dentry *dentry)
-{
-	/* note: read d_flags without d_lock */
-	return !!(dentry->d_flags & DCACHE_DISCONNECTED);
-}
-
-int au_test_nfsd(void)
-{
-	int ret;
-	struct task_struct *tsk = current;
-	char comm[sizeof(tsk->comm)];
-
-	ret = 0;
-	if (tsk->flags & PF_KTHREAD) {
-		get_task_comm(comm, tsk);
-		ret = !strcmp(comm, "nfsd");
-	}
-
-	return ret;
-}
-
-/* ---------------------------------------------------------------------- */
-/* inode generation external table */
-
-void au_xigen_inc(struct inode *inode)
-{
-	loff_t pos;
-	ssize_t sz;
-	__u32 igen;
-	struct super_block *sb;
-	struct au_sbinfo *sbinfo;
-
-	sb = inode->i_sb;
-	AuDebugOn(!au_opt_test(au_mntflags(sb), XINO));
-
-	sbinfo = au_sbi(sb);
-	pos = inode->i_ino;
-	pos *= sizeof(igen);
-	igen = inode->i_generation + 1;
-	sz = xino_fwrite(sbinfo->si_xwrite, sbinfo->si_xigen, &igen,
-			 sizeof(igen), &pos);
-	if (sz == sizeof(igen))
-		return; /* success */
-
-	if (unlikely(sz >= 0))
-		AuIOErr("xigen error (%zd)\n", sz);
-}
-
-int au_xigen_new(struct inode *inode)
-{
-	int err;
-	loff_t pos;
-	ssize_t sz;
-	struct super_block *sb;
-	struct au_sbinfo *sbinfo;
-	struct file *file;
-
-	err = 0;
-	/* todo: dirty, at mount time */
-	if (inode->i_ino == AUFS_ROOT_INO)
-		goto out;
-	sb = inode->i_sb;
-	SiMustAnyLock(sb);
-	if (unlikely(!au_opt_test(au_mntflags(sb), XINO)))
-		goto out;
-
-	err = -EFBIG;
-	pos = inode->i_ino;
-	if (unlikely(au_loff_max / sizeof(inode->i_generation) - 1 < pos)) {
-		AuIOErr1("too large i%lld\n", pos);
-		goto out;
-	}
-	pos *= sizeof(inode->i_generation);
-
-	err = 0;
-	sbinfo = au_sbi(sb);
-	file = sbinfo->si_xigen;
-	BUG_ON(!file);
-
-	if (vfsub_f_size_read(file)
-	    < pos + sizeof(inode->i_generation)) {
-		inode->i_generation = atomic_inc_return(&sbinfo->si_xigen_next);
-		sz = xino_fwrite(sbinfo->si_xwrite, file, &inode->i_generation,
-				 sizeof(inode->i_generation), &pos);
-	} else
-		sz = xino_fread(sbinfo->si_xread, file, &inode->i_generation,
-				sizeof(inode->i_generation), &pos);
-	if (sz == sizeof(inode->i_generation))
-		goto out; /* success */
-
-	err = sz;
-	if (unlikely(sz >= 0)) {
-		err = -EIO;
-		AuIOErr("xigen error (%zd)\n", sz);
-	}
-
-out:
-	return err;
-}
-
-int au_xigen_set(struct super_block *sb, struct file *base)
-{
-	int err;
-	struct au_sbinfo *sbinfo;
-	struct file *file;
-
-	SiMustWriteLock(sb);
-
-	sbinfo = au_sbi(sb);
-	file = au_xino_create2(base, sbinfo->si_xigen);
-	err = PTR_ERR(file);
-	if (IS_ERR(file))
-		goto out;
-	err = 0;
-	if (sbinfo->si_xigen)
-		fput(sbinfo->si_xigen);
-	sbinfo->si_xigen = file;
-
-out:
-	return err;
-}
-
-void au_xigen_clr(struct super_block *sb)
-{
-	struct au_sbinfo *sbinfo;
-
-	SiMustWriteLock(sb);
-
-	sbinfo = au_sbi(sb);
-	if (sbinfo->si_xigen) {
-		fput(sbinfo->si_xigen);
-		sbinfo->si_xigen = NULL;
-	}
-}
-
-/* ---------------------------------------------------------------------- */
-
-static struct dentry *decode_by_ino(struct super_block *sb, ino_t ino,
-				    ino_t dir_ino)
-{
-	struct dentry *dentry, *d;
-	struct inode *inode;
-	unsigned int sigen;
-
-	dentry = NULL;
-	inode = ilookup(sb, ino);
-	if (!inode)
-		goto out;
-
-	dentry = ERR_PTR(-ESTALE);
-	sigen = au_sigen(sb);
-	if (unlikely(is_bad_inode(inode)
-		     || IS_DEADDIR(inode)
-		     || sigen != au_iigen(inode, NULL)))
-		goto out_iput;
-
-	dentry = NULL;
-	if (!dir_ino || S_ISDIR(inode->i_mode))
-		dentry = d_find_alias(inode);
-	else {
-		spin_lock(&inode->i_lock);
-		hlist_for_each_entry(d, &inode->i_dentry, d_u.d_alias) {
-			spin_lock(&d->d_lock);
-			if (!au_test_anon(d)
-			    && d_inode(d->d_parent)->i_ino == dir_ino) {
-				dentry = dget_dlock(d);
-				spin_unlock(&d->d_lock);
-				break;
-			}
-			spin_unlock(&d->d_lock);
-		}
-		spin_unlock(&inode->i_lock);
-	}
-	if (unlikely(dentry && au_digen_test(dentry, sigen))) {
-		/* need to refresh */
-		dput(dentry);
-		dentry = NULL;
-	}
-
-out_iput:
-	iput(inode);
-out:
-	AuTraceErrPtr(dentry);
-	return dentry;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* todo: dirty? */
-/* if exportfs_decode_fh() passed vfsmount*, we could be happy */
-
-struct au_compare_mnt_args {
-	/* input */
-	struct super_block *sb;
-
-	/* output */
-	struct vfsmount *mnt;
-};
-
-static int au_compare_mnt(struct vfsmount *mnt, void *arg)
-{
-	struct au_compare_mnt_args *a = arg;
-
-	if (mnt->mnt_sb != a->sb)
-		return 0;
-	a->mnt = mntget(mnt);
-	return 1;
-}
-
-static struct vfsmount *au_mnt_get(struct super_block *sb)
-{
-	int err;
-	struct path root;
-	struct au_compare_mnt_args args = {
-		.sb = sb
-	};
-
-	get_fs_root(current->fs, &root);
-	rcu_read_lock();
-	err = iterate_mounts(au_compare_mnt, &args, root.mnt);
-	rcu_read_unlock();
-	path_put(&root);
-	AuDebugOn(!err);
-	AuDebugOn(!args.mnt);
-	return args.mnt;
-}
-
-struct au_nfsd_si_lock {
-	unsigned int sigen;
-	aufs_bindex_t bindex, br_id;
-	unsigned char force_lock;
-};
-
-static int si_nfsd_read_lock(struct super_block *sb,
-			     struct au_nfsd_si_lock *nsi_lock)
-{
-	int err;
-	aufs_bindex_t bindex;
-
-	si_read_lock(sb, AuLock_FLUSH);
-
-	/* branch id may be wrapped around */
-	err = 0;
-	bindex = au_br_index(sb, nsi_lock->br_id);
-	if (bindex >= 0 && nsi_lock->sigen + AUFS_BRANCH_MAX > au_sigen(sb))
-		goto out; /* success */
-
-	err = -ESTALE;
-	bindex = -1;
-	if (!nsi_lock->force_lock)
-		si_read_unlock(sb);
-
-out:
-	nsi_lock->bindex = bindex;
-	return err;
-}
-
-struct find_name_by_ino {
-	struct dir_context ctx;
-	int called, found;
-	ino_t ino;
-	char *name;
-	int namelen;
-};
-
-static int
-find_name_by_ino(struct dir_context *ctx, const char *name, int namelen,
-		 loff_t offset, u64 ino, unsigned int d_type)
-{
-	struct find_name_by_ino *a = container_of(ctx, struct find_name_by_ino,
-						  ctx);
-
-	a->called++;
-	if (a->ino != ino)
-		return 0;
-
-	memcpy(a->name, name, namelen);
-	a->namelen = namelen;
-	a->found = 1;
-	return 1;
-}
-
-static struct dentry *au_lkup_by_ino(struct path *path, ino_t ino,
-				     struct au_nfsd_si_lock *nsi_lock)
-{
-	struct dentry *dentry, *parent;
-	struct file *file;
-	struct inode *dir;
-	struct find_name_by_ino arg = {
-		.ctx = {
-			.actor = find_name_by_ino
-		}
-	};
-	int err;
-
-	parent = path->dentry;
-	if (nsi_lock)
-		si_read_unlock(parent->d_sb);
-	file = vfsub_dentry_open(path, au_dir_roflags);
-	dentry = (void *)file;
-	if (IS_ERR(file))
-		goto out;
-
-	dentry = ERR_PTR(-ENOMEM);
-	arg.name = (void *)__get_free_page(GFP_NOFS);
-	if (unlikely(!arg.name))
-		goto out_file;
-	arg.ino = ino;
-	arg.found = 0;
-	do {
-		arg.called = 0;
-		/* smp_mb(); */
-		err = vfsub_iterate_dir(file, &arg.ctx);
-	} while (!err && !arg.found && arg.called);
-	dentry = ERR_PTR(err);
-	if (unlikely(err))
-		goto out_name;
-	/* instead of ENOENT */
-	dentry = ERR_PTR(-ESTALE);
-	if (!arg.found)
-		goto out_name;
-
-	/* do not call vfsub_lkup_one() */
-	dir = d_inode(parent);
-	mutex_lock(&dir->i_mutex);
-	dentry = vfsub_lookup_one_len(arg.name, parent, arg.namelen);
-	mutex_unlock(&dir->i_mutex);
-	AuTraceErrPtr(dentry);
-	if (IS_ERR(dentry))
-		goto out_name;
-	AuDebugOn(au_test_anon(dentry));
-	if (unlikely(d_really_is_negative(dentry))) {
-		dput(dentry);
-		dentry = ERR_PTR(-ENOENT);
-	}
-
-out_name:
-	free_page((unsigned long)arg.name);
-out_file:
-	fput(file);
-out:
-	if (unlikely(nsi_lock
-		     && si_nfsd_read_lock(parent->d_sb, nsi_lock) < 0))
-		if (!IS_ERR(dentry)) {
-			dput(dentry);
-			dentry = ERR_PTR(-ESTALE);
-		}
-	AuTraceErrPtr(dentry);
-	return dentry;
-}
-
-static struct dentry *decode_by_dir_ino(struct super_block *sb, ino_t ino,
-					ino_t dir_ino,
-					struct au_nfsd_si_lock *nsi_lock)
-{
-	struct dentry *dentry;
-	struct path path;
-
-	if (dir_ino != AUFS_ROOT_INO) {
-		path.dentry = decode_by_ino(sb, dir_ino, 0);
-		dentry = path.dentry;
-		if (!path.dentry || IS_ERR(path.dentry))
-			goto out;
-		AuDebugOn(au_test_anon(path.dentry));
-	} else
-		path.dentry = dget(sb->s_root);
-
-	path.mnt = au_mnt_get(sb);
-	dentry = au_lkup_by_ino(&path, ino, nsi_lock);
-	path_put(&path);
-
-out:
-	AuTraceErrPtr(dentry);
-	return dentry;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static int h_acceptable(void *expv, struct dentry *dentry)
-{
-	return 1;
-}
-
-static char *au_build_path(struct dentry *h_parent, struct path *h_rootpath,
-			   char *buf, int len, struct super_block *sb)
-{
-	char *p;
-	int n;
-	struct path path;
-
-	p = d_path(h_rootpath, buf, len);
-	if (IS_ERR(p))
-		goto out;
-	n = strlen(p);
-
-	path.mnt = h_rootpath->mnt;
-	path.dentry = h_parent;
-	p = d_path(&path, buf, len);
-	if (IS_ERR(p))
-		goto out;
-	if (n != 1)
-		p += n;
-
-	path.mnt = au_mnt_get(sb);
-	path.dentry = sb->s_root;
-	p = d_path(&path, buf, len - strlen(p));
-	mntput(path.mnt);
-	if (IS_ERR(p))
-		goto out;
-	if (n != 1)
-		p[strlen(p)] = '/';
-
-out:
-	AuTraceErrPtr(p);
-	return p;
-}
-
-static
-struct dentry *decode_by_path(struct super_block *sb, ino_t ino, __u32 *fh,
-			      int fh_len, struct au_nfsd_si_lock *nsi_lock)
-{
-	struct dentry *dentry, *h_parent, *root;
-	struct super_block *h_sb;
-	char *pathname, *p;
-	struct vfsmount *h_mnt;
-	struct au_branch *br;
-	int err;
-	struct path path;
-
-	br = au_sbr(sb, nsi_lock->bindex);
-	h_mnt = au_br_mnt(br);
-	h_sb = h_mnt->mnt_sb;
-	/* todo: call lower fh_to_dentry()? fh_to_parent()? */
-	h_parent = exportfs_decode_fh(h_mnt, (void *)(fh + Fh_tail),
-				      fh_len - Fh_tail, fh[Fh_h_type],
-				      h_acceptable, /*context*/NULL);
-	dentry = h_parent;
-	if (unlikely(!h_parent || IS_ERR(h_parent))) {
-		AuWarn1("%s decode_fh failed, %ld\n",
-			au_sbtype(h_sb), PTR_ERR(h_parent));
-		goto out;
-	}
-	dentry = NULL;
-	if (unlikely(au_test_anon(h_parent))) {
-		AuWarn1("%s decode_fh returned a disconnected dentry\n",
-			au_sbtype(h_sb));
-		goto out_h_parent;
-	}
-
-	dentry = ERR_PTR(-ENOMEM);
-	pathname = (void *)__get_free_page(GFP_NOFS);
-	if (unlikely(!pathname))
-		goto out_h_parent;
-
-	root = sb->s_root;
-	path.mnt = h_mnt;
-	di_read_lock_parent(root, !AuLock_IR);
-	path.dentry = au_h_dptr(root, nsi_lock->bindex);
-	di_read_unlock(root, !AuLock_IR);
-	p = au_build_path(h_parent, &path, pathname, PAGE_SIZE, sb);
-	dentry = (void *)p;
-	if (IS_ERR(p))
-		goto out_pathname;
-
-	si_read_unlock(sb);
-	err = vfsub_kern_path(p, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path);
-	dentry = ERR_PTR(err);
-	if (unlikely(err))
-		goto out_relock;
-
-	dentry = ERR_PTR(-ENOENT);
-	AuDebugOn(au_test_anon(path.dentry));
-	if (unlikely(d_really_is_negative(path.dentry)))
-		goto out_path;
-
-	if (ino != d_inode(path.dentry)->i_ino)
-		dentry = au_lkup_by_ino(&path, ino, /*nsi_lock*/NULL);
-	else
-		dentry = dget(path.dentry);
-
-out_path:
-	path_put(&path);
-out_relock:
-	if (unlikely(si_nfsd_read_lock(sb, nsi_lock) < 0))
-		if (!IS_ERR(dentry)) {
-			dput(dentry);
-			dentry = ERR_PTR(-ESTALE);
-		}
-out_pathname:
-	free_page((unsigned long)pathname);
-out_h_parent:
-	dput(h_parent);
-out:
-	AuTraceErrPtr(dentry);
-	return dentry;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static struct dentry *
-aufs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len,
-		  int fh_type)
-{
-	struct dentry *dentry;
-	__u32 *fh = fid->raw;
-	struct au_branch *br;
-	ino_t ino, dir_ino;
-	struct au_nfsd_si_lock nsi_lock = {
-		.force_lock	= 0
-	};
-
-	dentry = ERR_PTR(-ESTALE);
-	/* it should never happen, but the file handle is unreliable */
-	if (unlikely(fh_len < Fh_tail))
-		goto out;
-	nsi_lock.sigen = fh[Fh_sigen];
-	nsi_lock.br_id = fh[Fh_br_id];
-
-	/* branch id may be wrapped around */
-	br = NULL;
-	if (unlikely(si_nfsd_read_lock(sb, &nsi_lock)))
-		goto out;
-	nsi_lock.force_lock = 1;
-
-	/* is this inode still cached? */
-	ino = decode_ino(fh + Fh_ino);
-	/* it should never happen */
-	if (unlikely(ino == AUFS_ROOT_INO))
-		goto out_unlock;
-
-	dir_ino = decode_ino(fh + Fh_dir_ino);
-	dentry = decode_by_ino(sb, ino, dir_ino);
-	if (IS_ERR(dentry))
-		goto out_unlock;
-	if (dentry)
-		goto accept;
-
-	/* is the parent dir cached? */
-	br = au_sbr(sb, nsi_lock.bindex);
-	atomic_inc(&br->br_count);
-	dentry = decode_by_dir_ino(sb, ino, dir_ino, &nsi_lock);
-	if (IS_ERR(dentry))
-		goto out_unlock;
-	if (dentry)
-		goto accept;
-
-	/* lookup path */
-	dentry = decode_by_path(sb, ino, fh, fh_len, &nsi_lock);
-	if (IS_ERR(dentry))
-		goto out_unlock;
-	if (unlikely(!dentry))
-		/* todo?: make it ESTALE */
-		goto out_unlock;
-
-accept:
-	if (!au_digen_test(dentry, au_sigen(sb))
-	    && d_inode(dentry)->i_generation == fh[Fh_igen])
-		goto out_unlock; /* success */
-
-	dput(dentry);
-	dentry = ERR_PTR(-ESTALE);
-out_unlock:
-	if (br)
-		atomic_dec(&br->br_count);
-	si_read_unlock(sb);
-out:
-	AuTraceErrPtr(dentry);
-	return dentry;
-}
-
-#if 0 /* reserved for future use */
-/* support subtreecheck option */
-static struct dentry *aufs_fh_to_parent(struct super_block *sb, struct fid *fid,
-					int fh_len, int fh_type)
-{
-	struct dentry *parent;
-	__u32 *fh = fid->raw;
-	ino_t dir_ino;
-
-	dir_ino = decode_ino(fh + Fh_dir_ino);
-	parent = decode_by_ino(sb, dir_ino, 0);
-	if (IS_ERR(parent))
-		goto out;
-	if (!parent)
-		parent = decode_by_path(sb, au_br_index(sb, fh[Fh_br_id]),
-					dir_ino, fh, fh_len);
-
-out:
-	AuTraceErrPtr(parent);
-	return parent;
-}
-#endif
-
-/* ---------------------------------------------------------------------- */
-
-static int aufs_encode_fh(struct inode *inode, __u32 *fh, int *max_len,
-			  struct inode *dir)
-{
-	int err;
-	aufs_bindex_t bindex;
-	struct super_block *sb, *h_sb;
-	struct dentry *dentry, *parent, *h_parent;
-	struct inode *h_dir;
-	struct au_branch *br;
-
-	err = -ENOSPC;
-	if (unlikely(*max_len <= Fh_tail)) {
-		AuWarn1("NFSv2 client (max_len %d)?\n", *max_len);
-		goto out;
-	}
-
-	err = FILEID_ROOT;
-	if (inode->i_ino == AUFS_ROOT_INO) {
-		AuDebugOn(inode->i_ino != AUFS_ROOT_INO);
-		goto out;
-	}
-
-	h_parent = NULL;
-	sb = inode->i_sb;
-	err = si_read_lock(sb, AuLock_FLUSH);
-	if (unlikely(err))
-		goto out;
-
-#ifdef CONFIG_AUFS_DEBUG
-	if (unlikely(!au_opt_test(au_mntflags(sb), XINO)))
-		AuWarn1("NFS-exporting requires xino\n");
-#endif
-	err = -EIO;
-	parent = NULL;
-	ii_read_lock_child(inode);
-	bindex = au_ibstart(inode);
-	if (!dir) {
-		dentry = d_find_any_alias(inode);
-		if (unlikely(!dentry))
-			goto out_unlock;
-		AuDebugOn(au_test_anon(dentry));
-		parent = dget_parent(dentry);
-		dput(dentry);
-		if (unlikely(!parent))
-			goto out_unlock;
-		if (d_really_is_positive(parent))
-			dir = d_inode(parent);
-	}
-
-	ii_read_lock_parent(dir);
-	h_dir = au_h_iptr(dir, bindex);
-	ii_read_unlock(dir);
-	if (unlikely(!h_dir))
-		goto out_parent;
-	h_parent = d_find_any_alias(h_dir);
-	if (unlikely(!h_parent))
-		goto out_hparent;
-
-	err = -EPERM;
-	br = au_sbr(sb, bindex);
-	h_sb = au_br_sb(br);
-	if (unlikely(!h_sb->s_export_op)) {
-		AuErr1("%s branch is not exportable\n", au_sbtype(h_sb));
-		goto out_hparent;
-	}
-
-	fh[Fh_br_id] = br->br_id;
-	fh[Fh_sigen] = au_sigen(sb);
-	encode_ino(fh + Fh_ino, inode->i_ino);
-	encode_ino(fh + Fh_dir_ino, dir->i_ino);
-	fh[Fh_igen] = inode->i_generation;
-
-	*max_len -= Fh_tail;
-	fh[Fh_h_type] = exportfs_encode_fh(h_parent, (void *)(fh + Fh_tail),
-					   max_len,
-					   /*connectable or subtreecheck*/0);
-	err = fh[Fh_h_type];
-	*max_len += Fh_tail;
-	/* todo: macros? */
-	if (err != FILEID_INVALID)
-		err = 99;
-	else
-		AuWarn1("%s encode_fh failed\n", au_sbtype(h_sb));
-
-out_hparent:
-	dput(h_parent);
-out_parent:
-	dput(parent);
-out_unlock:
-	ii_read_unlock(inode);
-	si_read_unlock(sb);
-out:
-	if (unlikely(err < 0))
-		err = FILEID_INVALID;
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static int aufs_commit_metadata(struct inode *inode)
-{
-	int err;
-	aufs_bindex_t bindex;
-	struct super_block *sb;
-	struct inode *h_inode;
-	int (*f)(struct inode *inode);
-
-	sb = inode->i_sb;
-	si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW);
-	ii_write_lock_child(inode);
-	bindex = au_ibstart(inode);
-	AuDebugOn(bindex < 0);
-	h_inode = au_h_iptr(inode, bindex);
-
-	f = h_inode->i_sb->s_export_op->commit_metadata;
-	if (f)
-		err = f(h_inode);
-	else {
-		struct writeback_control wbc = {
-			.sync_mode	= WB_SYNC_ALL,
-			.nr_to_write	= 0 /* metadata only */
-		};
-
-		err = sync_inode(h_inode, &wbc);
-	}
-
-	au_cpup_attr_timesizes(inode);
-	ii_write_unlock(inode);
-	si_read_unlock(sb);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static struct export_operations aufs_export_op = {
-	.fh_to_dentry		= aufs_fh_to_dentry,
-	/* .fh_to_parent	= aufs_fh_to_parent, */
-	.encode_fh		= aufs_encode_fh,
-	.commit_metadata	= aufs_commit_metadata
-};
-
-void au_export_init(struct super_block *sb)
-{
-	struct au_sbinfo *sbinfo;
-	__u32 u;
-
-	sb->s_export_op = &aufs_export_op;
-	sbinfo = au_sbi(sb);
-	sbinfo->si_xigen = NULL;
-	get_random_bytes(&u, sizeof(u));
-	BUILD_BUG_ON(sizeof(u) != sizeof(int));
-	atomic_set(&sbinfo->si_xigen_next, u);
-}
diff --git a/fs/aufs/f_op.c b/fs/aufs/f_op.c
deleted file mode 100644
index 145dec870..000000000
--- a/fs/aufs/f_op.c
+++ /dev/null
@@ -1,759 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * file and vm operations
- */
-
-#include <linux/aio.h>
-#include <linux/fs_stack.h>
-#include <linux/mman.h>
-#include <linux/security.h>
-#include "aufs.h"
-
-int au_do_open_nondir(struct file *file, int flags, struct file *h_file)
-{
-	int err;
-	aufs_bindex_t bindex;
-	struct dentry *dentry, *h_dentry;
-	struct au_finfo *finfo;
-	struct inode *h_inode;
-
-	FiMustWriteLock(file);
-
-	err = 0;
-	dentry = file->f_path.dentry;
-	AuDebugOn(IS_ERR_OR_NULL(dentry));
-	finfo = au_fi(file);
-	memset(&finfo->fi_htop, 0, sizeof(finfo->fi_htop));
-	atomic_set(&finfo->fi_mmapped, 0);
-	bindex = au_dbstart(dentry);
-	if (!h_file) {
-		h_dentry = au_h_dptr(dentry, bindex);
-		err = vfsub_test_mntns(file->f_path.mnt, h_dentry->d_sb);
-		if (unlikely(err))
-			goto out;
-		h_file = au_h_open(dentry, bindex, flags, file, /*force_wr*/0);
-	} else {
-		h_dentry = h_file->f_path.dentry;
-		err = vfsub_test_mntns(file->f_path.mnt, h_dentry->d_sb);
-		if (unlikely(err))
-			goto out;
-		get_file(h_file);
-	}
-	if (IS_ERR(h_file))
-		err = PTR_ERR(h_file);
-	else {
-		if ((flags & __O_TMPFILE)
-		    && !(flags & O_EXCL)) {
-			h_inode = file_inode(h_file);
-			spin_lock(&h_inode->i_lock);
-			h_inode->i_state |= I_LINKABLE;
-			spin_unlock(&h_inode->i_lock);
-		}
-		au_set_fbstart(file, bindex);
-		au_set_h_fptr(file, bindex, h_file);
-		au_update_figen(file);
-		/* todo: necessary? */
-		/* file->f_ra = h_file->f_ra; */
-	}
-
-out:
-	return err;
-}
-
-static int aufs_open_nondir(struct inode *inode __maybe_unused,
-			    struct file *file)
-{
-	int err;
-	struct super_block *sb;
-	struct au_do_open_args args = {
-		.open	= au_do_open_nondir
-	};
-
-	AuDbg("%pD, f_flags 0x%x, f_mode 0x%x\n",
-	      file, vfsub_file_flags(file), file->f_mode);
-
-	sb = file->f_path.dentry->d_sb;
-	si_read_lock(sb, AuLock_FLUSH);
-	err = au_do_open(file, &args);
-	si_read_unlock(sb);
-	return err;
-}
-
-int aufs_release_nondir(struct inode *inode __maybe_unused, struct file *file)
-{
-	struct au_finfo *finfo;
-	aufs_bindex_t bindex;
-
-	finfo = au_fi(file);
-	au_sphl_del(&finfo->fi_hlist,
-		    &au_sbi(file->f_path.dentry->d_sb)->si_files);
-	bindex = finfo->fi_btop;
-	if (bindex >= 0)
-		au_set_h_fptr(file, bindex, NULL);
-
-	au_finfo_fin(file);
-	return 0;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static int au_do_flush_nondir(struct file *file, fl_owner_t id)
-{
-	int err;
-	struct file *h_file;
-
-	err = 0;
-	h_file = au_hf_top(file);
-	if (h_file)
-		err = vfsub_flush(h_file, id);
-	return err;
-}
-
-static int aufs_flush_nondir(struct file *file, fl_owner_t id)
-{
-	return au_do_flush(file, id, au_do_flush_nondir);
-}
-
-/* ---------------------------------------------------------------------- */
-/*
- * read and write functions acquire [fdi]_rwsem once, but release before
- * mmap_sem. This is because to stop a race condition between mmap(2).
- * Releasing these aufs-rwsem should be safe, no branch-mamagement (by keeping
- * si_rwsem), no harmful copy-up should happen. Actually copy-up may happen in
- * read functions after [fdi]_rwsem are released, but it should be harmless.
- */
-
-/* Callers should call au_read_post() or fput() in the end */
-struct file *au_read_pre(struct file *file, int keep_fi)
-{
-	struct file *h_file;
-	int err;
-
-	err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/0);
-	if (!err) {
-		di_read_unlock(file->f_path.dentry, AuLock_IR);
-		h_file = au_hf_top(file);
-		get_file(h_file);
-		if (!keep_fi)
-			fi_read_unlock(file);
-	} else
-		h_file = ERR_PTR(err);
-
-	return h_file;
-}
-
-static void au_read_post(struct inode *inode, struct file *h_file)
-{
-	/* update without lock, I don't think it a problem */
-	fsstack_copy_attr_atime(inode, file_inode(h_file));
-	fput(h_file);
-}
-
-struct au_write_pre {
-	blkcnt_t blks;
-	aufs_bindex_t bstart;
-};
-
-/*
- * return with iinfo is write-locked
- * callers should call au_write_post() or iinfo_write_unlock() + fput() in the
- * end
- */
-static struct file *au_write_pre(struct file *file, int do_ready,
-				 struct au_write_pre *wpre)
-{
-	struct file *h_file;
-	struct dentry *dentry;
-	int err;
-	struct au_pin pin;
-
-	err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1);
-	h_file = ERR_PTR(err);
-	if (unlikely(err))
-		goto out;
-
-	dentry = file->f_path.dentry;
-	if (do_ready) {
-		err = au_ready_to_write(file, -1, &pin);
-		if (unlikely(err)) {
-			h_file = ERR_PTR(err);
-			di_write_unlock(dentry);
-			goto out_fi;
-		}
-	}
-
-	di_downgrade_lock(dentry, /*flags*/0);
-	if (wpre)
-		wpre->bstart = au_fbstart(file);
-	h_file = au_hf_top(file);
-	get_file(h_file);
-	if (wpre)
-		wpre->blks = file_inode(h_file)->i_blocks;
-	if (do_ready)
-		au_unpin(&pin);
-	di_read_unlock(dentry, /*flags*/0);
-
-out_fi:
-	fi_write_unlock(file);
-out:
-	return h_file;
-}
-
-static void au_write_post(struct inode *inode, struct file *h_file,
-			  struct au_write_pre *wpre, ssize_t written)
-{
-	struct inode *h_inode;
-
-	au_cpup_attr_timesizes(inode);
-	AuDebugOn(au_ibstart(inode) != wpre->bstart);
-	h_inode = file_inode(h_file);
-	inode->i_mode = h_inode->i_mode;
-	ii_write_unlock(inode);
-	fput(h_file);
-
-	/* AuDbg("blks %llu, %llu\n", (u64)blks, (u64)h_inode->i_blocks); */
-	if (written > 0)
-		au_fhsm_wrote(inode->i_sb, wpre->bstart,
-			      /*force*/h_inode->i_blocks > wpre->blks);
-}
-
-static ssize_t aufs_read(struct file *file, char __user *buf, size_t count,
-			 loff_t *ppos)
-{
-	ssize_t err;
-	struct inode *inode;
-	struct file *h_file;
-	struct super_block *sb;
-
-	inode = file_inode(file);
-	sb = inode->i_sb;
-	si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW);
-
-	h_file = au_read_pre(file, /*keep_fi*/0);
-	err = PTR_ERR(h_file);
-	if (IS_ERR(h_file))
-		goto out;
-
-	/* filedata may be obsoleted by concurrent copyup, but no problem */
-	err = vfsub_read_u(h_file, buf, count, ppos);
-	/* todo: necessary? */
-	/* file->f_ra = h_file->f_ra; */
-	au_read_post(inode, h_file);
-
-out:
-	si_read_unlock(sb);
-	return err;
-}
-
-/*
- * todo: very ugly
- * it locks both of i_mutex and si_rwsem for read in safe.
- * if the plink maintenance mode continues forever (that is the problem),
- * may loop forever.
- */
-static void au_mtx_and_read_lock(struct inode *inode)
-{
-	int err;
-	struct super_block *sb = inode->i_sb;
-
-	while (1) {
-		mutex_lock(&inode->i_mutex);
-		err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM);
-		if (!err)
-			break;
-		mutex_unlock(&inode->i_mutex);
-		si_read_lock(sb, AuLock_NOPLMW);
-		si_read_unlock(sb);
-	}
-}
-
-static ssize_t aufs_write(struct file *file, const char __user *ubuf,
-			  size_t count, loff_t *ppos)
-{
-	ssize_t err;
-	struct au_write_pre wpre;
-	struct inode *inode;
-	struct file *h_file;
-	char __user *buf = (char __user *)ubuf;
-
-	inode = file_inode(file);
-	au_mtx_and_read_lock(inode);
-
-	h_file = au_write_pre(file, /*do_ready*/1, &wpre);
-	err = PTR_ERR(h_file);
-	if (IS_ERR(h_file))
-		goto out;
-
-	err = vfsub_write_u(h_file, buf, count, ppos);
-	au_write_post(inode, h_file, &wpre, err);
-
-out:
-	si_read_unlock(inode->i_sb);
-	mutex_unlock(&inode->i_mutex);
-	return err;
-}
-
-static ssize_t au_do_iter(struct file *h_file, int rw, struct kiocb *kio,
-			  struct iov_iter *iov_iter)
-{
-	ssize_t err;
-	struct file *file;
-	ssize_t (*iter)(struct kiocb *, struct iov_iter *);
-
-	err = security_file_permission(h_file, rw);
-	if (unlikely(err))
-		goto out;
-
-	err = -ENOSYS;
-	iter = NULL;
-	if (rw == MAY_READ)
-		iter = h_file->f_op->read_iter;
-	else if (rw == MAY_WRITE)
-		iter = h_file->f_op->write_iter;
-
-	file = kio->ki_filp;
-	kio->ki_filp = h_file;
-	if (iter) {
-		lockdep_off();
-		err = iter(kio, iov_iter);
-		lockdep_on();
-	} else
-		/* currently there is no such fs */
-		WARN_ON_ONCE(1);
-	kio->ki_filp = file;
-
-out:
-	return err;
-}
-
-static ssize_t aufs_read_iter(struct kiocb *kio, struct iov_iter *iov_iter)
-{
-	ssize_t err;
-	struct file *file, *h_file;
-	struct inode *inode;
-	struct super_block *sb;
-
-	file = kio->ki_filp;
-	inode = file_inode(file);
-	sb = inode->i_sb;
-	si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW);
-
-	h_file = au_read_pre(file, /*keep_fi*/0);
-	err = PTR_ERR(h_file);
-	if (IS_ERR(h_file))
-		goto out;
-
-	err = au_do_iter(h_file, MAY_READ, kio, iov_iter);
-	/* todo: necessary? */
-	/* file->f_ra = h_file->f_ra; */
-	au_read_post(inode, h_file);
-
-out:
-	si_read_unlock(sb);
-	return err;
-}
-
-static ssize_t aufs_write_iter(struct kiocb *kio, struct iov_iter *iov_iter)
-{
-	ssize_t err;
-	struct au_write_pre wpre;
-	struct inode *inode;
-	struct file *file, *h_file;
-
-	file = kio->ki_filp;
-	inode = file_inode(file);
-	au_mtx_and_read_lock(inode);
-
-	h_file = au_write_pre(file, /*do_ready*/1, &wpre);
-	err = PTR_ERR(h_file);
-	if (IS_ERR(h_file))
-		goto out;
-
-	err = au_do_iter(h_file, MAY_WRITE, kio, iov_iter);
-	au_write_post(inode, h_file, &wpre, err);
-
-out:
-	si_read_unlock(inode->i_sb);
-	mutex_unlock(&inode->i_mutex);
-	return err;
-}
-
-static ssize_t aufs_splice_read(struct file *file, loff_t *ppos,
-				struct pipe_inode_info *pipe, size_t len,
-				unsigned int flags)
-{
-	ssize_t err;
-	struct file *h_file;
-	struct inode *inode;
-	struct super_block *sb;
-
-	inode = file_inode(file);
-	sb = inode->i_sb;
-	si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW);
-
-	h_file = au_read_pre(file, /*keep_fi*/1);
-	err = PTR_ERR(h_file);
-	if (IS_ERR(h_file))
-		goto out;
-
-	if (au_test_loopback_kthread()) {
-		au_warn_loopback(h_file->f_path.dentry->d_sb);
-		if (file->f_mapping != h_file->f_mapping) {
-			file->f_mapping = h_file->f_mapping;
-			smp_mb(); /* unnecessary? */
-		}
-	}
-	fi_read_unlock(file);
-
-	err = vfsub_splice_to(h_file, ppos, pipe, len, flags);
-	/* todo: necessasry? */
-	/* file->f_ra = h_file->f_ra; */
-	au_read_post(inode, h_file);
-
-out:
-	si_read_unlock(sb);
-	return err;
-}
-
-static ssize_t
-aufs_splice_write(struct pipe_inode_info *pipe, struct file *file, loff_t *ppos,
-		  size_t len, unsigned int flags)
-{
-	ssize_t err;
-	struct au_write_pre wpre;
-	struct inode *inode;
-	struct file *h_file;
-
-	inode = file_inode(file);
-	au_mtx_and_read_lock(inode);
-
-	h_file = au_write_pre(file, /*do_ready*/1, &wpre);
-	err = PTR_ERR(h_file);
-	if (IS_ERR(h_file))
-		goto out;
-
-	err = vfsub_splice_from(pipe, h_file, ppos, len, flags);
-	au_write_post(inode, h_file, &wpre, err);
-
-out:
-	si_read_unlock(inode->i_sb);
-	mutex_unlock(&inode->i_mutex);
-	return err;
-}
-
-static long aufs_fallocate(struct file *file, int mode, loff_t offset,
-			   loff_t len)
-{
-	long err;
-	struct au_write_pre wpre;
-	struct inode *inode;
-	struct file *h_file;
-
-	inode = file_inode(file);
-	au_mtx_and_read_lock(inode);
-
-	h_file = au_write_pre(file, /*do_ready*/1, &wpre);
-	err = PTR_ERR(h_file);
-	if (IS_ERR(h_file))
-		goto out;
-
-	lockdep_off();
-	err = vfs_fallocate(h_file, mode, offset, len);
-	lockdep_on();
-	au_write_post(inode, h_file, &wpre, /*written*/1);
-
-out:
-	si_read_unlock(inode->i_sb);
-	mutex_unlock(&inode->i_mutex);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * The locking order around current->mmap_sem.
- * - in most and regular cases
- *   file I/O syscall -- aufs_read() or something
- *	-- si_rwsem for read -- mmap_sem
- *	(Note that [fdi]i_rwsem are released before mmap_sem).
- * - in mmap case
- *   mmap(2) -- mmap_sem -- aufs_mmap() -- si_rwsem for read -- [fdi]i_rwsem
- * This AB-BA order is definitly bad, but is not a problem since "si_rwsem for
- * read" allows muliple processes to acquire it and [fdi]i_rwsem are not held in
- * file I/O. Aufs needs to stop lockdep in aufs_mmap() though.
- * It means that when aufs acquires si_rwsem for write, the process should never
- * acquire mmap_sem.
- *
- * Actually aufs_iterate() holds [fdi]i_rwsem before mmap_sem, but this is not a
- * problem either since any directory is not able to be mmap-ed.
- * The similar scenario is applied to aufs_readlink() too.
- */
-
-#if 0 /* stop calling security_file_mmap() */
-/* cf. linux/include/linux/mman.h: calc_vm_prot_bits() */
-#define AuConv_VM_PROT(f, b)	_calc_vm_trans(f, VM_##b, PROT_##b)
-
-static unsigned long au_arch_prot_conv(unsigned long flags)
-{
-	/* currently ppc64 only */
-#ifdef CONFIG_PPC64
-	/* cf. linux/arch/powerpc/include/asm/mman.h */
-	AuDebugOn(arch_calc_vm_prot_bits(-1) != VM_SAO);
-	return AuConv_VM_PROT(flags, SAO);
-#else
-	AuDebugOn(arch_calc_vm_prot_bits(-1));
-	return 0;
-#endif
-}
-
-static unsigned long au_prot_conv(unsigned long flags)
-{
-	return AuConv_VM_PROT(flags, READ)
-		| AuConv_VM_PROT(flags, WRITE)
-		| AuConv_VM_PROT(flags, EXEC)
-		| au_arch_prot_conv(flags);
-}
-
-/* cf. linux/include/linux/mman.h: calc_vm_flag_bits() */
-#define AuConv_VM_MAP(f, b)	_calc_vm_trans(f, VM_##b, MAP_##b)
-
-static unsigned long au_flag_conv(unsigned long flags)
-{
-	return AuConv_VM_MAP(flags, GROWSDOWN)
-		| AuConv_VM_MAP(flags, DENYWRITE)
-		| AuConv_VM_MAP(flags, LOCKED);
-}
-#endif
-
-static int aufs_mmap(struct file *file, struct vm_area_struct *vma)
-{
-	int err;
-	const unsigned char wlock
-		= (file->f_mode & FMODE_WRITE) && (vma->vm_flags & VM_SHARED);
-	struct super_block *sb;
-	struct file *h_file;
-	struct inode *inode;
-
-	AuDbgVmRegion(file, vma);
-
-	inode = file_inode(file);
-	sb = inode->i_sb;
-	lockdep_off();
-	si_read_lock(sb, AuLock_NOPLMW);
-
-	h_file = au_write_pre(file, wlock, /*wpre*/NULL);
-	lockdep_on();
-	err = PTR_ERR(h_file);
-	if (IS_ERR(h_file))
-		goto out;
-
-	err = 0;
-	au_set_mmapped(file);
-	au_vm_file_reset(vma, h_file);
-	/*
-	 * we cannot call security_mmap_file() here since it may acquire
-	 * mmap_sem or i_mutex.
-	 *
-	 * err = security_mmap_file(h_file, au_prot_conv(vma->vm_flags),
-	 *			 au_flag_conv(vma->vm_flags));
-	 */
-	if (!err)
-		err = h_file->f_op->mmap(h_file, vma);
-	if (!err) {
-		au_vm_prfile_set(vma, file);
-		fsstack_copy_attr_atime(inode, file_inode(h_file));
-		goto out_fput; /* success */
-	}
-	au_unset_mmapped(file);
-	au_vm_file_reset(vma, file);
-
-out_fput:
-	lockdep_off();
-	ii_write_unlock(inode);
-	lockdep_on();
-	fput(h_file);
-out:
-	lockdep_off();
-	si_read_unlock(sb);
-	lockdep_on();
-	AuTraceErr(err);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static int aufs_fsync_nondir(struct file *file, loff_t start, loff_t end,
-			     int datasync)
-{
-	int err;
-	struct au_write_pre wpre;
-	struct inode *inode;
-	struct file *h_file;
-
-	err = 0; /* -EBADF; */ /* posix? */
-	if (unlikely(!(file->f_mode & FMODE_WRITE)))
-		goto out;
-
-	inode = file_inode(file);
-	au_mtx_and_read_lock(inode);
-
-	h_file = au_write_pre(file, /*do_ready*/1, &wpre);
-	err = PTR_ERR(h_file);
-	if (IS_ERR(h_file))
-		goto out_unlock;
-
-	err = vfsub_fsync(h_file, &h_file->f_path, datasync);
-	au_write_post(inode, h_file, &wpre, /*written*/0);
-
-out_unlock:
-	si_read_unlock(inode->i_sb);
-	mutex_unlock(&inode->i_mutex);
-out:
-	return err;
-}
-
-/* no one supports this operation, currently */
-#if 0
-static int aufs_aio_fsync_nondir(struct kiocb *kio, int datasync)
-{
-	int err;
-	struct au_write_pre wpre;
-	struct inode *inode;
-	struct file *file, *h_file;
-
-	err = 0; /* -EBADF; */ /* posix? */
-	if (unlikely(!(file->f_mode & FMODE_WRITE)))
-		goto out;
-
-	file = kio->ki_filp;
-	inode = file_inode(file);
-	au_mtx_and_read_lock(inode);
-
-	h_file = au_write_pre(file, /*do_ready*/1, &wpre);
-	err = PTR_ERR(h_file);
-	if (IS_ERR(h_file))
-		goto out_unlock;
-
-	err = -ENOSYS;
-	h_file = au_hf_top(file);
-	if (h_file->f_op->aio_fsync) {
-		struct mutex *h_mtx;
-
-		h_mtx = &file_inode(h_file)->i_mutex;
-		if (!is_sync_kiocb(kio)) {
-			get_file(h_file);
-			fput(file);
-		}
-		kio->ki_filp = h_file;
-		err = h_file->f_op->aio_fsync(kio, datasync);
-		mutex_lock_nested(h_mtx, AuLsc_I_CHILD);
-		if (!err)
-			vfsub_update_h_iattr(&h_file->f_path, /*did*/NULL);
-		/*ignore*/
-		mutex_unlock(h_mtx);
-	}
-	au_write_post(inode, h_file, &wpre, /*written*/0);
-
-out_unlock:
-	si_read_unlock(inode->sb);
-	mutex_unlock(&inode->i_mutex);
-out:
-	return err;
-}
-#endif
-
-static int aufs_fasync(int fd, struct file *file, int flag)
-{
-	int err;
-	struct file *h_file;
-	struct super_block *sb;
-
-	sb = file->f_path.dentry->d_sb;
-	si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW);
-
-	h_file = au_read_pre(file, /*keep_fi*/0);
-	err = PTR_ERR(h_file);
-	if (IS_ERR(h_file))
-		goto out;
-
-	if (h_file->f_op->fasync)
-		err = h_file->f_op->fasync(fd, h_file, flag);
-	fput(h_file); /* instead of au_read_post() */
-
-out:
-	si_read_unlock(sb);
-	return err;
-}
-
-static int aufs_setfl(struct file *file, unsigned long arg)
-{
-	int err;
-	struct file *h_file;
-	struct super_block *sb;
-
-	sb = file->f_path.dentry->d_sb;
-	si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW);
-
-	h_file = au_read_pre(file, /*keep_fi*/0);
-	err = PTR_ERR(h_file);
-	if (IS_ERR(h_file))
-		goto out;
-
-	arg |= vfsub_file_flags(file) & FASYNC; /* stop calling h_file->fasync */
-	err = setfl(/*unused fd*/-1, h_file, arg);
-	fput(h_file); /* instead of au_read_post() */
-
-out:
-	si_read_unlock(sb);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* no one supports this operation, currently */
-#if 0
-static ssize_t aufs_sendpage(struct file *file, struct page *page, int offset,
-			     size_t len, loff_t *pos, int more)
-{
-}
-#endif
-
-/* ---------------------------------------------------------------------- */
-
-const struct file_operations aufs_file_fop = {
-	.owner		= THIS_MODULE,
-
-	.llseek		= default_llseek,
-
-	.read		= aufs_read,
-	.write		= aufs_write,
-	.read_iter	= aufs_read_iter,
-	.write_iter	= aufs_write_iter,
-
-#ifdef CONFIG_AUFS_POLL
-	.poll		= aufs_poll,
-#endif
-	.unlocked_ioctl	= aufs_ioctl_nondir,
-#ifdef CONFIG_COMPAT
-	.compat_ioctl	= aufs_compat_ioctl_nondir,
-#endif
-	.mmap		= aufs_mmap,
-	.open		= aufs_open_nondir,
-	.flush		= aufs_flush_nondir,
-	.release	= aufs_release_nondir,
-	.fsync		= aufs_fsync_nondir,
-	/* .aio_fsync	= aufs_aio_fsync_nondir, */
-	.fasync		= aufs_fasync,
-	/* .sendpage	= aufs_sendpage, */
-	.setfl		= aufs_setfl,
-	.splice_write	= aufs_splice_write,
-	.splice_read	= aufs_splice_read,
-#if 0
-	.aio_splice_write = aufs_aio_splice_write,
-	.aio_splice_read  = aufs_aio_splice_read,
-#endif
-	.fallocate	= aufs_fallocate
-};
diff --git a/fs/aufs/fhsm.c b/fs/aufs/fhsm.c
deleted file mode 100644
index db079d6ee..000000000
--- a/fs/aufs/fhsm.c
+++ /dev/null
@@ -1,412 +0,0 @@
-/*
- * Copyright (C) 2011-2016 Junjiro R. Okajima
- */
-
-/*
- * File-based Hierarchy Storage Management
- */
-
-#include <linux/anon_inodes.h>
-#include <linux/poll.h>
-#include <linux/seq_file.h>
-#include <linux/statfs.h>
-#include "aufs.h"
-
-static aufs_bindex_t au_fhsm_bottom(struct super_block *sb)
-{
-	struct au_sbinfo *sbinfo;
-	struct au_fhsm *fhsm;
-
-	SiMustAnyLock(sb);
-
-	sbinfo = au_sbi(sb);
-	fhsm = &sbinfo->si_fhsm;
-	AuDebugOn(!fhsm);
-	return fhsm->fhsm_bottom;
-}
-
-void au_fhsm_set_bottom(struct super_block *sb, aufs_bindex_t bindex)
-{
-	struct au_sbinfo *sbinfo;
-	struct au_fhsm *fhsm;
-
-	SiMustWriteLock(sb);
-
-	sbinfo = au_sbi(sb);
-	fhsm = &sbinfo->si_fhsm;
-	AuDebugOn(!fhsm);
-	fhsm->fhsm_bottom = bindex;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static int au_fhsm_test_jiffy(struct au_sbinfo *sbinfo, struct au_branch *br)
-{
-	struct au_br_fhsm *bf;
-
-	bf = br->br_fhsm;
-	MtxMustLock(&bf->bf_lock);
-
-	return !bf->bf_readable
-		|| time_after(jiffies,
-			      bf->bf_jiffy + sbinfo->si_fhsm.fhsm_expire);
-}
-
-/* ---------------------------------------------------------------------- */
-
-static void au_fhsm_notify(struct super_block *sb, int val)
-{
-	struct au_sbinfo *sbinfo;
-	struct au_fhsm *fhsm;
-
-	SiMustAnyLock(sb);
-
-	sbinfo = au_sbi(sb);
-	fhsm = &sbinfo->si_fhsm;
-	if (au_fhsm_pid(fhsm)
-	    && atomic_read(&fhsm->fhsm_readable) != -1) {
-		atomic_set(&fhsm->fhsm_readable, val);
-		if (val)
-			wake_up(&fhsm->fhsm_wqh);
-	}
-}
-
-static int au_fhsm_stfs(struct super_block *sb, aufs_bindex_t bindex,
-			struct aufs_stfs *rstfs, int do_lock, int do_notify)
-{
-	int err;
-	struct au_branch *br;
-	struct au_br_fhsm *bf;
-
-	br = au_sbr(sb, bindex);
-	AuDebugOn(au_br_rdonly(br));
-	bf = br->br_fhsm;
-	AuDebugOn(!bf);
-
-	if (do_lock)
-		mutex_lock(&bf->bf_lock);
-	else
-		MtxMustLock(&bf->bf_lock);
-
-	/* sb->s_root for NFS is unreliable */
-	err = au_br_stfs(br, &bf->bf_stfs);
-	if (unlikely(err)) {
-		AuErr1("FHSM failed (%d), b%d, ignored.\n", bindex, err);
-		goto out;
-	}
-
-	bf->bf_jiffy = jiffies;
-	bf->bf_readable = 1;
-	if (do_notify)
-		au_fhsm_notify(sb, /*val*/1);
-	if (rstfs)
-		*rstfs = bf->bf_stfs;
-
-out:
-	if (do_lock)
-		mutex_unlock(&bf->bf_lock);
-	au_fhsm_notify(sb, /*val*/1);
-
-	return err;
-}
-
-void au_fhsm_wrote(struct super_block *sb, aufs_bindex_t bindex, int force)
-{
-	int err;
-	struct au_sbinfo *sbinfo;
-	struct au_fhsm *fhsm;
-	struct au_branch *br;
-	struct au_br_fhsm *bf;
-
-	AuDbg("b%d, force %d\n", bindex, force);
-	SiMustAnyLock(sb);
-
-	sbinfo = au_sbi(sb);
-	fhsm = &sbinfo->si_fhsm;
-	if (!au_ftest_si(sbinfo, FHSM)
-	    || fhsm->fhsm_bottom == bindex)
-		return;
-
-	br = au_sbr(sb, bindex);
-	bf = br->br_fhsm;
-	AuDebugOn(!bf);
-	mutex_lock(&bf->bf_lock);
-	if (force
-	    || au_fhsm_pid(fhsm)
-	    || au_fhsm_test_jiffy(sbinfo, br))
-		err = au_fhsm_stfs(sb, bindex, /*rstfs*/NULL, /*do_lock*/0,
-				  /*do_notify*/1);
-	mutex_unlock(&bf->bf_lock);
-}
-
-void au_fhsm_wrote_all(struct super_block *sb, int force)
-{
-	aufs_bindex_t bindex, bend;
-	struct au_branch *br;
-
-	/* exclude the bottom */
-	bend = au_fhsm_bottom(sb);
-	for (bindex = 0; bindex < bend; bindex++) {
-		br = au_sbr(sb, bindex);
-		if (au_br_fhsm(br->br_perm))
-			au_fhsm_wrote(sb, bindex, force);
-	}
-}
-
-/* ---------------------------------------------------------------------- */
-
-static unsigned int au_fhsm_poll(struct file *file,
-				 struct poll_table_struct *wait)
-{
-	unsigned int mask;
-	struct au_sbinfo *sbinfo;
-	struct au_fhsm *fhsm;
-
-	mask = 0;
-	sbinfo = file->private_data;
-	fhsm = &sbinfo->si_fhsm;
-	poll_wait(file, &fhsm->fhsm_wqh, wait);
-	if (atomic_read(&fhsm->fhsm_readable))
-		mask = POLLIN /* | POLLRDNORM */;
-
-	AuTraceErr((int)mask);
-	return mask;
-}
-
-static int au_fhsm_do_read_one(struct aufs_stbr __user *stbr,
-			      struct aufs_stfs *stfs, __s16 brid)
-{
-	int err;
-
-	err = copy_to_user(&stbr->stfs, stfs, sizeof(*stfs));
-	if (!err)
-		err = __put_user(brid, &stbr->brid);
-	if (unlikely(err))
-		err = -EFAULT;
-
-	return err;
-}
-
-static ssize_t au_fhsm_do_read(struct super_block *sb,
-			       struct aufs_stbr __user *stbr, size_t count)
-{
-	ssize_t err;
-	int nstbr;
-	aufs_bindex_t bindex, bend;
-	struct au_branch *br;
-	struct au_br_fhsm *bf;
-
-	/* except the bottom branch */
-	err = 0;
-	nstbr = 0;
-	bend = au_fhsm_bottom(sb);
-	for (bindex = 0; !err && bindex < bend; bindex++) {
-		br = au_sbr(sb, bindex);
-		if (!au_br_fhsm(br->br_perm))
-			continue;
-
-		bf = br->br_fhsm;
-		mutex_lock(&bf->bf_lock);
-		if (bf->bf_readable) {
-			err = -EFAULT;
-			if (count >= sizeof(*stbr))
-				err = au_fhsm_do_read_one(stbr++, &bf->bf_stfs,
-							  br->br_id);
-			if (!err) {
-				bf->bf_readable = 0;
-				count -= sizeof(*stbr);
-				nstbr++;
-			}
-		}
-		mutex_unlock(&bf->bf_lock);
-	}
-	if (!err)
-		err = sizeof(*stbr) * nstbr;
-
-	return err;
-}
-
-static ssize_t au_fhsm_read(struct file *file, char __user *buf, size_t count,
-			   loff_t *pos)
-{
-	ssize_t err;
-	int readable;
-	aufs_bindex_t nfhsm, bindex, bend;
-	struct au_sbinfo *sbinfo;
-	struct au_fhsm *fhsm;
-	struct au_branch *br;
-	struct super_block *sb;
-
-	err = 0;
-	sbinfo = file->private_data;
-	fhsm = &sbinfo->si_fhsm;
-need_data:
-	spin_lock_irq(&fhsm->fhsm_wqh.lock);
-	if (!atomic_read(&fhsm->fhsm_readable)) {
-		if (vfsub_file_flags(file) & O_NONBLOCK)
-			err = -EAGAIN;
-		else
-			err = wait_event_interruptible_locked_irq
-				(fhsm->fhsm_wqh,
-				 atomic_read(&fhsm->fhsm_readable));
-	}
-	spin_unlock_irq(&fhsm->fhsm_wqh.lock);
-	if (unlikely(err))
-		goto out;
-
-	/* sb may already be dead */
-	au_rw_read_lock(&sbinfo->si_rwsem);
-	readable = atomic_read(&fhsm->fhsm_readable);
-	if (readable > 0) {
-		sb = sbinfo->si_sb;
-		AuDebugOn(!sb);
-		/* exclude the bottom branch */
-		nfhsm = 0;
-		bend = au_fhsm_bottom(sb);
-		for (bindex = 0; bindex < bend; bindex++) {
-			br = au_sbr(sb, bindex);
-			if (au_br_fhsm(br->br_perm))
-				nfhsm++;
-		}
-		err = -EMSGSIZE;
-		if (nfhsm * sizeof(struct aufs_stbr) <= count) {
-			atomic_set(&fhsm->fhsm_readable, 0);
-			err = au_fhsm_do_read(sbinfo->si_sb, (void __user *)buf,
-					     count);
-		}
-	}
-	au_rw_read_unlock(&sbinfo->si_rwsem);
-	if (!readable)
-		goto need_data;
-
-out:
-	return err;
-}
-
-static int au_fhsm_release(struct inode *inode, struct file *file)
-{
-	struct au_sbinfo *sbinfo;
-	struct au_fhsm *fhsm;
-
-	/* sb may already be dead */
-	sbinfo = file->private_data;
-	fhsm = &sbinfo->si_fhsm;
-	spin_lock(&fhsm->fhsm_spin);
-	fhsm->fhsm_pid = 0;
-	spin_unlock(&fhsm->fhsm_spin);
-	kobject_put(&sbinfo->si_kobj);
-
-	return 0;
-}
-
-static const struct file_operations au_fhsm_fops = {
-	.owner		= THIS_MODULE,
-	.llseek		= noop_llseek,
-	.read		= au_fhsm_read,
-	.poll		= au_fhsm_poll,
-	.release	= au_fhsm_release
-};
-
-int au_fhsm_fd(struct super_block *sb, int oflags)
-{
-	int err, fd;
-	struct au_sbinfo *sbinfo;
-	struct au_fhsm *fhsm;
-
-	err = -EPERM;
-	if (unlikely(!capable(CAP_SYS_ADMIN)))
-		goto out;
-
-	err = -EINVAL;
-	if (unlikely(oflags & ~(O_CLOEXEC | O_NONBLOCK)))
-		goto out;
-
-	err = 0;
-	sbinfo = au_sbi(sb);
-	fhsm = &sbinfo->si_fhsm;
-	spin_lock(&fhsm->fhsm_spin);
-	if (!fhsm->fhsm_pid)
-		fhsm->fhsm_pid = current->pid;
-	else
-		err = -EBUSY;
-	spin_unlock(&fhsm->fhsm_spin);
-	if (unlikely(err))
-		goto out;
-
-	oflags |= O_RDONLY;
-	/* oflags |= FMODE_NONOTIFY; */
-	fd = anon_inode_getfd("[aufs_fhsm]", &au_fhsm_fops, sbinfo, oflags);
-	err = fd;
-	if (unlikely(fd < 0))
-		goto out_pid;
-
-	/* succeed reglardless 'fhsm' status */
-	kobject_get(&sbinfo->si_kobj);
-	si_noflush_read_lock(sb);
-	if (au_ftest_si(sbinfo, FHSM))
-		au_fhsm_wrote_all(sb, /*force*/0);
-	si_read_unlock(sb);
-	goto out; /* success */
-
-out_pid:
-	spin_lock(&fhsm->fhsm_spin);
-	fhsm->fhsm_pid = 0;
-	spin_unlock(&fhsm->fhsm_spin);
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int au_fhsm_br_alloc(struct au_branch *br)
-{
-	int err;
-
-	err = 0;
-	br->br_fhsm = kmalloc(sizeof(*br->br_fhsm), GFP_NOFS);
-	if (br->br_fhsm)
-		au_br_fhsm_init(br->br_fhsm);
-	else
-		err = -ENOMEM;
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void au_fhsm_fin(struct super_block *sb)
-{
-	au_fhsm_notify(sb, /*val*/-1);
-}
-
-void au_fhsm_init(struct au_sbinfo *sbinfo)
-{
-	struct au_fhsm *fhsm;
-
-	fhsm = &sbinfo->si_fhsm;
-	spin_lock_init(&fhsm->fhsm_spin);
-	init_waitqueue_head(&fhsm->fhsm_wqh);
-	atomic_set(&fhsm->fhsm_readable, 0);
-	fhsm->fhsm_expire
-		= msecs_to_jiffies(AUFS_FHSM_CACHE_DEF_SEC * MSEC_PER_SEC);
-	fhsm->fhsm_bottom = -1;
-}
-
-void au_fhsm_set(struct au_sbinfo *sbinfo, unsigned int sec)
-{
-	sbinfo->si_fhsm.fhsm_expire
-		= msecs_to_jiffies(sec * MSEC_PER_SEC);
-}
-
-void au_fhsm_show(struct seq_file *seq, struct au_sbinfo *sbinfo)
-{
-	unsigned int u;
-
-	if (!au_ftest_si(sbinfo, FHSM))
-		return;
-
-	u = jiffies_to_msecs(sbinfo->si_fhsm.fhsm_expire) / MSEC_PER_SEC;
-	if (u != AUFS_FHSM_CACHE_DEF_SEC)
-		seq_printf(seq, ",fhsm_sec=%u", u);
-}
diff --git a/fs/aufs/file.c b/fs/aufs/file.c
deleted file mode 100644
index 6b8a66b4a..000000000
--- a/fs/aufs/file.c
+++ /dev/null
@@ -1,831 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * handling file/dir, and address_space operation
- */
-
-#ifdef CONFIG_AUFS_DEBUG
-#include <linux/migrate.h>
-#endif
-#include <linux/pagemap.h>
-#include "aufs.h"
-
-/* drop flags for writing */
-unsigned int au_file_roflags(unsigned int flags)
-{
-	flags &= ~(O_WRONLY | O_RDWR | O_APPEND | O_CREAT | O_TRUNC);
-	flags |= O_RDONLY | O_NOATIME;
-	return flags;
-}
-
-/* common functions to regular file and dir */
-struct file *au_h_open(struct dentry *dentry, aufs_bindex_t bindex, int flags,
-		       struct file *file, int force_wr)
-{
-	struct file *h_file;
-	struct dentry *h_dentry;
-	struct inode *h_inode;
-	struct super_block *sb;
-	struct au_branch *br;
-	struct path h_path;
-	int err;
-
-	/* a race condition can happen between open and unlink/rmdir */
-	h_file = ERR_PTR(-ENOENT);
-	h_dentry = au_h_dptr(dentry, bindex);
-	if (au_test_nfsd() && (!h_dentry || d_is_negative(h_dentry)))
-		goto out;
-	h_inode = d_inode(h_dentry);
-	spin_lock(&h_dentry->d_lock);
-	err = (!d_unhashed(dentry) && d_unlinked(h_dentry))
-		/* || !d_inode(dentry)->i_nlink */
-		;
-	spin_unlock(&h_dentry->d_lock);
-	if (unlikely(err))
-		goto out;
-
-	sb = dentry->d_sb;
-	br = au_sbr(sb, bindex);
-	err = au_br_test_oflag(flags, br);
-	h_file = ERR_PTR(err);
-	if (unlikely(err))
-		goto out;
-
-	/* drop flags for writing */
-	if (au_test_ro(sb, bindex, d_inode(dentry))) {
-		if (force_wr && !(flags & O_WRONLY))
-			force_wr = 0;
-		flags = au_file_roflags(flags);
-		if (force_wr) {
-			h_file = ERR_PTR(-EROFS);
-			flags = au_file_roflags(flags);
-			if (unlikely(vfsub_native_ro(h_inode)
-				     || IS_APPEND(h_inode)))
-				goto out;
-			flags &= ~O_ACCMODE;
-			flags |= O_WRONLY;
-		}
-	}
-	flags &= ~O_CREAT;
-	atomic_inc(&br->br_count);
-	h_path.dentry = h_dentry;
-	h_path.mnt = au_br_mnt(br);
-	h_file = vfsub_dentry_open(&h_path, flags);
-	if (IS_ERR(h_file))
-		goto out_br;
-
-	if (flags & __FMODE_EXEC) {
-		err = deny_write_access(h_file);
-		if (unlikely(err)) {
-			fput(h_file);
-			h_file = ERR_PTR(err);
-			goto out_br;
-		}
-	}
-	fsnotify_open(h_file);
-	goto out; /* success */
-
-out_br:
-	atomic_dec(&br->br_count);
-out:
-	return h_file;
-}
-
-static int au_cmoo(struct dentry *dentry)
-{
-	int err, cmoo;
-	unsigned int udba;
-	struct path h_path;
-	struct au_pin pin;
-	struct au_cp_generic cpg = {
-		.dentry	= dentry,
-		.bdst	= -1,
-		.bsrc	= -1,
-		.len	= -1,
-		.pin	= &pin,
-		.flags	= AuCpup_DTIME | AuCpup_HOPEN
-	};
-	struct inode *delegated;
-	struct super_block *sb;
-	struct au_sbinfo *sbinfo;
-	struct au_fhsm *fhsm;
-	pid_t pid;
-	struct au_branch *br;
-	struct dentry *parent;
-	struct au_hinode *hdir;
-
-	DiMustWriteLock(dentry);
-	IiMustWriteLock(d_inode(dentry));
-
-	err = 0;
-	if (IS_ROOT(dentry))
-		goto out;
-	cpg.bsrc = au_dbstart(dentry);
-	if (!cpg.bsrc)
-		goto out;
-
-	sb = dentry->d_sb;
-	sbinfo = au_sbi(sb);
-	fhsm = &sbinfo->si_fhsm;
-	pid = au_fhsm_pid(fhsm);
-	if (pid
-	    && (current->pid == pid
-		|| current->real_parent->pid == pid))
-		goto out;
-
-	br = au_sbr(sb, cpg.bsrc);
-	cmoo = au_br_cmoo(br->br_perm);
-	if (!cmoo)
-		goto out;
-	if (!d_is_reg(dentry))
-		cmoo &= AuBrAttr_COO_ALL;
-	if (!cmoo)
-		goto out;
-
-	parent = dget_parent(dentry);
-	di_write_lock_parent(parent);
-	err = au_wbr_do_copyup_bu(dentry, cpg.bsrc - 1);
-	cpg.bdst = err;
-	if (unlikely(err < 0)) {
-		err = 0;	/* there is no upper writable branch */
-		goto out_dgrade;
-	}
-	AuDbg("bsrc %d, bdst %d\n", cpg.bsrc, cpg.bdst);
-
-	/* do not respect the coo attrib for the target branch */
-	err = au_cpup_dirs(dentry, cpg.bdst);
-	if (unlikely(err))
-		goto out_dgrade;
-
-	di_downgrade_lock(parent, AuLock_IR);
-	udba = au_opt_udba(sb);
-	err = au_pin(&pin, dentry, cpg.bdst, udba,
-		     AuPin_DI_LOCKED | AuPin_MNT_WRITE);
-	if (unlikely(err))
-		goto out_parent;
-
-	err = au_sio_cpup_simple(&cpg);
-	au_unpin(&pin);
-	if (unlikely(err))
-		goto out_parent;
-	if (!(cmoo & AuBrWAttr_MOO))
-		goto out_parent; /* success */
-
-	err = au_pin(&pin, dentry, cpg.bsrc, udba,
-		     AuPin_DI_LOCKED | AuPin_MNT_WRITE);
-	if (unlikely(err))
-		goto out_parent;
-
-	h_path.mnt = au_br_mnt(br);
-	h_path.dentry = au_h_dptr(dentry, cpg.bsrc);
-	hdir = au_hi(d_inode(parent), cpg.bsrc);
-	delegated = NULL;
-	err = vfsub_unlink(hdir->hi_inode, &h_path, &delegated, /*force*/1);
-	au_unpin(&pin);
-	/* todo: keep h_dentry or not? */
-	if (unlikely(err == -EWOULDBLOCK)) {
-		pr_warn("cannot retry for NFSv4 delegation"
-			" for an internal unlink\n");
-		iput(delegated);
-	}
-	if (unlikely(err)) {
-		pr_err("unlink %pd after coo failed (%d), ignored\n",
-		       dentry, err);
-		err = 0;
-	}
-	goto out_parent; /* success */
-
-out_dgrade:
-	di_downgrade_lock(parent, AuLock_IR);
-out_parent:
-	di_read_unlock(parent, AuLock_IR);
-	dput(parent);
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-int au_do_open(struct file *file, struct au_do_open_args *args)
-{
-	int err, no_lock = args->no_lock;
-	struct dentry *dentry;
-	struct au_finfo *finfo;
-
-	if (!no_lock)
-		err = au_finfo_init(file, args->fidir);
-	else {
-		lockdep_off();
-		err = au_finfo_init(file, args->fidir);
-		lockdep_on();
-	}
-	if (unlikely(err))
-		goto out;
-
-	dentry = file->f_path.dentry;
-	AuDebugOn(IS_ERR_OR_NULL(dentry));
-	if (!no_lock) {
-		di_write_lock_child(dentry);
-		err = au_cmoo(dentry);
-		di_downgrade_lock(dentry, AuLock_IR);
-		if (!err)
-			err = args->open(file, vfsub_file_flags(file), NULL);
-		di_read_unlock(dentry, AuLock_IR);
-	} else {
-		err = au_cmoo(dentry);
-		if (!err)
-			err = args->open(file, vfsub_file_flags(file),
-					 args->h_file);
-		if (!err && au_fbstart(file) != au_dbstart(dentry))
-			/*
-			 * cmoo happens after h_file was opened.
-			 * need to refresh file later.
-			 */
-			atomic_dec(&au_fi(file)->fi_generation);
-	}
-
-	finfo = au_fi(file);
-	if (!err) {
-		finfo->fi_file = file;
-		au_sphl_add(&finfo->fi_hlist,
-			    &au_sbi(file->f_path.dentry->d_sb)->si_files);
-	}
-	if (!no_lock)
-		fi_write_unlock(file);
-	else {
-		lockdep_off();
-		fi_write_unlock(file);
-		lockdep_on();
-	}
-	if (unlikely(err)) {
-		finfo->fi_hdir = NULL;
-		au_finfo_fin(file);
-	}
-
-out:
-	return err;
-}
-
-int au_reopen_nondir(struct file *file)
-{
-	int err;
-	aufs_bindex_t bstart;
-	struct dentry *dentry;
-	struct file *h_file, *h_file_tmp;
-
-	dentry = file->f_path.dentry;
-	bstart = au_dbstart(dentry);
-	h_file_tmp = NULL;
-	if (au_fbstart(file) == bstart) {
-		h_file = au_hf_top(file);
-		if (file->f_mode == h_file->f_mode)
-			return 0; /* success */
-		h_file_tmp = h_file;
-		get_file(h_file_tmp);
-		au_set_h_fptr(file, bstart, NULL);
-	}
-	AuDebugOn(au_fi(file)->fi_hdir);
-	/*
-	 * it can happen
-	 * file exists on both of rw and ro
-	 * open --> dbstart and fbstart are both 0
-	 * prepend a branch as rw, "rw" become ro
-	 * remove rw/file
-	 * delete the top branch, "rw" becomes rw again
-	 *	--> dbstart is 1, fbstart is still 0
-	 * write --> fbstart is 0 but dbstart is 1
-	 */
-	/* AuDebugOn(au_fbstart(file) < bstart); */
-
-	h_file = au_h_open(dentry, bstart, vfsub_file_flags(file) & ~O_TRUNC,
-			   file, /*force_wr*/0);
-	err = PTR_ERR(h_file);
-	if (IS_ERR(h_file)) {
-		if (h_file_tmp) {
-			atomic_inc(&au_sbr(dentry->d_sb, bstart)->br_count);
-			au_set_h_fptr(file, bstart, h_file_tmp);
-			h_file_tmp = NULL;
-		}
-		goto out; /* todo: close all? */
-	}
-
-	err = 0;
-	au_set_fbstart(file, bstart);
-	au_set_h_fptr(file, bstart, h_file);
-	au_update_figen(file);
-	/* todo: necessary? */
-	/* file->f_ra = h_file->f_ra; */
-
-out:
-	if (h_file_tmp)
-		fput(h_file_tmp);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static int au_reopen_wh(struct file *file, aufs_bindex_t btgt,
-			struct dentry *hi_wh)
-{
-	int err;
-	aufs_bindex_t bstart;
-	struct au_dinfo *dinfo;
-	struct dentry *h_dentry;
-	struct au_hdentry *hdp;
-
-	dinfo = au_di(file->f_path.dentry);
-	AuRwMustWriteLock(&dinfo->di_rwsem);
-
-	bstart = dinfo->di_bstart;
-	dinfo->di_bstart = btgt;
-	hdp = dinfo->di_hdentry;
-	h_dentry = hdp[0 + btgt].hd_dentry;
-	hdp[0 + btgt].hd_dentry = hi_wh;
-	err = au_reopen_nondir(file);
-	hdp[0 + btgt].hd_dentry = h_dentry;
-	dinfo->di_bstart = bstart;
-
-	return err;
-}
-
-static int au_ready_to_write_wh(struct file *file, loff_t len,
-				aufs_bindex_t bcpup, struct au_pin *pin)
-{
-	int err;
-	struct inode *inode, *h_inode;
-	struct dentry *h_dentry, *hi_wh;
-	struct au_cp_generic cpg = {
-		.dentry	= file->f_path.dentry,
-		.bdst	= bcpup,
-		.bsrc	= -1,
-		.len	= len,
-		.pin	= pin
-	};
-
-	au_update_dbstart(cpg.dentry);
-	inode = d_inode(cpg.dentry);
-	h_inode = NULL;
-	if (au_dbstart(cpg.dentry) <= bcpup
-	    && au_dbend(cpg.dentry) >= bcpup) {
-		h_dentry = au_h_dptr(cpg.dentry, bcpup);
-		if (h_dentry && d_is_positive(h_dentry))
-			h_inode = d_inode(h_dentry);
-	}
-	hi_wh = au_hi_wh(inode, bcpup);
-	if (!hi_wh && !h_inode)
-		err = au_sio_cpup_wh(&cpg, file);
-	else
-		/* already copied-up after unlink */
-		err = au_reopen_wh(file, bcpup, hi_wh);
-
-	if (!err
-	    && (inode->i_nlink > 1
-		|| (inode->i_state & I_LINKABLE))
-	    && au_opt_test(au_mntflags(cpg.dentry->d_sb), PLINK))
-		au_plink_append(inode, bcpup, au_h_dptr(cpg.dentry, bcpup));
-
-	return err;
-}
-
-/*
- * prepare the @file for writing.
- */
-int au_ready_to_write(struct file *file, loff_t len, struct au_pin *pin)
-{
-	int err;
-	aufs_bindex_t dbstart;
-	struct dentry *parent;
-	struct inode *inode;
-	struct super_block *sb;
-	struct file *h_file;
-	struct au_cp_generic cpg = {
-		.dentry	= file->f_path.dentry,
-		.bdst	= -1,
-		.bsrc	= -1,
-		.len	= len,
-		.pin	= pin,
-		.flags	= AuCpup_DTIME
-	};
-
-	sb = cpg.dentry->d_sb;
-	inode = d_inode(cpg.dentry);
-	cpg.bsrc = au_fbstart(file);
-	err = au_test_ro(sb, cpg.bsrc, inode);
-	if (!err && (au_hf_top(file)->f_mode & FMODE_WRITE)) {
-		err = au_pin(pin, cpg.dentry, cpg.bsrc, AuOpt_UDBA_NONE,
-			     /*flags*/0);
-		goto out;
-	}
-
-	/* need to cpup or reopen */
-	parent = dget_parent(cpg.dentry);
-	di_write_lock_parent(parent);
-	err = AuWbrCopyup(au_sbi(sb), cpg.dentry);
-	cpg.bdst = err;
-	if (unlikely(err < 0))
-		goto out_dgrade;
-	err = 0;
-
-	if (!d_unhashed(cpg.dentry) && !au_h_dptr(parent, cpg.bdst)) {
-		err = au_cpup_dirs(cpg.dentry, cpg.bdst);
-		if (unlikely(err))
-			goto out_dgrade;
-	}
-
-	err = au_pin(pin, cpg.dentry, cpg.bdst, AuOpt_UDBA_NONE,
-		     AuPin_DI_LOCKED | AuPin_MNT_WRITE);
-	if (unlikely(err))
-		goto out_dgrade;
-
-	dbstart = au_dbstart(cpg.dentry);
-	if (dbstart <= cpg.bdst)
-		cpg.bsrc = cpg.bdst;
-
-	if (dbstart <= cpg.bdst		/* just reopen */
-	    || !d_unhashed(cpg.dentry)	/* copyup and reopen */
-		) {
-		h_file = au_h_open_pre(cpg.dentry, cpg.bsrc, /*force_wr*/0);
-		if (IS_ERR(h_file))
-			err = PTR_ERR(h_file);
-		else {
-			di_downgrade_lock(parent, AuLock_IR);
-			if (dbstart > cpg.bdst)
-				err = au_sio_cpup_simple(&cpg);
-			if (!err)
-				err = au_reopen_nondir(file);
-			au_h_open_post(cpg.dentry, cpg.bsrc, h_file);
-		}
-	} else {			/* copyup as wh and reopen */
-		/*
-		 * since writable hfsplus branch is not supported,
-		 * h_open_pre/post() are unnecessary.
-		 */
-		err = au_ready_to_write_wh(file, len, cpg.bdst, pin);
-		di_downgrade_lock(parent, AuLock_IR);
-	}
-
-	if (!err) {
-		au_pin_set_parent_lflag(pin, /*lflag*/0);
-		goto out_dput; /* success */
-	}
-	au_unpin(pin);
-	goto out_unlock;
-
-out_dgrade:
-	di_downgrade_lock(parent, AuLock_IR);
-out_unlock:
-	di_read_unlock(parent, AuLock_IR);
-out_dput:
-	dput(parent);
-out:
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int au_do_flush(struct file *file, fl_owner_t id,
-		int (*flush)(struct file *file, fl_owner_t id))
-{
-	int err;
-	struct super_block *sb;
-	struct inode *inode;
-
-	inode = file_inode(file);
-	sb = inode->i_sb;
-	si_noflush_read_lock(sb);
-	fi_read_lock(file);
-	ii_read_lock_child(inode);
-
-	err = flush(file, id);
-	au_cpup_attr_timesizes(inode);
-
-	ii_read_unlock(inode);
-	fi_read_unlock(file);
-	si_read_unlock(sb);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static int au_file_refresh_by_inode(struct file *file, int *need_reopen)
-{
-	int err;
-	struct au_pin pin;
-	struct au_finfo *finfo;
-	struct dentry *parent, *hi_wh;
-	struct inode *inode;
-	struct super_block *sb;
-	struct au_cp_generic cpg = {
-		.dentry	= file->f_path.dentry,
-		.bdst	= -1,
-		.bsrc	= -1,
-		.len	= -1,
-		.pin	= &pin,
-		.flags	= AuCpup_DTIME
-	};
-
-	FiMustWriteLock(file);
-
-	err = 0;
-	finfo = au_fi(file);
-	sb = cpg.dentry->d_sb;
-	inode = d_inode(cpg.dentry);
-	cpg.bdst = au_ibstart(inode);
-	if (cpg.bdst == finfo->fi_btop || IS_ROOT(cpg.dentry))
-		goto out;
-
-	parent = dget_parent(cpg.dentry);
-	if (au_test_ro(sb, cpg.bdst, inode)) {
-		di_read_lock_parent(parent, !AuLock_IR);
-		err = AuWbrCopyup(au_sbi(sb), cpg.dentry);
-		cpg.bdst = err;
-		di_read_unlock(parent, !AuLock_IR);
-		if (unlikely(err < 0))
-			goto out_parent;
-		err = 0;
-	}
-
-	di_read_lock_parent(parent, AuLock_IR);
-	hi_wh = au_hi_wh(inode, cpg.bdst);
-	if (!S_ISDIR(inode->i_mode)
-	    && au_opt_test(au_mntflags(sb), PLINK)
-	    && au_plink_test(inode)
-	    && !d_unhashed(cpg.dentry)
-	    && cpg.bdst < au_dbstart(cpg.dentry)) {
-		err = au_test_and_cpup_dirs(cpg.dentry, cpg.bdst);
-		if (unlikely(err))
-			goto out_unlock;
-
-		/* always superio. */
-		err = au_pin(&pin, cpg.dentry, cpg.bdst, AuOpt_UDBA_NONE,
-			     AuPin_DI_LOCKED | AuPin_MNT_WRITE);
-		if (!err) {
-			err = au_sio_cpup_simple(&cpg);
-			au_unpin(&pin);
-		}
-	} else if (hi_wh) {
-		/* already copied-up after unlink */
-		err = au_reopen_wh(file, cpg.bdst, hi_wh);
-		*need_reopen = 0;
-	}
-
-out_unlock:
-	di_read_unlock(parent, AuLock_IR);
-out_parent:
-	dput(parent);
-out:
-	return err;
-}
-
-static void au_do_refresh_dir(struct file *file)
-{
-	aufs_bindex_t bindex, bend, new_bindex, brid;
-	struct au_hfile *p, tmp, *q;
-	struct au_finfo *finfo;
-	struct super_block *sb;
-	struct au_fidir *fidir;
-
-	FiMustWriteLock(file);
-
-	sb = file->f_path.dentry->d_sb;
-	finfo = au_fi(file);
-	fidir = finfo->fi_hdir;
-	AuDebugOn(!fidir);
-	p = fidir->fd_hfile + finfo->fi_btop;
-	brid = p->hf_br->br_id;
-	bend = fidir->fd_bbot;
-	for (bindex = finfo->fi_btop; bindex <= bend; bindex++, p++) {
-		if (!p->hf_file)
-			continue;
-
-		new_bindex = au_br_index(sb, p->hf_br->br_id);
-		if (new_bindex == bindex)
-			continue;
-		if (new_bindex < 0) {
-			au_set_h_fptr(file, bindex, NULL);
-			continue;
-		}
-
-		/* swap two lower inode, and loop again */
-		q = fidir->fd_hfile + new_bindex;
-		tmp = *q;
-		*q = *p;
-		*p = tmp;
-		if (tmp.hf_file) {
-			bindex--;
-			p--;
-		}
-	}
-
-	p = fidir->fd_hfile;
-	if (!au_test_mmapped(file) && !d_unlinked(file->f_path.dentry)) {
-		bend = au_sbend(sb);
-		for (finfo->fi_btop = 0; finfo->fi_btop <= bend;
-		     finfo->fi_btop++, p++)
-			if (p->hf_file) {
-				if (file_inode(p->hf_file))
-					break;
-				au_hfput(p, file);
-			}
-	} else {
-		bend = au_br_index(sb, brid);
-		for (finfo->fi_btop = 0; finfo->fi_btop < bend;
-		     finfo->fi_btop++, p++)
-			if (p->hf_file)
-				au_hfput(p, file);
-		bend = au_sbend(sb);
-	}
-
-	p = fidir->fd_hfile + bend;
-	for (fidir->fd_bbot = bend; fidir->fd_bbot >= finfo->fi_btop;
-	     fidir->fd_bbot--, p--)
-		if (p->hf_file) {
-			if (file_inode(p->hf_file))
-				break;
-			au_hfput(p, file);
-		}
-	AuDebugOn(fidir->fd_bbot < finfo->fi_btop);
-}
-
-/*
- * after branch manipulating, refresh the file.
- */
-static int refresh_file(struct file *file, int (*reopen)(struct file *file))
-{
-	int err, need_reopen;
-	aufs_bindex_t bend, bindex;
-	struct dentry *dentry;
-	struct au_finfo *finfo;
-	struct au_hfile *hfile;
-
-	dentry = file->f_path.dentry;
-	finfo = au_fi(file);
-	if (!finfo->fi_hdir) {
-		hfile = &finfo->fi_htop;
-		AuDebugOn(!hfile->hf_file);
-		bindex = au_br_index(dentry->d_sb, hfile->hf_br->br_id);
-		AuDebugOn(bindex < 0);
-		if (bindex != finfo->fi_btop)
-			au_set_fbstart(file, bindex);
-	} else {
-		err = au_fidir_realloc(finfo, au_sbend(dentry->d_sb) + 1);
-		if (unlikely(err))
-			goto out;
-		au_do_refresh_dir(file);
-	}
-
-	err = 0;
-	need_reopen = 1;
-	if (!au_test_mmapped(file))
-		err = au_file_refresh_by_inode(file, &need_reopen);
-	if (!err && need_reopen && !d_unlinked(dentry))
-		err = reopen(file);
-	if (!err) {
-		au_update_figen(file);
-		goto out; /* success */
-	}
-
-	/* error, close all lower files */
-	if (finfo->fi_hdir) {
-		bend = au_fbend_dir(file);
-		for (bindex = au_fbstart(file); bindex <= bend; bindex++)
-			au_set_h_fptr(file, bindex, NULL);
-	}
-
-out:
-	return err;
-}
-
-/* common function to regular file and dir */
-int au_reval_and_lock_fdi(struct file *file, int (*reopen)(struct file *file),
-			  int wlock)
-{
-	int err;
-	unsigned int sigen, figen;
-	aufs_bindex_t bstart;
-	unsigned char pseudo_link;
-	struct dentry *dentry;
-	struct inode *inode;
-
-	err = 0;
-	dentry = file->f_path.dentry;
-	inode = d_inode(dentry);
-	sigen = au_sigen(dentry->d_sb);
-	fi_write_lock(file);
-	figen = au_figen(file);
-	di_write_lock_child(dentry);
-	bstart = au_dbstart(dentry);
-	pseudo_link = (bstart != au_ibstart(inode));
-	if (sigen == figen && !pseudo_link && au_fbstart(file) == bstart) {
-		if (!wlock) {
-			di_downgrade_lock(dentry, AuLock_IR);
-			fi_downgrade_lock(file);
-		}
-		goto out; /* success */
-	}
-
-	AuDbg("sigen %d, figen %d\n", sigen, figen);
-	if (au_digen_test(dentry, sigen)) {
-		err = au_reval_dpath(dentry, sigen);
-		AuDebugOn(!err && au_digen_test(dentry, sigen));
-	}
-
-	if (!err)
-		err = refresh_file(file, reopen);
-	if (!err) {
-		if (!wlock) {
-			di_downgrade_lock(dentry, AuLock_IR);
-			fi_downgrade_lock(file);
-		}
-	} else {
-		di_write_unlock(dentry);
-		fi_write_unlock(file);
-	}
-
-out:
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* cf. aufs_nopage() */
-/* for madvise(2) */
-static int aufs_readpage(struct file *file __maybe_unused, struct page *page)
-{
-	unlock_page(page);
-	return 0;
-}
-
-/* it will never be called, but necessary to support O_DIRECT */
-static ssize_t aufs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
-			      loff_t offset)
-{ BUG(); return 0; }
-
-/* they will never be called. */
-#ifdef CONFIG_AUFS_DEBUG
-static int aufs_write_begin(struct file *file, struct address_space *mapping,
-			    loff_t pos, unsigned len, unsigned flags,
-			    struct page **pagep, void **fsdata)
-{ AuUnsupport(); return 0; }
-static int aufs_write_end(struct file *file, struct address_space *mapping,
-			  loff_t pos, unsigned len, unsigned copied,
-			  struct page *page, void *fsdata)
-{ AuUnsupport(); return 0; }
-static int aufs_writepage(struct page *page, struct writeback_control *wbc)
-{ AuUnsupport(); return 0; }
-
-static int aufs_set_page_dirty(struct page *page)
-{ AuUnsupport(); return 0; }
-static void aufs_invalidatepage(struct page *page, unsigned int offset,
-				unsigned int length)
-{ AuUnsupport(); }
-static int aufs_releasepage(struct page *page, gfp_t gfp)
-{ AuUnsupport(); return 0; }
-#if 0 /* called by memory compaction regardless file */
-static int aufs_migratepage(struct address_space *mapping, struct page *newpage,
-			    struct page *page, enum migrate_mode mode)
-{ AuUnsupport(); return 0; }
-#endif
-static int aufs_launder_page(struct page *page)
-{ AuUnsupport(); return 0; }
-static int aufs_is_partially_uptodate(struct page *page,
-				      unsigned long from,
-				      unsigned long count)
-{ AuUnsupport(); return 0; }
-static void aufs_is_dirty_writeback(struct page *page, bool *dirty,
-				    bool *writeback)
-{ AuUnsupport(); }
-static int aufs_error_remove_page(struct address_space *mapping,
-				  struct page *page)
-{ AuUnsupport(); return 0; }
-static int aufs_swap_activate(struct swap_info_struct *sis, struct file *file,
-			      sector_t *span)
-{ AuUnsupport(); return 0; }
-static void aufs_swap_deactivate(struct file *file)
-{ AuUnsupport(); }
-#endif /* CONFIG_AUFS_DEBUG */
-
-const struct address_space_operations aufs_aop = {
-	.readpage		= aufs_readpage,
-	.direct_IO		= aufs_direct_IO,
-#ifdef CONFIG_AUFS_DEBUG
-	.writepage		= aufs_writepage,
-	/* no writepages, because of writepage */
-	.set_page_dirty		= aufs_set_page_dirty,
-	/* no readpages, because of readpage */
-	.write_begin		= aufs_write_begin,
-	.write_end		= aufs_write_end,
-	/* no bmap, no block device */
-	.invalidatepage		= aufs_invalidatepage,
-	.releasepage		= aufs_releasepage,
-	/* is fallback_migrate_page ok? */
-	/* .migratepage		= aufs_migratepage, */
-	.launder_page		= aufs_launder_page,
-	.is_partially_uptodate	= aufs_is_partially_uptodate,
-	.is_dirty_writeback	= aufs_is_dirty_writeback,
-	.error_remove_page	= aufs_error_remove_page,
-	.swap_activate		= aufs_swap_activate,
-	.swap_deactivate	= aufs_swap_deactivate
-#endif /* CONFIG_AUFS_DEBUG */
-};
diff --git a/fs/aufs/file.h b/fs/aufs/file.h
deleted file mode 100644
index 27d802487..000000000
--- a/fs/aufs/file.h
+++ /dev/null
@@ -1,278 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * file operations
- */
-
-#ifndef __AUFS_FILE_H__
-#define __AUFS_FILE_H__
-
-#ifdef __KERNEL__
-
-#include <linux/file.h>
-#include <linux/fs.h>
-#include <linux/poll.h>
-#include "rwsem.h"
-
-struct au_branch;
-struct au_hfile {
-	struct file		*hf_file;
-	struct au_branch	*hf_br;
-};
-
-struct au_vdir;
-struct au_fidir {
-	aufs_bindex_t		fd_bbot;
-	aufs_bindex_t		fd_nent;
-	struct au_vdir		*fd_vdir_cache;
-	struct au_hfile		fd_hfile[];
-};
-
-static inline int au_fidir_sz(int nent)
-{
-	AuDebugOn(nent < 0);
-	return sizeof(struct au_fidir) + sizeof(struct au_hfile) * nent;
-}
-
-struct au_finfo {
-	atomic_t		fi_generation;
-
-	struct au_rwsem		fi_rwsem;
-	aufs_bindex_t		fi_btop;
-
-	/* do not union them */
-	struct {				/* for non-dir */
-		struct au_hfile			fi_htop;
-		atomic_t			fi_mmapped;
-	};
-	struct au_fidir		*fi_hdir;	/* for dir only */
-
-	struct hlist_node	fi_hlist;
-	struct file		*fi_file;	/* very ugly */
-} ____cacheline_aligned_in_smp;
-
-/* ---------------------------------------------------------------------- */
-
-/* file.c */
-extern const struct address_space_operations aufs_aop;
-unsigned int au_file_roflags(unsigned int flags);
-struct file *au_h_open(struct dentry *dentry, aufs_bindex_t bindex, int flags,
-		       struct file *file, int force_wr);
-struct au_do_open_args {
-	int		no_lock;
-	int		(*open)(struct file *file, int flags,
-				struct file *h_file);
-	struct au_fidir	*fidir;
-	struct file	*h_file;
-};
-int au_do_open(struct file *file, struct au_do_open_args *args);
-int au_reopen_nondir(struct file *file);
-struct au_pin;
-int au_ready_to_write(struct file *file, loff_t len, struct au_pin *pin);
-int au_reval_and_lock_fdi(struct file *file, int (*reopen)(struct file *file),
-			  int wlock);
-int au_do_flush(struct file *file, fl_owner_t id,
-		int (*flush)(struct file *file, fl_owner_t id));
-
-/* poll.c */
-#ifdef CONFIG_AUFS_POLL
-unsigned int aufs_poll(struct file *file, poll_table *wait);
-#endif
-
-#ifdef CONFIG_AUFS_BR_HFSPLUS
-/* hfsplus.c */
-struct file *au_h_open_pre(struct dentry *dentry, aufs_bindex_t bindex,
-			   int force_wr);
-void au_h_open_post(struct dentry *dentry, aufs_bindex_t bindex,
-		    struct file *h_file);
-#else
-AuStub(struct file *, au_h_open_pre, return NULL, struct dentry *dentry,
-       aufs_bindex_t bindex, int force_wr)
-AuStubVoid(au_h_open_post, struct dentry *dentry, aufs_bindex_t bindex,
-	   struct file *h_file);
-#endif
-
-/* f_op.c */
-extern const struct file_operations aufs_file_fop;
-int au_do_open_nondir(struct file *file, int flags, struct file *h_file);
-int aufs_release_nondir(struct inode *inode __maybe_unused, struct file *file);
-struct file *au_read_pre(struct file *file, int keep_fi);
-
-/* finfo.c */
-void au_hfput(struct au_hfile *hf, struct file *file);
-void au_set_h_fptr(struct file *file, aufs_bindex_t bindex,
-		   struct file *h_file);
-
-void au_update_figen(struct file *file);
-struct au_fidir *au_fidir_alloc(struct super_block *sb);
-int au_fidir_realloc(struct au_finfo *finfo, int nbr);
-
-void au_fi_init_once(void *_fi);
-void au_finfo_fin(struct file *file);
-int au_finfo_init(struct file *file, struct au_fidir *fidir);
-
-/* ioctl.c */
-long aufs_ioctl_nondir(struct file *file, unsigned int cmd, unsigned long arg);
-#ifdef CONFIG_COMPAT
-long aufs_compat_ioctl_dir(struct file *file, unsigned int cmd,
-			   unsigned long arg);
-long aufs_compat_ioctl_nondir(struct file *file, unsigned int cmd,
-			      unsigned long arg);
-#endif
-
-/* ---------------------------------------------------------------------- */
-
-static inline struct au_finfo *au_fi(struct file *file)
-{
-	return file->private_data;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * fi_read_lock, fi_write_lock,
- * fi_read_unlock, fi_write_unlock, fi_downgrade_lock
- */
-AuSimpleRwsemFuncs(fi, struct file *f, &au_fi(f)->fi_rwsem);
-
-#define FiMustNoWaiters(f)	AuRwMustNoWaiters(&au_fi(f)->fi_rwsem)
-#define FiMustAnyLock(f)	AuRwMustAnyLock(&au_fi(f)->fi_rwsem)
-#define FiMustWriteLock(f)	AuRwMustWriteLock(&au_fi(f)->fi_rwsem)
-
-/* ---------------------------------------------------------------------- */
-
-/* todo: hard/soft set? */
-static inline aufs_bindex_t au_fbstart(struct file *file)
-{
-	FiMustAnyLock(file);
-	return au_fi(file)->fi_btop;
-}
-
-static inline aufs_bindex_t au_fbend_dir(struct file *file)
-{
-	FiMustAnyLock(file);
-	AuDebugOn(!au_fi(file)->fi_hdir);
-	return au_fi(file)->fi_hdir->fd_bbot;
-}
-
-static inline struct au_vdir *au_fvdir_cache(struct file *file)
-{
-	FiMustAnyLock(file);
-	AuDebugOn(!au_fi(file)->fi_hdir);
-	return au_fi(file)->fi_hdir->fd_vdir_cache;
-}
-
-static inline void au_set_fbstart(struct file *file, aufs_bindex_t bindex)
-{
-	FiMustWriteLock(file);
-	au_fi(file)->fi_btop = bindex;
-}
-
-static inline void au_set_fbend_dir(struct file *file, aufs_bindex_t bindex)
-{
-	FiMustWriteLock(file);
-	AuDebugOn(!au_fi(file)->fi_hdir);
-	au_fi(file)->fi_hdir->fd_bbot = bindex;
-}
-
-static inline void au_set_fvdir_cache(struct file *file,
-				      struct au_vdir *vdir_cache)
-{
-	FiMustWriteLock(file);
-	AuDebugOn(!au_fi(file)->fi_hdir);
-	au_fi(file)->fi_hdir->fd_vdir_cache = vdir_cache;
-}
-
-static inline struct file *au_hf_top(struct file *file)
-{
-	FiMustAnyLock(file);
-	AuDebugOn(au_fi(file)->fi_hdir);
-	return au_fi(file)->fi_htop.hf_file;
-}
-
-static inline struct file *au_hf_dir(struct file *file, aufs_bindex_t bindex)
-{
-	FiMustAnyLock(file);
-	AuDebugOn(!au_fi(file)->fi_hdir);
-	return au_fi(file)->fi_hdir->fd_hfile[0 + bindex].hf_file;
-}
-
-/* todo: memory barrier? */
-static inline unsigned int au_figen(struct file *f)
-{
-	return atomic_read(&au_fi(f)->fi_generation);
-}
-
-static inline void au_set_mmapped(struct file *f)
-{
-	if (atomic_inc_return(&au_fi(f)->fi_mmapped))
-		return;
-	pr_warn("fi_mmapped wrapped around\n");
-	while (!atomic_inc_return(&au_fi(f)->fi_mmapped))
-		;
-}
-
-static inline void au_unset_mmapped(struct file *f)
-{
-	atomic_dec(&au_fi(f)->fi_mmapped);
-}
-
-static inline int au_test_mmapped(struct file *f)
-{
-	return atomic_read(&au_fi(f)->fi_mmapped);
-}
-
-/* customize vma->vm_file */
-
-static inline void au_do_vm_file_reset(struct vm_area_struct *vma,
-				       struct file *file)
-{
-	struct file *f;
-
-	f = vma->vm_file;
-	get_file(file);
-	vma->vm_file = file;
-	fput(f);
-}
-
-#ifdef CONFIG_MMU
-#define AuDbgVmRegion(file, vma) do {} while (0)
-
-static inline void au_vm_file_reset(struct vm_area_struct *vma,
-				    struct file *file)
-{
-	au_do_vm_file_reset(vma, file);
-}
-#else
-#define AuDbgVmRegion(file, vma) \
-	AuDebugOn((vma)->vm_region && (vma)->vm_region->vm_file != (file))
-
-static inline void au_vm_file_reset(struct vm_area_struct *vma,
-				    struct file *file)
-{
-	struct file *f;
-
-	au_do_vm_file_reset(vma, file);
-	f = vma->vm_region->vm_file;
-	get_file(file);
-	vma->vm_region->vm_file = file;
-	fput(f);
-}
-#endif /* CONFIG_MMU */
-
-/* handle vma->vm_prfile */
-static inline void au_vm_prfile_set(struct vm_area_struct *vma,
-				    struct file *file)
-{
-	get_file(file);
-	vma->vm_prfile = file;
-#ifndef CONFIG_MMU
-	get_file(file);
-	vma->vm_region->vm_prfile = file;
-#endif
-}
-
-#endif /* __KERNEL__ */
-#endif /* __AUFS_FILE_H__ */
diff --git a/fs/aufs/finfo.c b/fs/aufs/finfo.c
deleted file mode 100644
index b5eb55dfb..000000000
--- a/fs/aufs/finfo.c
+++ /dev/null
@@ -1,143 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * file private data
- */
-
-#include "aufs.h"
-
-void au_hfput(struct au_hfile *hf, struct file *file)
-{
-	/* todo: direct access f_flags */
-	if (vfsub_file_flags(file) & __FMODE_EXEC)
-		allow_write_access(hf->hf_file);
-	fput(hf->hf_file);
-	hf->hf_file = NULL;
-	atomic_dec(&hf->hf_br->br_count);
-	hf->hf_br = NULL;
-}
-
-void au_set_h_fptr(struct file *file, aufs_bindex_t bindex, struct file *val)
-{
-	struct au_finfo *finfo = au_fi(file);
-	struct au_hfile *hf;
-	struct au_fidir *fidir;
-
-	fidir = finfo->fi_hdir;
-	if (!fidir) {
-		AuDebugOn(finfo->fi_btop != bindex);
-		hf = &finfo->fi_htop;
-	} else
-		hf = fidir->fd_hfile + bindex;
-
-	if (hf && hf->hf_file)
-		au_hfput(hf, file);
-	if (val) {
-		FiMustWriteLock(file);
-		AuDebugOn(IS_ERR_OR_NULL(file->f_path.dentry));
-		hf->hf_file = val;
-		hf->hf_br = au_sbr(file->f_path.dentry->d_sb, bindex);
-	}
-}
-
-void au_update_figen(struct file *file)
-{
-	atomic_set(&au_fi(file)->fi_generation, au_digen(file->f_path.dentry));
-	/* smp_mb(); */ /* atomic_set */
-}
-
-/* ---------------------------------------------------------------------- */
-
-struct au_fidir *au_fidir_alloc(struct super_block *sb)
-{
-	struct au_fidir *fidir;
-	int nbr;
-
-	nbr = au_sbend(sb) + 1;
-	if (nbr < 2)
-		nbr = 2; /* initial allocate for 2 branches */
-	fidir = kzalloc(au_fidir_sz(nbr), GFP_NOFS);
-	if (fidir) {
-		fidir->fd_bbot = -1;
-		fidir->fd_nent = nbr;
-	}
-
-	return fidir;
-}
-
-int au_fidir_realloc(struct au_finfo *finfo, int nbr)
-{
-	int err;
-	struct au_fidir *fidir, *p;
-
-	AuRwMustWriteLock(&finfo->fi_rwsem);
-	fidir = finfo->fi_hdir;
-	AuDebugOn(!fidir);
-
-	err = -ENOMEM;
-	p = au_kzrealloc(fidir, au_fidir_sz(fidir->fd_nent), au_fidir_sz(nbr),
-			 GFP_NOFS);
-	if (p) {
-		p->fd_nent = nbr;
-		finfo->fi_hdir = p;
-		err = 0;
-	}
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void au_finfo_fin(struct file *file)
-{
-	struct au_finfo *finfo;
-
-	au_nfiles_dec(file->f_path.dentry->d_sb);
-
-	finfo = au_fi(file);
-	AuDebugOn(finfo->fi_hdir);
-	AuRwDestroy(&finfo->fi_rwsem);
-	au_cache_free_finfo(finfo);
-}
-
-void au_fi_init_once(void *_finfo)
-{
-	struct au_finfo *finfo = _finfo;
-	static struct lock_class_key aufs_fi;
-
-	au_rw_init(&finfo->fi_rwsem);
-	au_rw_class(&finfo->fi_rwsem, &aufs_fi);
-}
-
-int au_finfo_init(struct file *file, struct au_fidir *fidir)
-{
-	int err;
-	struct au_finfo *finfo;
-	struct dentry *dentry;
-
-	err = -ENOMEM;
-	dentry = file->f_path.dentry;
-	finfo = au_cache_alloc_finfo();
-	if (unlikely(!finfo))
-		goto out;
-
-	err = 0;
-	au_nfiles_inc(dentry->d_sb);
-	/* verbose coding for lock class name */
-	if (!fidir)
-		au_rw_class(&finfo->fi_rwsem, au_lc_key + AuLcNonDir_FIINFO);
-	else
-		au_rw_class(&finfo->fi_rwsem, au_lc_key + AuLcDir_FIINFO);
-	au_rw_write_lock(&finfo->fi_rwsem);
-	finfo->fi_btop = -1;
-	finfo->fi_hdir = fidir;
-	atomic_set(&finfo->fi_generation, au_digen(dentry));
-	/* smp_mb(); */ /* atomic_set */
-
-	file->private_data = finfo;
-
-out:
-	return err;
-}
diff --git a/fs/aufs/fstype.h b/fs/aufs/fstype.h
deleted file mode 100644
index 725b2ffff..000000000
--- a/fs/aufs/fstype.h
+++ /dev/null
@@ -1,387 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * judging filesystem type
- */
-
-#ifndef __AUFS_FSTYPE_H__
-#define __AUFS_FSTYPE_H__
-
-#ifdef __KERNEL__
-
-#include <linux/fs.h>
-#include <linux/magic.h>
-#include <linux/nfs_fs.h>
-#include <linux/romfs_fs.h>
-
-static inline int au_test_aufs(struct super_block *sb)
-{
-	return sb->s_magic == AUFS_SUPER_MAGIC;
-}
-
-static inline const char *au_sbtype(struct super_block *sb)
-{
-	return sb->s_type->name;
-}
-
-static inline int au_test_iso9660(struct super_block *sb __maybe_unused)
-{
-#if defined(CONFIG_ISO9660_FS) || defined(CONFIG_ISO9660_FS_MODULE)
-	return sb->s_magic == ISOFS_SUPER_MAGIC;
-#else
-	return 0;
-#endif
-}
-
-static inline int au_test_romfs(struct super_block *sb __maybe_unused)
-{
-#if defined(CONFIG_ROMFS_FS) || defined(CONFIG_ROMFS_FS_MODULE)
-	return sb->s_magic == ROMFS_MAGIC;
-#else
-	return 0;
-#endif
-}
-
-static inline int au_test_cramfs(struct super_block *sb __maybe_unused)
-{
-#if defined(CONFIG_CRAMFS) || defined(CONFIG_CRAMFS_MODULE)
-	return sb->s_magic == CRAMFS_MAGIC;
-#endif
-	return 0;
-}
-
-static inline int au_test_nfs(struct super_block *sb __maybe_unused)
-{
-#if defined(CONFIG_NFS_FS) || defined(CONFIG_NFS_FS_MODULE)
-	return sb->s_magic == NFS_SUPER_MAGIC;
-#else
-	return 0;
-#endif
-}
-
-static inline int au_test_fuse(struct super_block *sb __maybe_unused)
-{
-#if defined(CONFIG_FUSE_FS) || defined(CONFIG_FUSE_FS_MODULE)
-	return sb->s_magic == FUSE_SUPER_MAGIC;
-#else
-	return 0;
-#endif
-}
-
-static inline int au_test_xfs(struct super_block *sb __maybe_unused)
-{
-#if defined(CONFIG_XFS_FS) || defined(CONFIG_XFS_FS_MODULE)
-	return sb->s_magic == XFS_SB_MAGIC;
-#else
-	return 0;
-#endif
-}
-
-static inline int au_test_tmpfs(struct super_block *sb __maybe_unused)
-{
-#ifdef CONFIG_TMPFS
-	return sb->s_magic == TMPFS_MAGIC;
-#else
-	return 0;
-#endif
-}
-
-static inline int au_test_ecryptfs(struct super_block *sb __maybe_unused)
-{
-#if defined(CONFIG_ECRYPT_FS) || defined(CONFIG_ECRYPT_FS_MODULE)
-	return !strcmp(au_sbtype(sb), "ecryptfs");
-#else
-	return 0;
-#endif
-}
-
-static inline int au_test_ramfs(struct super_block *sb)
-{
-	return sb->s_magic == RAMFS_MAGIC;
-}
-
-static inline int au_test_ubifs(struct super_block *sb __maybe_unused)
-{
-#if defined(CONFIG_UBIFS_FS) || defined(CONFIG_UBIFS_FS_MODULE)
-	return sb->s_magic == UBIFS_SUPER_MAGIC;
-#else
-	return 0;
-#endif
-}
-
-static inline int au_test_procfs(struct super_block *sb __maybe_unused)
-{
-#ifdef CONFIG_PROC_FS
-	return sb->s_magic == PROC_SUPER_MAGIC;
-#else
-	return 0;
-#endif
-}
-
-static inline int au_test_sysfs(struct super_block *sb __maybe_unused)
-{
-#ifdef CONFIG_SYSFS
-	return sb->s_magic == SYSFS_MAGIC;
-#else
-	return 0;
-#endif
-}
-
-static inline int au_test_configfs(struct super_block *sb __maybe_unused)
-{
-#if defined(CONFIG_CONFIGFS_FS) || defined(CONFIG_CONFIGFS_FS_MODULE)
-	return sb->s_magic == CONFIGFS_MAGIC;
-#else
-	return 0;
-#endif
-}
-
-static inline int au_test_minix(struct super_block *sb __maybe_unused)
-{
-#if defined(CONFIG_MINIX_FS) || defined(CONFIG_MINIX_FS_MODULE)
-	return sb->s_magic == MINIX3_SUPER_MAGIC
-		|| sb->s_magic == MINIX2_SUPER_MAGIC
-		|| sb->s_magic == MINIX2_SUPER_MAGIC2
-		|| sb->s_magic == MINIX_SUPER_MAGIC
-		|| sb->s_magic == MINIX_SUPER_MAGIC2;
-#else
-	return 0;
-#endif
-}
-
-static inline int au_test_fat(struct super_block *sb __maybe_unused)
-{
-#if defined(CONFIG_FAT_FS) || defined(CONFIG_FAT_FS_MODULE)
-	return sb->s_magic == MSDOS_SUPER_MAGIC;
-#else
-	return 0;
-#endif
-}
-
-static inline int au_test_msdos(struct super_block *sb)
-{
-	return au_test_fat(sb);
-}
-
-static inline int au_test_vfat(struct super_block *sb)
-{
-	return au_test_fat(sb);
-}
-
-static inline int au_test_securityfs(struct super_block *sb __maybe_unused)
-{
-#ifdef CONFIG_SECURITYFS
-	return sb->s_magic == SECURITYFS_MAGIC;
-#else
-	return 0;
-#endif
-}
-
-static inline int au_test_squashfs(struct super_block *sb __maybe_unused)
-{
-#if defined(CONFIG_SQUASHFS) || defined(CONFIG_SQUASHFS_MODULE)
-	return sb->s_magic == SQUASHFS_MAGIC;
-#else
-	return 0;
-#endif
-}
-
-static inline int au_test_btrfs(struct super_block *sb __maybe_unused)
-{
-#if defined(CONFIG_BTRFS_FS) || defined(CONFIG_BTRFS_FS_MODULE)
-	return sb->s_magic == BTRFS_SUPER_MAGIC;
-#else
-	return 0;
-#endif
-}
-
-static inline int au_test_xenfs(struct super_block *sb __maybe_unused)
-{
-#if defined(CONFIG_XENFS) || defined(CONFIG_XENFS_MODULE)
-	return sb->s_magic == XENFS_SUPER_MAGIC;
-#else
-	return 0;
-#endif
-}
-
-static inline int au_test_debugfs(struct super_block *sb __maybe_unused)
-{
-#ifdef CONFIG_DEBUG_FS
-	return sb->s_magic == DEBUGFS_MAGIC;
-#else
-	return 0;
-#endif
-}
-
-static inline int au_test_nilfs(struct super_block *sb __maybe_unused)
-{
-#if defined(CONFIG_NILFS) || defined(CONFIG_NILFS_MODULE)
-	return sb->s_magic == NILFS_SUPER_MAGIC;
-#else
-	return 0;
-#endif
-}
-
-static inline int au_test_hfsplus(struct super_block *sb __maybe_unused)
-{
-#if defined(CONFIG_HFSPLUS_FS) || defined(CONFIG_HFSPLUS_FS_MODULE)
-	return sb->s_magic == HFSPLUS_SUPER_MAGIC;
-#else
-	return 0;
-#endif
-}
-
-/* ---------------------------------------------------------------------- */
-/*
- * they can't be an aufs branch.
- */
-static inline int au_test_fs_unsuppoted(struct super_block *sb)
-{
-	return
-#ifndef CONFIG_AUFS_BR_RAMFS
-		au_test_ramfs(sb) ||
-#endif
-		au_test_procfs(sb)
-		|| au_test_sysfs(sb)
-		|| au_test_configfs(sb)
-		|| au_test_debugfs(sb)
-		|| au_test_securityfs(sb)
-		|| au_test_xenfs(sb)
-		|| au_test_ecryptfs(sb)
-		/* || !strcmp(au_sbtype(sb), "unionfs") */
-		|| au_test_aufs(sb); /* will be supported in next version */
-}
-
-static inline int au_test_fs_remote(struct super_block *sb)
-{
-	return !au_test_tmpfs(sb)
-#ifdef CONFIG_AUFS_BR_RAMFS
-		&& !au_test_ramfs(sb)
-#endif
-		&& !(sb->s_type->fs_flags & FS_REQUIRES_DEV);
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * Note: these functions (below) are created after reading ->getattr() in all
- * filesystems under linux/fs. it means we have to do so in every update...
- */
-
-/*
- * some filesystems require getattr to refresh the inode attributes before
- * referencing.
- * in most cases, we can rely on the inode attribute in NFS (or every remote fs)
- * and leave the work for d_revalidate()
- */
-static inline int au_test_fs_refresh_iattr(struct super_block *sb)
-{
-	return au_test_nfs(sb)
-		|| au_test_fuse(sb)
-		/* || au_test_btrfs(sb) */	/* untested */
-		;
-}
-
-/*
- * filesystems which don't maintain i_size or i_blocks.
- */
-static inline int au_test_fs_bad_iattr_size(struct super_block *sb)
-{
-	return au_test_xfs(sb)
-		|| au_test_btrfs(sb)
-		|| au_test_ubifs(sb)
-		|| au_test_hfsplus(sb)	/* maintained, but incorrect */
-		/* || au_test_minix(sb) */	/* untested */
-		;
-}
-
-/*
- * filesystems which don't store the correct value in some of their inode
- * attributes.
- */
-static inline int au_test_fs_bad_iattr(struct super_block *sb)
-{
-	return au_test_fs_bad_iattr_size(sb)
-		|| au_test_fat(sb)
-		|| au_test_msdos(sb)
-		|| au_test_vfat(sb);
-}
-
-/* they don't check i_nlink in link(2) */
-static inline int au_test_fs_no_limit_nlink(struct super_block *sb)
-{
-	return au_test_tmpfs(sb)
-#ifdef CONFIG_AUFS_BR_RAMFS
-		|| au_test_ramfs(sb)
-#endif
-		|| au_test_ubifs(sb)
-		|| au_test_hfsplus(sb);
-}
-
-/*
- * filesystems which sets S_NOATIME and S_NOCMTIME.
- */
-static inline int au_test_fs_notime(struct super_block *sb)
-{
-	return au_test_nfs(sb)
-		|| au_test_fuse(sb)
-		|| au_test_ubifs(sb)
-		;
-}
-
-/* temporary support for i#1 in cramfs */
-static inline int au_test_fs_unique_ino(struct inode *inode)
-{
-	if (au_test_cramfs(inode->i_sb))
-		return inode->i_ino != 1;
-	return 1;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * the filesystem where the xino files placed must support i/o after unlink and
- * maintain i_size and i_blocks.
- */
-static inline int au_test_fs_bad_xino(struct super_block *sb)
-{
-	return au_test_fs_remote(sb)
-		|| au_test_fs_bad_iattr_size(sb)
-		/* don't want unnecessary work for xino */
-		|| au_test_aufs(sb)
-		|| au_test_ecryptfs(sb)
-		|| au_test_nilfs(sb);
-}
-
-static inline int au_test_fs_trunc_xino(struct super_block *sb)
-{
-	return au_test_tmpfs(sb)
-		|| au_test_ramfs(sb);
-}
-
-/*
- * test if the @sb is real-readonly.
- */
-static inline int au_test_fs_rr(struct super_block *sb)
-{
-	return au_test_squashfs(sb)
-		|| au_test_iso9660(sb)
-		|| au_test_cramfs(sb)
-		|| au_test_romfs(sb);
-}
-
-/*
- * test if the @inode is nfs with 'noacl' option
- * NFS always sets MS_POSIXACL regardless its mount option 'noacl.'
- */
-static inline int au_test_nfs_noacl(struct inode *inode)
-{
-	return au_test_nfs(inode->i_sb)
-		/* && IS_POSIXACL(inode) */
-		&& !nfs_server_capable(inode, NFS_CAP_ACLS);
-}
-
-#endif /* __KERNEL__ */
-#endif /* __AUFS_FSTYPE_H__ */
diff --git a/fs/aufs/hfsnotify.c b/fs/aufs/hfsnotify.c
deleted file mode 100644
index c0a1a63a9..000000000
--- a/fs/aufs/hfsnotify.c
+++ /dev/null
@@ -1,275 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * fsnotify for the lower directories
- */
-
-#include "aufs.h"
-
-/* FS_IN_IGNORED is unnecessary */
-static const __u32 AuHfsnMask = (FS_MOVED_TO | FS_MOVED_FROM | FS_DELETE
-				 | FS_CREATE | FS_EVENT_ON_CHILD);
-static DECLARE_WAIT_QUEUE_HEAD(au_hfsn_wq);
-static __cacheline_aligned_in_smp atomic64_t au_hfsn_ifree = ATOMIC64_INIT(0);
-
-static void au_hfsn_free_mark(struct fsnotify_mark *mark)
-{
-	struct au_hnotify *hn = container_of(mark, struct au_hnotify,
-					     hn_mark);
-	AuDbg("here\n");
-	au_cache_free_hnotify(hn);
-	smp_mb__before_atomic();
-	if (atomic64_dec_and_test(&au_hfsn_ifree))
-		wake_up(&au_hfsn_wq);
-}
-
-static int au_hfsn_alloc(struct au_hinode *hinode)
-{
-	int err;
-	struct au_hnotify *hn;
-	struct super_block *sb;
-	struct au_branch *br;
-	struct fsnotify_mark *mark;
-	aufs_bindex_t bindex;
-
-	hn = hinode->hi_notify;
-	sb = hn->hn_aufs_inode->i_sb;
-	bindex = au_br_index(sb, hinode->hi_id);
-	br = au_sbr(sb, bindex);
-	AuDebugOn(!br->br_hfsn);
-
-	mark = &hn->hn_mark;
-	fsnotify_init_mark(mark, au_hfsn_free_mark);
-	mark->mask = AuHfsnMask;
-	/*
-	 * by udba rename or rmdir, aufs assign a new inode to the known
-	 * h_inode, so specify 1 to allow dups.
-	 */
-	lockdep_off();
-	err = fsnotify_add_mark(mark, br->br_hfsn->hfsn_group, hinode->hi_inode,
-				 /*mnt*/NULL, /*allow_dups*/1);
-	/* even if err */
-	fsnotify_put_mark(mark);
-	lockdep_on();
-
-	return err;
-}
-
-static int au_hfsn_free(struct au_hinode *hinode, struct au_hnotify *hn)
-{
-	struct fsnotify_mark *mark;
-	unsigned long long ull;
-	struct fsnotify_group *group;
-
-	ull = atomic64_inc_return(&au_hfsn_ifree);
-	BUG_ON(!ull);
-
-	mark = &hn->hn_mark;
-	spin_lock(&mark->lock);
-	group = mark->group;
-	fsnotify_get_group(group);
-	spin_unlock(&mark->lock);
-	lockdep_off();
-	fsnotify_destroy_mark(mark, group);
-	fsnotify_put_group(group);
-	lockdep_on();
-
-	/* free hn by myself */
-	return 0;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static void au_hfsn_ctl(struct au_hinode *hinode, int do_set)
-{
-	struct fsnotify_mark *mark;
-
-	mark = &hinode->hi_notify->hn_mark;
-	spin_lock(&mark->lock);
-	if (do_set) {
-		AuDebugOn(mark->mask & AuHfsnMask);
-		mark->mask |= AuHfsnMask;
-	} else {
-		AuDebugOn(!(mark->mask & AuHfsnMask));
-		mark->mask &= ~AuHfsnMask;
-	}
-	spin_unlock(&mark->lock);
-	/* fsnotify_recalc_inode_mask(hinode->hi_inode); */
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* #define AuDbgHnotify */
-#ifdef AuDbgHnotify
-static char *au_hfsn_name(u32 mask)
-{
-#ifdef CONFIG_AUFS_DEBUG
-#define test_ret(flag)				\
-	do {					\
-		if (mask & flag)		\
-			return #flag;		\
-	} while (0)
-	test_ret(FS_ACCESS);
-	test_ret(FS_MODIFY);
-	test_ret(FS_ATTRIB);
-	test_ret(FS_CLOSE_WRITE);
-	test_ret(FS_CLOSE_NOWRITE);
-	test_ret(FS_OPEN);
-	test_ret(FS_MOVED_FROM);
-	test_ret(FS_MOVED_TO);
-	test_ret(FS_CREATE);
-	test_ret(FS_DELETE);
-	test_ret(FS_DELETE_SELF);
-	test_ret(FS_MOVE_SELF);
-	test_ret(FS_UNMOUNT);
-	test_ret(FS_Q_OVERFLOW);
-	test_ret(FS_IN_IGNORED);
-	test_ret(FS_ISDIR);
-	test_ret(FS_IN_ONESHOT);
-	test_ret(FS_EVENT_ON_CHILD);
-	return "";
-#undef test_ret
-#else
-	return "??";
-#endif
-}
-#endif
-
-/* ---------------------------------------------------------------------- */
-
-static void au_hfsn_free_group(struct fsnotify_group *group)
-{
-	struct au_br_hfsnotify *hfsn = group->private;
-
-	AuDbg("here\n");
-	kfree(hfsn);
-}
-
-static int au_hfsn_handle_event(struct fsnotify_group *group,
-				struct inode *inode,
-				struct fsnotify_mark *inode_mark,
-				struct fsnotify_mark *vfsmount_mark,
-				u32 mask, void *data, int data_type,
-				const unsigned char *file_name, u32 cookie)
-{
-	int err;
-	struct au_hnotify *hnotify;
-	struct inode *h_dir, *h_inode;
-	struct qstr h_child_qstr = QSTR_INIT(file_name, strlen(file_name));
-
-	AuDebugOn(data_type != FSNOTIFY_EVENT_INODE);
-
-	err = 0;
-	/* if FS_UNMOUNT happens, there must be another bug */
-	AuDebugOn(mask & FS_UNMOUNT);
-	if (mask & (FS_IN_IGNORED | FS_UNMOUNT))
-		goto out;
-
-	h_dir = inode;
-	h_inode = NULL;
-#ifdef AuDbgHnotify
-	au_debug_on();
-	if (1 || h_child_qstr.len != sizeof(AUFS_XINO_FNAME) - 1
-	    || strncmp(h_child_qstr.name, AUFS_XINO_FNAME, h_child_qstr.len)) {
-		AuDbg("i%lu, mask 0x%x %s, hcname %.*s, hi%lu\n",
-		      h_dir->i_ino, mask, au_hfsn_name(mask),
-		      AuLNPair(&h_child_qstr), h_inode ? h_inode->i_ino : 0);
-		/* WARN_ON(1); */
-	}
-	au_debug_off();
-#endif
-
-	AuDebugOn(!inode_mark);
-	hnotify = container_of(inode_mark, struct au_hnotify, hn_mark);
-	err = au_hnotify(h_dir, hnotify, mask, &h_child_qstr, h_inode);
-
-out:
-	return err;
-}
-
-static struct fsnotify_ops au_hfsn_ops = {
-	.handle_event		= au_hfsn_handle_event,
-	.free_group_priv	= au_hfsn_free_group
-};
-
-/* ---------------------------------------------------------------------- */
-
-static void au_hfsn_fin_br(struct au_branch *br)
-{
-	struct au_br_hfsnotify *hfsn;
-
-	hfsn = br->br_hfsn;
-	if (hfsn) {
-		lockdep_off();
-		fsnotify_put_group(hfsn->hfsn_group);
-		lockdep_on();
-	}
-}
-
-static int au_hfsn_init_br(struct au_branch *br, int perm)
-{
-	int err;
-	struct fsnotify_group *group;
-	struct au_br_hfsnotify *hfsn;
-
-	err = 0;
-	br->br_hfsn = NULL;
-	if (!au_br_hnotifyable(perm))
-		goto out;
-
-	err = -ENOMEM;
-	hfsn = kmalloc(sizeof(*hfsn), GFP_NOFS);
-	if (unlikely(!hfsn))
-		goto out;
-
-	err = 0;
-	group = fsnotify_alloc_group(&au_hfsn_ops);
-	if (IS_ERR(group)) {
-		err = PTR_ERR(group);
-		pr_err("fsnotify_alloc_group() failed, %d\n", err);
-		goto out_hfsn;
-	}
-
-	group->private = hfsn;
-	hfsn->hfsn_group = group;
-	br->br_hfsn = hfsn;
-	goto out; /* success */
-
-out_hfsn:
-	kfree(hfsn);
-out:
-	return err;
-}
-
-static int au_hfsn_reset_br(unsigned int udba, struct au_branch *br, int perm)
-{
-	int err;
-
-	err = 0;
-	if (!br->br_hfsn)
-		err = au_hfsn_init_br(br, perm);
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static void au_hfsn_fin(void)
-{
-	AuDbg("au_hfsn_ifree %lld\n", (long long)atomic64_read(&au_hfsn_ifree));
-	wait_event(au_hfsn_wq, !atomic64_read(&au_hfsn_ifree));
-}
-
-const struct au_hnotify_op au_hnotify_op = {
-	.ctl		= au_hfsn_ctl,
-	.alloc		= au_hfsn_alloc,
-	.free		= au_hfsn_free,
-
-	.fin		= au_hfsn_fin,
-
-	.reset_br	= au_hfsn_reset_br,
-	.fin_br		= au_hfsn_fin_br,
-	.init_br	= au_hfsn_init_br
-};
diff --git a/fs/aufs/hfsplus.c b/fs/aufs/hfsplus.c
deleted file mode 100644
index 145c6ac2f..000000000
--- a/fs/aufs/hfsplus.c
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
- * Copyright (C) 2010-2016 Junjiro R. Okajima
- */
-
-/*
- * special support for filesystems which aqucires an inode mutex
- * at final closing a file, eg, hfsplus.
- *
- * This trick is very simple and stupid, just to open the file before really
- * neceeary open to tell hfsplus that this is not the final closing.
- * The caller should call au_h_open_pre() after acquiring the inode mutex,
- * and au_h_open_post() after releasing it.
- */
-
-#include "aufs.h"
-
-struct file *au_h_open_pre(struct dentry *dentry, aufs_bindex_t bindex,
-			   int force_wr)
-{
-	struct file *h_file;
-	struct dentry *h_dentry;
-
-	h_dentry = au_h_dptr(dentry, bindex);
-	AuDebugOn(!h_dentry);
-	AuDebugOn(d_is_negative(h_dentry));
-
-	h_file = NULL;
-	if (au_test_hfsplus(h_dentry->d_sb)
-	    && d_is_reg(h_dentry))
-		h_file = au_h_open(dentry, bindex,
-				   O_RDONLY | O_NOATIME | O_LARGEFILE,
-				   /*file*/NULL, force_wr);
-	return h_file;
-}
-
-void au_h_open_post(struct dentry *dentry, aufs_bindex_t bindex,
-		    struct file *h_file)
-{
-	if (h_file) {
-		fput(h_file);
-		au_sbr_put(dentry->d_sb, bindex);
-	}
-}
diff --git a/fs/aufs/hnotify.c b/fs/aufs/hnotify.c
deleted file mode 100644
index 3e0a4f67d..000000000
--- a/fs/aufs/hnotify.c
+++ /dev/null
@@ -1,697 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * abstraction to notify the direct changes on lower directories
- */
-
-#include "aufs.h"
-
-int au_hn_alloc(struct au_hinode *hinode, struct inode *inode)
-{
-	int err;
-	struct au_hnotify *hn;
-
-	err = -ENOMEM;
-	hn = au_cache_alloc_hnotify();
-	if (hn) {
-		hn->hn_aufs_inode = inode;
-		hinode->hi_notify = hn;
-		err = au_hnotify_op.alloc(hinode);
-		AuTraceErr(err);
-		if (unlikely(err)) {
-			hinode->hi_notify = NULL;
-			au_cache_free_hnotify(hn);
-			/*
-			 * The upper dir was removed by udba, but the same named
-			 * dir left. In this case, aufs assignes a new inode
-			 * number and set the monitor again.
-			 * For the lower dir, the old monitnor is still left.
-			 */
-			if (err == -EEXIST)
-				err = 0;
-		}
-	}
-
-	AuTraceErr(err);
-	return err;
-}
-
-void au_hn_free(struct au_hinode *hinode)
-{
-	struct au_hnotify *hn;
-
-	hn = hinode->hi_notify;
-	if (hn) {
-		hinode->hi_notify = NULL;
-		if (au_hnotify_op.free(hinode, hn))
-			au_cache_free_hnotify(hn);
-	}
-}
-
-/* ---------------------------------------------------------------------- */
-
-void au_hn_ctl(struct au_hinode *hinode, int do_set)
-{
-	if (hinode->hi_notify)
-		au_hnotify_op.ctl(hinode, do_set);
-}
-
-void au_hn_reset(struct inode *inode, unsigned int flags)
-{
-	aufs_bindex_t bindex, bend;
-	struct inode *hi;
-	struct dentry *iwhdentry;
-
-	bend = au_ibend(inode);
-	for (bindex = au_ibstart(inode); bindex <= bend; bindex++) {
-		hi = au_h_iptr(inode, bindex);
-		if (!hi)
-			continue;
-
-		/* mutex_lock_nested(&hi->i_mutex, AuLsc_I_CHILD); */
-		iwhdentry = au_hi_wh(inode, bindex);
-		if (iwhdentry)
-			dget(iwhdentry);
-		au_igrab(hi);
-		au_set_h_iptr(inode, bindex, NULL, 0);
-		au_set_h_iptr(inode, bindex, au_igrab(hi),
-			      flags & ~AuHi_XINO);
-		iput(hi);
-		dput(iwhdentry);
-		/* mutex_unlock(&hi->i_mutex); */
-	}
-}
-
-/* ---------------------------------------------------------------------- */
-
-static int hn_xino(struct inode *inode, struct inode *h_inode)
-{
-	int err;
-	aufs_bindex_t bindex, bend, bfound, bstart;
-	struct inode *h_i;
-
-	err = 0;
-	if (unlikely(inode->i_ino == AUFS_ROOT_INO)) {
-		pr_warn("branch root dir was changed\n");
-		goto out;
-	}
-
-	bfound = -1;
-	bend = au_ibend(inode);
-	bstart = au_ibstart(inode);
-#if 0 /* reserved for future use */
-	if (bindex == bend) {
-		/* keep this ino in rename case */
-		goto out;
-	}
-#endif
-	for (bindex = bstart; bindex <= bend; bindex++)
-		if (au_h_iptr(inode, bindex) == h_inode) {
-			bfound = bindex;
-			break;
-		}
-	if (bfound < 0)
-		goto out;
-
-	for (bindex = bstart; bindex <= bend; bindex++) {
-		h_i = au_h_iptr(inode, bindex);
-		if (!h_i)
-			continue;
-
-		err = au_xino_write(inode->i_sb, bindex, h_i->i_ino, /*ino*/0);
-		/* ignore this error */
-		/* bad action? */
-	}
-
-	/* children inode number will be broken */
-
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-static int hn_gen_tree(struct dentry *dentry)
-{
-	int err, i, j, ndentry;
-	struct au_dcsub_pages dpages;
-	struct au_dpage *dpage;
-	struct dentry **dentries;
-
-	err = au_dpages_init(&dpages, GFP_NOFS);
-	if (unlikely(err))
-		goto out;
-	err = au_dcsub_pages(&dpages, dentry, NULL, NULL);
-	if (unlikely(err))
-		goto out_dpages;
-
-	for (i = 0; i < dpages.ndpage; i++) {
-		dpage = dpages.dpages + i;
-		dentries = dpage->dentries;
-		ndentry = dpage->ndentry;
-		for (j = 0; j < ndentry; j++) {
-			struct dentry *d;
-
-			d = dentries[j];
-			if (IS_ROOT(d))
-				continue;
-
-			au_digen_dec(d);
-			if (d_really_is_positive(d))
-				/* todo: reset children xino?
-				   cached children only? */
-				au_iigen_dec(d_inode(d));
-		}
-	}
-
-out_dpages:
-	au_dpages_free(&dpages);
-
-#if 0
-	/* discard children */
-	dentry_unhash(dentry);
-	dput(dentry);
-#endif
-out:
-	return err;
-}
-
-/*
- * return 0 if processed.
- */
-static int hn_gen_by_inode(char *name, unsigned int nlen, struct inode *inode,
-			   const unsigned int isdir)
-{
-	int err;
-	struct dentry *d;
-	struct qstr *dname;
-
-	err = 1;
-	if (unlikely(inode->i_ino == AUFS_ROOT_INO)) {
-		pr_warn("branch root dir was changed\n");
-		err = 0;
-		goto out;
-	}
-
-	if (!isdir) {
-		AuDebugOn(!name);
-		au_iigen_dec(inode);
-		spin_lock(&inode->i_lock);
-		hlist_for_each_entry(d, &inode->i_dentry, d_u.d_alias) {
-			spin_lock(&d->d_lock);
-			dname = &d->d_name;
-			if (dname->len != nlen
-			    && memcmp(dname->name, name, nlen)) {
-				spin_unlock(&d->d_lock);
-				continue;
-			}
-			err = 0;
-			au_digen_dec(d);
-			spin_unlock(&d->d_lock);
-			break;
-		}
-		spin_unlock(&inode->i_lock);
-	} else {
-		au_fset_si(au_sbi(inode->i_sb), FAILED_REFRESH_DIR);
-		d = d_find_any_alias(inode);
-		if (!d) {
-			au_iigen_dec(inode);
-			goto out;
-		}
-
-		spin_lock(&d->d_lock);
-		dname = &d->d_name;
-		if (dname->len == nlen && !memcmp(dname->name, name, nlen)) {
-			spin_unlock(&d->d_lock);
-			err = hn_gen_tree(d);
-			spin_lock(&d->d_lock);
-		}
-		spin_unlock(&d->d_lock);
-		dput(d);
-	}
-
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-static int hn_gen_by_name(struct dentry *dentry, const unsigned int isdir)
-{
-	int err;
-
-	if (IS_ROOT(dentry)) {
-		pr_warn("branch root dir was changed\n");
-		return 0;
-	}
-
-	err = 0;
-	if (!isdir) {
-		au_digen_dec(dentry);
-		if (d_really_is_positive(dentry))
-			au_iigen_dec(d_inode(dentry));
-	} else {
-		au_fset_si(au_sbi(dentry->d_sb), FAILED_REFRESH_DIR);
-		if (d_really_is_positive(dentry))
-			err = hn_gen_tree(dentry);
-	}
-
-	AuTraceErr(err);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* hnotify job flags */
-#define AuHnJob_XINO0		1
-#define AuHnJob_GEN		(1 << 1)
-#define AuHnJob_DIRENT		(1 << 2)
-#define AuHnJob_ISDIR		(1 << 3)
-#define AuHnJob_TRYXINO0	(1 << 4)
-#define AuHnJob_MNTPNT		(1 << 5)
-#define au_ftest_hnjob(flags, name)	((flags) & AuHnJob_##name)
-#define au_fset_hnjob(flags, name) \
-	do { (flags) |= AuHnJob_##name; } while (0)
-#define au_fclr_hnjob(flags, name) \
-	do { (flags) &= ~AuHnJob_##name; } while (0)
-
-enum {
-	AuHn_CHILD,
-	AuHn_PARENT,
-	AuHnLast
-};
-
-struct au_hnotify_args {
-	struct inode *h_dir, *dir, *h_child_inode;
-	u32 mask;
-	unsigned int flags[AuHnLast];
-	unsigned int h_child_nlen;
-	char h_child_name[];
-};
-
-struct hn_job_args {
-	unsigned int flags;
-	struct inode *inode, *h_inode, *dir, *h_dir;
-	struct dentry *dentry;
-	char *h_name;
-	int h_nlen;
-};
-
-static int hn_job(struct hn_job_args *a)
-{
-	const unsigned int isdir = au_ftest_hnjob(a->flags, ISDIR);
-	int e;
-
-	/* reset xino */
-	if (au_ftest_hnjob(a->flags, XINO0) && a->inode)
-		hn_xino(a->inode, a->h_inode); /* ignore this error */
-
-	if (au_ftest_hnjob(a->flags, TRYXINO0)
-	    && a->inode
-	    && a->h_inode) {
-		mutex_lock_nested(&a->h_inode->i_mutex, AuLsc_I_CHILD);
-		if (!a->h_inode->i_nlink
-		    && !(a->h_inode->i_state & I_LINKABLE))
-			hn_xino(a->inode, a->h_inode); /* ignore this error */
-		mutex_unlock(&a->h_inode->i_mutex);
-	}
-
-	/* make the generation obsolete */
-	if (au_ftest_hnjob(a->flags, GEN)) {
-		e = -1;
-		if (a->inode)
-			e = hn_gen_by_inode(a->h_name, a->h_nlen, a->inode,
-					      isdir);
-		if (e && a->dentry)
-			hn_gen_by_name(a->dentry, isdir);
-		/* ignore this error */
-	}
-
-	/* make dir entries obsolete */
-	if (au_ftest_hnjob(a->flags, DIRENT) && a->inode) {
-		struct au_vdir *vdir;
-
-		vdir = au_ivdir(a->inode);
-		if (vdir)
-			vdir->vd_jiffy = 0;
-		/* IMustLock(a->inode); */
-		/* a->inode->i_version++; */
-	}
-
-	/* can do nothing but warn */
-	if (au_ftest_hnjob(a->flags, MNTPNT)
-	    && a->dentry
-	    && d_mountpoint(a->dentry))
-		pr_warn("mount-point %pd is removed or renamed\n", a->dentry);
-
-	return 0;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static struct dentry *lookup_wlock_by_name(char *name, unsigned int nlen,
-					   struct inode *dir)
-{
-	struct dentry *dentry, *d, *parent;
-	struct qstr *dname;
-
-	parent = d_find_any_alias(dir);
-	if (!parent)
-		return NULL;
-
-	dentry = NULL;
-	spin_lock(&parent->d_lock);
-	list_for_each_entry(d, &parent->d_subdirs, d_child) {
-		/* AuDbg("%pd\n", d); */
-		spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED);
-		dname = &d->d_name;
-		if (dname->len != nlen || memcmp(dname->name, name, nlen))
-			goto cont_unlock;
-		if (au_di(d))
-			au_digen_dec(d);
-		else
-			goto cont_unlock;
-		if (au_dcount(d) > 0) {
-			dentry = dget_dlock(d);
-			spin_unlock(&d->d_lock);
-			break;
-		}
-
-cont_unlock:
-		spin_unlock(&d->d_lock);
-	}
-	spin_unlock(&parent->d_lock);
-	dput(parent);
-
-	if (dentry)
-		di_write_lock_child(dentry);
-
-	return dentry;
-}
-
-static struct inode *lookup_wlock_by_ino(struct super_block *sb,
-					 aufs_bindex_t bindex, ino_t h_ino)
-{
-	struct inode *inode;
-	ino_t ino;
-	int err;
-
-	inode = NULL;
-	err = au_xino_read(sb, bindex, h_ino, &ino);
-	if (!err && ino)
-		inode = ilookup(sb, ino);
-	if (!inode)
-		goto out;
-
-	if (unlikely(inode->i_ino == AUFS_ROOT_INO)) {
-		pr_warn("wrong root branch\n");
-		iput(inode);
-		inode = NULL;
-		goto out;
-	}
-
-	ii_write_lock_child(inode);
-
-out:
-	return inode;
-}
-
-static void au_hn_bh(void *_args)
-{
-	struct au_hnotify_args *a = _args;
-	struct super_block *sb;
-	aufs_bindex_t bindex, bend, bfound;
-	unsigned char xino, try_iput;
-	int err;
-	struct inode *inode;
-	ino_t h_ino;
-	struct hn_job_args args;
-	struct dentry *dentry;
-	struct au_sbinfo *sbinfo;
-
-	AuDebugOn(!_args);
-	AuDebugOn(!a->h_dir);
-	AuDebugOn(!a->dir);
-	AuDebugOn(!a->mask);
-	AuDbg("mask 0x%x, i%lu, hi%lu, hci%lu\n",
-	      a->mask, a->dir->i_ino, a->h_dir->i_ino,
-	      a->h_child_inode ? a->h_child_inode->i_ino : 0);
-
-	inode = NULL;
-	dentry = NULL;
-	/*
-	 * do not lock a->dir->i_mutex here
-	 * because of d_revalidate() may cause a deadlock.
-	 */
-	sb = a->dir->i_sb;
-	AuDebugOn(!sb);
-	sbinfo = au_sbi(sb);
-	AuDebugOn(!sbinfo);
-	si_write_lock(sb, AuLock_NOPLMW);
-
-	ii_read_lock_parent(a->dir);
-	bfound = -1;
-	bend = au_ibend(a->dir);
-	for (bindex = au_ibstart(a->dir); bindex <= bend; bindex++)
-		if (au_h_iptr(a->dir, bindex) == a->h_dir) {
-			bfound = bindex;
-			break;
-		}
-	ii_read_unlock(a->dir);
-	if (unlikely(bfound < 0))
-		goto out;
-
-	xino = !!au_opt_test(au_mntflags(sb), XINO);
-	h_ino = 0;
-	if (a->h_child_inode)
-		h_ino = a->h_child_inode->i_ino;
-
-	if (a->h_child_nlen
-	    && (au_ftest_hnjob(a->flags[AuHn_CHILD], GEN)
-		|| au_ftest_hnjob(a->flags[AuHn_CHILD], MNTPNT)))
-		dentry = lookup_wlock_by_name(a->h_child_name, a->h_child_nlen,
-					      a->dir);
-	try_iput = 0;
-	if (dentry && d_really_is_positive(dentry))
-		inode = d_inode(dentry);
-	if (xino && !inode && h_ino
-	    && (au_ftest_hnjob(a->flags[AuHn_CHILD], XINO0)
-		|| au_ftest_hnjob(a->flags[AuHn_CHILD], TRYXINO0)
-		|| au_ftest_hnjob(a->flags[AuHn_CHILD], GEN))) {
-		inode = lookup_wlock_by_ino(sb, bfound, h_ino);
-		try_iput = 1;
-	    }
-
-	args.flags = a->flags[AuHn_CHILD];
-	args.dentry = dentry;
-	args.inode = inode;
-	args.h_inode = a->h_child_inode;
-	args.dir = a->dir;
-	args.h_dir = a->h_dir;
-	args.h_name = a->h_child_name;
-	args.h_nlen = a->h_child_nlen;
-	err = hn_job(&args);
-	if (dentry) {
-		if (au_di(dentry))
-			di_write_unlock(dentry);
-		dput(dentry);
-	}
-	if (inode && try_iput) {
-		ii_write_unlock(inode);
-		iput(inode);
-	}
-
-	ii_write_lock_parent(a->dir);
-	args.flags = a->flags[AuHn_PARENT];
-	args.dentry = NULL;
-	args.inode = a->dir;
-	args.h_inode = a->h_dir;
-	args.dir = NULL;
-	args.h_dir = NULL;
-	args.h_name = NULL;
-	args.h_nlen = 0;
-	err = hn_job(&args);
-	ii_write_unlock(a->dir);
-
-out:
-	iput(a->h_child_inode);
-	iput(a->h_dir);
-	iput(a->dir);
-	si_write_unlock(sb);
-	au_nwt_done(&sbinfo->si_nowait);
-	kfree(a);
-}
-
-/* ---------------------------------------------------------------------- */
-
-int au_hnotify(struct inode *h_dir, struct au_hnotify *hnotify, u32 mask,
-	       struct qstr *h_child_qstr, struct inode *h_child_inode)
-{
-	int err, len;
-	unsigned int flags[AuHnLast], f;
-	unsigned char isdir, isroot, wh;
-	struct inode *dir;
-	struct au_hnotify_args *args;
-	char *p, *h_child_name;
-
-	err = 0;
-	AuDebugOn(!hnotify || !hnotify->hn_aufs_inode);
-	dir = igrab(hnotify->hn_aufs_inode);
-	if (!dir)
-		goto out;
-
-	isroot = (dir->i_ino == AUFS_ROOT_INO);
-	wh = 0;
-	h_child_name = (void *)h_child_qstr->name;
-	len = h_child_qstr->len;
-	if (h_child_name) {
-		if (len > AUFS_WH_PFX_LEN
-		    && !memcmp(h_child_name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) {
-			h_child_name += AUFS_WH_PFX_LEN;
-			len -= AUFS_WH_PFX_LEN;
-			wh = 1;
-		}
-	}
-
-	isdir = 0;
-	if (h_child_inode)
-		isdir = !!S_ISDIR(h_child_inode->i_mode);
-	flags[AuHn_PARENT] = AuHnJob_ISDIR;
-	flags[AuHn_CHILD] = 0;
-	if (isdir)
-		flags[AuHn_CHILD] = AuHnJob_ISDIR;
-	au_fset_hnjob(flags[AuHn_PARENT], DIRENT);
-	au_fset_hnjob(flags[AuHn_CHILD], GEN);
-	switch (mask & FS_EVENTS_POSS_ON_CHILD) {
-	case FS_MOVED_FROM:
-	case FS_MOVED_TO:
-		au_fset_hnjob(flags[AuHn_CHILD], XINO0);
-		au_fset_hnjob(flags[AuHn_CHILD], MNTPNT);
-		/*FALLTHROUGH*/
-	case FS_CREATE:
-		AuDebugOn(!h_child_name);
-		break;
-
-	case FS_DELETE:
-		/*
-		 * aufs never be able to get this child inode.
-		 * revalidation should be in d_revalidate()
-		 * by checking i_nlink, i_generation or d_unhashed().
-		 */
-		AuDebugOn(!h_child_name);
-		au_fset_hnjob(flags[AuHn_CHILD], TRYXINO0);
-		au_fset_hnjob(flags[AuHn_CHILD], MNTPNT);
-		break;
-
-	default:
-		AuDebugOn(1);
-	}
-
-	if (wh)
-		h_child_inode = NULL;
-
-	err = -ENOMEM;
-	/* iput() and kfree() will be called in au_hnotify() */
-	args = kmalloc(sizeof(*args) + len + 1, GFP_NOFS);
-	if (unlikely(!args)) {
-		AuErr1("no memory\n");
-		iput(dir);
-		goto out;
-	}
-	args->flags[AuHn_PARENT] = flags[AuHn_PARENT];
-	args->flags[AuHn_CHILD] = flags[AuHn_CHILD];
-	args->mask = mask;
-	args->dir = dir;
-	args->h_dir = igrab(h_dir);
-	if (h_child_inode)
-		h_child_inode = igrab(h_child_inode); /* can be NULL */
-	args->h_child_inode = h_child_inode;
-	args->h_child_nlen = len;
-	if (len) {
-		p = (void *)args;
-		p += sizeof(*args);
-		memcpy(p, h_child_name, len);
-		p[len] = 0;
-	}
-
-	/* NFS fires the event for silly-renamed one from kworker */
-	f = 0;
-	if (!dir->i_nlink
-	    || (au_test_nfs(h_dir->i_sb) && (mask & FS_DELETE)))
-		f = AuWkq_NEST;
-	err = au_wkq_nowait(au_hn_bh, args, dir->i_sb, f);
-	if (unlikely(err)) {
-		pr_err("wkq %d\n", err);
-		iput(args->h_child_inode);
-		iput(args->h_dir);
-		iput(args->dir);
-		kfree(args);
-	}
-
-out:
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int au_hnotify_reset_br(unsigned int udba, struct au_branch *br, int perm)
-{
-	int err;
-
-	AuDebugOn(!(udba & AuOptMask_UDBA));
-
-	err = 0;
-	if (au_hnotify_op.reset_br)
-		err = au_hnotify_op.reset_br(udba, br, perm);
-
-	return err;
-}
-
-int au_hnotify_init_br(struct au_branch *br, int perm)
-{
-	int err;
-
-	err = 0;
-	if (au_hnotify_op.init_br)
-		err = au_hnotify_op.init_br(br, perm);
-
-	return err;
-}
-
-void au_hnotify_fin_br(struct au_branch *br)
-{
-	if (au_hnotify_op.fin_br)
-		au_hnotify_op.fin_br(br);
-}
-
-static void au_hn_destroy_cache(void)
-{
-	kmem_cache_destroy(au_cachep[AuCache_HNOTIFY]);
-	au_cachep[AuCache_HNOTIFY] = NULL;
-}
-
-int __init au_hnotify_init(void)
-{
-	int err;
-
-	err = -ENOMEM;
-	au_cachep[AuCache_HNOTIFY] = AuCache(au_hnotify);
-	if (au_cachep[AuCache_HNOTIFY]) {
-		err = 0;
-		if (au_hnotify_op.init)
-			err = au_hnotify_op.init();
-		if (unlikely(err))
-			au_hn_destroy_cache();
-	}
-	AuTraceErr(err);
-	return err;
-}
-
-void au_hnotify_fin(void)
-{
-	if (au_hnotify_op.fin)
-		au_hnotify_op.fin();
-	/* cf. au_cache_fin() */
-	if (au_cachep[AuCache_HNOTIFY])
-		au_hn_destroy_cache();
-}
diff --git a/fs/aufs/i_op.c b/fs/aufs/i_op.c
deleted file mode 100644
index 6e50526d8..000000000
--- a/fs/aufs/i_op.c
+++ /dev/null
@@ -1,1477 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * inode operations (except add/del/rename)
- */
-
-#include <linux/device_cgroup.h>
-#include <linux/fs_stack.h>
-#include <linux/namei.h>
-#include <linux/security.h>
-#include "aufs.h"
-
-static int h_permission(struct inode *h_inode, int mask,
-			struct vfsmount *h_mnt, int brperm)
-{
-	int err;
-	const unsigned char write_mask = !!(mask & (MAY_WRITE | MAY_APPEND));
-
-	err = -EACCES;
-	if ((write_mask && IS_IMMUTABLE(h_inode))
-	    || ((mask & MAY_EXEC)
-		&& S_ISREG(h_inode->i_mode)
-		&& ((h_mnt->mnt_flags & MNT_NOEXEC)
-		    || !(h_inode->i_mode & S_IXUGO))))
-		goto out;
-
-	/*
-	 * - skip the lower fs test in the case of write to ro branch.
-	 * - nfs dir permission write check is optimized, but a policy for
-	 *   link/rename requires a real check.
-	 * - nfs always sets MS_POSIXACL regardless its mount option 'noacl.'
-	 *   in this case, generic_permission() returns -EOPNOTSUPP.
-	 */
-	if ((write_mask && !au_br_writable(brperm))
-	    || (au_test_nfs(h_inode->i_sb) && S_ISDIR(h_inode->i_mode)
-		&& write_mask && !(mask & MAY_READ))
-	    || !h_inode->i_op->permission) {
-		/* AuLabel(generic_permission); */
-		/* AuDbg("get_acl %pf\n", h_inode->i_op->get_acl); */
-		err = generic_permission(h_inode, mask);
-		if (err == -EOPNOTSUPP && au_test_nfs_noacl(h_inode))
-			err = h_inode->i_op->permission(h_inode, mask);
-		AuTraceErr(err);
-	} else {
-		/* AuLabel(h_inode->permission); */
-		err = h_inode->i_op->permission(h_inode, mask);
-		AuTraceErr(err);
-	}
-
-	if (!err)
-		err = devcgroup_inode_permission(h_inode, mask);
-	if (!err)
-		err = security_inode_permission(h_inode, mask);
-
-#if 0
-	if (!err) {
-		/* todo: do we need to call ima_path_check()? */
-		struct path h_path = {
-			.dentry	=
-			.mnt	= h_mnt
-		};
-		err = ima_path_check(&h_path,
-				     mask & (MAY_READ | MAY_WRITE | MAY_EXEC),
-				     IMA_COUNT_LEAVE);
-	}
-#endif
-
-out:
-	return err;
-}
-
-static int aufs_permission(struct inode *inode, int mask)
-{
-	int err;
-	aufs_bindex_t bindex, bend;
-	const unsigned char isdir = !!S_ISDIR(inode->i_mode),
-		write_mask = !!(mask & (MAY_WRITE | MAY_APPEND));
-	struct inode *h_inode;
-	struct super_block *sb;
-	struct au_branch *br;
-
-	/* todo: support rcu-walk? */
-	if (mask & MAY_NOT_BLOCK)
-		return -ECHILD;
-
-	sb = inode->i_sb;
-	si_read_lock(sb, AuLock_FLUSH);
-	ii_read_lock_child(inode);
-#if 0
-	err = au_iigen_test(inode, au_sigen(sb));
-	if (unlikely(err))
-		goto out;
-#endif
-
-	if (!isdir
-	    || write_mask
-	    || au_opt_test(au_mntflags(sb), DIRPERM1)) {
-		err = au_busy_or_stale();
-		h_inode = au_h_iptr(inode, au_ibstart(inode));
-		if (unlikely(!h_inode
-			     || (h_inode->i_mode & S_IFMT)
-			     != (inode->i_mode & S_IFMT)))
-			goto out;
-
-		err = 0;
-		bindex = au_ibstart(inode);
-		br = au_sbr(sb, bindex);
-		err = h_permission(h_inode, mask, au_br_mnt(br), br->br_perm);
-		if (write_mask
-		    && !err
-		    && !special_file(h_inode->i_mode)) {
-			/* test whether the upper writable branch exists */
-			err = -EROFS;
-			for (; bindex >= 0; bindex--)
-				if (!au_br_rdonly(au_sbr(sb, bindex))) {
-					err = 0;
-					break;
-				}
-		}
-		goto out;
-	}
-
-	/* non-write to dir */
-	err = 0;
-	bend = au_ibend(inode);
-	for (bindex = au_ibstart(inode); !err && bindex <= bend; bindex++) {
-		h_inode = au_h_iptr(inode, bindex);
-		if (h_inode) {
-			err = au_busy_or_stale();
-			if (unlikely(!S_ISDIR(h_inode->i_mode)))
-				break;
-
-			br = au_sbr(sb, bindex);
-			err = h_permission(h_inode, mask, au_br_mnt(br),
-					   br->br_perm);
-		}
-	}
-
-out:
-	ii_read_unlock(inode);
-	si_read_unlock(sb);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static struct dentry *aufs_lookup(struct inode *dir, struct dentry *dentry,
-				  unsigned int flags)
-{
-	struct dentry *ret, *parent;
-	struct inode *inode;
-	struct super_block *sb;
-	int err, npositive;
-
-	IMustLock(dir);
-
-	/* todo: support rcu-walk? */
-	ret = ERR_PTR(-ECHILD);
-	if (flags & LOOKUP_RCU)
-		goto out;
-
-	ret = ERR_PTR(-ENAMETOOLONG);
-	if (unlikely(dentry->d_name.len > AUFS_MAX_NAMELEN))
-		goto out;
-
-	sb = dir->i_sb;
-	err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM);
-	ret = ERR_PTR(err);
-	if (unlikely(err))
-		goto out;
-
-	err = au_di_init(dentry);
-	ret = ERR_PTR(err);
-	if (unlikely(err))
-		goto out_si;
-
-	inode = NULL;
-	npositive = 0; /* suppress a warning */
-	parent = dentry->d_parent; /* dir inode is locked */
-	di_read_lock_parent(parent, AuLock_IR);
-	err = au_alive_dir(parent);
-	if (!err)
-		err = au_digen_test(parent, au_sigen(sb));
-	if (!err) {
-		npositive = au_lkup_dentry(dentry, au_dbstart(parent),
-					   /*type*/0);
-		err = npositive;
-	}
-	di_read_unlock(parent, AuLock_IR);
-	ret = ERR_PTR(err);
-	if (unlikely(err < 0))
-		goto out_unlock;
-
-	if (npositive) {
-		inode = au_new_inode(dentry, /*must_new*/0);
-		if (IS_ERR(inode)) {
-			ret = (void *)inode;
-			inode = NULL;
-			goto out_unlock;
-		}
-	}
-
-	if (inode)
-		atomic_inc(&inode->i_count);
-	ret = d_splice_alias(inode, dentry);
-#if 0
-	if (unlikely(d_need_lookup(dentry))) {
-		spin_lock(&dentry->d_lock);
-		dentry->d_flags &= ~DCACHE_NEED_LOOKUP;
-		spin_unlock(&dentry->d_lock);
-	} else
-#endif
-	if (inode) {
-		if (!IS_ERR(ret)) {
-			iput(inode);
-			if (ret && ret != dentry)
-				ii_write_unlock(inode);
-		} else {
-			ii_write_unlock(inode);
-			iput(inode);
-			inode = NULL;
-		}
-	}
-
-out_unlock:
-	di_write_unlock(dentry);
-	if (inode) {
-		/* verbose coding for lock class name */
-		if (unlikely(S_ISLNK(inode->i_mode)))
-			au_rw_class(&au_di(dentry)->di_rwsem,
-				    au_lc_key + AuLcSymlink_DIINFO);
-		else if (unlikely(S_ISDIR(inode->i_mode)))
-			au_rw_class(&au_di(dentry)->di_rwsem,
-				    au_lc_key + AuLcDir_DIINFO);
-		else /* likely */
-			au_rw_class(&au_di(dentry)->di_rwsem,
-				    au_lc_key + AuLcNonDir_DIINFO);
-	}
-out_si:
-	si_read_unlock(sb);
-out:
-	return ret;
-}
-
-/* ---------------------------------------------------------------------- */
-
-struct aopen_node {
-	struct hlist_node hlist;
-	struct file *file, *h_file;
-};
-
-static int au_do_aopen(struct inode *inode, struct file *file)
-{
-	struct au_sphlhead *aopen;
-	struct aopen_node *node;
-	struct au_do_open_args args = {
-		.no_lock	= 1,
-		.open		= au_do_open_nondir
-	};
-
-	aopen = &au_sbi(inode->i_sb)->si_aopen;
-	spin_lock(&aopen->spin);
-	hlist_for_each_entry(node, &aopen->head, hlist)
-		if (node->file == file) {
-			args.h_file = node->h_file;
-			break;
-		}
-	spin_unlock(&aopen->spin);
-	/* AuDebugOn(!args.h_file); */
-
-	return au_do_open(file, &args);
-}
-
-static int aufs_atomic_open(struct inode *dir, struct dentry *dentry,
-			    struct file *file, unsigned int open_flag,
-			    umode_t create_mode, int *opened)
-{
-	int err, h_opened = *opened;
-	struct dentry *parent;
-	struct dentry *d;
-	struct au_sphlhead *aopen;
-	struct vfsub_aopen_args args = {
-		.open_flag	= open_flag,
-		.create_mode	= create_mode,
-		.opened		= &h_opened
-	};
-	struct aopen_node aopen_node = {
-		.file	= file
-	};
-
-	IMustLock(dir);
-	AuDbg("open_flag 0x%x\n", open_flag);
-	AuDbgDentry(dentry);
-
-	err = 0;
-	if (!au_di(dentry)) {
-		d = aufs_lookup(dir, dentry, /*flags*/0);
-		if (IS_ERR(d)) {
-			err = PTR_ERR(d);
-			goto out;
-		} else if (d) {
-			/*
-			 * obsoleted dentry found.
-			 * another error will be returned later.
-			 */
-			d_drop(d);
-			dput(d);
-			AuDbgDentry(d);
-		}
-		AuDbgDentry(dentry);
-	}
-
-	if (d_is_positive(dentry)
-	    || d_unhashed(dentry)
-	    || d_unlinked(dentry)
-	    || !(open_flag & O_CREAT))
-		goto out_no_open;
-
-	err = aufs_read_lock(dentry, AuLock_DW | AuLock_FLUSH | AuLock_GEN);
-	if (unlikely(err))
-		goto out;
-
-	parent = dentry->d_parent;	/* dir is locked */
-	di_write_lock_parent(parent);
-	err = au_lkup_dentry(dentry, /*bstart*/0, /*type*/0);
-	if (unlikely(err))
-		goto out_unlock;
-
-	AuDbgDentry(dentry);
-	if (d_is_positive(dentry))
-		goto out_unlock;
-
-	args.file = get_empty_filp();
-	err = PTR_ERR(args.file);
-	if (IS_ERR(args.file))
-		goto out_unlock;
-
-	args.file->f_flags = file->f_flags;
-	err = au_aopen_or_create(dir, dentry, &args);
-	AuTraceErr(err);
-	AuDbgFile(args.file);
-	if (unlikely(err < 0)) {
-		if (h_opened & FILE_OPENED)
-			fput(args.file);
-		else
-			put_filp(args.file);
-		goto out_unlock;
-	}
-
-	/* some filesystems don't set FILE_CREATED while succeeded? */
-	*opened |= FILE_CREATED;
-	if (h_opened & FILE_OPENED)
-		aopen_node.h_file = args.file;
-	else {
-		put_filp(args.file);
-		args.file = NULL;
-	}
-	aopen = &au_sbi(dir->i_sb)->si_aopen;
-	au_sphl_add(&aopen_node.hlist, aopen);
-	err = finish_open(file, dentry, au_do_aopen, opened);
-	au_sphl_del(&aopen_node.hlist, aopen);
-	AuTraceErr(err);
-	AuDbgFile(file);
-	if (aopen_node.h_file)
-		fput(aopen_node.h_file);
-
-out_unlock:
-	di_write_unlock(parent);
-	aufs_read_unlock(dentry, AuLock_DW);
-	AuDbgDentry(dentry);
-	if (unlikely(err))
-		goto out;
-out_no_open:
-	if (!err && !(*opened & FILE_CREATED)) {
-		AuLabel(out_no_open);
-		dget(dentry);
-		err = finish_no_open(file, dentry);
-	}
-out:
-	AuDbg("%pd%s%s\n", dentry,
-	      (*opened & FILE_CREATED) ? " created" : "",
-	      (*opened & FILE_OPENED) ? " opened" : "");
-	AuTraceErr(err);
-	return err;
-}
-
-
-/* ---------------------------------------------------------------------- */
-
-static int au_wr_dir_cpup(struct dentry *dentry, struct dentry *parent,
-			  const unsigned char add_entry, aufs_bindex_t bcpup,
-			  aufs_bindex_t bstart)
-{
-	int err;
-	struct dentry *h_parent;
-	struct inode *h_dir;
-
-	if (add_entry)
-		IMustLock(d_inode(parent));
-	else
-		di_write_lock_parent(parent);
-
-	err = 0;
-	if (!au_h_dptr(parent, bcpup)) {
-		if (bstart > bcpup)
-			err = au_cpup_dirs(dentry, bcpup);
-		else if (bstart < bcpup)
-			err = au_cpdown_dirs(dentry, bcpup);
-		else
-			BUG();
-	}
-	if (!err && add_entry && !au_ftest_wrdir(add_entry, TMPFILE)) {
-		h_parent = au_h_dptr(parent, bcpup);
-		h_dir = d_inode(h_parent);
-		mutex_lock_nested(&h_dir->i_mutex, AuLsc_I_PARENT);
-		err = au_lkup_neg(dentry, bcpup, /*wh*/0);
-		/* todo: no unlock here */
-		mutex_unlock(&h_dir->i_mutex);
-
-		AuDbg("bcpup %d\n", bcpup);
-		if (!err) {
-			if (d_really_is_negative(dentry))
-				au_set_h_dptr(dentry, bstart, NULL);
-			au_update_dbrange(dentry, /*do_put_zero*/0);
-		}
-	}
-
-	if (!add_entry)
-		di_write_unlock(parent);
-	if (!err)
-		err = bcpup; /* success */
-
-	AuTraceErr(err);
-	return err;
-}
-
-/*
- * decide the branch and the parent dir where we will create a new entry.
- * returns new bindex or an error.
- * copyup the parent dir if needed.
- */
-int au_wr_dir(struct dentry *dentry, struct dentry *src_dentry,
-	      struct au_wr_dir_args *args)
-{
-	int err;
-	unsigned int flags;
-	aufs_bindex_t bcpup, bstart, src_bstart;
-	const unsigned char add_entry
-		= au_ftest_wrdir(args->flags, ADD_ENTRY)
-		| au_ftest_wrdir(args->flags, TMPFILE);
-	struct super_block *sb;
-	struct dentry *parent;
-	struct au_sbinfo *sbinfo;
-
-	sb = dentry->d_sb;
-	sbinfo = au_sbi(sb);
-	parent = dget_parent(dentry);
-	bstart = au_dbstart(dentry);
-	bcpup = bstart;
-	if (args->force_btgt < 0) {
-		if (src_dentry) {
-			src_bstart = au_dbstart(src_dentry);
-			if (src_bstart < bstart)
-				bcpup = src_bstart;
-		} else if (add_entry) {
-			flags = 0;
-			if (au_ftest_wrdir(args->flags, ISDIR))
-				au_fset_wbr(flags, DIR);
-			err = AuWbrCreate(sbinfo, dentry, flags);
-			bcpup = err;
-		}
-
-		if (bcpup < 0 || au_test_ro(sb, bcpup, d_inode(dentry))) {
-			if (add_entry)
-				err = AuWbrCopyup(sbinfo, dentry);
-			else {
-				if (!IS_ROOT(dentry)) {
-					di_read_lock_parent(parent, !AuLock_IR);
-					err = AuWbrCopyup(sbinfo, dentry);
-					di_read_unlock(parent, !AuLock_IR);
-				} else
-					err = AuWbrCopyup(sbinfo, dentry);
-			}
-			bcpup = err;
-			if (unlikely(err < 0))
-				goto out;
-		}
-	} else {
-		bcpup = args->force_btgt;
-		AuDebugOn(au_test_ro(sb, bcpup, d_inode(dentry)));
-	}
-
-	AuDbg("bstart %d, bcpup %d\n", bstart, bcpup);
-	err = bcpup;
-	if (bcpup == bstart)
-		goto out; /* success */
-
-	/* copyup the new parent into the branch we process */
-	err = au_wr_dir_cpup(dentry, parent, add_entry, bcpup, bstart);
-	if (err >= 0) {
-		if (d_really_is_negative(dentry)) {
-			au_set_h_dptr(dentry, bstart, NULL);
-			au_set_dbstart(dentry, bcpup);
-			au_set_dbend(dentry, bcpup);
-		}
-		AuDebugOn(add_entry
-			  && !au_ftest_wrdir(args->flags, TMPFILE)
-			  && !au_h_dptr(dentry, bcpup));
-	}
-
-out:
-	dput(parent);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void au_pin_hdir_unlock(struct au_pin *p)
-{
-	if (p->hdir)
-		au_hn_imtx_unlock(p->hdir);
-}
-
-int au_pin_hdir_lock(struct au_pin *p)
-{
-	int err;
-
-	err = 0;
-	if (!p->hdir)
-		goto out;
-
-	/* even if an error happens later, keep this lock */
-	au_hn_imtx_lock_nested(p->hdir, p->lsc_hi);
-
-	err = -EBUSY;
-	if (unlikely(p->hdir->hi_inode != d_inode(p->h_parent)))
-		goto out;
-
-	err = 0;
-	if (p->h_dentry)
-		err = au_h_verify(p->h_dentry, p->udba, p->hdir->hi_inode,
-				  p->h_parent, p->br);
-
-out:
-	return err;
-}
-
-int au_pin_hdir_relock(struct au_pin *p)
-{
-	int err, i;
-	struct inode *h_i;
-	struct dentry *h_d[] = {
-		p->h_dentry,
-		p->h_parent
-	};
-
-	err = au_pin_hdir_lock(p);
-	if (unlikely(err))
-		goto out;
-
-	for (i = 0; !err && i < sizeof(h_d)/sizeof(*h_d); i++) {
-		if (!h_d[i])
-			continue;
-		if (d_is_positive(h_d[i])) {
-			h_i = d_inode(h_d[i]);
-			err = !h_i->i_nlink;
-		}
-	}
-
-out:
-	return err;
-}
-
-void au_pin_hdir_set_owner(struct au_pin *p, struct task_struct *task)
-{
-#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP)
-	p->hdir->hi_inode->i_mutex.owner = task;
-#endif
-}
-
-void au_pin_hdir_acquire_nest(struct au_pin *p)
-{
-	if (p->hdir) {
-		mutex_acquire_nest(&p->hdir->hi_inode->i_mutex.dep_map,
-				   p->lsc_hi, 0, NULL, _RET_IP_);
-		au_pin_hdir_set_owner(p, current);
-	}
-}
-
-void au_pin_hdir_release(struct au_pin *p)
-{
-	if (p->hdir) {
-		au_pin_hdir_set_owner(p, p->task);
-		mutex_release(&p->hdir->hi_inode->i_mutex.dep_map, 1, _RET_IP_);
-	}
-}
-
-struct dentry *au_pinned_h_parent(struct au_pin *pin)
-{
-	if (pin && pin->parent)
-		return au_h_dptr(pin->parent, pin->bindex);
-	return NULL;
-}
-
-void au_unpin(struct au_pin *p)
-{
-	if (p->hdir)
-		au_pin_hdir_unlock(p);
-	if (p->h_mnt && au_ftest_pin(p->flags, MNT_WRITE))
-		vfsub_mnt_drop_write(p->h_mnt);
-	if (!p->hdir)
-		return;
-
-	if (!au_ftest_pin(p->flags, DI_LOCKED))
-		di_read_unlock(p->parent, AuLock_IR);
-	iput(p->hdir->hi_inode);
-	dput(p->parent);
-	p->parent = NULL;
-	p->hdir = NULL;
-	p->h_mnt = NULL;
-	/* do not clear p->task */
-}
-
-int au_do_pin(struct au_pin *p)
-{
-	int err;
-	struct super_block *sb;
-	struct inode *h_dir;
-
-	err = 0;
-	sb = p->dentry->d_sb;
-	p->br = au_sbr(sb, p->bindex);
-	if (IS_ROOT(p->dentry)) {
-		if (au_ftest_pin(p->flags, MNT_WRITE)) {
-			p->h_mnt = au_br_mnt(p->br);
-			err = vfsub_mnt_want_write(p->h_mnt);
-			if (unlikely(err)) {
-				au_fclr_pin(p->flags, MNT_WRITE);
-				goto out_err;
-			}
-		}
-		goto out;
-	}
-
-	p->h_dentry = NULL;
-	if (p->bindex <= au_dbend(p->dentry))
-		p->h_dentry = au_h_dptr(p->dentry, p->bindex);
-
-	p->parent = dget_parent(p->dentry);
-	if (!au_ftest_pin(p->flags, DI_LOCKED))
-		di_read_lock(p->parent, AuLock_IR, p->lsc_di);
-
-	h_dir = NULL;
-	p->h_parent = au_h_dptr(p->parent, p->bindex);
-	p->hdir = au_hi(d_inode(p->parent), p->bindex);
-	if (p->hdir)
-		h_dir = p->hdir->hi_inode;
-
-	/*
-	 * udba case, or
-	 * if DI_LOCKED is not set, then p->parent may be different
-	 * and h_parent can be NULL.
-	 */
-	if (unlikely(!p->hdir || !h_dir || !p->h_parent)) {
-		err = -EBUSY;
-		if (!au_ftest_pin(p->flags, DI_LOCKED))
-			di_read_unlock(p->parent, AuLock_IR);
-		dput(p->parent);
-		p->parent = NULL;
-		goto out_err;
-	}
-
-	if (au_ftest_pin(p->flags, MNT_WRITE)) {
-		p->h_mnt = au_br_mnt(p->br);
-		err = vfsub_mnt_want_write(p->h_mnt);
-		if (unlikely(err)) {
-			au_fclr_pin(p->flags, MNT_WRITE);
-			if (!au_ftest_pin(p->flags, DI_LOCKED))
-				di_read_unlock(p->parent, AuLock_IR);
-			dput(p->parent);
-			p->parent = NULL;
-			goto out_err;
-		}
-	}
-
-	au_igrab(h_dir);
-	err = au_pin_hdir_lock(p);
-	if (!err)
-		goto out; /* success */
-
-	au_unpin(p);
-
-out_err:
-	pr_err("err %d\n", err);
-	err = au_busy_or_stale();
-out:
-	return err;
-}
-
-void au_pin_init(struct au_pin *p, struct dentry *dentry,
-		 aufs_bindex_t bindex, int lsc_di, int lsc_hi,
-		 unsigned int udba, unsigned char flags)
-{
-	p->dentry = dentry;
-	p->udba = udba;
-	p->lsc_di = lsc_di;
-	p->lsc_hi = lsc_hi;
-	p->flags = flags;
-	p->bindex = bindex;
-
-	p->parent = NULL;
-	p->hdir = NULL;
-	p->h_mnt = NULL;
-
-	p->h_dentry = NULL;
-	p->h_parent = NULL;
-	p->br = NULL;
-	p->task = current;
-}
-
-int au_pin(struct au_pin *pin, struct dentry *dentry, aufs_bindex_t bindex,
-	   unsigned int udba, unsigned char flags)
-{
-	au_pin_init(pin, dentry, bindex, AuLsc_DI_PARENT, AuLsc_I_PARENT2,
-		    udba, flags);
-	return au_do_pin(pin);
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * ->setattr() and ->getattr() are called in various cases.
- * chmod, stat: dentry is revalidated.
- * fchmod, fstat: file and dentry are not revalidated, additionally they may be
- *		  unhashed.
- * for ->setattr(), ia->ia_file is passed from ftruncate only.
- */
-/* todo: consolidate with do_refresh() and simple_reval_dpath() */
-int au_reval_for_attr(struct dentry *dentry, unsigned int sigen)
-{
-	int err;
-	struct dentry *parent;
-
-	err = 0;
-	if (au_digen_test(dentry, sigen)) {
-		parent = dget_parent(dentry);
-		di_read_lock_parent(parent, AuLock_IR);
-		err = au_refresh_dentry(dentry, parent);
-		di_read_unlock(parent, AuLock_IR);
-		dput(parent);
-	}
-
-	AuTraceErr(err);
-	return err;
-}
-
-int au_pin_and_icpup(struct dentry *dentry, struct iattr *ia,
-		     struct au_icpup_args *a)
-{
-	int err;
-	loff_t sz;
-	aufs_bindex_t bstart, ibstart;
-	struct dentry *hi_wh, *parent;
-	struct inode *inode;
-	struct au_wr_dir_args wr_dir_args = {
-		.force_btgt	= -1,
-		.flags		= 0
-	};
-
-	if (d_is_dir(dentry))
-		au_fset_wrdir(wr_dir_args.flags, ISDIR);
-	/* plink or hi_wh() case */
-	bstart = au_dbstart(dentry);
-	inode = d_inode(dentry);
-	ibstart = au_ibstart(inode);
-	if (bstart != ibstart && !au_test_ro(inode->i_sb, ibstart, inode))
-		wr_dir_args.force_btgt = ibstart;
-	err = au_wr_dir(dentry, /*src_dentry*/NULL, &wr_dir_args);
-	if (unlikely(err < 0))
-		goto out;
-	a->btgt = err;
-	if (err != bstart)
-		au_fset_icpup(a->flags, DID_CPUP);
-
-	err = 0;
-	a->pin_flags = AuPin_MNT_WRITE;
-	parent = NULL;
-	if (!IS_ROOT(dentry)) {
-		au_fset_pin(a->pin_flags, DI_LOCKED);
-		parent = dget_parent(dentry);
-		di_write_lock_parent(parent);
-	}
-
-	err = au_pin(&a->pin, dentry, a->btgt, a->udba, a->pin_flags);
-	if (unlikely(err))
-		goto out_parent;
-
-	a->h_path.dentry = au_h_dptr(dentry, bstart);
-	sz = -1;
-	a->h_inode = d_inode(a->h_path.dentry);
-	if (ia && (ia->ia_valid & ATTR_SIZE)) {
-		mutex_lock_nested(&a->h_inode->i_mutex, AuLsc_I_CHILD);
-		if (ia->ia_size < i_size_read(a->h_inode))
-			sz = ia->ia_size;
-		mutex_unlock(&a->h_inode->i_mutex);
-	}
-
-	hi_wh = NULL;
-	if (au_ftest_icpup(a->flags, DID_CPUP) && d_unlinked(dentry)) {
-		hi_wh = au_hi_wh(inode, a->btgt);
-		if (!hi_wh) {
-			struct au_cp_generic cpg = {
-				.dentry	= dentry,
-				.bdst	= a->btgt,
-				.bsrc	= -1,
-				.len	= sz,
-				.pin	= &a->pin
-			};
-			err = au_sio_cpup_wh(&cpg, /*file*/NULL);
-			if (unlikely(err))
-				goto out_unlock;
-			hi_wh = au_hi_wh(inode, a->btgt);
-			/* todo: revalidate hi_wh? */
-		}
-	}
-
-	if (parent) {
-		au_pin_set_parent_lflag(&a->pin, /*lflag*/0);
-		di_downgrade_lock(parent, AuLock_IR);
-		dput(parent);
-		parent = NULL;
-	}
-	if (!au_ftest_icpup(a->flags, DID_CPUP))
-		goto out; /* success */
-
-	if (!d_unhashed(dentry)) {
-		struct au_cp_generic cpg = {
-			.dentry	= dentry,
-			.bdst	= a->btgt,
-			.bsrc	= bstart,
-			.len	= sz,
-			.pin	= &a->pin,
-			.flags	= AuCpup_DTIME | AuCpup_HOPEN
-		};
-		err = au_sio_cpup_simple(&cpg);
-		if (!err)
-			a->h_path.dentry = au_h_dptr(dentry, a->btgt);
-	} else if (!hi_wh)
-		a->h_path.dentry = au_h_dptr(dentry, a->btgt);
-	else
-		a->h_path.dentry = hi_wh; /* do not dget here */
-
-out_unlock:
-	a->h_inode = d_inode(a->h_path.dentry);
-	if (!err)
-		goto out; /* success */
-	au_unpin(&a->pin);
-out_parent:
-	if (parent) {
-		di_write_unlock(parent);
-		dput(parent);
-	}
-out:
-	if (!err)
-		mutex_lock_nested(&a->h_inode->i_mutex, AuLsc_I_CHILD);
-	return err;
-}
-
-static int aufs_setattr(struct dentry *dentry, struct iattr *ia)
-{
-	int err;
-	struct inode *inode, *delegated;
-	struct super_block *sb;
-	struct file *file;
-	struct au_icpup_args *a;
-
-	inode = d_inode(dentry);
-	IMustLock(inode);
-
-	err = -ENOMEM;
-	a = kzalloc(sizeof(*a), GFP_NOFS);
-	if (unlikely(!a))
-		goto out;
-
-	if (ia->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID))
-		ia->ia_valid &= ~ATTR_MODE;
-
-	file = NULL;
-	sb = dentry->d_sb;
-	err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM);
-	if (unlikely(err))
-		goto out_kfree;
-
-	if (ia->ia_valid & ATTR_FILE) {
-		/* currently ftruncate(2) only */
-		AuDebugOn(!d_is_reg(dentry));
-		file = ia->ia_file;
-		err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1);
-		if (unlikely(err))
-			goto out_si;
-		ia->ia_file = au_hf_top(file);
-		a->udba = AuOpt_UDBA_NONE;
-	} else {
-		/* fchmod() doesn't pass ia_file */
-		a->udba = au_opt_udba(sb);
-		di_write_lock_child(dentry);
-		/* no d_unlinked(), to set UDBA_NONE for root */
-		if (d_unhashed(dentry))
-			a->udba = AuOpt_UDBA_NONE;
-		if (a->udba != AuOpt_UDBA_NONE) {
-			AuDebugOn(IS_ROOT(dentry));
-			err = au_reval_for_attr(dentry, au_sigen(sb));
-			if (unlikely(err))
-				goto out_dentry;
-		}
-	}
-
-	err = au_pin_and_icpup(dentry, ia, a);
-	if (unlikely(err < 0))
-		goto out_dentry;
-	if (au_ftest_icpup(a->flags, DID_CPUP)) {
-		ia->ia_file = NULL;
-		ia->ia_valid &= ~ATTR_FILE;
-	}
-
-	a->h_path.mnt = au_sbr_mnt(sb, a->btgt);
-	if ((ia->ia_valid & (ATTR_MODE | ATTR_CTIME))
-	    == (ATTR_MODE | ATTR_CTIME)) {
-		err = security_path_chmod(&a->h_path, ia->ia_mode);
-		if (unlikely(err))
-			goto out_unlock;
-	} else if ((ia->ia_valid & (ATTR_UID | ATTR_GID))
-		   && (ia->ia_valid & ATTR_CTIME)) {
-		err = security_path_chown(&a->h_path, ia->ia_uid, ia->ia_gid);
-		if (unlikely(err))
-			goto out_unlock;
-	}
-
-	if (ia->ia_valid & ATTR_SIZE) {
-		struct file *f;
-
-		if (ia->ia_size < i_size_read(inode))
-			/* unmap only */
-			truncate_setsize(inode, ia->ia_size);
-
-		f = NULL;
-		if (ia->ia_valid & ATTR_FILE)
-			f = ia->ia_file;
-		mutex_unlock(&a->h_inode->i_mutex);
-		err = vfsub_trunc(&a->h_path, ia->ia_size, ia->ia_valid, f);
-		mutex_lock_nested(&a->h_inode->i_mutex, AuLsc_I_CHILD);
-	} else {
-		delegated = NULL;
-		while (1) {
-			err = vfsub_notify_change(&a->h_path, ia, &delegated);
-			if (delegated) {
-				err = break_deleg_wait(&delegated);
-				if (!err)
-					continue;
-			}
-			break;
-		}
-	}
-	/*
-	 * regardless aufs 'acl' option setting.
-	 * why don't all acl-aware fs call this func from their ->setattr()?
-	 */
-	if (!err && (ia->ia_valid & ATTR_MODE))
-		err = vfsub_acl_chmod(a->h_inode, ia->ia_mode);
-	if (!err)
-		au_cpup_attr_changeable(inode);
-
-out_unlock:
-	mutex_unlock(&a->h_inode->i_mutex);
-	au_unpin(&a->pin);
-	if (unlikely(err))
-		au_update_dbstart(dentry);
-out_dentry:
-	di_write_unlock(dentry);
-	if (file) {
-		fi_write_unlock(file);
-		ia->ia_file = file;
-		ia->ia_valid |= ATTR_FILE;
-	}
-out_si:
-	si_read_unlock(sb);
-out_kfree:
-	kfree(a);
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-#if IS_ENABLED(CONFIG_AUFS_XATTR) || IS_ENABLED(CONFIG_FS_POSIX_ACL)
-static int au_h_path_to_set_attr(struct dentry *dentry,
-				 struct au_icpup_args *a, struct path *h_path)
-{
-	int err;
-	struct super_block *sb;
-
-	sb = dentry->d_sb;
-	a->udba = au_opt_udba(sb);
-	/* no d_unlinked(), to set UDBA_NONE for root */
-	if (d_unhashed(dentry))
-		a->udba = AuOpt_UDBA_NONE;
-	if (a->udba != AuOpt_UDBA_NONE) {
-		AuDebugOn(IS_ROOT(dentry));
-		err = au_reval_for_attr(dentry, au_sigen(sb));
-		if (unlikely(err))
-			goto out;
-	}
-	err = au_pin_and_icpup(dentry, /*ia*/NULL, a);
-	if (unlikely(err < 0))
-		goto out;
-
-	h_path->dentry = a->h_path.dentry;
-	h_path->mnt = au_sbr_mnt(sb, a->btgt);
-
-out:
-	return err;
-}
-
-ssize_t au_srxattr(struct dentry *dentry, struct au_srxattr *arg)
-{
-	int err;
-	struct path h_path;
-	struct super_block *sb;
-	struct au_icpup_args *a;
-	struct inode *inode, *h_inode;
-
-	inode = d_inode(dentry);
-	IMustLock(inode);
-
-	err = -ENOMEM;
-	a = kzalloc(sizeof(*a), GFP_NOFS);
-	if (unlikely(!a))
-		goto out;
-
-	sb = dentry->d_sb;
-	err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM);
-	if (unlikely(err))
-		goto out_kfree;
-
-	h_path.dentry = NULL;	/* silence gcc */
-	di_write_lock_child(dentry);
-	err = au_h_path_to_set_attr(dentry, a, &h_path);
-	if (unlikely(err))
-		goto out_di;
-
-	mutex_unlock(&a->h_inode->i_mutex);
-	switch (arg->type) {
-	case AU_XATTR_SET:
-		err = vfsub_setxattr(h_path.dentry,
-				     arg->u.set.name, arg->u.set.value,
-				     arg->u.set.size, arg->u.set.flags);
-		break;
-	case AU_XATTR_REMOVE:
-		err = vfsub_removexattr(h_path.dentry, arg->u.remove.name);
-		break;
-	case AU_ACL_SET:
-		err = -EOPNOTSUPP;
-		h_inode = d_inode(h_path.dentry);
-		if (h_inode->i_op->set_acl)
-			err = h_inode->i_op->set_acl(h_inode,
-						     arg->u.acl_set.acl,
-						     arg->u.acl_set.type);
-		break;
-	}
-	if (!err)
-		au_cpup_attr_timesizes(inode);
-
-	au_unpin(&a->pin);
-	if (unlikely(err))
-		au_update_dbstart(dentry);
-
-out_di:
-	di_write_unlock(dentry);
-	si_read_unlock(sb);
-out_kfree:
-	kfree(a);
-out:
-	AuTraceErr(err);
-	return err;
-}
-#endif
-
-static void au_refresh_iattr(struct inode *inode, struct kstat *st,
-			     unsigned int nlink)
-{
-	unsigned int n;
-
-	inode->i_mode = st->mode;
-	/* don't i_[ug]id_write() here */
-	inode->i_uid = st->uid;
-	inode->i_gid = st->gid;
-	inode->i_atime = st->atime;
-	inode->i_mtime = st->mtime;
-	inode->i_ctime = st->ctime;
-
-	au_cpup_attr_nlink(inode, /*force*/0);
-	if (S_ISDIR(inode->i_mode)) {
-		n = inode->i_nlink;
-		n -= nlink;
-		n += st->nlink;
-		smp_mb(); /* for i_nlink */
-		/* 0 can happen */
-		set_nlink(inode, n);
-	}
-
-	spin_lock(&inode->i_lock);
-	inode->i_blocks = st->blocks;
-	i_size_write(inode, st->size);
-	spin_unlock(&inode->i_lock);
-}
-
-/*
- * common routine for aufs_getattr() and aufs_getxattr().
- * returns zero or negative (an error).
- * @dentry will be read-locked in success.
- */
-int au_h_path_getattr(struct dentry *dentry, int force, struct path *h_path)
-{
-	int err;
-	unsigned int mnt_flags, sigen;
-	unsigned char udba_none;
-	aufs_bindex_t bindex;
-	struct super_block *sb, *h_sb;
-	struct inode *inode;
-
-	h_path->mnt = NULL;
-	h_path->dentry = NULL;
-
-	err = 0;
-	sb = dentry->d_sb;
-	mnt_flags = au_mntflags(sb);
-	udba_none = !!au_opt_test(mnt_flags, UDBA_NONE);
-
-	/* support fstat(2) */
-	if (!d_unlinked(dentry) && !udba_none) {
-		sigen = au_sigen(sb);
-		err = au_digen_test(dentry, sigen);
-		if (!err) {
-			di_read_lock_child(dentry, AuLock_IR);
-			err = au_dbrange_test(dentry);
-			if (unlikely(err)) {
-				di_read_unlock(dentry, AuLock_IR);
-				goto out;
-			}
-		} else {
-			AuDebugOn(IS_ROOT(dentry));
-			di_write_lock_child(dentry);
-			err = au_dbrange_test(dentry);
-			if (!err)
-				err = au_reval_for_attr(dentry, sigen);
-			if (!err)
-				di_downgrade_lock(dentry, AuLock_IR);
-			else {
-				di_write_unlock(dentry);
-				goto out;
-			}
-		}
-	} else
-		di_read_lock_child(dentry, AuLock_IR);
-
-	inode = d_inode(dentry);
-	bindex = au_ibstart(inode);
-	h_path->mnt = au_sbr_mnt(sb, bindex);
-	h_sb = h_path->mnt->mnt_sb;
-	if (!force
-	    && !au_test_fs_bad_iattr(h_sb)
-	    && udba_none)
-		goto out; /* success */
-
-	if (au_dbstart(dentry) == bindex)
-		h_path->dentry = au_h_dptr(dentry, bindex);
-	else if (au_opt_test(mnt_flags, PLINK) && au_plink_test(inode)) {
-		h_path->dentry = au_plink_lkup(inode, bindex);
-		if (IS_ERR(h_path->dentry))
-			/* pretending success */
-			h_path->dentry = NULL;
-		else
-			dput(h_path->dentry);
-	}
-
-out:
-	return err;
-}
-
-static int aufs_getattr(struct vfsmount *mnt __maybe_unused,
-			struct dentry *dentry, struct kstat *st)
-{
-	int err;
-	unsigned char positive;
-	struct path h_path;
-	struct inode *inode;
-	struct super_block *sb;
-
-	inode = d_inode(dentry);
-	sb = dentry->d_sb;
-	err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM);
-	if (unlikely(err))
-		goto out;
-	err = au_h_path_getattr(dentry, /*force*/0, &h_path);
-	if (unlikely(err))
-		goto out_si;
-	if (unlikely(!h_path.dentry))
-		/* illegally overlapped or something */
-		goto out_fill; /* pretending success */
-
-	positive = d_is_positive(h_path.dentry);
-	if (positive)
-		err = vfs_getattr(&h_path, st);
-	if (!err) {
-		if (positive)
-			au_refresh_iattr(inode, st,
-					 d_inode(h_path.dentry)->i_nlink);
-		goto out_fill; /* success */
-	}
-	AuTraceErr(err);
-	goto out_di;
-
-out_fill:
-	generic_fillattr(inode, st);
-out_di:
-	di_read_unlock(dentry, AuLock_IR);
-out_si:
-	si_read_unlock(sb);
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * Assumption:
- * - the number of symlinks is not so many.
- *
- * Structure:
- * - sbinfo (instead of iinfo) contains an hlist of struct au_symlink.
- *   If iinfo contained the hlist, then it would be rather large waste of memory
- *   I am afraid.
- * - struct au_symlink contains the necessary info for h_inode follow_link() and
- *   put_link().
- */
-
-struct au_symlink {
-	union {
-		struct hlist_node hlist;
-		struct rcu_head rcu;
-	};
-
-	struct inode *h_inode;
-	void *h_cookie;
-};
-
-static void au_symlink_add(struct super_block *sb, struct au_symlink *slink,
-			   struct inode *h_inode, void *cookie)
-{
-	struct au_sbinfo *sbinfo;
-
-	ihold(h_inode);
-	slink->h_inode = h_inode;
-	slink->h_cookie = cookie;
-	sbinfo = au_sbi(sb);
-	au_sphl_add(&slink->hlist, &sbinfo->si_symlink);
-}
-
-static void au_symlink_del(struct super_block *sb, struct au_symlink *slink)
-{
-	struct au_sbinfo *sbinfo;
-
-	/* do not iput() within rcu */
-	iput(slink->h_inode);
-	slink->h_inode = NULL;
-	sbinfo = au_sbi(sb);
-	au_sphl_del_rcu(&slink->hlist, &sbinfo->si_symlink);
-	kfree_rcu(slink, rcu);
-}
-
-static const char *aufs_follow_link(struct dentry *dentry, void **cookie)
-{
-	const char *ret;
-	struct inode *inode, *h_inode;
-	struct dentry *h_dentry;
-	struct au_symlink *slink;
-	int err;
-	aufs_bindex_t bindex;
-
-	ret = NULL; /* suppress a warning */
-	err = aufs_read_lock(dentry, AuLock_IR | AuLock_GEN);
-	if (unlikely(err))
-		goto out;
-
-	err = au_d_hashed_positive(dentry);
-	if (unlikely(err))
-		goto out_unlock;
-
-	err = -EINVAL;
-	inode = d_inode(dentry);
-	bindex = au_ibstart(inode);
-	h_inode = au_h_iptr(inode, bindex);
-	if (unlikely(!h_inode->i_op->follow_link))
-		goto out_unlock;
-
-	err = -ENOMEM;
-	slink = kmalloc(sizeof(*slink), GFP_NOFS);
-	if (unlikely(!slink))
-		goto out_unlock;
-
-	err = -EBUSY;
-	h_dentry = NULL;
-	if (au_dbstart(dentry) <= bindex) {
-		h_dentry = au_h_dptr(dentry, bindex);
-		if (h_dentry)
-			dget(h_dentry);
-	}
-	if (!h_dentry) {
-		h_dentry = d_find_any_alias(h_inode);
-		if (IS_ERR(h_dentry)) {
-			err = PTR_ERR(h_dentry);
-			goto out_free;
-		}
-	}
-	if (unlikely(!h_dentry))
-		goto out_free;
-
-	err = 0;
-	AuDbg("%pf\n", h_inode->i_op->follow_link);
-	AuDbgDentry(h_dentry);
-	ret = h_inode->i_op->follow_link(h_dentry, cookie);
-	dput(h_dentry);
-
-	if (!IS_ERR_OR_NULL(ret)) {
-		au_symlink_add(inode->i_sb, slink, h_inode, *cookie);
-		*cookie = slink;
-		AuDbg("slink %p\n", slink);
-		goto out_unlock; /* success */
-	}
-
-out_free:
-	slink->h_inode = NULL;
-	kfree_rcu(slink, rcu);
-out_unlock:
-	aufs_read_unlock(dentry, AuLock_IR);
-out:
-	if (unlikely(err))
-		ret = ERR_PTR(err);
-	AuTraceErrPtr(ret);
-	return ret;
-}
-
-static void aufs_put_link(struct inode *inode, void *cookie)
-{
-	struct au_symlink *slink;
-	struct inode *h_inode;
-
-	slink = cookie;
-	AuDbg("slink %p\n", slink);
-	h_inode = slink->h_inode;
-	AuDbg("%pf\n", h_inode->i_op->put_link);
-	AuDbgInode(h_inode);
-	if (h_inode->i_op->put_link)
-		h_inode->i_op->put_link(h_inode, slink->h_cookie);
-	au_symlink_del(inode->i_sb, slink);
-}
-
-/* ---------------------------------------------------------------------- */
-
-static int aufs_update_time(struct inode *inode, struct timespec *ts, int flags)
-{
-	int err;
-	struct super_block *sb;
-	struct inode *h_inode;
-
-	sb = inode->i_sb;
-	/* mmap_sem might be acquired already, cf. aufs_mmap() */
-	lockdep_off();
-	si_read_lock(sb, AuLock_FLUSH);
-	ii_write_lock_child(inode);
-	lockdep_on();
-	h_inode = au_h_iptr(inode, au_ibstart(inode));
-	err = vfsub_update_time(h_inode, ts, flags);
-	lockdep_off();
-	if (!err)
-		au_cpup_attr_timesizes(inode);
-	ii_write_unlock(inode);
-	si_read_unlock(sb);
-	lockdep_on();
-
-	if (!err && (flags & S_VERSION))
-		inode_inc_iversion(inode);
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* no getattr version will be set by module.c:aufs_init() */
-struct inode_operations aufs_iop_nogetattr[AuIop_Last],
-	aufs_iop[] = {
-	[AuIop_SYMLINK] = {
-		.permission	= aufs_permission,
-#ifdef CONFIG_FS_POSIX_ACL
-		.get_acl	= aufs_get_acl,
-		.set_acl	= aufs_set_acl, /* unsupport for symlink? */
-#endif
-
-		.setattr	= aufs_setattr,
-		.getattr	= aufs_getattr,
-
-#ifdef CONFIG_AUFS_XATTR
-		.setxattr	= aufs_setxattr,
-		.getxattr	= aufs_getxattr,
-		.listxattr	= aufs_listxattr,
-		.removexattr	= aufs_removexattr,
-#endif
-
-		.readlink	= generic_readlink,
-		.follow_link	= aufs_follow_link,
-		.put_link	= aufs_put_link,
-
-		/* .update_time	= aufs_update_time */
-	},
-	[AuIop_DIR] = {
-		.create		= aufs_create,
-		.lookup		= aufs_lookup,
-		.link		= aufs_link,
-		.unlink		= aufs_unlink,
-		.symlink	= aufs_symlink,
-		.mkdir		= aufs_mkdir,
-		.rmdir		= aufs_rmdir,
-		.mknod		= aufs_mknod,
-		.rename		= aufs_rename,
-
-		.permission	= aufs_permission,
-#ifdef CONFIG_FS_POSIX_ACL
-		.get_acl	= aufs_get_acl,
-		.set_acl	= aufs_set_acl,
-#endif
-
-		.setattr	= aufs_setattr,
-		.getattr	= aufs_getattr,
-
-#ifdef CONFIG_AUFS_XATTR
-		.setxattr	= aufs_setxattr,
-		.getxattr	= aufs_getxattr,
-		.listxattr	= aufs_listxattr,
-		.removexattr	= aufs_removexattr,
-#endif
-
-		.update_time	= aufs_update_time,
-		.atomic_open	= aufs_atomic_open,
-		.tmpfile	= aufs_tmpfile
-	},
-	[AuIop_OTHER] = {
-		.permission	= aufs_permission,
-#ifdef CONFIG_FS_POSIX_ACL
-		.get_acl	= aufs_get_acl,
-		.set_acl	= aufs_set_acl,
-#endif
-
-		.setattr	= aufs_setattr,
-		.getattr	= aufs_getattr,
-
-#ifdef CONFIG_AUFS_XATTR
-		.setxattr	= aufs_setxattr,
-		.getxattr	= aufs_getxattr,
-		.listxattr	= aufs_listxattr,
-		.removexattr	= aufs_removexattr,
-#endif
-
-		.update_time	= aufs_update_time
-	}
-};
diff --git a/fs/aufs/i_op_add.c b/fs/aufs/i_op_add.c
deleted file mode 100644
index 3fc355859..000000000
--- a/fs/aufs/i_op_add.c
+++ /dev/null
@@ -1,919 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * inode operations (add entry)
- */
-
-#include "aufs.h"
-
-/*
- * final procedure of adding a new entry, except link(2).
- * remove whiteout, instantiate, copyup the parent dir's times and size
- * and update version.
- * if it failed, re-create the removed whiteout.
- */
-static int epilog(struct inode *dir, aufs_bindex_t bindex,
-		  struct dentry *wh_dentry, struct dentry *dentry)
-{
-	int err, rerr;
-	aufs_bindex_t bwh;
-	struct path h_path;
-	struct super_block *sb;
-	struct inode *inode, *h_dir;
-	struct dentry *wh;
-
-	bwh = -1;
-	sb = dir->i_sb;
-	if (wh_dentry) {
-		h_dir = d_inode(wh_dentry->d_parent); /* dir inode is locked */
-		IMustLock(h_dir);
-		AuDebugOn(au_h_iptr(dir, bindex) != h_dir);
-		bwh = au_dbwh(dentry);
-		h_path.dentry = wh_dentry;
-		h_path.mnt = au_sbr_mnt(sb, bindex);
-		err = au_wh_unlink_dentry(au_h_iptr(dir, bindex), &h_path,
-					  dentry);
-		if (unlikely(err))
-			goto out;
-	}
-
-	inode = au_new_inode(dentry, /*must_new*/1);
-	if (!IS_ERR(inode)) {
-		d_instantiate(dentry, inode);
-		dir = d_inode(dentry->d_parent); /* dir inode is locked */
-		IMustLock(dir);
-		au_dir_ts(dir, bindex);
-		dir->i_version++;
-		au_fhsm_wrote(sb, bindex, /*force*/0);
-		return 0; /* success */
-	}
-
-	err = PTR_ERR(inode);
-	if (!wh_dentry)
-		goto out;
-
-	/* revert */
-	/* dir inode is locked */
-	wh = au_wh_create(dentry, bwh, wh_dentry->d_parent);
-	rerr = PTR_ERR(wh);
-	if (IS_ERR(wh)) {
-		AuIOErr("%pd reverting whiteout failed(%d, %d)\n",
-			dentry, err, rerr);
-		err = -EIO;
-	} else
-		dput(wh);
-
-out:
-	return err;
-}
-
-static int au_d_may_add(struct dentry *dentry)
-{
-	int err;
-
-	err = 0;
-	if (unlikely(d_unhashed(dentry)))
-		err = -ENOENT;
-	if (unlikely(d_really_is_positive(dentry)))
-		err = -EEXIST;
-	return err;
-}
-
-/*
- * simple tests for the adding inode operations.
- * following the checks in vfs, plus the parent-child relationship.
- */
-int au_may_add(struct dentry *dentry, aufs_bindex_t bindex,
-	       struct dentry *h_parent, int isdir)
-{
-	int err;
-	umode_t h_mode;
-	struct dentry *h_dentry;
-	struct inode *h_inode;
-
-	err = -ENAMETOOLONG;
-	if (unlikely(dentry->d_name.len > AUFS_MAX_NAMELEN))
-		goto out;
-
-	h_dentry = au_h_dptr(dentry, bindex);
-	if (d_really_is_negative(dentry)) {
-		err = -EEXIST;
-		if (unlikely(d_is_positive(h_dentry)))
-			goto out;
-	} else {
-		/* rename(2) case */
-		err = -EIO;
-		if (unlikely(d_is_negative(h_dentry)))
-			goto out;
-		h_inode = d_inode(h_dentry);
-		if (unlikely(!h_inode->i_nlink))
-			goto out;
-
-		h_mode = h_inode->i_mode;
-		if (!isdir) {
-			err = -EISDIR;
-			if (unlikely(S_ISDIR(h_mode)))
-				goto out;
-		} else if (unlikely(!S_ISDIR(h_mode))) {
-			err = -ENOTDIR;
-			goto out;
-		}
-	}
-
-	err = 0;
-	/* expected parent dir is locked */
-	if (unlikely(h_parent != h_dentry->d_parent))
-		err = -EIO;
-
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-/*
- * initial procedure of adding a new entry.
- * prepare writable branch and the parent dir, lock it,
- * and lookup whiteout for the new entry.
- */
-static struct dentry*
-lock_hdir_lkup_wh(struct dentry *dentry, struct au_dtime *dt,
-		  struct dentry *src_dentry, struct au_pin *pin,
-		  struct au_wr_dir_args *wr_dir_args)
-{
-	struct dentry *wh_dentry, *h_parent;
-	struct super_block *sb;
-	struct au_branch *br;
-	int err;
-	unsigned int udba;
-	aufs_bindex_t bcpup;
-
-	AuDbg("%pd\n", dentry);
-
-	err = au_wr_dir(dentry, src_dentry, wr_dir_args);
-	bcpup = err;
-	wh_dentry = ERR_PTR(err);
-	if (unlikely(err < 0))
-		goto out;
-
-	sb = dentry->d_sb;
-	udba = au_opt_udba(sb);
-	err = au_pin(pin, dentry, bcpup, udba,
-		     AuPin_DI_LOCKED | AuPin_MNT_WRITE);
-	wh_dentry = ERR_PTR(err);
-	if (unlikely(err))
-		goto out;
-
-	h_parent = au_pinned_h_parent(pin);
-	if (udba != AuOpt_UDBA_NONE
-	    && au_dbstart(dentry) == bcpup)
-		err = au_may_add(dentry, bcpup, h_parent,
-				 au_ftest_wrdir(wr_dir_args->flags, ISDIR));
-	else if (unlikely(dentry->d_name.len > AUFS_MAX_NAMELEN))
-		err = -ENAMETOOLONG;
-	wh_dentry = ERR_PTR(err);
-	if (unlikely(err))
-		goto out_unpin;
-
-	br = au_sbr(sb, bcpup);
-	if (dt) {
-		struct path tmp = {
-			.dentry	= h_parent,
-			.mnt	= au_br_mnt(br)
-		};
-		au_dtime_store(dt, au_pinned_parent(pin), &tmp);
-	}
-
-	wh_dentry = NULL;
-	if (bcpup != au_dbwh(dentry))
-		goto out; /* success */
-
-	/*
-	 * ENAMETOOLONG here means that if we allowed create such name, then it
-	 * would not be able to removed in the future. So we don't allow such
-	 * name here and we don't handle ENAMETOOLONG differently here.
-	 */
-	wh_dentry = au_wh_lkup(h_parent, &dentry->d_name, br);
-
-out_unpin:
-	if (IS_ERR(wh_dentry))
-		au_unpin(pin);
-out:
-	return wh_dentry;
-}
-
-/* ---------------------------------------------------------------------- */
-
-enum { Mknod, Symlink, Creat };
-struct simple_arg {
-	int type;
-	union {
-		struct {
-			umode_t			mode;
-			bool			want_excl;
-			bool			try_aopen;
-			struct vfsub_aopen_args	*aopen;
-		} c;
-		struct {
-			const char *symname;
-		} s;
-		struct {
-			umode_t mode;
-			dev_t dev;
-		} m;
-	} u;
-};
-
-static int add_simple(struct inode *dir, struct dentry *dentry,
-		      struct simple_arg *arg)
-{
-	int err, rerr;
-	aufs_bindex_t bstart;
-	unsigned char created;
-	const unsigned char try_aopen
-		= (arg->type == Creat && arg->u.c.try_aopen);
-	struct dentry *wh_dentry, *parent;
-	struct inode *h_dir;
-	struct super_block *sb;
-	struct au_branch *br;
-	/* to reuduce stack size */
-	struct {
-		struct au_dtime dt;
-		struct au_pin pin;
-		struct path h_path;
-		struct au_wr_dir_args wr_dir_args;
-	} *a;
-
-	AuDbg("%pd\n", dentry);
-	IMustLock(dir);
-
-	err = -ENOMEM;
-	a = kmalloc(sizeof(*a), GFP_NOFS);
-	if (unlikely(!a))
-		goto out;
-	a->wr_dir_args.force_btgt = -1;
-	a->wr_dir_args.flags = AuWrDir_ADD_ENTRY;
-
-	parent = dentry->d_parent; /* dir inode is locked */
-	if (!try_aopen) {
-		err = aufs_read_lock(dentry, AuLock_DW | AuLock_GEN);
-		if (unlikely(err))
-			goto out_free;
-	}
-	err = au_d_may_add(dentry);
-	if (unlikely(err))
-		goto out_unlock;
-	if (!try_aopen)
-		di_write_lock_parent(parent);
-	wh_dentry = lock_hdir_lkup_wh(dentry, &a->dt, /*src_dentry*/NULL,
-				      &a->pin, &a->wr_dir_args);
-	err = PTR_ERR(wh_dentry);
-	if (IS_ERR(wh_dentry))
-		goto out_parent;
-
-	bstart = au_dbstart(dentry);
-	sb = dentry->d_sb;
-	br = au_sbr(sb, bstart);
-	a->h_path.dentry = au_h_dptr(dentry, bstart);
-	a->h_path.mnt = au_br_mnt(br);
-	h_dir = au_pinned_h_dir(&a->pin);
-	switch (arg->type) {
-	case Creat:
-		err = 0;
-		if (!try_aopen || !h_dir->i_op->atomic_open)
-			err = vfsub_create(h_dir, &a->h_path, arg->u.c.mode,
-					   arg->u.c.want_excl);
-		else
-			err = vfsub_atomic_open(h_dir, a->h_path.dentry,
-						arg->u.c.aopen, br);
-		break;
-	case Symlink:
-		err = vfsub_symlink(h_dir, &a->h_path, arg->u.s.symname);
-		break;
-	case Mknod:
-		err = vfsub_mknod(h_dir, &a->h_path, arg->u.m.mode,
-				  arg->u.m.dev);
-		break;
-	default:
-		BUG();
-	}
-	created = !err;
-	if (!err)
-		err = epilog(dir, bstart, wh_dentry, dentry);
-
-	/* revert */
-	if (unlikely(created && err && d_is_positive(a->h_path.dentry))) {
-		/* no delegation since it is just created */
-		rerr = vfsub_unlink(h_dir, &a->h_path, /*delegated*/NULL,
-				    /*force*/0);
-		if (rerr) {
-			AuIOErr("%pd revert failure(%d, %d)\n",
-				dentry, err, rerr);
-			err = -EIO;
-		}
-		au_dtime_revert(&a->dt);
-	}
-
-	if (!err && try_aopen && !h_dir->i_op->atomic_open)
-		*arg->u.c.aopen->opened |= FILE_CREATED;
-
-	au_unpin(&a->pin);
-	dput(wh_dentry);
-
-out_parent:
-	if (!try_aopen)
-		di_write_unlock(parent);
-out_unlock:
-	if (unlikely(err)) {
-		au_update_dbstart(dentry);
-		d_drop(dentry);
-	}
-	if (!try_aopen)
-		aufs_read_unlock(dentry, AuLock_DW);
-out_free:
-	kfree(a);
-out:
-	return err;
-}
-
-int aufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
-	       dev_t dev)
-{
-	struct simple_arg arg = {
-		.type = Mknod,
-		.u.m = {
-			.mode	= mode,
-			.dev	= dev
-		}
-	};
-	return add_simple(dir, dentry, &arg);
-}
-
-int aufs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
-{
-	struct simple_arg arg = {
-		.type = Symlink,
-		.u.s.symname = symname
-	};
-	return add_simple(dir, dentry, &arg);
-}
-
-int aufs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-		bool want_excl)
-{
-	struct simple_arg arg = {
-		.type = Creat,
-		.u.c = {
-			.mode		= mode,
-			.want_excl	= want_excl
-		}
-	};
-	return add_simple(dir, dentry, &arg);
-}
-
-int au_aopen_or_create(struct inode *dir, struct dentry *dentry,
-		       struct vfsub_aopen_args *aopen_args)
-{
-	struct simple_arg arg = {
-		.type = Creat,
-		.u.c = {
-			.mode		= aopen_args->create_mode,
-			.want_excl	= aopen_args->open_flag & O_EXCL,
-			.try_aopen	= true,
-			.aopen		= aopen_args
-		}
-	};
-	return add_simple(dir, dentry, &arg);
-}
-
-int aufs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
-{
-	int err;
-	aufs_bindex_t bindex;
-	struct super_block *sb;
-	struct dentry *parent, *h_parent, *h_dentry;
-	struct inode *h_dir, *inode;
-	struct vfsmount *h_mnt;
-	struct au_wr_dir_args wr_dir_args = {
-		.force_btgt	= -1,
-		.flags		= AuWrDir_TMPFILE
-	};
-
-	/* copy-up may happen */
-	mutex_lock(&dir->i_mutex);
-
-	sb = dir->i_sb;
-	err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM);
-	if (unlikely(err))
-		goto out;
-
-	err = au_di_init(dentry);
-	if (unlikely(err))
-		goto out_si;
-
-	err = -EBUSY;
-	parent = d_find_any_alias(dir);
-	AuDebugOn(!parent);
-	di_write_lock_parent(parent);
-	if (unlikely(d_inode(parent) != dir))
-		goto out_parent;
-
-	err = au_digen_test(parent, au_sigen(sb));
-	if (unlikely(err))
-		goto out_parent;
-
-	bindex = au_dbstart(parent);
-	au_set_dbstart(dentry, bindex);
-	au_set_dbend(dentry, bindex);
-	err = au_wr_dir(dentry, /*src_dentry*/NULL, &wr_dir_args);
-	bindex = err;
-	if (unlikely(err < 0))
-		goto out_parent;
-
-	err = -EOPNOTSUPP;
-	h_dir = au_h_iptr(dir, bindex);
-	if (unlikely(!h_dir->i_op->tmpfile))
-		goto out_parent;
-
-	h_mnt = au_sbr_mnt(sb, bindex);
-	err = vfsub_mnt_want_write(h_mnt);
-	if (unlikely(err))
-		goto out_parent;
-
-	h_parent = au_h_dptr(parent, bindex);
-	err = inode_permission(d_inode(h_parent), MAY_WRITE | MAY_EXEC);
-	if (unlikely(err))
-		goto out_mnt;
-
-	err = -ENOMEM;
-	h_dentry = d_alloc(h_parent, &dentry->d_name);
-	if (unlikely(!h_dentry))
-		goto out_mnt;
-
-	err = h_dir->i_op->tmpfile(h_dir, h_dentry, mode);
-	if (unlikely(err))
-		goto out_dentry;
-
-	au_set_dbstart(dentry, bindex);
-	au_set_dbend(dentry, bindex);
-	au_set_h_dptr(dentry, bindex, dget(h_dentry));
-	inode = au_new_inode(dentry, /*must_new*/1);
-	if (IS_ERR(inode)) {
-		err = PTR_ERR(inode);
-		au_set_h_dptr(dentry, bindex, NULL);
-		au_set_dbstart(dentry, -1);
-		au_set_dbend(dentry, -1);
-	} else {
-		if (!inode->i_nlink)
-			set_nlink(inode, 1);
-		d_tmpfile(dentry, inode);
-		au_di(dentry)->di_tmpfile = 1;
-
-		/* update without i_mutex */
-		if (au_ibstart(dir) == au_dbstart(dentry))
-			au_cpup_attr_timesizes(dir);
-	}
-
-out_dentry:
-	dput(h_dentry);
-out_mnt:
-	vfsub_mnt_drop_write(h_mnt);
-out_parent:
-	di_write_unlock(parent);
-	dput(parent);
-	di_write_unlock(dentry);
-	if (!err)
-#if 0
-		/* verbose coding for lock class name */
-		au_rw_class(&au_di(dentry)->di_rwsem,
-			    au_lc_key + AuLcNonDir_DIINFO);
-#else
-		;
-#endif
-	else {
-		au_di_fin(dentry);
-		dentry->d_fsdata = NULL;
-	}
-out_si:
-	si_read_unlock(sb);
-out:
-	mutex_unlock(&dir->i_mutex);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-struct au_link_args {
-	aufs_bindex_t bdst, bsrc;
-	struct au_pin pin;
-	struct path h_path;
-	struct dentry *src_parent, *parent;
-};
-
-static int au_cpup_before_link(struct dentry *src_dentry,
-			       struct au_link_args *a)
-{
-	int err;
-	struct dentry *h_src_dentry;
-	struct au_cp_generic cpg = {
-		.dentry	= src_dentry,
-		.bdst	= a->bdst,
-		.bsrc	= a->bsrc,
-		.len	= -1,
-		.pin	= &a->pin,
-		.flags	= AuCpup_DTIME | AuCpup_HOPEN /* | AuCpup_KEEPLINO */
-	};
-
-	di_read_lock_parent(a->src_parent, AuLock_IR);
-	err = au_test_and_cpup_dirs(src_dentry, a->bdst);
-	if (unlikely(err))
-		goto out;
-
-	h_src_dentry = au_h_dptr(src_dentry, a->bsrc);
-	err = au_pin(&a->pin, src_dentry, a->bdst,
-		     au_opt_udba(src_dentry->d_sb),
-		     AuPin_DI_LOCKED | AuPin_MNT_WRITE);
-	if (unlikely(err))
-		goto out;
-
-	err = au_sio_cpup_simple(&cpg);
-	au_unpin(&a->pin);
-
-out:
-	di_read_unlock(a->src_parent, AuLock_IR);
-	return err;
-}
-
-static int au_cpup_or_link(struct dentry *src_dentry, struct dentry *dentry,
-			   struct au_link_args *a)
-{
-	int err;
-	unsigned char plink;
-	aufs_bindex_t bend;
-	struct dentry *h_src_dentry;
-	struct inode *h_inode, *inode, *delegated;
-	struct super_block *sb;
-	struct file *h_file;
-
-	plink = 0;
-	h_inode = NULL;
-	sb = src_dentry->d_sb;
-	inode = d_inode(src_dentry);
-	if (au_ibstart(inode) <= a->bdst)
-		h_inode = au_h_iptr(inode, a->bdst);
-	if (!h_inode || !h_inode->i_nlink) {
-		/* copyup src_dentry as the name of dentry. */
-		bend = au_dbend(dentry);
-		if (bend < a->bsrc)
-			au_set_dbend(dentry, a->bsrc);
-		au_set_h_dptr(dentry, a->bsrc,
-			      dget(au_h_dptr(src_dentry, a->bsrc)));
-		dget(a->h_path.dentry);
-		au_set_h_dptr(dentry, a->bdst, NULL);
-		AuDbg("temporary d_inode...\n");
-		spin_lock(&dentry->d_lock);
-		dentry->d_inode = d_inode(src_dentry); /* tmp */
-		spin_unlock(&dentry->d_lock);
-		h_file = au_h_open_pre(dentry, a->bsrc, /*force_wr*/0);
-		if (IS_ERR(h_file))
-			err = PTR_ERR(h_file);
-		else {
-			struct au_cp_generic cpg = {
-				.dentry	= dentry,
-				.bdst	= a->bdst,
-				.bsrc	= -1,
-				.len	= -1,
-				.pin	= &a->pin,
-				.flags	= AuCpup_KEEPLINO
-			};
-			err = au_sio_cpup_simple(&cpg);
-			au_h_open_post(dentry, a->bsrc, h_file);
-			if (!err) {
-				dput(a->h_path.dentry);
-				a->h_path.dentry = au_h_dptr(dentry, a->bdst);
-			} else
-				au_set_h_dptr(dentry, a->bdst,
-					      a->h_path.dentry);
-		}
-		spin_lock(&dentry->d_lock);
-		dentry->d_inode = NULL; /* restore */
-		spin_unlock(&dentry->d_lock);
-		AuDbg("temporary d_inode...done\n");
-		au_set_h_dptr(dentry, a->bsrc, NULL);
-		au_set_dbend(dentry, bend);
-	} else {
-		/* the inode of src_dentry already exists on a.bdst branch */
-		h_src_dentry = d_find_alias(h_inode);
-		if (!h_src_dentry && au_plink_test(inode)) {
-			plink = 1;
-			h_src_dentry = au_plink_lkup(inode, a->bdst);
-			err = PTR_ERR(h_src_dentry);
-			if (IS_ERR(h_src_dentry))
-				goto out;
-
-			if (unlikely(d_is_negative(h_src_dentry))) {
-				dput(h_src_dentry);
-				h_src_dentry = NULL;
-			}
-
-		}
-		if (h_src_dentry) {
-			delegated = NULL;
-			err = vfsub_link(h_src_dentry, au_pinned_h_dir(&a->pin),
-					 &a->h_path, &delegated);
-			if (unlikely(err == -EWOULDBLOCK)) {
-				pr_warn("cannot retry for NFSv4 delegation"
-					" for an internal link\n");
-				iput(delegated);
-			}
-			dput(h_src_dentry);
-		} else {
-			AuIOErr("no dentry found for hi%lu on b%d\n",
-				h_inode->i_ino, a->bdst);
-			err = -EIO;
-		}
-	}
-
-	if (!err && !plink)
-		au_plink_append(inode, a->bdst, a->h_path.dentry);
-
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-int aufs_link(struct dentry *src_dentry, struct inode *dir,
-	      struct dentry *dentry)
-{
-	int err, rerr;
-	struct au_dtime dt;
-	struct au_link_args *a;
-	struct dentry *wh_dentry, *h_src_dentry;
-	struct inode *inode, *delegated;
-	struct super_block *sb;
-	struct au_wr_dir_args wr_dir_args = {
-		/* .force_btgt	= -1, */
-		.flags		= AuWrDir_ADD_ENTRY
-	};
-
-	IMustLock(dir);
-	inode = d_inode(src_dentry);
-	IMustLock(inode);
-
-	err = -ENOMEM;
-	a = kzalloc(sizeof(*a), GFP_NOFS);
-	if (unlikely(!a))
-		goto out;
-
-	a->parent = dentry->d_parent; /* dir inode is locked */
-	err = aufs_read_and_write_lock2(dentry, src_dentry,
-					AuLock_NOPLM | AuLock_GEN);
-	if (unlikely(err))
-		goto out_kfree;
-	err = au_d_linkable(src_dentry);
-	if (unlikely(err))
-		goto out_unlock;
-	err = au_d_may_add(dentry);
-	if (unlikely(err))
-		goto out_unlock;
-
-	a->src_parent = dget_parent(src_dentry);
-	wr_dir_args.force_btgt = au_ibstart(inode);
-
-	di_write_lock_parent(a->parent);
-	wr_dir_args.force_btgt = au_wbr(dentry, wr_dir_args.force_btgt);
-	wh_dentry = lock_hdir_lkup_wh(dentry, &dt, src_dentry, &a->pin,
-				      &wr_dir_args);
-	err = PTR_ERR(wh_dentry);
-	if (IS_ERR(wh_dentry))
-		goto out_parent;
-
-	err = 0;
-	sb = dentry->d_sb;
-	a->bdst = au_dbstart(dentry);
-	a->h_path.dentry = au_h_dptr(dentry, a->bdst);
-	a->h_path.mnt = au_sbr_mnt(sb, a->bdst);
-	a->bsrc = au_ibstart(inode);
-	h_src_dentry = au_h_d_alias(src_dentry, a->bsrc);
-	if (!h_src_dentry && au_di(src_dentry)->di_tmpfile)
-		h_src_dentry = dget(au_hi_wh(inode, a->bsrc));
-	if (!h_src_dentry) {
-		a->bsrc = au_dbstart(src_dentry);
-		h_src_dentry = au_h_d_alias(src_dentry, a->bsrc);
-		AuDebugOn(!h_src_dentry);
-	} else if (IS_ERR(h_src_dentry)) {
-		err = PTR_ERR(h_src_dentry);
-		goto out_parent;
-	}
-
-	if (au_opt_test(au_mntflags(sb), PLINK)) {
-		if (a->bdst < a->bsrc
-		    /* && h_src_dentry->d_sb != a->h_path.dentry->d_sb */)
-			err = au_cpup_or_link(src_dentry, dentry, a);
-		else {
-			delegated = NULL;
-			err = vfsub_link(h_src_dentry, au_pinned_h_dir(&a->pin),
-					 &a->h_path, &delegated);
-			if (unlikely(err == -EWOULDBLOCK)) {
-				pr_warn("cannot retry for NFSv4 delegation"
-					" for an internal link\n");
-				iput(delegated);
-			}
-		}
-		dput(h_src_dentry);
-	} else {
-		/*
-		 * copyup src_dentry to the branch we process,
-		 * and then link(2) to it.
-		 */
-		dput(h_src_dentry);
-		if (a->bdst < a->bsrc
-		    /* && h_src_dentry->d_sb != a->h_path.dentry->d_sb */) {
-			au_unpin(&a->pin);
-			di_write_unlock(a->parent);
-			err = au_cpup_before_link(src_dentry, a);
-			di_write_lock_parent(a->parent);
-			if (!err)
-				err = au_pin(&a->pin, dentry, a->bdst,
-					     au_opt_udba(sb),
-					     AuPin_DI_LOCKED | AuPin_MNT_WRITE);
-			if (unlikely(err))
-				goto out_wh;
-		}
-		if (!err) {
-			h_src_dentry = au_h_dptr(src_dentry, a->bdst);
-			err = -ENOENT;
-			if (h_src_dentry && d_is_positive(h_src_dentry)) {
-				delegated = NULL;
-				err = vfsub_link(h_src_dentry,
-						 au_pinned_h_dir(&a->pin),
-						 &a->h_path, &delegated);
-				if (unlikely(err == -EWOULDBLOCK)) {
-					pr_warn("cannot retry"
-						" for NFSv4 delegation"
-						" for an internal link\n");
-					iput(delegated);
-				}
-			}
-		}
-	}
-	if (unlikely(err))
-		goto out_unpin;
-
-	if (wh_dentry) {
-		a->h_path.dentry = wh_dentry;
-		err = au_wh_unlink_dentry(au_pinned_h_dir(&a->pin), &a->h_path,
-					  dentry);
-		if (unlikely(err))
-			goto out_revert;
-	}
-
-	au_dir_ts(dir, a->bdst);
-	dir->i_version++;
-	inc_nlink(inode);
-	inode->i_ctime = dir->i_ctime;
-	d_instantiate(dentry, au_igrab(inode));
-	if (d_unhashed(a->h_path.dentry))
-		/* some filesystem calls d_drop() */
-		d_drop(dentry);
-	/* some filesystems consume an inode even hardlink */
-	au_fhsm_wrote(sb, a->bdst, /*force*/0);
-	goto out_unpin; /* success */
-
-out_revert:
-	/* no delegation since it is just created */
-	rerr = vfsub_unlink(au_pinned_h_dir(&a->pin), &a->h_path,
-			    /*delegated*/NULL, /*force*/0);
-	if (unlikely(rerr)) {
-		AuIOErr("%pd reverting failed(%d, %d)\n", dentry, err, rerr);
-		err = -EIO;
-	}
-	au_dtime_revert(&dt);
-out_unpin:
-	au_unpin(&a->pin);
-out_wh:
-	dput(wh_dentry);
-out_parent:
-	di_write_unlock(a->parent);
-	dput(a->src_parent);
-out_unlock:
-	if (unlikely(err)) {
-		au_update_dbstart(dentry);
-		d_drop(dentry);
-	}
-	aufs_read_and_write_unlock2(dentry, src_dentry);
-out_kfree:
-	kfree(a);
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-int aufs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
-{
-	int err, rerr;
-	aufs_bindex_t bindex;
-	unsigned char diropq;
-	struct path h_path;
-	struct dentry *wh_dentry, *parent, *opq_dentry;
-	struct mutex *h_mtx;
-	struct super_block *sb;
-	struct {
-		struct au_pin pin;
-		struct au_dtime dt;
-	} *a; /* reduce the stack usage */
-	struct au_wr_dir_args wr_dir_args = {
-		.force_btgt	= -1,
-		.flags		= AuWrDir_ADD_ENTRY | AuWrDir_ISDIR
-	};
-
-	IMustLock(dir);
-
-	err = -ENOMEM;
-	a = kmalloc(sizeof(*a), GFP_NOFS);
-	if (unlikely(!a))
-		goto out;
-
-	err = aufs_read_lock(dentry, AuLock_DW | AuLock_GEN);
-	if (unlikely(err))
-		goto out_free;
-	err = au_d_may_add(dentry);
-	if (unlikely(err))
-		goto out_unlock;
-
-	parent = dentry->d_parent; /* dir inode is locked */
-	di_write_lock_parent(parent);
-	wh_dentry = lock_hdir_lkup_wh(dentry, &a->dt, /*src_dentry*/NULL,
-				      &a->pin, &wr_dir_args);
-	err = PTR_ERR(wh_dentry);
-	if (IS_ERR(wh_dentry))
-		goto out_parent;
-
-	sb = dentry->d_sb;
-	bindex = au_dbstart(dentry);
-	h_path.dentry = au_h_dptr(dentry, bindex);
-	h_path.mnt = au_sbr_mnt(sb, bindex);
-	err = vfsub_mkdir(au_pinned_h_dir(&a->pin), &h_path, mode);
-	if (unlikely(err))
-		goto out_unpin;
-
-	/* make the dir opaque */
-	diropq = 0;
-	h_mtx = &d_inode(h_path.dentry)->i_mutex;
-	if (wh_dentry
-	    || au_opt_test(au_mntflags(sb), ALWAYS_DIROPQ)) {
-		mutex_lock_nested(h_mtx, AuLsc_I_CHILD);
-		opq_dentry = au_diropq_create(dentry, bindex);
-		mutex_unlock(h_mtx);
-		err = PTR_ERR(opq_dentry);
-		if (IS_ERR(opq_dentry))
-			goto out_dir;
-		dput(opq_dentry);
-		diropq = 1;
-	}
-
-	err = epilog(dir, bindex, wh_dentry, dentry);
-	if (!err) {
-		inc_nlink(dir);
-		goto out_unpin; /* success */
-	}
-
-	/* revert */
-	if (diropq) {
-		AuLabel(revert opq);
-		mutex_lock_nested(h_mtx, AuLsc_I_CHILD);
-		rerr = au_diropq_remove(dentry, bindex);
-		mutex_unlock(h_mtx);
-		if (rerr) {
-			AuIOErr("%pd reverting diropq failed(%d, %d)\n",
-				dentry, err, rerr);
-			err = -EIO;
-		}
-	}
-
-out_dir:
-	AuLabel(revert dir);
-	rerr = vfsub_rmdir(au_pinned_h_dir(&a->pin), &h_path);
-	if (rerr) {
-		AuIOErr("%pd reverting dir failed(%d, %d)\n",
-			dentry, err, rerr);
-		err = -EIO;
-	}
-	au_dtime_revert(&a->dt);
-out_unpin:
-	au_unpin(&a->pin);
-	dput(wh_dentry);
-out_parent:
-	di_write_unlock(parent);
-out_unlock:
-	if (unlikely(err)) {
-		au_update_dbstart(dentry);
-		d_drop(dentry);
-	}
-	aufs_read_unlock(dentry, AuLock_DW);
-out_free:
-	kfree(a);
-out:
-	return err;
-}
diff --git a/fs/aufs/i_op_del.c b/fs/aufs/i_op_del.c
deleted file mode 100644
index 68741aadb..000000000
--- a/fs/aufs/i_op_del.c
+++ /dev/null
@@ -1,497 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * inode operations (del entry)
- */
-
-#include "aufs.h"
-
-/*
- * decide if a new whiteout for @dentry is necessary or not.
- * when it is necessary, prepare the parent dir for the upper branch whose
- * branch index is @bcpup for creation. the actual creation of the whiteout will
- * be done by caller.
- * return value:
- * 0: wh is unnecessary
- * plus: wh is necessary
- * minus: error
- */
-int au_wr_dir_need_wh(struct dentry *dentry, int isdir, aufs_bindex_t *bcpup)
-{
-	int need_wh, err;
-	aufs_bindex_t bstart;
-	struct super_block *sb;
-
-	sb = dentry->d_sb;
-	bstart = au_dbstart(dentry);
-	if (*bcpup < 0) {
-		*bcpup = bstart;
-		if (au_test_ro(sb, bstart, d_inode(dentry))) {
-			err = AuWbrCopyup(au_sbi(sb), dentry);
-			*bcpup = err;
-			if (unlikely(err < 0))
-				goto out;
-		}
-	} else
-		AuDebugOn(bstart < *bcpup
-			  || au_test_ro(sb, *bcpup, d_inode(dentry)));
-	AuDbg("bcpup %d, bstart %d\n", *bcpup, bstart);
-
-	if (*bcpup != bstart) {
-		err = au_cpup_dirs(dentry, *bcpup);
-		if (unlikely(err))
-			goto out;
-		need_wh = 1;
-	} else {
-		struct au_dinfo *dinfo, *tmp;
-
-		need_wh = -ENOMEM;
-		dinfo = au_di(dentry);
-		tmp = au_di_alloc(sb, AuLsc_DI_TMP);
-		if (tmp) {
-			au_di_cp(tmp, dinfo);
-			au_di_swap(tmp, dinfo);
-			/* returns the number of positive dentries */
-			need_wh = au_lkup_dentry(dentry, bstart + 1, /*type*/0);
-			au_di_swap(tmp, dinfo);
-			au_rw_write_unlock(&tmp->di_rwsem);
-			au_di_free(tmp);
-		}
-	}
-	AuDbg("need_wh %d\n", need_wh);
-	err = need_wh;
-
-out:
-	return err;
-}
-
-/*
- * simple tests for the del-entry operations.
- * following the checks in vfs, plus the parent-child relationship.
- */
-int au_may_del(struct dentry *dentry, aufs_bindex_t bindex,
-	       struct dentry *h_parent, int isdir)
-{
-	int err;
-	umode_t h_mode;
-	struct dentry *h_dentry, *h_latest;
-	struct inode *h_inode;
-
-	h_dentry = au_h_dptr(dentry, bindex);
-	if (d_really_is_positive(dentry)) {
-		err = -ENOENT;
-		if (unlikely(d_is_negative(h_dentry)))
-			goto out;
-		h_inode = d_inode(h_dentry);
-		if (unlikely(!h_inode->i_nlink))
-			goto out;
-
-		h_mode = h_inode->i_mode;
-		if (!isdir) {
-			err = -EISDIR;
-			if (unlikely(S_ISDIR(h_mode)))
-				goto out;
-		} else if (unlikely(!S_ISDIR(h_mode))) {
-			err = -ENOTDIR;
-			goto out;
-		}
-	} else {
-		/* rename(2) case */
-		err = -EIO;
-		if (unlikely(d_is_positive(h_dentry)))
-			goto out;
-	}
-
-	err = -ENOENT;
-	/* expected parent dir is locked */
-	if (unlikely(h_parent != h_dentry->d_parent))
-		goto out;
-	err = 0;
-
-	/*
-	 * rmdir a dir may break the consistency on some filesystem.
-	 * let's try heavy test.
-	 */
-	err = -EACCES;
-	if (unlikely(!au_opt_test(au_mntflags(dentry->d_sb), DIRPERM1)
-		     && au_test_h_perm(d_inode(h_parent),
-				       MAY_EXEC | MAY_WRITE)))
-		goto out;
-
-	h_latest = au_sio_lkup_one(&dentry->d_name, h_parent);
-	err = -EIO;
-	if (IS_ERR(h_latest))
-		goto out;
-	if (h_latest == h_dentry)
-		err = 0;
-	dput(h_latest);
-
-out:
-	return err;
-}
-
-/*
- * decide the branch where we operate for @dentry. the branch index will be set
- * @rbcpup. after diciding it, 'pin' it and store the timestamps of the parent
- * dir for reverting.
- * when a new whiteout is necessary, create it.
- */
-static struct dentry*
-lock_hdir_create_wh(struct dentry *dentry, int isdir, aufs_bindex_t *rbcpup,
-		    struct au_dtime *dt, struct au_pin *pin)
-{
-	struct dentry *wh_dentry;
-	struct super_block *sb;
-	struct path h_path;
-	int err, need_wh;
-	unsigned int udba;
-	aufs_bindex_t bcpup;
-
-	need_wh = au_wr_dir_need_wh(dentry, isdir, rbcpup);
-	wh_dentry = ERR_PTR(need_wh);
-	if (unlikely(need_wh < 0))
-		goto out;
-
-	sb = dentry->d_sb;
-	udba = au_opt_udba(sb);
-	bcpup = *rbcpup;
-	err = au_pin(pin, dentry, bcpup, udba,
-		     AuPin_DI_LOCKED | AuPin_MNT_WRITE);
-	wh_dentry = ERR_PTR(err);
-	if (unlikely(err))
-		goto out;
-
-	h_path.dentry = au_pinned_h_parent(pin);
-	if (udba != AuOpt_UDBA_NONE
-	    && au_dbstart(dentry) == bcpup) {
-		err = au_may_del(dentry, bcpup, h_path.dentry, isdir);
-		wh_dentry = ERR_PTR(err);
-		if (unlikely(err))
-			goto out_unpin;
-	}
-
-	h_path.mnt = au_sbr_mnt(sb, bcpup);
-	au_dtime_store(dt, au_pinned_parent(pin), &h_path);
-	wh_dentry = NULL;
-	if (!need_wh)
-		goto out; /* success, no need to create whiteout */
-
-	wh_dentry = au_wh_create(dentry, bcpup, h_path.dentry);
-	if (IS_ERR(wh_dentry))
-		goto out_unpin;
-
-	/* returns with the parent is locked and wh_dentry is dget-ed */
-	goto out; /* success */
-
-out_unpin:
-	au_unpin(pin);
-out:
-	return wh_dentry;
-}
-
-/*
- * when removing a dir, rename it to a unique temporary whiteout-ed name first
- * in order to be revertible and save time for removing many child whiteouts
- * under the dir.
- * returns 1 when there are too many child whiteout and caller should remove
- * them asynchronously. returns 0 when the number of children is enough small to
- * remove now or the branch fs is a remote fs.
- * otherwise return an error.
- */
-static int renwh_and_rmdir(struct dentry *dentry, aufs_bindex_t bindex,
-			   struct au_nhash *whlist, struct inode *dir)
-{
-	int rmdir_later, err, dirwh;
-	struct dentry *h_dentry;
-	struct super_block *sb;
-	struct inode *inode;
-
-	sb = dentry->d_sb;
-	SiMustAnyLock(sb);
-	h_dentry = au_h_dptr(dentry, bindex);
-	err = au_whtmp_ren(h_dentry, au_sbr(sb, bindex));
-	if (unlikely(err))
-		goto out;
-
-	/* stop monitoring */
-	inode = d_inode(dentry);
-	au_hn_free(au_hi(inode, bindex));
-
-	if (!au_test_fs_remote(h_dentry->d_sb)) {
-		dirwh = au_sbi(sb)->si_dirwh;
-		rmdir_later = (dirwh <= 1);
-		if (!rmdir_later)
-			rmdir_later = au_nhash_test_longer_wh(whlist, bindex,
-							      dirwh);
-		if (rmdir_later)
-			return rmdir_later;
-	}
-
-	err = au_whtmp_rmdir(dir, bindex, h_dentry, whlist);
-	if (unlikely(err)) {
-		AuIOErr("rmdir %pd, b%d failed, %d. ignored\n",
-			h_dentry, bindex, err);
-		err = 0;
-	}
-
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-/*
- * final procedure for deleting a entry.
- * maintain dentry and iattr.
- */
-static void epilog(struct inode *dir, struct dentry *dentry,
-		   aufs_bindex_t bindex)
-{
-	struct inode *inode;
-
-	inode = d_inode(dentry);
-	d_drop(dentry);
-	inode->i_ctime = dir->i_ctime;
-
-	au_dir_ts(dir, bindex);
-	dir->i_version++;
-}
-
-/*
- * when an error happened, remove the created whiteout and revert everything.
- */
-static int do_revert(int err, struct inode *dir, aufs_bindex_t bindex,
-		     aufs_bindex_t bwh, struct dentry *wh_dentry,
-		     struct dentry *dentry, struct au_dtime *dt)
-{
-	int rerr;
-	struct path h_path = {
-		.dentry	= wh_dentry,
-		.mnt	= au_sbr_mnt(dir->i_sb, bindex)
-	};
-
-	rerr = au_wh_unlink_dentry(au_h_iptr(dir, bindex), &h_path, dentry);
-	if (!rerr) {
-		au_set_dbwh(dentry, bwh);
-		au_dtime_revert(dt);
-		return 0;
-	}
-
-	AuIOErr("%pd reverting whiteout failed(%d, %d)\n", dentry, err, rerr);
-	return -EIO;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int aufs_unlink(struct inode *dir, struct dentry *dentry)
-{
-	int err;
-	aufs_bindex_t bwh, bindex, bstart;
-	struct inode *inode, *h_dir, *delegated;
-	struct dentry *parent, *wh_dentry;
-	/* to reuduce stack size */
-	struct {
-		struct au_dtime dt;
-		struct au_pin pin;
-		struct path h_path;
-	} *a;
-
-	IMustLock(dir);
-
-	err = -ENOMEM;
-	a = kmalloc(sizeof(*a), GFP_NOFS);
-	if (unlikely(!a))
-		goto out;
-
-	err = aufs_read_lock(dentry, AuLock_DW | AuLock_GEN);
-	if (unlikely(err))
-		goto out_free;
-	err = au_d_hashed_positive(dentry);
-	if (unlikely(err))
-		goto out_unlock;
-	inode = d_inode(dentry);
-	IMustLock(inode);
-	err = -EISDIR;
-	if (unlikely(d_is_dir(dentry)))
-		goto out_unlock; /* possible? */
-
-	bstart = au_dbstart(dentry);
-	bwh = au_dbwh(dentry);
-	bindex = -1;
-	parent = dentry->d_parent; /* dir inode is locked */
-	di_write_lock_parent(parent);
-	wh_dentry = lock_hdir_create_wh(dentry, /*isdir*/0, &bindex, &a->dt,
-					&a->pin);
-	err = PTR_ERR(wh_dentry);
-	if (IS_ERR(wh_dentry))
-		goto out_parent;
-
-	a->h_path.mnt = au_sbr_mnt(dentry->d_sb, bstart);
-	a->h_path.dentry = au_h_dptr(dentry, bstart);
-	dget(a->h_path.dentry);
-	if (bindex == bstart) {
-		h_dir = au_pinned_h_dir(&a->pin);
-		delegated = NULL;
-		err = vfsub_unlink(h_dir, &a->h_path, &delegated, /*force*/0);
-		if (unlikely(err == -EWOULDBLOCK)) {
-			pr_warn("cannot retry for NFSv4 delegation"
-				" for an internal unlink\n");
-			iput(delegated);
-		}
-	} else {
-		/* dir inode is locked */
-		h_dir = d_inode(wh_dentry->d_parent);
-		IMustLock(h_dir);
-		err = 0;
-	}
-
-	if (!err) {
-		vfsub_drop_nlink(inode);
-		epilog(dir, dentry, bindex);
-
-		/* update target timestamps */
-		if (bindex == bstart) {
-			vfsub_update_h_iattr(&a->h_path, /*did*/NULL);
-			/*ignore*/
-			inode->i_ctime = d_inode(a->h_path.dentry)->i_ctime;
-		} else
-			/* todo: this timestamp may be reverted later */
-			inode->i_ctime = h_dir->i_ctime;
-		goto out_unpin; /* success */
-	}
-
-	/* revert */
-	if (wh_dentry) {
-		int rerr;
-
-		rerr = do_revert(err, dir, bindex, bwh, wh_dentry, dentry,
-				 &a->dt);
-		if (rerr)
-			err = rerr;
-	}
-
-out_unpin:
-	au_unpin(&a->pin);
-	dput(wh_dentry);
-	dput(a->h_path.dentry);
-out_parent:
-	di_write_unlock(parent);
-out_unlock:
-	aufs_read_unlock(dentry, AuLock_DW);
-out_free:
-	kfree(a);
-out:
-	return err;
-}
-
-int aufs_rmdir(struct inode *dir, struct dentry *dentry)
-{
-	int err, rmdir_later;
-	aufs_bindex_t bwh, bindex, bstart;
-	struct inode *inode;
-	struct dentry *parent, *wh_dentry, *h_dentry;
-	struct au_whtmp_rmdir *args;
-	/* to reuduce stack size */
-	struct {
-		struct au_dtime dt;
-		struct au_pin pin;
-	} *a;
-
-	IMustLock(dir);
-
-	err = -ENOMEM;
-	a = kmalloc(sizeof(*a), GFP_NOFS);
-	if (unlikely(!a))
-		goto out;
-
-	err = aufs_read_lock(dentry, AuLock_DW | AuLock_FLUSH | AuLock_GEN);
-	if (unlikely(err))
-		goto out_free;
-	err = au_alive_dir(dentry);
-	if (unlikely(err))
-		goto out_unlock;
-	inode = d_inode(dentry);
-	IMustLock(inode);
-	err = -ENOTDIR;
-	if (unlikely(!d_is_dir(dentry)))
-		goto out_unlock; /* possible? */
-
-	err = -ENOMEM;
-	args = au_whtmp_rmdir_alloc(dir->i_sb, GFP_NOFS);
-	if (unlikely(!args))
-		goto out_unlock;
-
-	parent = dentry->d_parent; /* dir inode is locked */
-	di_write_lock_parent(parent);
-	err = au_test_empty(dentry, &args->whlist);
-	if (unlikely(err))
-		goto out_parent;
-
-	bstart = au_dbstart(dentry);
-	bwh = au_dbwh(dentry);
-	bindex = -1;
-	wh_dentry = lock_hdir_create_wh(dentry, /*isdir*/1, &bindex, &a->dt,
-					&a->pin);
-	err = PTR_ERR(wh_dentry);
-	if (IS_ERR(wh_dentry))
-		goto out_parent;
-
-	h_dentry = au_h_dptr(dentry, bstart);
-	dget(h_dentry);
-	rmdir_later = 0;
-	if (bindex == bstart) {
-		err = renwh_and_rmdir(dentry, bstart, &args->whlist, dir);
-		if (err > 0) {
-			rmdir_later = err;
-			err = 0;
-		}
-	} else {
-		/* stop monitoring */
-		au_hn_free(au_hi(inode, bstart));
-
-		/* dir inode is locked */
-		IMustLock(d_inode(wh_dentry->d_parent));
-		err = 0;
-	}
-
-	if (!err) {
-		vfsub_dead_dir(inode);
-		au_set_dbdiropq(dentry, -1);
-		epilog(dir, dentry, bindex);
-
-		if (rmdir_later) {
-			au_whtmp_kick_rmdir(dir, bstart, h_dentry, args);
-			args = NULL;
-		}
-
-		goto out_unpin; /* success */
-	}
-
-	/* revert */
-	AuLabel(revert);
-	if (wh_dentry) {
-		int rerr;
-
-		rerr = do_revert(err, dir, bindex, bwh, wh_dentry, dentry,
-				 &a->dt);
-		if (rerr)
-			err = rerr;
-	}
-
-out_unpin:
-	au_unpin(&a->pin);
-	dput(wh_dentry);
-	dput(h_dentry);
-out_parent:
-	di_write_unlock(parent);
-	if (args)
-		au_whtmp_rmdir_free(args);
-out_unlock:
-	aufs_read_unlock(dentry, AuLock_DW);
-out_free:
-	kfree(a);
-out:
-	AuTraceErr(err);
-	return err;
-}
diff --git a/fs/aufs/i_op_ren.c b/fs/aufs/i_op_ren.c
deleted file mode 100644
index c880144b5..000000000
--- a/fs/aufs/i_op_ren.c
+++ /dev/null
@@ -1,1002 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * inode operation (rename entry)
- * todo: this is crazy monster
- */
-
-#include "aufs.h"
-
-enum { AuSRC, AuDST, AuSrcDst };
-enum { AuPARENT, AuCHILD, AuParentChild };
-
-#define AuRen_ISDIR	1
-#define AuRen_ISSAMEDIR	(1 << 1)
-#define AuRen_WHSRC	(1 << 2)
-#define AuRen_WHDST	(1 << 3)
-#define AuRen_MNT_WRITE	(1 << 4)
-#define AuRen_DT_DSTDIR	(1 << 5)
-#define AuRen_DIROPQ	(1 << 6)
-#define au_ftest_ren(flags, name)	((flags) & AuRen_##name)
-#define au_fset_ren(flags, name) \
-	do { (flags) |= AuRen_##name; } while (0)
-#define au_fclr_ren(flags, name) \
-	do { (flags) &= ~AuRen_##name; } while (0)
-
-struct au_ren_args {
-	struct {
-		struct dentry *dentry, *h_dentry, *parent, *h_parent,
-			*wh_dentry;
-		struct inode *dir, *inode;
-		struct au_hinode *hdir;
-		struct au_dtime dt[AuParentChild];
-		aufs_bindex_t bstart;
-	} sd[AuSrcDst];
-
-#define src_dentry	sd[AuSRC].dentry
-#define src_dir		sd[AuSRC].dir
-#define src_inode	sd[AuSRC].inode
-#define src_h_dentry	sd[AuSRC].h_dentry
-#define src_parent	sd[AuSRC].parent
-#define src_h_parent	sd[AuSRC].h_parent
-#define src_wh_dentry	sd[AuSRC].wh_dentry
-#define src_hdir	sd[AuSRC].hdir
-#define src_h_dir	sd[AuSRC].hdir->hi_inode
-#define src_dt		sd[AuSRC].dt
-#define src_bstart	sd[AuSRC].bstart
-
-#define dst_dentry	sd[AuDST].dentry
-#define dst_dir		sd[AuDST].dir
-#define dst_inode	sd[AuDST].inode
-#define dst_h_dentry	sd[AuDST].h_dentry
-#define dst_parent	sd[AuDST].parent
-#define dst_h_parent	sd[AuDST].h_parent
-#define dst_wh_dentry	sd[AuDST].wh_dentry
-#define dst_hdir	sd[AuDST].hdir
-#define dst_h_dir	sd[AuDST].hdir->hi_inode
-#define dst_dt		sd[AuDST].dt
-#define dst_bstart	sd[AuDST].bstart
-
-	struct dentry *h_trap;
-	struct au_branch *br;
-	struct au_hinode *src_hinode;
-	struct path h_path;
-	struct au_nhash whlist;
-	aufs_bindex_t btgt, src_bwh, src_bdiropq;
-
-	unsigned int flags;
-
-	struct au_whtmp_rmdir *thargs;
-	struct dentry *h_dst;
-};
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * functions for reverting.
- * when an error happened in a single rename systemcall, we should revert
- * everything as if nothing happened.
- * we don't need to revert the copied-up/down the parent dir since they are
- * harmless.
- */
-
-#define RevertFailure(fmt, ...) do { \
-	AuIOErr("revert failure: " fmt " (%d, %d)\n", \
-		##__VA_ARGS__, err, rerr); \
-	err = -EIO; \
-} while (0)
-
-static void au_ren_rev_diropq(int err, struct au_ren_args *a)
-{
-	int rerr;
-
-	au_hn_imtx_lock_nested(a->src_hinode, AuLsc_I_CHILD);
-	rerr = au_diropq_remove(a->src_dentry, a->btgt);
-	au_hn_imtx_unlock(a->src_hinode);
-	au_set_dbdiropq(a->src_dentry, a->src_bdiropq);
-	if (rerr)
-		RevertFailure("remove diropq %pd", a->src_dentry);
-}
-
-static void au_ren_rev_rename(int err, struct au_ren_args *a)
-{
-	int rerr;
-	struct inode *delegated;
-
-	a->h_path.dentry = vfsub_lkup_one(&a->src_dentry->d_name,
-					  a->src_h_parent);
-	rerr = PTR_ERR(a->h_path.dentry);
-	if (IS_ERR(a->h_path.dentry)) {
-		RevertFailure("lkup one %pd", a->src_dentry);
-		return;
-	}
-
-	delegated = NULL;
-	rerr = vfsub_rename(a->dst_h_dir,
-			    au_h_dptr(a->src_dentry, a->btgt),
-			    a->src_h_dir, &a->h_path, &delegated);
-	if (unlikely(rerr == -EWOULDBLOCK)) {
-		pr_warn("cannot retry for NFSv4 delegation"
-			" for an internal rename\n");
-		iput(delegated);
-	}
-	d_drop(a->h_path.dentry);
-	dput(a->h_path.dentry);
-	/* au_set_h_dptr(a->src_dentry, a->btgt, NULL); */
-	if (rerr)
-		RevertFailure("rename %pd", a->src_dentry);
-}
-
-static void au_ren_rev_whtmp(int err, struct au_ren_args *a)
-{
-	int rerr;
-	struct inode *delegated;
-
-	a->h_path.dentry = vfsub_lkup_one(&a->dst_dentry->d_name,
-					  a->dst_h_parent);
-	rerr = PTR_ERR(a->h_path.dentry);
-	if (IS_ERR(a->h_path.dentry)) {
-		RevertFailure("lkup one %pd", a->dst_dentry);
-		return;
-	}
-	if (d_is_positive(a->h_path.dentry)) {
-		d_drop(a->h_path.dentry);
-		dput(a->h_path.dentry);
-		return;
-	}
-
-	delegated = NULL;
-	rerr = vfsub_rename(a->dst_h_dir, a->h_dst, a->dst_h_dir, &a->h_path,
-			    &delegated);
-	if (unlikely(rerr == -EWOULDBLOCK)) {
-		pr_warn("cannot retry for NFSv4 delegation"
-			" for an internal rename\n");
-		iput(delegated);
-	}
-	d_drop(a->h_path.dentry);
-	dput(a->h_path.dentry);
-	if (!rerr)
-		au_set_h_dptr(a->dst_dentry, a->btgt, dget(a->h_dst));
-	else
-		RevertFailure("rename %pd", a->h_dst);
-}
-
-static void au_ren_rev_whsrc(int err, struct au_ren_args *a)
-{
-	int rerr;
-
-	a->h_path.dentry = a->src_wh_dentry;
-	rerr = au_wh_unlink_dentry(a->src_h_dir, &a->h_path, a->src_dentry);
-	au_set_dbwh(a->src_dentry, a->src_bwh);
-	if (rerr)
-		RevertFailure("unlink %pd", a->src_wh_dentry);
-}
-#undef RevertFailure
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * when we have to copyup the renaming entry, do it with the rename-target name
- * in order to minimize the cost (the later actual rename is unnecessary).
- * otherwise rename it on the target branch.
- */
-static int au_ren_or_cpup(struct au_ren_args *a)
-{
-	int err;
-	struct dentry *d;
-	struct inode *delegated;
-
-	d = a->src_dentry;
-	if (au_dbstart(d) == a->btgt) {
-		a->h_path.dentry = a->dst_h_dentry;
-		if (au_ftest_ren(a->flags, DIROPQ)
-		    && au_dbdiropq(d) == a->btgt)
-			au_fclr_ren(a->flags, DIROPQ);
-		AuDebugOn(au_dbstart(d) != a->btgt);
-		delegated = NULL;
-		err = vfsub_rename(a->src_h_dir, au_h_dptr(d, a->btgt),
-				   a->dst_h_dir, &a->h_path, &delegated);
-		if (unlikely(err == -EWOULDBLOCK)) {
-			pr_warn("cannot retry for NFSv4 delegation"
-				" for an internal rename\n");
-			iput(delegated);
-		}
-	} else
-		BUG();
-
-	if (!err && a->h_dst)
-		/* it will be set to dinfo later */
-		dget(a->h_dst);
-
-	return err;
-}
-
-/* cf. aufs_rmdir() */
-static int au_ren_del_whtmp(struct au_ren_args *a)
-{
-	int err;
-	struct inode *dir;
-
-	dir = a->dst_dir;
-	SiMustAnyLock(dir->i_sb);
-	if (!au_nhash_test_longer_wh(&a->whlist, a->btgt,
-				     au_sbi(dir->i_sb)->si_dirwh)
-	    || au_test_fs_remote(a->h_dst->d_sb)) {
-		err = au_whtmp_rmdir(dir, a->btgt, a->h_dst, &a->whlist);
-		if (unlikely(err))
-			pr_warn("failed removing whtmp dir %pd (%d), "
-				"ignored.\n", a->h_dst, err);
-	} else {
-		au_nhash_wh_free(&a->thargs->whlist);
-		a->thargs->whlist = a->whlist;
-		a->whlist.nh_num = 0;
-		au_whtmp_kick_rmdir(dir, a->btgt, a->h_dst, a->thargs);
-		dput(a->h_dst);
-		a->thargs = NULL;
-	}
-
-	return 0;
-}
-
-/* make it 'opaque' dir. */
-static int au_ren_diropq(struct au_ren_args *a)
-{
-	int err;
-	struct dentry *diropq;
-
-	err = 0;
-	a->src_bdiropq = au_dbdiropq(a->src_dentry);
-	a->src_hinode = au_hi(a->src_inode, a->btgt);
-	au_hn_imtx_lock_nested(a->src_hinode, AuLsc_I_CHILD);
-	diropq = au_diropq_create(a->src_dentry, a->btgt);
-	au_hn_imtx_unlock(a->src_hinode);
-	if (IS_ERR(diropq))
-		err = PTR_ERR(diropq);
-	else
-		dput(diropq);
-
-	return err;
-}
-
-static int do_rename(struct au_ren_args *a)
-{
-	int err;
-	struct dentry *d, *h_d;
-
-	/* prepare workqueue args for asynchronous rmdir */
-	h_d = a->dst_h_dentry;
-	if (au_ftest_ren(a->flags, ISDIR) && d_is_positive(h_d)) {
-		err = -ENOMEM;
-		a->thargs = au_whtmp_rmdir_alloc(a->src_dentry->d_sb, GFP_NOFS);
-		if (unlikely(!a->thargs))
-			goto out;
-		a->h_dst = dget(h_d);
-	}
-
-	/* create whiteout for src_dentry */
-	if (au_ftest_ren(a->flags, WHSRC)) {
-		a->src_bwh = au_dbwh(a->src_dentry);
-		AuDebugOn(a->src_bwh >= 0);
-		a->src_wh_dentry
-			= au_wh_create(a->src_dentry, a->btgt, a->src_h_parent);
-		err = PTR_ERR(a->src_wh_dentry);
-		if (IS_ERR(a->src_wh_dentry))
-			goto out_thargs;
-	}
-
-	/* lookup whiteout for dentry */
-	if (au_ftest_ren(a->flags, WHDST)) {
-		h_d = au_wh_lkup(a->dst_h_parent, &a->dst_dentry->d_name,
-				 a->br);
-		err = PTR_ERR(h_d);
-		if (IS_ERR(h_d))
-			goto out_whsrc;
-		if (d_is_negative(h_d))
-			dput(h_d);
-		else
-			a->dst_wh_dentry = h_d;
-	}
-
-	/* rename dentry to tmpwh */
-	if (a->thargs) {
-		err = au_whtmp_ren(a->dst_h_dentry, a->br);
-		if (unlikely(err))
-			goto out_whdst;
-
-		d = a->dst_dentry;
-		au_set_h_dptr(d, a->btgt, NULL);
-		err = au_lkup_neg(d, a->btgt, /*wh*/0);
-		if (unlikely(err))
-			goto out_whtmp;
-		a->dst_h_dentry = au_h_dptr(d, a->btgt);
-	}
-
-	BUG_ON(d_is_positive(a->dst_h_dentry) && a->src_bstart != a->btgt);
-
-	/* rename by vfs_rename or cpup */
-	d = a->dst_dentry;
-	if (au_ftest_ren(a->flags, ISDIR)
-	    && (a->dst_wh_dentry
-		|| au_dbdiropq(d) == a->btgt
-		/* hide the lower to keep xino */
-		|| a->btgt < au_dbend(d)
-		|| au_opt_test(au_mntflags(d->d_sb), ALWAYS_DIROPQ)))
-		au_fset_ren(a->flags, DIROPQ);
-	err = au_ren_or_cpup(a);
-	if (unlikely(err))
-		/* leave the copied-up one */
-		goto out_whtmp;
-
-	/* make dir opaque */
-	if (au_ftest_ren(a->flags, DIROPQ)) {
-		err = au_ren_diropq(a);
-		if (unlikely(err))
-			goto out_rename;
-	}
-
-	/* update target timestamps */
-	AuDebugOn(au_dbstart(a->src_dentry) != a->btgt);
-	a->h_path.dentry = au_h_dptr(a->src_dentry, a->btgt);
-	vfsub_update_h_iattr(&a->h_path, /*did*/NULL); /*ignore*/
-	a->src_inode->i_ctime = d_inode(a->h_path.dentry)->i_ctime;
-
-	/* remove whiteout for dentry */
-	if (a->dst_wh_dentry) {
-		a->h_path.dentry = a->dst_wh_dentry;
-		err = au_wh_unlink_dentry(a->dst_h_dir, &a->h_path,
-					  a->dst_dentry);
-		if (unlikely(err))
-			goto out_diropq;
-	}
-
-	/* remove whtmp */
-	if (a->thargs)
-		au_ren_del_whtmp(a); /* ignore this error */
-
-	au_fhsm_wrote(a->src_dentry->d_sb, a->btgt, /*force*/0);
-	err = 0;
-	goto out_success;
-
-out_diropq:
-	if (au_ftest_ren(a->flags, DIROPQ))
-		au_ren_rev_diropq(err, a);
-out_rename:
-	au_ren_rev_rename(err, a);
-	dput(a->h_dst);
-out_whtmp:
-	if (a->thargs)
-		au_ren_rev_whtmp(err, a);
-out_whdst:
-	dput(a->dst_wh_dentry);
-	a->dst_wh_dentry = NULL;
-out_whsrc:
-	if (a->src_wh_dentry)
-		au_ren_rev_whsrc(err, a);
-out_success:
-	dput(a->src_wh_dentry);
-	dput(a->dst_wh_dentry);
-out_thargs:
-	if (a->thargs) {
-		dput(a->h_dst);
-		au_whtmp_rmdir_free(a->thargs);
-		a->thargs = NULL;
-	}
-out:
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * test if @dentry dir can be rename destination or not.
- * success means, it is a logically empty dir.
- */
-static int may_rename_dstdir(struct dentry *dentry, struct au_nhash *whlist)
-{
-	return au_test_empty(dentry, whlist);
-}
-
-/*
- * test if @dentry dir can be rename source or not.
- * if it can, return 0 and @children is filled.
- * success means,
- * - it is a logically empty dir.
- * - or, it exists on writable branch and has no children including whiteouts
- *       on the lower branch.
- */
-static int may_rename_srcdir(struct dentry *dentry, aufs_bindex_t btgt)
-{
-	int err;
-	unsigned int rdhash;
-	aufs_bindex_t bstart;
-
-	bstart = au_dbstart(dentry);
-	if (bstart != btgt) {
-		struct au_nhash whlist;
-
-		SiMustAnyLock(dentry->d_sb);
-		rdhash = au_sbi(dentry->d_sb)->si_rdhash;
-		if (!rdhash)
-			rdhash = au_rdhash_est(au_dir_size(/*file*/NULL,
-							   dentry));
-		err = au_nhash_alloc(&whlist, rdhash, GFP_NOFS);
-		if (unlikely(err))
-			goto out;
-		err = au_test_empty(dentry, &whlist);
-		au_nhash_wh_free(&whlist);
-		goto out;
-	}
-
-	if (bstart == au_dbtaildir(dentry))
-		return 0; /* success */
-
-	err = au_test_empty_lower(dentry);
-
-out:
-	if (err == -ENOTEMPTY) {
-		AuWarn1("renaming dir who has child(ren) on multiple branches,"
-			" is not supported\n");
-		err = -EXDEV;
-	}
-	return err;
-}
-
-/* side effect: sets whlist and h_dentry */
-static int au_ren_may_dir(struct au_ren_args *a)
-{
-	int err;
-	unsigned int rdhash;
-	struct dentry *d;
-
-	d = a->dst_dentry;
-	SiMustAnyLock(d->d_sb);
-
-	err = 0;
-	if (au_ftest_ren(a->flags, ISDIR) && a->dst_inode) {
-		rdhash = au_sbi(d->d_sb)->si_rdhash;
-		if (!rdhash)
-			rdhash = au_rdhash_est(au_dir_size(/*file*/NULL, d));
-		err = au_nhash_alloc(&a->whlist, rdhash, GFP_NOFS);
-		if (unlikely(err))
-			goto out;
-
-		au_set_dbstart(d, a->dst_bstart);
-		err = may_rename_dstdir(d, &a->whlist);
-		au_set_dbstart(d, a->btgt);
-	}
-	a->dst_h_dentry = au_h_dptr(d, au_dbstart(d));
-	if (unlikely(err))
-		goto out;
-
-	d = a->src_dentry;
-	a->src_h_dentry = au_h_dptr(d, au_dbstart(d));
-	if (au_ftest_ren(a->flags, ISDIR)) {
-		err = may_rename_srcdir(d, a->btgt);
-		if (unlikely(err)) {
-			au_nhash_wh_free(&a->whlist);
-			a->whlist.nh_num = 0;
-		}
-	}
-out:
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * simple tests for rename.
- * following the checks in vfs, plus the parent-child relationship.
- */
-static int au_may_ren(struct au_ren_args *a)
-{
-	int err, isdir;
-	struct inode *h_inode;
-
-	if (a->src_bstart == a->btgt) {
-		err = au_may_del(a->src_dentry, a->btgt, a->src_h_parent,
-				 au_ftest_ren(a->flags, ISDIR));
-		if (unlikely(err))
-			goto out;
-		err = -EINVAL;
-		if (unlikely(a->src_h_dentry == a->h_trap))
-			goto out;
-	}
-
-	err = 0;
-	if (a->dst_bstart != a->btgt)
-		goto out;
-
-	err = -ENOTEMPTY;
-	if (unlikely(a->dst_h_dentry == a->h_trap))
-		goto out;
-
-	err = -EIO;
-	isdir = !!au_ftest_ren(a->flags, ISDIR);
-	if (d_really_is_negative(a->dst_dentry)) {
-		if (d_is_negative(a->dst_h_dentry))
-			err = au_may_add(a->dst_dentry, a->btgt,
-					 a->dst_h_parent, isdir);
-	} else {
-		if (unlikely(d_is_negative(a->dst_h_dentry)))
-			goto out;
-		h_inode = d_inode(a->dst_h_dentry);
-		if (h_inode->i_nlink)
-			err = au_may_del(a->dst_dentry, a->btgt,
-					 a->dst_h_parent, isdir);
-	}
-
-out:
-	if (unlikely(err == -ENOENT || err == -EEXIST))
-		err = -EIO;
-	AuTraceErr(err);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * locking order
- * (VFS)
- * - src_dir and dir by lock_rename()
- * - inode if exitsts
- * (aufs)
- * - lock all
- *   + src_dentry and dentry by aufs_read_and_write_lock2() which calls,
- *     + si_read_lock
- *     + di_write_lock2_child()
- *       + di_write_lock_child()
- *	   + ii_write_lock_child()
- *       + di_write_lock_child2()
- *	   + ii_write_lock_child2()
- *     + src_parent and parent
- *       + di_write_lock_parent()
- *	   + ii_write_lock_parent()
- *       + di_write_lock_parent2()
- *	   + ii_write_lock_parent2()
- *   + lower src_dir and dir by vfsub_lock_rename()
- *   + verify the every relationships between child and parent. if any
- *     of them failed, unlock all and return -EBUSY.
- */
-static void au_ren_unlock(struct au_ren_args *a)
-{
-	vfsub_unlock_rename(a->src_h_parent, a->src_hdir,
-			    a->dst_h_parent, a->dst_hdir);
-	if (au_ftest_ren(a->flags, MNT_WRITE))
-		vfsub_mnt_drop_write(au_br_mnt(a->br));
-}
-
-static int au_ren_lock(struct au_ren_args *a)
-{
-	int err;
-	unsigned int udba;
-
-	err = 0;
-	a->src_h_parent = au_h_dptr(a->src_parent, a->btgt);
-	a->src_hdir = au_hi(a->src_dir, a->btgt);
-	a->dst_h_parent = au_h_dptr(a->dst_parent, a->btgt);
-	a->dst_hdir = au_hi(a->dst_dir, a->btgt);
-
-	err = vfsub_mnt_want_write(au_br_mnt(a->br));
-	if (unlikely(err))
-		goto out;
-	au_fset_ren(a->flags, MNT_WRITE);
-	a->h_trap = vfsub_lock_rename(a->src_h_parent, a->src_hdir,
-				      a->dst_h_parent, a->dst_hdir);
-	udba = au_opt_udba(a->src_dentry->d_sb);
-	if (unlikely(a->src_hdir->hi_inode != d_inode(a->src_h_parent)
-		     || a->dst_hdir->hi_inode != d_inode(a->dst_h_parent)))
-		err = au_busy_or_stale();
-	if (!err && au_dbstart(a->src_dentry) == a->btgt)
-		err = au_h_verify(a->src_h_dentry, udba,
-				  d_inode(a->src_h_parent), a->src_h_parent,
-				  a->br);
-	if (!err && au_dbstart(a->dst_dentry) == a->btgt)
-		err = au_h_verify(a->dst_h_dentry, udba,
-				  d_inode(a->dst_h_parent), a->dst_h_parent,
-				  a->br);
-	if (!err)
-		goto out; /* success */
-
-	err = au_busy_or_stale();
-	au_ren_unlock(a);
-
-out:
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static void au_ren_refresh_dir(struct au_ren_args *a)
-{
-	struct inode *dir;
-
-	dir = a->dst_dir;
-	dir->i_version++;
-	if (au_ftest_ren(a->flags, ISDIR)) {
-		/* is this updating defined in POSIX? */
-		au_cpup_attr_timesizes(a->src_inode);
-		au_cpup_attr_nlink(dir, /*force*/1);
-	}
-
-	au_dir_ts(dir, a->btgt);
-
-	if (au_ftest_ren(a->flags, ISSAMEDIR))
-		return;
-
-	dir = a->src_dir;
-	dir->i_version++;
-	if (au_ftest_ren(a->flags, ISDIR))
-		au_cpup_attr_nlink(dir, /*force*/1);
-	au_dir_ts(dir, a->btgt);
-}
-
-static void au_ren_refresh(struct au_ren_args *a)
-{
-	aufs_bindex_t bend, bindex;
-	struct dentry *d, *h_d;
-	struct inode *i, *h_i;
-	struct super_block *sb;
-
-	d = a->dst_dentry;
-	d_drop(d);
-	if (a->h_dst)
-		/* already dget-ed by au_ren_or_cpup() */
-		au_set_h_dptr(d, a->btgt, a->h_dst);
-
-	i = a->dst_inode;
-	if (i) {
-		if (!au_ftest_ren(a->flags, ISDIR))
-			vfsub_drop_nlink(i);
-		else {
-			vfsub_dead_dir(i);
-			au_cpup_attr_timesizes(i);
-		}
-		au_update_dbrange(d, /*do_put_zero*/1);
-	} else {
-		bend = a->btgt;
-		for (bindex = au_dbstart(d); bindex < bend; bindex++)
-			au_set_h_dptr(d, bindex, NULL);
-		bend = au_dbend(d);
-		for (bindex = a->btgt + 1; bindex <= bend; bindex++)
-			au_set_h_dptr(d, bindex, NULL);
-		au_update_dbrange(d, /*do_put_zero*/0);
-	}
-
-	d = a->src_dentry;
-	au_set_dbwh(d, -1);
-	bend = au_dbend(d);
-	for (bindex = a->btgt + 1; bindex <= bend; bindex++) {
-		h_d = au_h_dptr(d, bindex);
-		if (h_d)
-			au_set_h_dptr(d, bindex, NULL);
-	}
-	au_set_dbend(d, a->btgt);
-
-	sb = d->d_sb;
-	i = a->src_inode;
-	if (au_opt_test(au_mntflags(sb), PLINK) && au_plink_test(i))
-		return; /* success */
-
-	bend = au_ibend(i);
-	for (bindex = a->btgt + 1; bindex <= bend; bindex++) {
-		h_i = au_h_iptr(i, bindex);
-		if (h_i) {
-			au_xino_write(sb, bindex, h_i->i_ino, /*ino*/0);
-			/* ignore this error */
-			au_set_h_iptr(i, bindex, NULL, 0);
-		}
-	}
-	au_set_ibend(i, a->btgt);
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* mainly for link(2) and rename(2) */
-int au_wbr(struct dentry *dentry, aufs_bindex_t btgt)
-{
-	aufs_bindex_t bdiropq, bwh;
-	struct dentry *parent;
-	struct au_branch *br;
-
-	parent = dentry->d_parent;
-	IMustLock(d_inode(parent)); /* dir is locked */
-
-	bdiropq = au_dbdiropq(parent);
-	bwh = au_dbwh(dentry);
-	br = au_sbr(dentry->d_sb, btgt);
-	if (au_br_rdonly(br)
-	    || (0 <= bdiropq && bdiropq < btgt)
-	    || (0 <= bwh && bwh < btgt))
-		btgt = -1;
-
-	AuDbg("btgt %d\n", btgt);
-	return btgt;
-}
-
-/* sets src_bstart, dst_bstart and btgt */
-static int au_ren_wbr(struct au_ren_args *a)
-{
-	int err;
-	struct au_wr_dir_args wr_dir_args = {
-		/* .force_btgt	= -1, */
-		.flags		= AuWrDir_ADD_ENTRY
-	};
-
-	a->src_bstart = au_dbstart(a->src_dentry);
-	a->dst_bstart = au_dbstart(a->dst_dentry);
-	if (au_ftest_ren(a->flags, ISDIR))
-		au_fset_wrdir(wr_dir_args.flags, ISDIR);
-	wr_dir_args.force_btgt = a->src_bstart;
-	if (a->dst_inode && a->dst_bstart < a->src_bstart)
-		wr_dir_args.force_btgt = a->dst_bstart;
-	wr_dir_args.force_btgt = au_wbr(a->dst_dentry, wr_dir_args.force_btgt);
-	err = au_wr_dir(a->dst_dentry, a->src_dentry, &wr_dir_args);
-	a->btgt = err;
-
-	return err;
-}
-
-static void au_ren_dt(struct au_ren_args *a)
-{
-	a->h_path.dentry = a->src_h_parent;
-	au_dtime_store(a->src_dt + AuPARENT, a->src_parent, &a->h_path);
-	if (!au_ftest_ren(a->flags, ISSAMEDIR)) {
-		a->h_path.dentry = a->dst_h_parent;
-		au_dtime_store(a->dst_dt + AuPARENT, a->dst_parent, &a->h_path);
-	}
-
-	au_fclr_ren(a->flags, DT_DSTDIR);
-	if (!au_ftest_ren(a->flags, ISDIR))
-		return;
-
-	a->h_path.dentry = a->src_h_dentry;
-	au_dtime_store(a->src_dt + AuCHILD, a->src_dentry, &a->h_path);
-	if (d_is_positive(a->dst_h_dentry)) {
-		au_fset_ren(a->flags, DT_DSTDIR);
-		a->h_path.dentry = a->dst_h_dentry;
-		au_dtime_store(a->dst_dt + AuCHILD, a->dst_dentry, &a->h_path);
-	}
-}
-
-static void au_ren_rev_dt(int err, struct au_ren_args *a)
-{
-	struct dentry *h_d;
-	struct mutex *h_mtx;
-
-	au_dtime_revert(a->src_dt + AuPARENT);
-	if (!au_ftest_ren(a->flags, ISSAMEDIR))
-		au_dtime_revert(a->dst_dt + AuPARENT);
-
-	if (au_ftest_ren(a->flags, ISDIR) && err != -EIO) {
-		h_d = a->src_dt[AuCHILD].dt_h_path.dentry;
-		h_mtx = &d_inode(h_d)->i_mutex;
-		mutex_lock_nested(h_mtx, AuLsc_I_CHILD);
-		au_dtime_revert(a->src_dt + AuCHILD);
-		mutex_unlock(h_mtx);
-
-		if (au_ftest_ren(a->flags, DT_DSTDIR)) {
-			h_d = a->dst_dt[AuCHILD].dt_h_path.dentry;
-			h_mtx = &d_inode(h_d)->i_mutex;
-			mutex_lock_nested(h_mtx, AuLsc_I_CHILD);
-			au_dtime_revert(a->dst_dt + AuCHILD);
-			mutex_unlock(h_mtx);
-		}
-	}
-}
-
-/* ---------------------------------------------------------------------- */
-
-int aufs_rename(struct inode *_src_dir, struct dentry *_src_dentry,
-		struct inode *_dst_dir, struct dentry *_dst_dentry)
-{
-	int err, flags;
-	/* reduce stack space */
-	struct au_ren_args *a;
-
-	AuDbg("%pd, %pd\n", _src_dentry, _dst_dentry);
-	IMustLock(_src_dir);
-	IMustLock(_dst_dir);
-
-	err = -ENOMEM;
-	BUILD_BUG_ON(sizeof(*a) > PAGE_SIZE);
-	a = kzalloc(sizeof(*a), GFP_NOFS);
-	if (unlikely(!a))
-		goto out;
-
-	a->src_dir = _src_dir;
-	a->src_dentry = _src_dentry;
-	a->src_inode = NULL;
-	if (d_really_is_positive(a->src_dentry))
-		a->src_inode = d_inode(a->src_dentry);
-	a->src_parent = a->src_dentry->d_parent; /* dir inode is locked */
-	a->dst_dir = _dst_dir;
-	a->dst_dentry = _dst_dentry;
-	a->dst_inode = NULL;
-	if (d_really_is_positive(a->dst_dentry))
-		a->dst_inode = d_inode(a->dst_dentry);
-	a->dst_parent = a->dst_dentry->d_parent; /* dir inode is locked */
-	if (a->dst_inode) {
-		IMustLock(a->dst_inode);
-		au_igrab(a->dst_inode);
-	}
-
-	err = -ENOTDIR;
-	flags = AuLock_FLUSH | AuLock_NOPLM | AuLock_GEN;
-	if (d_is_dir(a->src_dentry)) {
-		au_fset_ren(a->flags, ISDIR);
-		if (unlikely(d_really_is_positive(a->dst_dentry)
-			     && !d_is_dir(a->dst_dentry)))
-			goto out_free;
-		flags |= AuLock_DIRS;
-	}
-	err = aufs_read_and_write_lock2(a->dst_dentry, a->src_dentry, flags);
-	if (unlikely(err))
-		goto out_free;
-
-	err = au_d_hashed_positive(a->src_dentry);
-	if (unlikely(err))
-		goto out_unlock;
-	err = -ENOENT;
-	if (a->dst_inode) {
-		/*
-		 * If it is a dir, VFS unhash dst_dentry before this
-		 * function. It means we cannot rely upon d_unhashed().
-		 */
-		if (unlikely(!a->dst_inode->i_nlink))
-			goto out_unlock;
-		if (!S_ISDIR(a->dst_inode->i_mode)) {
-			err = au_d_hashed_positive(a->dst_dentry);
-			if (unlikely(err))
-				goto out_unlock;
-		} else if (unlikely(IS_DEADDIR(a->dst_inode)))
-			goto out_unlock;
-	} else if (unlikely(d_unhashed(a->dst_dentry)))
-		goto out_unlock;
-
-	/*
-	 * is it possible?
-	 * yes, it happened (in linux-3.3-rcN) but I don't know why.
-	 * there may exist a problem somewhere else.
-	 */
-	err = -EINVAL;
-	if (unlikely(d_inode(a->dst_parent) == d_inode(a->src_dentry)))
-		goto out_unlock;
-
-	au_fset_ren(a->flags, ISSAMEDIR); /* temporary */
-	di_write_lock_parent(a->dst_parent);
-
-	/* which branch we process */
-	err = au_ren_wbr(a);
-	if (unlikely(err < 0))
-		goto out_parent;
-	a->br = au_sbr(a->dst_dentry->d_sb, a->btgt);
-	a->h_path.mnt = au_br_mnt(a->br);
-
-	/* are they available to be renamed */
-	err = au_ren_may_dir(a);
-	if (unlikely(err))
-		goto out_children;
-
-	/* prepare the writable parent dir on the same branch */
-	if (a->dst_bstart == a->btgt) {
-		au_fset_ren(a->flags, WHDST);
-	} else {
-		err = au_cpup_dirs(a->dst_dentry, a->btgt);
-		if (unlikely(err))
-			goto out_children;
-	}
-
-	if (a->src_dir != a->dst_dir) {
-		/*
-		 * this temporary unlock is safe,
-		 * because both dir->i_mutex are locked.
-		 */
-		di_write_unlock(a->dst_parent);
-		di_write_lock_parent(a->src_parent);
-		err = au_wr_dir_need_wh(a->src_dentry,
-					au_ftest_ren(a->flags, ISDIR),
-					&a->btgt);
-		di_write_unlock(a->src_parent);
-		di_write_lock2_parent(a->src_parent, a->dst_parent, /*isdir*/1);
-		au_fclr_ren(a->flags, ISSAMEDIR);
-	} else
-		err = au_wr_dir_need_wh(a->src_dentry,
-					au_ftest_ren(a->flags, ISDIR),
-					&a->btgt);
-	if (unlikely(err < 0))
-		goto out_children;
-	if (err)
-		au_fset_ren(a->flags, WHSRC);
-
-	/* cpup src */
-	if (a->src_bstart != a->btgt) {
-		struct au_pin pin;
-
-		err = au_pin(&pin, a->src_dentry, a->btgt,
-			     au_opt_udba(a->src_dentry->d_sb),
-			     AuPin_DI_LOCKED | AuPin_MNT_WRITE);
-		if (!err) {
-			struct au_cp_generic cpg = {
-				.dentry	= a->src_dentry,
-				.bdst	= a->btgt,
-				.bsrc	= a->src_bstart,
-				.len	= -1,
-				.pin	= &pin,
-				.flags	= AuCpup_DTIME | AuCpup_HOPEN
-			};
-			AuDebugOn(au_dbstart(a->src_dentry) != a->src_bstart);
-			err = au_sio_cpup_simple(&cpg);
-			au_unpin(&pin);
-		}
-		if (unlikely(err))
-			goto out_children;
-		a->src_bstart = a->btgt;
-		a->src_h_dentry = au_h_dptr(a->src_dentry, a->btgt);
-		au_fset_ren(a->flags, WHSRC);
-	}
-
-	/* lock them all */
-	err = au_ren_lock(a);
-	if (unlikely(err))
-		/* leave the copied-up one */
-		goto out_children;
-
-	if (!au_opt_test(au_mntflags(a->dst_dir->i_sb), UDBA_NONE))
-		err = au_may_ren(a);
-	else if (unlikely(a->dst_dentry->d_name.len > AUFS_MAX_NAMELEN))
-		err = -ENAMETOOLONG;
-	if (unlikely(err))
-		goto out_hdir;
-
-	/* store timestamps to be revertible */
-	au_ren_dt(a);
-
-	/* here we go */
-	err = do_rename(a);
-	if (unlikely(err))
-		goto out_dt;
-
-	/* update dir attributes */
-	au_ren_refresh_dir(a);
-
-	/* dput/iput all lower dentries */
-	au_ren_refresh(a);
-
-	goto out_hdir; /* success */
-
-out_dt:
-	au_ren_rev_dt(err, a);
-out_hdir:
-	au_ren_unlock(a);
-out_children:
-	au_nhash_wh_free(&a->whlist);
-	if (err && a->dst_inode && a->dst_bstart != a->btgt) {
-		AuDbg("bstart %d, btgt %d\n", a->dst_bstart, a->btgt);
-		au_set_h_dptr(a->dst_dentry, a->btgt, NULL);
-		au_set_dbstart(a->dst_dentry, a->dst_bstart);
-	}
-out_parent:
-	if (!err)
-		d_move(a->src_dentry, a->dst_dentry);
-	else {
-		au_update_dbstart(a->dst_dentry);
-		if (!a->dst_inode)
-			d_drop(a->dst_dentry);
-	}
-	if (au_ftest_ren(a->flags, ISSAMEDIR))
-		di_write_unlock(a->dst_parent);
-	else
-		di_write_unlock2(a->src_parent, a->dst_parent);
-out_unlock:
-	aufs_read_and_write_unlock2(a->dst_dentry, a->src_dentry);
-out_free:
-	iput(a->dst_inode);
-	if (a->thargs)
-		au_whtmp_rmdir_free(a->thargs);
-	kfree(a);
-out:
-	AuTraceErr(err);
-	return err;
-}
diff --git a/fs/aufs/iinfo.c b/fs/aufs/iinfo.c
deleted file mode 100644
index 67ef672a0..000000000
--- a/fs/aufs/iinfo.c
+++ /dev/null
@@ -1,264 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * inode private data
- */
-
-#include "aufs.h"
-
-struct inode *au_h_iptr(struct inode *inode, aufs_bindex_t bindex)
-{
-	struct inode *h_inode;
-
-	IiMustAnyLock(inode);
-
-	h_inode = au_ii(inode)->ii_hinode[0 + bindex].hi_inode;
-	AuDebugOn(h_inode && atomic_read(&h_inode->i_count) <= 0);
-	return h_inode;
-}
-
-/* todo: hard/soft set? */
-void au_hiput(struct au_hinode *hinode)
-{
-	au_hn_free(hinode);
-	dput(hinode->hi_whdentry);
-	iput(hinode->hi_inode);
-}
-
-unsigned int au_hi_flags(struct inode *inode, int isdir)
-{
-	unsigned int flags;
-	const unsigned int mnt_flags = au_mntflags(inode->i_sb);
-
-	flags = 0;
-	if (au_opt_test(mnt_flags, XINO))
-		au_fset_hi(flags, XINO);
-	if (isdir && au_opt_test(mnt_flags, UDBA_HNOTIFY))
-		au_fset_hi(flags, HNOTIFY);
-	return flags;
-}
-
-void au_set_h_iptr(struct inode *inode, aufs_bindex_t bindex,
-		   struct inode *h_inode, unsigned int flags)
-{
-	struct au_hinode *hinode;
-	struct inode *hi;
-	struct au_iinfo *iinfo = au_ii(inode);
-
-	IiMustWriteLock(inode);
-
-	hinode = iinfo->ii_hinode + bindex;
-	hi = hinode->hi_inode;
-	AuDebugOn(h_inode && atomic_read(&h_inode->i_count) <= 0);
-
-	if (hi)
-		au_hiput(hinode);
-	hinode->hi_inode = h_inode;
-	if (h_inode) {
-		int err;
-		struct super_block *sb = inode->i_sb;
-		struct au_branch *br;
-
-		AuDebugOn(inode->i_mode
-			  && (h_inode->i_mode & S_IFMT)
-			  != (inode->i_mode & S_IFMT));
-		if (bindex == iinfo->ii_bstart)
-			au_cpup_igen(inode, h_inode);
-		br = au_sbr(sb, bindex);
-		hinode->hi_id = br->br_id;
-		if (au_ftest_hi(flags, XINO)) {
-			err = au_xino_write(sb, bindex, h_inode->i_ino,
-					    inode->i_ino);
-			if (unlikely(err))
-				AuIOErr1("failed au_xino_write() %d\n", err);
-		}
-
-		if (au_ftest_hi(flags, HNOTIFY)
-		    && au_br_hnotifyable(br->br_perm)) {
-			err = au_hn_alloc(hinode, inode);
-			if (unlikely(err))
-				AuIOErr1("au_hn_alloc() %d\n", err);
-		}
-	}
-}
-
-void au_set_hi_wh(struct inode *inode, aufs_bindex_t bindex,
-		  struct dentry *h_wh)
-{
-	struct au_hinode *hinode;
-
-	IiMustWriteLock(inode);
-
-	hinode = au_ii(inode)->ii_hinode + bindex;
-	AuDebugOn(hinode->hi_whdentry);
-	hinode->hi_whdentry = h_wh;
-}
-
-void au_update_iigen(struct inode *inode, int half)
-{
-	struct au_iinfo *iinfo;
-	struct au_iigen *iigen;
-	unsigned int sigen;
-
-	sigen = au_sigen(inode->i_sb);
-	iinfo = au_ii(inode);
-	iigen = &iinfo->ii_generation;
-	spin_lock(&iigen->ig_spin);
-	iigen->ig_generation = sigen;
-	if (half)
-		au_ig_fset(iigen->ig_flags, HALF_REFRESHED);
-	else
-		au_ig_fclr(iigen->ig_flags, HALF_REFRESHED);
-	spin_unlock(&iigen->ig_spin);
-}
-
-/* it may be called at remount time, too */
-void au_update_ibrange(struct inode *inode, int do_put_zero)
-{
-	struct au_iinfo *iinfo;
-	aufs_bindex_t bindex, bend;
-
-	iinfo = au_ii(inode);
-	if (!iinfo)
-		return;
-
-	IiMustWriteLock(inode);
-
-	if (do_put_zero && iinfo->ii_bstart >= 0) {
-		for (bindex = iinfo->ii_bstart; bindex <= iinfo->ii_bend;
-		     bindex++) {
-			struct inode *h_i;
-
-			h_i = iinfo->ii_hinode[0 + bindex].hi_inode;
-			if (h_i
-			    && !h_i->i_nlink
-			    && !(h_i->i_state & I_LINKABLE))
-				au_set_h_iptr(inode, bindex, NULL, 0);
-		}
-	}
-
-	iinfo->ii_bstart = -1;
-	iinfo->ii_bend = -1;
-	bend = au_sbend(inode->i_sb);
-	for (bindex = 0; bindex <= bend; bindex++)
-		if (iinfo->ii_hinode[0 + bindex].hi_inode) {
-			iinfo->ii_bstart = bindex;
-			break;
-		}
-	if (iinfo->ii_bstart >= 0)
-		for (bindex = bend; bindex >= iinfo->ii_bstart; bindex--)
-			if (iinfo->ii_hinode[0 + bindex].hi_inode) {
-				iinfo->ii_bend = bindex;
-				break;
-			}
-	AuDebugOn(iinfo->ii_bstart > iinfo->ii_bend);
-}
-
-/* ---------------------------------------------------------------------- */
-
-void au_icntnr_init_once(void *_c)
-{
-	struct au_icntnr *c = _c;
-	struct au_iinfo *iinfo = &c->iinfo;
-	static struct lock_class_key aufs_ii;
-
-	spin_lock_init(&iinfo->ii_generation.ig_spin);
-	au_rw_init(&iinfo->ii_rwsem);
-	au_rw_class(&iinfo->ii_rwsem, &aufs_ii);
-	inode_init_once(&c->vfs_inode);
-}
-
-int au_iinfo_init(struct inode *inode)
-{
-	struct au_iinfo *iinfo;
-	struct super_block *sb;
-	int nbr, i;
-
-	sb = inode->i_sb;
-	iinfo = &(container_of(inode, struct au_icntnr, vfs_inode)->iinfo);
-	nbr = au_sbend(sb) + 1;
-	if (unlikely(nbr <= 0))
-		nbr = 1;
-	iinfo->ii_hinode = kcalloc(nbr, sizeof(*iinfo->ii_hinode), GFP_NOFS);
-	if (iinfo->ii_hinode) {
-		au_ninodes_inc(sb);
-		for (i = 0; i < nbr; i++)
-			iinfo->ii_hinode[i].hi_id = -1;
-
-		iinfo->ii_generation.ig_generation = au_sigen(sb);
-		iinfo->ii_bstart = -1;
-		iinfo->ii_bend = -1;
-		iinfo->ii_vdir = NULL;
-		return 0;
-	}
-	return -ENOMEM;
-}
-
-int au_ii_realloc(struct au_iinfo *iinfo, int nbr)
-{
-	int err, sz;
-	struct au_hinode *hip;
-
-	AuRwMustWriteLock(&iinfo->ii_rwsem);
-
-	err = -ENOMEM;
-	sz = sizeof(*hip) * (iinfo->ii_bend + 1);
-	if (!sz)
-		sz = sizeof(*hip);
-	hip = au_kzrealloc(iinfo->ii_hinode, sz, sizeof(*hip) * nbr, GFP_NOFS);
-	if (hip) {
-		iinfo->ii_hinode = hip;
-		err = 0;
-	}
-
-	return err;
-}
-
-void au_iinfo_fin(struct inode *inode)
-{
-	struct au_iinfo *iinfo;
-	struct au_hinode *hi;
-	struct super_block *sb;
-	aufs_bindex_t bindex, bend;
-	const unsigned char unlinked = !inode->i_nlink;
-
-	iinfo = au_ii(inode);
-	/* bad_inode case */
-	if (!iinfo)
-		return;
-
-	sb = inode->i_sb;
-	au_ninodes_dec(sb);
-	if (si_pid_test(sb))
-		au_xino_delete_inode(inode, unlinked);
-	else {
-		/*
-		 * it is safe to hide the dependency between sbinfo and
-		 * sb->s_umount.
-		 */
-		lockdep_off();
-		si_noflush_read_lock(sb);
-		au_xino_delete_inode(inode, unlinked);
-		si_read_unlock(sb);
-		lockdep_on();
-	}
-
-	if (iinfo->ii_vdir)
-		au_vdir_free(iinfo->ii_vdir);
-
-	bindex = iinfo->ii_bstart;
-	if (bindex >= 0) {
-		hi = iinfo->ii_hinode + bindex;
-		bend = iinfo->ii_bend;
-		while (bindex++ <= bend) {
-			if (hi->hi_inode)
-				au_hiput(hi);
-			hi++;
-		}
-	}
-	kfree(iinfo->ii_hinode);
-	iinfo->ii_hinode = NULL;
-	AuRwDestroy(&iinfo->ii_rwsem);
-}
diff --git a/fs/aufs/inode.c b/fs/aufs/inode.c
deleted file mode 100644
index 5a87727ba..000000000
--- a/fs/aufs/inode.c
+++ /dev/null
@@ -1,514 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * inode functions
- */
-
-#include "aufs.h"
-
-struct inode *au_igrab(struct inode *inode)
-{
-	if (inode) {
-		AuDebugOn(!atomic_read(&inode->i_count));
-		ihold(inode);
-	}
-	return inode;
-}
-
-static void au_refresh_hinode_attr(struct inode *inode, int do_version)
-{
-	au_cpup_attr_all(inode, /*force*/0);
-	au_update_iigen(inode, /*half*/1);
-	if (do_version)
-		inode->i_version++;
-}
-
-static int au_ii_refresh(struct inode *inode, int *update)
-{
-	int err, e;
-	umode_t type;
-	aufs_bindex_t bindex, new_bindex;
-	struct super_block *sb;
-	struct au_iinfo *iinfo;
-	struct au_hinode *p, *q, tmp;
-
-	IiMustWriteLock(inode);
-
-	*update = 0;
-	sb = inode->i_sb;
-	type = inode->i_mode & S_IFMT;
-	iinfo = au_ii(inode);
-	err = au_ii_realloc(iinfo, au_sbend(sb) + 1);
-	if (unlikely(err))
-		goto out;
-
-	AuDebugOn(iinfo->ii_bstart < 0);
-	p = iinfo->ii_hinode + iinfo->ii_bstart;
-	for (bindex = iinfo->ii_bstart; bindex <= iinfo->ii_bend;
-	     bindex++, p++) {
-		if (!p->hi_inode)
-			continue;
-
-		AuDebugOn(type != (p->hi_inode->i_mode & S_IFMT));
-		new_bindex = au_br_index(sb, p->hi_id);
-		if (new_bindex == bindex)
-			continue;
-
-		if (new_bindex < 0) {
-			*update = 1;
-			au_hiput(p);
-			p->hi_inode = NULL;
-			continue;
-		}
-
-		if (new_bindex < iinfo->ii_bstart)
-			iinfo->ii_bstart = new_bindex;
-		if (iinfo->ii_bend < new_bindex)
-			iinfo->ii_bend = new_bindex;
-		/* swap two lower inode, and loop again */
-		q = iinfo->ii_hinode + new_bindex;
-		tmp = *q;
-		*q = *p;
-		*p = tmp;
-		if (tmp.hi_inode) {
-			bindex--;
-			p--;
-		}
-	}
-	au_update_ibrange(inode, /*do_put_zero*/0);
-	e = au_dy_irefresh(inode);
-	if (unlikely(e && !err))
-		err = e;
-
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-void au_refresh_iop(struct inode *inode, int force_getattr)
-{
-	int type;
-	struct au_sbinfo *sbi = au_sbi(inode->i_sb);
-	const struct inode_operations *iop
-		= force_getattr ? aufs_iop : sbi->si_iop_array;
-
-	if (inode->i_op == iop)
-		return;
-
-	switch (inode->i_mode & S_IFMT) {
-	case S_IFDIR:
-		type = AuIop_DIR;
-		break;
-	case S_IFLNK:
-		type = AuIop_SYMLINK;
-		break;
-	default:
-		type = AuIop_OTHER;
-		break;
-	}
-
-	inode->i_op = iop + type;
-	/* unnecessary smp_wmb() */
-}
-
-int au_refresh_hinode_self(struct inode *inode)
-{
-	int err, update;
-
-	err = au_ii_refresh(inode, &update);
-	if (!err)
-		au_refresh_hinode_attr(inode, update && S_ISDIR(inode->i_mode));
-
-	AuTraceErr(err);
-	return err;
-}
-
-int au_refresh_hinode(struct inode *inode, struct dentry *dentry)
-{
-	int err, e, update;
-	unsigned int flags;
-	umode_t mode;
-	aufs_bindex_t bindex, bend;
-	unsigned char isdir;
-	struct au_hinode *p;
-	struct au_iinfo *iinfo;
-
-	err = au_ii_refresh(inode, &update);
-	if (unlikely(err))
-		goto out;
-
-	update = 0;
-	iinfo = au_ii(inode);
-	p = iinfo->ii_hinode + iinfo->ii_bstart;
-	mode = (inode->i_mode & S_IFMT);
-	isdir = S_ISDIR(mode);
-	flags = au_hi_flags(inode, isdir);
-	bend = au_dbend(dentry);
-	for (bindex = au_dbstart(dentry); bindex <= bend; bindex++) {
-		struct inode *h_i, *h_inode;
-		struct dentry *h_d;
-
-		h_d = au_h_dptr(dentry, bindex);
-		if (!h_d || d_is_negative(h_d))
-			continue;
-
-		h_inode = d_inode(h_d);
-		AuDebugOn(mode != (h_inode->i_mode & S_IFMT));
-		if (iinfo->ii_bstart <= bindex && bindex <= iinfo->ii_bend) {
-			h_i = au_h_iptr(inode, bindex);
-			if (h_i) {
-				if (h_i == h_inode)
-					continue;
-				err = -EIO;
-				break;
-			}
-		}
-		if (bindex < iinfo->ii_bstart)
-			iinfo->ii_bstart = bindex;
-		if (iinfo->ii_bend < bindex)
-			iinfo->ii_bend = bindex;
-		au_set_h_iptr(inode, bindex, au_igrab(h_inode), flags);
-		update = 1;
-	}
-	au_update_ibrange(inode, /*do_put_zero*/0);
-	e = au_dy_irefresh(inode);
-	if (unlikely(e && !err))
-		err = e;
-	if (!err)
-		au_refresh_hinode_attr(inode, update && isdir);
-
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-static int set_inode(struct inode *inode, struct dentry *dentry)
-{
-	int err;
-	unsigned int flags;
-	umode_t mode;
-	aufs_bindex_t bindex, bstart, btail;
-	unsigned char isdir;
-	struct dentry *h_dentry;
-	struct inode *h_inode;
-	struct au_iinfo *iinfo;
-	struct inode_operations *iop;
-
-	IiMustWriteLock(inode);
-
-	err = 0;
-	isdir = 0;
-	iop = au_sbi(inode->i_sb)->si_iop_array;
-	bstart = au_dbstart(dentry);
-	h_dentry = au_h_dptr(dentry, bstart);
-	h_inode = d_inode(h_dentry);
-	mode = h_inode->i_mode;
-	switch (mode & S_IFMT) {
-	case S_IFREG:
-		btail = au_dbtail(dentry);
-		inode->i_op = iop + AuIop_OTHER;
-		inode->i_fop = &aufs_file_fop;
-		err = au_dy_iaop(inode, bstart, h_inode);
-		if (unlikely(err))
-			goto out;
-		break;
-	case S_IFDIR:
-		isdir = 1;
-		btail = au_dbtaildir(dentry);
-		inode->i_op = iop + AuIop_DIR;
-		inode->i_fop = &aufs_dir_fop;
-		break;
-	case S_IFLNK:
-		btail = au_dbtail(dentry);
-		inode->i_op = iop + AuIop_SYMLINK;
-		break;
-	case S_IFBLK:
-	case S_IFCHR:
-	case S_IFIFO:
-	case S_IFSOCK:
-		btail = au_dbtail(dentry);
-		inode->i_op = iop + AuIop_OTHER;
-		init_special_inode(inode, mode, h_inode->i_rdev);
-		break;
-	default:
-		AuIOErr("Unknown file type 0%o\n", mode);
-		err = -EIO;
-		goto out;
-	}
-
-	/* do not set hnotify for whiteouted dirs (SHWH mode) */
-	flags = au_hi_flags(inode, isdir);
-	if (au_opt_test(au_mntflags(dentry->d_sb), SHWH)
-	    && au_ftest_hi(flags, HNOTIFY)
-	    && dentry->d_name.len > AUFS_WH_PFX_LEN
-	    && !memcmp(dentry->d_name.name, AUFS_WH_PFX, AUFS_WH_PFX_LEN))
-		au_fclr_hi(flags, HNOTIFY);
-	iinfo = au_ii(inode);
-	iinfo->ii_bstart = bstart;
-	iinfo->ii_bend = btail;
-	for (bindex = bstart; bindex <= btail; bindex++) {
-		h_dentry = au_h_dptr(dentry, bindex);
-		if (h_dentry)
-			au_set_h_iptr(inode, bindex,
-				      au_igrab(d_inode(h_dentry)), flags);
-	}
-	au_cpup_attr_all(inode, /*force*/1);
-	/*
-	 * to force calling aufs_get_acl() every time,
-	 * do not call cache_no_acl() for aufs inode.
-	 */
-
-out:
-	return err;
-}
-
-/*
- * successful returns with iinfo write_locked
- * minus: errno
- * zero: success, matched
- * plus: no error, but unmatched
- */
-static int reval_inode(struct inode *inode, struct dentry *dentry)
-{
-	int err;
-	unsigned int gen, igflags;
-	aufs_bindex_t bindex, bend;
-	struct inode *h_inode, *h_dinode;
-	struct dentry *h_dentry;
-
-	/*
-	 * before this function, if aufs got any iinfo lock, it must be only
-	 * one, the parent dir.
-	 * it can happen by UDBA and the obsoleted inode number.
-	 */
-	err = -EIO;
-	if (unlikely(inode->i_ino == parent_ino(dentry)))
-		goto out;
-
-	err = 1;
-	ii_write_lock_new_child(inode);
-	h_dentry = au_h_dptr(dentry, au_dbstart(dentry));
-	h_dinode = d_inode(h_dentry);
-	bend = au_ibend(inode);
-	for (bindex = au_ibstart(inode); bindex <= bend; bindex++) {
-		h_inode = au_h_iptr(inode, bindex);
-		if (!h_inode || h_inode != h_dinode)
-			continue;
-
-		err = 0;
-		gen = au_iigen(inode, &igflags);
-		if (gen == au_digen(dentry)
-		    && !au_ig_ftest(igflags, HALF_REFRESHED))
-			break;
-
-		/* fully refresh inode using dentry */
-		err = au_refresh_hinode(inode, dentry);
-		if (!err)
-			au_update_iigen(inode, /*half*/0);
-		break;
-	}
-
-	if (unlikely(err))
-		ii_write_unlock(inode);
-out:
-	return err;
-}
-
-int au_ino(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino,
-	   unsigned int d_type, ino_t *ino)
-{
-	int err;
-	struct mutex *mtx;
-
-	/* prevent hardlinked inode number from race condition */
-	mtx = NULL;
-	if (d_type != DT_DIR) {
-		mtx = &au_sbr(sb, bindex)->br_xino.xi_nondir_mtx;
-		mutex_lock(mtx);
-	}
-	err = au_xino_read(sb, bindex, h_ino, ino);
-	if (unlikely(err))
-		goto out;
-
-	if (!*ino) {
-		err = -EIO;
-		*ino = au_xino_new_ino(sb);
-		if (unlikely(!*ino))
-			goto out;
-		err = au_xino_write(sb, bindex, h_ino, *ino);
-		if (unlikely(err))
-			goto out;
-	}
-
-out:
-	if (mtx)
-		mutex_unlock(mtx);
-	return err;
-}
-
-/* successful returns with iinfo write_locked */
-/* todo: return with unlocked? */
-struct inode *au_new_inode(struct dentry *dentry, int must_new)
-{
-	struct inode *inode, *h_inode;
-	struct dentry *h_dentry;
-	struct super_block *sb;
-	struct mutex *mtx;
-	ino_t h_ino, ino;
-	int err;
-	aufs_bindex_t bstart;
-
-	sb = dentry->d_sb;
-	bstart = au_dbstart(dentry);
-	h_dentry = au_h_dptr(dentry, bstart);
-	h_inode = d_inode(h_dentry);
-	h_ino = h_inode->i_ino;
-
-	/*
-	 * stop 'race'-ing between hardlinks under different
-	 * parents.
-	 */
-	mtx = NULL;
-	if (!d_is_dir(h_dentry))
-		mtx = &au_sbr(sb, bstart)->br_xino.xi_nondir_mtx;
-
-new_ino:
-	if (mtx)
-		mutex_lock(mtx);
-	err = au_xino_read(sb, bstart, h_ino, &ino);
-	inode = ERR_PTR(err);
-	if (unlikely(err))
-		goto out;
-
-	if (!ino) {
-		ino = au_xino_new_ino(sb);
-		if (unlikely(!ino)) {
-			inode = ERR_PTR(-EIO);
-			goto out;
-		}
-	}
-
-	AuDbg("i%lu\n", (unsigned long)ino);
-	inode = au_iget_locked(sb, ino);
-	err = PTR_ERR(inode);
-	if (IS_ERR(inode))
-		goto out;
-
-	AuDbg("%lx, new %d\n", inode->i_state, !!(inode->i_state & I_NEW));
-	if (inode->i_state & I_NEW) {
-		/* verbose coding for lock class name */
-		if (unlikely(d_is_symlink(h_dentry)))
-			au_rw_class(&au_ii(inode)->ii_rwsem,
-				    au_lc_key + AuLcSymlink_IIINFO);
-		else if (unlikely(d_is_dir(h_dentry)))
-			au_rw_class(&au_ii(inode)->ii_rwsem,
-				    au_lc_key + AuLcDir_IIINFO);
-		else /* likely */
-			au_rw_class(&au_ii(inode)->ii_rwsem,
-				    au_lc_key + AuLcNonDir_IIINFO);
-
-		ii_write_lock_new_child(inode);
-		err = set_inode(inode, dentry);
-		if (!err) {
-			unlock_new_inode(inode);
-			goto out; /* success */
-		}
-
-		/*
-		 * iget_failed() calls iput(), but we need to call
-		 * ii_write_unlock() after iget_failed(). so dirty hack for
-		 * i_count.
-		 */
-		atomic_inc(&inode->i_count);
-		iget_failed(inode);
-		ii_write_unlock(inode);
-		au_xino_write(sb, bstart, h_ino, /*ino*/0);
-		/* ignore this error */
-		goto out_iput;
-	} else if (!must_new && !IS_DEADDIR(inode) && inode->i_nlink) {
-		/*
-		 * horrible race condition between lookup, readdir and copyup
-		 * (or something).
-		 */
-		if (mtx)
-			mutex_unlock(mtx);
-		err = reval_inode(inode, dentry);
-		if (unlikely(err < 0)) {
-			mtx = NULL;
-			goto out_iput;
-		}
-
-		if (!err) {
-			mtx = NULL;
-			goto out; /* success */
-		} else if (mtx)
-			mutex_lock(mtx);
-	}
-
-	if (unlikely(au_test_fs_unique_ino(h_inode)))
-		AuWarn1("Warning: Un-notified UDBA or repeatedly renamed dir,"
-			" b%d, %s, %pd, hi%lu, i%lu.\n",
-			bstart, au_sbtype(h_dentry->d_sb), dentry,
-			(unsigned long)h_ino, (unsigned long)ino);
-	ino = 0;
-	err = au_xino_write(sb, bstart, h_ino, /*ino*/0);
-	if (!err) {
-		iput(inode);
-		if (mtx)
-			mutex_unlock(mtx);
-		goto new_ino;
-	}
-
-out_iput:
-	iput(inode);
-	inode = ERR_PTR(err);
-out:
-	if (mtx)
-		mutex_unlock(mtx);
-	return inode;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int au_test_ro(struct super_block *sb, aufs_bindex_t bindex,
-	       struct inode *inode)
-{
-	int err;
-	struct inode *hi;
-
-	err = au_br_rdonly(au_sbr(sb, bindex));
-
-	/* pseudo-link after flushed may happen out of bounds */
-	if (!err
-	    && inode
-	    && au_ibstart(inode) <= bindex
-	    && bindex <= au_ibend(inode)) {
-		/*
-		 * permission check is unnecessary since vfsub routine
-		 * will be called later
-		 */
-		hi = au_h_iptr(inode, bindex);
-		if (hi)
-			err = IS_IMMUTABLE(hi) ? -EROFS : 0;
-	}
-
-	return err;
-}
-
-int au_test_h_perm(struct inode *h_inode, int mask)
-{
-	if (uid_eq(current_fsuid(), GLOBAL_ROOT_UID))
-		return 0;
-	return inode_permission(h_inode, mask);
-}
-
-int au_test_h_perm_sio(struct inode *h_inode, int mask)
-{
-	if (au_test_nfs(h_inode->i_sb)
-	    && (mask & MAY_WRITE)
-	    && S_ISDIR(h_inode->i_mode))
-		mask |= MAY_READ; /* force permission check */
-	return au_test_h_perm(h_inode, mask);
-}
diff --git a/fs/aufs/inode.h b/fs/aufs/inode.h
deleted file mode 100644
index 534b9e814..000000000
--- a/fs/aufs/inode.h
+++ /dev/null
@@ -1,672 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * inode operations
- */
-
-#ifndef __AUFS_INODE_H__
-#define __AUFS_INODE_H__
-
-#ifdef __KERNEL__
-
-#include <linux/fsnotify.h>
-#include "rwsem.h"
-
-struct vfsmount;
-
-struct au_hnotify {
-#ifdef CONFIG_AUFS_HNOTIFY
-#ifdef CONFIG_AUFS_HFSNOTIFY
-	/* never use fsnotify_add_vfsmount_mark() */
-	struct fsnotify_mark		hn_mark;
-#endif
-	struct inode			*hn_aufs_inode;	/* no get/put */
-#endif
-} ____cacheline_aligned_in_smp;
-
-struct au_hinode {
-	struct inode		*hi_inode;
-	aufs_bindex_t		hi_id;
-#ifdef CONFIG_AUFS_HNOTIFY
-	struct au_hnotify	*hi_notify;
-#endif
-
-	/* reference to the copied-up whiteout with get/put */
-	struct dentry		*hi_whdentry;
-};
-
-/* ig_flags */
-#define AuIG_HALF_REFRESHED		1
-#define au_ig_ftest(flags, name)	((flags) & AuIG_##name)
-#define au_ig_fset(flags, name) \
-	do { (flags) |= AuIG_##name; } while (0)
-#define au_ig_fclr(flags, name) \
-	do { (flags) &= ~AuIG_##name; } while (0)
-
-struct au_iigen {
-	spinlock_t	ig_spin;
-	__u32		ig_generation, ig_flags;
-};
-
-struct au_vdir;
-struct au_iinfo {
-	struct au_iigen		ii_generation;
-	struct super_block	*ii_hsb1;	/* no get/put */
-
-	struct au_rwsem		ii_rwsem;
-	aufs_bindex_t		ii_bstart, ii_bend;
-	__u32			ii_higen;
-	struct au_hinode	*ii_hinode;
-	struct au_vdir		*ii_vdir;
-};
-
-struct au_icntnr {
-	struct au_iinfo iinfo;
-	struct inode vfs_inode;
-} ____cacheline_aligned_in_smp;
-
-/* au_pin flags */
-#define AuPin_DI_LOCKED		1
-#define AuPin_MNT_WRITE		(1 << 1)
-#define au_ftest_pin(flags, name)	((flags) & AuPin_##name)
-#define au_fset_pin(flags, name) \
-	do { (flags) |= AuPin_##name; } while (0)
-#define au_fclr_pin(flags, name) \
-	do { (flags) &= ~AuPin_##name; } while (0)
-
-struct au_pin {
-	/* input */
-	struct dentry *dentry;
-	unsigned int udba;
-	unsigned char lsc_di, lsc_hi, flags;
-	aufs_bindex_t bindex;
-
-	/* output */
-	struct dentry *parent;
-	struct au_hinode *hdir;
-	struct vfsmount *h_mnt;
-
-	/* temporary unlock/relock for copyup */
-	struct dentry *h_dentry, *h_parent;
-	struct au_branch *br;
-	struct task_struct *task;
-};
-
-void au_pin_hdir_unlock(struct au_pin *p);
-int au_pin_hdir_lock(struct au_pin *p);
-int au_pin_hdir_relock(struct au_pin *p);
-void au_pin_hdir_set_owner(struct au_pin *p, struct task_struct *task);
-void au_pin_hdir_acquire_nest(struct au_pin *p);
-void au_pin_hdir_release(struct au_pin *p);
-
-/* ---------------------------------------------------------------------- */
-
-static inline struct au_iinfo *au_ii(struct inode *inode)
-{
-	struct au_iinfo *iinfo;
-
-	iinfo = &(container_of(inode, struct au_icntnr, vfs_inode)->iinfo);
-	if (iinfo->ii_hinode)
-		return iinfo;
-	return NULL; /* debugging bad_inode case */
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* inode.c */
-struct inode *au_igrab(struct inode *inode);
-void au_refresh_iop(struct inode *inode, int force_getattr);
-int au_refresh_hinode_self(struct inode *inode);
-int au_refresh_hinode(struct inode *inode, struct dentry *dentry);
-int au_ino(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino,
-	   unsigned int d_type, ino_t *ino);
-struct inode *au_new_inode(struct dentry *dentry, int must_new);
-int au_test_ro(struct super_block *sb, aufs_bindex_t bindex,
-	       struct inode *inode);
-int au_test_h_perm(struct inode *h_inode, int mask);
-int au_test_h_perm_sio(struct inode *h_inode, int mask);
-
-static inline int au_wh_ino(struct super_block *sb, aufs_bindex_t bindex,
-			    ino_t h_ino, unsigned int d_type, ino_t *ino)
-{
-#ifdef CONFIG_AUFS_SHWH
-	return au_ino(sb, bindex, h_ino, d_type, ino);
-#else
-	return 0;
-#endif
-}
-
-/* i_op.c */
-enum {
-	AuIop_SYMLINK,
-	AuIop_DIR,
-	AuIop_OTHER,
-	AuIop_Last
-};
-extern struct inode_operations aufs_iop[AuIop_Last],
-	aufs_iop_nogetattr[AuIop_Last];
-
-/* au_wr_dir flags */
-#define AuWrDir_ADD_ENTRY	1
-#define AuWrDir_ISDIR		(1 << 1)
-#define AuWrDir_TMPFILE		(1 << 2)
-#define au_ftest_wrdir(flags, name)	((flags) & AuWrDir_##name)
-#define au_fset_wrdir(flags, name) \
-	do { (flags) |= AuWrDir_##name; } while (0)
-#define au_fclr_wrdir(flags, name) \
-	do { (flags) &= ~AuWrDir_##name; } while (0)
-
-struct au_wr_dir_args {
-	aufs_bindex_t force_btgt;
-	unsigned char flags;
-};
-int au_wr_dir(struct dentry *dentry, struct dentry *src_dentry,
-	      struct au_wr_dir_args *args);
-
-struct dentry *au_pinned_h_parent(struct au_pin *pin);
-void au_pin_init(struct au_pin *pin, struct dentry *dentry,
-		 aufs_bindex_t bindex, int lsc_di, int lsc_hi,
-		 unsigned int udba, unsigned char flags);
-int au_pin(struct au_pin *pin, struct dentry *dentry, aufs_bindex_t bindex,
-	   unsigned int udba, unsigned char flags) __must_check;
-int au_do_pin(struct au_pin *pin) __must_check;
-void au_unpin(struct au_pin *pin);
-int au_reval_for_attr(struct dentry *dentry, unsigned int sigen);
-
-#define AuIcpup_DID_CPUP	1
-#define au_ftest_icpup(flags, name)	((flags) & AuIcpup_##name)
-#define au_fset_icpup(flags, name) \
-	do { (flags) |= AuIcpup_##name; } while (0)
-#define au_fclr_icpup(flags, name) \
-	do { (flags) &= ~AuIcpup_##name; } while (0)
-
-struct au_icpup_args {
-	unsigned char flags;
-	unsigned char pin_flags;
-	aufs_bindex_t btgt;
-	unsigned int udba;
-	struct au_pin pin;
-	struct path h_path;
-	struct inode *h_inode;
-};
-
-int au_pin_and_icpup(struct dentry *dentry, struct iattr *ia,
-		     struct au_icpup_args *a);
-
-int au_h_path_getattr(struct dentry *dentry, int force, struct path *h_path);
-
-/* i_op_add.c */
-int au_may_add(struct dentry *dentry, aufs_bindex_t bindex,
-	       struct dentry *h_parent, int isdir);
-int aufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
-	       dev_t dev);
-int aufs_symlink(struct inode *dir, struct dentry *dentry, const char *symname);
-int aufs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-		bool want_excl);
-struct vfsub_aopen_args;
-int au_aopen_or_create(struct inode *dir, struct dentry *dentry,
-		       struct vfsub_aopen_args *args);
-int aufs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode);
-int aufs_link(struct dentry *src_dentry, struct inode *dir,
-	      struct dentry *dentry);
-int aufs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode);
-
-/* i_op_del.c */
-int au_wr_dir_need_wh(struct dentry *dentry, int isdir, aufs_bindex_t *bcpup);
-int au_may_del(struct dentry *dentry, aufs_bindex_t bindex,
-	       struct dentry *h_parent, int isdir);
-int aufs_unlink(struct inode *dir, struct dentry *dentry);
-int aufs_rmdir(struct inode *dir, struct dentry *dentry);
-
-/* i_op_ren.c */
-int au_wbr(struct dentry *dentry, aufs_bindex_t btgt);
-int aufs_rename(struct inode *src_dir, struct dentry *src_dentry,
-		struct inode *dir, struct dentry *dentry);
-
-/* iinfo.c */
-struct inode *au_h_iptr(struct inode *inode, aufs_bindex_t bindex);
-void au_hiput(struct au_hinode *hinode);
-void au_set_hi_wh(struct inode *inode, aufs_bindex_t bindex,
-		  struct dentry *h_wh);
-unsigned int au_hi_flags(struct inode *inode, int isdir);
-
-/* hinode flags */
-#define AuHi_XINO	1
-#define AuHi_HNOTIFY	(1 << 1)
-#define au_ftest_hi(flags, name)	((flags) & AuHi_##name)
-#define au_fset_hi(flags, name) \
-	do { (flags) |= AuHi_##name; } while (0)
-#define au_fclr_hi(flags, name) \
-	do { (flags) &= ~AuHi_##name; } while (0)
-
-#ifndef CONFIG_AUFS_HNOTIFY
-#undef AuHi_HNOTIFY
-#define AuHi_HNOTIFY	0
-#endif
-
-void au_set_h_iptr(struct inode *inode, aufs_bindex_t bindex,
-		   struct inode *h_inode, unsigned int flags);
-
-void au_update_iigen(struct inode *inode, int half);
-void au_update_ibrange(struct inode *inode, int do_put_zero);
-
-void au_icntnr_init_once(void *_c);
-int au_iinfo_init(struct inode *inode);
-void au_iinfo_fin(struct inode *inode);
-int au_ii_realloc(struct au_iinfo *iinfo, int nbr);
-
-#ifdef CONFIG_PROC_FS
-/* plink.c */
-int au_plink_maint(struct super_block *sb, int flags);
-struct au_sbinfo;
-void au_plink_maint_leave(struct au_sbinfo *sbinfo);
-int au_plink_maint_enter(struct super_block *sb);
-#ifdef CONFIG_AUFS_DEBUG
-void au_plink_list(struct super_block *sb);
-#else
-AuStubVoid(au_plink_list, struct super_block *sb)
-#endif
-int au_plink_test(struct inode *inode);
-struct dentry *au_plink_lkup(struct inode *inode, aufs_bindex_t bindex);
-void au_plink_append(struct inode *inode, aufs_bindex_t bindex,
-		     struct dentry *h_dentry);
-void au_plink_put(struct super_block *sb, int verbose);
-void au_plink_clean(struct super_block *sb, int verbose);
-void au_plink_half_refresh(struct super_block *sb, aufs_bindex_t br_id);
-#else
-AuStubInt0(au_plink_maint, struct super_block *sb, int flags);
-AuStubVoid(au_plink_maint_leave, struct au_sbinfo *sbinfo);
-AuStubInt0(au_plink_maint_enter, struct super_block *sb);
-AuStubVoid(au_plink_list, struct super_block *sb);
-AuStubInt0(au_plink_test, struct inode *inode);
-AuStub(struct dentry *, au_plink_lkup, return NULL,
-       struct inode *inode, aufs_bindex_t bindex);
-AuStubVoid(au_plink_append, struct inode *inode, aufs_bindex_t bindex,
-	   struct dentry *h_dentry);
-AuStubVoid(au_plink_put, struct super_block *sb, int verbose);
-AuStubVoid(au_plink_clean, struct super_block *sb, int verbose);
-AuStubVoid(au_plink_half_refresh, struct super_block *sb, aufs_bindex_t br_id);
-#endif /* CONFIG_PROC_FS */
-
-#ifdef CONFIG_AUFS_XATTR
-/* xattr.c */
-int au_cpup_xattr(struct dentry *h_dst, struct dentry *h_src, int ignore_flags,
-		  unsigned int verbose);
-ssize_t aufs_listxattr(struct dentry *dentry, char *list, size_t size);
-ssize_t aufs_getxattr(struct dentry *dentry, const char *name, void *value,
-		      size_t size);
-int aufs_setxattr(struct dentry *dentry, const char *name, const void *value,
-		  size_t size, int flags);
-int aufs_removexattr(struct dentry *dentry, const char *name);
-
-/* void au_xattr_init(struct super_block *sb); */
-#else
-AuStubInt0(au_cpup_xattr, struct dentry *h_dst, struct dentry *h_src,
-	   int ignore_flags, unsigned int verbose);
-/* AuStubVoid(au_xattr_init, struct super_block *sb); */
-#endif
-
-#ifdef CONFIG_FS_POSIX_ACL
-struct posix_acl *aufs_get_acl(struct inode *inode, int type);
-int aufs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
-#endif
-
-#if IS_ENABLED(CONFIG_AUFS_XATTR) || IS_ENABLED(CONFIG_FS_POSIX_ACL)
-enum {
-	AU_XATTR_SET,
-	AU_XATTR_REMOVE,
-	AU_ACL_SET
-};
-
-struct au_srxattr {
-	int type;
-	union {
-		struct {
-			const char	*name;
-			const void	*value;
-			size_t		size;
-			int		flags;
-		} set;
-		struct {
-			const char	*name;
-		} remove;
-		struct {
-			struct posix_acl *acl;
-			int		type;
-		} acl_set;
-	} u;
-};
-ssize_t au_srxattr(struct dentry *dentry, struct au_srxattr *arg);
-#endif
-
-/* ---------------------------------------------------------------------- */
-
-/* lock subclass for iinfo */
-enum {
-	AuLsc_II_CHILD,		/* child first */
-	AuLsc_II_CHILD2,	/* rename(2), link(2), and cpup at hnotify */
-	AuLsc_II_CHILD3,	/* copyup dirs */
-	AuLsc_II_PARENT,	/* see AuLsc_I_PARENT in vfsub.h */
-	AuLsc_II_PARENT2,
-	AuLsc_II_PARENT3,	/* copyup dirs */
-	AuLsc_II_NEW_CHILD
-};
-
-/*
- * ii_read_lock_child, ii_write_lock_child,
- * ii_read_lock_child2, ii_write_lock_child2,
- * ii_read_lock_child3, ii_write_lock_child3,
- * ii_read_lock_parent, ii_write_lock_parent,
- * ii_read_lock_parent2, ii_write_lock_parent2,
- * ii_read_lock_parent3, ii_write_lock_parent3,
- * ii_read_lock_new_child, ii_write_lock_new_child,
- */
-#define AuReadLockFunc(name, lsc) \
-static inline void ii_read_lock_##name(struct inode *i) \
-{ \
-	au_rw_read_lock_nested(&au_ii(i)->ii_rwsem, AuLsc_II_##lsc); \
-}
-
-#define AuWriteLockFunc(name, lsc) \
-static inline void ii_write_lock_##name(struct inode *i) \
-{ \
-	au_rw_write_lock_nested(&au_ii(i)->ii_rwsem, AuLsc_II_##lsc); \
-}
-
-#define AuRWLockFuncs(name, lsc) \
-	AuReadLockFunc(name, lsc) \
-	AuWriteLockFunc(name, lsc)
-
-AuRWLockFuncs(child, CHILD);
-AuRWLockFuncs(child2, CHILD2);
-AuRWLockFuncs(child3, CHILD3);
-AuRWLockFuncs(parent, PARENT);
-AuRWLockFuncs(parent2, PARENT2);
-AuRWLockFuncs(parent3, PARENT3);
-AuRWLockFuncs(new_child, NEW_CHILD);
-
-#undef AuReadLockFunc
-#undef AuWriteLockFunc
-#undef AuRWLockFuncs
-
-/*
- * ii_read_unlock, ii_write_unlock, ii_downgrade_lock
- */
-AuSimpleUnlockRwsemFuncs(ii, struct inode *i, &au_ii(i)->ii_rwsem);
-
-#define IiMustNoWaiters(i)	AuRwMustNoWaiters(&au_ii(i)->ii_rwsem)
-#define IiMustAnyLock(i)	AuRwMustAnyLock(&au_ii(i)->ii_rwsem)
-#define IiMustWriteLock(i)	AuRwMustWriteLock(&au_ii(i)->ii_rwsem)
-
-/* ---------------------------------------------------------------------- */
-
-static inline void au_icntnr_init(struct au_icntnr *c)
-{
-#ifdef CONFIG_AUFS_DEBUG
-	c->vfs_inode.i_mode = 0;
-#endif
-}
-
-static inline unsigned int au_iigen(struct inode *inode, unsigned int *igflags)
-{
-	unsigned int gen;
-	struct au_iinfo *iinfo;
-	struct au_iigen *iigen;
-
-	iinfo = au_ii(inode);
-	iigen = &iinfo->ii_generation;
-	spin_lock(&iigen->ig_spin);
-	if (igflags)
-		*igflags = iigen->ig_flags;
-	gen = iigen->ig_generation;
-	spin_unlock(&iigen->ig_spin);
-
-	return gen;
-}
-
-/* tiny test for inode number */
-/* tmpfs generation is too rough */
-static inline int au_test_higen(struct inode *inode, struct inode *h_inode)
-{
-	struct au_iinfo *iinfo;
-
-	iinfo = au_ii(inode);
-	AuRwMustAnyLock(&iinfo->ii_rwsem);
-	return !(iinfo->ii_hsb1 == h_inode->i_sb
-		 && iinfo->ii_higen == h_inode->i_generation);
-}
-
-static inline void au_iigen_dec(struct inode *inode)
-{
-	struct au_iinfo *iinfo;
-	struct au_iigen *iigen;
-
-	iinfo = au_ii(inode);
-	iigen = &iinfo->ii_generation;
-	spin_lock(&iigen->ig_spin);
-	iigen->ig_generation--;
-	spin_unlock(&iigen->ig_spin);
-}
-
-static inline int au_iigen_test(struct inode *inode, unsigned int sigen)
-{
-	int err;
-
-	err = 0;
-	if (unlikely(inode && au_iigen(inode, NULL) != sigen))
-		err = -EIO;
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static inline aufs_bindex_t au_ii_br_id(struct inode *inode,
-					aufs_bindex_t bindex)
-{
-	IiMustAnyLock(inode);
-	return au_ii(inode)->ii_hinode[0 + bindex].hi_id;
-}
-
-static inline aufs_bindex_t au_ibstart(struct inode *inode)
-{
-	IiMustAnyLock(inode);
-	return au_ii(inode)->ii_bstart;
-}
-
-static inline aufs_bindex_t au_ibend(struct inode *inode)
-{
-	IiMustAnyLock(inode);
-	return au_ii(inode)->ii_bend;
-}
-
-static inline struct au_vdir *au_ivdir(struct inode *inode)
-{
-	IiMustAnyLock(inode);
-	return au_ii(inode)->ii_vdir;
-}
-
-static inline struct dentry *au_hi_wh(struct inode *inode, aufs_bindex_t bindex)
-{
-	IiMustAnyLock(inode);
-	return au_ii(inode)->ii_hinode[0 + bindex].hi_whdentry;
-}
-
-static inline void au_set_ibstart(struct inode *inode, aufs_bindex_t bindex)
-{
-	IiMustWriteLock(inode);
-	au_ii(inode)->ii_bstart = bindex;
-}
-
-static inline void au_set_ibend(struct inode *inode, aufs_bindex_t bindex)
-{
-	IiMustWriteLock(inode);
-	au_ii(inode)->ii_bend = bindex;
-}
-
-static inline void au_set_ivdir(struct inode *inode, struct au_vdir *vdir)
-{
-	IiMustWriteLock(inode);
-	au_ii(inode)->ii_vdir = vdir;
-}
-
-static inline struct au_hinode *au_hi(struct inode *inode, aufs_bindex_t bindex)
-{
-	IiMustAnyLock(inode);
-	return au_ii(inode)->ii_hinode + bindex;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static inline struct dentry *au_pinned_parent(struct au_pin *pin)
-{
-	if (pin)
-		return pin->parent;
-	return NULL;
-}
-
-static inline struct inode *au_pinned_h_dir(struct au_pin *pin)
-{
-	if (pin && pin->hdir)
-		return pin->hdir->hi_inode;
-	return NULL;
-}
-
-static inline struct au_hinode *au_pinned_hdir(struct au_pin *pin)
-{
-	if (pin)
-		return pin->hdir;
-	return NULL;
-}
-
-static inline void au_pin_set_dentry(struct au_pin *pin, struct dentry *dentry)
-{
-	if (pin)
-		pin->dentry = dentry;
-}
-
-static inline void au_pin_set_parent_lflag(struct au_pin *pin,
-					   unsigned char lflag)
-{
-	if (pin) {
-		if (lflag)
-			au_fset_pin(pin->flags, DI_LOCKED);
-		else
-			au_fclr_pin(pin->flags, DI_LOCKED);
-	}
-}
-
-#if 0 /* reserved */
-static inline void au_pin_set_parent(struct au_pin *pin, struct dentry *parent)
-{
-	if (pin) {
-		dput(pin->parent);
-		pin->parent = dget(parent);
-	}
-}
-#endif
-
-/* ---------------------------------------------------------------------- */
-
-struct au_branch;
-#ifdef CONFIG_AUFS_HNOTIFY
-struct au_hnotify_op {
-	void (*ctl)(struct au_hinode *hinode, int do_set);
-	int (*alloc)(struct au_hinode *hinode);
-
-	/*
-	 * if it returns true, the the caller should free hinode->hi_notify,
-	 * otherwise ->free() frees it.
-	 */
-	int (*free)(struct au_hinode *hinode,
-		    struct au_hnotify *hn) __must_check;
-
-	void (*fin)(void);
-	int (*init)(void);
-
-	int (*reset_br)(unsigned int udba, struct au_branch *br, int perm);
-	void (*fin_br)(struct au_branch *br);
-	int (*init_br)(struct au_branch *br, int perm);
-};
-
-/* hnotify.c */
-int au_hn_alloc(struct au_hinode *hinode, struct inode *inode);
-void au_hn_free(struct au_hinode *hinode);
-void au_hn_ctl(struct au_hinode *hinode, int do_set);
-void au_hn_reset(struct inode *inode, unsigned int flags);
-int au_hnotify(struct inode *h_dir, struct au_hnotify *hnotify, u32 mask,
-	       struct qstr *h_child_qstr, struct inode *h_child_inode);
-int au_hnotify_reset_br(unsigned int udba, struct au_branch *br, int perm);
-int au_hnotify_init_br(struct au_branch *br, int perm);
-void au_hnotify_fin_br(struct au_branch *br);
-int __init au_hnotify_init(void);
-void au_hnotify_fin(void);
-
-/* hfsnotify.c */
-extern const struct au_hnotify_op au_hnotify_op;
-
-static inline
-void au_hn_init(struct au_hinode *hinode)
-{
-	hinode->hi_notify = NULL;
-}
-
-static inline struct au_hnotify *au_hn(struct au_hinode *hinode)
-{
-	return hinode->hi_notify;
-}
-
-#else
-AuStub(int, au_hn_alloc, return -EOPNOTSUPP,
-       struct au_hinode *hinode __maybe_unused,
-       struct inode *inode __maybe_unused)
-AuStub(struct au_hnotify *, au_hn, return NULL, struct au_hinode *hinode)
-AuStubVoid(au_hn_free, struct au_hinode *hinode __maybe_unused)
-AuStubVoid(au_hn_ctl, struct au_hinode *hinode __maybe_unused,
-	   int do_set __maybe_unused)
-AuStubVoid(au_hn_reset, struct inode *inode __maybe_unused,
-	   unsigned int flags __maybe_unused)
-AuStubInt0(au_hnotify_reset_br, unsigned int udba __maybe_unused,
-	   struct au_branch *br __maybe_unused,
-	   int perm __maybe_unused)
-AuStubInt0(au_hnotify_init_br, struct au_branch *br __maybe_unused,
-	   int perm __maybe_unused)
-AuStubVoid(au_hnotify_fin_br, struct au_branch *br __maybe_unused)
-AuStubInt0(__init au_hnotify_init, void)
-AuStubVoid(au_hnotify_fin, void)
-AuStubVoid(au_hn_init, struct au_hinode *hinode __maybe_unused)
-#endif /* CONFIG_AUFS_HNOTIFY */
-
-static inline void au_hn_suspend(struct au_hinode *hdir)
-{
-	au_hn_ctl(hdir, /*do_set*/0);
-}
-
-static inline void au_hn_resume(struct au_hinode *hdir)
-{
-	au_hn_ctl(hdir, /*do_set*/1);
-}
-
-static inline void au_hn_imtx_lock(struct au_hinode *hdir)
-{
-	mutex_lock(&hdir->hi_inode->i_mutex);
-	au_hn_suspend(hdir);
-}
-
-static inline void au_hn_imtx_lock_nested(struct au_hinode *hdir,
-					  unsigned int sc __maybe_unused)
-{
-	mutex_lock_nested(&hdir->hi_inode->i_mutex, sc);
-	au_hn_suspend(hdir);
-}
-
-static inline void au_hn_imtx_unlock(struct au_hinode *hdir)
-{
-	au_hn_resume(hdir);
-	mutex_unlock(&hdir->hi_inode->i_mutex);
-}
-
-#endif /* __KERNEL__ */
-#endif /* __AUFS_INODE_H__ */
diff --git a/fs/aufs/ioctl.c b/fs/aufs/ioctl.c
deleted file mode 100644
index 6528fb911..000000000
--- a/fs/aufs/ioctl.c
+++ /dev/null
@@ -1,206 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * ioctl
- * plink-management and readdir in userspace.
- * assist the pathconf(3) wrapper library.
- * move-down
- * File-based Hierarchical Storage Management.
- */
-
-#include <linux/compat.h>
-#include <linux/file.h>
-#include "aufs.h"
-
-static int au_wbr_fd(struct path *path, struct aufs_wbr_fd __user *arg)
-{
-	int err, fd;
-	aufs_bindex_t wbi, bindex, bend;
-	struct file *h_file;
-	struct super_block *sb;
-	struct dentry *root;
-	struct au_branch *br;
-	struct aufs_wbr_fd wbrfd = {
-		.oflags	= au_dir_roflags,
-		.brid	= -1
-	};
-	const int valid = O_RDONLY | O_NONBLOCK | O_LARGEFILE | O_DIRECTORY
-		| O_NOATIME | O_CLOEXEC;
-
-	AuDebugOn(wbrfd.oflags & ~valid);
-
-	if (arg) {
-		err = copy_from_user(&wbrfd, arg, sizeof(wbrfd));
-		if (unlikely(err)) {
-			err = -EFAULT;
-			goto out;
-		}
-
-		err = -EINVAL;
-		AuDbg("wbrfd{0%o, %d}\n", wbrfd.oflags, wbrfd.brid);
-		wbrfd.oflags |= au_dir_roflags;
-		AuDbg("0%o\n", wbrfd.oflags);
-		if (unlikely(wbrfd.oflags & ~valid))
-			goto out;
-	}
-
-	fd = get_unused_fd_flags(0);
-	err = fd;
-	if (unlikely(fd < 0))
-		goto out;
-
-	h_file = ERR_PTR(-EINVAL);
-	wbi = 0;
-	br = NULL;
-	sb = path->dentry->d_sb;
-	root = sb->s_root;
-	aufs_read_lock(root, AuLock_IR);
-	bend = au_sbend(sb);
-	if (wbrfd.brid >= 0) {
-		wbi = au_br_index(sb, wbrfd.brid);
-		if (unlikely(wbi < 0 || wbi > bend))
-			goto out_unlock;
-	}
-
-	h_file = ERR_PTR(-ENOENT);
-	br = au_sbr(sb, wbi);
-	if (!au_br_writable(br->br_perm)) {
-		if (arg)
-			goto out_unlock;
-
-		bindex = wbi + 1;
-		wbi = -1;
-		for (; bindex <= bend; bindex++) {
-			br = au_sbr(sb, bindex);
-			if (au_br_writable(br->br_perm)) {
-				wbi = bindex;
-				br = au_sbr(sb, wbi);
-				break;
-			}
-		}
-	}
-	AuDbg("wbi %d\n", wbi);
-	if (wbi >= 0)
-		h_file = au_h_open(root, wbi, wbrfd.oflags, NULL,
-				   /*force_wr*/0);
-
-out_unlock:
-	aufs_read_unlock(root, AuLock_IR);
-	err = PTR_ERR(h_file);
-	if (IS_ERR(h_file))
-		goto out_fd;
-
-	atomic_dec(&br->br_count); /* cf. au_h_open() */
-	fd_install(fd, h_file);
-	err = fd;
-	goto out; /* success */
-
-out_fd:
-	put_unused_fd(fd);
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-long aufs_ioctl_dir(struct file *file, unsigned int cmd, unsigned long arg)
-{
-	long err;
-	struct dentry *dentry;
-
-	switch (cmd) {
-	case AUFS_CTL_RDU:
-	case AUFS_CTL_RDU_INO:
-		err = au_rdu_ioctl(file, cmd, arg);
-		break;
-
-	case AUFS_CTL_WBR_FD:
-		err = au_wbr_fd(&file->f_path, (void __user *)arg);
-		break;
-
-	case AUFS_CTL_IBUSY:
-		err = au_ibusy_ioctl(file, arg);
-		break;
-
-	case AUFS_CTL_BRINFO:
-		err = au_brinfo_ioctl(file, arg);
-		break;
-
-	case AUFS_CTL_FHSM_FD:
-		dentry = file->f_path.dentry;
-		if (IS_ROOT(dentry))
-			err = au_fhsm_fd(dentry->d_sb, arg);
-		else
-			err = -ENOTTY;
-		break;
-
-	default:
-		/* do not call the lower */
-		AuDbg("0x%x\n", cmd);
-		err = -ENOTTY;
-	}
-
-	AuTraceErr(err);
-	return err;
-}
-
-long aufs_ioctl_nondir(struct file *file, unsigned int cmd, unsigned long arg)
-{
-	long err;
-
-	switch (cmd) {
-	case AUFS_CTL_MVDOWN:
-		err = au_mvdown(file->f_path.dentry, (void __user *)arg);
-		break;
-
-	case AUFS_CTL_WBR_FD:
-		err = au_wbr_fd(&file->f_path, (void __user *)arg);
-		break;
-
-	default:
-		/* do not call the lower */
-		AuDbg("0x%x\n", cmd);
-		err = -ENOTTY;
-	}
-
-	AuTraceErr(err);
-	return err;
-}
-
-#ifdef CONFIG_COMPAT
-long aufs_compat_ioctl_dir(struct file *file, unsigned int cmd,
-			   unsigned long arg)
-{
-	long err;
-
-	switch (cmd) {
-	case AUFS_CTL_RDU:
-	case AUFS_CTL_RDU_INO:
-		err = au_rdu_compat_ioctl(file, cmd, arg);
-		break;
-
-	case AUFS_CTL_IBUSY:
-		err = au_ibusy_compat_ioctl(file, arg);
-		break;
-
-	case AUFS_CTL_BRINFO:
-		err = au_brinfo_compat_ioctl(file, arg);
-		break;
-
-	default:
-		err = aufs_ioctl_dir(file, cmd, arg);
-	}
-
-	AuTraceErr(err);
-	return err;
-}
-
-long aufs_compat_ioctl_nondir(struct file *file, unsigned int cmd,
-			      unsigned long arg)
-{
-	return aufs_ioctl_nondir(file, cmd, (unsigned long)compat_ptr(arg));
-}
-#endif
diff --git a/fs/aufs/loop.c b/fs/aufs/loop.c
deleted file mode 100644
index 5711e7a2f..000000000
--- a/fs/aufs/loop.c
+++ /dev/null
@@ -1,133 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * support for loopback block device as a branch
- */
-
-#include "aufs.h"
-
-/* added into drivers/block/loop.c */
-static struct file *(*backing_file_func)(struct super_block *sb);
-
-/*
- * test if two lower dentries have overlapping branches.
- */
-int au_test_loopback_overlap(struct super_block *sb, struct dentry *h_adding)
-{
-	struct super_block *h_sb;
-	struct file *backing_file;
-
-	if (unlikely(!backing_file_func)) {
-		/* don't load "loop" module here */
-		backing_file_func = symbol_get(loop_backing_file);
-		if (unlikely(!backing_file_func))
-			/* "loop" module is not loaded */
-			return 0;
-	}
-
-	h_sb = h_adding->d_sb;
-	backing_file = backing_file_func(h_sb);
-	if (!backing_file)
-		return 0;
-
-	h_adding = backing_file->f_path.dentry;
-	/*
-	 * h_adding can be local NFS.
-	 * in this case aufs cannot detect the loop.
-	 */
-	if (unlikely(h_adding->d_sb == sb))
-		return 1;
-	return !!au_test_subdir(h_adding, sb->s_root);
-}
-
-/* true if a kernel thread named 'loop[0-9].*' accesses a file */
-int au_test_loopback_kthread(void)
-{
-	int ret;
-	struct task_struct *tsk = current;
-	char c, comm[sizeof(tsk->comm)];
-
-	ret = 0;
-	if (tsk->flags & PF_KTHREAD) {
-		get_task_comm(comm, tsk);
-		c = comm[4];
-		ret = ('0' <= c && c <= '9'
-		       && !strncmp(comm, "loop", 4));
-	}
-
-	return ret;
-}
-
-/* ---------------------------------------------------------------------- */
-
-#define au_warn_loopback_step	16
-static int au_warn_loopback_nelem = au_warn_loopback_step;
-static unsigned long *au_warn_loopback_array;
-
-void au_warn_loopback(struct super_block *h_sb)
-{
-	int i, new_nelem;
-	unsigned long *a, magic;
-	static DEFINE_SPINLOCK(spin);
-
-	magic = h_sb->s_magic;
-	spin_lock(&spin);
-	a = au_warn_loopback_array;
-	for (i = 0; i < au_warn_loopback_nelem && *a; i++)
-		if (a[i] == magic) {
-			spin_unlock(&spin);
-			return;
-		}
-
-	/* h_sb is new to us, print it */
-	if (i < au_warn_loopback_nelem) {
-		a[i] = magic;
-		goto pr;
-	}
-
-	/* expand the array */
-	new_nelem = au_warn_loopback_nelem + au_warn_loopback_step;
-	a = au_kzrealloc(au_warn_loopback_array,
-			 au_warn_loopback_nelem * sizeof(unsigned long),
-			 new_nelem * sizeof(unsigned long), GFP_ATOMIC);
-	if (a) {
-		au_warn_loopback_nelem = new_nelem;
-		au_warn_loopback_array = a;
-		a[i] = magic;
-		goto pr;
-	}
-
-	spin_unlock(&spin);
-	AuWarn1("realloc failed, ignored\n");
-	return;
-
-pr:
-	spin_unlock(&spin);
-	pr_warn("you may want to try another patch for loopback file "
-		"on %s(0x%lx) branch\n", au_sbtype(h_sb), magic);
-}
-
-int au_loopback_init(void)
-{
-	int err;
-	struct super_block *sb __maybe_unused;
-
-	BUILD_BUG_ON(sizeof(sb->s_magic) != sizeof(unsigned long));
-
-	err = 0;
-	au_warn_loopback_array = kcalloc(au_warn_loopback_step,
-					 sizeof(unsigned long), GFP_NOFS);
-	if (unlikely(!au_warn_loopback_array))
-		err = -ENOMEM;
-
-	return err;
-}
-
-void au_loopback_fin(void)
-{
-	if (backing_file_func)
-		symbol_put(loop_backing_file);
-	kfree(au_warn_loopback_array);
-}
diff --git a/fs/aufs/loop.h b/fs/aufs/loop.h
deleted file mode 100644
index 48bf070e8..000000000
--- a/fs/aufs/loop.h
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * support for loopback mount as a branch
- */
-
-#ifndef __AUFS_LOOP_H__
-#define __AUFS_LOOP_H__
-
-#ifdef __KERNEL__
-
-struct dentry;
-struct super_block;
-
-#ifdef CONFIG_AUFS_BDEV_LOOP
-/* drivers/block/loop.c */
-struct file *loop_backing_file(struct super_block *sb);
-
-/* loop.c */
-int au_test_loopback_overlap(struct super_block *sb, struct dentry *h_adding);
-int au_test_loopback_kthread(void);
-void au_warn_loopback(struct super_block *h_sb);
-
-int au_loopback_init(void);
-void au_loopback_fin(void);
-#else
-AuStubInt0(au_test_loopback_overlap, struct super_block *sb,
-	   struct dentry *h_adding)
-AuStubInt0(au_test_loopback_kthread, void)
-AuStubVoid(au_warn_loopback, struct super_block *h_sb)
-
-AuStubInt0(au_loopback_init, void)
-AuStubVoid(au_loopback_fin, void)
-#endif /* BLK_DEV_LOOP */
-
-#endif /* __KERNEL__ */
-#endif /* __AUFS_LOOP_H__ */
diff --git a/fs/aufs/magic.mk b/fs/aufs/magic.mk
deleted file mode 100644
index 4f83bdf1d..000000000
--- a/fs/aufs/magic.mk
+++ /dev/null
@@ -1,30 +0,0 @@
-
-# defined in ${srctree}/fs/fuse/inode.c
-# tristate
-ifdef CONFIG_FUSE_FS
-ccflags-y += -DFUSE_SUPER_MAGIC=0x65735546
-endif
-
-# defined in ${srctree}/fs/xfs/xfs_sb.h
-# tristate
-ifdef CONFIG_XFS_FS
-ccflags-y += -DXFS_SB_MAGIC=0x58465342
-endif
-
-# defined in ${srctree}/fs/configfs/mount.c
-# tristate
-ifdef CONFIG_CONFIGFS_FS
-ccflags-y += -DCONFIGFS_MAGIC=0x62656570
-endif
-
-# defined in ${srctree}/fs/ubifs/ubifs.h
-# tristate
-ifdef CONFIG_UBIFS_FS
-ccflags-y += -DUBIFS_SUPER_MAGIC=0x24051905
-endif
-
-# defined in ${srctree}/fs/hfsplus/hfsplus_raw.h
-# tristate
-ifdef CONFIG_HFSPLUS_FS
-ccflags-y += -DHFSPLUS_SUPER_MAGIC=0x482b
-endif
diff --git a/fs/aufs/module.c b/fs/aufs/module.c
deleted file mode 100644
index 8a28377c5..000000000
--- a/fs/aufs/module.c
+++ /dev/null
@@ -1,207 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * module global variables and operations
- */
-
-#include <linux/module.h>
-#include <linux/seq_file.h>
-#include "aufs.h"
-
-void *au_kzrealloc(void *p, unsigned int nused, unsigned int new_sz, gfp_t gfp)
-{
-	if (new_sz <= nused)
-		return p;
-
-	p = krealloc(p, new_sz, gfp);
-	if (p)
-		memset(p + nused, 0, new_sz - nused);
-	return p;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * aufs caches
- */
-struct kmem_cache *au_cachep[AuCache_Last];
-static int __init au_cache_init(void)
-{
-	au_cachep[AuCache_DINFO] = AuCacheCtor(au_dinfo, au_di_init_once);
-	if (au_cachep[AuCache_DINFO])
-		/* SLAB_DESTROY_BY_RCU */
-		au_cachep[AuCache_ICNTNR] = AuCacheCtor(au_icntnr,
-							au_icntnr_init_once);
-	if (au_cachep[AuCache_ICNTNR])
-		au_cachep[AuCache_FINFO] = AuCacheCtor(au_finfo,
-						       au_fi_init_once);
-	if (au_cachep[AuCache_FINFO])
-		au_cachep[AuCache_VDIR] = AuCache(au_vdir);
-	if (au_cachep[AuCache_VDIR])
-		au_cachep[AuCache_DEHSTR] = AuCache(au_vdir_dehstr);
-	if (au_cachep[AuCache_DEHSTR])
-		return 0;
-
-	return -ENOMEM;
-}
-
-static void au_cache_fin(void)
-{
-	int i;
-
-	/*
-	 * Make sure all delayed rcu free inodes are flushed before we
-	 * destroy cache.
-	 */
-	rcu_barrier();
-
-	/* excluding AuCache_HNOTIFY */
-	BUILD_BUG_ON(AuCache_HNOTIFY + 1 != AuCache_Last);
-	for (i = 0; i < AuCache_HNOTIFY; i++) {
-		kmem_cache_destroy(au_cachep[i]);
-		au_cachep[i] = NULL;
-	}
-}
-
-/* ---------------------------------------------------------------------- */
-
-int au_dir_roflags;
-
-#ifdef CONFIG_AUFS_SBILIST
-/*
- * iterate_supers_type() doesn't protect us from
- * remounting (branch management)
- */
-struct au_splhead au_sbilist;
-#endif
-
-struct lock_class_key au_lc_key[AuLcKey_Last];
-
-/*
- * functions for module interface.
- */
-MODULE_LICENSE("GPL");
-/* MODULE_LICENSE("GPL v2"); */
-MODULE_AUTHOR("Junjiro R. Okajima <aufs-users@lists.sourceforge.net>");
-MODULE_DESCRIPTION(AUFS_NAME
-	" -- Advanced multi layered unification filesystem");
-MODULE_VERSION(AUFS_VERSION);
-
-/* this module parameter has no meaning when SYSFS is disabled */
-int sysaufs_brs = 1;
-MODULE_PARM_DESC(brs, "use <sysfs>/fs/aufs/si_*/brN");
-module_param_named(brs, sysaufs_brs, int, S_IRUGO);
-
-/* this module parameter has no meaning when USER_NS is disabled */
-bool au_userns;
-MODULE_PARM_DESC(allow_userns, "allow unprivileged to mount under userns");
-module_param_named(allow_userns, au_userns, bool, S_IRUGO);
-
-/* ---------------------------------------------------------------------- */
-
-static char au_esc_chars[0x20 + 3]; /* 0x01-0x20, backslash, del, and NULL */
-
-int au_seq_path(struct seq_file *seq, struct path *path)
-{
-	int err;
-
-	err = seq_path(seq, path, au_esc_chars);
-	if (err > 0)
-		err = 0;
-	else if (err < 0)
-		err = -ENOMEM;
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static int __init aufs_init(void)
-{
-	int err, i;
-	char *p;
-
-	p = au_esc_chars;
-	for (i = 1; i <= ' '; i++)
-		*p++ = i;
-	*p++ = '\\';
-	*p++ = '\x7f';
-	*p = 0;
-
-	au_dir_roflags = au_file_roflags(O_DIRECTORY | O_LARGEFILE);
-
-	memcpy(aufs_iop_nogetattr, aufs_iop, sizeof(aufs_iop));
-	for (i = 0; i < AuIop_Last; i++)
-		aufs_iop_nogetattr[i].getattr = NULL;
-
-	au_sbilist_init();
-	sysaufs_brs_init();
-	au_debug_init();
-	au_dy_init();
-	err = sysaufs_init();
-	if (unlikely(err))
-		goto out;
-	err = au_procfs_init();
-	if (unlikely(err))
-		goto out_sysaufs;
-	err = au_wkq_init();
-	if (unlikely(err))
-		goto out_procfs;
-	err = au_loopback_init();
-	if (unlikely(err))
-		goto out_wkq;
-	err = au_hnotify_init();
-	if (unlikely(err))
-		goto out_loopback;
-	err = au_sysrq_init();
-	if (unlikely(err))
-		goto out_hin;
-	err = au_cache_init();
-	if (unlikely(err))
-		goto out_sysrq;
-
-	aufs_fs_type.fs_flags |= au_userns ? FS_USERNS_MOUNT : 0;
-	err = register_filesystem(&aufs_fs_type);
-	if (unlikely(err))
-		goto out_cache;
-
-	/* since we define pr_fmt, call printk directly */
-	printk(KERN_INFO AUFS_NAME " " AUFS_VERSION "\n");
-	goto out; /* success */
-
-out_cache:
-	au_cache_fin();
-out_sysrq:
-	au_sysrq_fin();
-out_hin:
-	au_hnotify_fin();
-out_loopback:
-	au_loopback_fin();
-out_wkq:
-	au_wkq_fin();
-out_procfs:
-	au_procfs_fin();
-out_sysaufs:
-	sysaufs_fin();
-	au_dy_fin();
-out:
-	return err;
-}
-
-static void __exit aufs_exit(void)
-{
-	unregister_filesystem(&aufs_fs_type);
-	au_cache_fin();
-	au_sysrq_fin();
-	au_hnotify_fin();
-	au_loopback_fin();
-	au_wkq_fin();
-	au_procfs_fin();
-	sysaufs_fin();
-	au_dy_fin();
-}
-
-module_init(aufs_init);
-module_exit(aufs_exit);
diff --git a/fs/aufs/module.h b/fs/aufs/module.h
deleted file mode 100644
index bb8644730..000000000
--- a/fs/aufs/module.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * module initialization and module-global
- */
-
-#ifndef __AUFS_MODULE_H__
-#define __AUFS_MODULE_H__
-
-#ifdef __KERNEL__
-
-#include <linux/slab.h>
-
-struct path;
-struct seq_file;
-
-/* module parameters */
-extern int sysaufs_brs;
-extern bool au_userns;
-
-/* ---------------------------------------------------------------------- */
-
-extern int au_dir_roflags;
-
-enum {
-	AuLcNonDir_FIINFO,
-	AuLcNonDir_DIINFO,
-	AuLcNonDir_IIINFO,
-
-	AuLcDir_FIINFO,
-	AuLcDir_DIINFO,
-	AuLcDir_IIINFO,
-
-	AuLcSymlink_DIINFO,
-	AuLcSymlink_IIINFO,
-
-	AuLcKey_Last
-};
-extern struct lock_class_key au_lc_key[AuLcKey_Last];
-
-void *au_kzrealloc(void *p, unsigned int nused, unsigned int new_sz, gfp_t gfp);
-int au_seq_path(struct seq_file *seq, struct path *path);
-
-#ifdef CONFIG_PROC_FS
-/* procfs.c */
-int __init au_procfs_init(void);
-void au_procfs_fin(void);
-#else
-AuStubInt0(au_procfs_init, void);
-AuStubVoid(au_procfs_fin, void);
-#endif
-
-/* ---------------------------------------------------------------------- */
-
-/* kmem cache */
-enum {
-	AuCache_DINFO,
-	AuCache_ICNTNR,
-	AuCache_FINFO,
-	AuCache_VDIR,
-	AuCache_DEHSTR,
-	AuCache_HNOTIFY, /* must be last */
-	AuCache_Last
-};
-
-#define AuCacheFlags		(SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD)
-#define AuCache(type)		KMEM_CACHE(type, AuCacheFlags)
-#define AuCacheCtor(type, ctor)	\
-	kmem_cache_create(#type, sizeof(struct type), \
-			  __alignof__(struct type), AuCacheFlags, ctor)
-
-extern struct kmem_cache *au_cachep[];
-
-#define AuCacheFuncs(name, index) \
-static inline struct au_##name *au_cache_alloc_##name(void) \
-{ return kmem_cache_alloc(au_cachep[AuCache_##index], GFP_NOFS); } \
-static inline void au_cache_free_##name(struct au_##name *p) \
-{ kmem_cache_free(au_cachep[AuCache_##index], p); }
-
-AuCacheFuncs(dinfo, DINFO);
-AuCacheFuncs(icntnr, ICNTNR);
-AuCacheFuncs(finfo, FINFO);
-AuCacheFuncs(vdir, VDIR);
-AuCacheFuncs(vdir_dehstr, DEHSTR);
-#ifdef CONFIG_AUFS_HNOTIFY
-AuCacheFuncs(hnotify, HNOTIFY);
-#endif
-
-#endif /* __KERNEL__ */
-#endif /* __AUFS_MODULE_H__ */
diff --git a/fs/aufs/mvdown.c b/fs/aufs/mvdown.c
deleted file mode 100644
index 1f2224f6d..000000000
--- a/fs/aufs/mvdown.c
+++ /dev/null
@@ -1,690 +0,0 @@
-/*
- * Copyright (C) 2011-2016 Junjiro R. Okajima
- */
-
-/*
- * move-down, opposite of copy-up
- */
-
-#include "aufs.h"
-
-struct au_mvd_args {
-	struct {
-		struct super_block *h_sb;
-		struct dentry *h_parent;
-		struct au_hinode *hdir;
-		struct inode *h_dir, *h_inode;
-		struct au_pin pin;
-	} info[AUFS_MVDOWN_NARRAY];
-
-	struct aufs_mvdown mvdown;
-	struct dentry *dentry, *parent;
-	struct inode *inode, *dir;
-	struct super_block *sb;
-	aufs_bindex_t bopq, bwh, bfound;
-	unsigned char rename_lock;
-};
-
-#define mvd_errno		mvdown.au_errno
-#define mvd_bsrc		mvdown.stbr[AUFS_MVDOWN_UPPER].bindex
-#define mvd_src_brid		mvdown.stbr[AUFS_MVDOWN_UPPER].brid
-#define mvd_bdst		mvdown.stbr[AUFS_MVDOWN_LOWER].bindex
-#define mvd_dst_brid		mvdown.stbr[AUFS_MVDOWN_LOWER].brid
-
-#define mvd_h_src_sb		info[AUFS_MVDOWN_UPPER].h_sb
-#define mvd_h_src_parent	info[AUFS_MVDOWN_UPPER].h_parent
-#define mvd_hdir_src		info[AUFS_MVDOWN_UPPER].hdir
-#define mvd_h_src_dir		info[AUFS_MVDOWN_UPPER].h_dir
-#define mvd_h_src_inode		info[AUFS_MVDOWN_UPPER].h_inode
-#define mvd_pin_src		info[AUFS_MVDOWN_UPPER].pin
-
-#define mvd_h_dst_sb		info[AUFS_MVDOWN_LOWER].h_sb
-#define mvd_h_dst_parent	info[AUFS_MVDOWN_LOWER].h_parent
-#define mvd_hdir_dst		info[AUFS_MVDOWN_LOWER].hdir
-#define mvd_h_dst_dir		info[AUFS_MVDOWN_LOWER].h_dir
-#define mvd_h_dst_inode		info[AUFS_MVDOWN_LOWER].h_inode
-#define mvd_pin_dst		info[AUFS_MVDOWN_LOWER].pin
-
-#define AU_MVD_PR(flag, ...) do {			\
-		if (flag)				\
-			pr_err(__VA_ARGS__);		\
-	} while (0)
-
-static int find_lower_writable(struct au_mvd_args *a)
-{
-	struct super_block *sb;
-	aufs_bindex_t bindex, bend;
-	struct au_branch *br;
-
-	sb = a->sb;
-	bindex = a->mvd_bsrc;
-	bend = au_sbend(sb);
-	if (a->mvdown.flags & AUFS_MVDOWN_FHSM_LOWER)
-		for (bindex++; bindex <= bend; bindex++) {
-			br = au_sbr(sb, bindex);
-			if (au_br_fhsm(br->br_perm)
-			    && (!(au_br_sb(br)->s_flags & MS_RDONLY)))
-				return bindex;
-		}
-	else if (!(a->mvdown.flags & AUFS_MVDOWN_ROLOWER))
-		for (bindex++; bindex <= bend; bindex++) {
-			br = au_sbr(sb, bindex);
-			if (!au_br_rdonly(br))
-				return bindex;
-		}
-	else
-		for (bindex++; bindex <= bend; bindex++) {
-			br = au_sbr(sb, bindex);
-			if (!(au_br_sb(br)->s_flags & MS_RDONLY)) {
-				if (au_br_rdonly(br))
-					a->mvdown.flags
-						|= AUFS_MVDOWN_ROLOWER_R;
-				return bindex;
-			}
-		}
-
-	return -1;
-}
-
-/* make the parent dir on bdst */
-static int au_do_mkdir(const unsigned char dmsg, struct au_mvd_args *a)
-{
-	int err;
-
-	err = 0;
-	a->mvd_hdir_src = au_hi(a->dir, a->mvd_bsrc);
-	a->mvd_hdir_dst = au_hi(a->dir, a->mvd_bdst);
-	a->mvd_h_src_parent = au_h_dptr(a->parent, a->mvd_bsrc);
-	a->mvd_h_dst_parent = NULL;
-	if (au_dbend(a->parent) >= a->mvd_bdst)
-		a->mvd_h_dst_parent = au_h_dptr(a->parent, a->mvd_bdst);
-	if (!a->mvd_h_dst_parent) {
-		err = au_cpdown_dirs(a->dentry, a->mvd_bdst);
-		if (unlikely(err)) {
-			AU_MVD_PR(dmsg, "cpdown_dirs failed\n");
-			goto out;
-		}
-		a->mvd_h_dst_parent = au_h_dptr(a->parent, a->mvd_bdst);
-	}
-
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-/* lock them all */
-static int au_do_lock(const unsigned char dmsg, struct au_mvd_args *a)
-{
-	int err;
-	struct dentry *h_trap;
-
-	a->mvd_h_src_sb = au_sbr_sb(a->sb, a->mvd_bsrc);
-	a->mvd_h_dst_sb = au_sbr_sb(a->sb, a->mvd_bdst);
-	err = au_pin(&a->mvd_pin_dst, a->dentry, a->mvd_bdst,
-		     au_opt_udba(a->sb),
-		     AuPin_MNT_WRITE | AuPin_DI_LOCKED);
-	AuTraceErr(err);
-	if (unlikely(err)) {
-		AU_MVD_PR(dmsg, "pin_dst failed\n");
-		goto out;
-	}
-
-	if (a->mvd_h_src_sb != a->mvd_h_dst_sb) {
-		a->rename_lock = 0;
-		au_pin_init(&a->mvd_pin_src, a->dentry, a->mvd_bsrc,
-			    AuLsc_DI_PARENT, AuLsc_I_PARENT3,
-			    au_opt_udba(a->sb),
-			    AuPin_MNT_WRITE | AuPin_DI_LOCKED);
-		err = au_do_pin(&a->mvd_pin_src);
-		AuTraceErr(err);
-		a->mvd_h_src_dir = d_inode(a->mvd_h_src_parent);
-		if (unlikely(err)) {
-			AU_MVD_PR(dmsg, "pin_src failed\n");
-			goto out_dst;
-		}
-		goto out; /* success */
-	}
-
-	a->rename_lock = 1;
-	au_pin_hdir_unlock(&a->mvd_pin_dst);
-	err = au_pin(&a->mvd_pin_src, a->dentry, a->mvd_bsrc,
-		     au_opt_udba(a->sb),
-		     AuPin_MNT_WRITE | AuPin_DI_LOCKED);
-	AuTraceErr(err);
-	a->mvd_h_src_dir = d_inode(a->mvd_h_src_parent);
-	if (unlikely(err)) {
-		AU_MVD_PR(dmsg, "pin_src failed\n");
-		au_pin_hdir_lock(&a->mvd_pin_dst);
-		goto out_dst;
-	}
-	au_pin_hdir_unlock(&a->mvd_pin_src);
-	h_trap = vfsub_lock_rename(a->mvd_h_src_parent, a->mvd_hdir_src,
-				   a->mvd_h_dst_parent, a->mvd_hdir_dst);
-	if (h_trap) {
-		err = (h_trap != a->mvd_h_src_parent);
-		if (err)
-			err = (h_trap != a->mvd_h_dst_parent);
-	}
-	BUG_ON(err); /* it should never happen */
-	if (unlikely(a->mvd_h_src_dir != au_pinned_h_dir(&a->mvd_pin_src))) {
-		err = -EBUSY;
-		AuTraceErr(err);
-		vfsub_unlock_rename(a->mvd_h_src_parent, a->mvd_hdir_src,
-				    a->mvd_h_dst_parent, a->mvd_hdir_dst);
-		au_pin_hdir_lock(&a->mvd_pin_src);
-		au_unpin(&a->mvd_pin_src);
-		au_pin_hdir_lock(&a->mvd_pin_dst);
-		goto out_dst;
-	}
-	goto out; /* success */
-
-out_dst:
-	au_unpin(&a->mvd_pin_dst);
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-static void au_do_unlock(const unsigned char dmsg, struct au_mvd_args *a)
-{
-	if (!a->rename_lock)
-		au_unpin(&a->mvd_pin_src);
-	else {
-		vfsub_unlock_rename(a->mvd_h_src_parent, a->mvd_hdir_src,
-				    a->mvd_h_dst_parent, a->mvd_hdir_dst);
-		au_pin_hdir_lock(&a->mvd_pin_src);
-		au_unpin(&a->mvd_pin_src);
-		au_pin_hdir_lock(&a->mvd_pin_dst);
-	}
-	au_unpin(&a->mvd_pin_dst);
-}
-
-/* copy-down the file */
-static int au_do_cpdown(const unsigned char dmsg, struct au_mvd_args *a)
-{
-	int err;
-	struct au_cp_generic cpg = {
-		.dentry	= a->dentry,
-		.bdst	= a->mvd_bdst,
-		.bsrc	= a->mvd_bsrc,
-		.len	= -1,
-		.pin	= &a->mvd_pin_dst,
-		.flags	= AuCpup_DTIME | AuCpup_HOPEN
-	};
-
-	AuDbg("b%d, b%d\n", cpg.bsrc, cpg.bdst);
-	if (a->mvdown.flags & AUFS_MVDOWN_OWLOWER)
-		au_fset_cpup(cpg.flags, OVERWRITE);
-	if (a->mvdown.flags & AUFS_MVDOWN_ROLOWER)
-		au_fset_cpup(cpg.flags, RWDST);
-	err = au_sio_cpdown_simple(&cpg);
-	if (unlikely(err))
-		AU_MVD_PR(dmsg, "cpdown failed\n");
-
-	AuTraceErr(err);
-	return err;
-}
-
-/*
- * unlink the whiteout on bdst if exist which may be created by UDBA while we
- * were sleeping
- */
-static int au_do_unlink_wh(const unsigned char dmsg, struct au_mvd_args *a)
-{
-	int err;
-	struct path h_path;
-	struct au_branch *br;
-	struct inode *delegated;
-
-	br = au_sbr(a->sb, a->mvd_bdst);
-	h_path.dentry = au_wh_lkup(a->mvd_h_dst_parent, &a->dentry->d_name, br);
-	err = PTR_ERR(h_path.dentry);
-	if (IS_ERR(h_path.dentry)) {
-		AU_MVD_PR(dmsg, "wh_lkup failed\n");
-		goto out;
-	}
-
-	err = 0;
-	if (d_is_positive(h_path.dentry)) {
-		h_path.mnt = au_br_mnt(br);
-		delegated = NULL;
-		err = vfsub_unlink(d_inode(a->mvd_h_dst_parent), &h_path,
-				   &delegated, /*force*/0);
-		if (unlikely(err == -EWOULDBLOCK)) {
-			pr_warn("cannot retry for NFSv4 delegation"
-				" for an internal unlink\n");
-			iput(delegated);
-		}
-		if (unlikely(err))
-			AU_MVD_PR(dmsg, "wh_unlink failed\n");
-	}
-	dput(h_path.dentry);
-
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-/*
- * unlink the topmost h_dentry
- */
-static int au_do_unlink(const unsigned char dmsg, struct au_mvd_args *a)
-{
-	int err;
-	struct path h_path;
-	struct inode *delegated;
-
-	h_path.mnt = au_sbr_mnt(a->sb, a->mvd_bsrc);
-	h_path.dentry = au_h_dptr(a->dentry, a->mvd_bsrc);
-	delegated = NULL;
-	err = vfsub_unlink(a->mvd_h_src_dir, &h_path, &delegated, /*force*/0);
-	if (unlikely(err == -EWOULDBLOCK)) {
-		pr_warn("cannot retry for NFSv4 delegation"
-			" for an internal unlink\n");
-		iput(delegated);
-	}
-	if (unlikely(err))
-		AU_MVD_PR(dmsg, "unlink failed\n");
-
-	AuTraceErr(err);
-	return err;
-}
-
-/* Since mvdown succeeded, we ignore an error of this function */
-static void au_do_stfs(const unsigned char dmsg, struct au_mvd_args *a)
-{
-	int err;
-	struct au_branch *br;
-
-	a->mvdown.flags |= AUFS_MVDOWN_STFS_FAILED;
-	br = au_sbr(a->sb, a->mvd_bsrc);
-	err = au_br_stfs(br, &a->mvdown.stbr[AUFS_MVDOWN_UPPER].stfs);
-	if (!err) {
-		br = au_sbr(a->sb, a->mvd_bdst);
-		a->mvdown.stbr[AUFS_MVDOWN_LOWER].brid = br->br_id;
-		err = au_br_stfs(br, &a->mvdown.stbr[AUFS_MVDOWN_LOWER].stfs);
-	}
-	if (!err)
-		a->mvdown.flags &= ~AUFS_MVDOWN_STFS_FAILED;
-	else
-		AU_MVD_PR(dmsg, "statfs failed (%d), ignored\n", err);
-}
-
-/*
- * copy-down the file and unlink the bsrc file.
- * - unlink the bdst whout if exist
- * - copy-down the file (with whtmp name and rename)
- * - unlink the bsrc file
- */
-static int au_do_mvdown(const unsigned char dmsg, struct au_mvd_args *a)
-{
-	int err;
-
-	err = au_do_mkdir(dmsg, a);
-	if (!err)
-		err = au_do_lock(dmsg, a);
-	if (unlikely(err))
-		goto out;
-
-	/*
-	 * do not revert the activities we made on bdst since they should be
-	 * harmless in aufs.
-	 */
-
-	err = au_do_cpdown(dmsg, a);
-	if (!err)
-		err = au_do_unlink_wh(dmsg, a);
-	if (!err && !(a->mvdown.flags & AUFS_MVDOWN_KUPPER))
-		err = au_do_unlink(dmsg, a);
-	if (unlikely(err))
-		goto out_unlock;
-
-	AuDbg("%pd2, 0x%x, %d --> %d\n",
-	      a->dentry, a->mvdown.flags, a->mvd_bsrc, a->mvd_bdst);
-	if (find_lower_writable(a) < 0)
-		a->mvdown.flags |= AUFS_MVDOWN_BOTTOM;
-
-	if (a->mvdown.flags & AUFS_MVDOWN_STFS)
-		au_do_stfs(dmsg, a);
-
-	/* maintain internal array */
-	if (!(a->mvdown.flags & AUFS_MVDOWN_KUPPER)) {
-		au_set_h_dptr(a->dentry, a->mvd_bsrc, NULL);
-		au_set_dbstart(a->dentry, a->mvd_bdst);
-		au_set_h_iptr(a->inode, a->mvd_bsrc, NULL, /*flags*/0);
-		au_set_ibstart(a->inode, a->mvd_bdst);
-	} else {
-		/* hide the lower */
-		au_set_h_dptr(a->dentry, a->mvd_bdst, NULL);
-		au_set_dbend(a->dentry, a->mvd_bsrc);
-		au_set_h_iptr(a->inode, a->mvd_bdst, NULL, /*flags*/0);
-		au_set_ibend(a->inode, a->mvd_bsrc);
-	}
-	if (au_dbend(a->dentry) < a->mvd_bdst)
-		au_set_dbend(a->dentry, a->mvd_bdst);
-	if (au_ibend(a->inode) < a->mvd_bdst)
-		au_set_ibend(a->inode, a->mvd_bdst);
-
-out_unlock:
-	au_do_unlock(dmsg, a);
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* make sure the file is idle */
-static int au_mvd_args_busy(const unsigned char dmsg, struct au_mvd_args *a)
-{
-	int err, plinked;
-
-	err = 0;
-	plinked = !!au_opt_test(au_mntflags(a->sb), PLINK);
-	if (au_dbstart(a->dentry) == a->mvd_bsrc
-	    && au_dcount(a->dentry) == 1
-	    && atomic_read(&a->inode->i_count) == 1
-	    /* && a->mvd_h_src_inode->i_nlink == 1 */
-	    && (!plinked || !au_plink_test(a->inode))
-	    && a->inode->i_nlink == 1)
-		goto out;
-
-	err = -EBUSY;
-	AU_MVD_PR(dmsg,
-		  "b%d, d{b%d, c%d?}, i{c%d?, l%u}, hi{l%u}, p{%d, %d}\n",
-		  a->mvd_bsrc, au_dbstart(a->dentry), au_dcount(a->dentry),
-		  atomic_read(&a->inode->i_count), a->inode->i_nlink,
-		  a->mvd_h_src_inode->i_nlink,
-		  plinked, plinked ? au_plink_test(a->inode) : 0);
-
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-/* make sure the parent dir is fine */
-static int au_mvd_args_parent(const unsigned char dmsg,
-			      struct au_mvd_args *a)
-{
-	int err;
-	aufs_bindex_t bindex;
-
-	err = 0;
-	if (unlikely(au_alive_dir(a->parent))) {
-		err = -ENOENT;
-		AU_MVD_PR(dmsg, "parent dir is dead\n");
-		goto out;
-	}
-
-	a->bopq = au_dbdiropq(a->parent);
-	bindex = au_wbr_nonopq(a->dentry, a->mvd_bdst);
-	AuDbg("b%d\n", bindex);
-	if (unlikely((bindex >= 0 && bindex < a->mvd_bdst)
-		     || (a->bopq != -1 && a->bopq < a->mvd_bdst))) {
-		err = -EINVAL;
-		a->mvd_errno = EAU_MVDOWN_OPAQUE;
-		AU_MVD_PR(dmsg, "ancestor is opaque b%d, b%d\n",
-			  a->bopq, a->mvd_bdst);
-	}
-
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-static int au_mvd_args_intermediate(const unsigned char dmsg,
-				    struct au_mvd_args *a)
-{
-	int err;
-	struct au_dinfo *dinfo, *tmp;
-
-	/* lookup the next lower positive entry */
-	err = -ENOMEM;
-	tmp = au_di_alloc(a->sb, AuLsc_DI_TMP);
-	if (unlikely(!tmp))
-		goto out;
-
-	a->bfound = -1;
-	a->bwh = -1;
-	dinfo = au_di(a->dentry);
-	au_di_cp(tmp, dinfo);
-	au_di_swap(tmp, dinfo);
-
-	/* returns the number of positive dentries */
-	err = au_lkup_dentry(a->dentry, a->mvd_bsrc + 1, /*type*/0);
-	if (!err)
-		a->bwh = au_dbwh(a->dentry);
-	else if (err > 0)
-		a->bfound = au_dbstart(a->dentry);
-
-	au_di_swap(tmp, dinfo);
-	au_rw_write_unlock(&tmp->di_rwsem);
-	au_di_free(tmp);
-	if (unlikely(err < 0))
-		AU_MVD_PR(dmsg, "failed look-up lower\n");
-
-	/*
-	 * here, we have these cases.
-	 * bfound == -1
-	 *	no positive dentry under bsrc. there are more sub-cases.
-	 *	bwh < 0
-	 *		there no whiteout, we can safely move-down.
-	 *	bwh <= bsrc
-	 *		impossible
-	 *	bsrc < bwh && bwh < bdst
-	 *		there is a whiteout on RO branch. cannot proceed.
-	 *	bwh == bdst
-	 *		there is a whiteout on the RW target branch. it should
-	 *		be removed.
-	 *	bdst < bwh
-	 *		there is a whiteout somewhere unrelated branch.
-	 * -1 < bfound && bfound <= bsrc
-	 *	impossible.
-	 * bfound < bdst
-	 *	found, but it is on RO branch between bsrc and bdst. cannot
-	 *	proceed.
-	 * bfound == bdst
-	 *	found, replace it if AUFS_MVDOWN_FORCE is set. otherwise return
-	 *	error.
-	 * bdst < bfound
-	 *	found, after we create the file on bdst, it will be hidden.
-	 */
-
-	AuDebugOn(a->bfound == -1
-		  && a->bwh != -1
-		  && a->bwh <= a->mvd_bsrc);
-	AuDebugOn(-1 < a->bfound
-		  && a->bfound <= a->mvd_bsrc);
-
-	err = -EINVAL;
-	if (a->bfound == -1
-	    && a->mvd_bsrc < a->bwh
-	    && a->bwh != -1
-	    && a->bwh < a->mvd_bdst) {
-		a->mvd_errno = EAU_MVDOWN_WHITEOUT;
-		AU_MVD_PR(dmsg, "bsrc %d, bdst %d, bfound %d, bwh %d\n",
-			  a->mvd_bsrc, a->mvd_bdst, a->bfound, a->bwh);
-		goto out;
-	} else if (a->bfound != -1 && a->bfound < a->mvd_bdst) {
-		a->mvd_errno = EAU_MVDOWN_UPPER;
-		AU_MVD_PR(dmsg, "bdst %d, bfound %d\n",
-			  a->mvd_bdst, a->bfound);
-		goto out;
-	}
-
-	err = 0; /* success */
-
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-static int au_mvd_args_exist(const unsigned char dmsg, struct au_mvd_args *a)
-{
-	int err;
-
-	err = 0;
-	if (!(a->mvdown.flags & AUFS_MVDOWN_OWLOWER)
-	    && a->bfound == a->mvd_bdst)
-		err = -EEXIST;
-	AuTraceErr(err);
-	return err;
-}
-
-static int au_mvd_args(const unsigned char dmsg, struct au_mvd_args *a)
-{
-	int err;
-	struct au_branch *br;
-
-	err = -EISDIR;
-	if (unlikely(S_ISDIR(a->inode->i_mode)))
-		goto out;
-
-	err = -EINVAL;
-	if (!(a->mvdown.flags & AUFS_MVDOWN_BRID_UPPER))
-		a->mvd_bsrc = au_ibstart(a->inode);
-	else {
-		a->mvd_bsrc = au_br_index(a->sb, a->mvd_src_brid);
-		if (unlikely(a->mvd_bsrc < 0
-			     || (a->mvd_bsrc < au_dbstart(a->dentry)
-				 || au_dbend(a->dentry) < a->mvd_bsrc
-				 || !au_h_dptr(a->dentry, a->mvd_bsrc))
-			     || (a->mvd_bsrc < au_ibstart(a->inode)
-				 || au_ibend(a->inode) < a->mvd_bsrc
-				 || !au_h_iptr(a->inode, a->mvd_bsrc)))) {
-			a->mvd_errno = EAU_MVDOWN_NOUPPER;
-			AU_MVD_PR(dmsg, "no upper\n");
-			goto out;
-		}
-	}
-	if (unlikely(a->mvd_bsrc == au_sbend(a->sb))) {
-		a->mvd_errno = EAU_MVDOWN_BOTTOM;
-		AU_MVD_PR(dmsg, "on the bottom\n");
-		goto out;
-	}
-	a->mvd_h_src_inode = au_h_iptr(a->inode, a->mvd_bsrc);
-	br = au_sbr(a->sb, a->mvd_bsrc);
-	err = au_br_rdonly(br);
-	if (!(a->mvdown.flags & AUFS_MVDOWN_ROUPPER)) {
-		if (unlikely(err))
-			goto out;
-	} else if (!(vfsub_native_ro(a->mvd_h_src_inode)
-		     || IS_APPEND(a->mvd_h_src_inode))) {
-		if (err)
-			a->mvdown.flags |= AUFS_MVDOWN_ROUPPER_R;
-		/* go on */
-	} else
-		goto out;
-
-	err = -EINVAL;
-	if (!(a->mvdown.flags & AUFS_MVDOWN_BRID_LOWER)) {
-		a->mvd_bdst = find_lower_writable(a);
-		if (unlikely(a->mvd_bdst < 0)) {
-			a->mvd_errno = EAU_MVDOWN_BOTTOM;
-			AU_MVD_PR(dmsg, "no writable lower branch\n");
-			goto out;
-		}
-	} else {
-		a->mvd_bdst = au_br_index(a->sb, a->mvd_dst_brid);
-		if (unlikely(a->mvd_bdst < 0
-			     || au_sbend(a->sb) < a->mvd_bdst)) {
-			a->mvd_errno = EAU_MVDOWN_NOLOWERBR;
-			AU_MVD_PR(dmsg, "no lower brid\n");
-			goto out;
-		}
-	}
-
-	err = au_mvd_args_busy(dmsg, a);
-	if (!err)
-		err = au_mvd_args_parent(dmsg, a);
-	if (!err)
-		err = au_mvd_args_intermediate(dmsg, a);
-	if (!err)
-		err = au_mvd_args_exist(dmsg, a);
-	if (!err)
-		AuDbg("b%d, b%d\n", a->mvd_bsrc, a->mvd_bdst);
-
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-int au_mvdown(struct dentry *dentry, struct aufs_mvdown __user *uarg)
-{
-	int err, e;
-	unsigned char dmsg;
-	struct au_mvd_args *args;
-	struct inode *inode;
-
-	inode = d_inode(dentry);
-	err = -EPERM;
-	if (unlikely(!capable(CAP_SYS_ADMIN)))
-		goto out;
-
-	err = -ENOMEM;
-	args = kmalloc(sizeof(*args), GFP_NOFS);
-	if (unlikely(!args))
-		goto out;
-
-	err = copy_from_user(&args->mvdown, uarg, sizeof(args->mvdown));
-	if (!err)
-		err = !access_ok(VERIFY_WRITE, uarg, sizeof(*uarg));
-	if (unlikely(err)) {
-		err = -EFAULT;
-		AuTraceErr(err);
-		goto out_free;
-	}
-	AuDbg("flags 0x%x\n", args->mvdown.flags);
-	args->mvdown.flags &= ~(AUFS_MVDOWN_ROLOWER_R | AUFS_MVDOWN_ROUPPER_R);
-	args->mvdown.au_errno = 0;
-	args->dentry = dentry;
-	args->inode = inode;
-	args->sb = dentry->d_sb;
-
-	err = -ENOENT;
-	dmsg = !!(args->mvdown.flags & AUFS_MVDOWN_DMSG);
-	args->parent = dget_parent(dentry);
-	args->dir = d_inode(args->parent);
-	mutex_lock_nested(&args->dir->i_mutex, I_MUTEX_PARENT);
-	dput(args->parent);
-	if (unlikely(args->parent != dentry->d_parent)) {
-		AU_MVD_PR(dmsg, "parent dir is moved\n");
-		goto out_dir;
-	}
-
-	mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
-	err = aufs_read_lock(dentry, AuLock_DW | AuLock_FLUSH | AuLock_NOPLMW);
-	if (unlikely(err))
-		goto out_inode;
-
-	di_write_lock_parent(args->parent);
-	err = au_mvd_args(dmsg, args);
-	if (unlikely(err))
-		goto out_parent;
-
-	err = au_do_mvdown(dmsg, args);
-	if (unlikely(err))
-		goto out_parent;
-
-	au_cpup_attr_timesizes(args->dir);
-	au_cpup_attr_timesizes(inode);
-	if (!(args->mvdown.flags & AUFS_MVDOWN_KUPPER))
-		au_cpup_igen(inode, au_h_iptr(inode, args->mvd_bdst));
-	/* au_digen_dec(dentry); */
-
-out_parent:
-	di_write_unlock(args->parent);
-	aufs_read_unlock(dentry, AuLock_DW);
-out_inode:
-	mutex_unlock(&inode->i_mutex);
-out_dir:
-	mutex_unlock(&args->dir->i_mutex);
-out_free:
-	e = copy_to_user(uarg, &args->mvdown, sizeof(args->mvdown));
-	if (unlikely(e))
-		err = -EFAULT;
-	kfree(args);
-out:
-	AuTraceErr(err);
-	return err;
-}
diff --git a/fs/aufs/opts.c b/fs/aufs/opts.c
deleted file mode 100644
index 5c39817f3..000000000
--- a/fs/aufs/opts.c
+++ /dev/null
@@ -1,1846 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * mount options/flags
- */
-
-#include <linux/namei.h>
-#include <linux/types.h> /* a distribution requires */
-#include <linux/parser.h>
-#include "aufs.h"
-
-/* ---------------------------------------------------------------------- */
-
-enum {
-	Opt_br,
-	Opt_add, Opt_del, Opt_mod, Opt_append, Opt_prepend,
-	Opt_idel, Opt_imod,
-	Opt_dirwh, Opt_rdcache, Opt_rdblk, Opt_rdhash,
-	Opt_rdblk_def, Opt_rdhash_def,
-	Opt_xino, Opt_noxino,
-	Opt_trunc_xino, Opt_trunc_xino_v, Opt_notrunc_xino,
-	Opt_trunc_xino_path, Opt_itrunc_xino,
-	Opt_trunc_xib, Opt_notrunc_xib,
-	Opt_shwh, Opt_noshwh,
-	Opt_plink, Opt_noplink, Opt_list_plink,
-	Opt_udba,
-	Opt_dio, Opt_nodio,
-	Opt_diropq_a, Opt_diropq_w,
-	Opt_warn_perm, Opt_nowarn_perm,
-	Opt_wbr_copyup, Opt_wbr_create,
-	Opt_fhsm_sec,
-	Opt_verbose, Opt_noverbose,
-	Opt_sum, Opt_nosum, Opt_wsum,
-	Opt_dirperm1, Opt_nodirperm1,
-	Opt_acl, Opt_noacl,
-	Opt_tail, Opt_ignore, Opt_ignore_silent, Opt_err
-};
-
-static match_table_t options = {
-	{Opt_br, "br=%s"},
-	{Opt_br, "br:%s"},
-
-	{Opt_add, "add=%d:%s"},
-	{Opt_add, "add:%d:%s"},
-	{Opt_add, "ins=%d:%s"},
-	{Opt_add, "ins:%d:%s"},
-	{Opt_append, "append=%s"},
-	{Opt_append, "append:%s"},
-	{Opt_prepend, "prepend=%s"},
-	{Opt_prepend, "prepend:%s"},
-
-	{Opt_del, "del=%s"},
-	{Opt_del, "del:%s"},
-	/* {Opt_idel, "idel:%d"}, */
-	{Opt_mod, "mod=%s"},
-	{Opt_mod, "mod:%s"},
-	/* {Opt_imod, "imod:%d:%s"}, */
-
-	{Opt_dirwh, "dirwh=%d"},
-
-	{Opt_xino, "xino=%s"},
-	{Opt_noxino, "noxino"},
-	{Opt_trunc_xino, "trunc_xino"},
-	{Opt_trunc_xino_v, "trunc_xino_v=%d:%d"},
-	{Opt_notrunc_xino, "notrunc_xino"},
-	{Opt_trunc_xino_path, "trunc_xino=%s"},
-	{Opt_itrunc_xino, "itrunc_xino=%d"},
-	/* {Opt_zxino, "zxino=%s"}, */
-	{Opt_trunc_xib, "trunc_xib"},
-	{Opt_notrunc_xib, "notrunc_xib"},
-
-#ifdef CONFIG_PROC_FS
-	{Opt_plink, "plink"},
-#else
-	{Opt_ignore_silent, "plink"},
-#endif
-
-	{Opt_noplink, "noplink"},
-
-#ifdef CONFIG_AUFS_DEBUG
-	{Opt_list_plink, "list_plink"},
-#endif
-
-	{Opt_udba, "udba=%s"},
-
-	{Opt_dio, "dio"},
-	{Opt_nodio, "nodio"},
-
-#ifdef CONFIG_AUFS_FHSM
-	{Opt_fhsm_sec, "fhsm_sec=%d"},
-#else
-	{Opt_ignore_silent, "fhsm_sec=%d"},
-#endif
-
-	{Opt_diropq_a, "diropq=always"},
-	{Opt_diropq_a, "diropq=a"},
-	{Opt_diropq_w, "diropq=whiteouted"},
-	{Opt_diropq_w, "diropq=w"},
-
-	{Opt_warn_perm, "warn_perm"},
-	{Opt_nowarn_perm, "nowarn_perm"},
-
-	/* keep them temporary */
-	{Opt_ignore_silent, "nodlgt"},
-	{Opt_ignore_silent, "clean_plink"},
-
-#ifdef CONFIG_AUFS_SHWH
-	{Opt_shwh, "shwh"},
-#endif
-	{Opt_noshwh, "noshwh"},
-
-	{Opt_dirperm1, "dirperm1"},
-	{Opt_nodirperm1, "nodirperm1"},
-
-	{Opt_verbose, "verbose"},
-	{Opt_verbose, "v"},
-	{Opt_noverbose, "noverbose"},
-	{Opt_noverbose, "quiet"},
-	{Opt_noverbose, "q"},
-	{Opt_noverbose, "silent"},
-
-	{Opt_sum, "sum"},
-	{Opt_nosum, "nosum"},
-	{Opt_wsum, "wsum"},
-
-	{Opt_rdcache, "rdcache=%d"},
-	{Opt_rdblk, "rdblk=%d"},
-	{Opt_rdblk_def, "rdblk=def"},
-	{Opt_rdhash, "rdhash=%d"},
-	{Opt_rdhash_def, "rdhash=def"},
-
-	{Opt_wbr_create, "create=%s"},
-	{Opt_wbr_create, "create_policy=%s"},
-	{Opt_wbr_copyup, "cpup=%s"},
-	{Opt_wbr_copyup, "copyup=%s"},
-	{Opt_wbr_copyup, "copyup_policy=%s"},
-
-	/* generic VFS flag */
-#ifdef CONFIG_FS_POSIX_ACL
-	{Opt_acl, "acl"},
-	{Opt_noacl, "noacl"},
-#else
-	{Opt_ignore_silent, "acl"},
-	{Opt_ignore_silent, "noacl"},
-#endif
-
-	/* internal use for the scripts */
-	{Opt_ignore_silent, "si=%s"},
-
-	{Opt_br, "dirs=%s"},
-	{Opt_ignore, "debug=%d"},
-	{Opt_ignore, "delete=whiteout"},
-	{Opt_ignore, "delete=all"},
-	{Opt_ignore, "imap=%s"},
-
-	/* temporary workaround, due to old mount(8)? */
-	{Opt_ignore_silent, "relatime"},
-
-	{Opt_err, NULL}
-};
-
-/* ---------------------------------------------------------------------- */
-
-static const char *au_parser_pattern(int val, match_table_t tbl)
-{
-	struct match_token *p;
-
-	p = tbl;
-	while (p->pattern) {
-		if (p->token == val)
-			return p->pattern;
-		p++;
-	}
-	BUG();
-	return "??";
-}
-
-static const char *au_optstr(int *val, match_table_t tbl)
-{
-	struct match_token *p;
-	int v;
-
-	v = *val;
-	if (!v)
-		goto out;
-	p = tbl;
-	while (p->pattern) {
-		if (p->token
-		    && (v & p->token) == p->token) {
-			*val &= ~p->token;
-			return p->pattern;
-		}
-		p++;
-	}
-
-out:
-	return NULL;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static match_table_t brperm = {
-	{AuBrPerm_RO, AUFS_BRPERM_RO},
-	{AuBrPerm_RR, AUFS_BRPERM_RR},
-	{AuBrPerm_RW, AUFS_BRPERM_RW},
-	{0, NULL}
-};
-
-static match_table_t brattr = {
-	/* general */
-	{AuBrAttr_COO_REG, AUFS_BRATTR_COO_REG},
-	{AuBrAttr_COO_ALL, AUFS_BRATTR_COO_ALL},
-	/* 'unpin' attrib is meaningless since linux-3.18-rc1 */
-	{AuBrAttr_UNPIN, AUFS_BRATTR_UNPIN},
-#ifdef CONFIG_AUFS_FHSM
-	{AuBrAttr_FHSM, AUFS_BRATTR_FHSM},
-#endif
-#ifdef CONFIG_AUFS_XATTR
-	{AuBrAttr_ICEX, AUFS_BRATTR_ICEX},
-	{AuBrAttr_ICEX_SEC, AUFS_BRATTR_ICEX_SEC},
-	{AuBrAttr_ICEX_SYS, AUFS_BRATTR_ICEX_SYS},
-	{AuBrAttr_ICEX_TR, AUFS_BRATTR_ICEX_TR},
-	{AuBrAttr_ICEX_USR, AUFS_BRATTR_ICEX_USR},
-	{AuBrAttr_ICEX_OTH, AUFS_BRATTR_ICEX_OTH},
-#endif
-
-	/* ro/rr branch */
-	{AuBrRAttr_WH, AUFS_BRRATTR_WH},
-
-	/* rw branch */
-	{AuBrWAttr_MOO, AUFS_BRWATTR_MOO},
-	{AuBrWAttr_NoLinkWH, AUFS_BRWATTR_NLWH},
-
-	{0, NULL}
-};
-
-static int br_attr_val(char *str, match_table_t table, substring_t args[])
-{
-	int attr, v;
-	char *p;
-
-	attr = 0;
-	do {
-		p = strchr(str, '+');
-		if (p)
-			*p = 0;
-		v = match_token(str, table, args);
-		if (v) {
-			if (v & AuBrAttr_CMOO_Mask)
-				attr &= ~AuBrAttr_CMOO_Mask;
-			attr |= v;
-		} else {
-			if (p)
-				*p = '+';
-			pr_warn("ignored branch attribute %s\n", str);
-			break;
-		}
-		if (p)
-			str = p + 1;
-	} while (p);
-
-	return attr;
-}
-
-static int au_do_optstr_br_attr(au_br_perm_str_t *str, int perm)
-{
-	int sz;
-	const char *p;
-	char *q;
-
-	q = str->a;
-	*q = 0;
-	p = au_optstr(&perm, brattr);
-	if (p) {
-		sz = strlen(p);
-		memcpy(q, p, sz + 1);
-		q += sz;
-	} else
-		goto out;
-
-	do {
-		p = au_optstr(&perm, brattr);
-		if (p) {
-			*q++ = '+';
-			sz = strlen(p);
-			memcpy(q, p, sz + 1);
-			q += sz;
-		}
-	} while (p);
-
-out:
-	return q - str->a;
-}
-
-static int noinline_for_stack br_perm_val(char *perm)
-{
-	int val, bad, sz;
-	char *p;
-	substring_t args[MAX_OPT_ARGS];
-	au_br_perm_str_t attr;
-
-	p = strchr(perm, '+');
-	if (p)
-		*p = 0;
-	val = match_token(perm, brperm, args);
-	if (!val) {
-		if (p)
-			*p = '+';
-		pr_warn("ignored branch permission %s\n", perm);
-		val = AuBrPerm_RO;
-		goto out;
-	}
-	if (!p)
-		goto out;
-
-	val |= br_attr_val(p + 1, brattr, args);
-
-	bad = 0;
-	switch (val & AuBrPerm_Mask) {
-	case AuBrPerm_RO:
-	case AuBrPerm_RR:
-		bad = val & AuBrWAttr_Mask;
-		val &= ~AuBrWAttr_Mask;
-		break;
-	case AuBrPerm_RW:
-		bad = val & AuBrRAttr_Mask;
-		val &= ~AuBrRAttr_Mask;
-		break;
-	}
-
-	/*
-	 * 'unpin' attrib becomes meaningless since linux-3.18-rc1, but aufs
-	 * does not treat it as an error, just warning.
-	 * this is a tiny guard for the user operation.
-	 */
-	if (val & AuBrAttr_UNPIN) {
-		bad |= AuBrAttr_UNPIN;
-		val &= ~AuBrAttr_UNPIN;
-	}
-
-	if (unlikely(bad)) {
-		sz = au_do_optstr_br_attr(&attr, bad);
-		AuDebugOn(!sz);
-		pr_warn("ignored branch attribute %s\n", attr.a);
-	}
-
-out:
-	return val;
-}
-
-void au_optstr_br_perm(au_br_perm_str_t *str, int perm)
-{
-	au_br_perm_str_t attr;
-	const char *p;
-	char *q;
-	int sz;
-
-	q = str->a;
-	p = au_optstr(&perm, brperm);
-	AuDebugOn(!p || !*p);
-	sz = strlen(p);
-	memcpy(q, p, sz + 1);
-	q += sz;
-
-	sz = au_do_optstr_br_attr(&attr, perm);
-	if (sz) {
-		*q++ = '+';
-		memcpy(q, attr.a, sz + 1);
-	}
-
-	AuDebugOn(strlen(str->a) >= sizeof(str->a));
-}
-
-/* ---------------------------------------------------------------------- */
-
-static match_table_t udbalevel = {
-	{AuOpt_UDBA_REVAL, "reval"},
-	{AuOpt_UDBA_NONE, "none"},
-#ifdef CONFIG_AUFS_HNOTIFY
-	{AuOpt_UDBA_HNOTIFY, "notify"}, /* abstraction */
-#ifdef CONFIG_AUFS_HFSNOTIFY
-	{AuOpt_UDBA_HNOTIFY, "fsnotify"},
-#endif
-#endif
-	{-1, NULL}
-};
-
-static int noinline_for_stack udba_val(char *str)
-{
-	substring_t args[MAX_OPT_ARGS];
-
-	return match_token(str, udbalevel, args);
-}
-
-const char *au_optstr_udba(int udba)
-{
-	return au_parser_pattern(udba, udbalevel);
-}
-
-/* ---------------------------------------------------------------------- */
-
-static match_table_t au_wbr_create_policy = {
-	{AuWbrCreate_TDP, "tdp"},
-	{AuWbrCreate_TDP, "top-down-parent"},
-	{AuWbrCreate_RR, "rr"},
-	{AuWbrCreate_RR, "round-robin"},
-	{AuWbrCreate_MFS, "mfs"},
-	{AuWbrCreate_MFS, "most-free-space"},
-	{AuWbrCreate_MFSV, "mfs:%d"},
-	{AuWbrCreate_MFSV, "most-free-space:%d"},
-
-	{AuWbrCreate_MFSRR, "mfsrr:%d"},
-	{AuWbrCreate_MFSRRV, "mfsrr:%d:%d"},
-	{AuWbrCreate_PMFS, "pmfs"},
-	{AuWbrCreate_PMFSV, "pmfs:%d"},
-	{AuWbrCreate_PMFSRR, "pmfsrr:%d"},
-	{AuWbrCreate_PMFSRRV, "pmfsrr:%d:%d"},
-
-	{-1, NULL}
-};
-
-/*
- * cf. linux/lib/parser.c and cmdline.c
- * gave up calling memparse() since it uses simple_strtoull() instead of
- * kstrto...().
- */
-static int noinline_for_stack
-au_match_ull(substring_t *s, unsigned long long *result)
-{
-	int err;
-	unsigned int len;
-	char a[32];
-
-	err = -ERANGE;
-	len = s->to - s->from;
-	if (len + 1 <= sizeof(a)) {
-		memcpy(a, s->from, len);
-		a[len] = '\0';
-		err = kstrtoull(a, 0, result);
-	}
-	return err;
-}
-
-static int au_wbr_mfs_wmark(substring_t *arg, char *str,
-			    struct au_opt_wbr_create *create)
-{
-	int err;
-	unsigned long long ull;
-
-	err = 0;
-	if (!au_match_ull(arg, &ull))
-		create->mfsrr_watermark = ull;
-	else {
-		pr_err("bad integer in %s\n", str);
-		err = -EINVAL;
-	}
-
-	return err;
-}
-
-static int au_wbr_mfs_sec(substring_t *arg, char *str,
-			  struct au_opt_wbr_create *create)
-{
-	int n, err;
-
-	err = 0;
-	if (!match_int(arg, &n) && 0 <= n && n <= AUFS_MFS_MAX_SEC)
-		create->mfs_second = n;
-	else {
-		pr_err("bad integer in %s\n", str);
-		err = -EINVAL;
-	}
-
-	return err;
-}
-
-static int noinline_for_stack
-au_wbr_create_val(char *str, struct au_opt_wbr_create *create)
-{
-	int err, e;
-	substring_t args[MAX_OPT_ARGS];
-
-	err = match_token(str, au_wbr_create_policy, args);
-	create->wbr_create = err;
-	switch (err) {
-	case AuWbrCreate_MFSRRV:
-	case AuWbrCreate_PMFSRRV:
-		e = au_wbr_mfs_wmark(&args[0], str, create);
-		if (!e)
-			e = au_wbr_mfs_sec(&args[1], str, create);
-		if (unlikely(e))
-			err = e;
-		break;
-	case AuWbrCreate_MFSRR:
-	case AuWbrCreate_PMFSRR:
-		e = au_wbr_mfs_wmark(&args[0], str, create);
-		if (unlikely(e)) {
-			err = e;
-			break;
-		}
-		/*FALLTHROUGH*/
-	case AuWbrCreate_MFS:
-	case AuWbrCreate_PMFS:
-		create->mfs_second = AUFS_MFS_DEF_SEC;
-		break;
-	case AuWbrCreate_MFSV:
-	case AuWbrCreate_PMFSV:
-		e = au_wbr_mfs_sec(&args[0], str, create);
-		if (unlikely(e))
-			err = e;
-		break;
-	}
-
-	return err;
-}
-
-const char *au_optstr_wbr_create(int wbr_create)
-{
-	return au_parser_pattern(wbr_create, au_wbr_create_policy);
-}
-
-static match_table_t au_wbr_copyup_policy = {
-	{AuWbrCopyup_TDP, "tdp"},
-	{AuWbrCopyup_TDP, "top-down-parent"},
-	{AuWbrCopyup_BUP, "bup"},
-	{AuWbrCopyup_BUP, "bottom-up-parent"},
-	{AuWbrCopyup_BU, "bu"},
-	{AuWbrCopyup_BU, "bottom-up"},
-	{-1, NULL}
-};
-
-static int noinline_for_stack au_wbr_copyup_val(char *str)
-{
-	substring_t args[MAX_OPT_ARGS];
-
-	return match_token(str, au_wbr_copyup_policy, args);
-}
-
-const char *au_optstr_wbr_copyup(int wbr_copyup)
-{
-	return au_parser_pattern(wbr_copyup, au_wbr_copyup_policy);
-}
-
-/* ---------------------------------------------------------------------- */
-
-static const int lkup_dirflags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
-
-static void dump_opts(struct au_opts *opts)
-{
-#ifdef CONFIG_AUFS_DEBUG
-	/* reduce stack space */
-	union {
-		struct au_opt_add *add;
-		struct au_opt_del *del;
-		struct au_opt_mod *mod;
-		struct au_opt_xino *xino;
-		struct au_opt_xino_itrunc *xino_itrunc;
-		struct au_opt_wbr_create *create;
-	} u;
-	struct au_opt *opt;
-
-	opt = opts->opt;
-	while (opt->type != Opt_tail) {
-		switch (opt->type) {
-		case Opt_add:
-			u.add = &opt->add;
-			AuDbg("add {b%d, %s, 0x%x, %p}\n",
-				  u.add->bindex, u.add->pathname, u.add->perm,
-				  u.add->path.dentry);
-			break;
-		case Opt_del:
-		case Opt_idel:
-			u.del = &opt->del;
-			AuDbg("del {%s, %p}\n",
-			      u.del->pathname, u.del->h_path.dentry);
-			break;
-		case Opt_mod:
-		case Opt_imod:
-			u.mod = &opt->mod;
-			AuDbg("mod {%s, 0x%x, %p}\n",
-				  u.mod->path, u.mod->perm, u.mod->h_root);
-			break;
-		case Opt_append:
-			u.add = &opt->add;
-			AuDbg("append {b%d, %s, 0x%x, %p}\n",
-				  u.add->bindex, u.add->pathname, u.add->perm,
-				  u.add->path.dentry);
-			break;
-		case Opt_prepend:
-			u.add = &opt->add;
-			AuDbg("prepend {b%d, %s, 0x%x, %p}\n",
-				  u.add->bindex, u.add->pathname, u.add->perm,
-				  u.add->path.dentry);
-			break;
-		case Opt_dirwh:
-			AuDbg("dirwh %d\n", opt->dirwh);
-			break;
-		case Opt_rdcache:
-			AuDbg("rdcache %d\n", opt->rdcache);
-			break;
-		case Opt_rdblk:
-			AuDbg("rdblk %u\n", opt->rdblk);
-			break;
-		case Opt_rdblk_def:
-			AuDbg("rdblk_def\n");
-			break;
-		case Opt_rdhash:
-			AuDbg("rdhash %u\n", opt->rdhash);
-			break;
-		case Opt_rdhash_def:
-			AuDbg("rdhash_def\n");
-			break;
-		case Opt_xino:
-			u.xino = &opt->xino;
-			AuDbg("xino {%s %pD}\n", u.xino->path, u.xino->file);
-			break;
-		case Opt_trunc_xino:
-			AuLabel(trunc_xino);
-			break;
-		case Opt_notrunc_xino:
-			AuLabel(notrunc_xino);
-			break;
-		case Opt_trunc_xino_path:
-		case Opt_itrunc_xino:
-			u.xino_itrunc = &opt->xino_itrunc;
-			AuDbg("trunc_xino %d\n", u.xino_itrunc->bindex);
-			break;
-		case Opt_noxino:
-			AuLabel(noxino);
-			break;
-		case Opt_trunc_xib:
-			AuLabel(trunc_xib);
-			break;
-		case Opt_notrunc_xib:
-			AuLabel(notrunc_xib);
-			break;
-		case Opt_shwh:
-			AuLabel(shwh);
-			break;
-		case Opt_noshwh:
-			AuLabel(noshwh);
-			break;
-		case Opt_dirperm1:
-			AuLabel(dirperm1);
-			break;
-		case Opt_nodirperm1:
-			AuLabel(nodirperm1);
-			break;
-		case Opt_plink:
-			AuLabel(plink);
-			break;
-		case Opt_noplink:
-			AuLabel(noplink);
-			break;
-		case Opt_list_plink:
-			AuLabel(list_plink);
-			break;
-		case Opt_udba:
-			AuDbg("udba %d, %s\n",
-				  opt->udba, au_optstr_udba(opt->udba));
-			break;
-		case Opt_dio:
-			AuLabel(dio);
-			break;
-		case Opt_nodio:
-			AuLabel(nodio);
-			break;
-		case Opt_diropq_a:
-			AuLabel(diropq_a);
-			break;
-		case Opt_diropq_w:
-			AuLabel(diropq_w);
-			break;
-		case Opt_warn_perm:
-			AuLabel(warn_perm);
-			break;
-		case Opt_nowarn_perm:
-			AuLabel(nowarn_perm);
-			break;
-		case Opt_verbose:
-			AuLabel(verbose);
-			break;
-		case Opt_noverbose:
-			AuLabel(noverbose);
-			break;
-		case Opt_sum:
-			AuLabel(sum);
-			break;
-		case Opt_nosum:
-			AuLabel(nosum);
-			break;
-		case Opt_wsum:
-			AuLabel(wsum);
-			break;
-		case Opt_wbr_create:
-			u.create = &opt->wbr_create;
-			AuDbg("create %d, %s\n", u.create->wbr_create,
-				  au_optstr_wbr_create(u.create->wbr_create));
-			switch (u.create->wbr_create) {
-			case AuWbrCreate_MFSV:
-			case AuWbrCreate_PMFSV:
-				AuDbg("%d sec\n", u.create->mfs_second);
-				break;
-			case AuWbrCreate_MFSRR:
-				AuDbg("%llu watermark\n",
-					  u.create->mfsrr_watermark);
-				break;
-			case AuWbrCreate_MFSRRV:
-			case AuWbrCreate_PMFSRRV:
-				AuDbg("%llu watermark, %d sec\n",
-					  u.create->mfsrr_watermark,
-					  u.create->mfs_second);
-				break;
-			}
-			break;
-		case Opt_wbr_copyup:
-			AuDbg("copyup %d, %s\n", opt->wbr_copyup,
-				  au_optstr_wbr_copyup(opt->wbr_copyup));
-			break;
-		case Opt_fhsm_sec:
-			AuDbg("fhsm_sec %u\n", opt->fhsm_second);
-			break;
-		case Opt_acl:
-			AuLabel(acl);
-			break;
-		case Opt_noacl:
-			AuLabel(noacl);
-			break;
-		default:
-			BUG();
-		}
-		opt++;
-	}
-#endif
-}
-
-void au_opts_free(struct au_opts *opts)
-{
-	struct au_opt *opt;
-
-	opt = opts->opt;
-	while (opt->type != Opt_tail) {
-		switch (opt->type) {
-		case Opt_add:
-		case Opt_append:
-		case Opt_prepend:
-			path_put(&opt->add.path);
-			break;
-		case Opt_del:
-		case Opt_idel:
-			path_put(&opt->del.h_path);
-			break;
-		case Opt_mod:
-		case Opt_imod:
-			dput(opt->mod.h_root);
-			break;
-		case Opt_xino:
-			fput(opt->xino.file);
-			break;
-		}
-		opt++;
-	}
-}
-
-static int opt_add(struct au_opt *opt, char *opt_str, unsigned long sb_flags,
-		   aufs_bindex_t bindex)
-{
-	int err;
-	struct au_opt_add *add = &opt->add;
-	char *p;
-
-	add->bindex = bindex;
-	add->perm = AuBrPerm_RO;
-	add->pathname = opt_str;
-	p = strchr(opt_str, '=');
-	if (p) {
-		*p++ = 0;
-		if (*p)
-			add->perm = br_perm_val(p);
-	}
-
-	err = vfsub_kern_path(add->pathname, lkup_dirflags, &add->path);
-	if (!err) {
-		if (!p) {
-			add->perm = AuBrPerm_RO;
-			if (au_test_fs_rr(add->path.dentry->d_sb))
-				add->perm = AuBrPerm_RR;
-			else if (!bindex && !(sb_flags & MS_RDONLY))
-				add->perm = AuBrPerm_RW;
-		}
-		opt->type = Opt_add;
-		goto out;
-	}
-	pr_err("lookup failed %s (%d)\n", add->pathname, err);
-	err = -EINVAL;
-
-out:
-	return err;
-}
-
-static int au_opts_parse_del(struct au_opt_del *del, substring_t args[])
-{
-	int err;
-
-	del->pathname = args[0].from;
-	AuDbg("del path %s\n", del->pathname);
-
-	err = vfsub_kern_path(del->pathname, lkup_dirflags, &del->h_path);
-	if (unlikely(err))
-		pr_err("lookup failed %s (%d)\n", del->pathname, err);
-
-	return err;
-}
-
-#if 0 /* reserved for future use */
-static int au_opts_parse_idel(struct super_block *sb, aufs_bindex_t bindex,
-			      struct au_opt_del *del, substring_t args[])
-{
-	int err;
-	struct dentry *root;
-
-	err = -EINVAL;
-	root = sb->s_root;
-	aufs_read_lock(root, AuLock_FLUSH);
-	if (bindex < 0 || au_sbend(sb) < bindex) {
-		pr_err("out of bounds, %d\n", bindex);
-		goto out;
-	}
-
-	err = 0;
-	del->h_path.dentry = dget(au_h_dptr(root, bindex));
-	del->h_path.mnt = mntget(au_sbr_mnt(sb, bindex));
-
-out:
-	aufs_read_unlock(root, !AuLock_IR);
-	return err;
-}
-#endif
-
-static int noinline_for_stack
-au_opts_parse_mod(struct au_opt_mod *mod, substring_t args[])
-{
-	int err;
-	struct path path;
-	char *p;
-
-	err = -EINVAL;
-	mod->path = args[0].from;
-	p = strchr(mod->path, '=');
-	if (unlikely(!p)) {
-		pr_err("no permssion %s\n", args[0].from);
-		goto out;
-	}
-
-	*p++ = 0;
-	err = vfsub_kern_path(mod->path, lkup_dirflags, &path);
-	if (unlikely(err)) {
-		pr_err("lookup failed %s (%d)\n", mod->path, err);
-		goto out;
-	}
-
-	mod->perm = br_perm_val(p);
-	AuDbg("mod path %s, perm 0x%x, %s\n", mod->path, mod->perm, p);
-	mod->h_root = dget(path.dentry);
-	path_put(&path);
-
-out:
-	return err;
-}
-
-#if 0 /* reserved for future use */
-static int au_opts_parse_imod(struct super_block *sb, aufs_bindex_t bindex,
-			      struct au_opt_mod *mod, substring_t args[])
-{
-	int err;
-	struct dentry *root;
-
-	err = -EINVAL;
-	root = sb->s_root;
-	aufs_read_lock(root, AuLock_FLUSH);
-	if (bindex < 0 || au_sbend(sb) < bindex) {
-		pr_err("out of bounds, %d\n", bindex);
-		goto out;
-	}
-
-	err = 0;
-	mod->perm = br_perm_val(args[1].from);
-	AuDbg("mod path %s, perm 0x%x, %s\n",
-	      mod->path, mod->perm, args[1].from);
-	mod->h_root = dget(au_h_dptr(root, bindex));
-
-out:
-	aufs_read_unlock(root, !AuLock_IR);
-	return err;
-}
-#endif
-
-static int au_opts_parse_xino(struct super_block *sb, struct au_opt_xino *xino,
-			      substring_t args[])
-{
-	int err;
-	struct file *file;
-
-	file = au_xino_create(sb, args[0].from, /*silent*/0);
-	err = PTR_ERR(file);
-	if (IS_ERR(file))
-		goto out;
-
-	err = -EINVAL;
-	if (unlikely(file->f_path.dentry->d_sb == sb)) {
-		fput(file);
-		pr_err("%s must be outside\n", args[0].from);
-		goto out;
-	}
-
-	err = 0;
-	xino->file = file;
-	xino->path = args[0].from;
-
-out:
-	return err;
-}
-
-static int noinline_for_stack
-au_opts_parse_xino_itrunc_path(struct super_block *sb,
-			       struct au_opt_xino_itrunc *xino_itrunc,
-			       substring_t args[])
-{
-	int err;
-	aufs_bindex_t bend, bindex;
-	struct path path;
-	struct dentry *root;
-
-	err = vfsub_kern_path(args[0].from, lkup_dirflags, &path);
-	if (unlikely(err)) {
-		pr_err("lookup failed %s (%d)\n", args[0].from, err);
-		goto out;
-	}
-
-	xino_itrunc->bindex = -1;
-	root = sb->s_root;
-	aufs_read_lock(root, AuLock_FLUSH);
-	bend = au_sbend(sb);
-	for (bindex = 0; bindex <= bend; bindex++) {
-		if (au_h_dptr(root, bindex) == path.dentry) {
-			xino_itrunc->bindex = bindex;
-			break;
-		}
-	}
-	aufs_read_unlock(root, !AuLock_IR);
-	path_put(&path);
-
-	if (unlikely(xino_itrunc->bindex < 0)) {
-		pr_err("no such branch %s\n", args[0].from);
-		err = -EINVAL;
-	}
-
-out:
-	return err;
-}
-
-/* called without aufs lock */
-int au_opts_parse(struct super_block *sb, char *str, struct au_opts *opts)
-{
-	int err, n, token;
-	aufs_bindex_t bindex;
-	unsigned char skipped;
-	struct dentry *root;
-	struct au_opt *opt, *opt_tail;
-	char *opt_str;
-	/* reduce the stack space */
-	union {
-		struct au_opt_xino_itrunc *xino_itrunc;
-		struct au_opt_wbr_create *create;
-	} u;
-	struct {
-		substring_t args[MAX_OPT_ARGS];
-	} *a;
-
-	err = -ENOMEM;
-	a = kmalloc(sizeof(*a), GFP_NOFS);
-	if (unlikely(!a))
-		goto out;
-
-	root = sb->s_root;
-	err = 0;
-	bindex = 0;
-	opt = opts->opt;
-	opt_tail = opt + opts->max_opt - 1;
-	opt->type = Opt_tail;
-	while (!err && (opt_str = strsep(&str, ",")) && *opt_str) {
-		err = -EINVAL;
-		skipped = 0;
-		token = match_token(opt_str, options, a->args);
-		switch (token) {
-		case Opt_br:
-			err = 0;
-			while (!err && (opt_str = strsep(&a->args[0].from, ":"))
-			       && *opt_str) {
-				err = opt_add(opt, opt_str, opts->sb_flags,
-					      bindex++);
-				if (unlikely(!err && ++opt > opt_tail)) {
-					err = -E2BIG;
-					break;
-				}
-				opt->type = Opt_tail;
-				skipped = 1;
-			}
-			break;
-		case Opt_add:
-			if (unlikely(match_int(&a->args[0], &n))) {
-				pr_err("bad integer in %s\n", opt_str);
-				break;
-			}
-			bindex = n;
-			err = opt_add(opt, a->args[1].from, opts->sb_flags,
-				      bindex);
-			if (!err)
-				opt->type = token;
-			break;
-		case Opt_append:
-			err = opt_add(opt, a->args[0].from, opts->sb_flags,
-				      /*dummy bindex*/1);
-			if (!err)
-				opt->type = token;
-			break;
-		case Opt_prepend:
-			err = opt_add(opt, a->args[0].from, opts->sb_flags,
-				      /*bindex*/0);
-			if (!err)
-				opt->type = token;
-			break;
-		case Opt_del:
-			err = au_opts_parse_del(&opt->del, a->args);
-			if (!err)
-				opt->type = token;
-			break;
-#if 0 /* reserved for future use */
-		case Opt_idel:
-			del->pathname = "(indexed)";
-			if (unlikely(match_int(&args[0], &n))) {
-				pr_err("bad integer in %s\n", opt_str);
-				break;
-			}
-			err = au_opts_parse_idel(sb, n, &opt->del, a->args);
-			if (!err)
-				opt->type = token;
-			break;
-#endif
-		case Opt_mod:
-			err = au_opts_parse_mod(&opt->mod, a->args);
-			if (!err)
-				opt->type = token;
-			break;
-#ifdef IMOD /* reserved for future use */
-		case Opt_imod:
-			u.mod->path = "(indexed)";
-			if (unlikely(match_int(&a->args[0], &n))) {
-				pr_err("bad integer in %s\n", opt_str);
-				break;
-			}
-			err = au_opts_parse_imod(sb, n, &opt->mod, a->args);
-			if (!err)
-				opt->type = token;
-			break;
-#endif
-		case Opt_xino:
-			err = au_opts_parse_xino(sb, &opt->xino, a->args);
-			if (!err)
-				opt->type = token;
-			break;
-
-		case Opt_trunc_xino_path:
-			err = au_opts_parse_xino_itrunc_path
-				(sb, &opt->xino_itrunc, a->args);
-			if (!err)
-				opt->type = token;
-			break;
-
-		case Opt_itrunc_xino:
-			u.xino_itrunc = &opt->xino_itrunc;
-			if (unlikely(match_int(&a->args[0], &n))) {
-				pr_err("bad integer in %s\n", opt_str);
-				break;
-			}
-			u.xino_itrunc->bindex = n;
-			aufs_read_lock(root, AuLock_FLUSH);
-			if (n < 0 || au_sbend(sb) < n) {
-				pr_err("out of bounds, %d\n", n);
-				aufs_read_unlock(root, !AuLock_IR);
-				break;
-			}
-			aufs_read_unlock(root, !AuLock_IR);
-			err = 0;
-			opt->type = token;
-			break;
-
-		case Opt_dirwh:
-			if (unlikely(match_int(&a->args[0], &opt->dirwh)))
-				break;
-			err = 0;
-			opt->type = token;
-			break;
-
-		case Opt_rdcache:
-			if (unlikely(match_int(&a->args[0], &n))) {
-				pr_err("bad integer in %s\n", opt_str);
-				break;
-			}
-			if (unlikely(n > AUFS_RDCACHE_MAX)) {
-				pr_err("rdcache must be smaller than %d\n",
-				       AUFS_RDCACHE_MAX);
-				break;
-			}
-			opt->rdcache = n;
-			err = 0;
-			opt->type = token;
-			break;
-		case Opt_rdblk:
-			if (unlikely(match_int(&a->args[0], &n)
-				     || n < 0
-				     || n > KMALLOC_MAX_SIZE)) {
-				pr_err("bad integer in %s\n", opt_str);
-				break;
-			}
-			if (unlikely(n && n < NAME_MAX)) {
-				pr_err("rdblk must be larger than %d\n",
-				       NAME_MAX);
-				break;
-			}
-			opt->rdblk = n;
-			err = 0;
-			opt->type = token;
-			break;
-		case Opt_rdhash:
-			if (unlikely(match_int(&a->args[0], &n)
-				     || n < 0
-				     || n * sizeof(struct hlist_head)
-				     > KMALLOC_MAX_SIZE)) {
-				pr_err("bad integer in %s\n", opt_str);
-				break;
-			}
-			opt->rdhash = n;
-			err = 0;
-			opt->type = token;
-			break;
-
-		case Opt_trunc_xino:
-		case Opt_notrunc_xino:
-		case Opt_noxino:
-		case Opt_trunc_xib:
-		case Opt_notrunc_xib:
-		case Opt_shwh:
-		case Opt_noshwh:
-		case Opt_dirperm1:
-		case Opt_nodirperm1:
-		case Opt_plink:
-		case Opt_noplink:
-		case Opt_list_plink:
-		case Opt_dio:
-		case Opt_nodio:
-		case Opt_diropq_a:
-		case Opt_diropq_w:
-		case Opt_warn_perm:
-		case Opt_nowarn_perm:
-		case Opt_verbose:
-		case Opt_noverbose:
-		case Opt_sum:
-		case Opt_nosum:
-		case Opt_wsum:
-		case Opt_rdblk_def:
-		case Opt_rdhash_def:
-		case Opt_acl:
-		case Opt_noacl:
-			err = 0;
-			opt->type = token;
-			break;
-
-		case Opt_udba:
-			opt->udba = udba_val(a->args[0].from);
-			if (opt->udba >= 0) {
-				err = 0;
-				opt->type = token;
-			} else
-				pr_err("wrong value, %s\n", opt_str);
-			break;
-
-		case Opt_wbr_create:
-			u.create = &opt->wbr_create;
-			u.create->wbr_create
-				= au_wbr_create_val(a->args[0].from, u.create);
-			if (u.create->wbr_create >= 0) {
-				err = 0;
-				opt->type = token;
-			} else
-				pr_err("wrong value, %s\n", opt_str);
-			break;
-		case Opt_wbr_copyup:
-			opt->wbr_copyup = au_wbr_copyup_val(a->args[0].from);
-			if (opt->wbr_copyup >= 0) {
-				err = 0;
-				opt->type = token;
-			} else
-				pr_err("wrong value, %s\n", opt_str);
-			break;
-
-		case Opt_fhsm_sec:
-			if (unlikely(match_int(&a->args[0], &n)
-				     || n < 0)) {
-				pr_err("bad integer in %s\n", opt_str);
-				break;
-			}
-			if (sysaufs_brs) {
-				opt->fhsm_second = n;
-				opt->type = token;
-			} else
-				pr_warn("ignored %s\n", opt_str);
-			err = 0;
-			break;
-
-		case Opt_ignore:
-			pr_warn("ignored %s\n", opt_str);
-			/*FALLTHROUGH*/
-		case Opt_ignore_silent:
-			skipped = 1;
-			err = 0;
-			break;
-		case Opt_err:
-			pr_err("unknown option %s\n", opt_str);
-			break;
-		}
-
-		if (!err && !skipped) {
-			if (unlikely(++opt > opt_tail)) {
-				err = -E2BIG;
-				opt--;
-				opt->type = Opt_tail;
-				break;
-			}
-			opt->type = Opt_tail;
-		}
-	}
-
-	kfree(a);
-	dump_opts(opts);
-	if (unlikely(err))
-		au_opts_free(opts);
-
-out:
-	return err;
-}
-
-static int au_opt_wbr_create(struct super_block *sb,
-			     struct au_opt_wbr_create *create)
-{
-	int err;
-	struct au_sbinfo *sbinfo;
-
-	SiMustWriteLock(sb);
-
-	err = 1; /* handled */
-	sbinfo = au_sbi(sb);
-	if (sbinfo->si_wbr_create_ops->fin) {
-		err = sbinfo->si_wbr_create_ops->fin(sb);
-		if (!err)
-			err = 1;
-	}
-
-	sbinfo->si_wbr_create = create->wbr_create;
-	sbinfo->si_wbr_create_ops = au_wbr_create_ops + create->wbr_create;
-	switch (create->wbr_create) {
-	case AuWbrCreate_MFSRRV:
-	case AuWbrCreate_MFSRR:
-	case AuWbrCreate_PMFSRR:
-	case AuWbrCreate_PMFSRRV:
-		sbinfo->si_wbr_mfs.mfsrr_watermark = create->mfsrr_watermark;
-		/*FALLTHROUGH*/
-	case AuWbrCreate_MFS:
-	case AuWbrCreate_MFSV:
-	case AuWbrCreate_PMFS:
-	case AuWbrCreate_PMFSV:
-		sbinfo->si_wbr_mfs.mfs_expire
-			= msecs_to_jiffies(create->mfs_second * MSEC_PER_SEC);
-		break;
-	}
-
-	if (sbinfo->si_wbr_create_ops->init)
-		sbinfo->si_wbr_create_ops->init(sb); /* ignore */
-
-	return err;
-}
-
-/*
- * returns,
- * plus: processed without an error
- * zero: unprocessed
- */
-static int au_opt_simple(struct super_block *sb, struct au_opt *opt,
-			 struct au_opts *opts)
-{
-	int err;
-	struct au_sbinfo *sbinfo;
-
-	SiMustWriteLock(sb);
-
-	err = 1; /* handled */
-	sbinfo = au_sbi(sb);
-	switch (opt->type) {
-	case Opt_udba:
-		sbinfo->si_mntflags &= ~AuOptMask_UDBA;
-		sbinfo->si_mntflags |= opt->udba;
-		opts->given_udba |= opt->udba;
-		break;
-
-	case Opt_plink:
-		au_opt_set(sbinfo->si_mntflags, PLINK);
-		break;
-	case Opt_noplink:
-		if (au_opt_test(sbinfo->si_mntflags, PLINK))
-			au_plink_put(sb, /*verbose*/1);
-		au_opt_clr(sbinfo->si_mntflags, PLINK);
-		break;
-	case Opt_list_plink:
-		if (au_opt_test(sbinfo->si_mntflags, PLINK))
-			au_plink_list(sb);
-		break;
-
-	case Opt_dio:
-		au_opt_set(sbinfo->si_mntflags, DIO);
-		au_fset_opts(opts->flags, REFRESH_DYAOP);
-		break;
-	case Opt_nodio:
-		au_opt_clr(sbinfo->si_mntflags, DIO);
-		au_fset_opts(opts->flags, REFRESH_DYAOP);
-		break;
-
-	case Opt_fhsm_sec:
-		au_fhsm_set(sbinfo, opt->fhsm_second);
-		break;
-
-	case Opt_diropq_a:
-		au_opt_set(sbinfo->si_mntflags, ALWAYS_DIROPQ);
-		break;
-	case Opt_diropq_w:
-		au_opt_clr(sbinfo->si_mntflags, ALWAYS_DIROPQ);
-		break;
-
-	case Opt_warn_perm:
-		au_opt_set(sbinfo->si_mntflags, WARN_PERM);
-		break;
-	case Opt_nowarn_perm:
-		au_opt_clr(sbinfo->si_mntflags, WARN_PERM);
-		break;
-
-	case Opt_verbose:
-		au_opt_set(sbinfo->si_mntflags, VERBOSE);
-		break;
-	case Opt_noverbose:
-		au_opt_clr(sbinfo->si_mntflags, VERBOSE);
-		break;
-
-	case Opt_sum:
-		au_opt_set(sbinfo->si_mntflags, SUM);
-		break;
-	case Opt_wsum:
-		au_opt_clr(sbinfo->si_mntflags, SUM);
-		au_opt_set(sbinfo->si_mntflags, SUM_W);
-	case Opt_nosum:
-		au_opt_clr(sbinfo->si_mntflags, SUM);
-		au_opt_clr(sbinfo->si_mntflags, SUM_W);
-		break;
-
-	case Opt_wbr_create:
-		err = au_opt_wbr_create(sb, &opt->wbr_create);
-		break;
-	case Opt_wbr_copyup:
-		sbinfo->si_wbr_copyup = opt->wbr_copyup;
-		sbinfo->si_wbr_copyup_ops = au_wbr_copyup_ops + opt->wbr_copyup;
-		break;
-
-	case Opt_dirwh:
-		sbinfo->si_dirwh = opt->dirwh;
-		break;
-
-	case Opt_rdcache:
-		sbinfo->si_rdcache
-			= msecs_to_jiffies(opt->rdcache * MSEC_PER_SEC);
-		break;
-	case Opt_rdblk:
-		sbinfo->si_rdblk = opt->rdblk;
-		break;
-	case Opt_rdblk_def:
-		sbinfo->si_rdblk = AUFS_RDBLK_DEF;
-		break;
-	case Opt_rdhash:
-		sbinfo->si_rdhash = opt->rdhash;
-		break;
-	case Opt_rdhash_def:
-		sbinfo->si_rdhash = AUFS_RDHASH_DEF;
-		break;
-
-	case Opt_shwh:
-		au_opt_set(sbinfo->si_mntflags, SHWH);
-		break;
-	case Opt_noshwh:
-		au_opt_clr(sbinfo->si_mntflags, SHWH);
-		break;
-
-	case Opt_dirperm1:
-		au_opt_set(sbinfo->si_mntflags, DIRPERM1);
-		break;
-	case Opt_nodirperm1:
-		au_opt_clr(sbinfo->si_mntflags, DIRPERM1);
-		break;
-
-	case Opt_trunc_xino:
-		au_opt_set(sbinfo->si_mntflags, TRUNC_XINO);
-		break;
-	case Opt_notrunc_xino:
-		au_opt_clr(sbinfo->si_mntflags, TRUNC_XINO);
-		break;
-
-	case Opt_trunc_xino_path:
-	case Opt_itrunc_xino:
-		err = au_xino_trunc(sb, opt->xino_itrunc.bindex);
-		if (!err)
-			err = 1;
-		break;
-
-	case Opt_trunc_xib:
-		au_fset_opts(opts->flags, TRUNC_XIB);
-		break;
-	case Opt_notrunc_xib:
-		au_fclr_opts(opts->flags, TRUNC_XIB);
-		break;
-
-	case Opt_acl:
-		sb->s_flags |= MS_POSIXACL;
-		break;
-	case Opt_noacl:
-		sb->s_flags &= ~MS_POSIXACL;
-		break;
-
-	default:
-		err = 0;
-		break;
-	}
-
-	return err;
-}
-
-/*
- * returns tri-state.
- * plus: processed without an error
- * zero: unprocessed
- * minus: error
- */
-static int au_opt_br(struct super_block *sb, struct au_opt *opt,
-		     struct au_opts *opts)
-{
-	int err, do_refresh;
-
-	err = 0;
-	switch (opt->type) {
-	case Opt_append:
-		opt->add.bindex = au_sbend(sb) + 1;
-		if (opt->add.bindex < 0)
-			opt->add.bindex = 0;
-		goto add;
-	case Opt_prepend:
-		opt->add.bindex = 0;
-	add: /* indented label */
-	case Opt_add:
-		err = au_br_add(sb, &opt->add,
-				au_ftest_opts(opts->flags, REMOUNT));
-		if (!err) {
-			err = 1;
-			au_fset_opts(opts->flags, REFRESH);
-		}
-		break;
-
-	case Opt_del:
-	case Opt_idel:
-		err = au_br_del(sb, &opt->del,
-				au_ftest_opts(opts->flags, REMOUNT));
-		if (!err) {
-			err = 1;
-			au_fset_opts(opts->flags, TRUNC_XIB);
-			au_fset_opts(opts->flags, REFRESH);
-		}
-		break;
-
-	case Opt_mod:
-	case Opt_imod:
-		err = au_br_mod(sb, &opt->mod,
-				au_ftest_opts(opts->flags, REMOUNT),
-				&do_refresh);
-		if (!err) {
-			err = 1;
-			if (do_refresh)
-				au_fset_opts(opts->flags, REFRESH);
-		}
-		break;
-	}
-
-	return err;
-}
-
-static int au_opt_xino(struct super_block *sb, struct au_opt *opt,
-		       struct au_opt_xino **opt_xino,
-		       struct au_opts *opts)
-{
-	int err;
-	aufs_bindex_t bend, bindex;
-	struct dentry *root, *parent, *h_root;
-
-	err = 0;
-	switch (opt->type) {
-	case Opt_xino:
-		err = au_xino_set(sb, &opt->xino,
-				  !!au_ftest_opts(opts->flags, REMOUNT));
-		if (unlikely(err))
-			break;
-
-		*opt_xino = &opt->xino;
-		au_xino_brid_set(sb, -1);
-
-		/* safe d_parent access */
-		parent = opt->xino.file->f_path.dentry->d_parent;
-		root = sb->s_root;
-		bend = au_sbend(sb);
-		for (bindex = 0; bindex <= bend; bindex++) {
-			h_root = au_h_dptr(root, bindex);
-			if (h_root == parent) {
-				au_xino_brid_set(sb, au_sbr_id(sb, bindex));
-				break;
-			}
-		}
-		break;
-
-	case Opt_noxino:
-		au_xino_clr(sb);
-		au_xino_brid_set(sb, -1);
-		*opt_xino = (void *)-1;
-		break;
-	}
-
-	return err;
-}
-
-int au_opts_verify(struct super_block *sb, unsigned long sb_flags,
-		   unsigned int pending)
-{
-	int err, fhsm;
-	aufs_bindex_t bindex, bend;
-	unsigned char do_plink, skip, do_free, can_no_dreval;
-	struct au_branch *br;
-	struct au_wbr *wbr;
-	struct dentry *root, *dentry;
-	struct inode *dir, *h_dir;
-	struct au_sbinfo *sbinfo;
-	struct au_hinode *hdir;
-
-	SiMustAnyLock(sb);
-
-	sbinfo = au_sbi(sb);
-	AuDebugOn(!(sbinfo->si_mntflags & AuOptMask_UDBA));
-
-	if (!(sb_flags & MS_RDONLY)) {
-		if (unlikely(!au_br_writable(au_sbr_perm(sb, 0))))
-			pr_warn("first branch should be rw\n");
-		if (unlikely(au_opt_test(sbinfo->si_mntflags, SHWH)))
-			pr_warn("shwh should be used with ro\n");
-	}
-
-	if (au_opt_test((sbinfo->si_mntflags | pending), UDBA_HNOTIFY)
-	    && !au_opt_test(sbinfo->si_mntflags, XINO))
-		pr_warn("udba=*notify requires xino\n");
-
-	if (au_opt_test(sbinfo->si_mntflags, DIRPERM1))
-		pr_warn("dirperm1 breaks the protection"
-			" by the permission bits on the lower branch\n");
-
-	err = 0;
-	fhsm = 0;
-	root = sb->s_root;
-	dir = d_inode(root);
-	do_plink = !!au_opt_test(sbinfo->si_mntflags, PLINK);
-	can_no_dreval = !!au_opt_test((sbinfo->si_mntflags | pending),
-				      UDBA_NONE);
-	bend = au_sbend(sb);
-	for (bindex = 0; !err && bindex <= bend; bindex++) {
-		skip = 0;
-		h_dir = au_h_iptr(dir, bindex);
-		br = au_sbr(sb, bindex);
-
-		if ((br->br_perm & AuBrAttr_ICEX)
-		    && !h_dir->i_op->listxattr)
-			br->br_perm &= ~AuBrAttr_ICEX;
-#if 0
-		if ((br->br_perm & AuBrAttr_ICEX_SEC)
-		    && (au_br_sb(br)->s_flags & MS_NOSEC))
-			br->br_perm &= ~AuBrAttr_ICEX_SEC;
-#endif
-
-		do_free = 0;
-		wbr = br->br_wbr;
-		if (wbr)
-			wbr_wh_read_lock(wbr);
-
-		if (!au_br_writable(br->br_perm)) {
-			do_free = !!wbr;
-			skip = (!wbr
-				|| (!wbr->wbr_whbase
-				    && !wbr->wbr_plink
-				    && !wbr->wbr_orph));
-		} else if (!au_br_wh_linkable(br->br_perm)) {
-			/* skip = (!br->br_whbase && !br->br_orph); */
-			skip = (!wbr || !wbr->wbr_whbase);
-			if (skip && wbr) {
-				if (do_plink)
-					skip = !!wbr->wbr_plink;
-				else
-					skip = !wbr->wbr_plink;
-			}
-		} else {
-			/* skip = (br->br_whbase && br->br_ohph); */
-			skip = (wbr && wbr->wbr_whbase);
-			if (skip) {
-				if (do_plink)
-					skip = !!wbr->wbr_plink;
-				else
-					skip = !wbr->wbr_plink;
-			}
-		}
-		if (wbr)
-			wbr_wh_read_unlock(wbr);
-
-		if (can_no_dreval) {
-			dentry = br->br_path.dentry;
-			spin_lock(&dentry->d_lock);
-			if (dentry->d_flags &
-			    (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE))
-				can_no_dreval = 0;
-			spin_unlock(&dentry->d_lock);
-		}
-
-		if (au_br_fhsm(br->br_perm)) {
-			fhsm++;
-			AuDebugOn(!br->br_fhsm);
-		}
-
-		if (skip)
-			continue;
-
-		hdir = au_hi(dir, bindex);
-		au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT);
-		if (wbr)
-			wbr_wh_write_lock(wbr);
-		err = au_wh_init(br, sb);
-		if (wbr)
-			wbr_wh_write_unlock(wbr);
-		au_hn_imtx_unlock(hdir);
-
-		if (!err && do_free) {
-			kfree(wbr);
-			br->br_wbr = NULL;
-		}
-	}
-
-	if (can_no_dreval)
-		au_fset_si(sbinfo, NO_DREVAL);
-	else
-		au_fclr_si(sbinfo, NO_DREVAL);
-
-	if (fhsm >= 2) {
-		au_fset_si(sbinfo, FHSM);
-		for (bindex = bend; bindex >= 0; bindex--) {
-			br = au_sbr(sb, bindex);
-			if (au_br_fhsm(br->br_perm)) {
-				au_fhsm_set_bottom(sb, bindex);
-				break;
-			}
-		}
-	} else {
-		au_fclr_si(sbinfo, FHSM);
-		au_fhsm_set_bottom(sb, -1);
-	}
-
-	return err;
-}
-
-int au_opts_mount(struct super_block *sb, struct au_opts *opts)
-{
-	int err;
-	unsigned int tmp;
-	aufs_bindex_t bindex, bend;
-	struct au_opt *opt;
-	struct au_opt_xino *opt_xino, xino;
-	struct au_sbinfo *sbinfo;
-	struct au_branch *br;
-	struct inode *dir;
-
-	SiMustWriteLock(sb);
-
-	err = 0;
-	opt_xino = NULL;
-	opt = opts->opt;
-	while (err >= 0 && opt->type != Opt_tail)
-		err = au_opt_simple(sb, opt++, opts);
-	if (err > 0)
-		err = 0;
-	else if (unlikely(err < 0))
-		goto out;
-
-	/* disable xino and udba temporary */
-	sbinfo = au_sbi(sb);
-	tmp = sbinfo->si_mntflags;
-	au_opt_clr(sbinfo->si_mntflags, XINO);
-	au_opt_set_udba(sbinfo->si_mntflags, UDBA_REVAL);
-
-	opt = opts->opt;
-	while (err >= 0 && opt->type != Opt_tail)
-		err = au_opt_br(sb, opt++, opts);
-	if (err > 0)
-		err = 0;
-	else if (unlikely(err < 0))
-		goto out;
-
-	bend = au_sbend(sb);
-	if (unlikely(bend < 0)) {
-		err = -EINVAL;
-		pr_err("no branches\n");
-		goto out;
-	}
-
-	if (au_opt_test(tmp, XINO))
-		au_opt_set(sbinfo->si_mntflags, XINO);
-	opt = opts->opt;
-	while (!err && opt->type != Opt_tail)
-		err = au_opt_xino(sb, opt++, &opt_xino, opts);
-	if (unlikely(err))
-		goto out;
-
-	err = au_opts_verify(sb, sb->s_flags, tmp);
-	if (unlikely(err))
-		goto out;
-
-	/* restore xino */
-	if (au_opt_test(tmp, XINO) && !opt_xino) {
-		xino.file = au_xino_def(sb);
-		err = PTR_ERR(xino.file);
-		if (IS_ERR(xino.file))
-			goto out;
-
-		err = au_xino_set(sb, &xino, /*remount*/0);
-		fput(xino.file);
-		if (unlikely(err))
-			goto out;
-	}
-
-	/* restore udba */
-	tmp &= AuOptMask_UDBA;
-	sbinfo->si_mntflags &= ~AuOptMask_UDBA;
-	sbinfo->si_mntflags |= tmp;
-	bend = au_sbend(sb);
-	for (bindex = 0; bindex <= bend; bindex++) {
-		br = au_sbr(sb, bindex);
-		err = au_hnotify_reset_br(tmp, br, br->br_perm);
-		if (unlikely(err))
-			AuIOErr("hnotify failed on br %d, %d, ignored\n",
-				bindex, err);
-		/* go on even if err */
-	}
-	if (au_opt_test(tmp, UDBA_HNOTIFY)) {
-		dir = d_inode(sb->s_root);
-		au_hn_reset(dir, au_hi_flags(dir, /*isdir*/1) & ~AuHi_XINO);
-	}
-
-out:
-	return err;
-}
-
-int au_opts_remount(struct super_block *sb, struct au_opts *opts)
-{
-	int err, rerr;
-	unsigned char no_dreval;
-	struct inode *dir;
-	struct au_opt_xino *opt_xino;
-	struct au_opt *opt;
-	struct au_sbinfo *sbinfo;
-
-	SiMustWriteLock(sb);
-
-	err = 0;
-	dir = d_inode(sb->s_root);
-	sbinfo = au_sbi(sb);
-	opt_xino = NULL;
-	opt = opts->opt;
-	while (err >= 0 && opt->type != Opt_tail) {
-		err = au_opt_simple(sb, opt, opts);
-		if (!err)
-			err = au_opt_br(sb, opt, opts);
-		if (!err)
-			err = au_opt_xino(sb, opt, &opt_xino, opts);
-		opt++;
-	}
-	if (err > 0)
-		err = 0;
-	AuTraceErr(err);
-	/* go on even err */
-
-	no_dreval = !!au_ftest_si(sbinfo, NO_DREVAL);
-	rerr = au_opts_verify(sb, opts->sb_flags, /*pending*/0);
-	if (unlikely(rerr && !err))
-		err = rerr;
-
-	if (no_dreval != !!au_ftest_si(sbinfo, NO_DREVAL))
-		au_fset_opts(opts->flags, REFRESH_IDOP);
-
-	if (au_ftest_opts(opts->flags, TRUNC_XIB)) {
-		rerr = au_xib_trunc(sb);
-		if (unlikely(rerr && !err))
-			err = rerr;
-	}
-
-	/* will be handled by the caller */
-	if (!au_ftest_opts(opts->flags, REFRESH)
-	    && (opts->given_udba
-		|| au_opt_test(sbinfo->si_mntflags, XINO)
-		|| au_ftest_opts(opts->flags, REFRESH_IDOP)
-		    ))
-		au_fset_opts(opts->flags, REFRESH);
-
-	AuDbg("status 0x%x\n", opts->flags);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-unsigned int au_opt_udba(struct super_block *sb)
-{
-	return au_mntflags(sb) & AuOptMask_UDBA;
-}
diff --git a/fs/aufs/opts.h b/fs/aufs/opts.h
deleted file mode 100644
index 0d6c2e1c7..000000000
--- a/fs/aufs/opts.h
+++ /dev/null
@@ -1,198 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * mount options/flags
- */
-
-#ifndef __AUFS_OPTS_H__
-#define __AUFS_OPTS_H__
-
-#ifdef __KERNEL__
-
-#include <linux/path.h>
-
-struct file;
-struct super_block;
-
-/* ---------------------------------------------------------------------- */
-
-/* mount flags */
-#define AuOpt_XINO		1		/* external inode number bitmap
-						   and translation table */
-#define AuOpt_TRUNC_XINO	(1 << 1)	/* truncate xino files */
-#define AuOpt_UDBA_NONE		(1 << 2)	/* users direct branch access */
-#define AuOpt_UDBA_REVAL	(1 << 3)
-#define AuOpt_UDBA_HNOTIFY	(1 << 4)
-#define AuOpt_SHWH		(1 << 5)	/* show whiteout */
-#define AuOpt_PLINK		(1 << 6)	/* pseudo-link */
-#define AuOpt_DIRPERM1		(1 << 7)	/* ignore the lower dir's perm
-						   bits */
-#define AuOpt_ALWAYS_DIROPQ	(1 << 9)	/* policy to creating diropq */
-#define AuOpt_SUM		(1 << 10)	/* summation for statfs(2) */
-#define AuOpt_SUM_W		(1 << 11)	/* unimplemented */
-#define AuOpt_WARN_PERM		(1 << 12)	/* warn when add-branch */
-#define AuOpt_VERBOSE		(1 << 13)	/* busy inode when del-branch */
-#define AuOpt_DIO		(1 << 14)	/* direct io */
-
-#ifndef CONFIG_AUFS_HNOTIFY
-#undef AuOpt_UDBA_HNOTIFY
-#define AuOpt_UDBA_HNOTIFY	0
-#endif
-#ifndef CONFIG_AUFS_SHWH
-#undef AuOpt_SHWH
-#define AuOpt_SHWH		0
-#endif
-
-#define AuOpt_Def	(AuOpt_XINO \
-			 | AuOpt_UDBA_REVAL \
-			 | AuOpt_PLINK \
-			 /* | AuOpt_DIRPERM1 */ \
-			 | AuOpt_WARN_PERM)
-#define AuOptMask_UDBA	(AuOpt_UDBA_NONE \
-			 | AuOpt_UDBA_REVAL \
-			 | AuOpt_UDBA_HNOTIFY)
-
-#define au_opt_test(flags, name)	(flags & AuOpt_##name)
-#define au_opt_set(flags, name) do { \
-	BUILD_BUG_ON(AuOpt_##name & AuOptMask_UDBA); \
-	((flags) |= AuOpt_##name); \
-} while (0)
-#define au_opt_set_udba(flags, name) do { \
-	(flags) &= ~AuOptMask_UDBA; \
-	((flags) |= AuOpt_##name); \
-} while (0)
-#define au_opt_clr(flags, name) do { \
-	((flags) &= ~AuOpt_##name); \
-} while (0)
-
-static inline unsigned int au_opts_plink(unsigned int mntflags)
-{
-#ifdef CONFIG_PROC_FS
-	return mntflags;
-#else
-	return mntflags & ~AuOpt_PLINK;
-#endif
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* policies to select one among multiple writable branches */
-enum {
-	AuWbrCreate_TDP,	/* top down parent */
-	AuWbrCreate_RR,		/* round robin */
-	AuWbrCreate_MFS,	/* most free space */
-	AuWbrCreate_MFSV,	/* mfs with seconds */
-	AuWbrCreate_MFSRR,	/* mfs then rr */
-	AuWbrCreate_MFSRRV,	/* mfs then rr with seconds */
-	AuWbrCreate_PMFS,	/* parent and mfs */
-	AuWbrCreate_PMFSV,	/* parent and mfs with seconds */
-	AuWbrCreate_PMFSRR,	/* parent, mfs and round-robin */
-	AuWbrCreate_PMFSRRV,	/* plus seconds */
-
-	AuWbrCreate_Def = AuWbrCreate_TDP
-};
-
-enum {
-	AuWbrCopyup_TDP,	/* top down parent */
-	AuWbrCopyup_BUP,	/* bottom up parent */
-	AuWbrCopyup_BU,		/* bottom up */
-
-	AuWbrCopyup_Def = AuWbrCopyup_TDP
-};
-
-/* ---------------------------------------------------------------------- */
-
-struct au_opt_add {
-	aufs_bindex_t	bindex;
-	char		*pathname;
-	int		perm;
-	struct path	path;
-};
-
-struct au_opt_del {
-	char		*pathname;
-	struct path	h_path;
-};
-
-struct au_opt_mod {
-	char		*path;
-	int		perm;
-	struct dentry	*h_root;
-};
-
-struct au_opt_xino {
-	char		*path;
-	struct file	*file;
-};
-
-struct au_opt_xino_itrunc {
-	aufs_bindex_t	bindex;
-};
-
-struct au_opt_wbr_create {
-	int			wbr_create;
-	int			mfs_second;
-	unsigned long long	mfsrr_watermark;
-};
-
-struct au_opt {
-	int type;
-	union {
-		struct au_opt_xino	xino;
-		struct au_opt_xino_itrunc xino_itrunc;
-		struct au_opt_add	add;
-		struct au_opt_del	del;
-		struct au_opt_mod	mod;
-		int			dirwh;
-		int			rdcache;
-		unsigned int		rdblk;
-		unsigned int		rdhash;
-		int			udba;
-		struct au_opt_wbr_create wbr_create;
-		int			wbr_copyup;
-		unsigned int		fhsm_second;
-	};
-};
-
-/* opts flags */
-#define AuOpts_REMOUNT		1
-#define AuOpts_REFRESH		(1 << 1)
-#define AuOpts_TRUNC_XIB	(1 << 2)
-#define AuOpts_REFRESH_DYAOP	(1 << 3)
-#define AuOpts_REFRESH_IDOP	(1 << 4)
-#define au_ftest_opts(flags, name)	((flags) & AuOpts_##name)
-#define au_fset_opts(flags, name) \
-	do { (flags) |= AuOpts_##name; } while (0)
-#define au_fclr_opts(flags, name) \
-	do { (flags) &= ~AuOpts_##name; } while (0)
-
-struct au_opts {
-	struct au_opt	*opt;
-	int		max_opt;
-
-	unsigned int	given_udba;
-	unsigned int	flags;
-	unsigned long	sb_flags;
-};
-
-/* ---------------------------------------------------------------------- */
-
-/* opts.c */
-void au_optstr_br_perm(au_br_perm_str_t *str, int perm);
-const char *au_optstr_udba(int udba);
-const char *au_optstr_wbr_copyup(int wbr_copyup);
-const char *au_optstr_wbr_create(int wbr_create);
-
-void au_opts_free(struct au_opts *opts);
-int au_opts_parse(struct super_block *sb, char *str, struct au_opts *opts);
-int au_opts_verify(struct super_block *sb, unsigned long sb_flags,
-		   unsigned int pending);
-int au_opts_mount(struct super_block *sb, struct au_opts *opts);
-int au_opts_remount(struct super_block *sb, struct au_opts *opts);
-
-unsigned int au_opt_udba(struct super_block *sb);
-
-#endif /* __KERNEL__ */
-#endif /* __AUFS_OPTS_H__ */
diff --git a/fs/aufs/plink.c b/fs/aufs/plink.c
deleted file mode 100644
index 6fdab1e0e..000000000
--- a/fs/aufs/plink.c
+++ /dev/null
@@ -1,515 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * pseudo-link
- */
-
-#include "aufs.h"
-
-/*
- * the pseudo-link maintenance mode.
- * during a user process maintains the pseudo-links,
- * prohibit adding a new plink and branch manipulation.
- *
- * Flags
- * NOPLM:
- *	For entry functions which will handle plink, and i_mutex is already held
- *	in VFS.
- *	They cannot wait and should return an error at once.
- *	Callers has to check the error.
- * NOPLMW:
- *	For entry functions which will handle plink, but i_mutex is not held
- *	in VFS.
- *	They can wait the plink maintenance mode to finish.
- *
- * They behave like F_SETLK and F_SETLKW.
- * If the caller never handle plink, then both flags are unnecessary.
- */
-
-int au_plink_maint(struct super_block *sb, int flags)
-{
-	int err;
-	pid_t pid, ppid;
-	struct au_sbinfo *sbi;
-
-	SiMustAnyLock(sb);
-
-	err = 0;
-	if (!au_opt_test(au_mntflags(sb), PLINK))
-		goto out;
-
-	sbi = au_sbi(sb);
-	pid = sbi->si_plink_maint_pid;
-	if (!pid || pid == current->pid)
-		goto out;
-
-	/* todo: it highly depends upon /sbin/mount.aufs */
-	rcu_read_lock();
-	ppid = task_pid_vnr(rcu_dereference(current->real_parent));
-	rcu_read_unlock();
-	if (pid == ppid)
-		goto out;
-
-	if (au_ftest_lock(flags, NOPLMW)) {
-		/* if there is no i_mutex lock in VFS, we don't need to wait */
-		/* AuDebugOn(!lockdep_depth(current)); */
-		while (sbi->si_plink_maint_pid) {
-			si_read_unlock(sb);
-			/* gave up wake_up_bit() */
-			wait_event(sbi->si_plink_wq, !sbi->si_plink_maint_pid);
-
-			if (au_ftest_lock(flags, FLUSH))
-				au_nwt_flush(&sbi->si_nowait);
-			si_noflush_read_lock(sb);
-		}
-	} else if (au_ftest_lock(flags, NOPLM)) {
-		AuDbg("ppid %d, pid %d\n", ppid, pid);
-		err = -EAGAIN;
-	}
-
-out:
-	return err;
-}
-
-void au_plink_maint_leave(struct au_sbinfo *sbinfo)
-{
-	spin_lock(&sbinfo->si_plink_maint_lock);
-	sbinfo->si_plink_maint_pid = 0;
-	spin_unlock(&sbinfo->si_plink_maint_lock);
-	wake_up_all(&sbinfo->si_plink_wq);
-}
-
-int au_plink_maint_enter(struct super_block *sb)
-{
-	int err;
-	struct au_sbinfo *sbinfo;
-
-	err = 0;
-	sbinfo = au_sbi(sb);
-	/* make sure i am the only one in this fs */
-	si_write_lock(sb, AuLock_FLUSH);
-	if (au_opt_test(au_mntflags(sb), PLINK)) {
-		spin_lock(&sbinfo->si_plink_maint_lock);
-		if (!sbinfo->si_plink_maint_pid)
-			sbinfo->si_plink_maint_pid = current->pid;
-		else
-			err = -EBUSY;
-		spin_unlock(&sbinfo->si_plink_maint_lock);
-	}
-	si_write_unlock(sb);
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-#ifdef CONFIG_AUFS_DEBUG
-void au_plink_list(struct super_block *sb)
-{
-	int i;
-	struct au_sbinfo *sbinfo;
-	struct hlist_head *plink_hlist;
-	struct pseudo_link *plink;
-
-	SiMustAnyLock(sb);
-
-	sbinfo = au_sbi(sb);
-	AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK));
-	AuDebugOn(au_plink_maint(sb, AuLock_NOPLM));
-
-	for (i = 0; i < AuPlink_NHASH; i++) {
-		plink_hlist = &sbinfo->si_plink[i].head;
-		rcu_read_lock();
-		hlist_for_each_entry_rcu(plink, plink_hlist, hlist)
-			AuDbg("%lu\n", plink->inode->i_ino);
-		rcu_read_unlock();
-	}
-}
-#endif
-
-/* is the inode pseudo-linked? */
-int au_plink_test(struct inode *inode)
-{
-	int found, i;
-	struct au_sbinfo *sbinfo;
-	struct hlist_head *plink_hlist;
-	struct pseudo_link *plink;
-
-	sbinfo = au_sbi(inode->i_sb);
-	AuRwMustAnyLock(&sbinfo->si_rwsem);
-	AuDebugOn(!au_opt_test(au_mntflags(inode->i_sb), PLINK));
-	AuDebugOn(au_plink_maint(inode->i_sb, AuLock_NOPLM));
-
-	found = 0;
-	i = au_plink_hash(inode->i_ino);
-	plink_hlist = &sbinfo->si_plink[i].head;
-	rcu_read_lock();
-	hlist_for_each_entry_rcu(plink, plink_hlist, hlist)
-		if (plink->inode == inode) {
-			found = 1;
-			break;
-		}
-	rcu_read_unlock();
-	return found;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * generate a name for plink.
- * the file will be stored under AUFS_WH_PLINKDIR.
- */
-/* 20 is max digits length of ulong 64 */
-#define PLINK_NAME_LEN	((20 + 1) * 2)
-
-static int plink_name(char *name, int len, struct inode *inode,
-		      aufs_bindex_t bindex)
-{
-	int rlen;
-	struct inode *h_inode;
-
-	h_inode = au_h_iptr(inode, bindex);
-	rlen = snprintf(name, len, "%lu.%lu", inode->i_ino, h_inode->i_ino);
-	return rlen;
-}
-
-struct au_do_plink_lkup_args {
-	struct dentry **errp;
-	struct qstr *tgtname;
-	struct dentry *h_parent;
-	struct au_branch *br;
-};
-
-static struct dentry *au_do_plink_lkup(struct qstr *tgtname,
-				       struct dentry *h_parent,
-				       struct au_branch *br)
-{
-	struct dentry *h_dentry;
-	struct mutex *h_mtx;
-
-	h_mtx = &d_inode(h_parent)->i_mutex;
-	mutex_lock_nested(h_mtx, AuLsc_I_CHILD2);
-	h_dentry = vfsub_lkup_one(tgtname, h_parent);
-	mutex_unlock(h_mtx);
-	return h_dentry;
-}
-
-static void au_call_do_plink_lkup(void *args)
-{
-	struct au_do_plink_lkup_args *a = args;
-	*a->errp = au_do_plink_lkup(a->tgtname, a->h_parent, a->br);
-}
-
-/* lookup the plink-ed @inode under the branch at @bindex */
-struct dentry *au_plink_lkup(struct inode *inode, aufs_bindex_t bindex)
-{
-	struct dentry *h_dentry, *h_parent;
-	struct au_branch *br;
-	int wkq_err;
-	char a[PLINK_NAME_LEN];
-	struct qstr tgtname = QSTR_INIT(a, 0);
-
-	AuDebugOn(au_plink_maint(inode->i_sb, AuLock_NOPLM));
-
-	br = au_sbr(inode->i_sb, bindex);
-	h_parent = br->br_wbr->wbr_plink;
-	tgtname.len = plink_name(a, sizeof(a), inode, bindex);
-
-	if (!uid_eq(current_fsuid(), GLOBAL_ROOT_UID)) {
-		struct au_do_plink_lkup_args args = {
-			.errp		= &h_dentry,
-			.tgtname	= &tgtname,
-			.h_parent	= h_parent,
-			.br		= br
-		};
-
-		wkq_err = au_wkq_wait(au_call_do_plink_lkup, &args);
-		if (unlikely(wkq_err))
-			h_dentry = ERR_PTR(wkq_err);
-	} else
-		h_dentry = au_do_plink_lkup(&tgtname, h_parent, br);
-
-	return h_dentry;
-}
-
-/* create a pseudo-link */
-static int do_whplink(struct qstr *tgt, struct dentry *h_parent,
-		      struct dentry *h_dentry, struct au_branch *br)
-{
-	int err;
-	struct path h_path = {
-		.mnt = au_br_mnt(br)
-	};
-	struct inode *h_dir, *delegated;
-
-	h_dir = d_inode(h_parent);
-	mutex_lock_nested(&h_dir->i_mutex, AuLsc_I_CHILD2);
-again:
-	h_path.dentry = vfsub_lkup_one(tgt, h_parent);
-	err = PTR_ERR(h_path.dentry);
-	if (IS_ERR(h_path.dentry))
-		goto out;
-
-	err = 0;
-	/* wh.plink dir is not monitored */
-	/* todo: is it really safe? */
-	if (d_is_positive(h_path.dentry)
-	    && d_inode(h_path.dentry) != d_inode(h_dentry)) {
-		delegated = NULL;
-		err = vfsub_unlink(h_dir, &h_path, &delegated, /*force*/0);
-		if (unlikely(err == -EWOULDBLOCK)) {
-			pr_warn("cannot retry for NFSv4 delegation"
-				" for an internal unlink\n");
-			iput(delegated);
-		}
-		dput(h_path.dentry);
-		h_path.dentry = NULL;
-		if (!err)
-			goto again;
-	}
-	if (!err && d_is_negative(h_path.dentry)) {
-		delegated = NULL;
-		err = vfsub_link(h_dentry, h_dir, &h_path, &delegated);
-		if (unlikely(err == -EWOULDBLOCK)) {
-			pr_warn("cannot retry for NFSv4 delegation"
-				" for an internal link\n");
-			iput(delegated);
-		}
-	}
-	dput(h_path.dentry);
-
-out:
-	mutex_unlock(&h_dir->i_mutex);
-	return err;
-}
-
-struct do_whplink_args {
-	int *errp;
-	struct qstr *tgt;
-	struct dentry *h_parent;
-	struct dentry *h_dentry;
-	struct au_branch *br;
-};
-
-static void call_do_whplink(void *args)
-{
-	struct do_whplink_args *a = args;
-	*a->errp = do_whplink(a->tgt, a->h_parent, a->h_dentry, a->br);
-}
-
-static int whplink(struct dentry *h_dentry, struct inode *inode,
-		   aufs_bindex_t bindex, struct au_branch *br)
-{
-	int err, wkq_err;
-	struct au_wbr *wbr;
-	struct dentry *h_parent;
-	char a[PLINK_NAME_LEN];
-	struct qstr tgtname = QSTR_INIT(a, 0);
-
-	wbr = au_sbr(inode->i_sb, bindex)->br_wbr;
-	h_parent = wbr->wbr_plink;
-	tgtname.len = plink_name(a, sizeof(a), inode, bindex);
-
-	/* always superio. */
-	if (!uid_eq(current_fsuid(), GLOBAL_ROOT_UID)) {
-		struct do_whplink_args args = {
-			.errp		= &err,
-			.tgt		= &tgtname,
-			.h_parent	= h_parent,
-			.h_dentry	= h_dentry,
-			.br		= br
-		};
-		wkq_err = au_wkq_wait(call_do_whplink, &args);
-		if (unlikely(wkq_err))
-			err = wkq_err;
-	} else
-		err = do_whplink(&tgtname, h_parent, h_dentry, br);
-
-	return err;
-}
-
-/* free a single plink */
-static void do_put_plink(struct pseudo_link *plink, int do_del)
-{
-	if (do_del)
-		hlist_del(&plink->hlist);
-	iput(plink->inode);
-	kfree(plink);
-}
-
-static void do_put_plink_rcu(struct rcu_head *rcu)
-{
-	struct pseudo_link *plink;
-
-	plink = container_of(rcu, struct pseudo_link, rcu);
-	iput(plink->inode);
-	kfree(plink);
-}
-
-/*
- * create a new pseudo-link for @h_dentry on @bindex.
- * the linked inode is held in aufs @inode.
- */
-void au_plink_append(struct inode *inode, aufs_bindex_t bindex,
-		     struct dentry *h_dentry)
-{
-	struct super_block *sb;
-	struct au_sbinfo *sbinfo;
-	struct hlist_head *plink_hlist;
-	struct pseudo_link *plink, *tmp;
-	struct au_sphlhead *sphl;
-	int found, err, cnt, i;
-
-	sb = inode->i_sb;
-	sbinfo = au_sbi(sb);
-	AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK));
-	AuDebugOn(au_plink_maint(sb, AuLock_NOPLM));
-
-	found = au_plink_test(inode);
-	if (found)
-		return;
-
-	i = au_plink_hash(inode->i_ino);
-	sphl = sbinfo->si_plink + i;
-	plink_hlist = &sphl->head;
-	tmp = kmalloc(sizeof(*plink), GFP_NOFS);
-	if (tmp)
-		tmp->inode = au_igrab(inode);
-	else {
-		err = -ENOMEM;
-		goto out;
-	}
-
-	spin_lock(&sphl->spin);
-	hlist_for_each_entry(plink, plink_hlist, hlist) {
-		if (plink->inode == inode) {
-			found = 1;
-			break;
-		}
-	}
-	if (!found)
-		hlist_add_head_rcu(&tmp->hlist, plink_hlist);
-	spin_unlock(&sphl->spin);
-	if (!found) {
-		cnt = au_sphl_count(sphl);
-#define msg "unexpectedly unblanced or too many pseudo-links"
-		if (cnt > AUFS_PLINK_WARN)
-			AuWarn1(msg ", %d\n", cnt);
-#undef msg
-		err = whplink(h_dentry, inode, bindex, au_sbr(sb, bindex));
-	} else {
-		do_put_plink(tmp, 0);
-		return;
-	}
-
-out:
-	if (unlikely(err)) {
-		pr_warn("err %d, damaged pseudo link.\n", err);
-		if (tmp) {
-			au_sphl_del_rcu(&tmp->hlist, sphl);
-			call_rcu(&tmp->rcu, do_put_plink_rcu);
-		}
-	}
-}
-
-/* free all plinks */
-void au_plink_put(struct super_block *sb, int verbose)
-{
-	int i, warned;
-	struct au_sbinfo *sbinfo;
-	struct hlist_head *plink_hlist;
-	struct hlist_node *tmp;
-	struct pseudo_link *plink;
-
-	SiMustWriteLock(sb);
-
-	sbinfo = au_sbi(sb);
-	AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK));
-	AuDebugOn(au_plink_maint(sb, AuLock_NOPLM));
-
-	/* no spin_lock since sbinfo is write-locked */
-	warned = 0;
-	for (i = 0; i < AuPlink_NHASH; i++) {
-		plink_hlist = &sbinfo->si_plink[i].head;
-		if (!warned && verbose && !hlist_empty(plink_hlist)) {
-			pr_warn("pseudo-link is not flushed");
-			warned = 1;
-		}
-		hlist_for_each_entry_safe(plink, tmp, plink_hlist, hlist)
-			do_put_plink(plink, 0);
-		INIT_HLIST_HEAD(plink_hlist);
-	}
-}
-
-void au_plink_clean(struct super_block *sb, int verbose)
-{
-	struct dentry *root;
-
-	root = sb->s_root;
-	aufs_write_lock(root);
-	if (au_opt_test(au_mntflags(sb), PLINK))
-		au_plink_put(sb, verbose);
-	aufs_write_unlock(root);
-}
-
-static int au_plink_do_half_refresh(struct inode *inode, aufs_bindex_t br_id)
-{
-	int do_put;
-	aufs_bindex_t bstart, bend, bindex;
-
-	do_put = 0;
-	bstart = au_ibstart(inode);
-	bend = au_ibend(inode);
-	if (bstart >= 0) {
-		for (bindex = bstart; bindex <= bend; bindex++) {
-			if (!au_h_iptr(inode, bindex)
-			    || au_ii_br_id(inode, bindex) != br_id)
-				continue;
-			au_set_h_iptr(inode, bindex, NULL, 0);
-			do_put = 1;
-			break;
-		}
-		if (do_put)
-			for (bindex = bstart; bindex <= bend; bindex++)
-				if (au_h_iptr(inode, bindex)) {
-					do_put = 0;
-					break;
-				}
-	} else
-		do_put = 1;
-
-	return do_put;
-}
-
-/* free the plinks on a branch specified by @br_id */
-void au_plink_half_refresh(struct super_block *sb, aufs_bindex_t br_id)
-{
-	struct au_sbinfo *sbinfo;
-	struct hlist_head *plink_hlist;
-	struct hlist_node *tmp;
-	struct pseudo_link *plink;
-	struct inode *inode;
-	int i, do_put;
-
-	SiMustWriteLock(sb);
-
-	sbinfo = au_sbi(sb);
-	AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK));
-	AuDebugOn(au_plink_maint(sb, AuLock_NOPLM));
-
-	/* no spin_lock since sbinfo is write-locked */
-	for (i = 0; i < AuPlink_NHASH; i++) {
-		plink_hlist = &sbinfo->si_plink[i].head;
-		hlist_for_each_entry_safe(plink, tmp, plink_hlist, hlist) {
-			inode = au_igrab(plink->inode);
-			ii_write_lock_child(inode);
-			do_put = au_plink_do_half_refresh(inode, br_id);
-			if (do_put)
-				do_put_plink(plink, 1);
-			ii_write_unlock(inode);
-			iput(inode);
-		}
-	}
-}
diff --git a/fs/aufs/poll.c b/fs/aufs/poll.c
deleted file mode 100644
index dd2baf5dc..000000000
--- a/fs/aufs/poll.c
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * poll operation
- * There is only one filesystem which implements ->poll operation, currently.
- */
-
-#include "aufs.h"
-
-unsigned int aufs_poll(struct file *file, poll_table *wait)
-{
-	unsigned int mask;
-	int err;
-	struct file *h_file;
-	struct super_block *sb;
-
-	/* We should pretend an error happened. */
-	mask = POLLERR /* | POLLIN | POLLOUT */;
-	sb = file->f_path.dentry->d_sb;
-	si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW);
-
-	h_file = au_read_pre(file, /*keep_fi*/0);
-	err = PTR_ERR(h_file);
-	if (IS_ERR(h_file))
-		goto out;
-
-	/* it is not an error if h_file has no operation */
-	mask = DEFAULT_POLLMASK;
-	if (h_file->f_op->poll)
-		mask = h_file->f_op->poll(h_file, wait);
-	fput(h_file); /* instead of au_read_post() */
-
-out:
-	si_read_unlock(sb);
-	AuTraceErr((int)mask);
-	return mask;
-}
diff --git a/fs/aufs/posix_acl.c b/fs/aufs/posix_acl.c
deleted file mode 100644
index a3c442c08..000000000
--- a/fs/aufs/posix_acl.c
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Copyright (C) 2014-2016 Junjiro R. Okajima
- */
-
-/*
- * posix acl operations
- */
-
-#include <linux/fs.h>
-#include "aufs.h"
-
-struct posix_acl *aufs_get_acl(struct inode *inode, int type)
-{
-	struct posix_acl *acl;
-	int err;
-	aufs_bindex_t bindex;
-	struct inode *h_inode;
-	struct super_block *sb;
-
-	acl = NULL;
-	sb = inode->i_sb;
-	si_read_lock(sb, AuLock_FLUSH);
-	ii_read_lock_child(inode);
-	if (!(sb->s_flags & MS_POSIXACL))
-		goto out;
-
-	bindex = au_ibstart(inode);
-	h_inode = au_h_iptr(inode, bindex);
-	if (unlikely(!h_inode
-		     || ((h_inode->i_mode & S_IFMT)
-			 != (inode->i_mode & S_IFMT)))) {
-		err = au_busy_or_stale();
-		acl = ERR_PTR(err);
-		goto out;
-	}
-
-	/* always topmost only */
-	acl = get_acl(h_inode, type);
-
-out:
-	ii_read_unlock(inode);
-	si_read_unlock(sb);
-
-	AuTraceErrPtr(acl);
-	return acl;
-}
-
-int aufs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
-{
-	int err;
-	ssize_t ssz;
-	struct dentry *dentry;
-	struct au_srxattr arg = {
-		.type = AU_ACL_SET,
-		.u.acl_set = {
-			.acl	= acl,
-			.type	= type
-		},
-	};
-
-	mutex_lock(&inode->i_mutex);
-	if (inode->i_ino == AUFS_ROOT_INO)
-		dentry = dget(inode->i_sb->s_root);
-	else {
-		dentry = d_find_alias(inode);
-		if (!dentry)
-			dentry = d_find_any_alias(inode);
-		if (!dentry) {
-			pr_warn("cannot handle this inode, "
-				"please report to aufs-users ML\n");
-			err = -ENOENT;
-			goto out;
-		}
-	}
-
-	ssz = au_srxattr(dentry, &arg);
-	dput(dentry);
-	err = ssz;
-	if (ssz >= 0)
-		err = 0;
-
-out:
-	mutex_unlock(&inode->i_mutex);
-	return err;
-}
diff --git a/fs/aufs/procfs.c b/fs/aufs/procfs.c
deleted file mode 100644
index 2c8893edf..000000000
--- a/fs/aufs/procfs.c
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
- * Copyright (C) 2010-2016 Junjiro R. Okajima
- */
-
-/*
- * procfs interfaces
- */
-
-#include <linux/proc_fs.h>
-#include "aufs.h"
-
-static int au_procfs_plm_release(struct inode *inode, struct file *file)
-{
-	struct au_sbinfo *sbinfo;
-
-	sbinfo = file->private_data;
-	if (sbinfo) {
-		au_plink_maint_leave(sbinfo);
-		kobject_put(&sbinfo->si_kobj);
-	}
-
-	return 0;
-}
-
-static void au_procfs_plm_write_clean(struct file *file)
-{
-	struct au_sbinfo *sbinfo;
-
-	sbinfo = file->private_data;
-	if (sbinfo)
-		au_plink_clean(sbinfo->si_sb, /*verbose*/0);
-}
-
-static int au_procfs_plm_write_si(struct file *file, unsigned long id)
-{
-	int err;
-	struct super_block *sb;
-	struct au_sbinfo *sbinfo;
-
-	err = -EBUSY;
-	if (unlikely(file->private_data))
-		goto out;
-
-	sb = NULL;
-	/* don't use au_sbilist_lock() here */
-	spin_lock(&au_sbilist.spin);
-	list_for_each_entry(sbinfo, &au_sbilist.head, si_list)
-		if (id == sysaufs_si_id(sbinfo)) {
-			kobject_get(&sbinfo->si_kobj);
-			sb = sbinfo->si_sb;
-			break;
-		}
-	spin_unlock(&au_sbilist.spin);
-
-	err = -EINVAL;
-	if (unlikely(!sb))
-		goto out;
-
-	err = au_plink_maint_enter(sb);
-	if (!err)
-		/* keep kobject_get() */
-		file->private_data = sbinfo;
-	else
-		kobject_put(&sbinfo->si_kobj);
-out:
-	return err;
-}
-
-/*
- * Accept a valid "si=xxxx" only.
- * Once it is accepted successfully, accept "clean" too.
- */
-static ssize_t au_procfs_plm_write(struct file *file, const char __user *ubuf,
-				   size_t count, loff_t *ppos)
-{
-	ssize_t err;
-	unsigned long id;
-	/* last newline is allowed */
-	char buf[3 + sizeof(unsigned long) * 2 + 1];
-
-	err = -EACCES;
-	if (unlikely(!capable(CAP_SYS_ADMIN)))
-		goto out;
-
-	err = -EINVAL;
-	if (unlikely(count > sizeof(buf)))
-		goto out;
-
-	err = copy_from_user(buf, ubuf, count);
-	if (unlikely(err)) {
-		err = -EFAULT;
-		goto out;
-	}
-	buf[count] = 0;
-
-	err = -EINVAL;
-	if (!strcmp("clean", buf)) {
-		au_procfs_plm_write_clean(file);
-		goto out_success;
-	} else if (unlikely(strncmp("si=", buf, 3)))
-		goto out;
-
-	err = kstrtoul(buf + 3, 16, &id);
-	if (unlikely(err))
-		goto out;
-
-	err = au_procfs_plm_write_si(file, id);
-	if (unlikely(err))
-		goto out;
-
-out_success:
-	err = count; /* success */
-out:
-	return err;
-}
-
-static const struct file_operations au_procfs_plm_fop = {
-	.write		= au_procfs_plm_write,
-	.release	= au_procfs_plm_release,
-	.owner		= THIS_MODULE
-};
-
-/* ---------------------------------------------------------------------- */
-
-static struct proc_dir_entry *au_procfs_dir;
-
-void au_procfs_fin(void)
-{
-	remove_proc_entry(AUFS_PLINK_MAINT_NAME, au_procfs_dir);
-	remove_proc_entry(AUFS_PLINK_MAINT_DIR, NULL);
-}
-
-int __init au_procfs_init(void)
-{
-	int err;
-	struct proc_dir_entry *entry;
-
-	err = -ENOMEM;
-	au_procfs_dir = proc_mkdir(AUFS_PLINK_MAINT_DIR, NULL);
-	if (unlikely(!au_procfs_dir))
-		goto out;
-
-	entry = proc_create(AUFS_PLINK_MAINT_NAME, S_IFREG | S_IWUSR,
-			    au_procfs_dir, &au_procfs_plm_fop);
-	if (unlikely(!entry))
-		goto out_dir;
-
-	err = 0;
-	goto out; /* success */
-
-
-out_dir:
-	remove_proc_entry(AUFS_PLINK_MAINT_DIR, NULL);
-out:
-	return err;
-}
diff --git a/fs/aufs/rdu.c b/fs/aufs/rdu.c
deleted file mode 100644
index a9e9e9893..000000000
--- a/fs/aufs/rdu.c
+++ /dev/null
@@ -1,375 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * readdir in userspace.
- */
-
-#include <linux/compat.h>
-#include <linux/fs_stack.h>
-#include <linux/security.h>
-#include "aufs.h"
-
-/* bits for struct aufs_rdu.flags */
-#define	AuRdu_CALLED	1
-#define	AuRdu_CONT	(1 << 1)
-#define	AuRdu_FULL	(1 << 2)
-#define au_ftest_rdu(flags, name)	((flags) & AuRdu_##name)
-#define au_fset_rdu(flags, name) \
-	do { (flags) |= AuRdu_##name; } while (0)
-#define au_fclr_rdu(flags, name) \
-	do { (flags) &= ~AuRdu_##name; } while (0)
-
-struct au_rdu_arg {
-	struct dir_context		ctx;
-	struct aufs_rdu			*rdu;
-	union au_rdu_ent_ul		ent;
-	unsigned long			end;
-
-	struct super_block		*sb;
-	int				err;
-};
-
-static int au_rdu_fill(struct dir_context *ctx, const char *name, int nlen,
-		       loff_t offset, u64 h_ino, unsigned int d_type)
-{
-	int err, len;
-	struct au_rdu_arg *arg = container_of(ctx, struct au_rdu_arg, ctx);
-	struct aufs_rdu *rdu = arg->rdu;
-	struct au_rdu_ent ent;
-
-	err = 0;
-	arg->err = 0;
-	au_fset_rdu(rdu->cookie.flags, CALLED);
-	len = au_rdu_len(nlen);
-	if (arg->ent.ul + len  < arg->end) {
-		ent.ino = h_ino;
-		ent.bindex = rdu->cookie.bindex;
-		ent.type = d_type;
-		ent.nlen = nlen;
-		if (unlikely(nlen > AUFS_MAX_NAMELEN))
-			ent.type = DT_UNKNOWN;
-
-		/* unnecessary to support mmap_sem since this is a dir */
-		err = -EFAULT;
-		if (copy_to_user(arg->ent.e, &ent, sizeof(ent)))
-			goto out;
-		if (copy_to_user(arg->ent.e->name, name, nlen))
-			goto out;
-		/* the terminating NULL */
-		if (__put_user(0, arg->ent.e->name + nlen))
-			goto out;
-		err = 0;
-		/* AuDbg("%p, %.*s\n", arg->ent.p, nlen, name); */
-		arg->ent.ul += len;
-		rdu->rent++;
-	} else {
-		err = -EFAULT;
-		au_fset_rdu(rdu->cookie.flags, FULL);
-		rdu->full = 1;
-		rdu->tail = arg->ent;
-	}
-
-out:
-	/* AuTraceErr(err); */
-	return err;
-}
-
-static int au_rdu_do(struct file *h_file, struct au_rdu_arg *arg)
-{
-	int err;
-	loff_t offset;
-	struct au_rdu_cookie *cookie = &arg->rdu->cookie;
-
-	/* we don't have to care (FMODE_32BITHASH | FMODE_64BITHASH) for ext4 */
-	offset = vfsub_llseek(h_file, cookie->h_pos, SEEK_SET);
-	err = offset;
-	if (unlikely(offset != cookie->h_pos))
-		goto out;
-
-	err = 0;
-	do {
-		arg->err = 0;
-		au_fclr_rdu(cookie->flags, CALLED);
-		/* smp_mb(); */
-		err = vfsub_iterate_dir(h_file, &arg->ctx);
-		if (err >= 0)
-			err = arg->err;
-	} while (!err
-		 && au_ftest_rdu(cookie->flags, CALLED)
-		 && !au_ftest_rdu(cookie->flags, FULL));
-	cookie->h_pos = h_file->f_pos;
-
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-static int au_rdu(struct file *file, struct aufs_rdu *rdu)
-{
-	int err;
-	aufs_bindex_t bend;
-	struct au_rdu_arg arg = {
-		.ctx = {
-			.actor = au_rdu_fill
-		}
-	};
-	struct dentry *dentry;
-	struct inode *inode;
-	struct file *h_file;
-	struct au_rdu_cookie *cookie = &rdu->cookie;
-
-	err = !access_ok(VERIFY_WRITE, rdu->ent.e, rdu->sz);
-	if (unlikely(err)) {
-		err = -EFAULT;
-		AuTraceErr(err);
-		goto out;
-	}
-	rdu->rent = 0;
-	rdu->tail = rdu->ent;
-	rdu->full = 0;
-	arg.rdu = rdu;
-	arg.ent = rdu->ent;
-	arg.end = arg.ent.ul;
-	arg.end += rdu->sz;
-
-	err = -ENOTDIR;
-	if (unlikely(!file->f_op->iterate))
-		goto out;
-
-	err = security_file_permission(file, MAY_READ);
-	AuTraceErr(err);
-	if (unlikely(err))
-		goto out;
-
-	dentry = file->f_path.dentry;
-	inode = d_inode(dentry);
-#if 1
-	mutex_lock(&inode->i_mutex);
-#else
-	err = mutex_lock_killable(&inode->i_mutex);
-	AuTraceErr(err);
-	if (unlikely(err))
-		goto out;
-#endif
-
-	arg.sb = inode->i_sb;
-	err = si_read_lock(arg.sb, AuLock_FLUSH | AuLock_NOPLM);
-	if (unlikely(err))
-		goto out_mtx;
-	err = au_alive_dir(dentry);
-	if (unlikely(err))
-		goto out_si;
-	/* todo: reval? */
-	fi_read_lock(file);
-
-	err = -EAGAIN;
-	if (unlikely(au_ftest_rdu(cookie->flags, CONT)
-		     && cookie->generation != au_figen(file)))
-		goto out_unlock;
-
-	err = 0;
-	if (!rdu->blk) {
-		rdu->blk = au_sbi(arg.sb)->si_rdblk;
-		if (!rdu->blk)
-			rdu->blk = au_dir_size(file, /*dentry*/NULL);
-	}
-	bend = au_fbstart(file);
-	if (cookie->bindex < bend)
-		cookie->bindex = bend;
-	bend = au_fbend_dir(file);
-	/* AuDbg("b%d, b%d\n", cookie->bindex, bend); */
-	for (; !err && cookie->bindex <= bend;
-	     cookie->bindex++, cookie->h_pos = 0) {
-		h_file = au_hf_dir(file, cookie->bindex);
-		if (!h_file)
-			continue;
-
-		au_fclr_rdu(cookie->flags, FULL);
-		err = au_rdu_do(h_file, &arg);
-		AuTraceErr(err);
-		if (unlikely(au_ftest_rdu(cookie->flags, FULL) || err))
-			break;
-	}
-	AuDbg("rent %llu\n", rdu->rent);
-
-	if (!err && !au_ftest_rdu(cookie->flags, CONT)) {
-		rdu->shwh = !!au_opt_test(au_sbi(arg.sb)->si_mntflags, SHWH);
-		au_fset_rdu(cookie->flags, CONT);
-		cookie->generation = au_figen(file);
-	}
-
-	ii_read_lock_child(inode);
-	fsstack_copy_attr_atime(inode, au_h_iptr(inode, au_ibstart(inode)));
-	ii_read_unlock(inode);
-
-out_unlock:
-	fi_read_unlock(file);
-out_si:
-	si_read_unlock(arg.sb);
-out_mtx:
-	mutex_unlock(&inode->i_mutex);
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-static int au_rdu_ino(struct file *file, struct aufs_rdu *rdu)
-{
-	int err;
-	ino_t ino;
-	unsigned long long nent;
-	union au_rdu_ent_ul *u;
-	struct au_rdu_ent ent;
-	struct super_block *sb;
-
-	err = 0;
-	nent = rdu->nent;
-	u = &rdu->ent;
-	sb = file->f_path.dentry->d_sb;
-	si_read_lock(sb, AuLock_FLUSH);
-	while (nent-- > 0) {
-		/* unnecessary to support mmap_sem since this is a dir */
-		err = copy_from_user(&ent, u->e, sizeof(ent));
-		if (!err)
-			err = !access_ok(VERIFY_WRITE, &u->e->ino, sizeof(ino));
-		if (unlikely(err)) {
-			err = -EFAULT;
-			AuTraceErr(err);
-			break;
-		}
-
-		/* AuDbg("b%d, i%llu\n", ent.bindex, ent.ino); */
-		if (!ent.wh)
-			err = au_ino(sb, ent.bindex, ent.ino, ent.type, &ino);
-		else
-			err = au_wh_ino(sb, ent.bindex, ent.ino, ent.type,
-					&ino);
-		if (unlikely(err)) {
-			AuTraceErr(err);
-			break;
-		}
-
-		err = __put_user(ino, &u->e->ino);
-		if (unlikely(err)) {
-			err = -EFAULT;
-			AuTraceErr(err);
-			break;
-		}
-		u->ul += au_rdu_len(ent.nlen);
-	}
-	si_read_unlock(sb);
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static int au_rdu_verify(struct aufs_rdu *rdu)
-{
-	AuDbg("rdu{%llu, %p, %u | %u | %llu, %u, %u | "
-	      "%llu, b%d, 0x%x, g%u}\n",
-	      rdu->sz, rdu->ent.e, rdu->verify[AufsCtlRduV_SZ],
-	      rdu->blk,
-	      rdu->rent, rdu->shwh, rdu->full,
-	      rdu->cookie.h_pos, rdu->cookie.bindex, rdu->cookie.flags,
-	      rdu->cookie.generation);
-
-	if (rdu->verify[AufsCtlRduV_SZ] == sizeof(*rdu))
-		return 0;
-
-	AuDbg("%u:%u\n",
-	      rdu->verify[AufsCtlRduV_SZ], (unsigned int)sizeof(*rdu));
-	return -EINVAL;
-}
-
-long au_rdu_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
-{
-	long err, e;
-	struct aufs_rdu rdu;
-	void __user *p = (void __user *)arg;
-
-	err = copy_from_user(&rdu, p, sizeof(rdu));
-	if (unlikely(err)) {
-		err = -EFAULT;
-		AuTraceErr(err);
-		goto out;
-	}
-	err = au_rdu_verify(&rdu);
-	if (unlikely(err))
-		goto out;
-
-	switch (cmd) {
-	case AUFS_CTL_RDU:
-		err = au_rdu(file, &rdu);
-		if (unlikely(err))
-			break;
-
-		e = copy_to_user(p, &rdu, sizeof(rdu));
-		if (unlikely(e)) {
-			err = -EFAULT;
-			AuTraceErr(err);
-		}
-		break;
-	case AUFS_CTL_RDU_INO:
-		err = au_rdu_ino(file, &rdu);
-		break;
-
-	default:
-		/* err = -ENOTTY; */
-		err = -EINVAL;
-	}
-
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-#ifdef CONFIG_COMPAT
-long au_rdu_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
-{
-	long err, e;
-	struct aufs_rdu rdu;
-	void __user *p = compat_ptr(arg);
-
-	/* todo: get_user()? */
-	err = copy_from_user(&rdu, p, sizeof(rdu));
-	if (unlikely(err)) {
-		err = -EFAULT;
-		AuTraceErr(err);
-		goto out;
-	}
-	rdu.ent.e = compat_ptr(rdu.ent.ul);
-	err = au_rdu_verify(&rdu);
-	if (unlikely(err))
-		goto out;
-
-	switch (cmd) {
-	case AUFS_CTL_RDU:
-		err = au_rdu(file, &rdu);
-		if (unlikely(err))
-			break;
-
-		rdu.ent.ul = ptr_to_compat(rdu.ent.e);
-		rdu.tail.ul = ptr_to_compat(rdu.tail.e);
-		e = copy_to_user(p, &rdu, sizeof(rdu));
-		if (unlikely(e)) {
-			err = -EFAULT;
-			AuTraceErr(err);
-		}
-		break;
-	case AUFS_CTL_RDU_INO:
-		err = au_rdu_ino(file, &rdu);
-		break;
-
-	default:
-		/* err = -ENOTTY; */
-		err = -EINVAL;
-	}
-
-out:
-	AuTraceErr(err);
-	return err;
-}
-#endif
diff --git a/fs/aufs/rwsem.h b/fs/aufs/rwsem.h
deleted file mode 100644
index ef50c2ccb..000000000
--- a/fs/aufs/rwsem.h
+++ /dev/null
@@ -1,178 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * simple read-write semaphore wrappers
- */
-
-#ifndef __AUFS_RWSEM_H__
-#define __AUFS_RWSEM_H__
-
-#ifdef __KERNEL__
-
-#include "debug.h"
-
-struct au_rwsem {
-	struct rw_semaphore	rwsem;
-#ifdef CONFIG_AUFS_DEBUG
-	/* just for debugging, not almighty counter */
-	atomic_t		rcnt, wcnt;
-#endif
-};
-
-#ifdef CONFIG_AUFS_DEBUG
-#define AuDbgCntInit(rw) do { \
-	atomic_set(&(rw)->rcnt, 0); \
-	atomic_set(&(rw)->wcnt, 0); \
-	smp_mb(); /* atomic set */ \
-} while (0)
-
-#define AuDbgRcntInc(rw)	atomic_inc(&(rw)->rcnt)
-#define AuDbgRcntDec(rw)	WARN_ON(atomic_dec_return(&(rw)->rcnt) < 0)
-#define AuDbgWcntInc(rw)	atomic_inc(&(rw)->wcnt)
-#define AuDbgWcntDec(rw)	WARN_ON(atomic_dec_return(&(rw)->wcnt) < 0)
-#else
-#define AuDbgCntInit(rw)	do {} while (0)
-#define AuDbgRcntInc(rw)	do {} while (0)
-#define AuDbgRcntDec(rw)	do {} while (0)
-#define AuDbgWcntInc(rw)	do {} while (0)
-#define AuDbgWcntDec(rw)	do {} while (0)
-#endif /* CONFIG_AUFS_DEBUG */
-
-/* to debug easier, do not make them inlined functions */
-#define AuRwMustNoWaiters(rw)	AuDebugOn(!list_empty(&(rw)->rwsem.wait_list))
-/* rwsem_is_locked() is unusable */
-#define AuRwMustReadLock(rw)	AuDebugOn(atomic_read(&(rw)->rcnt) <= 0)
-#define AuRwMustWriteLock(rw)	AuDebugOn(atomic_read(&(rw)->wcnt) <= 0)
-#define AuRwMustAnyLock(rw)	AuDebugOn(atomic_read(&(rw)->rcnt) <= 0 \
-					&& atomic_read(&(rw)->wcnt) <= 0)
-#define AuRwDestroy(rw)		AuDebugOn(atomic_read(&(rw)->rcnt) \
-					|| atomic_read(&(rw)->wcnt))
-
-#define au_rw_class(rw, key)	lockdep_set_class(&(rw)->rwsem, key)
-
-static inline void au_rw_init(struct au_rwsem *rw)
-{
-	AuDbgCntInit(rw);
-	init_rwsem(&rw->rwsem);
-}
-
-static inline void au_rw_init_wlock(struct au_rwsem *rw)
-{
-	au_rw_init(rw);
-	down_write(&rw->rwsem);
-	AuDbgWcntInc(rw);
-}
-
-static inline void au_rw_init_wlock_nested(struct au_rwsem *rw,
-					   unsigned int lsc)
-{
-	au_rw_init(rw);
-	down_write_nested(&rw->rwsem, lsc);
-	AuDbgWcntInc(rw);
-}
-
-static inline void au_rw_read_lock(struct au_rwsem *rw)
-{
-	down_read(&rw->rwsem);
-	AuDbgRcntInc(rw);
-}
-
-static inline void au_rw_read_lock_nested(struct au_rwsem *rw, unsigned int lsc)
-{
-	down_read_nested(&rw->rwsem, lsc);
-	AuDbgRcntInc(rw);
-}
-
-static inline void au_rw_read_unlock(struct au_rwsem *rw)
-{
-	AuRwMustReadLock(rw);
-	AuDbgRcntDec(rw);
-	up_read(&rw->rwsem);
-}
-
-static inline void au_rw_dgrade_lock(struct au_rwsem *rw)
-{
-	AuRwMustWriteLock(rw);
-	AuDbgRcntInc(rw);
-	AuDbgWcntDec(rw);
-	downgrade_write(&rw->rwsem);
-}
-
-static inline void au_rw_write_lock(struct au_rwsem *rw)
-{
-	down_write(&rw->rwsem);
-	AuDbgWcntInc(rw);
-}
-
-static inline void au_rw_write_lock_nested(struct au_rwsem *rw,
-					   unsigned int lsc)
-{
-	down_write_nested(&rw->rwsem, lsc);
-	AuDbgWcntInc(rw);
-}
-
-static inline void au_rw_write_unlock(struct au_rwsem *rw)
-{
-	AuRwMustWriteLock(rw);
-	AuDbgWcntDec(rw);
-	up_write(&rw->rwsem);
-}
-
-/* why is not _nested version defined */
-static inline int au_rw_read_trylock(struct au_rwsem *rw)
-{
-	int ret;
-
-	ret = down_read_trylock(&rw->rwsem);
-	if (ret)
-		AuDbgRcntInc(rw);
-	return ret;
-}
-
-static inline int au_rw_write_trylock(struct au_rwsem *rw)
-{
-	int ret;
-
-	ret = down_write_trylock(&rw->rwsem);
-	if (ret)
-		AuDbgWcntInc(rw);
-	return ret;
-}
-
-#undef AuDbgCntInit
-#undef AuDbgRcntInc
-#undef AuDbgRcntDec
-#undef AuDbgWcntInc
-#undef AuDbgWcntDec
-
-#define AuSimpleLockRwsemFuncs(prefix, param, rwsem) \
-static inline void prefix##_read_lock(param) \
-{ au_rw_read_lock(rwsem); } \
-static inline void prefix##_write_lock(param) \
-{ au_rw_write_lock(rwsem); } \
-static inline int prefix##_read_trylock(param) \
-{ return au_rw_read_trylock(rwsem); } \
-static inline int prefix##_write_trylock(param) \
-{ return au_rw_write_trylock(rwsem); }
-/* why is not _nested version defined */
-/* static inline void prefix##_read_trylock_nested(param, lsc)
-{ au_rw_read_trylock_nested(rwsem, lsc)); }
-static inline void prefix##_write_trylock_nestd(param, lsc)
-{ au_rw_write_trylock_nested(rwsem, lsc); } */
-
-#define AuSimpleUnlockRwsemFuncs(prefix, param, rwsem) \
-static inline void prefix##_read_unlock(param) \
-{ au_rw_read_unlock(rwsem); } \
-static inline void prefix##_write_unlock(param) \
-{ au_rw_write_unlock(rwsem); } \
-static inline void prefix##_downgrade_lock(param) \
-{ au_rw_dgrade_lock(rwsem); }
-
-#define AuSimpleRwsemFuncs(prefix, param, rwsem) \
-	AuSimpleLockRwsemFuncs(prefix, param, rwsem) \
-	AuSimpleUnlockRwsemFuncs(prefix, param, rwsem)
-
-#endif /* __KERNEL__ */
-#endif /* __AUFS_RWSEM_H__ */
diff --git a/fs/aufs/sbinfo.c b/fs/aufs/sbinfo.c
deleted file mode 100644
index e3c58f643..000000000
--- a/fs/aufs/sbinfo.c
+++ /dev/null
@@ -1,353 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * superblock private data
- */
-
-#include "aufs.h"
-
-/*
- * they are necessary regardless sysfs is disabled.
- */
-void au_si_free(struct kobject *kobj)
-{
-	int i;
-	struct au_sbinfo *sbinfo;
-	char *locked __maybe_unused; /* debug only */
-
-	sbinfo = container_of(kobj, struct au_sbinfo, si_kobj);
-	for (i = 0; i < AuPlink_NHASH; i++)
-		AuDebugOn(!hlist_empty(&sbinfo->si_plink[i].head));
-	AuDebugOn(atomic_read(&sbinfo->si_nowait.nw_len));
-
-	AuDebugOn(!hlist_empty(&sbinfo->si_symlink.head));
-
-	au_rw_write_lock(&sbinfo->si_rwsem);
-	au_br_free(sbinfo);
-	au_rw_write_unlock(&sbinfo->si_rwsem);
-
-	AuDebugOn(radix_tree_gang_lookup
-		  (&sbinfo->au_si_pid.tree, (void **)&locked,
-		   /*first_index*/PID_MAX_DEFAULT - 1,
-		   /*max_items*/sizeof(locked)/sizeof(*locked)));
-
-	kfree(sbinfo->si_branch);
-	kfree(sbinfo->au_si_pid.bitmap);
-	mutex_destroy(&sbinfo->si_xib_mtx);
-	AuRwDestroy(&sbinfo->si_rwsem);
-
-	kfree(sbinfo);
-}
-
-int au_si_alloc(struct super_block *sb)
-{
-	int err, i;
-	struct au_sbinfo *sbinfo;
-	static struct lock_class_key aufs_si;
-
-	err = -ENOMEM;
-	sbinfo = kzalloc(sizeof(*sbinfo), GFP_NOFS);
-	if (unlikely(!sbinfo))
-		goto out;
-
-	BUILD_BUG_ON(sizeof(unsigned long) !=
-		     sizeof(*sbinfo->au_si_pid.bitmap));
-	sbinfo->au_si_pid.bitmap = kcalloc(BITS_TO_LONGS(PID_MAX_DEFAULT),
-					sizeof(*sbinfo->au_si_pid.bitmap),
-					GFP_NOFS);
-	if (unlikely(!sbinfo->au_si_pid.bitmap))
-		goto out_sbinfo;
-
-	/* will be reallocated separately */
-	sbinfo->si_branch = kzalloc(sizeof(*sbinfo->si_branch), GFP_NOFS);
-	if (unlikely(!sbinfo->si_branch))
-		goto out_pidmap;
-
-	err = sysaufs_si_init(sbinfo);
-	if (unlikely(err))
-		goto out_br;
-
-	au_nwt_init(&sbinfo->si_nowait);
-	au_rw_init_wlock(&sbinfo->si_rwsem);
-	au_rw_class(&sbinfo->si_rwsem, &aufs_si);
-	spin_lock_init(&sbinfo->au_si_pid.tree_lock);
-	INIT_RADIX_TREE(&sbinfo->au_si_pid.tree, GFP_ATOMIC | __GFP_NOFAIL);
-
-	atomic_long_set(&sbinfo->si_ninodes, 0);
-	atomic_long_set(&sbinfo->si_nfiles, 0);
-
-	sbinfo->si_bend = -1;
-	sbinfo->si_last_br_id = AUFS_BRANCH_MAX / 2;
-
-	sbinfo->si_wbr_copyup = AuWbrCopyup_Def;
-	sbinfo->si_wbr_create = AuWbrCreate_Def;
-	sbinfo->si_wbr_copyup_ops = au_wbr_copyup_ops + sbinfo->si_wbr_copyup;
-	sbinfo->si_wbr_create_ops = au_wbr_create_ops + sbinfo->si_wbr_create;
-
-	au_fhsm_init(sbinfo);
-
-	sbinfo->si_mntflags = au_opts_plink(AuOpt_Def);
-
-	au_sphl_init(&sbinfo->si_symlink);
-
-	sbinfo->si_xino_jiffy = jiffies;
-	sbinfo->si_xino_expire
-		= msecs_to_jiffies(AUFS_XINO_DEF_SEC * MSEC_PER_SEC);
-	mutex_init(&sbinfo->si_xib_mtx);
-	sbinfo->si_xino_brid = -1;
-	/* leave si_xib_last_pindex and si_xib_next_bit */
-
-	au_sphl_init(&sbinfo->si_aopen);
-
-	sbinfo->si_rdcache = msecs_to_jiffies(AUFS_RDCACHE_DEF * MSEC_PER_SEC);
-	sbinfo->si_rdblk = AUFS_RDBLK_DEF;
-	sbinfo->si_rdhash = AUFS_RDHASH_DEF;
-	sbinfo->si_dirwh = AUFS_DIRWH_DEF;
-
-	for (i = 0; i < AuPlink_NHASH; i++)
-		au_sphl_init(sbinfo->si_plink + i);
-	init_waitqueue_head(&sbinfo->si_plink_wq);
-	spin_lock_init(&sbinfo->si_plink_maint_lock);
-
-	au_sphl_init(&sbinfo->si_files);
-
-	/* with getattr by default */
-	sbinfo->si_iop_array = aufs_iop;
-
-	/* leave other members for sysaufs and si_mnt. */
-	sbinfo->si_sb = sb;
-	sb->s_fs_info = sbinfo;
-	si_pid_set(sb);
-	return 0; /* success */
-
-out_br:
-	kfree(sbinfo->si_branch);
-out_pidmap:
-	kfree(sbinfo->au_si_pid.bitmap);
-out_sbinfo:
-	kfree(sbinfo);
-out:
-	return err;
-}
-
-int au_sbr_realloc(struct au_sbinfo *sbinfo, int nbr)
-{
-	int err, sz;
-	struct au_branch **brp;
-
-	AuRwMustWriteLock(&sbinfo->si_rwsem);
-
-	err = -ENOMEM;
-	sz = sizeof(*brp) * (sbinfo->si_bend + 1);
-	if (unlikely(!sz))
-		sz = sizeof(*brp);
-	brp = au_kzrealloc(sbinfo->si_branch, sz, sizeof(*brp) * nbr, GFP_NOFS);
-	if (brp) {
-		sbinfo->si_branch = brp;
-		err = 0;
-	}
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-unsigned int au_sigen_inc(struct super_block *sb)
-{
-	unsigned int gen;
-	struct inode *inode;
-
-	SiMustWriteLock(sb);
-
-	gen = ++au_sbi(sb)->si_generation;
-	au_update_digen(sb->s_root);
-	inode = d_inode(sb->s_root);
-	au_update_iigen(inode, /*half*/0);
-	inode->i_version++;
-	return gen;
-}
-
-aufs_bindex_t au_new_br_id(struct super_block *sb)
-{
-	aufs_bindex_t br_id;
-	int i;
-	struct au_sbinfo *sbinfo;
-
-	SiMustWriteLock(sb);
-
-	sbinfo = au_sbi(sb);
-	for (i = 0; i <= AUFS_BRANCH_MAX; i++) {
-		br_id = ++sbinfo->si_last_br_id;
-		AuDebugOn(br_id < 0);
-		if (br_id && au_br_index(sb, br_id) < 0)
-			return br_id;
-	}
-
-	return -1;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* it is ok that new 'nwt' tasks are appended while we are sleeping */
-int si_read_lock(struct super_block *sb, int flags)
-{
-	int err;
-
-	err = 0;
-	if (au_ftest_lock(flags, FLUSH))
-		au_nwt_flush(&au_sbi(sb)->si_nowait);
-
-	si_noflush_read_lock(sb);
-	err = au_plink_maint(sb, flags);
-	if (unlikely(err))
-		si_read_unlock(sb);
-
-	return err;
-}
-
-int si_write_lock(struct super_block *sb, int flags)
-{
-	int err;
-
-	if (au_ftest_lock(flags, FLUSH))
-		au_nwt_flush(&au_sbi(sb)->si_nowait);
-
-	si_noflush_write_lock(sb);
-	err = au_plink_maint(sb, flags);
-	if (unlikely(err))
-		si_write_unlock(sb);
-
-	return err;
-}
-
-/* dentry and super_block lock. call at entry point */
-int aufs_read_lock(struct dentry *dentry, int flags)
-{
-	int err;
-	struct super_block *sb;
-
-	sb = dentry->d_sb;
-	err = si_read_lock(sb, flags);
-	if (unlikely(err))
-		goto out;
-
-	if (au_ftest_lock(flags, DW))
-		di_write_lock_child(dentry);
-	else
-		di_read_lock_child(dentry, flags);
-
-	if (au_ftest_lock(flags, GEN)) {
-		err = au_digen_test(dentry, au_sigen(sb));
-		if (!au_opt_test(au_mntflags(sb), UDBA_NONE))
-			AuDebugOn(!err && au_dbrange_test(dentry));
-		else if (!err)
-			err = au_dbrange_test(dentry);
-		if (unlikely(err))
-			aufs_read_unlock(dentry, flags);
-	}
-
-out:
-	return err;
-}
-
-void aufs_read_unlock(struct dentry *dentry, int flags)
-{
-	if (au_ftest_lock(flags, DW))
-		di_write_unlock(dentry);
-	else
-		di_read_unlock(dentry, flags);
-	si_read_unlock(dentry->d_sb);
-}
-
-void aufs_write_lock(struct dentry *dentry)
-{
-	si_write_lock(dentry->d_sb, AuLock_FLUSH | AuLock_NOPLMW);
-	di_write_lock_child(dentry);
-}
-
-void aufs_write_unlock(struct dentry *dentry)
-{
-	di_write_unlock(dentry);
-	si_write_unlock(dentry->d_sb);
-}
-
-int aufs_read_and_write_lock2(struct dentry *d1, struct dentry *d2, int flags)
-{
-	int err;
-	unsigned int sigen;
-	struct super_block *sb;
-
-	sb = d1->d_sb;
-	err = si_read_lock(sb, flags);
-	if (unlikely(err))
-		goto out;
-
-	di_write_lock2_child(d1, d2, au_ftest_lock(flags, DIRS));
-
-	if (au_ftest_lock(flags, GEN)) {
-		sigen = au_sigen(sb);
-		err = au_digen_test(d1, sigen);
-		AuDebugOn(!err && au_dbrange_test(d1));
-		if (!err) {
-			err = au_digen_test(d2, sigen);
-			AuDebugOn(!err && au_dbrange_test(d2));
-		}
-		if (unlikely(err))
-			aufs_read_and_write_unlock2(d1, d2);
-	}
-
-out:
-	return err;
-}
-
-void aufs_read_and_write_unlock2(struct dentry *d1, struct dentry *d2)
-{
-	di_write_unlock2(d1, d2);
-	si_read_unlock(d1->d_sb);
-}
-
-/* ---------------------------------------------------------------------- */
-
-int si_pid_test_slow(struct super_block *sb)
-{
-	void *p;
-
-	rcu_read_lock();
-	p = radix_tree_lookup(&au_sbi(sb)->au_si_pid.tree, current->pid);
-	rcu_read_unlock();
-
-	return (long)!!p;
-}
-
-void si_pid_set_slow(struct super_block *sb)
-{
-	int err;
-	struct au_sbinfo *sbinfo;
-
-	AuDebugOn(si_pid_test_slow(sb));
-
-	sbinfo = au_sbi(sb);
-	err = radix_tree_preload(GFP_NOFS | __GFP_NOFAIL);
-	AuDebugOn(err);
-	spin_lock(&sbinfo->au_si_pid.tree_lock);
-	err = radix_tree_insert(&sbinfo->au_si_pid.tree, current->pid,
-				/*any valid ptr*/sb);
-	spin_unlock(&sbinfo->au_si_pid.tree_lock);
-	AuDebugOn(err);
-	radix_tree_preload_end();
-}
-
-void si_pid_clr_slow(struct super_block *sb)
-{
-	void *p;
-	struct au_sbinfo *sbinfo;
-
-	AuDebugOn(!si_pid_test_slow(sb));
-
-	sbinfo = au_sbi(sb);
-	spin_lock(&sbinfo->au_si_pid.tree_lock);
-	p = radix_tree_delete(&sbinfo->au_si_pid.tree, current->pid);
-	spin_unlock(&sbinfo->au_si_pid.tree_lock);
-}
diff --git a/fs/aufs/spl.h b/fs/aufs/spl.h
deleted file mode 100644
index f9b528826..000000000
--- a/fs/aufs/spl.h
+++ /dev/null
@@ -1,98 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * simple list protected by a spinlock
- */
-
-#ifndef __AUFS_SPL_H__
-#define __AUFS_SPL_H__
-
-#ifdef __KERNEL__
-
-struct au_splhead {
-	spinlock_t		spin;
-	struct list_head	head;
-};
-
-static inline void au_spl_init(struct au_splhead *spl)
-{
-	spin_lock_init(&spl->spin);
-	INIT_LIST_HEAD(&spl->head);
-}
-
-static inline void au_spl_add(struct list_head *list, struct au_splhead *spl)
-{
-	spin_lock(&spl->spin);
-	list_add(list, &spl->head);
-	spin_unlock(&spl->spin);
-}
-
-static inline void au_spl_del(struct list_head *list, struct au_splhead *spl)
-{
-	spin_lock(&spl->spin);
-	list_del(list);
-	spin_unlock(&spl->spin);
-}
-
-static inline void au_spl_del_rcu(struct list_head *list,
-				  struct au_splhead *spl)
-{
-	spin_lock(&spl->spin);
-	list_del_rcu(list);
-	spin_unlock(&spl->spin);
-}
-
-/* ---------------------------------------------------------------------- */
-
-struct au_sphlhead {
-	spinlock_t		spin;
-	struct hlist_head	head;
-};
-
-static inline void au_sphl_init(struct au_sphlhead *sphl)
-{
-	spin_lock_init(&sphl->spin);
-	INIT_HLIST_HEAD(&sphl->head);
-}
-
-static inline void au_sphl_add(struct hlist_node *hlist,
-			       struct au_sphlhead *sphl)
-{
-	spin_lock(&sphl->spin);
-	hlist_add_head(hlist, &sphl->head);
-	spin_unlock(&sphl->spin);
-}
-
-static inline void au_sphl_del(struct hlist_node *hlist,
-			       struct au_sphlhead *sphl)
-{
-	spin_lock(&sphl->spin);
-	hlist_del(hlist);
-	spin_unlock(&sphl->spin);
-}
-
-static inline void au_sphl_del_rcu(struct hlist_node *hlist,
-				   struct au_sphlhead *sphl)
-{
-	spin_lock(&sphl->spin);
-	hlist_del_rcu(hlist);
-	spin_unlock(&sphl->spin);
-}
-
-static inline unsigned long au_sphl_count(struct au_sphlhead *sphl)
-{
-	unsigned long cnt;
-	struct hlist_node *pos;
-
-	cnt = 0;
-	spin_lock(&sphl->spin);
-	hlist_for_each(pos, &sphl->head)
-		cnt++;
-	spin_unlock(&sphl->spin);
-	return cnt;
-}
-
-#endif /* __KERNEL__ */
-#endif /* __AUFS_SPL_H__ */
diff --git a/fs/aufs/super.c b/fs/aufs/super.c
deleted file mode 100644
index b41d78913..000000000
--- a/fs/aufs/super.c
+++ /dev/null
@@ -1,1026 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * mount and super_block operations
- */
-
-#include <linux/mm.h>
-#include <linux/seq_file.h>
-#include <linux/statfs.h>
-#include <linux/vmalloc.h>
-#include "aufs.h"
-
-/*
- * super_operations
- */
-static struct inode *aufs_alloc_inode(struct super_block *sb __maybe_unused)
-{
-	struct au_icntnr *c;
-
-	c = au_cache_alloc_icntnr();
-	if (c) {
-		au_icntnr_init(c);
-		c->vfs_inode.i_version = 1; /* sigen(sb); */
-		c->iinfo.ii_hinode = NULL;
-		return &c->vfs_inode;
-	}
-	return NULL;
-}
-
-static void aufs_destroy_inode_cb(struct rcu_head *head)
-{
-	struct inode *inode = container_of(head, struct inode, i_rcu);
-
-	INIT_HLIST_HEAD(&inode->i_dentry);
-	au_cache_free_icntnr(container_of(inode, struct au_icntnr, vfs_inode));
-}
-
-static void aufs_destroy_inode(struct inode *inode)
-{
-	au_iinfo_fin(inode);
-	call_rcu(&inode->i_rcu, aufs_destroy_inode_cb);
-}
-
-struct inode *au_iget_locked(struct super_block *sb, ino_t ino)
-{
-	struct inode *inode;
-	int err;
-
-	inode = iget_locked(sb, ino);
-	if (unlikely(!inode)) {
-		inode = ERR_PTR(-ENOMEM);
-		goto out;
-	}
-	if (!(inode->i_state & I_NEW))
-		goto out;
-
-	err = au_xigen_new(inode);
-	if (!err)
-		err = au_iinfo_init(inode);
-	if (!err)
-		inode->i_version++;
-	else {
-		iget_failed(inode);
-		inode = ERR_PTR(err);
-	}
-
-out:
-	/* never return NULL */
-	AuDebugOn(!inode);
-	AuTraceErrPtr(inode);
-	return inode;
-}
-
-/* lock free root dinfo */
-static int au_show_brs(struct seq_file *seq, struct super_block *sb)
-{
-	int err;
-	aufs_bindex_t bindex, bend;
-	struct path path;
-	struct au_hdentry *hdp;
-	struct au_branch *br;
-	au_br_perm_str_t perm;
-
-	err = 0;
-	bend = au_sbend(sb);
-	hdp = au_di(sb->s_root)->di_hdentry;
-	for (bindex = 0; !err && bindex <= bend; bindex++) {
-		br = au_sbr(sb, bindex);
-		path.mnt = au_br_mnt(br);
-		path.dentry = hdp[bindex].hd_dentry;
-		err = au_seq_path(seq, &path);
-		if (!err) {
-			au_optstr_br_perm(&perm, br->br_perm);
-			seq_printf(seq, "=%s", perm.a);
-			if (bindex != bend)
-				seq_putc(seq, ':');
-		}
-	}
-	if (unlikely(err || seq_has_overflowed(seq)))
-		err = -E2BIG;
-
-	return err;
-}
-
-static void au_show_wbr_create(struct seq_file *m, int v,
-			       struct au_sbinfo *sbinfo)
-{
-	const char *pat;
-
-	AuRwMustAnyLock(&sbinfo->si_rwsem);
-
-	seq_puts(m, ",create=");
-	pat = au_optstr_wbr_create(v);
-	switch (v) {
-	case AuWbrCreate_TDP:
-	case AuWbrCreate_RR:
-	case AuWbrCreate_MFS:
-	case AuWbrCreate_PMFS:
-		seq_puts(m, pat);
-		break;
-	case AuWbrCreate_MFSV:
-		seq_printf(m, /*pat*/"mfs:%lu",
-			   jiffies_to_msecs(sbinfo->si_wbr_mfs.mfs_expire)
-			   / MSEC_PER_SEC);
-		break;
-	case AuWbrCreate_PMFSV:
-		seq_printf(m, /*pat*/"pmfs:%lu",
-			   jiffies_to_msecs(sbinfo->si_wbr_mfs.mfs_expire)
-			   / MSEC_PER_SEC);
-		break;
-	case AuWbrCreate_MFSRR:
-		seq_printf(m, /*pat*/"mfsrr:%llu",
-			   sbinfo->si_wbr_mfs.mfsrr_watermark);
-		break;
-	case AuWbrCreate_MFSRRV:
-		seq_printf(m, /*pat*/"mfsrr:%llu:%lu",
-			   sbinfo->si_wbr_mfs.mfsrr_watermark,
-			   jiffies_to_msecs(sbinfo->si_wbr_mfs.mfs_expire)
-			   / MSEC_PER_SEC);
-		break;
-	case AuWbrCreate_PMFSRR:
-		seq_printf(m, /*pat*/"pmfsrr:%llu",
-			   sbinfo->si_wbr_mfs.mfsrr_watermark);
-		break;
-	case AuWbrCreate_PMFSRRV:
-		seq_printf(m, /*pat*/"pmfsrr:%llu:%lu",
-			   sbinfo->si_wbr_mfs.mfsrr_watermark,
-			   jiffies_to_msecs(sbinfo->si_wbr_mfs.mfs_expire)
-			   / MSEC_PER_SEC);
-		break;
-	}
-}
-
-static int au_show_xino(struct seq_file *seq, struct super_block *sb)
-{
-#ifdef CONFIG_SYSFS
-	return 0;
-#else
-	int err;
-	const int len = sizeof(AUFS_XINO_FNAME) - 1;
-	aufs_bindex_t bindex, brid;
-	struct qstr *name;
-	struct file *f;
-	struct dentry *d, *h_root;
-	struct au_hdentry *hdp;
-
-	AuRwMustAnyLock(&sbinfo->si_rwsem);
-
-	err = 0;
-	f = au_sbi(sb)->si_xib;
-	if (!f)
-		goto out;
-
-	/* stop printing the default xino path on the first writable branch */
-	h_root = NULL;
-	brid = au_xino_brid(sb);
-	if (brid >= 0) {
-		bindex = au_br_index(sb, brid);
-		hdp = au_di(sb->s_root)->di_hdentry;
-		h_root = hdp[0 + bindex].hd_dentry;
-	}
-	d = f->f_path.dentry;
-	name = &d->d_name;
-	/* safe ->d_parent because the file is unlinked */
-	if (d->d_parent == h_root
-	    && name->len == len
-	    && !memcmp(name->name, AUFS_XINO_FNAME, len))
-		goto out;
-
-	seq_puts(seq, ",xino=");
-	err = au_xino_path(seq, f);
-
-out:
-	return err;
-#endif
-}
-
-/* seq_file will re-call me in case of too long string */
-static int aufs_show_options(struct seq_file *m, struct dentry *dentry)
-{
-	int err;
-	unsigned int mnt_flags, v;
-	struct super_block *sb;
-	struct au_sbinfo *sbinfo;
-
-#define AuBool(name, str) do { \
-	v = au_opt_test(mnt_flags, name); \
-	if (v != au_opt_test(AuOpt_Def, name)) \
-		seq_printf(m, ",%s" #str, v ? "" : "no"); \
-} while (0)
-
-#define AuStr(name, str) do { \
-	v = mnt_flags & AuOptMask_##name; \
-	if (v != (AuOpt_Def & AuOptMask_##name)) \
-		seq_printf(m, "," #str "=%s", au_optstr_##str(v)); \
-} while (0)
-
-#define AuUInt(name, str, val) do { \
-	if (val != AUFS_##name##_DEF) \
-		seq_printf(m, "," #str "=%u", val); \
-} while (0)
-
-	sb = dentry->d_sb;
-	if (sb->s_flags & MS_POSIXACL)
-		seq_puts(m, ",acl");
-
-	/* lock free root dinfo */
-	si_noflush_read_lock(sb);
-	sbinfo = au_sbi(sb);
-	seq_printf(m, ",si=%lx", sysaufs_si_id(sbinfo));
-
-	mnt_flags = au_mntflags(sb);
-	if (au_opt_test(mnt_flags, XINO)) {
-		err = au_show_xino(m, sb);
-		if (unlikely(err))
-			goto out;
-	} else
-		seq_puts(m, ",noxino");
-
-	AuBool(TRUNC_XINO, trunc_xino);
-	AuStr(UDBA, udba);
-	AuBool(SHWH, shwh);
-	AuBool(PLINK, plink);
-	AuBool(DIO, dio);
-	AuBool(DIRPERM1, dirperm1);
-
-	v = sbinfo->si_wbr_create;
-	if (v != AuWbrCreate_Def)
-		au_show_wbr_create(m, v, sbinfo);
-
-	v = sbinfo->si_wbr_copyup;
-	if (v != AuWbrCopyup_Def)
-		seq_printf(m, ",cpup=%s", au_optstr_wbr_copyup(v));
-
-	v = au_opt_test(mnt_flags, ALWAYS_DIROPQ);
-	if (v != au_opt_test(AuOpt_Def, ALWAYS_DIROPQ))
-		seq_printf(m, ",diropq=%c", v ? 'a' : 'w');
-
-	AuUInt(DIRWH, dirwh, sbinfo->si_dirwh);
-
-	v = jiffies_to_msecs(sbinfo->si_rdcache) / MSEC_PER_SEC;
-	AuUInt(RDCACHE, rdcache, v);
-
-	AuUInt(RDBLK, rdblk, sbinfo->si_rdblk);
-	AuUInt(RDHASH, rdhash, sbinfo->si_rdhash);
-
-	au_fhsm_show(m, sbinfo);
-
-	AuBool(SUM, sum);
-	/* AuBool(SUM_W, wsum); */
-	AuBool(WARN_PERM, warn_perm);
-	AuBool(VERBOSE, verbose);
-
-out:
-	/* be sure to print "br:" last */
-	if (!sysaufs_brs) {
-		seq_puts(m, ",br:");
-		au_show_brs(m, sb);
-	}
-	si_read_unlock(sb);
-	return 0;
-
-#undef AuBool
-#undef AuStr
-#undef AuUInt
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* sum mode which returns the summation for statfs(2) */
-
-static u64 au_add_till_max(u64 a, u64 b)
-{
-	u64 old;
-
-	old = a;
-	a += b;
-	if (old <= a)
-		return a;
-	return ULLONG_MAX;
-}
-
-static u64 au_mul_till_max(u64 a, long mul)
-{
-	u64 old;
-
-	old = a;
-	a *= mul;
-	if (old <= a)
-		return a;
-	return ULLONG_MAX;
-}
-
-static int au_statfs_sum(struct super_block *sb, struct kstatfs *buf)
-{
-	int err;
-	long bsize, factor;
-	u64 blocks, bfree, bavail, files, ffree;
-	aufs_bindex_t bend, bindex, i;
-	unsigned char shared;
-	struct path h_path;
-	struct super_block *h_sb;
-
-	err = 0;
-	bsize = LONG_MAX;
-	files = 0;
-	ffree = 0;
-	blocks = 0;
-	bfree = 0;
-	bavail = 0;
-	bend = au_sbend(sb);
-	for (bindex = 0; bindex <= bend; bindex++) {
-		h_path.mnt = au_sbr_mnt(sb, bindex);
-		h_sb = h_path.mnt->mnt_sb;
-		shared = 0;
-		for (i = 0; !shared && i < bindex; i++)
-			shared = (au_sbr_sb(sb, i) == h_sb);
-		if (shared)
-			continue;
-
-		/* sb->s_root for NFS is unreliable */
-		h_path.dentry = h_path.mnt->mnt_root;
-		err = vfs_statfs(&h_path, buf);
-		if (unlikely(err))
-			goto out;
-
-		if (bsize > buf->f_bsize) {
-			/*
-			 * we will reduce bsize, so we have to expand blocks
-			 * etc. to match them again
-			 */
-			factor = (bsize / buf->f_bsize);
-			blocks = au_mul_till_max(blocks, factor);
-			bfree = au_mul_till_max(bfree, factor);
-			bavail = au_mul_till_max(bavail, factor);
-			bsize = buf->f_bsize;
-		}
-
-		factor = (buf->f_bsize / bsize);
-		blocks = au_add_till_max(blocks,
-				au_mul_till_max(buf->f_blocks, factor));
-		bfree = au_add_till_max(bfree,
-				au_mul_till_max(buf->f_bfree, factor));
-		bavail = au_add_till_max(bavail,
-				au_mul_till_max(buf->f_bavail, factor));
-		files = au_add_till_max(files, buf->f_files);
-		ffree = au_add_till_max(ffree, buf->f_ffree);
-	}
-
-	buf->f_bsize = bsize;
-	buf->f_blocks = blocks;
-	buf->f_bfree = bfree;
-	buf->f_bavail = bavail;
-	buf->f_files = files;
-	buf->f_ffree = ffree;
-	buf->f_frsize = 0;
-
-out:
-	return err;
-}
-
-static int aufs_statfs(struct dentry *dentry, struct kstatfs *buf)
-{
-	int err;
-	struct path h_path;
-	struct super_block *sb;
-
-	/* lock free root dinfo */
-	sb = dentry->d_sb;
-	si_noflush_read_lock(sb);
-	if (!au_opt_test(au_mntflags(sb), SUM)) {
-		/* sb->s_root for NFS is unreliable */
-		h_path.mnt = au_sbr_mnt(sb, 0);
-		h_path.dentry = h_path.mnt->mnt_root;
-		err = vfs_statfs(&h_path, buf);
-	} else
-		err = au_statfs_sum(sb, buf);
-	si_read_unlock(sb);
-
-	if (!err) {
-		buf->f_type = AUFS_SUPER_MAGIC;
-		buf->f_namelen = AUFS_MAX_NAMELEN;
-		memset(&buf->f_fsid, 0, sizeof(buf->f_fsid));
-	}
-	/* buf->f_bsize = buf->f_blocks = buf->f_bfree = buf->f_bavail = -1; */
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static int aufs_sync_fs(struct super_block *sb, int wait)
-{
-	int err, e;
-	aufs_bindex_t bend, bindex;
-	struct au_branch *br;
-	struct super_block *h_sb;
-
-	err = 0;
-	si_noflush_read_lock(sb);
-	bend = au_sbend(sb);
-	for (bindex = 0; bindex <= bend; bindex++) {
-		br = au_sbr(sb, bindex);
-		if (!au_br_writable(br->br_perm))
-			continue;
-
-		h_sb = au_sbr_sb(sb, bindex);
-		if (h_sb->s_op->sync_fs) {
-			e = h_sb->s_op->sync_fs(h_sb, wait);
-			if (unlikely(e && !err))
-				err = e;
-			/* go on even if an error happens */
-		}
-	}
-	si_read_unlock(sb);
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* final actions when unmounting a file system */
-static void aufs_put_super(struct super_block *sb)
-{
-	struct au_sbinfo *sbinfo;
-
-	sbinfo = au_sbi(sb);
-	if (!sbinfo)
-		return;
-
-	dbgaufs_si_fin(sbinfo);
-	kobject_put(&sbinfo->si_kobj);
-}
-
-/* ---------------------------------------------------------------------- */
-
-void *au_array_alloc(unsigned long long *hint, au_arraycb_t cb,
-		     struct super_block *sb, void *arg)
-{
-	void *array;
-	unsigned long long n, sz;
-
-	array = NULL;
-	n = 0;
-	if (!*hint)
-		goto out;
-
-	if (*hint > ULLONG_MAX / sizeof(array)) {
-		array = ERR_PTR(-EMFILE);
-		pr_err("hint %llu\n", *hint);
-		goto out;
-	}
-
-	sz = sizeof(array) * *hint;
-	array = kzalloc(sz, GFP_NOFS);
-	if (unlikely(!array))
-		array = vzalloc(sz);
-	if (unlikely(!array)) {
-		array = ERR_PTR(-ENOMEM);
-		goto out;
-	}
-
-	n = cb(sb, array, *hint, arg);
-	AuDebugOn(n > *hint);
-
-out:
-	*hint = n;
-	return array;
-}
-
-static unsigned long long au_iarray_cb(struct super_block *sb, void *a,
-				       unsigned long long max __maybe_unused,
-				       void *arg)
-{
-	unsigned long long n;
-	struct inode **p, *inode;
-	struct list_head *head;
-
-	n = 0;
-	p = a;
-	head = arg;
-	spin_lock(&sb->s_inode_list_lock);
-	list_for_each_entry(inode, head, i_sb_list) {
-		if (!is_bad_inode(inode)
-		    && au_ii(inode)->ii_bstart >= 0) {
-			spin_lock(&inode->i_lock);
-			if (atomic_read(&inode->i_count)) {
-				au_igrab(inode);
-				*p++ = inode;
-				n++;
-				AuDebugOn(n > max);
-			}
-			spin_unlock(&inode->i_lock);
-		}
-	}
-	spin_unlock(&sb->s_inode_list_lock);
-
-	return n;
-}
-
-struct inode **au_iarray_alloc(struct super_block *sb, unsigned long long *max)
-{
-	*max = atomic_long_read(&au_sbi(sb)->si_ninodes);
-	return au_array_alloc(max, au_iarray_cb, sb, &sb->s_inodes);
-}
-
-void au_iarray_free(struct inode **a, unsigned long long max)
-{
-	unsigned long long ull;
-
-	for (ull = 0; ull < max; ull++)
-		iput(a[ull]);
-	kvfree(a);
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * refresh dentry and inode at remount time.
- */
-/* todo: consolidate with simple_reval_dpath() and au_reval_for_attr() */
-static int au_do_refresh(struct dentry *dentry, unsigned int dir_flags,
-		      struct dentry *parent)
-{
-	int err;
-
-	di_write_lock_child(dentry);
-	di_read_lock_parent(parent, AuLock_IR);
-	err = au_refresh_dentry(dentry, parent);
-	if (!err && dir_flags)
-		au_hn_reset(d_inode(dentry), dir_flags);
-	di_read_unlock(parent, AuLock_IR);
-	di_write_unlock(dentry);
-
-	return err;
-}
-
-static int au_do_refresh_d(struct dentry *dentry, unsigned int sigen,
-			   struct au_sbinfo *sbinfo,
-			   const unsigned int dir_flags, unsigned int do_idop)
-{
-	int err;
-	struct dentry *parent;
-
-	err = 0;
-	parent = dget_parent(dentry);
-	if (!au_digen_test(parent, sigen) && au_digen_test(dentry, sigen)) {
-		if (d_really_is_positive(dentry)) {
-			if (!d_is_dir(dentry))
-				err = au_do_refresh(dentry, /*dir_flags*/0,
-						 parent);
-			else {
-				err = au_do_refresh(dentry, dir_flags, parent);
-				if (unlikely(err))
-					au_fset_si(sbinfo, FAILED_REFRESH_DIR);
-			}
-		} else
-			err = au_do_refresh(dentry, /*dir_flags*/0, parent);
-		AuDbgDentry(dentry);
-	}
-	dput(parent);
-
-	if (!err) {
-		if (do_idop)
-			au_refresh_dop(dentry, /*force_reval*/0);
-	} else
-		au_refresh_dop(dentry, /*force_reval*/1);
-
-	AuTraceErr(err);
-	return err;
-}
-
-static int au_refresh_d(struct super_block *sb, unsigned int do_idop)
-{
-	int err, i, j, ndentry, e;
-	unsigned int sigen;
-	struct au_dcsub_pages dpages;
-	struct au_dpage *dpage;
-	struct dentry **dentries, *d;
-	struct au_sbinfo *sbinfo;
-	struct dentry *root = sb->s_root;
-	const unsigned int dir_flags = au_hi_flags(d_inode(root), /*isdir*/1);
-
-	if (do_idop)
-		au_refresh_dop(root, /*force_reval*/0);
-
-	err = au_dpages_init(&dpages, GFP_NOFS);
-	if (unlikely(err))
-		goto out;
-	err = au_dcsub_pages(&dpages, root, NULL, NULL);
-	if (unlikely(err))
-		goto out_dpages;
-
-	sigen = au_sigen(sb);
-	sbinfo = au_sbi(sb);
-	for (i = 0; i < dpages.ndpage; i++) {
-		dpage = dpages.dpages + i;
-		dentries = dpage->dentries;
-		ndentry = dpage->ndentry;
-		for (j = 0; j < ndentry; j++) {
-			d = dentries[j];
-			e = au_do_refresh_d(d, sigen, sbinfo, dir_flags,
-					    do_idop);
-			if (unlikely(e && !err))
-				err = e;
-			/* go on even err */
-		}
-	}
-
-out_dpages:
-	au_dpages_free(&dpages);
-out:
-	return err;
-}
-
-static int au_refresh_i(struct super_block *sb, unsigned int do_idop)
-{
-	int err, e;
-	unsigned int sigen;
-	unsigned long long max, ull;
-	struct inode *inode, **array;
-
-	array = au_iarray_alloc(sb, &max);
-	err = PTR_ERR(array);
-	if (IS_ERR(array))
-		goto out;
-
-	err = 0;
-	sigen = au_sigen(sb);
-	for (ull = 0; ull < max; ull++) {
-		inode = array[ull];
-		if (unlikely(!inode))
-			break;
-
-		e = 0;
-		ii_write_lock_child(inode);
-		if (au_iigen(inode, NULL) != sigen) {
-			e = au_refresh_hinode_self(inode);
-			if (unlikely(e)) {
-				au_refresh_iop(inode, /*force_getattr*/1);
-				pr_err("error %d, i%lu\n", e, inode->i_ino);
-				if (!err)
-					err = e;
-				/* go on even if err */
-			}
-		}
-		if (!e && do_idop)
-			au_refresh_iop(inode, /*force_getattr*/0);
-		ii_write_unlock(inode);
-	}
-
-	au_iarray_free(array, max);
-
-out:
-	return err;
-}
-
-static void au_remount_refresh(struct super_block *sb, unsigned int do_idop)
-{
-	int err, e;
-	unsigned int udba;
-	aufs_bindex_t bindex, bend;
-	struct dentry *root;
-	struct inode *inode;
-	struct au_branch *br;
-	struct au_sbinfo *sbi;
-
-	au_sigen_inc(sb);
-	sbi = au_sbi(sb);
-	au_fclr_si(sbi, FAILED_REFRESH_DIR);
-
-	root = sb->s_root;
-	DiMustNoWaiters(root);
-	inode = d_inode(root);
-	IiMustNoWaiters(inode);
-
-	udba = au_opt_udba(sb);
-	bend = au_sbend(sb);
-	for (bindex = 0; bindex <= bend; bindex++) {
-		br = au_sbr(sb, bindex);
-		err = au_hnotify_reset_br(udba, br, br->br_perm);
-		if (unlikely(err))
-			AuIOErr("hnotify failed on br %d, %d, ignored\n",
-				bindex, err);
-		/* go on even if err */
-	}
-	au_hn_reset(inode, au_hi_flags(inode, /*isdir*/1));
-
-	if (do_idop) {
-		if (au_ftest_si(sbi, NO_DREVAL)) {
-			AuDebugOn(sb->s_d_op == &aufs_dop_noreval);
-			sb->s_d_op = &aufs_dop_noreval;
-			AuDebugOn(sbi->si_iop_array == aufs_iop_nogetattr);
-			sbi->si_iop_array = aufs_iop_nogetattr;
-		} else {
-			AuDebugOn(sb->s_d_op == &aufs_dop);
-			sb->s_d_op = &aufs_dop;
-			AuDebugOn(sbi->si_iop_array == aufs_iop);
-			sbi->si_iop_array = aufs_iop;
-		}
-		pr_info("reset to %pf and %pf\n",
-			sb->s_d_op, sbi->si_iop_array);
-	}
-
-	di_write_unlock(root);
-	err = au_refresh_d(sb, do_idop);
-	e = au_refresh_i(sb, do_idop);
-	if (unlikely(e && !err))
-		err = e;
-	/* aufs_write_lock() calls ..._child() */
-	di_write_lock_child(root);
-
-	au_cpup_attr_all(inode, /*force*/1);
-
-	if (unlikely(err))
-		AuIOErr("refresh failed, ignored, %d\n", err);
-}
-
-/* stop extra interpretation of errno in mount(8), and strange error messages */
-static int cvt_err(int err)
-{
-	AuTraceErr(err);
-
-	switch (err) {
-	case -ENOENT:
-	case -ENOTDIR:
-	case -EEXIST:
-	case -EIO:
-		err = -EINVAL;
-	}
-	return err;
-}
-
-static int aufs_remount_fs(struct super_block *sb, int *flags, char *data)
-{
-	int err, do_dx;
-	unsigned int mntflags;
-	struct au_opts opts = {
-		.opt = NULL
-	};
-	struct dentry *root;
-	struct inode *inode;
-	struct au_sbinfo *sbinfo;
-
-	err = 0;
-	root = sb->s_root;
-	if (!data || !*data) {
-		err = si_write_lock(sb, AuLock_FLUSH | AuLock_NOPLM);
-		if (!err) {
-			di_write_lock_child(root);
-			err = au_opts_verify(sb, *flags, /*pending*/0);
-			aufs_write_unlock(root);
-		}
-		goto out;
-	}
-
-	err = -ENOMEM;
-	opts.opt = (void *)__get_free_page(GFP_NOFS);
-	if (unlikely(!opts.opt))
-		goto out;
-	opts.max_opt = PAGE_SIZE / sizeof(*opts.opt);
-	opts.flags = AuOpts_REMOUNT;
-	opts.sb_flags = *flags;
-
-	/* parse it before aufs lock */
-	err = au_opts_parse(sb, data, &opts);
-	if (unlikely(err))
-		goto out_opts;
-
-	sbinfo = au_sbi(sb);
-	inode = d_inode(root);
-	mutex_lock(&inode->i_mutex);
-	err = si_write_lock(sb, AuLock_FLUSH | AuLock_NOPLM);
-	if (unlikely(err))
-		goto out_mtx;
-	di_write_lock_child(root);
-
-	/* au_opts_remount() may return an error */
-	err = au_opts_remount(sb, &opts);
-	au_opts_free(&opts);
-
-	if (au_ftest_opts(opts.flags, REFRESH))
-		au_remount_refresh(sb, au_ftest_opts(opts.flags, REFRESH_IDOP));
-
-	if (au_ftest_opts(opts.flags, REFRESH_DYAOP)) {
-		mntflags = au_mntflags(sb);
-		do_dx = !!au_opt_test(mntflags, DIO);
-		au_dy_arefresh(do_dx);
-	}
-
-	au_fhsm_wrote_all(sb, /*force*/1); /* ?? */
-	aufs_write_unlock(root);
-
-out_mtx:
-	mutex_unlock(&inode->i_mutex);
-out_opts:
-	free_page((unsigned long)opts.opt);
-out:
-	err = cvt_err(err);
-	AuTraceErr(err);
-	return err;
-}
-
-static const struct super_operations aufs_sop = {
-	.alloc_inode	= aufs_alloc_inode,
-	.destroy_inode	= aufs_destroy_inode,
-	/* always deleting, no clearing */
-	.drop_inode	= generic_delete_inode,
-	.show_options	= aufs_show_options,
-	.statfs		= aufs_statfs,
-	.put_super	= aufs_put_super,
-	.sync_fs	= aufs_sync_fs,
-	.remount_fs	= aufs_remount_fs
-};
-
-/* ---------------------------------------------------------------------- */
-
-static int alloc_root(struct super_block *sb)
-{
-	int err;
-	struct inode *inode;
-	struct dentry *root;
-
-	err = -ENOMEM;
-	inode = au_iget_locked(sb, AUFS_ROOT_INO);
-	err = PTR_ERR(inode);
-	if (IS_ERR(inode))
-		goto out;
-
-	inode->i_op = aufs_iop + AuIop_DIR; /* with getattr by default */
-	inode->i_fop = &aufs_dir_fop;
-	inode->i_mode = S_IFDIR;
-	set_nlink(inode, 2);
-	unlock_new_inode(inode);
-
-	root = d_make_root(inode);
-	if (unlikely(!root))
-		goto out;
-	err = PTR_ERR(root);
-	if (IS_ERR(root))
-		goto out;
-
-	err = au_di_init(root);
-	if (!err) {
-		sb->s_root = root;
-		return 0; /* success */
-	}
-	dput(root);
-
-out:
-	return err;
-}
-
-static int aufs_fill_super(struct super_block *sb, void *raw_data,
-			   int silent __maybe_unused)
-{
-	int err;
-	struct au_opts opts = {
-		.opt = NULL
-	};
-	struct au_sbinfo *sbinfo;
-	struct dentry *root;
-	struct inode *inode;
-	char *arg = raw_data;
-
-	if (unlikely(!arg || !*arg)) {
-		err = -EINVAL;
-		pr_err("no arg\n");
-		goto out;
-	}
-
-	err = -ENOMEM;
-	opts.opt = (void *)__get_free_page(GFP_NOFS);
-	if (unlikely(!opts.opt))
-		goto out;
-	opts.max_opt = PAGE_SIZE / sizeof(*opts.opt);
-	opts.sb_flags = sb->s_flags;
-
-	err = au_si_alloc(sb);
-	if (unlikely(err))
-		goto out_opts;
-	sbinfo = au_sbi(sb);
-
-	/* all timestamps always follow the ones on the branch */
-	sb->s_flags |= MS_NOATIME | MS_NODIRATIME;
-	sb->s_op = &aufs_sop;
-	sb->s_d_op = &aufs_dop;
-	sb->s_magic = AUFS_SUPER_MAGIC;
-	sb->s_maxbytes = 0;
-	sb->s_stack_depth = 1;
-	au_export_init(sb);
-	/* au_xattr_init(sb); */
-
-	err = alloc_root(sb);
-	if (unlikely(err)) {
-		si_write_unlock(sb);
-		goto out_info;
-	}
-	root = sb->s_root;
-	inode = d_inode(root);
-
-	/*
-	 * actually we can parse options regardless aufs lock here.
-	 * but at remount time, parsing must be done before aufs lock.
-	 * so we follow the same rule.
-	 */
-	ii_write_lock_parent(inode);
-	aufs_write_unlock(root);
-	err = au_opts_parse(sb, arg, &opts);
-	if (unlikely(err))
-		goto out_root;
-
-	/* lock vfs_inode first, then aufs. */
-	mutex_lock(&inode->i_mutex);
-	aufs_write_lock(root);
-	err = au_opts_mount(sb, &opts);
-	au_opts_free(&opts);
-	if (!err && au_ftest_si(sbinfo, NO_DREVAL)) {
-		sb->s_d_op = &aufs_dop_noreval;
-		pr_info("%pf\n", sb->s_d_op);
-		au_refresh_dop(root, /*force_reval*/0);
-		sbinfo->si_iop_array = aufs_iop_nogetattr;
-		au_refresh_iop(inode, /*force_getattr*/0);
-	}
-	aufs_write_unlock(root);
-	mutex_unlock(&inode->i_mutex);
-	if (!err)
-		goto out_opts; /* success */
-
-out_root:
-	dput(root);
-	sb->s_root = NULL;
-out_info:
-	dbgaufs_si_fin(sbinfo);
-	kobject_put(&sbinfo->si_kobj);
-	sb->s_fs_info = NULL;
-out_opts:
-	free_page((unsigned long)opts.opt);
-out:
-	AuTraceErr(err);
-	err = cvt_err(err);
-	AuTraceErr(err);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static struct dentry *aufs_mount(struct file_system_type *fs_type, int flags,
-				 const char *dev_name __maybe_unused,
-				 void *raw_data)
-{
-	struct dentry *root;
-	struct super_block *sb;
-
-	/* all timestamps always follow the ones on the branch */
-	/* mnt->mnt_flags |= MNT_NOATIME | MNT_NODIRATIME; */
-	root = mount_nodev(fs_type, flags, raw_data, aufs_fill_super);
-	if (IS_ERR(root))
-		goto out;
-
-	sb = root->d_sb;
-	si_write_lock(sb, !AuLock_FLUSH);
-	sysaufs_brs_add(sb, 0);
-	si_write_unlock(sb);
-	au_sbilist_add(sb);
-
-out:
-	return root;
-}
-
-static void aufs_kill_sb(struct super_block *sb)
-{
-	struct au_sbinfo *sbinfo;
-
-	sbinfo = au_sbi(sb);
-	if (sbinfo) {
-		au_sbilist_del(sb);
-		aufs_write_lock(sb->s_root);
-		au_fhsm_fin(sb);
-		if (sbinfo->si_wbr_create_ops->fin)
-			sbinfo->si_wbr_create_ops->fin(sb);
-		if (au_opt_test(sbinfo->si_mntflags, UDBA_HNOTIFY)) {
-			au_opt_set_udba(sbinfo->si_mntflags, UDBA_NONE);
-			au_remount_refresh(sb, /*do_idop*/0);
-		}
-		if (au_opt_test(sbinfo->si_mntflags, PLINK))
-			au_plink_put(sb, /*verbose*/1);
-		au_xino_clr(sb);
-		sbinfo->si_sb = NULL;
-		aufs_write_unlock(sb->s_root);
-		au_nwt_flush(&sbinfo->si_nowait);
-	}
-	kill_anon_super(sb);
-}
-
-struct file_system_type aufs_fs_type = {
-	.name		= AUFS_FSTYPE,
-	/* a race between rename and others */
-	.fs_flags	= FS_RENAME_DOES_D_MOVE,
-	.mount		= aufs_mount,
-	.kill_sb	= aufs_kill_sb,
-	/* no need to __module_get() and module_put(). */
-	.owner		= THIS_MODULE,
-};
diff --git a/fs/aufs/super.h b/fs/aufs/super.h
deleted file mode 100644
index 2761df917..000000000
--- a/fs/aufs/super.h
+++ /dev/null
@@ -1,628 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * super_block operations
- */
-
-#ifndef __AUFS_SUPER_H__
-#define __AUFS_SUPER_H__
-
-#ifdef __KERNEL__
-
-#include <linux/fs.h>
-#include <linux/kobject.h>
-#include "rwsem.h"
-#include "spl.h"
-#include "wkq.h"
-
-/* policies to select one among multiple writable branches */
-struct au_wbr_copyup_operations {
-	int (*copyup)(struct dentry *dentry);
-};
-
-#define AuWbr_DIR	1		/* target is a dir */
-#define AuWbr_PARENT	(1 << 1)	/* always require a parent */
-
-#define au_ftest_wbr(flags, name)	((flags) & AuWbr_##name)
-#define au_fset_wbr(flags, name)	{ (flags) |= AuWbr_##name; }
-#define au_fclr_wbr(flags, name)	{ (flags) &= ~AuWbr_##name; }
-
-struct au_wbr_create_operations {
-	int (*create)(struct dentry *dentry, unsigned int flags);
-	int (*init)(struct super_block *sb);
-	int (*fin)(struct super_block *sb);
-};
-
-struct au_wbr_mfs {
-	struct mutex	mfs_lock; /* protect this structure */
-	unsigned long	mfs_jiffy;
-	unsigned long	mfs_expire;
-	aufs_bindex_t	mfs_bindex;
-
-	unsigned long long	mfsrr_bytes;
-	unsigned long long	mfsrr_watermark;
-};
-
-struct pseudo_link {
-	union {
-		struct hlist_node hlist;
-		struct rcu_head rcu;
-	};
-	struct inode *inode;
-};
-
-#define AuPlink_NHASH 100
-static inline int au_plink_hash(ino_t ino)
-{
-	return ino % AuPlink_NHASH;
-}
-
-/* File-based Hierarchical Storage Management */
-struct au_fhsm {
-#ifdef CONFIG_AUFS_FHSM
-	/* allow only one process who can receive the notification */
-	spinlock_t		fhsm_spin;
-	pid_t			fhsm_pid;
-	wait_queue_head_t	fhsm_wqh;
-	atomic_t		fhsm_readable;
-
-	/* these are protected by si_rwsem */
-	unsigned long		fhsm_expire;
-	aufs_bindex_t		fhsm_bottom;
-#endif
-};
-
-struct au_branch;
-struct au_sbinfo {
-	/* nowait tasks in the system-wide workqueue */
-	struct au_nowait_tasks	si_nowait;
-
-	/*
-	 * tried sb->s_umount, but failed due to the dependecy between i_mutex.
-	 * rwsem for au_sbinfo is necessary.
-	 */
-	struct au_rwsem		si_rwsem;
-
-	/* prevent recursive locking in deleting inode */
-	struct {
-		unsigned long		*bitmap;
-		spinlock_t		tree_lock;
-		struct radix_tree_root	tree;
-	} au_si_pid;
-
-	/*
-	 * dirty approach to protect sb->sb_inodes and ->s_files (gone) from
-	 * remount.
-	 */
-	atomic_long_t		si_ninodes, si_nfiles;
-
-	/* branch management */
-	unsigned int		si_generation;
-
-	/* see AuSi_ flags */
-	unsigned char		au_si_status;
-
-	aufs_bindex_t		si_bend;
-
-	/* dirty trick to keep br_id plus */
-	unsigned int		si_last_br_id :
-				sizeof(aufs_bindex_t) * BITS_PER_BYTE - 1;
-	struct au_branch	**si_branch;
-
-	/* policy to select a writable branch */
-	unsigned char		si_wbr_copyup;
-	unsigned char		si_wbr_create;
-	struct au_wbr_copyup_operations *si_wbr_copyup_ops;
-	struct au_wbr_create_operations *si_wbr_create_ops;
-
-	/* round robin */
-	atomic_t		si_wbr_rr_next;
-
-	/* most free space */
-	struct au_wbr_mfs	si_wbr_mfs;
-
-	/* File-based Hierarchical Storage Management */
-	struct au_fhsm		si_fhsm;
-
-	/* mount flags */
-	/* include/asm-ia64/siginfo.h defines a macro named si_flags */
-	unsigned int		si_mntflags;
-
-	/* symlink to follow_link() and put_link() */
-	struct au_sphlhead	si_symlink;
-
-	/* external inode number (bitmap and translation table) */
-	vfs_readf_t		si_xread;
-	vfs_writef_t		si_xwrite;
-	struct file		*si_xib;
-	struct mutex		si_xib_mtx; /* protect xib members */
-	unsigned long		*si_xib_buf;
-	unsigned long		si_xib_last_pindex;
-	int			si_xib_next_bit;
-	aufs_bindex_t		si_xino_brid;
-	unsigned long		si_xino_jiffy;
-	unsigned long		si_xino_expire;
-	/* reserved for future use */
-	/* unsigned long long	si_xib_limit; */	/* Max xib file size */
-
-#ifdef CONFIG_AUFS_EXPORT
-	/* i_generation */
-	struct file		*si_xigen;
-	atomic_t		si_xigen_next;
-#endif
-
-	/* dirty trick to suppoer atomic_open */
-	struct au_sphlhead	si_aopen;
-
-	/* vdir parameters */
-	unsigned long		si_rdcache;	/* max cache time in jiffies */
-	unsigned int		si_rdblk;	/* deblk size */
-	unsigned int		si_rdhash;	/* hash size */
-
-	/*
-	 * If the number of whiteouts are larger than si_dirwh, leave all of
-	 * them after au_whtmp_ren to reduce the cost of rmdir(2).
-	 * future fsck.aufs or kernel thread will remove them later.
-	 * Otherwise, remove all whiteouts and the dir in rmdir(2).
-	 */
-	unsigned int		si_dirwh;
-
-	/* pseudo_link list */
-	struct au_sphlhead	si_plink[AuPlink_NHASH];
-	wait_queue_head_t	si_plink_wq;
-	spinlock_t		si_plink_maint_lock;
-	pid_t			si_plink_maint_pid;
-
-	/* file list */
-	struct au_sphlhead	si_files;
-
-	/* with/without getattr, brother of sb->s_d_op */
-	struct inode_operations *si_iop_array;
-
-	/*
-	 * sysfs and lifetime management.
-	 * this is not a small structure and it may be a waste of memory in case
-	 * of sysfs is disabled, particulary when many aufs-es are mounted.
-	 * but using sysfs is majority.
-	 */
-	struct kobject		si_kobj;
-#ifdef CONFIG_DEBUG_FS
-	struct dentry		 *si_dbgaufs;
-	struct dentry		 *si_dbgaufs_plink;
-	struct dentry		 *si_dbgaufs_xib;
-#ifdef CONFIG_AUFS_EXPORT
-	struct dentry		 *si_dbgaufs_xigen;
-#endif
-#endif
-
-#ifdef CONFIG_AUFS_SBILIST
-	struct list_head	si_list;
-#endif
-
-	/* dirty, necessary for unmounting, sysfs and sysrq */
-	struct super_block	*si_sb;
-};
-
-/* sbinfo status flags */
-/*
- * set true when refresh_dirs() failed at remount time.
- * then try refreshing dirs at access time again.
- * if it is false, refreshing dirs at access time is unnecesary
- */
-#define AuSi_FAILED_REFRESH_DIR	1
-#define AuSi_FHSM		(1 << 1)	/* fhsm is active now */
-#define AuSi_NO_DREVAL		(1 << 2)	/* disable all d_revalidate */
-
-#ifndef CONFIG_AUFS_FHSM
-#undef AuSi_FHSM
-#define AuSi_FHSM		0
-#endif
-
-static inline unsigned char au_do_ftest_si(struct au_sbinfo *sbi,
-					   unsigned int flag)
-{
-	AuRwMustAnyLock(&sbi->si_rwsem);
-	return sbi->au_si_status & flag;
-}
-#define au_ftest_si(sbinfo, name)	au_do_ftest_si(sbinfo, AuSi_##name)
-#define au_fset_si(sbinfo, name) do { \
-	AuRwMustWriteLock(&(sbinfo)->si_rwsem); \
-	(sbinfo)->au_si_status |= AuSi_##name; \
-} while (0)
-#define au_fclr_si(sbinfo, name) do { \
-	AuRwMustWriteLock(&(sbinfo)->si_rwsem); \
-	(sbinfo)->au_si_status &= ~AuSi_##name; \
-} while (0)
-
-/* ---------------------------------------------------------------------- */
-
-/* policy to select one among writable branches */
-#define AuWbrCopyup(sbinfo, ...) \
-	((sbinfo)->si_wbr_copyup_ops->copyup(__VA_ARGS__))
-#define AuWbrCreate(sbinfo, ...) \
-	((sbinfo)->si_wbr_create_ops->create(__VA_ARGS__))
-
-/* flags for si_read_lock()/aufs_read_lock()/di_read_lock() */
-#define AuLock_DW		1		/* write-lock dentry */
-#define AuLock_IR		(1 << 1)	/* read-lock inode */
-#define AuLock_IW		(1 << 2)	/* write-lock inode */
-#define AuLock_FLUSH		(1 << 3)	/* wait for 'nowait' tasks */
-#define AuLock_DIRS		(1 << 4)	/* target is a pair of dirs */
-#define AuLock_NOPLM		(1 << 5)	/* return err in plm mode */
-#define AuLock_NOPLMW		(1 << 6)	/* wait for plm mode ends */
-#define AuLock_GEN		(1 << 7)	/* test digen/iigen */
-#define au_ftest_lock(flags, name)	((flags) & AuLock_##name)
-#define au_fset_lock(flags, name) \
-	do { (flags) |= AuLock_##name; } while (0)
-#define au_fclr_lock(flags, name) \
-	do { (flags) &= ~AuLock_##name; } while (0)
-
-/* ---------------------------------------------------------------------- */
-
-/* super.c */
-extern struct file_system_type aufs_fs_type;
-struct inode *au_iget_locked(struct super_block *sb, ino_t ino);
-typedef unsigned long long (*au_arraycb_t)(struct super_block *sb, void *array,
-					   unsigned long long max, void *arg);
-void *au_array_alloc(unsigned long long *hint, au_arraycb_t cb,
-		     struct super_block *sb, void *arg);
-struct inode **au_iarray_alloc(struct super_block *sb, unsigned long long *max);
-void au_iarray_free(struct inode **a, unsigned long long max);
-
-/* sbinfo.c */
-void au_si_free(struct kobject *kobj);
-int au_si_alloc(struct super_block *sb);
-int au_sbr_realloc(struct au_sbinfo *sbinfo, int nbr);
-
-unsigned int au_sigen_inc(struct super_block *sb);
-aufs_bindex_t au_new_br_id(struct super_block *sb);
-
-int si_read_lock(struct super_block *sb, int flags);
-int si_write_lock(struct super_block *sb, int flags);
-int aufs_read_lock(struct dentry *dentry, int flags);
-void aufs_read_unlock(struct dentry *dentry, int flags);
-void aufs_write_lock(struct dentry *dentry);
-void aufs_write_unlock(struct dentry *dentry);
-int aufs_read_and_write_lock2(struct dentry *d1, struct dentry *d2, int flags);
-void aufs_read_and_write_unlock2(struct dentry *d1, struct dentry *d2);
-
-int si_pid_test_slow(struct super_block *sb);
-void si_pid_set_slow(struct super_block *sb);
-void si_pid_clr_slow(struct super_block *sb);
-
-/* wbr_policy.c */
-extern struct au_wbr_copyup_operations au_wbr_copyup_ops[];
-extern struct au_wbr_create_operations au_wbr_create_ops[];
-int au_cpdown_dirs(struct dentry *dentry, aufs_bindex_t bdst);
-int au_wbr_nonopq(struct dentry *dentry, aufs_bindex_t bindex);
-int au_wbr_do_copyup_bu(struct dentry *dentry, aufs_bindex_t bstart);
-
-/* mvdown.c */
-int au_mvdown(struct dentry *dentry, struct aufs_mvdown __user *arg);
-
-#ifdef CONFIG_AUFS_FHSM
-/* fhsm.c */
-
-static inline pid_t au_fhsm_pid(struct au_fhsm *fhsm)
-{
-	pid_t pid;
-
-	spin_lock(&fhsm->fhsm_spin);
-	pid = fhsm->fhsm_pid;
-	spin_unlock(&fhsm->fhsm_spin);
-
-	return pid;
-}
-
-void au_fhsm_wrote(struct super_block *sb, aufs_bindex_t bindex, int force);
-void au_fhsm_wrote_all(struct super_block *sb, int force);
-int au_fhsm_fd(struct super_block *sb, int oflags);
-int au_fhsm_br_alloc(struct au_branch *br);
-void au_fhsm_set_bottom(struct super_block *sb, aufs_bindex_t bindex);
-void au_fhsm_fin(struct super_block *sb);
-void au_fhsm_init(struct au_sbinfo *sbinfo);
-void au_fhsm_set(struct au_sbinfo *sbinfo, unsigned int sec);
-void au_fhsm_show(struct seq_file *seq, struct au_sbinfo *sbinfo);
-#else
-AuStubVoid(au_fhsm_wrote, struct super_block *sb, aufs_bindex_t bindex,
-	   int force)
-AuStubVoid(au_fhsm_wrote_all, struct super_block *sb, int force)
-AuStub(int, au_fhsm_fd, return -EOPNOTSUPP, struct super_block *sb, int oflags)
-AuStub(pid_t, au_fhsm_pid, return 0, struct au_fhsm *fhsm)
-AuStubInt0(au_fhsm_br_alloc, struct au_branch *br)
-AuStubVoid(au_fhsm_set_bottom, struct super_block *sb, aufs_bindex_t bindex)
-AuStubVoid(au_fhsm_fin, struct super_block *sb)
-AuStubVoid(au_fhsm_init, struct au_sbinfo *sbinfo)
-AuStubVoid(au_fhsm_set, struct au_sbinfo *sbinfo, unsigned int sec)
-AuStubVoid(au_fhsm_show, struct seq_file *seq, struct au_sbinfo *sbinfo)
-#endif
-
-/* ---------------------------------------------------------------------- */
-
-static inline struct au_sbinfo *au_sbi(struct super_block *sb)
-{
-	return sb->s_fs_info;
-}
-
-/* ---------------------------------------------------------------------- */
-
-#ifdef CONFIG_AUFS_EXPORT
-int au_test_nfsd(void);
-void au_export_init(struct super_block *sb);
-void au_xigen_inc(struct inode *inode);
-int au_xigen_new(struct inode *inode);
-int au_xigen_set(struct super_block *sb, struct file *base);
-void au_xigen_clr(struct super_block *sb);
-
-static inline int au_busy_or_stale(void)
-{
-	if (!au_test_nfsd())
-		return -EBUSY;
-	return -ESTALE;
-}
-#else
-AuStubInt0(au_test_nfsd, void)
-AuStubVoid(au_export_init, struct super_block *sb)
-AuStubVoid(au_xigen_inc, struct inode *inode)
-AuStubInt0(au_xigen_new, struct inode *inode)
-AuStubInt0(au_xigen_set, struct super_block *sb, struct file *base)
-AuStubVoid(au_xigen_clr, struct super_block *sb)
-AuStub(int, au_busy_or_stale, return -EBUSY, void)
-#endif /* CONFIG_AUFS_EXPORT */
-
-/* ---------------------------------------------------------------------- */
-
-#ifdef CONFIG_AUFS_SBILIST
-/* module.c */
-extern struct au_splhead au_sbilist;
-
-static inline void au_sbilist_init(void)
-{
-	au_spl_init(&au_sbilist);
-}
-
-static inline void au_sbilist_add(struct super_block *sb)
-{
-	au_spl_add(&au_sbi(sb)->si_list, &au_sbilist);
-}
-
-static inline void au_sbilist_del(struct super_block *sb)
-{
-	au_spl_del(&au_sbi(sb)->si_list, &au_sbilist);
-}
-
-#ifdef CONFIG_AUFS_MAGIC_SYSRQ
-static inline void au_sbilist_lock(void)
-{
-	spin_lock(&au_sbilist.spin);
-}
-
-static inline void au_sbilist_unlock(void)
-{
-	spin_unlock(&au_sbilist.spin);
-}
-#define AuGFP_SBILIST	GFP_ATOMIC
-#else
-AuStubVoid(au_sbilist_lock, void)
-AuStubVoid(au_sbilist_unlock, void)
-#define AuGFP_SBILIST	GFP_NOFS
-#endif /* CONFIG_AUFS_MAGIC_SYSRQ */
-#else
-AuStubVoid(au_sbilist_init, void)
-AuStubVoid(au_sbilist_add, struct super_block *sb)
-AuStubVoid(au_sbilist_del, struct super_block *sb)
-AuStubVoid(au_sbilist_lock, void)
-AuStubVoid(au_sbilist_unlock, void)
-#define AuGFP_SBILIST	GFP_NOFS
-#endif
-
-/* ---------------------------------------------------------------------- */
-
-static inline void dbgaufs_si_null(struct au_sbinfo *sbinfo)
-{
-	/*
-	 * This function is a dynamic '__init' function actually,
-	 * so the tiny check for si_rwsem is unnecessary.
-	 */
-	/* AuRwMustWriteLock(&sbinfo->si_rwsem); */
-#ifdef CONFIG_DEBUG_FS
-	sbinfo->si_dbgaufs = NULL;
-	sbinfo->si_dbgaufs_plink = NULL;
-	sbinfo->si_dbgaufs_xib = NULL;
-#ifdef CONFIG_AUFS_EXPORT
-	sbinfo->si_dbgaufs_xigen = NULL;
-#endif
-#endif
-}
-
-/* ---------------------------------------------------------------------- */
-
-static inline pid_t si_pid_bit(void)
-{
-	/* the origin of pid is 1, but the bitmap's is 0 */
-	return current->pid - 1;
-}
-
-static inline int si_pid_test(struct super_block *sb)
-{
-	pid_t bit;
-
-	bit = si_pid_bit();
-	if (bit < PID_MAX_DEFAULT)
-		return test_bit(bit, au_sbi(sb)->au_si_pid.bitmap);
-	return si_pid_test_slow(sb);
-}
-
-static inline void si_pid_set(struct super_block *sb)
-{
-	pid_t bit;
-
-	bit = si_pid_bit();
-	if (bit < PID_MAX_DEFAULT) {
-		AuDebugOn(test_bit(bit, au_sbi(sb)->au_si_pid.bitmap));
-		set_bit(bit, au_sbi(sb)->au_si_pid.bitmap);
-		/* smp_mb(); */
-	} else
-		si_pid_set_slow(sb);
-}
-
-static inline void si_pid_clr(struct super_block *sb)
-{
-	pid_t bit;
-
-	bit = si_pid_bit();
-	if (bit < PID_MAX_DEFAULT) {
-		AuDebugOn(!test_bit(bit, au_sbi(sb)->au_si_pid.bitmap));
-		clear_bit(bit, au_sbi(sb)->au_si_pid.bitmap);
-		/* smp_mb(); */
-	} else
-		si_pid_clr_slow(sb);
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* lock superblock. mainly for entry point functions */
-/*
- * __si_read_lock, __si_write_lock,
- * __si_read_unlock, __si_write_unlock, __si_downgrade_lock
- */
-AuSimpleRwsemFuncs(__si, struct super_block *sb, &au_sbi(sb)->si_rwsem);
-
-#define SiMustNoWaiters(sb)	AuRwMustNoWaiters(&au_sbi(sb)->si_rwsem)
-#define SiMustAnyLock(sb)	AuRwMustAnyLock(&au_sbi(sb)->si_rwsem)
-#define SiMustWriteLock(sb)	AuRwMustWriteLock(&au_sbi(sb)->si_rwsem)
-
-static inline void si_noflush_read_lock(struct super_block *sb)
-{
-	__si_read_lock(sb);
-	si_pid_set(sb);
-}
-
-static inline int si_noflush_read_trylock(struct super_block *sb)
-{
-	int locked;
-
-	locked = __si_read_trylock(sb);
-	if (locked)
-		si_pid_set(sb);
-	return locked;
-}
-
-static inline void si_noflush_write_lock(struct super_block *sb)
-{
-	__si_write_lock(sb);
-	si_pid_set(sb);
-}
-
-static inline int si_noflush_write_trylock(struct super_block *sb)
-{
-	int locked;
-
-	locked = __si_write_trylock(sb);
-	if (locked)
-		si_pid_set(sb);
-	return locked;
-}
-
-#if 0 /* reserved */
-static inline int si_read_trylock(struct super_block *sb, int flags)
-{
-	if (au_ftest_lock(flags, FLUSH))
-		au_nwt_flush(&au_sbi(sb)->si_nowait);
-	return si_noflush_read_trylock(sb);
-}
-#endif
-
-static inline void si_read_unlock(struct super_block *sb)
-{
-	si_pid_clr(sb);
-	__si_read_unlock(sb);
-}
-
-#if 0 /* reserved */
-static inline int si_write_trylock(struct super_block *sb, int flags)
-{
-	if (au_ftest_lock(flags, FLUSH))
-		au_nwt_flush(&au_sbi(sb)->si_nowait);
-	return si_noflush_write_trylock(sb);
-}
-#endif
-
-static inline void si_write_unlock(struct super_block *sb)
-{
-	si_pid_clr(sb);
-	__si_write_unlock(sb);
-}
-
-#if 0 /* reserved */
-static inline void si_downgrade_lock(struct super_block *sb)
-{
-	__si_downgrade_lock(sb);
-}
-#endif
-
-/* ---------------------------------------------------------------------- */
-
-static inline aufs_bindex_t au_sbend(struct super_block *sb)
-{
-	SiMustAnyLock(sb);
-	return au_sbi(sb)->si_bend;
-}
-
-static inline unsigned int au_mntflags(struct super_block *sb)
-{
-	SiMustAnyLock(sb);
-	return au_sbi(sb)->si_mntflags;
-}
-
-static inline unsigned int au_sigen(struct super_block *sb)
-{
-	SiMustAnyLock(sb);
-	return au_sbi(sb)->si_generation;
-}
-
-static inline void au_ninodes_inc(struct super_block *sb)
-{
-	atomic_long_inc(&au_sbi(sb)->si_ninodes);
-}
-
-static inline void au_ninodes_dec(struct super_block *sb)
-{
-	AuDebugOn(!atomic_long_read(&au_sbi(sb)->si_ninodes));
-	atomic_long_dec(&au_sbi(sb)->si_ninodes);
-}
-
-static inline void au_nfiles_inc(struct super_block *sb)
-{
-	atomic_long_inc(&au_sbi(sb)->si_nfiles);
-}
-
-static inline void au_nfiles_dec(struct super_block *sb)
-{
-	AuDebugOn(!atomic_long_read(&au_sbi(sb)->si_nfiles));
-	atomic_long_dec(&au_sbi(sb)->si_nfiles);
-}
-
-static inline struct au_branch *au_sbr(struct super_block *sb,
-				       aufs_bindex_t bindex)
-{
-	SiMustAnyLock(sb);
-	return au_sbi(sb)->si_branch[0 + bindex];
-}
-
-static inline void au_xino_brid_set(struct super_block *sb, aufs_bindex_t brid)
-{
-	SiMustWriteLock(sb);
-	au_sbi(sb)->si_xino_brid = brid;
-}
-
-static inline aufs_bindex_t au_xino_brid(struct super_block *sb)
-{
-	SiMustAnyLock(sb);
-	return au_sbi(sb)->si_xino_brid;
-}
-
-#endif /* __KERNEL__ */
-#endif /* __AUFS_SUPER_H__ */
diff --git a/fs/aufs/sysaufs.c b/fs/aufs/sysaufs.c
deleted file mode 100644
index 8ec10fb31..000000000
--- a/fs/aufs/sysaufs.c
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * sysfs interface and lifetime management
- * they are necessary regardless sysfs is disabled.
- */
-
-#include <linux/random.h>
-#include "aufs.h"
-
-unsigned long sysaufs_si_mask;
-struct kset *sysaufs_kset;
-
-#define AuSiAttr(_name) { \
-	.attr   = { .name = __stringify(_name), .mode = 0444 },	\
-	.show   = sysaufs_si_##_name,				\
-}
-
-static struct sysaufs_si_attr sysaufs_si_attr_xi_path = AuSiAttr(xi_path);
-struct attribute *sysaufs_si_attrs[] = {
-	&sysaufs_si_attr_xi_path.attr,
-	NULL,
-};
-
-static const struct sysfs_ops au_sbi_ops = {
-	.show   = sysaufs_si_show
-};
-
-static struct kobj_type au_sbi_ktype = {
-	.release	= au_si_free,
-	.sysfs_ops	= &au_sbi_ops,
-	.default_attrs	= sysaufs_si_attrs
-};
-
-/* ---------------------------------------------------------------------- */
-
-int sysaufs_si_init(struct au_sbinfo *sbinfo)
-{
-	int err;
-
-	sbinfo->si_kobj.kset = sysaufs_kset;
-	/* cf. sysaufs_name() */
-	err = kobject_init_and_add
-		(&sbinfo->si_kobj, &au_sbi_ktype, /*&sysaufs_kset->kobj*/NULL,
-		 SysaufsSiNamePrefix "%lx", sysaufs_si_id(sbinfo));
-
-	dbgaufs_si_null(sbinfo);
-	if (!err) {
-		err = dbgaufs_si_init(sbinfo);
-		if (unlikely(err))
-			kobject_put(&sbinfo->si_kobj);
-	}
-	return err;
-}
-
-void sysaufs_fin(void)
-{
-	dbgaufs_fin();
-	sysfs_remove_group(&sysaufs_kset->kobj, sysaufs_attr_group);
-	kset_unregister(sysaufs_kset);
-}
-
-int __init sysaufs_init(void)
-{
-	int err;
-
-	do {
-		get_random_bytes(&sysaufs_si_mask, sizeof(sysaufs_si_mask));
-	} while (!sysaufs_si_mask);
-
-	err = -EINVAL;
-	sysaufs_kset = kset_create_and_add(AUFS_NAME, NULL, fs_kobj);
-	if (unlikely(!sysaufs_kset))
-		goto out;
-	err = PTR_ERR(sysaufs_kset);
-	if (IS_ERR(sysaufs_kset))
-		goto out;
-	err = sysfs_create_group(&sysaufs_kset->kobj, sysaufs_attr_group);
-	if (unlikely(err)) {
-		kset_unregister(sysaufs_kset);
-		goto out;
-	}
-
-	err = dbgaufs_init();
-	if (unlikely(err))
-		sysaufs_fin();
-out:
-	return err;
-}
diff --git a/fs/aufs/sysaufs.h b/fs/aufs/sysaufs.h
deleted file mode 100644
index 1f799835e..000000000
--- a/fs/aufs/sysaufs.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * sysfs interface and mount lifetime management
- */
-
-#ifndef __SYSAUFS_H__
-#define __SYSAUFS_H__
-
-#ifdef __KERNEL__
-
-#include <linux/sysfs.h>
-#include "module.h"
-
-struct super_block;
-struct au_sbinfo;
-
-struct sysaufs_si_attr {
-	struct attribute attr;
-	int (*show)(struct seq_file *seq, struct super_block *sb);
-};
-
-/* ---------------------------------------------------------------------- */
-
-/* sysaufs.c */
-extern unsigned long sysaufs_si_mask;
-extern struct kset *sysaufs_kset;
-extern struct attribute *sysaufs_si_attrs[];
-int sysaufs_si_init(struct au_sbinfo *sbinfo);
-int __init sysaufs_init(void);
-void sysaufs_fin(void);
-
-/* ---------------------------------------------------------------------- */
-
-/* some people doesn't like to show a pointer in kernel */
-static inline unsigned long sysaufs_si_id(struct au_sbinfo *sbinfo)
-{
-	return sysaufs_si_mask ^ (unsigned long)sbinfo;
-}
-
-#define SysaufsSiNamePrefix	"si_"
-#define SysaufsSiNameLen	(sizeof(SysaufsSiNamePrefix) + 16)
-static inline void sysaufs_name(struct au_sbinfo *sbinfo, char *name)
-{
-	snprintf(name, SysaufsSiNameLen, SysaufsSiNamePrefix "%lx",
-		 sysaufs_si_id(sbinfo));
-}
-
-struct au_branch;
-#ifdef CONFIG_SYSFS
-/* sysfs.c */
-extern struct attribute_group *sysaufs_attr_group;
-
-int sysaufs_si_xi_path(struct seq_file *seq, struct super_block *sb);
-ssize_t sysaufs_si_show(struct kobject *kobj, struct attribute *attr,
-			 char *buf);
-long au_brinfo_ioctl(struct file *file, unsigned long arg);
-#ifdef CONFIG_COMPAT
-long au_brinfo_compat_ioctl(struct file *file, unsigned long arg);
-#endif
-
-void sysaufs_br_init(struct au_branch *br);
-void sysaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex);
-void sysaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex);
-
-#define sysaufs_brs_init()	do {} while (0)
-
-#else
-#define sysaufs_attr_group	NULL
-
-AuStubInt0(sysaufs_si_xi_path, struct seq_file *seq, struct super_block *sb)
-AuStub(ssize_t, sysaufs_si_show, return 0, struct kobject *kobj,
-       struct attribute *attr, char *buf)
-AuStubVoid(sysaufs_br_init, struct au_branch *br)
-AuStubVoid(sysaufs_brs_add, struct super_block *sb, aufs_bindex_t bindex)
-AuStubVoid(sysaufs_brs_del, struct super_block *sb, aufs_bindex_t bindex)
-
-static inline void sysaufs_brs_init(void)
-{
-	sysaufs_brs = 0;
-}
-
-#endif /* CONFIG_SYSFS */
-
-#endif /* __KERNEL__ */
-#endif /* __SYSAUFS_H__ */
diff --git a/fs/aufs/sysfs.c b/fs/aufs/sysfs.c
deleted file mode 100644
index ed42f53d0..000000000
--- a/fs/aufs/sysfs.c
+++ /dev/null
@@ -1,340 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * sysfs interface
- */
-
-#include <linux/compat.h>
-#include <linux/seq_file.h>
-#include "aufs.h"
-
-static struct attribute *au_attr[] = {
-	NULL,	/* need to NULL terminate the list of attributes */
-};
-
-static struct attribute_group sysaufs_attr_group_body = {
-	.attrs = au_attr
-};
-
-struct attribute_group *sysaufs_attr_group = &sysaufs_attr_group_body;
-
-/* ---------------------------------------------------------------------- */
-
-int sysaufs_si_xi_path(struct seq_file *seq, struct super_block *sb)
-{
-	int err;
-
-	SiMustAnyLock(sb);
-
-	err = 0;
-	if (au_opt_test(au_mntflags(sb), XINO)) {
-		err = au_xino_path(seq, au_sbi(sb)->si_xib);
-		seq_putc(seq, '\n');
-	}
-	return err;
-}
-
-/*
- * the lifetime of branch is independent from the entry under sysfs.
- * sysfs handles the lifetime of the entry, and never call ->show() after it is
- * unlinked.
- */
-static int sysaufs_si_br(struct seq_file *seq, struct super_block *sb,
-			 aufs_bindex_t bindex, int idx)
-{
-	int err;
-	struct path path;
-	struct dentry *root;
-	struct au_branch *br;
-	au_br_perm_str_t perm;
-
-	AuDbg("b%d\n", bindex);
-
-	err = 0;
-	root = sb->s_root;
-	di_read_lock_parent(root, !AuLock_IR);
-	br = au_sbr(sb, bindex);
-
-	switch (idx) {
-	case AuBrSysfs_BR:
-		path.mnt = au_br_mnt(br);
-		path.dentry = au_h_dptr(root, bindex);
-		err = au_seq_path(seq, &path);
-		if (!err) {
-			au_optstr_br_perm(&perm, br->br_perm);
-			seq_printf(seq, "=%s\n", perm.a);
-		}
-		break;
-	case AuBrSysfs_BRID:
-		seq_printf(seq, "%d\n", br->br_id);
-		break;
-	}
-	di_read_unlock(root, !AuLock_IR);
-	if (unlikely(err || seq_has_overflowed(seq)))
-		err = -E2BIG;
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static struct seq_file *au_seq(char *p, ssize_t len)
-{
-	struct seq_file *seq;
-
-	seq = kzalloc(sizeof(*seq), GFP_NOFS);
-	if (seq) {
-		/* mutex_init(&seq.lock); */
-		seq->buf = p;
-		seq->size = len;
-		return seq; /* success */
-	}
-
-	seq = ERR_PTR(-ENOMEM);
-	return seq;
-}
-
-#define SysaufsBr_PREFIX	"br"
-#define SysaufsBrid_PREFIX	"brid"
-
-/* todo: file size may exceed PAGE_SIZE */
-ssize_t sysaufs_si_show(struct kobject *kobj, struct attribute *attr,
-			char *buf)
-{
-	ssize_t err;
-	int idx;
-	long l;
-	aufs_bindex_t bend;
-	struct au_sbinfo *sbinfo;
-	struct super_block *sb;
-	struct seq_file *seq;
-	char *name;
-	struct attribute **cattr;
-
-	sbinfo = container_of(kobj, struct au_sbinfo, si_kobj);
-	sb = sbinfo->si_sb;
-
-	/*
-	 * prevent a race condition between sysfs and aufs.
-	 * for instance, sysfs_file_read() calls sysfs_get_active_two() which
-	 * prohibits maintaining the sysfs entries.
-	 * hew we acquire read lock after sysfs_get_active_two().
-	 * on the other hand, the remount process may maintain the sysfs/aufs
-	 * entries after acquiring write lock.
-	 * it can cause a deadlock.
-	 * simply we gave up processing read here.
-	 */
-	err = -EBUSY;
-	if (unlikely(!si_noflush_read_trylock(sb)))
-		goto out;
-
-	seq = au_seq(buf, PAGE_SIZE);
-	err = PTR_ERR(seq);
-	if (IS_ERR(seq))
-		goto out_unlock;
-
-	name = (void *)attr->name;
-	cattr = sysaufs_si_attrs;
-	while (*cattr) {
-		if (!strcmp(name, (*cattr)->name)) {
-			err = container_of(*cattr, struct sysaufs_si_attr, attr)
-				->show(seq, sb);
-			goto out_seq;
-		}
-		cattr++;
-	}
-
-	if (!strncmp(name, SysaufsBrid_PREFIX,
-		     sizeof(SysaufsBrid_PREFIX) - 1)) {
-		idx = AuBrSysfs_BRID;
-		name += sizeof(SysaufsBrid_PREFIX) - 1;
-	} else if (!strncmp(name, SysaufsBr_PREFIX,
-			    sizeof(SysaufsBr_PREFIX) - 1)) {
-		idx = AuBrSysfs_BR;
-		name += sizeof(SysaufsBr_PREFIX) - 1;
-	} else
-		  BUG();
-
-	err = kstrtol(name, 10, &l);
-	if (!err) {
-		bend = au_sbend(sb);
-		if (l <= bend)
-			err = sysaufs_si_br(seq, sb, (aufs_bindex_t)l, idx);
-		else
-			err = -ENOENT;
-	}
-
-out_seq:
-	if (!err) {
-		err = seq->count;
-		/* sysfs limit */
-		if (unlikely(err == PAGE_SIZE))
-			err = -EFBIG;
-	}
-	kfree(seq);
-out_unlock:
-	si_read_unlock(sb);
-out:
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static int au_brinfo(struct super_block *sb, union aufs_brinfo __user *arg)
-{
-	int err;
-	int16_t brid;
-	aufs_bindex_t bindex, bend;
-	size_t sz;
-	char *buf;
-	struct seq_file *seq;
-	struct au_branch *br;
-
-	si_read_lock(sb, AuLock_FLUSH);
-	bend = au_sbend(sb);
-	err = bend + 1;
-	if (!arg)
-		goto out;
-
-	err = -ENOMEM;
-	buf = (void *)__get_free_page(GFP_NOFS);
-	if (unlikely(!buf))
-		goto out;
-
-	seq = au_seq(buf, PAGE_SIZE);
-	err = PTR_ERR(seq);
-	if (IS_ERR(seq))
-		goto out_buf;
-
-	sz = sizeof(*arg) - offsetof(union aufs_brinfo, path);
-	for (bindex = 0; bindex <= bend; bindex++, arg++) {
-		err = !access_ok(VERIFY_WRITE, arg, sizeof(*arg));
-		if (unlikely(err))
-			break;
-
-		br = au_sbr(sb, bindex);
-		brid = br->br_id;
-		BUILD_BUG_ON(sizeof(brid) != sizeof(arg->id));
-		err = __put_user(brid, &arg->id);
-		if (unlikely(err))
-			break;
-
-		BUILD_BUG_ON(sizeof(br->br_perm) != sizeof(arg->perm));
-		err = __put_user(br->br_perm, &arg->perm);
-		if (unlikely(err))
-			break;
-
-		err = au_seq_path(seq, &br->br_path);
-		if (unlikely(err))
-			break;
-		seq_putc(seq, '\0');
-		if (!seq_has_overflowed(seq)) {
-			err = copy_to_user(arg->path, seq->buf, seq->count);
-			seq->count = 0;
-			if (unlikely(err))
-				break;
-		} else {
-			err = -E2BIG;
-			goto out_seq;
-		}
-	}
-	if (unlikely(err))
-		err = -EFAULT;
-
-out_seq:
-	kfree(seq);
-out_buf:
-	free_page((unsigned long)buf);
-out:
-	si_read_unlock(sb);
-	return err;
-}
-
-long au_brinfo_ioctl(struct file *file, unsigned long arg)
-{
-	return au_brinfo(file->f_path.dentry->d_sb, (void __user *)arg);
-}
-
-#ifdef CONFIG_COMPAT
-long au_brinfo_compat_ioctl(struct file *file, unsigned long arg)
-{
-	return au_brinfo(file->f_path.dentry->d_sb, compat_ptr(arg));
-}
-#endif
-
-/* ---------------------------------------------------------------------- */
-
-void sysaufs_br_init(struct au_branch *br)
-{
-	int i;
-	struct au_brsysfs *br_sysfs;
-	struct attribute *attr;
-
-	br_sysfs = br->br_sysfs;
-	for (i = 0; i < ARRAY_SIZE(br->br_sysfs); i++) {
-		attr = &br_sysfs->attr;
-		sysfs_attr_init(attr);
-		attr->name = br_sysfs->name;
-		attr->mode = S_IRUGO;
-		br_sysfs++;
-	}
-}
-
-void sysaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex)
-{
-	struct au_branch *br;
-	struct kobject *kobj;
-	struct au_brsysfs *br_sysfs;
-	int i;
-	aufs_bindex_t bend;
-
-	dbgaufs_brs_del(sb, bindex);
-
-	if (!sysaufs_brs)
-		return;
-
-	kobj = &au_sbi(sb)->si_kobj;
-	bend = au_sbend(sb);
-	for (; bindex <= bend; bindex++) {
-		br = au_sbr(sb, bindex);
-		br_sysfs = br->br_sysfs;
-		for (i = 0; i < ARRAY_SIZE(br->br_sysfs); i++) {
-			sysfs_remove_file(kobj, &br_sysfs->attr);
-			br_sysfs++;
-		}
-	}
-}
-
-void sysaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex)
-{
-	int err, i;
-	aufs_bindex_t bend;
-	struct kobject *kobj;
-	struct au_branch *br;
-	struct au_brsysfs *br_sysfs;
-
-	dbgaufs_brs_add(sb, bindex);
-
-	if (!sysaufs_brs)
-		return;
-
-	kobj = &au_sbi(sb)->si_kobj;
-	bend = au_sbend(sb);
-	for (; bindex <= bend; bindex++) {
-		br = au_sbr(sb, bindex);
-		br_sysfs = br->br_sysfs;
-		snprintf(br_sysfs[AuBrSysfs_BR].name, sizeof(br_sysfs->name),
-			 SysaufsBr_PREFIX "%d", bindex);
-		snprintf(br_sysfs[AuBrSysfs_BRID].name, sizeof(br_sysfs->name),
-			 SysaufsBrid_PREFIX "%d", bindex);
-		for (i = 0; i < ARRAY_SIZE(br->br_sysfs); i++) {
-			err = sysfs_create_file(kobj, &br_sysfs->attr);
-			if (unlikely(err))
-				pr_warn("failed %s under sysfs(%d)\n",
-					br_sysfs->name, err);
-			br_sysfs++;
-		}
-	}
-}
diff --git a/fs/aufs/sysrq.c b/fs/aufs/sysrq.c
deleted file mode 100644
index 7921ed716..000000000
--- a/fs/aufs/sysrq.c
+++ /dev/null
@@ -1,144 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * magic sysrq hanlder
- */
-
-/* #include <linux/sysrq.h> */
-#include <linux/writeback.h>
-#include "aufs.h"
-
-/* ---------------------------------------------------------------------- */
-
-static void sysrq_sb(struct super_block *sb)
-{
-	char *plevel;
-	struct au_sbinfo *sbinfo;
-	struct file *file;
-	struct au_sphlhead *files;
-	struct au_finfo *finfo;
-
-	plevel = au_plevel;
-	au_plevel = KERN_WARNING;
-
-	/* since we define pr_fmt, call printk directly */
-#define pr(str) printk(KERN_WARNING AUFS_NAME ": " str)
-
-	sbinfo = au_sbi(sb);
-	printk(KERN_WARNING "si=%lx\n", sysaufs_si_id(sbinfo));
-	pr("superblock\n");
-	au_dpri_sb(sb);
-
-#if 0
-	pr("root dentry\n");
-	au_dpri_dentry(sb->s_root);
-	pr("root inode\n");
-	au_dpri_inode(d_inode(sb->s_root));
-#endif
-
-#if 0
-	do {
-		int err, i, j, ndentry;
-		struct au_dcsub_pages dpages;
-		struct au_dpage *dpage;
-
-		err = au_dpages_init(&dpages, GFP_ATOMIC);
-		if (unlikely(err))
-			break;
-		err = au_dcsub_pages(&dpages, sb->s_root, NULL, NULL);
-		if (!err)
-			for (i = 0; i < dpages.ndpage; i++) {
-				dpage = dpages.dpages + i;
-				ndentry = dpage->ndentry;
-				for (j = 0; j < ndentry; j++)
-					au_dpri_dentry(dpage->dentries[j]);
-			}
-		au_dpages_free(&dpages);
-	} while (0);
-#endif
-
-#if 1
-	{
-		struct inode *i;
-
-		pr("isolated inode\n");
-		spin_lock(&sb->s_inode_list_lock);
-		list_for_each_entry(i, &sb->s_inodes, i_sb_list) {
-			spin_lock(&i->i_lock);
-			if (1 || hlist_empty(&i->i_dentry))
-				au_dpri_inode(i);
-			spin_unlock(&i->i_lock);
-		}
-		spin_unlock(&sb->s_inode_list_lock);
-	}
-#endif
-	pr("files\n");
-	files = &au_sbi(sb)->si_files;
-	spin_lock(&files->spin);
-	hlist_for_each_entry(finfo, &files->head, fi_hlist) {
-		umode_t mode;
-
-		file = finfo->fi_file;
-		mode = file_inode(file)->i_mode;
-		if (!special_file(mode))
-			au_dpri_file(file);
-	}
-	spin_unlock(&files->spin);
-	pr("done\n");
-
-#undef pr
-	au_plevel = plevel;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* module parameter */
-static char *aufs_sysrq_key = "a";
-module_param_named(sysrq, aufs_sysrq_key, charp, S_IRUGO);
-MODULE_PARM_DESC(sysrq, "MagicSysRq key for " AUFS_NAME);
-
-static void au_sysrq(int key __maybe_unused)
-{
-	struct au_sbinfo *sbinfo;
-
-	lockdep_off();
-	au_sbilist_lock();
-	list_for_each_entry(sbinfo, &au_sbilist.head, si_list)
-		sysrq_sb(sbinfo->si_sb);
-	au_sbilist_unlock();
-	lockdep_on();
-}
-
-static struct sysrq_key_op au_sysrq_op = {
-	.handler	= au_sysrq,
-	.help_msg	= "Aufs",
-	.action_msg	= "Aufs",
-	.enable_mask	= SYSRQ_ENABLE_DUMP
-};
-
-/* ---------------------------------------------------------------------- */
-
-int __init au_sysrq_init(void)
-{
-	int err;
-	char key;
-
-	err = -1;
-	key = *aufs_sysrq_key;
-	if ('a' <= key && key <= 'z')
-		err = register_sysrq_key(key, &au_sysrq_op);
-	if (unlikely(err))
-		pr_err("err %d, sysrq=%c\n", err, key);
-	return err;
-}
-
-void au_sysrq_fin(void)
-{
-	int err;
-
-	err = unregister_sysrq_key(*aufs_sysrq_key, &au_sysrq_op);
-	if (unlikely(err))
-		pr_err("err %d (ignored)\n", err);
-}
diff --git a/fs/aufs/vdir.c b/fs/aufs/vdir.c
deleted file mode 100644
index f64cc2b7a..000000000
--- a/fs/aufs/vdir.c
+++ /dev/null
@@ -1,875 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * virtual or vertical directory
- */
-
-#include "aufs.h"
-
-static unsigned int calc_size(int nlen)
-{
-	return ALIGN(sizeof(struct au_vdir_de) + nlen, sizeof(ino_t));
-}
-
-static int set_deblk_end(union au_vdir_deblk_p *p,
-			 union au_vdir_deblk_p *deblk_end)
-{
-	if (calc_size(0) <= deblk_end->deblk - p->deblk) {
-		p->de->de_str.len = 0;
-		/* smp_mb(); */
-		return 0;
-	}
-	return -1; /* error */
-}
-
-/* returns true or false */
-static int is_deblk_end(union au_vdir_deblk_p *p,
-			union au_vdir_deblk_p *deblk_end)
-{
-	if (calc_size(0) <= deblk_end->deblk - p->deblk)
-		return !p->de->de_str.len;
-	return 1;
-}
-
-static unsigned char *last_deblk(struct au_vdir *vdir)
-{
-	return vdir->vd_deblk[vdir->vd_nblk - 1];
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* estimate the appropriate size for name hash table */
-unsigned int au_rdhash_est(loff_t sz)
-{
-	unsigned int n;
-
-	n = UINT_MAX;
-	sz >>= 10;
-	if (sz < n)
-		n = sz;
-	if (sz < AUFS_RDHASH_DEF)
-		n = AUFS_RDHASH_DEF;
-	/* pr_info("n %u\n", n); */
-	return n;
-}
-
-/*
- * the allocated memory has to be freed by
- * au_nhash_wh_free() or au_nhash_de_free().
- */
-int au_nhash_alloc(struct au_nhash *nhash, unsigned int num_hash, gfp_t gfp)
-{
-	struct hlist_head *head;
-	unsigned int u;
-	size_t sz;
-
-	sz = sizeof(*nhash->nh_head) * num_hash;
-	head = kmalloc(sz, gfp);
-	if (head) {
-		nhash->nh_num = num_hash;
-		nhash->nh_head = head;
-		for (u = 0; u < num_hash; u++)
-			INIT_HLIST_HEAD(head++);
-		return 0; /* success */
-	}
-
-	return -ENOMEM;
-}
-
-static void nhash_count(struct hlist_head *head)
-{
-#if 0
-	unsigned long n;
-	struct hlist_node *pos;
-
-	n = 0;
-	hlist_for_each(pos, head)
-		n++;
-	pr_info("%lu\n", n);
-#endif
-}
-
-static void au_nhash_wh_do_free(struct hlist_head *head)
-{
-	struct au_vdir_wh *pos;
-	struct hlist_node *node;
-
-	hlist_for_each_entry_safe(pos, node, head, wh_hash)
-		kfree(pos);
-}
-
-static void au_nhash_de_do_free(struct hlist_head *head)
-{
-	struct au_vdir_dehstr *pos;
-	struct hlist_node *node;
-
-	hlist_for_each_entry_safe(pos, node, head, hash)
-		au_cache_free_vdir_dehstr(pos);
-}
-
-static void au_nhash_do_free(struct au_nhash *nhash,
-			     void (*free)(struct hlist_head *head))
-{
-	unsigned int n;
-	struct hlist_head *head;
-
-	n = nhash->nh_num;
-	if (!n)
-		return;
-
-	head = nhash->nh_head;
-	while (n-- > 0) {
-		nhash_count(head);
-		free(head++);
-	}
-	kfree(nhash->nh_head);
-}
-
-void au_nhash_wh_free(struct au_nhash *whlist)
-{
-	au_nhash_do_free(whlist, au_nhash_wh_do_free);
-}
-
-static void au_nhash_de_free(struct au_nhash *delist)
-{
-	au_nhash_do_free(delist, au_nhash_de_do_free);
-}
-
-/* ---------------------------------------------------------------------- */
-
-int au_nhash_test_longer_wh(struct au_nhash *whlist, aufs_bindex_t btgt,
-			    int limit)
-{
-	int num;
-	unsigned int u, n;
-	struct hlist_head *head;
-	struct au_vdir_wh *pos;
-
-	num = 0;
-	n = whlist->nh_num;
-	head = whlist->nh_head;
-	for (u = 0; u < n; u++, head++)
-		hlist_for_each_entry(pos, head, wh_hash)
-			if (pos->wh_bindex == btgt && ++num > limit)
-				return 1;
-	return 0;
-}
-
-static struct hlist_head *au_name_hash(struct au_nhash *nhash,
-				       unsigned char *name,
-				       unsigned int len)
-{
-	unsigned int v;
-	/* const unsigned int magic_bit = 12; */
-
-	AuDebugOn(!nhash->nh_num || !nhash->nh_head);
-
-	v = 0;
-	while (len--)
-		v += *name++;
-	/* v = hash_long(v, magic_bit); */
-	v %= nhash->nh_num;
-	return nhash->nh_head + v;
-}
-
-static int au_nhash_test_name(struct au_vdir_destr *str, const char *name,
-			      int nlen)
-{
-	return str->len == nlen && !memcmp(str->name, name, nlen);
-}
-
-/* returns found or not */
-int au_nhash_test_known_wh(struct au_nhash *whlist, char *name, int nlen)
-{
-	struct hlist_head *head;
-	struct au_vdir_wh *pos;
-	struct au_vdir_destr *str;
-
-	head = au_name_hash(whlist, name, nlen);
-	hlist_for_each_entry(pos, head, wh_hash) {
-		str = &pos->wh_str;
-		AuDbg("%.*s\n", str->len, str->name);
-		if (au_nhash_test_name(str, name, nlen))
-			return 1;
-	}
-	return 0;
-}
-
-/* returns found(true) or not */
-static int test_known(struct au_nhash *delist, char *name, int nlen)
-{
-	struct hlist_head *head;
-	struct au_vdir_dehstr *pos;
-	struct au_vdir_destr *str;
-
-	head = au_name_hash(delist, name, nlen);
-	hlist_for_each_entry(pos, head, hash) {
-		str = pos->str;
-		AuDbg("%.*s\n", str->len, str->name);
-		if (au_nhash_test_name(str, name, nlen))
-			return 1;
-	}
-	return 0;
-}
-
-static void au_shwh_init_wh(struct au_vdir_wh *wh, ino_t ino,
-			    unsigned char d_type)
-{
-#ifdef CONFIG_AUFS_SHWH
-	wh->wh_ino = ino;
-	wh->wh_type = d_type;
-#endif
-}
-
-/* ---------------------------------------------------------------------- */
-
-int au_nhash_append_wh(struct au_nhash *whlist, char *name, int nlen, ino_t ino,
-		       unsigned int d_type, aufs_bindex_t bindex,
-		       unsigned char shwh)
-{
-	int err;
-	struct au_vdir_destr *str;
-	struct au_vdir_wh *wh;
-
-	AuDbg("%.*s\n", nlen, name);
-	AuDebugOn(!whlist->nh_num || !whlist->nh_head);
-
-	err = -ENOMEM;
-	wh = kmalloc(sizeof(*wh) + nlen, GFP_NOFS);
-	if (unlikely(!wh))
-		goto out;
-
-	err = 0;
-	wh->wh_bindex = bindex;
-	if (shwh)
-		au_shwh_init_wh(wh, ino, d_type);
-	str = &wh->wh_str;
-	str->len = nlen;
-	memcpy(str->name, name, nlen);
-	hlist_add_head(&wh->wh_hash, au_name_hash(whlist, name, nlen));
-	/* smp_mb(); */
-
-out:
-	return err;
-}
-
-static int append_deblk(struct au_vdir *vdir)
-{
-	int err;
-	unsigned long ul;
-	const unsigned int deblk_sz = vdir->vd_deblk_sz;
-	union au_vdir_deblk_p p, deblk_end;
-	unsigned char **o;
-
-	err = -ENOMEM;
-	o = krealloc(vdir->vd_deblk, sizeof(*o) * (vdir->vd_nblk + 1),
-		     GFP_NOFS);
-	if (unlikely(!o))
-		goto out;
-
-	vdir->vd_deblk = o;
-	p.deblk = kmalloc(deblk_sz, GFP_NOFS);
-	if (p.deblk) {
-		ul = vdir->vd_nblk++;
-		vdir->vd_deblk[ul] = p.deblk;
-		vdir->vd_last.ul = ul;
-		vdir->vd_last.p.deblk = p.deblk;
-		deblk_end.deblk = p.deblk + deblk_sz;
-		err = set_deblk_end(&p, &deblk_end);
-	}
-
-out:
-	return err;
-}
-
-static int append_de(struct au_vdir *vdir, char *name, int nlen, ino_t ino,
-		     unsigned int d_type, struct au_nhash *delist)
-{
-	int err;
-	unsigned int sz;
-	const unsigned int deblk_sz = vdir->vd_deblk_sz;
-	union au_vdir_deblk_p p, *room, deblk_end;
-	struct au_vdir_dehstr *dehstr;
-
-	p.deblk = last_deblk(vdir);
-	deblk_end.deblk = p.deblk + deblk_sz;
-	room = &vdir->vd_last.p;
-	AuDebugOn(room->deblk < p.deblk || deblk_end.deblk <= room->deblk
-		  || !is_deblk_end(room, &deblk_end));
-
-	sz = calc_size(nlen);
-	if (unlikely(sz > deblk_end.deblk - room->deblk)) {
-		err = append_deblk(vdir);
-		if (unlikely(err))
-			goto out;
-
-		p.deblk = last_deblk(vdir);
-		deblk_end.deblk = p.deblk + deblk_sz;
-		/* smp_mb(); */
-		AuDebugOn(room->deblk != p.deblk);
-	}
-
-	err = -ENOMEM;
-	dehstr = au_cache_alloc_vdir_dehstr();
-	if (unlikely(!dehstr))
-		goto out;
-
-	dehstr->str = &room->de->de_str;
-	hlist_add_head(&dehstr->hash, au_name_hash(delist, name, nlen));
-	room->de->de_ino = ino;
-	room->de->de_type = d_type;
-	room->de->de_str.len = nlen;
-	memcpy(room->de->de_str.name, name, nlen);
-
-	err = 0;
-	room->deblk += sz;
-	if (unlikely(set_deblk_end(room, &deblk_end)))
-		err = append_deblk(vdir);
-	/* smp_mb(); */
-
-out:
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void au_vdir_free(struct au_vdir *vdir)
-{
-	unsigned char **deblk;
-
-	deblk = vdir->vd_deblk;
-	while (vdir->vd_nblk--)
-		kfree(*deblk++);
-	kfree(vdir->vd_deblk);
-	au_cache_free_vdir(vdir);
-}
-
-static struct au_vdir *alloc_vdir(struct file *file)
-{
-	struct au_vdir *vdir;
-	struct super_block *sb;
-	int err;
-
-	sb = file->f_path.dentry->d_sb;
-	SiMustAnyLock(sb);
-
-	err = -ENOMEM;
-	vdir = au_cache_alloc_vdir();
-	if (unlikely(!vdir))
-		goto out;
-
-	vdir->vd_deblk = kzalloc(sizeof(*vdir->vd_deblk), GFP_NOFS);
-	if (unlikely(!vdir->vd_deblk))
-		goto out_free;
-
-	vdir->vd_deblk_sz = au_sbi(sb)->si_rdblk;
-	if (!vdir->vd_deblk_sz) {
-		/* estimate the appropriate size for deblk */
-		vdir->vd_deblk_sz = au_dir_size(file, /*dentry*/NULL);
-		/* pr_info("vd_deblk_sz %u\n", vdir->vd_deblk_sz); */
-	}
-	vdir->vd_nblk = 0;
-	vdir->vd_version = 0;
-	vdir->vd_jiffy = 0;
-	err = append_deblk(vdir);
-	if (!err)
-		return vdir; /* success */
-
-	kfree(vdir->vd_deblk);
-
-out_free:
-	au_cache_free_vdir(vdir);
-out:
-	vdir = ERR_PTR(err);
-	return vdir;
-}
-
-static int reinit_vdir(struct au_vdir *vdir)
-{
-	int err;
-	union au_vdir_deblk_p p, deblk_end;
-
-	while (vdir->vd_nblk > 1) {
-		kfree(vdir->vd_deblk[vdir->vd_nblk - 1]);
-		/* vdir->vd_deblk[vdir->vd_nblk - 1] = NULL; */
-		vdir->vd_nblk--;
-	}
-	p.deblk = vdir->vd_deblk[0];
-	deblk_end.deblk = p.deblk + vdir->vd_deblk_sz;
-	err = set_deblk_end(&p, &deblk_end);
-	/* keep vd_dblk_sz */
-	vdir->vd_last.ul = 0;
-	vdir->vd_last.p.deblk = vdir->vd_deblk[0];
-	vdir->vd_version = 0;
-	vdir->vd_jiffy = 0;
-	/* smp_mb(); */
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-#define AuFillVdir_CALLED	1
-#define AuFillVdir_WHABLE	(1 << 1)
-#define AuFillVdir_SHWH		(1 << 2)
-#define au_ftest_fillvdir(flags, name)	((flags) & AuFillVdir_##name)
-#define au_fset_fillvdir(flags, name) \
-	do { (flags) |= AuFillVdir_##name; } while (0)
-#define au_fclr_fillvdir(flags, name) \
-	do { (flags) &= ~AuFillVdir_##name; } while (0)
-
-#ifndef CONFIG_AUFS_SHWH
-#undef AuFillVdir_SHWH
-#define AuFillVdir_SHWH		0
-#endif
-
-struct fillvdir_arg {
-	struct dir_context	ctx;
-	struct file		*file;
-	struct au_vdir		*vdir;
-	struct au_nhash		delist;
-	struct au_nhash		whlist;
-	aufs_bindex_t		bindex;
-	unsigned int		flags;
-	int			err;
-};
-
-static int fillvdir(struct dir_context *ctx, const char *__name, int nlen,
-		    loff_t offset __maybe_unused, u64 h_ino,
-		    unsigned int d_type)
-{
-	struct fillvdir_arg *arg = container_of(ctx, struct fillvdir_arg, ctx);
-	char *name = (void *)__name;
-	struct super_block *sb;
-	ino_t ino;
-	const unsigned char shwh = !!au_ftest_fillvdir(arg->flags, SHWH);
-
-	arg->err = 0;
-	sb = arg->file->f_path.dentry->d_sb;
-	au_fset_fillvdir(arg->flags, CALLED);
-	/* smp_mb(); */
-	if (nlen <= AUFS_WH_PFX_LEN
-	    || memcmp(name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) {
-		if (test_known(&arg->delist, name, nlen)
-		    || au_nhash_test_known_wh(&arg->whlist, name, nlen))
-			goto out; /* already exists or whiteouted */
-
-		arg->err = au_ino(sb, arg->bindex, h_ino, d_type, &ino);
-		if (!arg->err) {
-			if (unlikely(nlen > AUFS_MAX_NAMELEN))
-				d_type = DT_UNKNOWN;
-			arg->err = append_de(arg->vdir, name, nlen, ino,
-					     d_type, &arg->delist);
-		}
-	} else if (au_ftest_fillvdir(arg->flags, WHABLE)) {
-		name += AUFS_WH_PFX_LEN;
-		nlen -= AUFS_WH_PFX_LEN;
-		if (au_nhash_test_known_wh(&arg->whlist, name, nlen))
-			goto out; /* already whiteouted */
-
-		if (shwh)
-			arg->err = au_wh_ino(sb, arg->bindex, h_ino, d_type,
-					     &ino);
-		if (!arg->err) {
-			if (nlen <= AUFS_MAX_NAMELEN + AUFS_WH_PFX_LEN)
-				d_type = DT_UNKNOWN;
-			arg->err = au_nhash_append_wh
-				(&arg->whlist, name, nlen, ino, d_type,
-				 arg->bindex, shwh);
-		}
-	}
-
-out:
-	if (!arg->err)
-		arg->vdir->vd_jiffy = jiffies;
-	/* smp_mb(); */
-	AuTraceErr(arg->err);
-	return arg->err;
-}
-
-static int au_handle_shwh(struct super_block *sb, struct au_vdir *vdir,
-			  struct au_nhash *whlist, struct au_nhash *delist)
-{
-#ifdef CONFIG_AUFS_SHWH
-	int err;
-	unsigned int nh, u;
-	struct hlist_head *head;
-	struct au_vdir_wh *pos;
-	struct hlist_node *n;
-	char *p, *o;
-	struct au_vdir_destr *destr;
-
-	AuDebugOn(!au_opt_test(au_mntflags(sb), SHWH));
-
-	err = -ENOMEM;
-	o = p = (void *)__get_free_page(GFP_NOFS);
-	if (unlikely(!p))
-		goto out;
-
-	err = 0;
-	nh = whlist->nh_num;
-	memcpy(p, AUFS_WH_PFX, AUFS_WH_PFX_LEN);
-	p += AUFS_WH_PFX_LEN;
-	for (u = 0; u < nh; u++) {
-		head = whlist->nh_head + u;
-		hlist_for_each_entry_safe(pos, n, head, wh_hash) {
-			destr = &pos->wh_str;
-			memcpy(p, destr->name, destr->len);
-			err = append_de(vdir, o, destr->len + AUFS_WH_PFX_LEN,
-					pos->wh_ino, pos->wh_type, delist);
-			if (unlikely(err))
-				break;
-		}
-	}
-
-	free_page((unsigned long)o);
-
-out:
-	AuTraceErr(err);
-	return err;
-#else
-	return 0;
-#endif
-}
-
-static int au_do_read_vdir(struct fillvdir_arg *arg)
-{
-	int err;
-	unsigned int rdhash;
-	loff_t offset;
-	aufs_bindex_t bend, bindex, bstart;
-	unsigned char shwh;
-	struct file *hf, *file;
-	struct super_block *sb;
-
-	file = arg->file;
-	sb = file->f_path.dentry->d_sb;
-	SiMustAnyLock(sb);
-
-	rdhash = au_sbi(sb)->si_rdhash;
-	if (!rdhash)
-		rdhash = au_rdhash_est(au_dir_size(file, /*dentry*/NULL));
-	err = au_nhash_alloc(&arg->delist, rdhash, GFP_NOFS);
-	if (unlikely(err))
-		goto out;
-	err = au_nhash_alloc(&arg->whlist, rdhash, GFP_NOFS);
-	if (unlikely(err))
-		goto out_delist;
-
-	err = 0;
-	arg->flags = 0;
-	shwh = 0;
-	if (au_opt_test(au_mntflags(sb), SHWH)) {
-		shwh = 1;
-		au_fset_fillvdir(arg->flags, SHWH);
-	}
-	bstart = au_fbstart(file);
-	bend = au_fbend_dir(file);
-	for (bindex = bstart; !err && bindex <= bend; bindex++) {
-		hf = au_hf_dir(file, bindex);
-		if (!hf)
-			continue;
-
-		offset = vfsub_llseek(hf, 0, SEEK_SET);
-		err = offset;
-		if (unlikely(offset))
-			break;
-
-		arg->bindex = bindex;
-		au_fclr_fillvdir(arg->flags, WHABLE);
-		if (shwh
-		    || (bindex != bend
-			&& au_br_whable(au_sbr_perm(sb, bindex))))
-			au_fset_fillvdir(arg->flags, WHABLE);
-		do {
-			arg->err = 0;
-			au_fclr_fillvdir(arg->flags, CALLED);
-			/* smp_mb(); */
-			err = vfsub_iterate_dir(hf, &arg->ctx);
-			if (err >= 0)
-				err = arg->err;
-		} while (!err && au_ftest_fillvdir(arg->flags, CALLED));
-
-		/*
-		 * dir_relax() may be good for concurrency, but aufs should not
-		 * use it since it will cause a lockdep problem.
-		 */
-	}
-
-	if (!err && shwh)
-		err = au_handle_shwh(sb, arg->vdir, &arg->whlist, &arg->delist);
-
-	au_nhash_wh_free(&arg->whlist);
-
-out_delist:
-	au_nhash_de_free(&arg->delist);
-out:
-	return err;
-}
-
-static int read_vdir(struct file *file, int may_read)
-{
-	int err;
-	unsigned long expire;
-	unsigned char do_read;
-	struct fillvdir_arg arg = {
-		.ctx = {
-			.actor = fillvdir
-		}
-	};
-	struct inode *inode;
-	struct au_vdir *vdir, *allocated;
-
-	err = 0;
-	inode = file_inode(file);
-	IMustLock(inode);
-	SiMustAnyLock(inode->i_sb);
-
-	allocated = NULL;
-	do_read = 0;
-	expire = au_sbi(inode->i_sb)->si_rdcache;
-	vdir = au_ivdir(inode);
-	if (!vdir) {
-		do_read = 1;
-		vdir = alloc_vdir(file);
-		err = PTR_ERR(vdir);
-		if (IS_ERR(vdir))
-			goto out;
-		err = 0;
-		allocated = vdir;
-	} else if (may_read
-		   && (inode->i_version != vdir->vd_version
-		       || time_after(jiffies, vdir->vd_jiffy + expire))) {
-		do_read = 1;
-		err = reinit_vdir(vdir);
-		if (unlikely(err))
-			goto out;
-	}
-
-	if (!do_read)
-		return 0; /* success */
-
-	arg.file = file;
-	arg.vdir = vdir;
-	err = au_do_read_vdir(&arg);
-	if (!err) {
-		/* file->f_pos = 0; */ /* todo: ctx->pos? */
-		vdir->vd_version = inode->i_version;
-		vdir->vd_last.ul = 0;
-		vdir->vd_last.p.deblk = vdir->vd_deblk[0];
-		if (allocated)
-			au_set_ivdir(inode, allocated);
-	} else if (allocated)
-		au_vdir_free(allocated);
-
-out:
-	return err;
-}
-
-static int copy_vdir(struct au_vdir *tgt, struct au_vdir *src)
-{
-	int err, rerr;
-	unsigned long ul, n;
-	const unsigned int deblk_sz = src->vd_deblk_sz;
-
-	AuDebugOn(tgt->vd_nblk != 1);
-
-	err = -ENOMEM;
-	if (tgt->vd_nblk < src->vd_nblk) {
-		unsigned char **p;
-
-		p = krealloc(tgt->vd_deblk, sizeof(*p) * src->vd_nblk,
-			     GFP_NOFS);
-		if (unlikely(!p))
-			goto out;
-		tgt->vd_deblk = p;
-	}
-
-	if (tgt->vd_deblk_sz != deblk_sz) {
-		unsigned char *p;
-
-		tgt->vd_deblk_sz = deblk_sz;
-		p = krealloc(tgt->vd_deblk[0], deblk_sz, GFP_NOFS);
-		if (unlikely(!p))
-			goto out;
-		tgt->vd_deblk[0] = p;
-	}
-	memcpy(tgt->vd_deblk[0], src->vd_deblk[0], deblk_sz);
-	tgt->vd_version = src->vd_version;
-	tgt->vd_jiffy = src->vd_jiffy;
-
-	n = src->vd_nblk;
-	for (ul = 1; ul < n; ul++) {
-		tgt->vd_deblk[ul] = kmemdup(src->vd_deblk[ul], deblk_sz,
-					    GFP_NOFS);
-		if (unlikely(!tgt->vd_deblk[ul]))
-			goto out;
-		tgt->vd_nblk++;
-	}
-	tgt->vd_nblk = n;
-	tgt->vd_last.ul = tgt->vd_last.ul;
-	tgt->vd_last.p.deblk = tgt->vd_deblk[tgt->vd_last.ul];
-	tgt->vd_last.p.deblk += src->vd_last.p.deblk
-		- src->vd_deblk[src->vd_last.ul];
-	/* smp_mb(); */
-	return 0; /* success */
-
-out:
-	rerr = reinit_vdir(tgt);
-	BUG_ON(rerr);
-	return err;
-}
-
-int au_vdir_init(struct file *file)
-{
-	int err;
-	struct inode *inode;
-	struct au_vdir *vdir_cache, *allocated;
-
-	/* test file->f_pos here instead of ctx->pos */
-	err = read_vdir(file, !file->f_pos);
-	if (unlikely(err))
-		goto out;
-
-	allocated = NULL;
-	vdir_cache = au_fvdir_cache(file);
-	if (!vdir_cache) {
-		vdir_cache = alloc_vdir(file);
-		err = PTR_ERR(vdir_cache);
-		if (IS_ERR(vdir_cache))
-			goto out;
-		allocated = vdir_cache;
-	} else if (!file->f_pos && vdir_cache->vd_version != file->f_version) {
-		/* test file->f_pos here instead of ctx->pos */
-		err = reinit_vdir(vdir_cache);
-		if (unlikely(err))
-			goto out;
-	} else
-		return 0; /* success */
-
-	inode = file_inode(file);
-	err = copy_vdir(vdir_cache, au_ivdir(inode));
-	if (!err) {
-		file->f_version = inode->i_version;
-		if (allocated)
-			au_set_fvdir_cache(file, allocated);
-	} else if (allocated)
-		au_vdir_free(allocated);
-
-out:
-	return err;
-}
-
-static loff_t calc_offset(struct au_vdir *vdir)
-{
-	loff_t offset;
-	union au_vdir_deblk_p p;
-
-	p.deblk = vdir->vd_deblk[vdir->vd_last.ul];
-	offset = vdir->vd_last.p.deblk - p.deblk;
-	offset += vdir->vd_deblk_sz * vdir->vd_last.ul;
-	return offset;
-}
-
-/* returns true or false */
-static int seek_vdir(struct file *file, struct dir_context *ctx)
-{
-	int valid;
-	unsigned int deblk_sz;
-	unsigned long ul, n;
-	loff_t offset;
-	union au_vdir_deblk_p p, deblk_end;
-	struct au_vdir *vdir_cache;
-
-	valid = 1;
-	vdir_cache = au_fvdir_cache(file);
-	offset = calc_offset(vdir_cache);
-	AuDbg("offset %lld\n", offset);
-	if (ctx->pos == offset)
-		goto out;
-
-	vdir_cache->vd_last.ul = 0;
-	vdir_cache->vd_last.p.deblk = vdir_cache->vd_deblk[0];
-	if (!ctx->pos)
-		goto out;
-
-	valid = 0;
-	deblk_sz = vdir_cache->vd_deblk_sz;
-	ul = div64_u64(ctx->pos, deblk_sz);
-	AuDbg("ul %lu\n", ul);
-	if (ul >= vdir_cache->vd_nblk)
-		goto out;
-
-	n = vdir_cache->vd_nblk;
-	for (; ul < n; ul++) {
-		p.deblk = vdir_cache->vd_deblk[ul];
-		deblk_end.deblk = p.deblk + deblk_sz;
-		offset = ul;
-		offset *= deblk_sz;
-		while (!is_deblk_end(&p, &deblk_end) && offset < ctx->pos) {
-			unsigned int l;
-
-			l = calc_size(p.de->de_str.len);
-			offset += l;
-			p.deblk += l;
-		}
-		if (!is_deblk_end(&p, &deblk_end)) {
-			valid = 1;
-			vdir_cache->vd_last.ul = ul;
-			vdir_cache->vd_last.p = p;
-			break;
-		}
-	}
-
-out:
-	/* smp_mb(); */
-	AuTraceErr(!valid);
-	return valid;
-}
-
-int au_vdir_fill_de(struct file *file, struct dir_context *ctx)
-{
-	unsigned int l, deblk_sz;
-	union au_vdir_deblk_p deblk_end;
-	struct au_vdir *vdir_cache;
-	struct au_vdir_de *de;
-
-	vdir_cache = au_fvdir_cache(file);
-	if (!seek_vdir(file, ctx))
-		return 0;
-
-	deblk_sz = vdir_cache->vd_deblk_sz;
-	while (1) {
-		deblk_end.deblk = vdir_cache->vd_deblk[vdir_cache->vd_last.ul];
-		deblk_end.deblk += deblk_sz;
-		while (!is_deblk_end(&vdir_cache->vd_last.p, &deblk_end)) {
-			de = vdir_cache->vd_last.p.de;
-			AuDbg("%.*s, off%lld, i%lu, dt%d\n",
-			      de->de_str.len, de->de_str.name, ctx->pos,
-			      (unsigned long)de->de_ino, de->de_type);
-			if (unlikely(!dir_emit(ctx, de->de_str.name,
-					       de->de_str.len, de->de_ino,
-					       de->de_type))) {
-				/* todo: ignore the error caused by udba? */
-				/* return err; */
-				return 0;
-			}
-
-			l = calc_size(de->de_str.len);
-			vdir_cache->vd_last.p.deblk += l;
-			ctx->pos += l;
-		}
-		if (vdir_cache->vd_last.ul < vdir_cache->vd_nblk - 1) {
-			vdir_cache->vd_last.ul++;
-			vdir_cache->vd_last.p.deblk
-				= vdir_cache->vd_deblk[vdir_cache->vd_last.ul];
-			ctx->pos = deblk_sz * vdir_cache->vd_last.ul;
-			continue;
-		}
-		break;
-	}
-
-	/* smp_mb(); */
-	return 0;
-}
diff --git a/fs/aufs/vfsub.c b/fs/aufs/vfsub.c
deleted file mode 100644
index 89f999c97..000000000
--- a/fs/aufs/vfsub.c
+++ /dev/null
@@ -1,853 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * sub-routines for VFS
- */
-
-#include <linux/namei.h>
-#include <linux/nsproxy.h>
-#include <linux/security.h>
-#include <linux/splice.h>
-#include "../fs/mount.h"
-#include "aufs.h"
-
-#ifdef CONFIG_AUFS_BR_FUSE
-int vfsub_test_mntns(struct vfsmount *mnt, struct super_block *h_sb)
-{
-	struct nsproxy *ns;
-
-	if (!au_test_fuse(h_sb) || !au_userns)
-		return 0;
-
-	ns = current->nsproxy;
-	/* no {get,put}_nsproxy(ns) */
-	return real_mount(mnt)->mnt_ns == ns->mnt_ns ? 0 : -EACCES;
-}
-#endif
-
-/* ---------------------------------------------------------------------- */
-
-int vfsub_update_h_iattr(struct path *h_path, int *did)
-{
-	int err;
-	struct kstat st;
-	struct super_block *h_sb;
-
-	/* for remote fs, leave work for its getattr or d_revalidate */
-	/* for bad i_attr fs, handle them in aufs_getattr() */
-	/* still some fs may acquire i_mutex. we need to skip them */
-	err = 0;
-	if (!did)
-		did = &err;
-	h_sb = h_path->dentry->d_sb;
-	*did = (!au_test_fs_remote(h_sb) && au_test_fs_refresh_iattr(h_sb));
-	if (*did)
-		err = vfs_getattr(h_path, &st);
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-struct file *vfsub_dentry_open(struct path *path, int flags)
-{
-	struct file *file;
-
-	file = dentry_open(path, flags /* | __FMODE_NONOTIFY */,
-			   current_cred());
-	if (!IS_ERR_OR_NULL(file)
-	    && (file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
-		i_readcount_inc(d_inode(path->dentry));
-
-	return file;
-}
-
-struct file *vfsub_filp_open(const char *path, int oflags, int mode)
-{
-	struct file *file;
-
-	lockdep_off();
-	file = filp_open(path,
-			 oflags /* | __FMODE_NONOTIFY */,
-			 mode);
-	lockdep_on();
-	if (IS_ERR(file))
-		goto out;
-	vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/
-
-out:
-	return file;
-}
-
-/*
- * Ideally this function should call VFS:do_last() in order to keep all its
- * checkings. But it is very hard for aufs to regenerate several VFS internal
- * structure such as nameidata. This is a second (or third) best approach.
- * cf. linux/fs/namei.c:do_last(), lookup_open() and atomic_open().
- */
-int vfsub_atomic_open(struct inode *dir, struct dentry *dentry,
-		      struct vfsub_aopen_args *args, struct au_branch *br)
-{
-	int err;
-	struct file *file = args->file;
-	/* copied from linux/fs/namei.c:atomic_open() */
-	struct dentry *const DENTRY_NOT_SET = (void *)-1UL;
-
-	IMustLock(dir);
-	AuDebugOn(!dir->i_op->atomic_open);
-
-	err = au_br_test_oflag(args->open_flag, br);
-	if (unlikely(err))
-		goto out;
-
-	args->file->f_path.dentry = DENTRY_NOT_SET;
-	args->file->f_path.mnt = au_br_mnt(br);
-	err = dir->i_op->atomic_open(dir, dentry, file, args->open_flag,
-				     args->create_mode, args->opened);
-	if (err >= 0) {
-		/* some filesystems don't set FILE_CREATED while succeeded? */
-		if (*args->opened & FILE_CREATED)
-			fsnotify_create(dir, dentry);
-	} else
-		goto out;
-
-
-	if (!err) {
-		/* todo: call VFS:may_open() here */
-		err = open_check_o_direct(file);
-		/* todo: ima_file_check() too? */
-		if (!err && (args->open_flag & __FMODE_EXEC))
-			err = deny_write_access(file);
-		if (unlikely(err))
-			/* note that the file is created and still opened */
-			goto out;
-	}
-
-	atomic_inc(&br->br_count);
-	fsnotify_open(file);
-
-out:
-	return err;
-}
-
-int vfsub_kern_path(const char *name, unsigned int flags, struct path *path)
-{
-	int err;
-
-	err = kern_path(name, flags, path);
-	if (!err && d_is_positive(path->dentry))
-		vfsub_update_h_iattr(path, /*did*/NULL); /*ignore*/
-	return err;
-}
-
-struct dentry *vfsub_lookup_one_len(const char *name, struct dentry *parent,
-				    int len)
-{
-	struct path path = {
-		.mnt = NULL
-	};
-
-	/* VFS checks it too, but by WARN_ON_ONCE() */
-	IMustLock(d_inode(parent));
-
-	path.dentry = lookup_one_len(name, parent, len);
-	if (IS_ERR(path.dentry))
-		goto out;
-	if (d_is_positive(path.dentry))
-		vfsub_update_h_iattr(&path, /*did*/NULL); /*ignore*/
-
-out:
-	AuTraceErrPtr(path.dentry);
-	return path.dentry;
-}
-
-void vfsub_call_lkup_one(void *args)
-{
-	struct vfsub_lkup_one_args *a = args;
-	*a->errp = vfsub_lkup_one(a->name, a->parent);
-}
-
-/* ---------------------------------------------------------------------- */
-
-struct dentry *vfsub_lock_rename(struct dentry *d1, struct au_hinode *hdir1,
-				 struct dentry *d2, struct au_hinode *hdir2)
-{
-	struct dentry *d;
-
-	lockdep_off();
-	d = lock_rename(d1, d2);
-	lockdep_on();
-	au_hn_suspend(hdir1);
-	if (hdir1 != hdir2)
-		au_hn_suspend(hdir2);
-
-	return d;
-}
-
-void vfsub_unlock_rename(struct dentry *d1, struct au_hinode *hdir1,
-			 struct dentry *d2, struct au_hinode *hdir2)
-{
-	au_hn_resume(hdir1);
-	if (hdir1 != hdir2)
-		au_hn_resume(hdir2);
-	lockdep_off();
-	unlock_rename(d1, d2);
-	lockdep_on();
-}
-
-/* ---------------------------------------------------------------------- */
-
-int vfsub_create(struct inode *dir, struct path *path, int mode, bool want_excl)
-{
-	int err;
-	struct dentry *d;
-
-	IMustLock(dir);
-
-	d = path->dentry;
-	path->dentry = d->d_parent;
-	err = security_path_mknod(path, d, mode, 0);
-	path->dentry = d;
-	if (unlikely(err))
-		goto out;
-
-	lockdep_off();
-	err = vfs_create(dir, path->dentry, mode, want_excl);
-	lockdep_on();
-	if (!err) {
-		struct path tmp = *path;
-		int did;
-
-		vfsub_update_h_iattr(&tmp, &did);
-		if (did) {
-			tmp.dentry = path->dentry->d_parent;
-			vfsub_update_h_iattr(&tmp, /*did*/NULL);
-		}
-		/*ignore*/
-	}
-
-out:
-	return err;
-}
-
-int vfsub_symlink(struct inode *dir, struct path *path, const char *symname)
-{
-	int err;
-	struct dentry *d;
-
-	IMustLock(dir);
-
-	d = path->dentry;
-	path->dentry = d->d_parent;
-	err = security_path_symlink(path, d, symname);
-	path->dentry = d;
-	if (unlikely(err))
-		goto out;
-
-	lockdep_off();
-	err = vfs_symlink(dir, path->dentry, symname);
-	lockdep_on();
-	if (!err) {
-		struct path tmp = *path;
-		int did;
-
-		vfsub_update_h_iattr(&tmp, &did);
-		if (did) {
-			tmp.dentry = path->dentry->d_parent;
-			vfsub_update_h_iattr(&tmp, /*did*/NULL);
-		}
-		/*ignore*/
-	}
-
-out:
-	return err;
-}
-
-int vfsub_mknod(struct inode *dir, struct path *path, int mode, dev_t dev)
-{
-	int err;
-	struct dentry *d;
-
-	IMustLock(dir);
-
-	d = path->dentry;
-	path->dentry = d->d_parent;
-	err = security_path_mknod(path, d, mode, new_encode_dev(dev));
-	path->dentry = d;
-	if (unlikely(err))
-		goto out;
-
-	lockdep_off();
-	err = vfs_mknod(dir, path->dentry, mode, dev);
-	lockdep_on();
-	if (!err) {
-		struct path tmp = *path;
-		int did;
-
-		vfsub_update_h_iattr(&tmp, &did);
-		if (did) {
-			tmp.dentry = path->dentry->d_parent;
-			vfsub_update_h_iattr(&tmp, /*did*/NULL);
-		}
-		/*ignore*/
-	}
-
-out:
-	return err;
-}
-
-static int au_test_nlink(struct inode *inode)
-{
-	const unsigned int link_max = UINT_MAX >> 1; /* rough margin */
-
-	if (!au_test_fs_no_limit_nlink(inode->i_sb)
-	    || inode->i_nlink < link_max)
-		return 0;
-	return -EMLINK;
-}
-
-int vfsub_link(struct dentry *src_dentry, struct inode *dir, struct path *path,
-	       struct inode **delegated_inode)
-{
-	int err;
-	struct dentry *d;
-
-	IMustLock(dir);
-
-	err = au_test_nlink(d_inode(src_dentry));
-	if (unlikely(err))
-		return err;
-
-	/* we don't call may_linkat() */
-	d = path->dentry;
-	path->dentry = d->d_parent;
-	err = security_path_link(src_dentry, path, d);
-	path->dentry = d;
-	if (unlikely(err))
-		goto out;
-
-	lockdep_off();
-	err = vfs_link(src_dentry, dir, path->dentry, delegated_inode);
-	lockdep_on();
-	if (!err) {
-		struct path tmp = *path;
-		int did;
-
-		/* fuse has different memory inode for the same inumber */
-		vfsub_update_h_iattr(&tmp, &did);
-		if (did) {
-			tmp.dentry = path->dentry->d_parent;
-			vfsub_update_h_iattr(&tmp, /*did*/NULL);
-			tmp.dentry = src_dentry;
-			vfsub_update_h_iattr(&tmp, /*did*/NULL);
-		}
-		/*ignore*/
-	}
-
-out:
-	return err;
-}
-
-int vfsub_rename(struct inode *src_dir, struct dentry *src_dentry,
-		 struct inode *dir, struct path *path,
-		 struct inode **delegated_inode)
-{
-	int err;
-	struct path tmp = {
-		.mnt	= path->mnt
-	};
-	struct dentry *d;
-
-	IMustLock(dir);
-	IMustLock(src_dir);
-
-	d = path->dentry;
-	path->dentry = d->d_parent;
-	tmp.dentry = src_dentry->d_parent;
-	err = security_path_rename(&tmp, src_dentry, path, d, /*flags*/0);
-	path->dentry = d;
-	if (unlikely(err))
-		goto out;
-
-	lockdep_off();
-	err = vfs_rename(src_dir, src_dentry, dir, path->dentry,
-			 delegated_inode, /*flags*/0);
-	lockdep_on();
-	if (!err) {
-		int did;
-
-		tmp.dentry = d->d_parent;
-		vfsub_update_h_iattr(&tmp, &did);
-		if (did) {
-			tmp.dentry = src_dentry;
-			vfsub_update_h_iattr(&tmp, /*did*/NULL);
-			tmp.dentry = src_dentry->d_parent;
-			vfsub_update_h_iattr(&tmp, /*did*/NULL);
-		}
-		/*ignore*/
-	}
-
-out:
-	return err;
-}
-
-int vfsub_mkdir(struct inode *dir, struct path *path, int mode)
-{
-	int err;
-	struct dentry *d;
-
-	IMustLock(dir);
-
-	d = path->dentry;
-	path->dentry = d->d_parent;
-	err = security_path_mkdir(path, d, mode);
-	path->dentry = d;
-	if (unlikely(err))
-		goto out;
-
-	lockdep_off();
-	err = vfs_mkdir(dir, path->dentry, mode);
-	lockdep_on();
-	if (!err) {
-		struct path tmp = *path;
-		int did;
-
-		vfsub_update_h_iattr(&tmp, &did);
-		if (did) {
-			tmp.dentry = path->dentry->d_parent;
-			vfsub_update_h_iattr(&tmp, /*did*/NULL);
-		}
-		/*ignore*/
-	}
-
-out:
-	return err;
-}
-
-int vfsub_rmdir(struct inode *dir, struct path *path)
-{
-	int err;
-	struct dentry *d;
-
-	IMustLock(dir);
-
-	d = path->dentry;
-	path->dentry = d->d_parent;
-	err = security_path_rmdir(path, d);
-	path->dentry = d;
-	if (unlikely(err))
-		goto out;
-
-	lockdep_off();
-	err = vfs_rmdir(dir, path->dentry);
-	lockdep_on();
-	if (!err) {
-		struct path tmp = {
-			.dentry	= path->dentry->d_parent,
-			.mnt	= path->mnt
-		};
-
-		vfsub_update_h_iattr(&tmp, /*did*/NULL); /*ignore*/
-	}
-
-out:
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* todo: support mmap_sem? */
-ssize_t vfsub_read_u(struct file *file, char __user *ubuf, size_t count,
-		     loff_t *ppos)
-{
-	ssize_t err;
-
-	lockdep_off();
-	err = vfs_read(file, ubuf, count, ppos);
-	lockdep_on();
-	if (err >= 0)
-		vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/
-	return err;
-}
-
-/* todo: kernel_read()? */
-ssize_t vfsub_read_k(struct file *file, void *kbuf, size_t count,
-		     loff_t *ppos)
-{
-	ssize_t err;
-	mm_segment_t oldfs;
-	union {
-		void *k;
-		char __user *u;
-	} buf;
-
-	buf.k = kbuf;
-	oldfs = get_fs();
-	set_fs(KERNEL_DS);
-	err = vfsub_read_u(file, buf.u, count, ppos);
-	set_fs(oldfs);
-	return err;
-}
-
-ssize_t vfsub_write_u(struct file *file, const char __user *ubuf, size_t count,
-		      loff_t *ppos)
-{
-	ssize_t err;
-
-	lockdep_off();
-	err = vfs_write(file, ubuf, count, ppos);
-	lockdep_on();
-	if (err >= 0)
-		vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/
-	return err;
-}
-
-ssize_t vfsub_write_k(struct file *file, void *kbuf, size_t count, loff_t *ppos)
-{
-	ssize_t err;
-	mm_segment_t oldfs;
-	union {
-		void *k;
-		const char __user *u;
-	} buf;
-
-	buf.k = kbuf;
-	oldfs = get_fs();
-	set_fs(KERNEL_DS);
-	err = vfsub_write_u(file, buf.u, count, ppos);
-	set_fs(oldfs);
-	return err;
-}
-
-int vfsub_flush(struct file *file, fl_owner_t id)
-{
-	int err;
-
-	err = 0;
-	if (file->f_op->flush) {
-		if (!au_test_nfs(file->f_path.dentry->d_sb))
-			err = file->f_op->flush(file, id);
-		else {
-			lockdep_off();
-			err = file->f_op->flush(file, id);
-			lockdep_on();
-		}
-		if (!err)
-			vfsub_update_h_iattr(&file->f_path, /*did*/NULL);
-		/*ignore*/
-	}
-	return err;
-}
-
-int vfsub_iterate_dir(struct file *file, struct dir_context *ctx)
-{
-	int err;
-
-	AuDbg("%pD, ctx{%pf, %llu}\n", file, ctx->actor, ctx->pos);
-
-	lockdep_off();
-	err = iterate_dir(file, ctx);
-	lockdep_on();
-	if (err >= 0)
-		vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/
-	return err;
-}
-
-long vfsub_splice_to(struct file *in, loff_t *ppos,
-		     struct pipe_inode_info *pipe, size_t len,
-		     unsigned int flags)
-{
-	long err;
-
-	lockdep_off();
-	err = do_splice_to(in, ppos, pipe, len, flags);
-	lockdep_on();
-	file_accessed(in);
-	if (err >= 0)
-		vfsub_update_h_iattr(&in->f_path, /*did*/NULL); /*ignore*/
-	return err;
-}
-
-long vfsub_splice_from(struct pipe_inode_info *pipe, struct file *out,
-		       loff_t *ppos, size_t len, unsigned int flags)
-{
-	long err;
-
-	lockdep_off();
-	err = do_splice_from(pipe, out, ppos, len, flags);
-	lockdep_on();
-	if (err >= 0)
-		vfsub_update_h_iattr(&out->f_path, /*did*/NULL); /*ignore*/
-	return err;
-}
-
-int vfsub_fsync(struct file *file, struct path *path, int datasync)
-{
-	int err;
-
-	/* file can be NULL */
-	lockdep_off();
-	err = vfs_fsync(file, datasync);
-	lockdep_on();
-	if (!err) {
-		if (!path) {
-			AuDebugOn(!file);
-			path = &file->f_path;
-		}
-		vfsub_update_h_iattr(path, /*did*/NULL); /*ignore*/
-	}
-	return err;
-}
-
-/* cf. open.c:do_sys_truncate() and do_sys_ftruncate() */
-int vfsub_trunc(struct path *h_path, loff_t length, unsigned int attr,
-		struct file *h_file)
-{
-	int err;
-	struct inode *h_inode;
-	struct super_block *h_sb;
-
-	if (!h_file) {
-		err = vfsub_truncate(h_path, length);
-		goto out;
-	}
-
-	h_inode = d_inode(h_path->dentry);
-	h_sb = h_inode->i_sb;
-	lockdep_off();
-	sb_start_write(h_sb);
-	lockdep_on();
-	err = locks_verify_truncate(h_inode, h_file, length);
-	if (!err)
-		err = security_path_truncate(h_path);
-	if (!err) {
-		lockdep_off();
-		err = do_truncate(h_path->dentry, length, attr, h_file);
-		lockdep_on();
-	}
-	lockdep_off();
-	sb_end_write(h_sb);
-	lockdep_on();
-
-out:
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-struct au_vfsub_mkdir_args {
-	int *errp;
-	struct inode *dir;
-	struct path *path;
-	int mode;
-};
-
-static void au_call_vfsub_mkdir(void *args)
-{
-	struct au_vfsub_mkdir_args *a = args;
-	*a->errp = vfsub_mkdir(a->dir, a->path, a->mode);
-}
-
-int vfsub_sio_mkdir(struct inode *dir, struct path *path, int mode)
-{
-	int err, do_sio, wkq_err;
-
-	do_sio = au_test_h_perm_sio(dir, MAY_EXEC | MAY_WRITE);
-	if (!do_sio) {
-		lockdep_off();
-		err = vfsub_mkdir(dir, path, mode);
-		lockdep_on();
-	} else {
-		struct au_vfsub_mkdir_args args = {
-			.errp	= &err,
-			.dir	= dir,
-			.path	= path,
-			.mode	= mode
-		};
-		wkq_err = au_wkq_wait(au_call_vfsub_mkdir, &args);
-		if (unlikely(wkq_err))
-			err = wkq_err;
-	}
-
-	return err;
-}
-
-struct au_vfsub_rmdir_args {
-	int *errp;
-	struct inode *dir;
-	struct path *path;
-};
-
-static void au_call_vfsub_rmdir(void *args)
-{
-	struct au_vfsub_rmdir_args *a = args;
-	*a->errp = vfsub_rmdir(a->dir, a->path);
-}
-
-int vfsub_sio_rmdir(struct inode *dir, struct path *path)
-{
-	int err, do_sio, wkq_err;
-
-	do_sio = au_test_h_perm_sio(dir, MAY_EXEC | MAY_WRITE);
-	if (!do_sio) {
-		lockdep_off();
-		err = vfsub_rmdir(dir, path);
-		lockdep_on();
-	} else {
-		struct au_vfsub_rmdir_args args = {
-			.errp	= &err,
-			.dir	= dir,
-			.path	= path
-		};
-		wkq_err = au_wkq_wait(au_call_vfsub_rmdir, &args);
-		if (unlikely(wkq_err))
-			err = wkq_err;
-	}
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-struct notify_change_args {
-	int *errp;
-	struct path *path;
-	struct iattr *ia;
-	struct inode **delegated_inode;
-};
-
-static void call_notify_change(void *args)
-{
-	struct notify_change_args *a = args;
-	struct inode *h_inode;
-
-	h_inode = d_inode(a->path->dentry);
-	IMustLock(h_inode);
-
-	*a->errp = -EPERM;
-	if (!IS_IMMUTABLE(h_inode) && !IS_APPEND(h_inode)) {
-		lockdep_off();
-		*a->errp = notify_change(a->path->dentry, a->ia,
-					 a->delegated_inode);
-		lockdep_on();
-		if (!*a->errp)
-			vfsub_update_h_iattr(a->path, /*did*/NULL); /*ignore*/
-	}
-	AuTraceErr(*a->errp);
-}
-
-int vfsub_notify_change(struct path *path, struct iattr *ia,
-			struct inode **delegated_inode)
-{
-	int err;
-	struct notify_change_args args = {
-		.errp			= &err,
-		.path			= path,
-		.ia			= ia,
-		.delegated_inode	= delegated_inode
-	};
-
-	call_notify_change(&args);
-
-	return err;
-}
-
-int vfsub_sio_notify_change(struct path *path, struct iattr *ia,
-			    struct inode **delegated_inode)
-{
-	int err, wkq_err;
-	struct notify_change_args args = {
-		.errp			= &err,
-		.path			= path,
-		.ia			= ia,
-		.delegated_inode	= delegated_inode
-	};
-
-	wkq_err = au_wkq_wait(call_notify_change, &args);
-	if (unlikely(wkq_err))
-		err = wkq_err;
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-struct unlink_args {
-	int *errp;
-	struct inode *dir;
-	struct path *path;
-	struct inode **delegated_inode;
-};
-
-static void call_unlink(void *args)
-{
-	struct unlink_args *a = args;
-	struct dentry *d = a->path->dentry;
-	struct inode *h_inode;
-	const int stop_sillyrename = (au_test_nfs(d->d_sb)
-				      && au_dcount(d) == 1);
-
-	IMustLock(a->dir);
-
-	a->path->dentry = d->d_parent;
-	*a->errp = security_path_unlink(a->path, d);
-	a->path->dentry = d;
-	if (unlikely(*a->errp))
-		return;
-
-	if (!stop_sillyrename)
-		dget(d);
-	h_inode = NULL;
-	if (d_is_positive(d)) {
-		h_inode = d_inode(d);
-		ihold(h_inode);
-	}
-
-	lockdep_off();
-	*a->errp = vfs_unlink(a->dir, d, a->delegated_inode);
-	lockdep_on();
-	if (!*a->errp) {
-		struct path tmp = {
-			.dentry = d->d_parent,
-			.mnt	= a->path->mnt
-		};
-		vfsub_update_h_iattr(&tmp, /*did*/NULL); /*ignore*/
-	}
-
-	if (!stop_sillyrename)
-		dput(d);
-	if (h_inode)
-		iput(h_inode);
-
-	AuTraceErr(*a->errp);
-}
-
-/*
- * @dir: must be locked.
- * @dentry: target dentry.
- */
-int vfsub_unlink(struct inode *dir, struct path *path,
-		 struct inode **delegated_inode, int force)
-{
-	int err;
-	struct unlink_args args = {
-		.errp			= &err,
-		.dir			= dir,
-		.path			= path,
-		.delegated_inode	= delegated_inode
-	};
-
-	if (!force)
-		call_unlink(&args);
-	else {
-		int wkq_err;
-
-		wkq_err = au_wkq_wait(call_unlink, &args);
-		if (unlikely(wkq_err))
-			err = wkq_err;
-	}
-
-	return err;
-}
diff --git a/fs/aufs/vfsub.h b/fs/aufs/vfsub.h
deleted file mode 100644
index f2e1c49af..000000000
--- a/fs/aufs/vfsub.h
+++ /dev/null
@@ -1,295 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * sub-routines for VFS
- */
-
-#ifndef __AUFS_VFSUB_H__
-#define __AUFS_VFSUB_H__
-
-#ifdef __KERNEL__
-
-#include <linux/fs.h>
-#include <linux/mount.h>
-#include <linux/posix_acl.h>
-#include <linux/xattr.h>
-#include "debug.h"
-
-/* copied from linux/fs/internal.h */
-/* todo: BAD approach!! */
-extern void __mnt_drop_write(struct vfsmount *);
-extern int open_check_o_direct(struct file *f);
-
-/* ---------------------------------------------------------------------- */
-
-/* lock subclass for lower inode */
-/* default MAX_LOCKDEP_SUBCLASSES(8) is not enough */
-/* reduce? gave up. */
-enum {
-	AuLsc_I_Begin = I_MUTEX_PARENT2, /* 5 */
-	AuLsc_I_PARENT,		/* lower inode, parent first */
-	AuLsc_I_PARENT2,	/* copyup dirs */
-	AuLsc_I_PARENT3,	/* copyup wh */
-	AuLsc_I_CHILD,
-	AuLsc_I_CHILD2,
-	AuLsc_I_End
-};
-
-/* to debug easier, do not make them inlined functions */
-#define MtxMustLock(mtx)	AuDebugOn(!mutex_is_locked(mtx))
-#define IMustLock(i)		MtxMustLock(&(i)->i_mutex)
-
-/* ---------------------------------------------------------------------- */
-
-static inline void vfsub_drop_nlink(struct inode *inode)
-{
-	AuDebugOn(!inode->i_nlink);
-	drop_nlink(inode);
-}
-
-static inline void vfsub_dead_dir(struct inode *inode)
-{
-	AuDebugOn(!S_ISDIR(inode->i_mode));
-	inode->i_flags |= S_DEAD;
-	clear_nlink(inode);
-}
-
-static inline int vfsub_native_ro(struct inode *inode)
-{
-	return (inode->i_sb->s_flags & MS_RDONLY)
-		|| IS_RDONLY(inode)
-		/* || IS_APPEND(inode) */
-		|| IS_IMMUTABLE(inode);
-}
-
-#ifdef CONFIG_AUFS_BR_FUSE
-int vfsub_test_mntns(struct vfsmount *mnt, struct super_block *h_sb);
-#else
-AuStubInt0(vfsub_test_mntns, struct vfsmount *mnt, struct super_block *h_sb);
-#endif
-
-/* ---------------------------------------------------------------------- */
-
-int vfsub_update_h_iattr(struct path *h_path, int *did);
-struct file *vfsub_dentry_open(struct path *path, int flags);
-struct file *vfsub_filp_open(const char *path, int oflags, int mode);
-struct vfsub_aopen_args {
-	struct file	*file;
-	unsigned int	open_flag;
-	umode_t		create_mode;
-	int		*opened;
-};
-struct au_branch;
-int vfsub_atomic_open(struct inode *dir, struct dentry *dentry,
-		      struct vfsub_aopen_args *args, struct au_branch *br);
-int vfsub_kern_path(const char *name, unsigned int flags, struct path *path);
-
-struct dentry *vfsub_lookup_one_len(const char *name, struct dentry *parent,
-				    int len);
-
-struct vfsub_lkup_one_args {
-	struct dentry **errp;
-	struct qstr *name;
-	struct dentry *parent;
-};
-
-static inline struct dentry *vfsub_lkup_one(struct qstr *name,
-					    struct dentry *parent)
-{
-	return vfsub_lookup_one_len(name->name, parent, name->len);
-}
-
-void vfsub_call_lkup_one(void *args);
-
-/* ---------------------------------------------------------------------- */
-
-static inline int vfsub_mnt_want_write(struct vfsmount *mnt)
-{
-	int err;
-
-	lockdep_off();
-	err = mnt_want_write(mnt);
-	lockdep_on();
-	return err;
-}
-
-static inline void vfsub_mnt_drop_write(struct vfsmount *mnt)
-{
-	lockdep_off();
-	mnt_drop_write(mnt);
-	lockdep_on();
-}
-
-#if 0 /* reserved */
-static inline void vfsub_mnt_drop_write_file(struct file *file)
-{
-	lockdep_off();
-	mnt_drop_write_file(file);
-	lockdep_on();
-}
-#endif
-
-/* ---------------------------------------------------------------------- */
-
-struct au_hinode;
-struct dentry *vfsub_lock_rename(struct dentry *d1, struct au_hinode *hdir1,
-				 struct dentry *d2, struct au_hinode *hdir2);
-void vfsub_unlock_rename(struct dentry *d1, struct au_hinode *hdir1,
-			 struct dentry *d2, struct au_hinode *hdir2);
-
-int vfsub_create(struct inode *dir, struct path *path, int mode,
-		 bool want_excl);
-int vfsub_symlink(struct inode *dir, struct path *path,
-		  const char *symname);
-int vfsub_mknod(struct inode *dir, struct path *path, int mode, dev_t dev);
-int vfsub_link(struct dentry *src_dentry, struct inode *dir,
-	       struct path *path, struct inode **delegated_inode);
-int vfsub_rename(struct inode *src_hdir, struct dentry *src_dentry,
-		 struct inode *hdir, struct path *path,
-		 struct inode **delegated_inode);
-int vfsub_mkdir(struct inode *dir, struct path *path, int mode);
-int vfsub_rmdir(struct inode *dir, struct path *path);
-
-/* ---------------------------------------------------------------------- */
-
-ssize_t vfsub_read_u(struct file *file, char __user *ubuf, size_t count,
-		     loff_t *ppos);
-ssize_t vfsub_read_k(struct file *file, void *kbuf, size_t count,
-			loff_t *ppos);
-ssize_t vfsub_write_u(struct file *file, const char __user *ubuf, size_t count,
-		      loff_t *ppos);
-ssize_t vfsub_write_k(struct file *file, void *kbuf, size_t count,
-		      loff_t *ppos);
-int vfsub_flush(struct file *file, fl_owner_t id);
-int vfsub_iterate_dir(struct file *file, struct dir_context *ctx);
-
-static inline loff_t vfsub_f_size_read(struct file *file)
-{
-	return i_size_read(file_inode(file));
-}
-
-static inline unsigned int vfsub_file_flags(struct file *file)
-{
-	unsigned int flags;
-
-	spin_lock(&file->f_lock);
-	flags = file->f_flags;
-	spin_unlock(&file->f_lock);
-
-	return flags;
-}
-
-#if 0 /* reserved */
-static inline void vfsub_file_accessed(struct file *h_file)
-{
-	file_accessed(h_file);
-	vfsub_update_h_iattr(&h_file->f_path, /*did*/NULL); /*ignore*/
-}
-#endif
-
-#if 0 /* reserved */
-static inline void vfsub_touch_atime(struct vfsmount *h_mnt,
-				     struct dentry *h_dentry)
-{
-	struct path h_path = {
-		.dentry	= h_dentry,
-		.mnt	= h_mnt
-	};
-	touch_atime(&h_path);
-	vfsub_update_h_iattr(&h_path, /*did*/NULL); /*ignore*/
-}
-#endif
-
-static inline int vfsub_update_time(struct inode *h_inode, struct timespec *ts,
-				    int flags)
-{
-	return generic_update_time(h_inode, ts, flags);
-	/* no vfsub_update_h_iattr() since we don't have struct path */
-}
-
-#ifdef CONFIG_FS_POSIX_ACL
-static inline int vfsub_acl_chmod(struct inode *h_inode, umode_t h_mode)
-{
-	int err;
-
-	err = posix_acl_chmod(h_inode, h_mode);
-	if (err == -EOPNOTSUPP)
-		err = 0;
-	return err;
-}
-#else
-AuStubInt0(vfsub_acl_chmod, struct inode *h_inode, umode_t h_mode);
-#endif
-
-long vfsub_splice_to(struct file *in, loff_t *ppos,
-		     struct pipe_inode_info *pipe, size_t len,
-		     unsigned int flags);
-long vfsub_splice_from(struct pipe_inode_info *pipe, struct file *out,
-		       loff_t *ppos, size_t len, unsigned int flags);
-
-static inline long vfsub_truncate(struct path *path, loff_t length)
-{
-	long err;
-
-	lockdep_off();
-	err = vfs_truncate(path, length);
-	lockdep_on();
-	return err;
-}
-
-int vfsub_trunc(struct path *h_path, loff_t length, unsigned int attr,
-		struct file *h_file);
-int vfsub_fsync(struct file *file, struct path *path, int datasync);
-
-/* ---------------------------------------------------------------------- */
-
-static inline loff_t vfsub_llseek(struct file *file, loff_t offset, int origin)
-{
-	loff_t err;
-
-	lockdep_off();
-	err = vfs_llseek(file, offset, origin);
-	lockdep_on();
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int vfsub_sio_mkdir(struct inode *dir, struct path *path, int mode);
-int vfsub_sio_rmdir(struct inode *dir, struct path *path);
-int vfsub_sio_notify_change(struct path *path, struct iattr *ia,
-			    struct inode **delegated_inode);
-int vfsub_notify_change(struct path *path, struct iattr *ia,
-			struct inode **delegated_inode);
-int vfsub_unlink(struct inode *dir, struct path *path,
-		 struct inode **delegated_inode, int force);
-
-/* ---------------------------------------------------------------------- */
-
-static inline int vfsub_setxattr(struct dentry *dentry, const char *name,
-				 const void *value, size_t size, int flags)
-{
-	int err;
-
-	lockdep_off();
-	err = vfs_setxattr(dentry, name, value, size, flags);
-	lockdep_on();
-
-	return err;
-}
-
-static inline int vfsub_removexattr(struct dentry *dentry, const char *name)
-{
-	int err;
-
-	lockdep_off();
-	err = vfs_removexattr(dentry, name);
-	lockdep_on();
-
-	return err;
-}
-
-#endif /* __KERNEL__ */
-#endif /* __AUFS_VFSUB_H__ */
diff --git a/fs/aufs/wbr_policy.c b/fs/aufs/wbr_policy.c
deleted file mode 100644
index c822b428d..000000000
--- a/fs/aufs/wbr_policy.c
+++ /dev/null
@@ -1,752 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * policies for selecting one among multiple writable branches
- */
-
-#include <linux/statfs.h>
-#include "aufs.h"
-
-/* subset of cpup_attr() */
-static noinline_for_stack
-int au_cpdown_attr(struct path *h_path, struct dentry *h_src)
-{
-	int err, sbits;
-	struct iattr ia;
-	struct inode *h_isrc;
-
-	h_isrc = d_inode(h_src);
-	ia.ia_valid = ATTR_FORCE | ATTR_MODE | ATTR_UID | ATTR_GID;
-	ia.ia_mode = h_isrc->i_mode;
-	ia.ia_uid = h_isrc->i_uid;
-	ia.ia_gid = h_isrc->i_gid;
-	sbits = !!(ia.ia_mode & (S_ISUID | S_ISGID));
-	au_cpup_attr_flags(d_inode(h_path->dentry), h_isrc->i_flags);
-	/* no delegation since it is just created */
-	err = vfsub_sio_notify_change(h_path, &ia, /*delegated*/NULL);
-
-	/* is this nfs only? */
-	if (!err && sbits && au_test_nfs(h_path->dentry->d_sb)) {
-		ia.ia_valid = ATTR_FORCE | ATTR_MODE;
-		ia.ia_mode = h_isrc->i_mode;
-		err = vfsub_sio_notify_change(h_path, &ia, /*delegated*/NULL);
-	}
-
-	return err;
-}
-
-#define AuCpdown_PARENT_OPQ	1
-#define AuCpdown_WHED		(1 << 1)
-#define AuCpdown_MADE_DIR	(1 << 2)
-#define AuCpdown_DIROPQ		(1 << 3)
-#define au_ftest_cpdown(flags, name)	((flags) & AuCpdown_##name)
-#define au_fset_cpdown(flags, name) \
-	do { (flags) |= AuCpdown_##name; } while (0)
-#define au_fclr_cpdown(flags, name) \
-	do { (flags) &= ~AuCpdown_##name; } while (0)
-
-static int au_cpdown_dir_opq(struct dentry *dentry, aufs_bindex_t bdst,
-			     unsigned int *flags)
-{
-	int err;
-	struct dentry *opq_dentry;
-
-	opq_dentry = au_diropq_create(dentry, bdst);
-	err = PTR_ERR(opq_dentry);
-	if (IS_ERR(opq_dentry))
-		goto out;
-	dput(opq_dentry);
-	au_fset_cpdown(*flags, DIROPQ);
-
-out:
-	return err;
-}
-
-static int au_cpdown_dir_wh(struct dentry *dentry, struct dentry *h_parent,
-			    struct inode *dir, aufs_bindex_t bdst)
-{
-	int err;
-	struct path h_path;
-	struct au_branch *br;
-
-	br = au_sbr(dentry->d_sb, bdst);
-	h_path.dentry = au_wh_lkup(h_parent, &dentry->d_name, br);
-	err = PTR_ERR(h_path.dentry);
-	if (IS_ERR(h_path.dentry))
-		goto out;
-
-	err = 0;
-	if (d_is_positive(h_path.dentry)) {
-		h_path.mnt = au_br_mnt(br);
-		err = au_wh_unlink_dentry(au_h_iptr(dir, bdst), &h_path,
-					  dentry);
-	}
-	dput(h_path.dentry);
-
-out:
-	return err;
-}
-
-static int au_cpdown_dir(struct dentry *dentry, aufs_bindex_t bdst,
-			 struct au_pin *pin,
-			 struct dentry *h_parent, void *arg)
-{
-	int err, rerr;
-	aufs_bindex_t bopq, bstart;
-	struct path h_path;
-	struct dentry *parent;
-	struct inode *h_dir, *h_inode, *inode, *dir;
-	unsigned int *flags = arg;
-
-	bstart = au_dbstart(dentry);
-	/* dentry is di-locked */
-	parent = dget_parent(dentry);
-	dir = d_inode(parent);
-	h_dir = d_inode(h_parent);
-	AuDebugOn(h_dir != au_h_iptr(dir, bdst));
-	IMustLock(h_dir);
-
-	err = au_lkup_neg(dentry, bdst, /*wh*/0);
-	if (unlikely(err < 0))
-		goto out;
-	h_path.dentry = au_h_dptr(dentry, bdst);
-	h_path.mnt = au_sbr_mnt(dentry->d_sb, bdst);
-	err = vfsub_sio_mkdir(au_h_iptr(dir, bdst), &h_path,
-			      S_IRWXU | S_IRUGO | S_IXUGO);
-	if (unlikely(err))
-		goto out_put;
-	au_fset_cpdown(*flags, MADE_DIR);
-
-	bopq = au_dbdiropq(dentry);
-	au_fclr_cpdown(*flags, WHED);
-	au_fclr_cpdown(*flags, DIROPQ);
-	if (au_dbwh(dentry) == bdst)
-		au_fset_cpdown(*flags, WHED);
-	if (!au_ftest_cpdown(*flags, PARENT_OPQ) && bopq <= bdst)
-		au_fset_cpdown(*flags, PARENT_OPQ);
-	h_inode = d_inode(h_path.dentry);
-	mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD);
-	if (au_ftest_cpdown(*flags, WHED)) {
-		err = au_cpdown_dir_opq(dentry, bdst, flags);
-		if (unlikely(err)) {
-			mutex_unlock(&h_inode->i_mutex);
-			goto out_dir;
-		}
-	}
-
-	err = au_cpdown_attr(&h_path, au_h_dptr(dentry, bstart));
-	mutex_unlock(&h_inode->i_mutex);
-	if (unlikely(err))
-		goto out_opq;
-
-	if (au_ftest_cpdown(*flags, WHED)) {
-		err = au_cpdown_dir_wh(dentry, h_parent, dir, bdst);
-		if (unlikely(err))
-			goto out_opq;
-	}
-
-	inode = d_inode(dentry);
-	if (au_ibend(inode) < bdst)
-		au_set_ibend(inode, bdst);
-	au_set_h_iptr(inode, bdst, au_igrab(h_inode),
-		      au_hi_flags(inode, /*isdir*/1));
-	au_fhsm_wrote(dentry->d_sb, bdst, /*force*/0);
-	goto out; /* success */
-
-	/* revert */
-out_opq:
-	if (au_ftest_cpdown(*flags, DIROPQ)) {
-		mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD);
-		rerr = au_diropq_remove(dentry, bdst);
-		mutex_unlock(&h_inode->i_mutex);
-		if (unlikely(rerr)) {
-			AuIOErr("failed removing diropq for %pd b%d (%d)\n",
-				dentry, bdst, rerr);
-			err = -EIO;
-			goto out;
-		}
-	}
-out_dir:
-	if (au_ftest_cpdown(*flags, MADE_DIR)) {
-		rerr = vfsub_sio_rmdir(au_h_iptr(dir, bdst), &h_path);
-		if (unlikely(rerr)) {
-			AuIOErr("failed removing %pd b%d (%d)\n",
-				dentry, bdst, rerr);
-			err = -EIO;
-		}
-	}
-out_put:
-	au_set_h_dptr(dentry, bdst, NULL);
-	if (au_dbend(dentry) == bdst)
-		au_update_dbend(dentry);
-out:
-	dput(parent);
-	return err;
-}
-
-int au_cpdown_dirs(struct dentry *dentry, aufs_bindex_t bdst)
-{
-	int err;
-	unsigned int flags;
-
-	flags = 0;
-	err = au_cp_dirs(dentry, bdst, au_cpdown_dir, &flags);
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* policies for create */
-
-int au_wbr_nonopq(struct dentry *dentry, aufs_bindex_t bindex)
-{
-	int err, i, j, ndentry;
-	aufs_bindex_t bopq;
-	struct au_dcsub_pages dpages;
-	struct au_dpage *dpage;
-	struct dentry **dentries, *parent, *d;
-
-	err = au_dpages_init(&dpages, GFP_NOFS);
-	if (unlikely(err))
-		goto out;
-	parent = dget_parent(dentry);
-	err = au_dcsub_pages_rev_aufs(&dpages, parent, /*do_include*/0);
-	if (unlikely(err))
-		goto out_free;
-
-	err = bindex;
-	for (i = 0; i < dpages.ndpage; i++) {
-		dpage = dpages.dpages + i;
-		dentries = dpage->dentries;
-		ndentry = dpage->ndentry;
-		for (j = 0; j < ndentry; j++) {
-			d = dentries[j];
-			di_read_lock_parent2(d, !AuLock_IR);
-			bopq = au_dbdiropq(d);
-			di_read_unlock(d, !AuLock_IR);
-			if (bopq >= 0 && bopq < err)
-				err = bopq;
-		}
-	}
-
-out_free:
-	dput(parent);
-	au_dpages_free(&dpages);
-out:
-	return err;
-}
-
-static int au_wbr_bu(struct super_block *sb, aufs_bindex_t bindex)
-{
-	for (; bindex >= 0; bindex--)
-		if (!au_br_rdonly(au_sbr(sb, bindex)))
-			return bindex;
-	return -EROFS;
-}
-
-/* top down parent */
-static int au_wbr_create_tdp(struct dentry *dentry,
-			     unsigned int flags __maybe_unused)
-{
-	int err;
-	aufs_bindex_t bstart, bindex;
-	struct super_block *sb;
-	struct dentry *parent, *h_parent;
-
-	sb = dentry->d_sb;
-	bstart = au_dbstart(dentry);
-	err = bstart;
-	if (!au_br_rdonly(au_sbr(sb, bstart)))
-		goto out;
-
-	err = -EROFS;
-	parent = dget_parent(dentry);
-	for (bindex = au_dbstart(parent); bindex < bstart; bindex++) {
-		h_parent = au_h_dptr(parent, bindex);
-		if (!h_parent || d_is_negative(h_parent))
-			continue;
-
-		if (!au_br_rdonly(au_sbr(sb, bindex))) {
-			err = bindex;
-			break;
-		}
-	}
-	dput(parent);
-
-	/* bottom up here */
-	if (unlikely(err < 0)) {
-		err = au_wbr_bu(sb, bstart - 1);
-		if (err >= 0)
-			err = au_wbr_nonopq(dentry, err);
-	}
-
-out:
-	AuDbg("b%d\n", err);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* an exception for the policy other than tdp */
-static int au_wbr_create_exp(struct dentry *dentry)
-{
-	int err;
-	aufs_bindex_t bwh, bdiropq;
-	struct dentry *parent;
-
-	err = -1;
-	bwh = au_dbwh(dentry);
-	parent = dget_parent(dentry);
-	bdiropq = au_dbdiropq(parent);
-	if (bwh >= 0) {
-		if (bdiropq >= 0)
-			err = min(bdiropq, bwh);
-		else
-			err = bwh;
-		AuDbg("%d\n", err);
-	} else if (bdiropq >= 0) {
-		err = bdiropq;
-		AuDbg("%d\n", err);
-	}
-	dput(parent);
-
-	if (err >= 0)
-		err = au_wbr_nonopq(dentry, err);
-
-	if (err >= 0 && au_br_rdonly(au_sbr(dentry->d_sb, err)))
-		err = -1;
-
-	AuDbg("%d\n", err);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* round robin */
-static int au_wbr_create_init_rr(struct super_block *sb)
-{
-	int err;
-
-	err = au_wbr_bu(sb, au_sbend(sb));
-	atomic_set(&au_sbi(sb)->si_wbr_rr_next, -err); /* less important */
-	/* smp_mb(); */
-
-	AuDbg("b%d\n", err);
-	return err;
-}
-
-static int au_wbr_create_rr(struct dentry *dentry, unsigned int flags)
-{
-	int err, nbr;
-	unsigned int u;
-	aufs_bindex_t bindex, bend;
-	struct super_block *sb;
-	atomic_t *next;
-
-	err = au_wbr_create_exp(dentry);
-	if (err >= 0)
-		goto out;
-
-	sb = dentry->d_sb;
-	next = &au_sbi(sb)->si_wbr_rr_next;
-	bend = au_sbend(sb);
-	nbr = bend + 1;
-	for (bindex = 0; bindex <= bend; bindex++) {
-		if (!au_ftest_wbr(flags, DIR)) {
-			err = atomic_dec_return(next) + 1;
-			/* modulo for 0 is meaningless */
-			if (unlikely(!err))
-				err = atomic_dec_return(next) + 1;
-		} else
-			err = atomic_read(next);
-		AuDbg("%d\n", err);
-		u = err;
-		err = u % nbr;
-		AuDbg("%d\n", err);
-		if (!au_br_rdonly(au_sbr(sb, err)))
-			break;
-		err = -EROFS;
-	}
-
-	if (err >= 0)
-		err = au_wbr_nonopq(dentry, err);
-
-out:
-	AuDbg("%d\n", err);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* most free space */
-static void au_mfs(struct dentry *dentry, struct dentry *parent)
-{
-	struct super_block *sb;
-	struct au_branch *br;
-	struct au_wbr_mfs *mfs;
-	struct dentry *h_parent;
-	aufs_bindex_t bindex, bend;
-	int err;
-	unsigned long long b, bavail;
-	struct path h_path;
-	/* reduce the stack usage */
-	struct kstatfs *st;
-
-	st = kmalloc(sizeof(*st), GFP_NOFS);
-	if (unlikely(!st)) {
-		AuWarn1("failed updating mfs(%d), ignored\n", -ENOMEM);
-		return;
-	}
-
-	bavail = 0;
-	sb = dentry->d_sb;
-	mfs = &au_sbi(sb)->si_wbr_mfs;
-	MtxMustLock(&mfs->mfs_lock);
-	mfs->mfs_bindex = -EROFS;
-	mfs->mfsrr_bytes = 0;
-	if (!parent) {
-		bindex = 0;
-		bend = au_sbend(sb);
-	} else {
-		bindex = au_dbstart(parent);
-		bend = au_dbtaildir(parent);
-	}
-
-	for (; bindex <= bend; bindex++) {
-		if (parent) {
-			h_parent = au_h_dptr(parent, bindex);
-			if (!h_parent || d_is_negative(h_parent))
-				continue;
-		}
-		br = au_sbr(sb, bindex);
-		if (au_br_rdonly(br))
-			continue;
-
-		/* sb->s_root for NFS is unreliable */
-		h_path.mnt = au_br_mnt(br);
-		h_path.dentry = h_path.mnt->mnt_root;
-		err = vfs_statfs(&h_path, st);
-		if (unlikely(err)) {
-			AuWarn1("failed statfs, b%d, %d\n", bindex, err);
-			continue;
-		}
-
-		/* when the available size is equal, select the lower one */
-		BUILD_BUG_ON(sizeof(b) < sizeof(st->f_bavail)
-			     || sizeof(b) < sizeof(st->f_bsize));
-		b = st->f_bavail * st->f_bsize;
-		br->br_wbr->wbr_bytes = b;
-		if (b >= bavail) {
-			bavail = b;
-			mfs->mfs_bindex = bindex;
-			mfs->mfs_jiffy = jiffies;
-		}
-	}
-
-	mfs->mfsrr_bytes = bavail;
-	AuDbg("b%d\n", mfs->mfs_bindex);
-	kfree(st);
-}
-
-static int au_wbr_create_mfs(struct dentry *dentry, unsigned int flags)
-{
-	int err;
-	struct dentry *parent;
-	struct super_block *sb;
-	struct au_wbr_mfs *mfs;
-
-	err = au_wbr_create_exp(dentry);
-	if (err >= 0)
-		goto out;
-
-	sb = dentry->d_sb;
-	parent = NULL;
-	if (au_ftest_wbr(flags, PARENT))
-		parent = dget_parent(dentry);
-	mfs = &au_sbi(sb)->si_wbr_mfs;
-	mutex_lock(&mfs->mfs_lock);
-	if (time_after(jiffies, mfs->mfs_jiffy + mfs->mfs_expire)
-	    || mfs->mfs_bindex < 0
-	    || au_br_rdonly(au_sbr(sb, mfs->mfs_bindex)))
-		au_mfs(dentry, parent);
-	mutex_unlock(&mfs->mfs_lock);
-	err = mfs->mfs_bindex;
-	dput(parent);
-
-	if (err >= 0)
-		err = au_wbr_nonopq(dentry, err);
-
-out:
-	AuDbg("b%d\n", err);
-	return err;
-}
-
-static int au_wbr_create_init_mfs(struct super_block *sb)
-{
-	struct au_wbr_mfs *mfs;
-
-	mfs = &au_sbi(sb)->si_wbr_mfs;
-	mutex_init(&mfs->mfs_lock);
-	mfs->mfs_jiffy = 0;
-	mfs->mfs_bindex = -EROFS;
-
-	return 0;
-}
-
-static int au_wbr_create_fin_mfs(struct super_block *sb __maybe_unused)
-{
-	mutex_destroy(&au_sbi(sb)->si_wbr_mfs.mfs_lock);
-	return 0;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* most free space and then round robin */
-static int au_wbr_create_mfsrr(struct dentry *dentry, unsigned int flags)
-{
-	int err;
-	struct au_wbr_mfs *mfs;
-
-	err = au_wbr_create_mfs(dentry, flags);
-	if (err >= 0) {
-		mfs = &au_sbi(dentry->d_sb)->si_wbr_mfs;
-		mutex_lock(&mfs->mfs_lock);
-		if (mfs->mfsrr_bytes < mfs->mfsrr_watermark)
-			err = au_wbr_create_rr(dentry, flags);
-		mutex_unlock(&mfs->mfs_lock);
-	}
-
-	AuDbg("b%d\n", err);
-	return err;
-}
-
-static int au_wbr_create_init_mfsrr(struct super_block *sb)
-{
-	int err;
-
-	au_wbr_create_init_mfs(sb); /* ignore */
-	err = au_wbr_create_init_rr(sb);
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* top down parent and most free space */
-static int au_wbr_create_pmfs(struct dentry *dentry, unsigned int flags)
-{
-	int err, e2;
-	unsigned long long b;
-	aufs_bindex_t bindex, bstart, bend;
-	struct super_block *sb;
-	struct dentry *parent, *h_parent;
-	struct au_branch *br;
-
-	err = au_wbr_create_tdp(dentry, flags);
-	if (unlikely(err < 0))
-		goto out;
-	parent = dget_parent(dentry);
-	bstart = au_dbstart(parent);
-	bend = au_dbtaildir(parent);
-	if (bstart == bend)
-		goto out_parent; /* success */
-
-	e2 = au_wbr_create_mfs(dentry, flags);
-	if (e2 < 0)
-		goto out_parent; /* success */
-
-	/* when the available size is equal, select upper one */
-	sb = dentry->d_sb;
-	br = au_sbr(sb, err);
-	b = br->br_wbr->wbr_bytes;
-	AuDbg("b%d, %llu\n", err, b);
-
-	for (bindex = bstart; bindex <= bend; bindex++) {
-		h_parent = au_h_dptr(parent, bindex);
-		if (!h_parent || d_is_negative(h_parent))
-			continue;
-
-		br = au_sbr(sb, bindex);
-		if (!au_br_rdonly(br) && br->br_wbr->wbr_bytes > b) {
-			b = br->br_wbr->wbr_bytes;
-			err = bindex;
-			AuDbg("b%d, %llu\n", err, b);
-		}
-	}
-
-	if (err >= 0)
-		err = au_wbr_nonopq(dentry, err);
-
-out_parent:
-	dput(parent);
-out:
-	AuDbg("b%d\n", err);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * - top down parent
- * - most free space with parent
- * - most free space round-robin regardless parent
- */
-static int au_wbr_create_pmfsrr(struct dentry *dentry, unsigned int flags)
-{
-	int err;
-	unsigned long long watermark;
-	struct super_block *sb;
-	struct au_branch *br;
-	struct au_wbr_mfs *mfs;
-
-	err = au_wbr_create_pmfs(dentry, flags | AuWbr_PARENT);
-	if (unlikely(err < 0))
-		goto out;
-
-	sb = dentry->d_sb;
-	br = au_sbr(sb, err);
-	mfs = &au_sbi(sb)->si_wbr_mfs;
-	mutex_lock(&mfs->mfs_lock);
-	watermark = mfs->mfsrr_watermark;
-	mutex_unlock(&mfs->mfs_lock);
-	if (br->br_wbr->wbr_bytes < watermark)
-		/* regardless the parent dir */
-		err = au_wbr_create_mfsrr(dentry, flags);
-
-out:
-	AuDbg("b%d\n", err);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* policies for copyup */
-
-/* top down parent */
-static int au_wbr_copyup_tdp(struct dentry *dentry)
-{
-	return au_wbr_create_tdp(dentry, /*flags, anything is ok*/0);
-}
-
-/* bottom up parent */
-static int au_wbr_copyup_bup(struct dentry *dentry)
-{
-	int err;
-	aufs_bindex_t bindex, bstart;
-	struct dentry *parent, *h_parent;
-	struct super_block *sb;
-
-	err = -EROFS;
-	sb = dentry->d_sb;
-	parent = dget_parent(dentry);
-	bstart = au_dbstart(parent);
-	for (bindex = au_dbstart(dentry); bindex >= bstart; bindex--) {
-		h_parent = au_h_dptr(parent, bindex);
-		if (!h_parent || d_is_negative(h_parent))
-			continue;
-
-		if (!au_br_rdonly(au_sbr(sb, bindex))) {
-			err = bindex;
-			break;
-		}
-	}
-	dput(parent);
-
-	/* bottom up here */
-	if (unlikely(err < 0))
-		err = au_wbr_bu(sb, bstart - 1);
-
-	AuDbg("b%d\n", err);
-	return err;
-}
-
-/* bottom up */
-int au_wbr_do_copyup_bu(struct dentry *dentry, aufs_bindex_t bstart)
-{
-	int err;
-
-	err = au_wbr_bu(dentry->d_sb, bstart);
-	AuDbg("b%d\n", err);
-	if (err > bstart)
-		err = au_wbr_nonopq(dentry, err);
-
-	AuDbg("b%d\n", err);
-	return err;
-}
-
-static int au_wbr_copyup_bu(struct dentry *dentry)
-{
-	int err;
-	aufs_bindex_t bstart;
-
-	bstart = au_dbstart(dentry);
-	err = au_wbr_do_copyup_bu(dentry, bstart);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-struct au_wbr_copyup_operations au_wbr_copyup_ops[] = {
-	[AuWbrCopyup_TDP] = {
-		.copyup	= au_wbr_copyup_tdp
-	},
-	[AuWbrCopyup_BUP] = {
-		.copyup	= au_wbr_copyup_bup
-	},
-	[AuWbrCopyup_BU] = {
-		.copyup	= au_wbr_copyup_bu
-	}
-};
-
-struct au_wbr_create_operations au_wbr_create_ops[] = {
-	[AuWbrCreate_TDP] = {
-		.create	= au_wbr_create_tdp
-	},
-	[AuWbrCreate_RR] = {
-		.create	= au_wbr_create_rr,
-		.init	= au_wbr_create_init_rr
-	},
-	[AuWbrCreate_MFS] = {
-		.create	= au_wbr_create_mfs,
-		.init	= au_wbr_create_init_mfs,
-		.fin	= au_wbr_create_fin_mfs
-	},
-	[AuWbrCreate_MFSV] = {
-		.create	= au_wbr_create_mfs,
-		.init	= au_wbr_create_init_mfs,
-		.fin	= au_wbr_create_fin_mfs
-	},
-	[AuWbrCreate_MFSRR] = {
-		.create	= au_wbr_create_mfsrr,
-		.init	= au_wbr_create_init_mfsrr,
-		.fin	= au_wbr_create_fin_mfs
-	},
-	[AuWbrCreate_MFSRRV] = {
-		.create	= au_wbr_create_mfsrr,
-		.init	= au_wbr_create_init_mfsrr,
-		.fin	= au_wbr_create_fin_mfs
-	},
-	[AuWbrCreate_PMFS] = {
-		.create	= au_wbr_create_pmfs,
-		.init	= au_wbr_create_init_mfs,
-		.fin	= au_wbr_create_fin_mfs
-	},
-	[AuWbrCreate_PMFSV] = {
-		.create	= au_wbr_create_pmfs,
-		.init	= au_wbr_create_init_mfs,
-		.fin	= au_wbr_create_fin_mfs
-	},
-	[AuWbrCreate_PMFSRR] = {
-		.create	= au_wbr_create_pmfsrr,
-		.init	= au_wbr_create_init_mfsrr,
-		.fin	= au_wbr_create_fin_mfs
-	},
-	[AuWbrCreate_PMFSRRV] = {
-		.create	= au_wbr_create_pmfsrr,
-		.init	= au_wbr_create_init_mfsrr,
-		.fin	= au_wbr_create_fin_mfs
-	}
-};
diff --git a/fs/aufs/whout.c b/fs/aufs/whout.c
deleted file mode 100644
index 04eb9af2b..000000000
--- a/fs/aufs/whout.c
+++ /dev/null
@@ -1,1047 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * whiteout for logical deletion and opaque directory
- */
-
-#include "aufs.h"
-
-#define WH_MASK			S_IRUGO
-
-/*
- * If a directory contains this file, then it is opaque.  We start with the
- * .wh. flag so that it is blocked by lookup.
- */
-static struct qstr diropq_name = QSTR_INIT(AUFS_WH_DIROPQ,
-					   sizeof(AUFS_WH_DIROPQ) - 1);
-
-/*
- * generate whiteout name, which is NOT terminated by NULL.
- * @name: original d_name.name
- * @len: original d_name.len
- * @wh: whiteout qstr
- * returns zero when succeeds, otherwise error.
- * succeeded value as wh->name should be freed by kfree().
- */
-int au_wh_name_alloc(struct qstr *wh, const struct qstr *name)
-{
-	char *p;
-
-	if (unlikely(name->len > PATH_MAX - AUFS_WH_PFX_LEN))
-		return -ENAMETOOLONG;
-
-	wh->len = name->len + AUFS_WH_PFX_LEN;
-	p = kmalloc(wh->len, GFP_NOFS);
-	wh->name = p;
-	if (p) {
-		memcpy(p, AUFS_WH_PFX, AUFS_WH_PFX_LEN);
-		memcpy(p + AUFS_WH_PFX_LEN, name->name, name->len);
-		/* smp_mb(); */
-		return 0;
-	}
-	return -ENOMEM;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * test if the @wh_name exists under @h_parent.
- * @try_sio specifies the necessary of super-io.
- */
-int au_wh_test(struct dentry *h_parent, struct qstr *wh_name, int try_sio)
-{
-	int err;
-	struct dentry *wh_dentry;
-
-	if (!try_sio)
-		wh_dentry = vfsub_lkup_one(wh_name, h_parent);
-	else
-		wh_dentry = au_sio_lkup_one(wh_name, h_parent);
-	err = PTR_ERR(wh_dentry);
-	if (IS_ERR(wh_dentry)) {
-		if (err == -ENAMETOOLONG)
-			err = 0;
-		goto out;
-	}
-
-	err = 0;
-	if (d_is_negative(wh_dentry))
-		goto out_wh; /* success */
-
-	err = 1;
-	if (d_is_reg(wh_dentry))
-		goto out_wh; /* success */
-
-	err = -EIO;
-	AuIOErr("%pd Invalid whiteout entry type 0%o.\n",
-		wh_dentry, d_inode(wh_dentry)->i_mode);
-
-out_wh:
-	dput(wh_dentry);
-out:
-	return err;
-}
-
-/*
- * test if the @h_dentry sets opaque or not.
- */
-int au_diropq_test(struct dentry *h_dentry)
-{
-	int err;
-	struct inode *h_dir;
-
-	h_dir = d_inode(h_dentry);
-	err = au_wh_test(h_dentry, &diropq_name,
-			 au_test_h_perm_sio(h_dir, MAY_EXEC));
-	return err;
-}
-
-/*
- * returns a negative dentry whose name is unique and temporary.
- */
-struct dentry *au_whtmp_lkup(struct dentry *h_parent, struct au_branch *br,
-			     struct qstr *prefix)
-{
-	struct dentry *dentry;
-	int i;
-	char defname[NAME_MAX - AUFS_MAX_NAMELEN + DNAME_INLINE_LEN + 1],
-		*name, *p;
-	/* strict atomic_t is unnecessary here */
-	static unsigned short cnt;
-	struct qstr qs;
-
-	BUILD_BUG_ON(sizeof(cnt) * 2 > AUFS_WH_TMP_LEN);
-
-	name = defname;
-	qs.len = sizeof(defname) - DNAME_INLINE_LEN + prefix->len - 1;
-	if (unlikely(prefix->len > DNAME_INLINE_LEN)) {
-		dentry = ERR_PTR(-ENAMETOOLONG);
-		if (unlikely(qs.len > NAME_MAX))
-			goto out;
-		dentry = ERR_PTR(-ENOMEM);
-		name = kmalloc(qs.len + 1, GFP_NOFS);
-		if (unlikely(!name))
-			goto out;
-	}
-
-	/* doubly whiteout-ed */
-	memcpy(name, AUFS_WH_PFX AUFS_WH_PFX, AUFS_WH_PFX_LEN * 2);
-	p = name + AUFS_WH_PFX_LEN * 2;
-	memcpy(p, prefix->name, prefix->len);
-	p += prefix->len;
-	*p++ = '.';
-	AuDebugOn(name + qs.len + 1 - p <= AUFS_WH_TMP_LEN);
-
-	qs.name = name;
-	for (i = 0; i < 3; i++) {
-		sprintf(p, "%.*x", AUFS_WH_TMP_LEN, cnt++);
-		dentry = au_sio_lkup_one(&qs, h_parent);
-		if (IS_ERR(dentry) || d_is_negative(dentry))
-			goto out_name;
-		dput(dentry);
-	}
-	/* pr_warn("could not get random name\n"); */
-	dentry = ERR_PTR(-EEXIST);
-	AuDbg("%.*s\n", AuLNPair(&qs));
-	BUG();
-
-out_name:
-	if (name != defname)
-		kfree(name);
-out:
-	AuTraceErrPtr(dentry);
-	return dentry;
-}
-
-/*
- * rename the @h_dentry on @br to the whiteouted temporary name.
- */
-int au_whtmp_ren(struct dentry *h_dentry, struct au_branch *br)
-{
-	int err;
-	struct path h_path = {
-		.mnt = au_br_mnt(br)
-	};
-	struct inode *h_dir, *delegated;
-	struct dentry *h_parent;
-
-	h_parent = h_dentry->d_parent; /* dir inode is locked */
-	h_dir = d_inode(h_parent);
-	IMustLock(h_dir);
-
-	h_path.dentry = au_whtmp_lkup(h_parent, br, &h_dentry->d_name);
-	err = PTR_ERR(h_path.dentry);
-	if (IS_ERR(h_path.dentry))
-		goto out;
-
-	/* under the same dir, no need to lock_rename() */
-	delegated = NULL;
-	err = vfsub_rename(h_dir, h_dentry, h_dir, &h_path, &delegated);
-	AuTraceErr(err);
-	if (unlikely(err == -EWOULDBLOCK)) {
-		pr_warn("cannot retry for NFSv4 delegation"
-			" for an internal rename\n");
-		iput(delegated);
-	}
-	dput(h_path.dentry);
-
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-/*
- * functions for removing a whiteout
- */
-
-static int do_unlink_wh(struct inode *h_dir, struct path *h_path)
-{
-	int err, force;
-	struct inode *delegated;
-
-	/*
-	 * forces superio when the dir has a sticky bit.
-	 * this may be a violation of unix fs semantics.
-	 */
-	force = (h_dir->i_mode & S_ISVTX)
-		&& !uid_eq(current_fsuid(), d_inode(h_path->dentry)->i_uid);
-	delegated = NULL;
-	err = vfsub_unlink(h_dir, h_path, &delegated, force);
-	if (unlikely(err == -EWOULDBLOCK)) {
-		pr_warn("cannot retry for NFSv4 delegation"
-			" for an internal unlink\n");
-		iput(delegated);
-	}
-	return err;
-}
-
-int au_wh_unlink_dentry(struct inode *h_dir, struct path *h_path,
-			struct dentry *dentry)
-{
-	int err;
-
-	err = do_unlink_wh(h_dir, h_path);
-	if (!err && dentry)
-		au_set_dbwh(dentry, -1);
-
-	return err;
-}
-
-static int unlink_wh_name(struct dentry *h_parent, struct qstr *wh,
-			  struct au_branch *br)
-{
-	int err;
-	struct path h_path = {
-		.mnt = au_br_mnt(br)
-	};
-
-	err = 0;
-	h_path.dentry = vfsub_lkup_one(wh, h_parent);
-	if (IS_ERR(h_path.dentry))
-		err = PTR_ERR(h_path.dentry);
-	else {
-		if (d_is_reg(h_path.dentry))
-			err = do_unlink_wh(d_inode(h_parent), &h_path);
-		dput(h_path.dentry);
-	}
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-/*
- * initialize/clean whiteout for a branch
- */
-
-static void au_wh_clean(struct inode *h_dir, struct path *whpath,
-			const int isdir)
-{
-	int err;
-	struct inode *delegated;
-
-	if (d_is_negative(whpath->dentry))
-		return;
-
-	if (isdir)
-		err = vfsub_rmdir(h_dir, whpath);
-	else {
-		delegated = NULL;
-		err = vfsub_unlink(h_dir, whpath, &delegated, /*force*/0);
-		if (unlikely(err == -EWOULDBLOCK)) {
-			pr_warn("cannot retry for NFSv4 delegation"
-				" for an internal unlink\n");
-			iput(delegated);
-		}
-	}
-	if (unlikely(err))
-		pr_warn("failed removing %pd (%d), ignored.\n",
-			whpath->dentry, err);
-}
-
-static int test_linkable(struct dentry *h_root)
-{
-	struct inode *h_dir = d_inode(h_root);
-
-	if (h_dir->i_op->link)
-		return 0;
-
-	pr_err("%pd (%s) doesn't support link(2), use noplink and rw+nolwh\n",
-	       h_root, au_sbtype(h_root->d_sb));
-	return -ENOSYS;
-}
-
-/* todo: should this mkdir be done in /sbin/mount.aufs helper? */
-static int au_whdir(struct inode *h_dir, struct path *path)
-{
-	int err;
-
-	err = -EEXIST;
-	if (d_is_negative(path->dentry)) {
-		int mode = S_IRWXU;
-
-		if (au_test_nfs(path->dentry->d_sb))
-			mode |= S_IXUGO;
-		err = vfsub_mkdir(h_dir, path, mode);
-	} else if (d_is_dir(path->dentry))
-		err = 0;
-	else
-		pr_err("unknown %pd exists\n", path->dentry);
-
-	return err;
-}
-
-struct au_wh_base {
-	const struct qstr *name;
-	struct dentry *dentry;
-};
-
-static void au_wh_init_ro(struct inode *h_dir, struct au_wh_base base[],
-			  struct path *h_path)
-{
-	h_path->dentry = base[AuBrWh_BASE].dentry;
-	au_wh_clean(h_dir, h_path, /*isdir*/0);
-	h_path->dentry = base[AuBrWh_PLINK].dentry;
-	au_wh_clean(h_dir, h_path, /*isdir*/1);
-	h_path->dentry = base[AuBrWh_ORPH].dentry;
-	au_wh_clean(h_dir, h_path, /*isdir*/1);
-}
-
-/*
- * returns tri-state,
- * minus: error, caller should print the message
- * zero: succuess
- * plus: error, caller should NOT print the message
- */
-static int au_wh_init_rw_nolink(struct dentry *h_root, struct au_wbr *wbr,
-				int do_plink, struct au_wh_base base[],
-				struct path *h_path)
-{
-	int err;
-	struct inode *h_dir;
-
-	h_dir = d_inode(h_root);
-	h_path->dentry = base[AuBrWh_BASE].dentry;
-	au_wh_clean(h_dir, h_path, /*isdir*/0);
-	h_path->dentry = base[AuBrWh_PLINK].dentry;
-	if (do_plink) {
-		err = test_linkable(h_root);
-		if (unlikely(err)) {
-			err = 1;
-			goto out;
-		}
-
-		err = au_whdir(h_dir, h_path);
-		if (unlikely(err))
-			goto out;
-		wbr->wbr_plink = dget(base[AuBrWh_PLINK].dentry);
-	} else
-		au_wh_clean(h_dir, h_path, /*isdir*/1);
-	h_path->dentry = base[AuBrWh_ORPH].dentry;
-	err = au_whdir(h_dir, h_path);
-	if (unlikely(err))
-		goto out;
-	wbr->wbr_orph = dget(base[AuBrWh_ORPH].dentry);
-
-out:
-	return err;
-}
-
-/*
- * for the moment, aufs supports the branch filesystem which does not support
- * link(2). testing on FAT which does not support i_op->setattr() fully either,
- * copyup failed. finally, such filesystem will not be used as the writable
- * branch.
- *
- * returns tri-state, see above.
- */
-static int au_wh_init_rw(struct dentry *h_root, struct au_wbr *wbr,
-			 int do_plink, struct au_wh_base base[],
-			 struct path *h_path)
-{
-	int err;
-	struct inode *h_dir;
-
-	WbrWhMustWriteLock(wbr);
-
-	err = test_linkable(h_root);
-	if (unlikely(err)) {
-		err = 1;
-		goto out;
-	}
-
-	/*
-	 * todo: should this create be done in /sbin/mount.aufs helper?
-	 */
-	err = -EEXIST;
-	h_dir = d_inode(h_root);
-	if (d_is_negative(base[AuBrWh_BASE].dentry)) {
-		h_path->dentry = base[AuBrWh_BASE].dentry;
-		err = vfsub_create(h_dir, h_path, WH_MASK, /*want_excl*/true);
-	} else if (d_is_reg(base[AuBrWh_BASE].dentry))
-		err = 0;
-	else
-		pr_err("unknown %pd2 exists\n", base[AuBrWh_BASE].dentry);
-	if (unlikely(err))
-		goto out;
-
-	h_path->dentry = base[AuBrWh_PLINK].dentry;
-	if (do_plink) {
-		err = au_whdir(h_dir, h_path);
-		if (unlikely(err))
-			goto out;
-		wbr->wbr_plink = dget(base[AuBrWh_PLINK].dentry);
-	} else
-		au_wh_clean(h_dir, h_path, /*isdir*/1);
-	wbr->wbr_whbase = dget(base[AuBrWh_BASE].dentry);
-
-	h_path->dentry = base[AuBrWh_ORPH].dentry;
-	err = au_whdir(h_dir, h_path);
-	if (unlikely(err))
-		goto out;
-	wbr->wbr_orph = dget(base[AuBrWh_ORPH].dentry);
-
-out:
-	return err;
-}
-
-/*
- * initialize the whiteout base file/dir for @br.
- */
-int au_wh_init(struct au_branch *br, struct super_block *sb)
-{
-	int err, i;
-	const unsigned char do_plink
-		= !!au_opt_test(au_mntflags(sb), PLINK);
-	struct inode *h_dir;
-	struct path path = br->br_path;
-	struct dentry *h_root = path.dentry;
-	struct au_wbr *wbr = br->br_wbr;
-	static const struct qstr base_name[] = {
-		[AuBrWh_BASE] = QSTR_INIT(AUFS_BASE_NAME,
-					  sizeof(AUFS_BASE_NAME) - 1),
-		[AuBrWh_PLINK] = QSTR_INIT(AUFS_PLINKDIR_NAME,
-					   sizeof(AUFS_PLINKDIR_NAME) - 1),
-		[AuBrWh_ORPH] = QSTR_INIT(AUFS_ORPHDIR_NAME,
-					  sizeof(AUFS_ORPHDIR_NAME) - 1)
-	};
-	struct au_wh_base base[] = {
-		[AuBrWh_BASE] = {
-			.name	= base_name + AuBrWh_BASE,
-			.dentry	= NULL
-		},
-		[AuBrWh_PLINK] = {
-			.name	= base_name + AuBrWh_PLINK,
-			.dentry	= NULL
-		},
-		[AuBrWh_ORPH] = {
-			.name	= base_name + AuBrWh_ORPH,
-			.dentry	= NULL
-		}
-	};
-
-	if (wbr)
-		WbrWhMustWriteLock(wbr);
-
-	for (i = 0; i < AuBrWh_Last; i++) {
-		/* doubly whiteouted */
-		struct dentry *d;
-
-		d = au_wh_lkup(h_root, (void *)base[i].name, br);
-		err = PTR_ERR(d);
-		if (IS_ERR(d))
-			goto out;
-
-		base[i].dentry = d;
-		AuDebugOn(wbr
-			  && wbr->wbr_wh[i]
-			  && wbr->wbr_wh[i] != base[i].dentry);
-	}
-
-	if (wbr)
-		for (i = 0; i < AuBrWh_Last; i++) {
-			dput(wbr->wbr_wh[i]);
-			wbr->wbr_wh[i] = NULL;
-		}
-
-	err = 0;
-	if (!au_br_writable(br->br_perm)) {
-		h_dir = d_inode(h_root);
-		au_wh_init_ro(h_dir, base, &path);
-	} else if (!au_br_wh_linkable(br->br_perm)) {
-		err = au_wh_init_rw_nolink(h_root, wbr, do_plink, base, &path);
-		if (err > 0)
-			goto out;
-		else if (err)
-			goto out_err;
-	} else {
-		err = au_wh_init_rw(h_root, wbr, do_plink, base, &path);
-		if (err > 0)
-			goto out;
-		else if (err)
-			goto out_err;
-	}
-	goto out; /* success */
-
-out_err:
-	pr_err("an error(%d) on the writable branch %pd(%s)\n",
-	       err, h_root, au_sbtype(h_root->d_sb));
-out:
-	for (i = 0; i < AuBrWh_Last; i++)
-		dput(base[i].dentry);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-/*
- * whiteouts are all hard-linked usually.
- * when its link count reaches a ceiling, we create a new whiteout base
- * asynchronously.
- */
-
-struct reinit_br_wh {
-	struct super_block *sb;
-	struct au_branch *br;
-};
-
-static void reinit_br_wh(void *arg)
-{
-	int err;
-	aufs_bindex_t bindex;
-	struct path h_path;
-	struct reinit_br_wh *a = arg;
-	struct au_wbr *wbr;
-	struct inode *dir, *delegated;
-	struct dentry *h_root;
-	struct au_hinode *hdir;
-
-	err = 0;
-	wbr = a->br->br_wbr;
-	/* big aufs lock */
-	si_noflush_write_lock(a->sb);
-	if (!au_br_writable(a->br->br_perm))
-		goto out;
-	bindex = au_br_index(a->sb, a->br->br_id);
-	if (unlikely(bindex < 0))
-		goto out;
-
-	di_read_lock_parent(a->sb->s_root, AuLock_IR);
-	dir = d_inode(a->sb->s_root);
-	hdir = au_hi(dir, bindex);
-	h_root = au_h_dptr(a->sb->s_root, bindex);
-	AuDebugOn(h_root != au_br_dentry(a->br));
-
-	au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT);
-	wbr_wh_write_lock(wbr);
-	err = au_h_verify(wbr->wbr_whbase, au_opt_udba(a->sb), hdir->hi_inode,
-			  h_root, a->br);
-	if (!err) {
-		h_path.dentry = wbr->wbr_whbase;
-		h_path.mnt = au_br_mnt(a->br);
-		delegated = NULL;
-		err = vfsub_unlink(hdir->hi_inode, &h_path, &delegated,
-				   /*force*/0);
-		if (unlikely(err == -EWOULDBLOCK)) {
-			pr_warn("cannot retry for NFSv4 delegation"
-				" for an internal unlink\n");
-			iput(delegated);
-		}
-	} else {
-		pr_warn("%pd is moved, ignored\n", wbr->wbr_whbase);
-		err = 0;
-	}
-	dput(wbr->wbr_whbase);
-	wbr->wbr_whbase = NULL;
-	if (!err)
-		err = au_wh_init(a->br, a->sb);
-	wbr_wh_write_unlock(wbr);
-	au_hn_imtx_unlock(hdir);
-	di_read_unlock(a->sb->s_root, AuLock_IR);
-	if (!err)
-		au_fhsm_wrote(a->sb, bindex, /*force*/0);
-
-out:
-	if (wbr)
-		atomic_dec(&wbr->wbr_wh_running);
-	atomic_dec(&a->br->br_count);
-	si_write_unlock(a->sb);
-	au_nwt_done(&au_sbi(a->sb)->si_nowait);
-	kfree(arg);
-	if (unlikely(err))
-		AuIOErr("err %d\n", err);
-}
-
-static void kick_reinit_br_wh(struct super_block *sb, struct au_branch *br)
-{
-	int do_dec, wkq_err;
-	struct reinit_br_wh *arg;
-
-	do_dec = 1;
-	if (atomic_inc_return(&br->br_wbr->wbr_wh_running) != 1)
-		goto out;
-
-	/* ignore ENOMEM */
-	arg = kmalloc(sizeof(*arg), GFP_NOFS);
-	if (arg) {
-		/*
-		 * dec(wh_running), kfree(arg) and dec(br_count)
-		 * in reinit function
-		 */
-		arg->sb = sb;
-		arg->br = br;
-		atomic_inc(&br->br_count);
-		wkq_err = au_wkq_nowait(reinit_br_wh, arg, sb, /*flags*/0);
-		if (unlikely(wkq_err)) {
-			atomic_dec(&br->br_wbr->wbr_wh_running);
-			atomic_dec(&br->br_count);
-			kfree(arg);
-		}
-		do_dec = 0;
-	}
-
-out:
-	if (do_dec)
-		atomic_dec(&br->br_wbr->wbr_wh_running);
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * create the whiteout @wh.
- */
-static int link_or_create_wh(struct super_block *sb, aufs_bindex_t bindex,
-			     struct dentry *wh)
-{
-	int err;
-	struct path h_path = {
-		.dentry = wh
-	};
-	struct au_branch *br;
-	struct au_wbr *wbr;
-	struct dentry *h_parent;
-	struct inode *h_dir, *delegated;
-
-	h_parent = wh->d_parent; /* dir inode is locked */
-	h_dir = d_inode(h_parent);
-	IMustLock(h_dir);
-
-	br = au_sbr(sb, bindex);
-	h_path.mnt = au_br_mnt(br);
-	wbr = br->br_wbr;
-	wbr_wh_read_lock(wbr);
-	if (wbr->wbr_whbase) {
-		delegated = NULL;
-		err = vfsub_link(wbr->wbr_whbase, h_dir, &h_path, &delegated);
-		if (unlikely(err == -EWOULDBLOCK)) {
-			pr_warn("cannot retry for NFSv4 delegation"
-				" for an internal link\n");
-			iput(delegated);
-		}
-		if (!err || err != -EMLINK)
-			goto out;
-
-		/* link count full. re-initialize br_whbase. */
-		kick_reinit_br_wh(sb, br);
-	}
-
-	/* return this error in this context */
-	err = vfsub_create(h_dir, &h_path, WH_MASK, /*want_excl*/true);
-	if (!err)
-		au_fhsm_wrote(sb, bindex, /*force*/0);
-
-out:
-	wbr_wh_read_unlock(wbr);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * create or remove the diropq.
- */
-static struct dentry *do_diropq(struct dentry *dentry, aufs_bindex_t bindex,
-				unsigned int flags)
-{
-	struct dentry *opq_dentry, *h_dentry;
-	struct super_block *sb;
-	struct au_branch *br;
-	int err;
-
-	sb = dentry->d_sb;
-	br = au_sbr(sb, bindex);
-	h_dentry = au_h_dptr(dentry, bindex);
-	opq_dentry = vfsub_lkup_one(&diropq_name, h_dentry);
-	if (IS_ERR(opq_dentry))
-		goto out;
-
-	if (au_ftest_diropq(flags, CREATE)) {
-		err = link_or_create_wh(sb, bindex, opq_dentry);
-		if (!err) {
-			au_set_dbdiropq(dentry, bindex);
-			goto out; /* success */
-		}
-	} else {
-		struct path tmp = {
-			.dentry = opq_dentry,
-			.mnt	= au_br_mnt(br)
-		};
-		err = do_unlink_wh(au_h_iptr(d_inode(dentry), bindex), &tmp);
-		if (!err)
-			au_set_dbdiropq(dentry, -1);
-	}
-	dput(opq_dentry);
-	opq_dentry = ERR_PTR(err);
-
-out:
-	return opq_dentry;
-}
-
-struct do_diropq_args {
-	struct dentry **errp;
-	struct dentry *dentry;
-	aufs_bindex_t bindex;
-	unsigned int flags;
-};
-
-static void call_do_diropq(void *args)
-{
-	struct do_diropq_args *a = args;
-	*a->errp = do_diropq(a->dentry, a->bindex, a->flags);
-}
-
-struct dentry *au_diropq_sio(struct dentry *dentry, aufs_bindex_t bindex,
-			     unsigned int flags)
-{
-	struct dentry *diropq, *h_dentry;
-
-	h_dentry = au_h_dptr(dentry, bindex);
-	if (!au_test_h_perm_sio(d_inode(h_dentry), MAY_EXEC | MAY_WRITE))
-		diropq = do_diropq(dentry, bindex, flags);
-	else {
-		int wkq_err;
-		struct do_diropq_args args = {
-			.errp		= &diropq,
-			.dentry		= dentry,
-			.bindex		= bindex,
-			.flags		= flags
-		};
-
-		wkq_err = au_wkq_wait(call_do_diropq, &args);
-		if (unlikely(wkq_err))
-			diropq = ERR_PTR(wkq_err);
-	}
-
-	return diropq;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * lookup whiteout dentry.
- * @h_parent: lower parent dentry which must exist and be locked
- * @base_name: name of dentry which will be whiteouted
- * returns dentry for whiteout.
- */
-struct dentry *au_wh_lkup(struct dentry *h_parent, struct qstr *base_name,
-			  struct au_branch *br)
-{
-	int err;
-	struct qstr wh_name;
-	struct dentry *wh_dentry;
-
-	err = au_wh_name_alloc(&wh_name, base_name);
-	wh_dentry = ERR_PTR(err);
-	if (!err) {
-		wh_dentry = vfsub_lkup_one(&wh_name, h_parent);
-		kfree(wh_name.name);
-	}
-	return wh_dentry;
-}
-
-/*
- * link/create a whiteout for @dentry on @bindex.
- */
-struct dentry *au_wh_create(struct dentry *dentry, aufs_bindex_t bindex,
-			    struct dentry *h_parent)
-{
-	struct dentry *wh_dentry;
-	struct super_block *sb;
-	int err;
-
-	sb = dentry->d_sb;
-	wh_dentry = au_wh_lkup(h_parent, &dentry->d_name, au_sbr(sb, bindex));
-	if (!IS_ERR(wh_dentry) && d_is_negative(wh_dentry)) {
-		err = link_or_create_wh(sb, bindex, wh_dentry);
-		if (!err) {
-			au_set_dbwh(dentry, bindex);
-			au_fhsm_wrote(sb, bindex, /*force*/0);
-		} else {
-			dput(wh_dentry);
-			wh_dentry = ERR_PTR(err);
-		}
-	}
-
-	return wh_dentry;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* Delete all whiteouts in this directory on branch bindex. */
-static int del_wh_children(struct dentry *h_dentry, struct au_nhash *whlist,
-			   aufs_bindex_t bindex, struct au_branch *br)
-{
-	int err;
-	unsigned long ul, n;
-	struct qstr wh_name;
-	char *p;
-	struct hlist_head *head;
-	struct au_vdir_wh *pos;
-	struct au_vdir_destr *str;
-
-	err = -ENOMEM;
-	p = (void *)__get_free_page(GFP_NOFS);
-	wh_name.name = p;
-	if (unlikely(!wh_name.name))
-		goto out;
-
-	err = 0;
-	memcpy(p, AUFS_WH_PFX, AUFS_WH_PFX_LEN);
-	p += AUFS_WH_PFX_LEN;
-	n = whlist->nh_num;
-	head = whlist->nh_head;
-	for (ul = 0; !err && ul < n; ul++, head++) {
-		hlist_for_each_entry(pos, head, wh_hash) {
-			if (pos->wh_bindex != bindex)
-				continue;
-
-			str = &pos->wh_str;
-			if (str->len + AUFS_WH_PFX_LEN <= PATH_MAX) {
-				memcpy(p, str->name, str->len);
-				wh_name.len = AUFS_WH_PFX_LEN + str->len;
-				err = unlink_wh_name(h_dentry, &wh_name, br);
-				if (!err)
-					continue;
-				break;
-			}
-			AuIOErr("whiteout name too long %.*s\n",
-				str->len, str->name);
-			err = -EIO;
-			break;
-		}
-	}
-	free_page((unsigned long)wh_name.name);
-
-out:
-	return err;
-}
-
-struct del_wh_children_args {
-	int *errp;
-	struct dentry *h_dentry;
-	struct au_nhash *whlist;
-	aufs_bindex_t bindex;
-	struct au_branch *br;
-};
-
-static void call_del_wh_children(void *args)
-{
-	struct del_wh_children_args *a = args;
-	*a->errp = del_wh_children(a->h_dentry, a->whlist, a->bindex, a->br);
-}
-
-/* ---------------------------------------------------------------------- */
-
-struct au_whtmp_rmdir *au_whtmp_rmdir_alloc(struct super_block *sb, gfp_t gfp)
-{
-	struct au_whtmp_rmdir *whtmp;
-	int err;
-	unsigned int rdhash;
-
-	SiMustAnyLock(sb);
-
-	whtmp = kzalloc(sizeof(*whtmp), gfp);
-	if (unlikely(!whtmp)) {
-		whtmp = ERR_PTR(-ENOMEM);
-		goto out;
-	}
-
-	/* no estimation for dir size */
-	rdhash = au_sbi(sb)->si_rdhash;
-	if (!rdhash)
-		rdhash = AUFS_RDHASH_DEF;
-	err = au_nhash_alloc(&whtmp->whlist, rdhash, gfp);
-	if (unlikely(err)) {
-		kfree(whtmp);
-		whtmp = ERR_PTR(err);
-	}
-
-out:
-	return whtmp;
-}
-
-void au_whtmp_rmdir_free(struct au_whtmp_rmdir *whtmp)
-{
-	if (whtmp->br)
-		atomic_dec(&whtmp->br->br_count);
-	dput(whtmp->wh_dentry);
-	iput(whtmp->dir);
-	au_nhash_wh_free(&whtmp->whlist);
-	kfree(whtmp);
-}
-
-/*
- * rmdir the whiteouted temporary named dir @h_dentry.
- * @whlist: whiteouted children.
- */
-int au_whtmp_rmdir(struct inode *dir, aufs_bindex_t bindex,
-		   struct dentry *wh_dentry, struct au_nhash *whlist)
-{
-	int err;
-	unsigned int h_nlink;
-	struct path h_tmp;
-	struct inode *wh_inode, *h_dir;
-	struct au_branch *br;
-
-	h_dir = d_inode(wh_dentry->d_parent); /* dir inode is locked */
-	IMustLock(h_dir);
-
-	br = au_sbr(dir->i_sb, bindex);
-	wh_inode = d_inode(wh_dentry);
-	mutex_lock_nested(&wh_inode->i_mutex, AuLsc_I_CHILD);
-
-	/*
-	 * someone else might change some whiteouts while we were sleeping.
-	 * it means this whlist may have an obsoleted entry.
-	 */
-	if (!au_test_h_perm_sio(wh_inode, MAY_EXEC | MAY_WRITE))
-		err = del_wh_children(wh_dentry, whlist, bindex, br);
-	else {
-		int wkq_err;
-		struct del_wh_children_args args = {
-			.errp		= &err,
-			.h_dentry	= wh_dentry,
-			.whlist		= whlist,
-			.bindex		= bindex,
-			.br		= br
-		};
-
-		wkq_err = au_wkq_wait(call_del_wh_children, &args);
-		if (unlikely(wkq_err))
-			err = wkq_err;
-	}
-	mutex_unlock(&wh_inode->i_mutex);
-
-	if (!err) {
-		h_tmp.dentry = wh_dentry;
-		h_tmp.mnt = au_br_mnt(br);
-		h_nlink = h_dir->i_nlink;
-		err = vfsub_rmdir(h_dir, &h_tmp);
-		/* some fs doesn't change the parent nlink in some cases */
-		h_nlink -= h_dir->i_nlink;
-	}
-
-	if (!err) {
-		if (au_ibstart(dir) == bindex) {
-			/* todo: dir->i_mutex is necessary */
-			au_cpup_attr_timesizes(dir);
-			if (h_nlink)
-				vfsub_drop_nlink(dir);
-		}
-		return 0; /* success */
-	}
-
-	pr_warn("failed removing %pd(%d), ignored\n", wh_dentry, err);
-	return err;
-}
-
-static void call_rmdir_whtmp(void *args)
-{
-	int err;
-	aufs_bindex_t bindex;
-	struct au_whtmp_rmdir *a = args;
-	struct super_block *sb;
-	struct dentry *h_parent;
-	struct inode *h_dir;
-	struct au_hinode *hdir;
-
-	/* rmdir by nfsd may cause deadlock with this i_mutex */
-	/* mutex_lock(&a->dir->i_mutex); */
-	err = -EROFS;
-	sb = a->dir->i_sb;
-	si_read_lock(sb, !AuLock_FLUSH);
-	if (!au_br_writable(a->br->br_perm))
-		goto out;
-	bindex = au_br_index(sb, a->br->br_id);
-	if (unlikely(bindex < 0))
-		goto out;
-
-	err = -EIO;
-	ii_write_lock_parent(a->dir);
-	h_parent = dget_parent(a->wh_dentry);
-	h_dir = d_inode(h_parent);
-	hdir = au_hi(a->dir, bindex);
-	err = vfsub_mnt_want_write(au_br_mnt(a->br));
-	if (unlikely(err))
-		goto out_mnt;
-	au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT);
-	err = au_h_verify(a->wh_dentry, au_opt_udba(sb), h_dir, h_parent,
-			  a->br);
-	if (!err)
-		err = au_whtmp_rmdir(a->dir, bindex, a->wh_dentry, &a->whlist);
-	au_hn_imtx_unlock(hdir);
-	vfsub_mnt_drop_write(au_br_mnt(a->br));
-
-out_mnt:
-	dput(h_parent);
-	ii_write_unlock(a->dir);
-out:
-	/* mutex_unlock(&a->dir->i_mutex); */
-	au_whtmp_rmdir_free(a);
-	si_read_unlock(sb);
-	au_nwt_done(&au_sbi(sb)->si_nowait);
-	if (unlikely(err))
-		AuIOErr("err %d\n", err);
-}
-
-void au_whtmp_kick_rmdir(struct inode *dir, aufs_bindex_t bindex,
-			 struct dentry *wh_dentry, struct au_whtmp_rmdir *args)
-{
-	int wkq_err;
-	struct super_block *sb;
-
-	IMustLock(dir);
-
-	/* all post-process will be done in do_rmdir_whtmp(). */
-	sb = dir->i_sb;
-	args->dir = au_igrab(dir);
-	args->br = au_sbr(sb, bindex);
-	atomic_inc(&args->br->br_count);
-	args->wh_dentry = dget(wh_dentry);
-	wkq_err = au_wkq_nowait(call_rmdir_whtmp, args, sb, /*flags*/0);
-	if (unlikely(wkq_err)) {
-		pr_warn("rmdir error %pd (%d), ignored\n", wh_dentry, wkq_err);
-		au_whtmp_rmdir_free(args);
-	}
-}
diff --git a/fs/aufs/whout.h b/fs/aufs/whout.h
deleted file mode 100644
index 4077dd19e..000000000
--- a/fs/aufs/whout.h
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * whiteout for logical deletion and opaque directory
- */
-
-#ifndef __AUFS_WHOUT_H__
-#define __AUFS_WHOUT_H__
-
-#ifdef __KERNEL__
-
-#include "dir.h"
-
-/* whout.c */
-int au_wh_name_alloc(struct qstr *wh, const struct qstr *name);
-int au_wh_test(struct dentry *h_parent, struct qstr *wh_name, int try_sio);
-int au_diropq_test(struct dentry *h_dentry);
-struct au_branch;
-struct dentry *au_whtmp_lkup(struct dentry *h_parent, struct au_branch *br,
-			     struct qstr *prefix);
-int au_whtmp_ren(struct dentry *h_dentry, struct au_branch *br);
-int au_wh_unlink_dentry(struct inode *h_dir, struct path *h_path,
-			struct dentry *dentry);
-int au_wh_init(struct au_branch *br, struct super_block *sb);
-
-/* diropq flags */
-#define AuDiropq_CREATE	1
-#define au_ftest_diropq(flags, name)	((flags) & AuDiropq_##name)
-#define au_fset_diropq(flags, name) \
-	do { (flags) |= AuDiropq_##name; } while (0)
-#define au_fclr_diropq(flags, name) \
-	do { (flags) &= ~AuDiropq_##name; } while (0)
-
-struct dentry *au_diropq_sio(struct dentry *dentry, aufs_bindex_t bindex,
-			     unsigned int flags);
-struct dentry *au_wh_lkup(struct dentry *h_parent, struct qstr *base_name,
-			  struct au_branch *br);
-struct dentry *au_wh_create(struct dentry *dentry, aufs_bindex_t bindex,
-			    struct dentry *h_parent);
-
-/* real rmdir for the whiteout-ed dir */
-struct au_whtmp_rmdir {
-	struct inode *dir;
-	struct au_branch *br;
-	struct dentry *wh_dentry;
-	struct au_nhash whlist;
-};
-
-struct au_whtmp_rmdir *au_whtmp_rmdir_alloc(struct super_block *sb, gfp_t gfp);
-void au_whtmp_rmdir_free(struct au_whtmp_rmdir *whtmp);
-int au_whtmp_rmdir(struct inode *dir, aufs_bindex_t bindex,
-		   struct dentry *wh_dentry, struct au_nhash *whlist);
-void au_whtmp_kick_rmdir(struct inode *dir, aufs_bindex_t bindex,
-			 struct dentry *wh_dentry, struct au_whtmp_rmdir *args);
-
-/* ---------------------------------------------------------------------- */
-
-static inline struct dentry *au_diropq_create(struct dentry *dentry,
-					      aufs_bindex_t bindex)
-{
-	return au_diropq_sio(dentry, bindex, AuDiropq_CREATE);
-}
-
-static inline int au_diropq_remove(struct dentry *dentry, aufs_bindex_t bindex)
-{
-	return PTR_ERR(au_diropq_sio(dentry, bindex, !AuDiropq_CREATE));
-}
-
-#endif /* __KERNEL__ */
-#endif /* __AUFS_WHOUT_H__ */
diff --git a/fs/aufs/wkq.c b/fs/aufs/wkq.c
deleted file mode 100644
index 0f1500e93..000000000
--- a/fs/aufs/wkq.c
+++ /dev/null
@@ -1,200 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * workqueue for asynchronous/super-io operations
- * todo: try new dredential scheme
- */
-
-#include <linux/module.h>
-#include "aufs.h"
-
-/* internal workqueue named AUFS_WKQ_NAME */
-
-static struct workqueue_struct *au_wkq;
-
-struct au_wkinfo {
-	struct work_struct wk;
-	struct kobject *kobj;
-
-	unsigned int flags; /* see wkq.h */
-
-	au_wkq_func_t func;
-	void *args;
-
-	struct completion *comp;
-};
-
-/* ---------------------------------------------------------------------- */
-
-static void wkq_func(struct work_struct *wk)
-{
-	struct au_wkinfo *wkinfo = container_of(wk, struct au_wkinfo, wk);
-
-	AuDebugOn(!uid_eq(current_fsuid(), GLOBAL_ROOT_UID));
-	AuDebugOn(rlimit(RLIMIT_FSIZE) != RLIM_INFINITY);
-
-	wkinfo->func(wkinfo->args);
-	if (au_ftest_wkq(wkinfo->flags, WAIT))
-		complete(wkinfo->comp);
-	else {
-		kobject_put(wkinfo->kobj);
-		module_put(THIS_MODULE); /* todo: ?? */
-		kfree(wkinfo);
-	}
-}
-
-/*
- * Since struct completion is large, try allocating it dynamically.
- */
-#if 1 /* defined(CONFIG_4KSTACKS) || defined(AuTest4KSTACKS) */
-#define AuWkqCompDeclare(name)	struct completion *comp = NULL
-
-static int au_wkq_comp_alloc(struct au_wkinfo *wkinfo, struct completion **comp)
-{
-	*comp = kmalloc(sizeof(**comp), GFP_NOFS);
-	if (*comp) {
-		init_completion(*comp);
-		wkinfo->comp = *comp;
-		return 0;
-	}
-	return -ENOMEM;
-}
-
-static void au_wkq_comp_free(struct completion *comp)
-{
-	kfree(comp);
-}
-
-#else
-
-/* no braces */
-#define AuWkqCompDeclare(name) \
-	DECLARE_COMPLETION_ONSTACK(_ ## name); \
-	struct completion *comp = &_ ## name
-
-static int au_wkq_comp_alloc(struct au_wkinfo *wkinfo, struct completion **comp)
-{
-	wkinfo->comp = *comp;
-	return 0;
-}
-
-static void au_wkq_comp_free(struct completion *comp __maybe_unused)
-{
-	/* empty */
-}
-#endif /* 4KSTACKS */
-
-static void au_wkq_run(struct au_wkinfo *wkinfo)
-{
-	if (au_ftest_wkq(wkinfo->flags, NEST)) {
-		if (au_wkq_test()) {
-			AuWarn1("wkq from wkq, unless silly-rename on NFS,"
-				" due to a dead dir by UDBA?\n");
-			AuDebugOn(au_ftest_wkq(wkinfo->flags, WAIT));
-		}
-	} else
-		au_dbg_verify_kthread();
-
-	if (au_ftest_wkq(wkinfo->flags, WAIT)) {
-		INIT_WORK_ONSTACK(&wkinfo->wk, wkq_func);
-		queue_work(au_wkq, &wkinfo->wk);
-	} else {
-		INIT_WORK(&wkinfo->wk, wkq_func);
-		schedule_work(&wkinfo->wk);
-	}
-}
-
-/*
- * Be careful. It is easy to make deadlock happen.
- * processA: lock, wkq and wait
- * processB: wkq and wait, lock in wkq
- * --> deadlock
- */
-int au_wkq_do_wait(unsigned int flags, au_wkq_func_t func, void *args)
-{
-	int err;
-	AuWkqCompDeclare(comp);
-	struct au_wkinfo wkinfo = {
-		.flags	= flags,
-		.func	= func,
-		.args	= args
-	};
-
-	err = au_wkq_comp_alloc(&wkinfo, &comp);
-	if (!err) {
-		au_wkq_run(&wkinfo);
-		/* no timeout, no interrupt */
-		wait_for_completion(wkinfo.comp);
-		au_wkq_comp_free(comp);
-		destroy_work_on_stack(&wkinfo.wk);
-	}
-
-	return err;
-
-}
-
-/*
- * Note: dget/dput() in func for aufs dentries are not supported. It will be a
- * problem in a concurrent umounting.
- */
-int au_wkq_nowait(au_wkq_func_t func, void *args, struct super_block *sb,
-		  unsigned int flags)
-{
-	int err;
-	struct au_wkinfo *wkinfo;
-
-	atomic_inc(&au_sbi(sb)->si_nowait.nw_len);
-
-	/*
-	 * wkq_func() must free this wkinfo.
-	 * it highly depends upon the implementation of workqueue.
-	 */
-	err = 0;
-	wkinfo = kmalloc(sizeof(*wkinfo), GFP_NOFS);
-	if (wkinfo) {
-		wkinfo->kobj = &au_sbi(sb)->si_kobj;
-		wkinfo->flags = flags & ~AuWkq_WAIT;
-		wkinfo->func = func;
-		wkinfo->args = args;
-		wkinfo->comp = NULL;
-		kobject_get(wkinfo->kobj);
-		__module_get(THIS_MODULE); /* todo: ?? */
-
-		au_wkq_run(wkinfo);
-	} else {
-		err = -ENOMEM;
-		au_nwt_done(&au_sbi(sb)->si_nowait);
-	}
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-void au_nwt_init(struct au_nowait_tasks *nwt)
-{
-	atomic_set(&nwt->nw_len, 0);
-	/* smp_mb(); */ /* atomic_set */
-	init_waitqueue_head(&nwt->nw_wq);
-}
-
-void au_wkq_fin(void)
-{
-	destroy_workqueue(au_wkq);
-}
-
-int __init au_wkq_init(void)
-{
-	int err;
-
-	err = 0;
-	au_wkq = alloc_workqueue(AUFS_WKQ_NAME, 0, WQ_DFL_ACTIVE);
-	if (IS_ERR(au_wkq))
-		err = PTR_ERR(au_wkq);
-	else if (!au_wkq)
-		err = -ENOMEM;
-
-	return err;
-}
diff --git a/fs/aufs/wkq.h b/fs/aufs/wkq.h
deleted file mode 100644
index f6c9b9902..000000000
--- a/fs/aufs/wkq.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * workqueue for asynchronous/super-io operations
- * todo: try new credentials management scheme
- */
-
-#ifndef __AUFS_WKQ_H__
-#define __AUFS_WKQ_H__
-
-#ifdef __KERNEL__
-
-struct super_block;
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * in the next operation, wait for the 'nowait' tasks in system-wide workqueue
- */
-struct au_nowait_tasks {
-	atomic_t		nw_len;
-	wait_queue_head_t	nw_wq;
-};
-
-/* ---------------------------------------------------------------------- */
-
-typedef void (*au_wkq_func_t)(void *args);
-
-/* wkq flags */
-#define AuWkq_WAIT	1
-#define AuWkq_NEST	(1 << 1)
-#define au_ftest_wkq(flags, name)	((flags) & AuWkq_##name)
-#define au_fset_wkq(flags, name) \
-	do { (flags) |= AuWkq_##name; } while (0)
-#define au_fclr_wkq(flags, name) \
-	do { (flags) &= ~AuWkq_##name; } while (0)
-
-#ifndef CONFIG_AUFS_HNOTIFY
-#undef AuWkq_NEST
-#define AuWkq_NEST	0
-#endif
-
-/* wkq.c */
-int au_wkq_do_wait(unsigned int flags, au_wkq_func_t func, void *args);
-int au_wkq_nowait(au_wkq_func_t func, void *args, struct super_block *sb,
-		  unsigned int flags);
-void au_nwt_init(struct au_nowait_tasks *nwt);
-int __init au_wkq_init(void);
-void au_wkq_fin(void);
-
-/* ---------------------------------------------------------------------- */
-
-static inline int au_wkq_test(void)
-{
-	return current->flags & PF_WQ_WORKER;
-}
-
-static inline int au_wkq_wait(au_wkq_func_t func, void *args)
-{
-	return au_wkq_do_wait(AuWkq_WAIT, func, args);
-}
-
-static inline void au_nwt_done(struct au_nowait_tasks *nwt)
-{
-	if (atomic_dec_and_test(&nwt->nw_len))
-		wake_up_all(&nwt->nw_wq);
-}
-
-static inline int au_nwt_flush(struct au_nowait_tasks *nwt)
-{
-	wait_event(nwt->nw_wq, !atomic_read(&nwt->nw_len));
-	return 0;
-}
-
-#endif /* __KERNEL__ */
-#endif /* __AUFS_WKQ_H__ */
diff --git a/fs/aufs/xattr.c b/fs/aufs/xattr.c
deleted file mode 100644
index f592e05ea..000000000
--- a/fs/aufs/xattr.c
+++ /dev/null
@@ -1,331 +0,0 @@
-/*
- * Copyright (C) 2014-2016 Junjiro R. Okajima
- */
-
-/*
- * handling xattr functions
- */
-
-#include <linux/xattr.h>
-#include "aufs.h"
-
-static int au_xattr_ignore(int err, char *name, unsigned int ignore_flags)
-{
-	if (!ignore_flags)
-		goto out;
-	switch (err) {
-	case -ENOMEM:
-	case -EDQUOT:
-		goto out;
-	}
-
-	if ((ignore_flags & AuBrAttr_ICEX) == AuBrAttr_ICEX) {
-		err = 0;
-		goto out;
-	}
-
-#define cmp(brattr, prefix) do {					\
-		if (!strncmp(name, XATTR_##prefix##_PREFIX,		\
-			     XATTR_##prefix##_PREFIX_LEN)) {		\
-			if (ignore_flags & AuBrAttr_ICEX_##brattr)	\
-				err = 0;				\
-			goto out;					\
-		}							\
-	} while (0)
-
-	cmp(SEC, SECURITY);
-	cmp(SYS, SYSTEM);
-	cmp(TR, TRUSTED);
-	cmp(USR, USER);
-#undef cmp
-
-	if (ignore_flags & AuBrAttr_ICEX_OTH)
-		err = 0;
-
-out:
-	return err;
-}
-
-static const int au_xattr_out_of_list = AuBrAttr_ICEX_OTH << 1;
-
-static int au_do_cpup_xattr(struct dentry *h_dst, struct dentry *h_src,
-			    char *name, char **buf, unsigned int ignore_flags,
-			    unsigned int verbose)
-{
-	int err;
-	ssize_t ssz;
-	struct inode *h_idst;
-
-	ssz = vfs_getxattr_alloc(h_src, name, buf, 0, GFP_NOFS);
-	err = ssz;
-	if (unlikely(err <= 0)) {
-		if (err == -ENODATA
-		    || (err == -EOPNOTSUPP
-			&& ((ignore_flags & au_xattr_out_of_list)
-			    || (au_test_nfs_noacl(d_inode(h_src))
-				&& (!strcmp(name, XATTR_NAME_POSIX_ACL_ACCESS)
-				    || !strcmp(name,
-					       XATTR_NAME_POSIX_ACL_DEFAULT))))
-			    ))
-			err = 0;
-		if (err && (verbose || au_debug_test()))
-			pr_err("%s, err %d\n", name, err);
-		goto out;
-	}
-
-	/* unlock it temporary */
-	h_idst = d_inode(h_dst);
-	mutex_unlock(&h_idst->i_mutex);
-	err = vfsub_setxattr(h_dst, name, *buf, ssz, /*flags*/0);
-	mutex_lock_nested(&h_idst->i_mutex, AuLsc_I_CHILD2);
-	if (unlikely(err)) {
-		if (verbose || au_debug_test())
-			pr_err("%s, err %d\n", name, err);
-		err = au_xattr_ignore(err, name, ignore_flags);
-	}
-
-out:
-	return err;
-}
-
-int au_cpup_xattr(struct dentry *h_dst, struct dentry *h_src, int ignore_flags,
-		  unsigned int verbose)
-{
-	int err, unlocked, acl_access, acl_default;
-	ssize_t ssz;
-	struct inode *h_isrc, *h_idst;
-	char *value, *p, *o, *e;
-
-	/* try stopping to update the source inode while we are referencing */
-	/* there should not be the parent-child relationship between them */
-	h_isrc = d_inode(h_src);
-	h_idst = d_inode(h_dst);
-	mutex_unlock(&h_idst->i_mutex);
-	mutex_lock_nested(&h_isrc->i_mutex, AuLsc_I_CHILD);
-	mutex_lock_nested(&h_idst->i_mutex, AuLsc_I_CHILD2);
-	unlocked = 0;
-
-	/* some filesystems don't list POSIX ACL, for example tmpfs */
-	ssz = vfs_listxattr(h_src, NULL, 0);
-	err = ssz;
-	if (unlikely(err < 0)) {
-		AuTraceErr(err);
-		if (err == -ENODATA
-		    || err == -EOPNOTSUPP)
-			err = 0;	/* ignore */
-		goto out;
-	}
-
-	err = 0;
-	p = NULL;
-	o = NULL;
-	if (ssz) {
-		err = -ENOMEM;
-		p = kmalloc(ssz, GFP_NOFS);
-		o = p;
-		if (unlikely(!p))
-			goto out;
-		err = vfs_listxattr(h_src, p, ssz);
-	}
-	mutex_unlock(&h_isrc->i_mutex);
-	unlocked = 1;
-	AuDbg("err %d, ssz %zd\n", err, ssz);
-	if (unlikely(err < 0))
-		goto out_free;
-
-	err = 0;
-	e = p + ssz;
-	value = NULL;
-	acl_access = 0;
-	acl_default = 0;
-	while (!err && p < e) {
-		acl_access |= !strncmp(p, XATTR_NAME_POSIX_ACL_ACCESS,
-				       sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1);
-		acl_default |= !strncmp(p, XATTR_NAME_POSIX_ACL_DEFAULT,
-					sizeof(XATTR_NAME_POSIX_ACL_DEFAULT)
-					- 1);
-		err = au_do_cpup_xattr(h_dst, h_src, p, &value, ignore_flags,
-				       verbose);
-		p += strlen(p) + 1;
-	}
-	AuTraceErr(err);
-	ignore_flags |= au_xattr_out_of_list;
-	if (!err && !acl_access) {
-		err = au_do_cpup_xattr(h_dst, h_src,
-				       XATTR_NAME_POSIX_ACL_ACCESS, &value,
-				       ignore_flags, verbose);
-		AuTraceErr(err);
-	}
-	if (!err && !acl_default) {
-		err = au_do_cpup_xattr(h_dst, h_src,
-				       XATTR_NAME_POSIX_ACL_DEFAULT, &value,
-				       ignore_flags, verbose);
-		AuTraceErr(err);
-	}
-
-	kfree(value);
-
-out_free:
-	kfree(o);
-out:
-	if (!unlocked)
-		mutex_unlock(&h_isrc->i_mutex);
-	AuTraceErr(err);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-enum {
-	AU_XATTR_LIST,
-	AU_XATTR_GET
-};
-
-struct au_lgxattr {
-	int type;
-	union {
-		struct {
-			char	*list;
-			size_t	size;
-		} list;
-		struct {
-			const char	*name;
-			void		*value;
-			size_t		size;
-		} get;
-	} u;
-};
-
-static ssize_t au_lgxattr(struct dentry *dentry, struct au_lgxattr *arg)
-{
-	ssize_t err;
-	struct path h_path;
-	struct super_block *sb;
-
-	sb = dentry->d_sb;
-	err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM);
-	if (unlikely(err))
-		goto out;
-	err = au_h_path_getattr(dentry, /*force*/1, &h_path);
-	if (unlikely(err))
-		goto out_si;
-	if (unlikely(!h_path.dentry))
-		/* illegally overlapped or something */
-		goto out_di; /* pretending success */
-
-	/* always topmost entry only */
-	switch (arg->type) {
-	case AU_XATTR_LIST:
-		err = vfs_listxattr(h_path.dentry,
-				    arg->u.list.list, arg->u.list.size);
-		break;
-	case AU_XATTR_GET:
-		err = vfs_getxattr(h_path.dentry,
-				   arg->u.get.name, arg->u.get.value,
-				   arg->u.get.size);
-		break;
-	}
-
-out_di:
-	di_read_unlock(dentry, AuLock_IR);
-out_si:
-	si_read_unlock(sb);
-out:
-	AuTraceErr(err);
-	return err;
-}
-
-ssize_t aufs_listxattr(struct dentry *dentry, char *list, size_t size)
-{
-	struct au_lgxattr arg = {
-		.type = AU_XATTR_LIST,
-		.u.list = {
-			.list	= list,
-			.size	= size
-		},
-	};
-
-	return au_lgxattr(dentry, &arg);
-}
-
-ssize_t aufs_getxattr(struct dentry *dentry, const char *name, void *value,
-		      size_t size)
-{
-	struct au_lgxattr arg = {
-		.type = AU_XATTR_GET,
-		.u.get = {
-			.name	= name,
-			.value	= value,
-			.size	= size
-		},
-	};
-
-	return au_lgxattr(dentry, &arg);
-}
-
-int aufs_setxattr(struct dentry *dentry, const char *name, const void *value,
-		  size_t size, int flags)
-{
-	struct au_srxattr arg = {
-		.type = AU_XATTR_SET,
-		.u.set = {
-			.name	= name,
-			.value	= value,
-			.size	= size,
-			.flags	= flags
-		},
-	};
-
-	return au_srxattr(dentry, &arg);
-}
-
-int aufs_removexattr(struct dentry *dentry, const char *name)
-{
-	struct au_srxattr arg = {
-		.type = AU_XATTR_REMOVE,
-		.u.remove = {
-			.name	= name
-		},
-	};
-
-	return au_srxattr(dentry, &arg);
-}
-
-/* ---------------------------------------------------------------------- */
-
-#if 0
-static size_t au_xattr_list(struct dentry *dentry, char *list, size_t list_size,
-			    const char *name, size_t name_len, int type)
-{
-	return aufs_listxattr(dentry, list, list_size);
-}
-
-static int au_xattr_get(struct dentry *dentry, const char *name, void *buffer,
-			size_t size, int type)
-{
-	return aufs_getxattr(dentry, name, buffer, size);
-}
-
-static int au_xattr_set(struct dentry *dentry, const char *name,
-			const void *value, size_t size, int flags, int type)
-{
-	return aufs_setxattr(dentry, name, value, size, flags);
-}
-
-static const struct xattr_handler au_xattr_handler = {
-	/* no prefix, no flags */
-	.list	= au_xattr_list,
-	.get	= au_xattr_get,
-	.set	= au_xattr_set
-	/* why no remove? */
-};
-
-static const struct xattr_handler *au_xattr_handlers[] = {
-	&au_xattr_handler
-};
-
-void au_xattr_init(struct super_block *sb)
-{
-	/* sb->s_xattr = au_xattr_handlers; */
-}
-#endif
diff --git a/fs/aufs/xino.c b/fs/aufs/xino.c
deleted file mode 100644
index 994258e3f..000000000
--- a/fs/aufs/xino.c
+++ /dev/null
@@ -1,1305 +0,0 @@
-/*
- * Copyright (C) 2005-2016 Junjiro R. Okajima
- */
-
-/*
- * external inode number translation table and bitmap
- */
-
-#include <linux/seq_file.h>
-#include <linux/statfs.h>
-#include "aufs.h"
-
-/* todo: unnecessary to support mmap_sem since kernel-space? */
-ssize_t xino_fread(vfs_readf_t func, struct file *file, void *kbuf, size_t size,
-		   loff_t *pos)
-{
-	ssize_t err;
-	mm_segment_t oldfs;
-	union {
-		void *k;
-		char __user *u;
-	} buf;
-
-	buf.k = kbuf;
-	oldfs = get_fs();
-	set_fs(KERNEL_DS);
-	do {
-		/* todo: signal_pending? */
-		err = func(file, buf.u, size, pos);
-	} while (err == -EAGAIN || err == -EINTR);
-	set_fs(oldfs);
-
-#if 0 /* reserved for future use */
-	if (err > 0)
-		fsnotify_access(file->f_path.dentry);
-#endif
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static ssize_t xino_fwrite_wkq(vfs_writef_t func, struct file *file, void *buf,
-			       size_t size, loff_t *pos);
-
-static ssize_t do_xino_fwrite(vfs_writef_t func, struct file *file, void *kbuf,
-			      size_t size, loff_t *pos)
-{
-	ssize_t err;
-	mm_segment_t oldfs;
-	union {
-		void *k;
-		const char __user *u;
-	} buf;
-	int i;
-	const int prevent_endless = 10;
-
-	i = 0;
-	buf.k = kbuf;
-	oldfs = get_fs();
-	set_fs(KERNEL_DS);
-	do {
-		err = func(file, buf.u, size, pos);
-		if (err == -EINTR
-		    && !au_wkq_test()
-		    && fatal_signal_pending(current)) {
-			set_fs(oldfs);
-			err = xino_fwrite_wkq(func, file, kbuf, size, pos);
-			BUG_ON(err == -EINTR);
-			oldfs = get_fs();
-			set_fs(KERNEL_DS);
-		}
-	} while (i++ < prevent_endless
-		 && (err == -EAGAIN || err == -EINTR));
-	set_fs(oldfs);
-
-#if 0 /* reserved for future use */
-	if (err > 0)
-		fsnotify_modify(file->f_path.dentry);
-#endif
-
-	return err;
-}
-
-struct do_xino_fwrite_args {
-	ssize_t *errp;
-	vfs_writef_t func;
-	struct file *file;
-	void *buf;
-	size_t size;
-	loff_t *pos;
-};
-
-static void call_do_xino_fwrite(void *args)
-{
-	struct do_xino_fwrite_args *a = args;
-	*a->errp = do_xino_fwrite(a->func, a->file, a->buf, a->size, a->pos);
-}
-
-static ssize_t xino_fwrite_wkq(vfs_writef_t func, struct file *file, void *buf,
-			       size_t size, loff_t *pos)
-{
-	ssize_t err;
-	int wkq_err;
-	struct do_xino_fwrite_args args = {
-		.errp	= &err,
-		.func	= func,
-		.file	= file,
-		.buf	= buf,
-		.size	= size,
-		.pos	= pos
-	};
-
-	/*
-	 * it breaks RLIMIT_FSIZE and normal user's limit,
-	 * users should care about quota and real 'filesystem full.'
-	 */
-	wkq_err = au_wkq_wait(call_do_xino_fwrite, &args);
-	if (unlikely(wkq_err))
-		err = wkq_err;
-
-	return err;
-}
-
-ssize_t xino_fwrite(vfs_writef_t func, struct file *file, void *buf,
-		    size_t size, loff_t *pos)
-{
-	ssize_t err;
-
-	if (rlimit(RLIMIT_FSIZE) == RLIM_INFINITY) {
-		lockdep_off();
-		err = do_xino_fwrite(func, file, buf, size, pos);
-		lockdep_on();
-	} else
-		err = xino_fwrite_wkq(func, file, buf, size, pos);
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * create a new xinofile at the same place/path as @base_file.
- */
-struct file *au_xino_create2(struct file *base_file, struct file *copy_src)
-{
-	struct file *file;
-	struct dentry *base, *parent;
-	struct inode *dir, *delegated;
-	struct qstr *name;
-	struct path path;
-	int err;
-
-	base = base_file->f_path.dentry;
-	parent = base->d_parent; /* dir inode is locked */
-	dir = d_inode(parent);
-	IMustLock(dir);
-
-	file = ERR_PTR(-EINVAL);
-	name = &base->d_name;
-	path.dentry = vfsub_lookup_one_len(name->name, parent, name->len);
-	if (IS_ERR(path.dentry)) {
-		file = (void *)path.dentry;
-		pr_err("%pd lookup err %ld\n",
-		       base, PTR_ERR(path.dentry));
-		goto out;
-	}
-
-	/* no need to mnt_want_write() since we call dentry_open() later */
-	err = vfs_create(dir, path.dentry, S_IRUGO | S_IWUGO, NULL);
-	if (unlikely(err)) {
-		file = ERR_PTR(err);
-		pr_err("%pd create err %d\n", base, err);
-		goto out_dput;
-	}
-
-	path.mnt = base_file->f_path.mnt;
-	file = vfsub_dentry_open(&path,
-				 O_RDWR | O_CREAT | O_EXCL | O_LARGEFILE
-				 /* | __FMODE_NONOTIFY */);
-	if (IS_ERR(file)) {
-		pr_err("%pd open err %ld\n", base, PTR_ERR(file));
-		goto out_dput;
-	}
-
-	delegated = NULL;
-	err = vfsub_unlink(dir, &file->f_path, &delegated, /*force*/0);
-	if (unlikely(err == -EWOULDBLOCK)) {
-		pr_warn("cannot retry for NFSv4 delegation"
-			" for an internal unlink\n");
-		iput(delegated);
-	}
-	if (unlikely(err)) {
-		pr_err("%pd unlink err %d\n", base, err);
-		goto out_fput;
-	}
-
-	if (copy_src) {
-		/* no one can touch copy_src xino */
-		err = au_copy_file(file, copy_src, vfsub_f_size_read(copy_src));
-		if (unlikely(err)) {
-			pr_err("%pd copy err %d\n", base, err);
-			goto out_fput;
-		}
-	}
-	goto out_dput; /* success */
-
-out_fput:
-	fput(file);
-	file = ERR_PTR(err);
-out_dput:
-	dput(path.dentry);
-out:
-	return file;
-}
-
-struct au_xino_lock_dir {
-	struct au_hinode *hdir;
-	struct dentry *parent;
-	struct mutex *mtx;
-};
-
-static void au_xino_lock_dir(struct super_block *sb, struct file *xino,
-			     struct au_xino_lock_dir *ldir)
-{
-	aufs_bindex_t brid, bindex;
-
-	ldir->hdir = NULL;
-	bindex = -1;
-	brid = au_xino_brid(sb);
-	if (brid >= 0)
-		bindex = au_br_index(sb, brid);
-	if (bindex >= 0) {
-		ldir->hdir = au_hi(d_inode(sb->s_root), bindex);
-		au_hn_imtx_lock_nested(ldir->hdir, AuLsc_I_PARENT);
-	} else {
-		ldir->parent = dget_parent(xino->f_path.dentry);
-		ldir->mtx = &d_inode(ldir->parent)->i_mutex;
-		mutex_lock_nested(ldir->mtx, AuLsc_I_PARENT);
-	}
-}
-
-static void au_xino_unlock_dir(struct au_xino_lock_dir *ldir)
-{
-	if (ldir->hdir)
-		au_hn_imtx_unlock(ldir->hdir);
-	else {
-		mutex_unlock(ldir->mtx);
-		dput(ldir->parent);
-	}
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* trucate xino files asynchronously */
-
-int au_xino_trunc(struct super_block *sb, aufs_bindex_t bindex)
-{
-	int err;
-	unsigned long jiffy;
-	blkcnt_t blocks;
-	aufs_bindex_t bi, bend;
-	struct kstatfs *st;
-	struct au_branch *br;
-	struct file *new_xino, *file;
-	struct super_block *h_sb;
-	struct au_xino_lock_dir ldir;
-
-	err = -ENOMEM;
-	st = kmalloc(sizeof(*st), GFP_NOFS);
-	if (unlikely(!st))
-		goto out;
-
-	err = -EINVAL;
-	bend = au_sbend(sb);
-	if (unlikely(bindex < 0 || bend < bindex))
-		goto out_st;
-	br = au_sbr(sb, bindex);
-	file = br->br_xino.xi_file;
-	if (!file)
-		goto out_st;
-
-	err = vfs_statfs(&file->f_path, st);
-	if (unlikely(err))
-		AuErr1("statfs err %d, ignored\n", err);
-	jiffy = jiffies;
-	blocks = file_inode(file)->i_blocks;
-	pr_info("begin truncating xino(b%d), ib%llu, %llu/%llu free blks\n",
-		bindex, (u64)blocks, st->f_bfree, st->f_blocks);
-
-	au_xino_lock_dir(sb, file, &ldir);
-	/* mnt_want_write() is unnecessary here */
-	new_xino = au_xino_create2(file, file);
-	au_xino_unlock_dir(&ldir);
-	err = PTR_ERR(new_xino);
-	if (IS_ERR(new_xino)) {
-		pr_err("err %d, ignored\n", err);
-		goto out_st;
-	}
-	err = 0;
-	fput(file);
-	br->br_xino.xi_file = new_xino;
-
-	h_sb = au_br_sb(br);
-	for (bi = 0; bi <= bend; bi++) {
-		if (unlikely(bi == bindex))
-			continue;
-		br = au_sbr(sb, bi);
-		if (au_br_sb(br) != h_sb)
-			continue;
-
-		fput(br->br_xino.xi_file);
-		br->br_xino.xi_file = new_xino;
-		get_file(new_xino);
-	}
-
-	err = vfs_statfs(&new_xino->f_path, st);
-	if (!err) {
-		pr_info("end truncating xino(b%d), ib%llu, %llu/%llu free blks\n",
-			bindex, (u64)file_inode(new_xino)->i_blocks,
-			st->f_bfree, st->f_blocks);
-		if (file_inode(new_xino)->i_blocks < blocks)
-			au_sbi(sb)->si_xino_jiffy = jiffy;
-	} else
-		AuErr1("statfs err %d, ignored\n", err);
-
-out_st:
-	kfree(st);
-out:
-	return err;
-}
-
-struct xino_do_trunc_args {
-	struct super_block *sb;
-	struct au_branch *br;
-};
-
-static void xino_do_trunc(void *_args)
-{
-	struct xino_do_trunc_args *args = _args;
-	struct super_block *sb;
-	struct au_branch *br;
-	struct inode *dir;
-	int err;
-	aufs_bindex_t bindex;
-
-	err = 0;
-	sb = args->sb;
-	dir = d_inode(sb->s_root);
-	br = args->br;
-
-	si_noflush_write_lock(sb);
-	ii_read_lock_parent(dir);
-	bindex = au_br_index(sb, br->br_id);
-	err = au_xino_trunc(sb, bindex);
-	ii_read_unlock(dir);
-	if (unlikely(err))
-		pr_warn("err b%d, (%d)\n", bindex, err);
-	atomic_dec(&br->br_xino_running);
-	atomic_dec(&br->br_count);
-	si_write_unlock(sb);
-	au_nwt_done(&au_sbi(sb)->si_nowait);
-	kfree(args);
-}
-
-static int xino_trunc_test(struct super_block *sb, struct au_branch *br)
-{
-	int err;
-	struct kstatfs st;
-	struct au_sbinfo *sbinfo;
-
-	/* todo: si_xino_expire and the ratio should be customizable */
-	sbinfo = au_sbi(sb);
-	if (time_before(jiffies,
-			sbinfo->si_xino_jiffy + sbinfo->si_xino_expire))
-		return 0;
-
-	/* truncation border */
-	err = vfs_statfs(&br->br_xino.xi_file->f_path, &st);
-	if (unlikely(err)) {
-		AuErr1("statfs err %d, ignored\n", err);
-		return 0;
-	}
-	if (div64_u64(st.f_bfree * 100, st.f_blocks) >= AUFS_XINO_DEF_TRUNC)
-		return 0;
-
-	return 1;
-}
-
-static void xino_try_trunc(struct super_block *sb, struct au_branch *br)
-{
-	struct xino_do_trunc_args *args;
-	int wkq_err;
-
-	if (!xino_trunc_test(sb, br))
-		return;
-
-	if (atomic_inc_return(&br->br_xino_running) > 1)
-		goto out;
-
-	/* lock and kfree() will be called in trunc_xino() */
-	args = kmalloc(sizeof(*args), GFP_NOFS);
-	if (unlikely(!args)) {
-		AuErr1("no memory\n");
-		goto out_args;
-	}
-
-	atomic_inc(&br->br_count);
-	args->sb = sb;
-	args->br = br;
-	wkq_err = au_wkq_nowait(xino_do_trunc, args, sb, /*flags*/0);
-	if (!wkq_err)
-		return; /* success */
-
-	pr_err("wkq %d\n", wkq_err);
-	atomic_dec(&br->br_count);
-
-out_args:
-	kfree(args);
-out:
-	atomic_dec(&br->br_xino_running);
-}
-
-/* ---------------------------------------------------------------------- */
-
-static int au_xino_do_write(vfs_writef_t write, struct file *file,
-			    ino_t h_ino, ino_t ino)
-{
-	loff_t pos;
-	ssize_t sz;
-
-	pos = h_ino;
-	if (unlikely(au_loff_max / sizeof(ino) - 1 < pos)) {
-		AuIOErr1("too large hi%lu\n", (unsigned long)h_ino);
-		return -EFBIG;
-	}
-	pos *= sizeof(ino);
-	sz = xino_fwrite(write, file, &ino, sizeof(ino), &pos);
-	if (sz == sizeof(ino))
-		return 0; /* success */
-
-	AuIOErr("write failed (%zd)\n", sz);
-	return -EIO;
-}
-
-/*
- * write @ino to the xinofile for the specified branch{@sb, @bindex}
- * at the position of @h_ino.
- * even if @ino is zero, it is written to the xinofile and means no entry.
- * if the size of the xino file on a specific filesystem exceeds the watermark,
- * try truncating it.
- */
-int au_xino_write(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino,
-		  ino_t ino)
-{
-	int err;
-	unsigned int mnt_flags;
-	struct au_branch *br;
-
-	BUILD_BUG_ON(sizeof(long long) != sizeof(au_loff_max)
-		     || ((loff_t)-1) > 0);
-	SiMustAnyLock(sb);
-
-	mnt_flags = au_mntflags(sb);
-	if (!au_opt_test(mnt_flags, XINO))
-		return 0;
-
-	br = au_sbr(sb, bindex);
-	err = au_xino_do_write(au_sbi(sb)->si_xwrite, br->br_xino.xi_file,
-			       h_ino, ino);
-	if (!err) {
-		if (au_opt_test(mnt_flags, TRUNC_XINO)
-		    && au_test_fs_trunc_xino(au_br_sb(br)))
-			xino_try_trunc(sb, br);
-		return 0; /* success */
-	}
-
-	AuIOErr("write failed (%d)\n", err);
-	return -EIO;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* aufs inode number bitmap */
-
-static const int page_bits = (int)PAGE_SIZE * BITS_PER_BYTE;
-static ino_t xib_calc_ino(unsigned long pindex, int bit)
-{
-	ino_t ino;
-
-	AuDebugOn(bit < 0 || page_bits <= bit);
-	ino = AUFS_FIRST_INO + pindex * page_bits + bit;
-	return ino;
-}
-
-static void xib_calc_bit(ino_t ino, unsigned long *pindex, int *bit)
-{
-	AuDebugOn(ino < AUFS_FIRST_INO);
-	ino -= AUFS_FIRST_INO;
-	*pindex = ino / page_bits;
-	*bit = ino % page_bits;
-}
-
-static int xib_pindex(struct super_block *sb, unsigned long pindex)
-{
-	int err;
-	loff_t pos;
-	ssize_t sz;
-	struct au_sbinfo *sbinfo;
-	struct file *xib;
-	unsigned long *p;
-
-	sbinfo = au_sbi(sb);
-	MtxMustLock(&sbinfo->si_xib_mtx);
-	AuDebugOn(pindex > ULONG_MAX / PAGE_SIZE
-		  || !au_opt_test(sbinfo->si_mntflags, XINO));
-
-	if (pindex == sbinfo->si_xib_last_pindex)
-		return 0;
-
-	xib = sbinfo->si_xib;
-	p = sbinfo->si_xib_buf;
-	pos = sbinfo->si_xib_last_pindex;
-	pos *= PAGE_SIZE;
-	sz = xino_fwrite(sbinfo->si_xwrite, xib, p, PAGE_SIZE, &pos);
-	if (unlikely(sz != PAGE_SIZE))
-		goto out;
-
-	pos = pindex;
-	pos *= PAGE_SIZE;
-	if (vfsub_f_size_read(xib) >= pos + PAGE_SIZE)
-		sz = xino_fread(sbinfo->si_xread, xib, p, PAGE_SIZE, &pos);
-	else {
-		memset(p, 0, PAGE_SIZE);
-		sz = xino_fwrite(sbinfo->si_xwrite, xib, p, PAGE_SIZE, &pos);
-	}
-	if (sz == PAGE_SIZE) {
-		sbinfo->si_xib_last_pindex = pindex;
-		return 0; /* success */
-	}
-
-out:
-	AuIOErr1("write failed (%zd)\n", sz);
-	err = sz;
-	if (sz >= 0)
-		err = -EIO;
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-static void au_xib_clear_bit(struct inode *inode)
-{
-	int err, bit;
-	unsigned long pindex;
-	struct super_block *sb;
-	struct au_sbinfo *sbinfo;
-
-	AuDebugOn(inode->i_nlink);
-
-	sb = inode->i_sb;
-	xib_calc_bit(inode->i_ino, &pindex, &bit);
-	AuDebugOn(page_bits <= bit);
-	sbinfo = au_sbi(sb);
-	mutex_lock(&sbinfo->si_xib_mtx);
-	err = xib_pindex(sb, pindex);
-	if (!err) {
-		clear_bit(bit, sbinfo->si_xib_buf);
-		sbinfo->si_xib_next_bit = bit;
-	}
-	mutex_unlock(&sbinfo->si_xib_mtx);
-}
-
-/* for s_op->delete_inode() */
-void au_xino_delete_inode(struct inode *inode, const int unlinked)
-{
-	int err;
-	unsigned int mnt_flags;
-	aufs_bindex_t bindex, bend, bi;
-	unsigned char try_trunc;
-	struct au_iinfo *iinfo;
-	struct super_block *sb;
-	struct au_hinode *hi;
-	struct inode *h_inode;
-	struct au_branch *br;
-	vfs_writef_t xwrite;
-
-	sb = inode->i_sb;
-	mnt_flags = au_mntflags(sb);
-	if (!au_opt_test(mnt_flags, XINO)
-	    || inode->i_ino == AUFS_ROOT_INO)
-		return;
-
-	if (unlinked) {
-		au_xigen_inc(inode);
-		au_xib_clear_bit(inode);
-	}
-
-	iinfo = au_ii(inode);
-	if (!iinfo)
-		return;
-
-	bindex = iinfo->ii_bstart;
-	if (bindex < 0)
-		return;
-
-	xwrite = au_sbi(sb)->si_xwrite;
-	try_trunc = !!au_opt_test(mnt_flags, TRUNC_XINO);
-	hi = iinfo->ii_hinode + bindex;
-	bend = iinfo->ii_bend;
-	for (; bindex <= bend; bindex++, hi++) {
-		h_inode = hi->hi_inode;
-		if (!h_inode
-		    || (!unlinked && h_inode->i_nlink))
-			continue;
-
-		/* inode may not be revalidated */
-		bi = au_br_index(sb, hi->hi_id);
-		if (bi < 0)
-			continue;
-
-		br = au_sbr(sb, bi);
-		err = au_xino_do_write(xwrite, br->br_xino.xi_file,
-				       h_inode->i_ino, /*ino*/0);
-		if (!err && try_trunc
-		    && au_test_fs_trunc_xino(au_br_sb(br)))
-			xino_try_trunc(sb, br);
-	}
-}
-
-/* get an unused inode number from bitmap */
-ino_t au_xino_new_ino(struct super_block *sb)
-{
-	ino_t ino;
-	unsigned long *p, pindex, ul, pend;
-	struct au_sbinfo *sbinfo;
-	struct file *file;
-	int free_bit, err;
-
-	if (!au_opt_test(au_mntflags(sb), XINO))
-		return iunique(sb, AUFS_FIRST_INO);
-
-	sbinfo = au_sbi(sb);
-	mutex_lock(&sbinfo->si_xib_mtx);
-	p = sbinfo->si_xib_buf;
-	free_bit = sbinfo->si_xib_next_bit;
-	if (free_bit < page_bits && !test_bit(free_bit, p))
-		goto out; /* success */
-	free_bit = find_first_zero_bit(p, page_bits);
-	if (free_bit < page_bits)
-		goto out; /* success */
-
-	pindex = sbinfo->si_xib_last_pindex;
-	for (ul = pindex - 1; ul < ULONG_MAX; ul--) {
-		err = xib_pindex(sb, ul);
-		if (unlikely(err))
-			goto out_err;
-		free_bit = find_first_zero_bit(p, page_bits);
-		if (free_bit < page_bits)
-			goto out; /* success */
-	}
-
-	file = sbinfo->si_xib;
-	pend = vfsub_f_size_read(file) / PAGE_SIZE;
-	for (ul = pindex + 1; ul <= pend; ul++) {
-		err = xib_pindex(sb, ul);
-		if (unlikely(err))
-			goto out_err;
-		free_bit = find_first_zero_bit(p, page_bits);
-		if (free_bit < page_bits)
-			goto out; /* success */
-	}
-	BUG();
-
-out:
-	set_bit(free_bit, p);
-	sbinfo->si_xib_next_bit = free_bit + 1;
-	pindex = sbinfo->si_xib_last_pindex;
-	mutex_unlock(&sbinfo->si_xib_mtx);
-	ino = xib_calc_ino(pindex, free_bit);
-	AuDbg("i%lu\n", (unsigned long)ino);
-	return ino;
-out_err:
-	mutex_unlock(&sbinfo->si_xib_mtx);
-	AuDbg("i0\n");
-	return 0;
-}
-
-/*
- * read @ino from xinofile for the specified branch{@sb, @bindex}
- * at the position of @h_ino.
- * if @ino does not exist and @do_new is true, get new one.
- */
-int au_xino_read(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino,
-		 ino_t *ino)
-{
-	int err;
-	ssize_t sz;
-	loff_t pos;
-	struct file *file;
-	struct au_sbinfo *sbinfo;
-
-	*ino = 0;
-	if (!au_opt_test(au_mntflags(sb), XINO))
-		return 0; /* no xino */
-
-	err = 0;
-	sbinfo = au_sbi(sb);
-	pos = h_ino;
-	if (unlikely(au_loff_max / sizeof(*ino) - 1 < pos)) {
-		AuIOErr1("too large hi%lu\n", (unsigned long)h_ino);
-		return -EFBIG;
-	}
-	pos *= sizeof(*ino);
-
-	file = au_sbr(sb, bindex)->br_xino.xi_file;
-	if (vfsub_f_size_read(file) < pos + sizeof(*ino))
-		return 0; /* no ino */
-
-	sz = xino_fread(sbinfo->si_xread, file, ino, sizeof(*ino), &pos);
-	if (sz == sizeof(*ino))
-		return 0; /* success */
-
-	err = sz;
-	if (unlikely(sz >= 0)) {
-		err = -EIO;
-		AuIOErr("xino read error (%zd)\n", sz);
-	}
-
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* create and set a new xino file */
-
-struct file *au_xino_create(struct super_block *sb, char *fname, int silent)
-{
-	struct file *file;
-	struct dentry *h_parent, *d;
-	struct inode *h_dir, *inode;
-	int err;
-
-	/*
-	 * at mount-time, and the xino file is the default path,
-	 * hnotify is disabled so we have no notify events to ignore.
-	 * when a user specified the xino, we cannot get au_hdir to be ignored.
-	 */
-	file = vfsub_filp_open(fname, O_RDWR | O_CREAT | O_EXCL | O_LARGEFILE
-			       /* | __FMODE_NONOTIFY */,
-			       S_IRUGO | S_IWUGO);
-	if (IS_ERR(file)) {
-		if (!silent)
-			pr_err("open %s(%ld)\n", fname, PTR_ERR(file));
-		return file;
-	}
-
-	/* keep file count */
-	err = 0;
-	inode = file_inode(file);
-	h_parent = dget_parent(file->f_path.dentry);
-	h_dir = d_inode(h_parent);
-	mutex_lock_nested(&h_dir->i_mutex, AuLsc_I_PARENT);
-	/* mnt_want_write() is unnecessary here */
-	/* no delegation since it is just created */
-	if (inode->i_nlink)
-		err = vfsub_unlink(h_dir, &file->f_path, /*delegated*/NULL,
-				   /*force*/0);
-	mutex_unlock(&h_dir->i_mutex);
-	dput(h_parent);
-	if (unlikely(err)) {
-		if (!silent)
-			pr_err("unlink %s(%d)\n", fname, err);
-		goto out;
-	}
-
-	err = -EINVAL;
-	d = file->f_path.dentry;
-	if (unlikely(sb == d->d_sb)) {
-		if (!silent)
-			pr_err("%s must be outside\n", fname);
-		goto out;
-	}
-	if (unlikely(au_test_fs_bad_xino(d->d_sb))) {
-		if (!silent)
-			pr_err("xino doesn't support %s(%s)\n",
-			       fname, au_sbtype(d->d_sb));
-		goto out;
-	}
-	return file; /* success */
-
-out:
-	fput(file);
-	file = ERR_PTR(err);
-	return file;
-}
-
-/*
- * find another branch who is on the same filesystem of the specified
- * branch{@btgt}. search until @bend.
- */
-static int is_sb_shared(struct super_block *sb, aufs_bindex_t btgt,
-			aufs_bindex_t bend)
-{
-	aufs_bindex_t bindex;
-	struct super_block *tgt_sb = au_sbr_sb(sb, btgt);
-
-	for (bindex = 0; bindex < btgt; bindex++)
-		if (unlikely(tgt_sb == au_sbr_sb(sb, bindex)))
-			return bindex;
-	for (bindex++; bindex <= bend; bindex++)
-		if (unlikely(tgt_sb == au_sbr_sb(sb, bindex)))
-			return bindex;
-	return -1;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * initialize the xinofile for the specified branch @br
- * at the place/path where @base_file indicates.
- * test whether another branch is on the same filesystem or not,
- * if @do_test is true.
- */
-int au_xino_br(struct super_block *sb, struct au_branch *br, ino_t h_ino,
-	       struct file *base_file, int do_test)
-{
-	int err;
-	ino_t ino;
-	aufs_bindex_t bend, bindex;
-	struct au_branch *shared_br, *b;
-	struct file *file;
-	struct super_block *tgt_sb;
-
-	shared_br = NULL;
-	bend = au_sbend(sb);
-	if (do_test) {
-		tgt_sb = au_br_sb(br);
-		for (bindex = 0; bindex <= bend; bindex++) {
-			b = au_sbr(sb, bindex);
-			if (tgt_sb == au_br_sb(b)) {
-				shared_br = b;
-				break;
-			}
-		}
-	}
-
-	if (!shared_br || !shared_br->br_xino.xi_file) {
-		struct au_xino_lock_dir ldir;
-
-		au_xino_lock_dir(sb, base_file, &ldir);
-		/* mnt_want_write() is unnecessary here */
-		file = au_xino_create2(base_file, NULL);
-		au_xino_unlock_dir(&ldir);
-		err = PTR_ERR(file);
-		if (IS_ERR(file))
-			goto out;
-		br->br_xino.xi_file = file;
-	} else {
-		br->br_xino.xi_file = shared_br->br_xino.xi_file;
-		get_file(br->br_xino.xi_file);
-	}
-
-	ino = AUFS_ROOT_INO;
-	err = au_xino_do_write(au_sbi(sb)->si_xwrite, br->br_xino.xi_file,
-			       h_ino, ino);
-	if (unlikely(err)) {
-		fput(br->br_xino.xi_file);
-		br->br_xino.xi_file = NULL;
-	}
-
-out:
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/* trucate a xino bitmap file */
-
-/* todo: slow */
-static int do_xib_restore(struct super_block *sb, struct file *file, void *page)
-{
-	int err, bit;
-	ssize_t sz;
-	unsigned long pindex;
-	loff_t pos, pend;
-	struct au_sbinfo *sbinfo;
-	vfs_readf_t func;
-	ino_t *ino;
-	unsigned long *p;
-
-	err = 0;
-	sbinfo = au_sbi(sb);
-	MtxMustLock(&sbinfo->si_xib_mtx);
-	p = sbinfo->si_xib_buf;
-	func = sbinfo->si_xread;
-	pend = vfsub_f_size_read(file);
-	pos = 0;
-	while (pos < pend) {
-		sz = xino_fread(func, file, page, PAGE_SIZE, &pos);
-		err = sz;
-		if (unlikely(sz <= 0))
-			goto out;
-
-		err = 0;
-		for (ino = page; sz > 0; ino++, sz -= sizeof(ino)) {
-			if (unlikely(*ino < AUFS_FIRST_INO))
-				continue;
-
-			xib_calc_bit(*ino, &pindex, &bit);
-			AuDebugOn(page_bits <= bit);
-			err = xib_pindex(sb, pindex);
-			if (!err)
-				set_bit(bit, p);
-			else
-				goto out;
-		}
-	}
-
-out:
-	return err;
-}
-
-static int xib_restore(struct super_block *sb)
-{
-	int err;
-	aufs_bindex_t bindex, bend;
-	void *page;
-
-	err = -ENOMEM;
-	page = (void *)__get_free_page(GFP_NOFS);
-	if (unlikely(!page))
-		goto out;
-
-	err = 0;
-	bend = au_sbend(sb);
-	for (bindex = 0; !err && bindex <= bend; bindex++)
-		if (!bindex || is_sb_shared(sb, bindex, bindex - 1) < 0)
-			err = do_xib_restore
-				(sb, au_sbr(sb, bindex)->br_xino.xi_file, page);
-		else
-			AuDbg("b%d\n", bindex);
-	free_page((unsigned long)page);
-
-out:
-	return err;
-}
-
-int au_xib_trunc(struct super_block *sb)
-{
-	int err;
-	ssize_t sz;
-	loff_t pos;
-	struct au_xino_lock_dir ldir;
-	struct au_sbinfo *sbinfo;
-	unsigned long *p;
-	struct file *file;
-
-	SiMustWriteLock(sb);
-
-	err = 0;
-	sbinfo = au_sbi(sb);
-	if (!au_opt_test(sbinfo->si_mntflags, XINO))
-		goto out;
-
-	file = sbinfo->si_xib;
-	if (vfsub_f_size_read(file) <= PAGE_SIZE)
-		goto out;
-
-	au_xino_lock_dir(sb, file, &ldir);
-	/* mnt_want_write() is unnecessary here */
-	file = au_xino_create2(sbinfo->si_xib, NULL);
-	au_xino_unlock_dir(&ldir);
-	err = PTR_ERR(file);
-	if (IS_ERR(file))
-		goto out;
-	fput(sbinfo->si_xib);
-	sbinfo->si_xib = file;
-
-	p = sbinfo->si_xib_buf;
-	memset(p, 0, PAGE_SIZE);
-	pos = 0;
-	sz = xino_fwrite(sbinfo->si_xwrite, sbinfo->si_xib, p, PAGE_SIZE, &pos);
-	if (unlikely(sz != PAGE_SIZE)) {
-		err = sz;
-		AuIOErr("err %d\n", err);
-		if (sz >= 0)
-			err = -EIO;
-		goto out;
-	}
-
-	mutex_lock(&sbinfo->si_xib_mtx);
-	/* mnt_want_write() is unnecessary here */
-	err = xib_restore(sb);
-	mutex_unlock(&sbinfo->si_xib_mtx);
-
-out:
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * xino mount option handlers
- */
-
-/* xino bitmap */
-static void xino_clear_xib(struct super_block *sb)
-{
-	struct au_sbinfo *sbinfo;
-
-	SiMustWriteLock(sb);
-
-	sbinfo = au_sbi(sb);
-	sbinfo->si_xread = NULL;
-	sbinfo->si_xwrite = NULL;
-	if (sbinfo->si_xib)
-		fput(sbinfo->si_xib);
-	sbinfo->si_xib = NULL;
-	free_page((unsigned long)sbinfo->si_xib_buf);
-	sbinfo->si_xib_buf = NULL;
-}
-
-static int au_xino_set_xib(struct super_block *sb, struct file *base)
-{
-	int err;
-	loff_t pos;
-	struct au_sbinfo *sbinfo;
-	struct file *file;
-
-	SiMustWriteLock(sb);
-
-	sbinfo = au_sbi(sb);
-	file = au_xino_create2(base, sbinfo->si_xib);
-	err = PTR_ERR(file);
-	if (IS_ERR(file))
-		goto out;
-	if (sbinfo->si_xib)
-		fput(sbinfo->si_xib);
-	sbinfo->si_xib = file;
-	sbinfo->si_xread = vfs_readf(file);
-	sbinfo->si_xwrite = vfs_writef(file);
-
-	err = -ENOMEM;
-	if (!sbinfo->si_xib_buf)
-		sbinfo->si_xib_buf = (void *)get_zeroed_page(GFP_NOFS);
-	if (unlikely(!sbinfo->si_xib_buf))
-		goto out_unset;
-
-	sbinfo->si_xib_last_pindex = 0;
-	sbinfo->si_xib_next_bit = 0;
-	if (vfsub_f_size_read(file) < PAGE_SIZE) {
-		pos = 0;
-		err = xino_fwrite(sbinfo->si_xwrite, file, sbinfo->si_xib_buf,
-				  PAGE_SIZE, &pos);
-		if (unlikely(err != PAGE_SIZE))
-			goto out_free;
-	}
-	err = 0;
-	goto out; /* success */
-
-out_free:
-	free_page((unsigned long)sbinfo->si_xib_buf);
-	sbinfo->si_xib_buf = NULL;
-	if (err >= 0)
-		err = -EIO;
-out_unset:
-	fput(sbinfo->si_xib);
-	sbinfo->si_xib = NULL;
-	sbinfo->si_xread = NULL;
-	sbinfo->si_xwrite = NULL;
-out:
-	return err;
-}
-
-/* xino for each branch */
-static void xino_clear_br(struct super_block *sb)
-{
-	aufs_bindex_t bindex, bend;
-	struct au_branch *br;
-
-	bend = au_sbend(sb);
-	for (bindex = 0; bindex <= bend; bindex++) {
-		br = au_sbr(sb, bindex);
-		if (!br || !br->br_xino.xi_file)
-			continue;
-
-		fput(br->br_xino.xi_file);
-		br->br_xino.xi_file = NULL;
-	}
-}
-
-static int au_xino_set_br(struct super_block *sb, struct file *base)
-{
-	int err;
-	ino_t ino;
-	aufs_bindex_t bindex, bend, bshared;
-	struct {
-		struct file *old, *new;
-	} *fpair, *p;
-	struct au_branch *br;
-	struct inode *inode;
-	vfs_writef_t writef;
-
-	SiMustWriteLock(sb);
-
-	err = -ENOMEM;
-	bend = au_sbend(sb);
-	fpair = kcalloc(bend + 1, sizeof(*fpair), GFP_NOFS);
-	if (unlikely(!fpair))
-		goto out;
-
-	inode = d_inode(sb->s_root);
-	ino = AUFS_ROOT_INO;
-	writef = au_sbi(sb)->si_xwrite;
-	for (bindex = 0, p = fpair; bindex <= bend; bindex++, p++) {
-		br = au_sbr(sb, bindex);
-		bshared = is_sb_shared(sb, bindex, bindex - 1);
-		if (bshared >= 0) {
-			/* shared xino */
-			*p = fpair[bshared];
-			get_file(p->new);
-		}
-
-		if (!p->new) {
-			/* new xino */
-			p->old = br->br_xino.xi_file;
-			p->new = au_xino_create2(base, br->br_xino.xi_file);
-			err = PTR_ERR(p->new);
-			if (IS_ERR(p->new)) {
-				p->new = NULL;
-				goto out_pair;
-			}
-		}
-
-		err = au_xino_do_write(writef, p->new,
-				       au_h_iptr(inode, bindex)->i_ino, ino);
-		if (unlikely(err))
-			goto out_pair;
-	}
-
-	for (bindex = 0, p = fpair; bindex <= bend; bindex++, p++) {
-		br = au_sbr(sb, bindex);
-		if (br->br_xino.xi_file)
-			fput(br->br_xino.xi_file);
-		get_file(p->new);
-		br->br_xino.xi_file = p->new;
-	}
-
-out_pair:
-	for (bindex = 0, p = fpair; bindex <= bend; bindex++, p++)
-		if (p->new)
-			fput(p->new);
-		else
-			break;
-	kfree(fpair);
-out:
-	return err;
-}
-
-void au_xino_clr(struct super_block *sb)
-{
-	struct au_sbinfo *sbinfo;
-
-	au_xigen_clr(sb);
-	xino_clear_xib(sb);
-	xino_clear_br(sb);
-	sbinfo = au_sbi(sb);
-	/* lvalue, do not call au_mntflags() */
-	au_opt_clr(sbinfo->si_mntflags, XINO);
-}
-
-int au_xino_set(struct super_block *sb, struct au_opt_xino *xino, int remount)
-{
-	int err, skip;
-	struct dentry *parent, *cur_parent;
-	struct qstr *dname, *cur_name;
-	struct file *cur_xino;
-	struct inode *dir;
-	struct au_sbinfo *sbinfo;
-
-	SiMustWriteLock(sb);
-
-	err = 0;
-	sbinfo = au_sbi(sb);
-	parent = dget_parent(xino->file->f_path.dentry);
-	if (remount) {
-		skip = 0;
-		dname = &xino->file->f_path.dentry->d_name;
-		cur_xino = sbinfo->si_xib;
-		if (cur_xino) {
-			cur_parent = dget_parent(cur_xino->f_path.dentry);
-			cur_name = &cur_xino->f_path.dentry->d_name;
-			skip = (cur_parent == parent
-				&& au_qstreq(dname, cur_name));
-			dput(cur_parent);
-		}
-		if (skip)
-			goto out;
-	}
-
-	au_opt_set(sbinfo->si_mntflags, XINO);
-	dir = d_inode(parent);
-	mutex_lock_nested(&dir->i_mutex, AuLsc_I_PARENT);
-	/* mnt_want_write() is unnecessary here */
-	err = au_xino_set_xib(sb, xino->file);
-	if (!err)
-		err = au_xigen_set(sb, xino->file);
-	if (!err)
-		err = au_xino_set_br(sb, xino->file);
-	mutex_unlock(&dir->i_mutex);
-	if (!err)
-		goto out; /* success */
-
-	/* reset all */
-	AuIOErr("failed creating xino(%d).\n", err);
-	au_xigen_clr(sb);
-	xino_clear_xib(sb);
-
-out:
-	dput(parent);
-	return err;
-}
-
-/* ---------------------------------------------------------------------- */
-
-/*
- * create a xinofile at the default place/path.
- */
-struct file *au_xino_def(struct super_block *sb)
-{
-	struct file *file;
-	char *page, *p;
-	struct au_branch *br;
-	struct super_block *h_sb;
-	struct path path;
-	aufs_bindex_t bend, bindex, bwr;
-
-	br = NULL;
-	bend = au_sbend(sb);
-	bwr = -1;
-	for (bindex = 0; bindex <= bend; bindex++) {
-		br = au_sbr(sb, bindex);
-		if (au_br_writable(br->br_perm)
-		    && !au_test_fs_bad_xino(au_br_sb(br))) {
-			bwr = bindex;
-			break;
-		}
-	}
-
-	if (bwr >= 0) {
-		file = ERR_PTR(-ENOMEM);
-		page = (void *)__get_free_page(GFP_NOFS);
-		if (unlikely(!page))
-			goto out;
-		path.mnt = au_br_mnt(br);
-		path.dentry = au_h_dptr(sb->s_root, bwr);
-		p = d_path(&path, page, PATH_MAX - sizeof(AUFS_XINO_FNAME));
-		file = (void *)p;
-		if (!IS_ERR(p)) {
-			strcat(p, "/" AUFS_XINO_FNAME);
-			AuDbg("%s\n", p);
-			file = au_xino_create(sb, p, /*silent*/0);
-			if (!IS_ERR(file))
-				au_xino_brid_set(sb, br->br_id);
-		}
-		free_page((unsigned long)page);
-	} else {
-		file = au_xino_create(sb, AUFS_XINO_DEFPATH, /*silent*/0);
-		if (IS_ERR(file))
-			goto out;
-		h_sb = file->f_path.dentry->d_sb;
-		if (unlikely(au_test_fs_bad_xino(h_sb))) {
-			pr_err("xino doesn't support %s(%s)\n",
-			       AUFS_XINO_DEFPATH, au_sbtype(h_sb));
-			fput(file);
-			file = ERR_PTR(-EINVAL);
-		}
-		if (!IS_ERR(file))
-			au_xino_brid_set(sb, -1);
-	}
-
-out:
-	return file;
-}
-
-/* ---------------------------------------------------------------------- */
-
-int au_xino_path(struct seq_file *seq, struct file *file)
-{
-	int err;
-
-	err = au_seq_path(seq, &file->f_path);
-	if (unlikely(err))
-		goto out;
-
-#define Deleted "\\040(deleted)"
-	seq->count -= sizeof(Deleted) - 1;
-	AuDebugOn(memcmp(seq->buf + seq->count, Deleted,
-			 sizeof(Deleted) - 1));
-#undef Deleted
-
-out:
-	return err;
-}
diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c
index da0c33481..84e037d1d 100644
--- a/fs/autofs4/symlink.c
+++ b/fs/autofs4/symlink.c
@@ -12,10 +12,16 @@
 
 #include "autofs_i.h"
 
-static const char *autofs4_follow_link(struct dentry *dentry, void **cookie)
+static const char *autofs4_get_link(struct dentry *dentry,
+				    struct inode *inode,
+				    struct delayed_call *done)
 {
-	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
-	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	struct autofs_sb_info *sbi;
+	struct autofs_info *ino;
+	if (!dentry)
+		return ERR_PTR(-ECHILD);
+	sbi = autofs4_sbi(dentry->d_sb);
+	ino = autofs4_dentry_ino(dentry);
 	if (ino && !autofs4_oz_mode(sbi))
 		ino->last_used = jiffies;
 	return d_inode(dentry)->i_private;
@@ -23,5 +29,5 @@ static const char *autofs4_follow_link(struct dentry *dentry, void **cookie)
 
 const struct inode_operations autofs4_symlink_inode_operations = {
 	.readlink	= generic_readlink,
-	.follow_link	= autofs4_follow_link
+	.get_link	= autofs4_get_link
 };
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 861b1e1c4..103f5d7c3 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -192,7 +192,7 @@ EXPORT_SYMBOL(make_bad_inode);
  *	Returns true if the inode in question has been marked as bad.
  */
  
-int is_bad_inode(struct inode *inode)
+bool is_bad_inode(struct inode *inode)
 {
 	return (inode->i_op == &bad_inode_ops);	
 }
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 46aedacfa..cc0e08252 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -42,7 +42,7 @@ static struct inode *befs_iget(struct super_block *, unsigned long);
 static struct inode *befs_alloc_inode(struct super_block *sb);
 static void befs_destroy_inode(struct inode *inode);
 static void befs_destroy_inodecache(void);
-static const char *befs_follow_link(struct dentry *, void **);
+static int befs_symlink_readpage(struct file *, struct page *);
 static int befs_utf2nls(struct super_block *sb, const char *in, int in_len,
 			char **out, int *out_len);
 static int befs_nls2utf(struct super_block *sb, const char *in, int in_len,
@@ -79,10 +79,8 @@ static const struct address_space_operations befs_aops = {
 	.bmap		= befs_bmap,
 };
 
-static const struct inode_operations befs_symlink_inode_operations = {
-	.readlink	= generic_readlink,
-	.follow_link	= befs_follow_link,
-	.put_link	= kfree_put_link,
+static const struct address_space_operations befs_symlink_aops = {
+	.readpage	= befs_symlink_readpage,
 };
 
 /* 
@@ -398,7 +396,9 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
 		inode->i_fop = &befs_dir_operations;
 	} else if (S_ISLNK(inode->i_mode)) {
 		if (befs_ino->i_flags & BEFS_LONG_SYMLINK) {
-			inode->i_op = &befs_symlink_inode_operations;
+			inode->i_op = &page_symlink_inode_operations;
+			inode_nohighmem(inode);
+			inode->i_mapping->a_ops = &befs_symlink_aops;
 		} else {
 			inode->i_link = befs_ino->i_data.symlink;
 			inode->i_op = &simple_symlink_inode_operations;
@@ -434,7 +434,7 @@ befs_init_inodecache(void)
 	befs_inode_cachep = kmem_cache_create("befs_inode_cache",
 					      sizeof (struct befs_inode_info),
 					      0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					      init_once);
 	if (befs_inode_cachep == NULL) {
 		pr_err("%s: Couldn't initialize inode slabcache\n", __func__);
@@ -463,31 +463,33 @@ befs_destroy_inodecache(void)
  * The data stream become link name. Unless the LONG_SYMLINK
  * flag is set.
  */
-static const char *
-befs_follow_link(struct dentry *dentry, void **cookie)
+static int befs_symlink_readpage(struct file *unused, struct page *page)
 {
-	struct super_block *sb = dentry->d_sb;
-	struct befs_inode_info *befs_ino = BEFS_I(d_inode(dentry));
+	struct inode *inode = page->mapping->host;
+	struct super_block *sb = inode->i_sb;
+	struct befs_inode_info *befs_ino = BEFS_I(inode);
 	befs_data_stream *data = &befs_ino->i_data.ds;
 	befs_off_t len = data->size;
-	char *link;
+	char *link = page_address(page);
 
-	if (len == 0) {
+	if (len == 0 || len > PAGE_SIZE) {
 		befs_error(sb, "Long symlink with illegal length");
-		return ERR_PTR(-EIO);
+		goto fail;
 	}
 	befs_debug(sb, "Follow long symlink");
 
-	link = kmalloc(len, GFP_NOFS);
-	if (!link)
-		return ERR_PTR(-ENOMEM);
 	if (befs_read_lsymlink(sb, data, link, len) != len) {
-		kfree(link);
 		befs_error(sb, "Failed to read entire long symlink");
-		return ERR_PTR(-EIO);
+		goto fail;
 	}
 	link[len - 1] = '\0';
-	return *cookie = link;
+	SetPageUptodate(page);
+	unlock_page(page);
+	return 0;
+fail:
+	SetPageError(page);
+	unlock_page(page);
+	return -EIO;
 }
 
 /*
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index fdcb4d69f..1e5c896f6 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -270,7 +270,7 @@ static int __init init_inodecache(void)
 	bfs_inode_cachep = kmem_cache_create("bfs_inode_cache",
 					     sizeof(struct bfs_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (bfs_inode_cachep == NULL)
 		return -ENOMEM;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 3a93755e8..7d914c67a 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -491,6 +491,7 @@ static inline int arch_elf_pt_proc(struct elfhdr *ehdr,
  * arch_check_elf() - check an ELF executable
  * @ehdr:	The main ELF header
  * @has_interp:	True if the ELF has an interpreter, else false.
+ * @interp_ehdr: The interpreter's ELF header
  * @state:	Architecture-specific state preserved throughout the process
  *		of loading the ELF.
  *
@@ -502,6 +503,7 @@ static inline int arch_elf_pt_proc(struct elfhdr *ehdr,
  *         with that return code.
  */
 static inline int arch_check_elf(struct elfhdr *ehdr, bool has_interp,
+				 struct elfhdr *interp_ehdr,
 				 struct arch_elf_state *state)
 {
 	/* Dummy implementation, always proceed */
@@ -651,7 +653,7 @@ static unsigned long randomize_stack_top(unsigned long stack_top)
 
 	if ((current->flags & PF_RANDOMIZE) &&
 		!(current->personality & ADDR_NO_RANDOMIZE)) {
-		random_variable = (unsigned long) get_random_int();
+		random_variable = get_random_long();
 		random_variable &= STACK_RND_MASK;
 		random_variable <<= PAGE_SHIFT;
 	}
@@ -829,7 +831,9 @@ static int load_elf_binary(struct linux_binprm *bprm)
 	 * still possible to return an error to the code that invoked
 	 * the exec syscall.
 	 */
-	retval = arch_check_elf(&loc->elf_ex, !!interpreter, &arch_state);
+	retval = arch_check_elf(&loc->elf_ex,
+				!!interpreter, &loc->interp_elf_ex,
+				&arch_state);
 	if (retval)
 		goto out_free_dentry;
 
diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c
index 78f005f37..3a3ced779 100644
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -638,11 +638,11 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
 	case 3:
 		/* Delete this handler. */
 		root = dget(file->f_path.dentry->d_sb->s_root);
-		mutex_lock(&d_inode(root)->i_mutex);
+		inode_lock(d_inode(root));
 
 		kill_node(e);
 
-		mutex_unlock(&d_inode(root)->i_mutex);
+		inode_unlock(d_inode(root));
 		dput(root);
 		break;
 	default:
@@ -675,7 +675,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
 		return PTR_ERR(e);
 
 	root = dget(sb->s_root);
-	mutex_lock(&d_inode(root)->i_mutex);
+	inode_lock(d_inode(root));
 	dentry = lookup_one_len(e->name, root, strlen(e->name));
 	err = PTR_ERR(dentry);
 	if (IS_ERR(dentry))
@@ -711,7 +711,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
 out2:
 	dput(dentry);
 out:
-	mutex_unlock(&d_inode(root)->i_mutex);
+	inode_unlock(d_inode(root));
 	dput(root);
 
 	if (err) {
@@ -754,12 +754,12 @@ static ssize_t bm_status_write(struct file *file, const char __user *buffer,
 	case 3:
 		/* Delete all handlers. */
 		root = dget(file->f_path.dentry->d_sb->s_root);
-		mutex_lock(&d_inode(root)->i_mutex);
+		inode_lock(d_inode(root));
 
 		while (!list_empty(&entries))
 			kill_node(list_entry(entries.next, Node, list));
 
-		mutex_unlock(&d_inode(root)->i_mutex);
+		inode_unlock(d_inode(root));
 		dput(root);
 		break;
 	default:
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 44d4a1e92..826b164a4 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -75,7 +75,7 @@ void kill_bdev(struct block_device *bdev)
 {
 	struct address_space *mapping = bdev->bd_inode->i_mapping;
 
-	if (mapping->nrpages == 0 && mapping->nrshadows == 0)
+	if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
 		return;
 
 	invalidate_bh_lrus();
@@ -156,11 +156,16 @@ blkdev_get_block(struct inode *inode, sector_t iblock,
 	return 0;
 }
 
+static struct inode *bdev_file_inode(struct file *file)
+{
+	return file->f_mapping->host;
+}
+
 static ssize_t
 blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
 {
 	struct file *file = iocb->ki_filp;
-	struct inode *inode = file->f_mapping->host;
+	struct inode *inode = bdev_file_inode(file);
 
 	if (IS_DAX(inode))
 		return dax_do_io(iocb, inode, iter, offset, blkdev_get_block,
@@ -338,18 +343,18 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping,
  */
 static loff_t block_llseek(struct file *file, loff_t offset, int whence)
 {
-	struct inode *bd_inode = file->f_mapping->host;
+	struct inode *bd_inode = bdev_file_inode(file);
 	loff_t retval;
 
-	mutex_lock(&bd_inode->i_mutex);
+	inode_lock(bd_inode);
 	retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode));
-	mutex_unlock(&bd_inode->i_mutex);
+	inode_unlock(bd_inode);
 	return retval;
 }
 	
 int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
 {
-	struct inode *bd_inode = filp->f_mapping->host;
+	struct inode *bd_inode = bdev_file_inode(filp);
 	struct block_device *bdev = I_BDEV(bd_inode);
 	int error;
 	
@@ -395,7 +400,7 @@ int bdev_read_page(struct block_device *bdev, sector_t sector,
 	if (!ops->rw_page || bdev_get_integrity(bdev))
 		return result;
 
-	result = blk_queue_enter(bdev->bd_queue, GFP_KERNEL);
+	result = blk_queue_enter(bdev->bd_queue, false);
 	if (result)
 		return result;
 	result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ);
@@ -432,7 +437,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector,
 
 	if (!ops->rw_page || bdev_get_integrity(bdev))
 		return -EOPNOTSUPP;
-	result = blk_queue_enter(bdev->bd_queue, GFP_KERNEL);
+	result = blk_queue_enter(bdev->bd_queue, false);
 	if (result)
 		return result;
 
@@ -450,10 +455,7 @@ EXPORT_SYMBOL_GPL(bdev_write_page);
 /**
  * bdev_direct_access() - Get the address for directly-accessibly memory
  * @bdev: The device containing the memory
- * @sector: The offset within the device
- * @addr: Where to put the address of the memory
- * @pfn: The Page Frame Number for the memory
- * @size: The number of bytes requested
+ * @dax: control and output parameters for ->direct_access
  *
  * If a block device is made up of directly addressable memory, this function
  * will tell the caller the PFN and the address of the memory.  The address
@@ -464,10 +466,10 @@ EXPORT_SYMBOL_GPL(bdev_write_page);
  * Return: negative errno if an error occurs, otherwise the number of bytes
  * accessible at this address.
  */
-long bdev_direct_access(struct block_device *bdev, sector_t sector,
-			void __pmem **addr, unsigned long *pfn, long size)
+long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax)
 {
-	long avail;
+	sector_t sector = dax->sector;
+	long avail, size = dax->size;
 	const struct block_device_operations *ops = bdev->bd_disk->fops;
 
 	/*
@@ -486,9 +488,11 @@ long bdev_direct_access(struct block_device *bdev, sector_t sector,
 	sector += get_start_sect(bdev);
 	if (sector % (PAGE_SIZE / 512))
 		return -EINVAL;
-	avail = ops->direct_access(bdev, sector, addr, pfn);
+	avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn);
 	if (!avail)
 		return -ERANGE;
+	if (avail > 0 && avail & ~PAGE_MASK)
+		return -ENXIO;
 	return min(avail, size);
 }
 EXPORT_SYMBOL_GPL(bdev_direct_access);
@@ -590,7 +594,7 @@ void __init bdev_cache_init(void)
 
 	bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
 			0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
-				SLAB_MEM_SPREAD|SLAB_PANIC),
+				SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC),
 			init_once);
 	err = register_filesystem(&bd_type);
 	if (err)
@@ -696,7 +700,7 @@ static struct block_device *bd_acquire(struct inode *inode)
 	spin_lock(&bdev_lock);
 	bdev = inode->i_bdev;
 	if (bdev) {
-		ihold(bdev->bd_inode);
+		bdgrab(bdev);
 		spin_unlock(&bdev_lock);
 		return bdev;
 	}
@@ -712,7 +716,7 @@ static struct block_device *bd_acquire(struct inode *inode)
 			 * So, we can access it via ->i_mapping always
 			 * without igrab().
 			 */
-			ihold(bdev->bd_inode);
+			bdgrab(bdev);
 			inode->i_bdev = bdev;
 			inode->i_mapping = bdev->bd_inode->i_mapping;
 			list_add(&inode->i_devices, &bdev->bd_inodes);
@@ -735,7 +739,7 @@ void bd_forget(struct inode *inode)
 	spin_unlock(&bdev_lock);
 
 	if (bdev)
-		iput(bdev->bd_inode);
+		bdput(bdev);
 }
 
 /**
@@ -1042,12 +1046,9 @@ EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
 static void flush_disk(struct block_device *bdev, bool kill_dirty)
 {
 	if (__invalidate_device(bdev, kill_dirty)) {
-		char name[BDEVNAME_SIZE] = "";
-
-		if (bdev->bd_disk)
-			disk_name(bdev->bd_disk, 0, name);
 		printk(KERN_WARNING "VFS: busy inodes on changed media or "
-		       "resized disk %s\n", name);
+		       "resized disk %s\n",
+		       bdev->bd_disk ? bdev->bd_disk->disk_name : "");
 	}
 
 	if (!bdev->bd_disk)
@@ -1071,12 +1072,9 @@ void check_disk_size_change(struct gendisk *disk, struct block_device *bdev)
 	disk_size = (loff_t)get_capacity(disk) << 9;
 	bdev_size = i_size_read(bdev->bd_inode);
 	if (disk_size != bdev_size) {
-		char name[BDEVNAME_SIZE];
-
-		disk_name(disk, 0, name);
 		printk(KERN_INFO
 		       "%s: detected capacity change from %lld to %lld\n",
-		       name, bdev_size, disk_size);
+		       disk->disk_name, bdev_size, disk_size);
 		i_size_write(bdev->bd_inode, disk_size);
 		flush_disk(bdev, false);
 	}
@@ -1144,9 +1142,9 @@ void bd_set_size(struct block_device *bdev, loff_t size)
 {
 	unsigned bsize = bdev_logical_block_size(bdev);
 
-	mutex_lock(&bdev->bd_inode->i_mutex);
+	inode_lock(bdev->bd_inode);
 	i_size_write(bdev->bd_inode, size);
-	mutex_unlock(&bdev->bd_inode->i_mutex);
+	inode_unlock(bdev->bd_inode);
 	while (bsize < PAGE_CACHE_SIZE) {
 		if (size & bsize)
 			break;
@@ -1203,7 +1201,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 		bdev->bd_disk = disk;
 		bdev->bd_queue = disk->queue;
 		bdev->bd_contains = bdev;
-		bdev->bd_inode->i_flags = disk->fops->direct_access ? S_DAX : 0;
+		if (IS_ENABLED(CONFIG_BLK_DEV_DAX) && disk->fops->direct_access)
+			bdev->bd_inode->i_flags = S_DAX;
+		else
+			bdev->bd_inode->i_flags = 0;
+
 		if (!partno) {
 			ret = -ENXIO;
 			bdev->bd_part = disk_get_part(disk, partno);
@@ -1230,8 +1232,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 				}
 			}
 
-			if (!ret)
+			if (!ret) {
 				bd_set_size(bdev,(loff_t)get_capacity(disk)<<9);
+				if (!blkdev_dax_capable(bdev))
+					bdev->bd_inode->i_flags &= ~S_DAX;
+			}
 
 			/*
 			 * If the device is invalidated, rescan partition
@@ -1245,6 +1250,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 				else if (ret == -ENOMEDIUM)
 					invalidate_partitions(disk, bdev);
 			}
+
 			if (ret)
 				goto out_clear;
 		} else {
@@ -1265,12 +1271,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 				goto out_clear;
 			}
 			bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9);
-			/*
-			 * If the partition is not aligned on a page
-			 * boundary, we can't do dax I/O to it.
-			 */
-			if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512)) ||
-			    (bdev->bd_part->nr_sects % (PAGE_SIZE / 512)))
+			if (!blkdev_dax_capable(bdev))
 				bdev->bd_inode->i_flags &= ~S_DAX;
 		}
 	} else {
@@ -1605,14 +1606,14 @@ EXPORT_SYMBOL(blkdev_put);
 
 static int blkdev_close(struct inode * inode, struct file * filp)
 {
-	struct block_device *bdev = I_BDEV(filp->f_mapping->host);
+	struct block_device *bdev = I_BDEV(bdev_file_inode(filp));
 	blkdev_put(bdev, filp->f_mode);
 	return 0;
 }
 
 static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 {
-	struct block_device *bdev = I_BDEV(file->f_mapping->host);
+	struct block_device *bdev = I_BDEV(bdev_file_inode(file));
 	fmode_t mode = file->f_mode;
 
 	/*
@@ -1637,7 +1638,7 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg)
 ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct file *file = iocb->ki_filp;
-	struct inode *bd_inode = file->f_mapping->host;
+	struct inode *bd_inode = bdev_file_inode(file);
 	loff_t size = i_size_read(bd_inode);
 	struct blk_plug plug;
 	ssize_t ret;
@@ -1669,7 +1670,7 @@ EXPORT_SYMBOL_GPL(blkdev_write_iter);
 ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
 {
 	struct file *file = iocb->ki_filp;
-	struct inode *bd_inode = file->f_mapping->host;
+	struct inode *bd_inode = bdev_file_inode(file);
 	loff_t size = i_size_read(bd_inode);
 	loff_t pos = iocb->ki_pos;
 
@@ -1696,25 +1697,102 @@ static int blkdev_releasepage(struct page *page, gfp_t wait)
 	return try_to_free_buffers(page);
 }
 
+static int blkdev_writepages(struct address_space *mapping,
+			     struct writeback_control *wbc)
+{
+	if (dax_mapping(mapping)) {
+		struct block_device *bdev = I_BDEV(mapping->host);
+
+		return dax_writeback_mapping_range(mapping, bdev, wbc);
+	}
+	return generic_writepages(mapping, wbc);
+}
+
 static const struct address_space_operations def_blk_aops = {
 	.readpage	= blkdev_readpage,
 	.readpages	= blkdev_readpages,
 	.writepage	= blkdev_writepage,
 	.write_begin	= blkdev_write_begin,
 	.write_end	= blkdev_write_end,
-	.writepages	= generic_writepages,
+	.writepages	= blkdev_writepages,
 	.releasepage	= blkdev_releasepage,
 	.direct_IO	= blkdev_direct_IO,
 	.is_dirty_writeback = buffer_check_dirty_writeback,
 };
 
+#ifdef CONFIG_FS_DAX
+/*
+ * In the raw block case we do not need to contend with truncation nor
+ * unwritten file extents.  Without those concerns there is no need for
+ * additional locking beyond the mmap_sem context that these routines
+ * are already executing under.
+ *
+ * Note, there is no protection if the block device is dynamically
+ * resized (partition grow/shrink) during a fault. A stable block device
+ * size is already not enforced in the blkdev_direct_IO path.
+ *
+ * For DAX, it is the responsibility of the block device driver to
+ * ensure the whole-disk device size is stable while requests are in
+ * flight.
+ *
+ * Finally, unlike the filemap_page_mkwrite() case there is no
+ * filesystem superblock to sync against freezing.  We still include a
+ * pfn_mkwrite callback for dax drivers to receive write fault
+ * notifications.
+ */
+static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	return __dax_fault(vma, vmf, blkdev_get_block, NULL);
+}
+
+static int blkdev_dax_pfn_mkwrite(struct vm_area_struct *vma,
+		struct vm_fault *vmf)
+{
+	return dax_pfn_mkwrite(vma, vmf);
+}
+
+static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
+		pmd_t *pmd, unsigned int flags)
+{
+	return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL);
+}
+
+static const struct vm_operations_struct blkdev_dax_vm_ops = {
+	.fault		= blkdev_dax_fault,
+	.pmd_fault	= blkdev_dax_pmd_fault,
+	.pfn_mkwrite	= blkdev_dax_pfn_mkwrite,
+};
+
+static const struct vm_operations_struct blkdev_default_vm_ops = {
+	.fault		= filemap_fault,
+	.map_pages	= filemap_map_pages,
+};
+
+static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	struct inode *bd_inode = bdev_file_inode(file);
+
+	file_accessed(file);
+	if (IS_DAX(bd_inode)) {
+		vma->vm_ops = &blkdev_dax_vm_ops;
+		vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
+	} else {
+		vma->vm_ops = &blkdev_default_vm_ops;
+	}
+
+	return 0;
+}
+#else
+#define blkdev_mmap generic_file_mmap
+#endif
+
 const struct file_operations def_blk_fops = {
 	.open		= blkdev_open,
 	.release	= blkdev_close,
 	.llseek		= block_llseek,
 	.read_iter	= blkdev_read_iter,
 	.write_iter	= blkdev_write_iter,
-	.mmap		= generic_file_mmap,
+	.mmap		= blkdev_mmap,
 	.fsync		= blkdev_fsync,
 	.unlocked_ioctl	= block_ioctl,
 #ifdef CONFIG_COMPAT
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 6d1d0b93b..128ce17a8 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -9,11 +9,12 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
 	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
 	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
-	   uuid-tree.o props.o hash.o
+	   uuid-tree.o props.o hash.o free-space-tree.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
 
 btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \
 	tests/extent-buffer-tests.o tests/btrfs-tests.o \
-	tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o
+	tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o \
+	tests/free-space-tree-tests.o
diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c
index 9a0124a95..6d263bb16 100644
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -37,10 +37,10 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 
 	switch (type) {
 	case ACL_TYPE_ACCESS:
-		name = POSIX_ACL_XATTR_ACCESS;
+		name = XATTR_NAME_POSIX_ACL_ACCESS;
 		break;
 	case ACL_TYPE_DEFAULT:
-		name = POSIX_ACL_XATTR_DEFAULT;
+		name = XATTR_NAME_POSIX_ACL_DEFAULT;
 		break;
 	default:
 		BUG();
@@ -48,7 +48,7 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type)
 
 	size = __btrfs_getxattr(inode, name, "", 0);
 	if (size > 0) {
-		value = kzalloc(size, GFP_NOFS);
+		value = kzalloc(size, GFP_KERNEL);
 		if (!value)
 			return ERR_PTR(-ENOMEM);
 		size = __btrfs_getxattr(inode, name, value, size);
@@ -81,7 +81,7 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
 
 	switch (type) {
 	case ACL_TYPE_ACCESS:
-		name = POSIX_ACL_XATTR_ACCESS;
+		name = XATTR_NAME_POSIX_ACL_ACCESS;
 		if (acl) {
 			ret = posix_acl_equiv_mode(acl, &inode->i_mode);
 			if (ret < 0)
@@ -94,7 +94,7 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
 	case ACL_TYPE_DEFAULT:
 		if (!S_ISDIR(inode->i_mode))
 			return acl ? -EINVAL : 0;
-		name = POSIX_ACL_XATTR_DEFAULT;
+		name = XATTR_NAME_POSIX_ACL_DEFAULT;
 		break;
 	default:
 		return -EINVAL;
@@ -102,7 +102,7 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
 
 	if (acl) {
 		size = posix_acl_xattr_size(acl->a_count);
-		value = kmalloc(size, GFP_NOFS);
+		value = kmalloc(size, GFP_KERNEL);
 		if (!value) {
 			ret = -ENOMEM;
 			goto out;
diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c
index 9aba42b78..5fb60ea7e 100644
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -97,7 +97,7 @@ static struct __btrfs_workqueue *
 __btrfs_alloc_workqueue(const char *name, unsigned int flags, int limit_active,
 			 int thresh)
 {
-	struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
+	struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL);
 
 	if (!ret)
 		return NULL;
@@ -148,7 +148,7 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name,
 					      int limit_active,
 					      int thresh)
 {
-	struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS);
+	struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL);
 
 	if (!ret)
 		return NULL;
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index e2f659dc5..f6dac40f8 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -520,13 +520,10 @@ static inline int ref_for_same_block(struct __prelim_ref *ref1,
 static int __add_missing_keys(struct btrfs_fs_info *fs_info,
 			      struct list_head *head)
 {
-	struct list_head *pos;
+	struct __prelim_ref *ref;
 	struct extent_buffer *eb;
 
-	list_for_each(pos, head) {
-		struct __prelim_ref *ref;
-		ref = list_entry(pos, struct __prelim_ref, list);
-
+	list_for_each_entry(ref, head, list) {
 		if (ref->parent)
 			continue;
 		if (ref->key_for_search.type)
@@ -563,23 +560,15 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info,
  */
 static void __merge_refs(struct list_head *head, int mode)
 {
-	struct list_head *pos1;
+	struct __prelim_ref *pos1;
 
-	list_for_each(pos1, head) {
-		struct list_head *n2;
-		struct list_head *pos2;
-		struct __prelim_ref *ref1;
+	list_for_each_entry(pos1, head, list) {
+		struct __prelim_ref *pos2 = pos1, *tmp;
 
-		ref1 = list_entry(pos1, struct __prelim_ref, list);
-
-		for (pos2 = pos1->next, n2 = pos2->next; pos2 != head;
-		     pos2 = n2, n2 = pos2->next) {
-			struct __prelim_ref *ref2;
-			struct __prelim_ref *xchg;
+		list_for_each_entry_safe_continue(pos2, tmp, head, list) {
+			struct __prelim_ref *xchg, *ref1 = pos1, *ref2 = pos2;
 			struct extent_inode_elem *eie;
 
-			ref2 = list_entry(pos2, struct __prelim_ref, list);
-
 			if (!ref_for_same_block(ref1, ref2))
 				continue;
 			if (mode == 1) {
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 0ef5cc13f..61205e3bb 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -192,6 +192,10 @@ struct btrfs_inode {
 	/* File creation time. */
 	struct timespec i_otime;
 
+	/* Hook into fs_info->delayed_iputs */
+	struct list_head delayed_iput;
+	long delayed_iput_count;
+
 	struct inode vfs_inode;
 };
 
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 0340c57bf..861d47256 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -531,13 +531,9 @@ static struct btrfsic_block *btrfsic_block_hashtable_lookup(
 	    (((unsigned int)(dev_bytenr >> 16)) ^
 	     ((unsigned int)((uintptr_t)bdev))) &
 	     (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1);
-	struct list_head *elem;
-
-	list_for_each(elem, h->table + hashval) {
-		struct btrfsic_block *const b =
-		    list_entry(elem, struct btrfsic_block,
-			       collision_resolving_node);
+	struct btrfsic_block *b;
 
+	list_for_each_entry(b, h->table + hashval, collision_resolving_node) {
 		if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr)
 			return b;
 	}
@@ -588,13 +584,9 @@ static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup(
 	     ((unsigned int)((uintptr_t)bdev_ref_to)) ^
 	     ((unsigned int)((uintptr_t)bdev_ref_from))) &
 	     (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1);
-	struct list_head *elem;
-
-	list_for_each(elem, h->table + hashval) {
-		struct btrfsic_block_link *const l =
-		    list_entry(elem, struct btrfsic_block_link,
-			       collision_resolving_node);
+	struct btrfsic_block_link *l;
 
+	list_for_each_entry(l, h->table + hashval, collision_resolving_node) {
 		BUG_ON(NULL == l->block_ref_to);
 		BUG_ON(NULL == l->block_ref_from);
 		if (l->block_ref_to->dev_state->bdev == bdev_ref_to &&
@@ -639,13 +631,9 @@ static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup(
 	const unsigned int hashval =
 	    (((unsigned int)((uintptr_t)bdev)) &
 	     (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1));
-	struct list_head *elem;
-
-	list_for_each(elem, h->table + hashval) {
-		struct btrfsic_dev_state *const ds =
-		    list_entry(elem, struct btrfsic_dev_state,
-			       collision_resolving_node);
+	struct btrfsic_dev_state *ds;
 
+	list_for_each_entry(ds, h->table + hashval, collision_resolving_node) {
 		if (ds->bdev == bdev)
 			return ds;
 	}
@@ -1720,29 +1708,20 @@ static int btrfsic_read_block(struct btrfsic_state *state,
 
 static void btrfsic_dump_database(struct btrfsic_state *state)
 {
-	struct list_head *elem_all;
+	const struct btrfsic_block *b_all;
 
 	BUG_ON(NULL == state);
 
 	printk(KERN_INFO "all_blocks_list:\n");
-	list_for_each(elem_all, &state->all_blocks_list) {
-		const struct btrfsic_block *const b_all =
-		    list_entry(elem_all, struct btrfsic_block,
-			       all_blocks_node);
-		struct list_head *elem_ref_to;
-		struct list_head *elem_ref_from;
+	list_for_each_entry(b_all, &state->all_blocks_list, all_blocks_node) {
+		const struct btrfsic_block_link *l;
 
 		printk(KERN_INFO "%c-block @%llu (%s/%llu/%d)\n",
 		       btrfsic_get_block_type(state, b_all),
 		       b_all->logical_bytenr, b_all->dev_state->name,
 		       b_all->dev_bytenr, b_all->mirror_num);
 
-		list_for_each(elem_ref_to, &b_all->ref_to_list) {
-			const struct btrfsic_block_link *const l =
-			    list_entry(elem_ref_to,
-				       struct btrfsic_block_link,
-				       node_ref_to);
-
+		list_for_each_entry(l, &b_all->ref_to_list, node_ref_to) {
 			printk(KERN_INFO " %c @%llu (%s/%llu/%d)"
 			       " refers %u* to"
 			       " %c @%llu (%s/%llu/%d)\n",
@@ -1757,12 +1736,7 @@ static void btrfsic_dump_database(struct btrfsic_state *state)
 			       l->block_ref_to->mirror_num);
 		}
 
-		list_for_each(elem_ref_from, &b_all->ref_from_list) {
-			const struct btrfsic_block_link *const l =
-			    list_entry(elem_ref_from,
-				       struct btrfsic_block_link,
-				       node_ref_from);
-
+		list_for_each_entry(l, &b_all->ref_from_list, node_ref_from) {
 			printk(KERN_INFO " %c @%llu (%s/%llu/%d)"
 			       " is ref %u* from"
 			       " %c @%llu (%s/%llu/%d)\n",
@@ -1845,8 +1819,7 @@ again:
 					       &state->block_hashtable);
 	if (NULL != block) {
 		u64 bytenr = 0;
-		struct list_head *elem_ref_to;
-		struct list_head *tmp_ref_to;
+		struct btrfsic_block_link *l, *tmp;
 
 		if (block->is_superblock) {
 			bytenr = btrfs_super_bytenr((struct btrfs_super_block *)
@@ -1967,13 +1940,8 @@ again:
 		 * because it still carries valueable information
 		 * like whether it was ever written and IO completed.
 		 */
-		list_for_each_safe(elem_ref_to, tmp_ref_to,
-				   &block->ref_to_list) {
-			struct btrfsic_block_link *const l =
-			    list_entry(elem_ref_to,
-				       struct btrfsic_block_link,
-				       node_ref_to);
-
+		list_for_each_entry_safe(l, tmp, &block->ref_to_list,
+					 node_ref_to) {
 			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
 				btrfsic_print_rem_link(state, l);
 			l->ref_cnt--;
@@ -2436,7 +2404,7 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
 					struct btrfsic_block *const block,
 					int recursion_level)
 {
-	struct list_head *elem_ref_to;
+	const struct btrfsic_block_link *l;
 	int ret = 0;
 
 	if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
@@ -2464,11 +2432,7 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
 	 * This algorithm is recursive because the amount of used stack
 	 * space is very small and the max recursion depth is limited.
 	 */
-	list_for_each(elem_ref_to, &block->ref_to_list) {
-		const struct btrfsic_block_link *const l =
-		    list_entry(elem_ref_to, struct btrfsic_block_link,
-			       node_ref_to);
-
+	list_for_each_entry(l, &block->ref_to_list, node_ref_to) {
 		if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
 			printk(KERN_INFO
 			       "rl=%d, %c @%llu (%s/%llu/%d)"
@@ -2561,7 +2525,7 @@ static int btrfsic_is_block_ref_by_superblock(
 		const struct btrfsic_block *block,
 		int recursion_level)
 {
-	struct list_head *elem_ref_from;
+	const struct btrfsic_block_link *l;
 
 	if (recursion_level >= 3 + BTRFS_MAX_LEVEL) {
 		/* refer to comment at "abort cyclic linkage (case 1)" */
@@ -2576,11 +2540,7 @@ static int btrfsic_is_block_ref_by_superblock(
 	 * This algorithm is recursive because the amount of used stack space
 	 * is very small and the max recursion depth is limited.
 	 */
-	list_for_each(elem_ref_from, &block->ref_from_list) {
-		const struct btrfsic_block_link *const l =
-		    list_entry(elem_ref_from, struct btrfsic_block_link,
-			       node_ref_from);
-
+	list_for_each_entry(l, &block->ref_from_list, node_ref_from) {
 		if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
 			printk(KERN_INFO
 			       "rl=%d, %c @%llu (%s/%llu/%d)"
@@ -2669,7 +2629,7 @@ static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
 				  const struct btrfsic_block *block,
 				  int indent_level)
 {
-	struct list_head *elem_ref_to;
+	const struct btrfsic_block_link *l;
 	int indent_add;
 	static char buf[80];
 	int cursor_position;
@@ -2704,11 +2664,7 @@ static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
 	}
 
 	cursor_position = indent_level;
-	list_for_each(elem_ref_to, &block->ref_to_list) {
-		const struct btrfsic_block_link *const l =
-		    list_entry(elem_ref_to, struct btrfsic_block_link,
-			       node_ref_to);
-
+	list_for_each_entry(l, &block->ref_to_list, node_ref_to) {
 		while (cursor_position < indent_level) {
 			printk(" ");
 			cursor_position++;
@@ -3165,8 +3121,7 @@ int btrfsic_mount(struct btrfs_root *root,
 void btrfsic_unmount(struct btrfs_root *root,
 		     struct btrfs_fs_devices *fs_devices)
 {
-	struct list_head *elem_all;
-	struct list_head *tmp_all;
+	struct btrfsic_block *b_all, *tmp_all;
 	struct btrfsic_state *state;
 	struct list_head *dev_head = &fs_devices->devices;
 	struct btrfs_device *device;
@@ -3206,20 +3161,12 @@ void btrfsic_unmount(struct btrfs_root *root,
 	 * just free all memory that was allocated dynamically.
 	 * Free the blocks and the block_links.
 	 */
-	list_for_each_safe(elem_all, tmp_all, &state->all_blocks_list) {
-		struct btrfsic_block *const b_all =
-		    list_entry(elem_all, struct btrfsic_block,
-			       all_blocks_node);
-		struct list_head *elem_ref_to;
-		struct list_head *tmp_ref_to;
-
-		list_for_each_safe(elem_ref_to, tmp_ref_to,
-				   &b_all->ref_to_list) {
-			struct btrfsic_block_link *const l =
-			    list_entry(elem_ref_to,
-				       struct btrfsic_block_link,
-				       node_ref_to);
+	list_for_each_entry_safe(b_all, tmp_all, &state->all_blocks_list,
+				 all_blocks_node) {
+		struct btrfsic_block_link *l, *tmp;
 
+		list_for_each_entry_safe(l, tmp, &b_all->ref_to_list,
+					 node_ref_to) {
 			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
 				btrfsic_print_rem_link(state, l);
 
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index c473c42d7..3346cd8f9 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -637,11 +637,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	faili = nr_pages - 1;
 	cb->nr_pages = nr_pages;
 
-	/* In the parent-locked case, we only locked the range we are
-	 * interested in.  In all other cases, we can opportunistically
-	 * cache decompressed data that goes beyond the requested range. */
-	if (!(bio_flags & EXTENT_BIO_PARENT_LOCKED))
-		add_ra_bio_pages(inode, em_start + em_len, cb);
+	add_ra_bio_pages(inode, em_start + em_len, cb);
 
 	/* include any pages we added in add_ra-bio_pages */
 	uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 5b8e235c4..769e0ff1b 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1555,7 +1555,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		return 0;
 	}
 
-	search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
+	search_start = buf->start & ~((u64)SZ_1G - 1);
 
 	if (parent)
 		btrfs_set_lock_blocking(parent);
@@ -2248,7 +2248,6 @@ static void reada_for_search(struct btrfs_root *root,
 	u64 target;
 	u64 nread = 0;
 	u64 gen;
-	int direction = path->reada;
 	struct extent_buffer *eb;
 	u32 nr;
 	u32 blocksize;
@@ -2276,16 +2275,16 @@ static void reada_for_search(struct btrfs_root *root,
 	nr = slot;
 
 	while (1) {
-		if (direction < 0) {
+		if (path->reada == READA_BACK) {
 			if (nr == 0)
 				break;
 			nr--;
-		} else if (direction > 0) {
+		} else if (path->reada == READA_FORWARD) {
 			nr++;
 			if (nr >= nritems)
 				break;
 		}
-		if (path->reada < 0 && objectid) {
+		if (path->reada == READA_BACK && objectid) {
 			btrfs_node_key(node, &disk_key, nr);
 			if (btrfs_disk_key_objectid(&disk_key) != objectid)
 				break;
@@ -2493,7 +2492,7 @@ read_block_for_search(struct btrfs_trans_handle *trans,
 	btrfs_set_path_blocking(p);
 
 	free_extent_buffer(tmp);
-	if (p->reada)
+	if (p->reada != READA_NONE)
 		reada_for_search(root, p, level, slot, key->objectid);
 
 	btrfs_release_path(p);
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 385b449fd..bfe4a337f 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -35,6 +35,7 @@
 #include <linux/btrfs.h>
 #include <linux/workqueue.h>
 #include <linux/security.h>
+#include <linux/sizes.h>
 #include "extent_io.h"
 #include "extent_map.h"
 #include "async-thread.h"
@@ -96,6 +97,9 @@ struct btrfs_ordered_sum;
 /* for storing items that use the BTRFS_UUID_KEY* types */
 #define BTRFS_UUID_TREE_OBJECTID 9ULL
 
+/* tracks free space in block groups. */
+#define BTRFS_FREE_SPACE_TREE_OBJECTID 10ULL
+
 /* for storing balance parameters in the root tree */
 #define BTRFS_BALANCE_OBJECTID -4ULL
 
@@ -174,7 +178,7 @@ struct btrfs_ordered_sum;
 /* csum types */
 #define BTRFS_CSUM_TYPE_CRC32	0
 
-static int btrfs_csum_sizes[] = { 4 };
+static const int btrfs_csum_sizes[] = { 4 };
 
 /* four bytes for CRC32 */
 #define BTRFS_EMPTY_DIR_SIZE 0
@@ -196,9 +200,9 @@ static int btrfs_csum_sizes[] = { 4 };
 /* ioprio of readahead is set to idle */
 #define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0))
 
-#define BTRFS_DIRTY_METADATA_THRESH	(32 * 1024 * 1024)
+#define BTRFS_DIRTY_METADATA_THRESH	SZ_32M
 
-#define BTRFS_MAX_EXTENT_SIZE (128 * 1024 * 1024)
+#define BTRFS_MAX_EXTENT_SIZE SZ_128M
 
 /*
  * The key defines the order in the tree, and so it also defines (optimal)
@@ -500,6 +504,8 @@ struct btrfs_super_block {
  * Compat flags that we support.  If any incompat flags are set other than the
  * ones specified below then we will fail to mount
  */
+#define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE	(1ULL << 0)
+
 #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF	(1ULL << 0)
 #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL	(1ULL << 1)
 #define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS	(1ULL << 2)
@@ -526,7 +532,10 @@ struct btrfs_super_block {
 #define BTRFS_FEATURE_COMPAT_SUPP		0ULL
 #define BTRFS_FEATURE_COMPAT_SAFE_SET		0ULL
 #define BTRFS_FEATURE_COMPAT_SAFE_CLEAR		0ULL
-#define BTRFS_FEATURE_COMPAT_RO_SUPP		0ULL
+
+#define BTRFS_FEATURE_COMPAT_RO_SUPP			\
+	(BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE)
+
 #define BTRFS_FEATURE_COMPAT_RO_SAFE_SET	0ULL
 #define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR	0ULL
 
@@ -590,14 +599,15 @@ struct btrfs_node {
  * The slots array records the index of the item or block pointer
  * used while walking the tree.
  */
+enum { READA_NONE = 0, READA_BACK, READA_FORWARD };
 struct btrfs_path {
 	struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
 	int slots[BTRFS_MAX_LEVEL];
 	/* if there is real range locking, this locks field will change */
-	int locks[BTRFS_MAX_LEVEL];
-	int reada;
+	u8 locks[BTRFS_MAX_LEVEL];
+	u8 reada;
 	/* keep some upper locks as we walk down */
-	int lowest_level;
+	u8 lowest_level;
 
 	/*
 	 * set by btrfs_split_item, tells search_slot to keep all locks
@@ -1088,6 +1098,13 @@ struct btrfs_block_group_item {
 	__le64 flags;
 } __attribute__ ((__packed__));
 
+struct btrfs_free_space_info {
+	__le32 extent_count;
+	__le32 flags;
+} __attribute__ ((__packed__));
+
+#define BTRFS_FREE_SPACE_USING_BITMAPS (1ULL << 0)
+
 #define BTRFS_QGROUP_LEVEL_SHIFT		48
 static inline u64 btrfs_qgroup_level(u64 qgroupid)
 {
@@ -1296,6 +1313,9 @@ struct btrfs_caching_control {
 	atomic_t count;
 };
 
+/* Once caching_thread() finds this much free space, it will wake up waiters. */
+#define CACHING_CTL_WAKE_UP SZ_2M
+
 struct btrfs_io_ctl {
 	void *cur, *orig;
 	struct page *page;
@@ -1321,8 +1341,20 @@ struct btrfs_block_group_cache {
 	u64 delalloc_bytes;
 	u64 bytes_super;
 	u64 flags;
-	u64 sectorsize;
 	u64 cache_generation;
+	u32 sectorsize;
+
+	/*
+	 * If the free space extent count exceeds this number, convert the block
+	 * group to bitmaps.
+	 */
+	u32 bitmap_high_thresh;
+
+	/*
+	 * If the free space extent count drops below this number, convert the
+	 * block group back to extents.
+	 */
+	u32 bitmap_low_thresh;
 
 	/*
 	 * It is just used for the delayed data space allocation because
@@ -1378,6 +1410,15 @@ struct btrfs_block_group_cache {
 	struct list_head io_list;
 
 	struct btrfs_io_ctl io_ctl;
+
+	/* Lock for free space tree operations. */
+	struct mutex free_space_lock;
+
+	/*
+	 * Does the block group need to be added to the free space tree?
+	 * Protected by free_space_lock.
+	 */
+	int needs_free_space;
 };
 
 /* delayed seq elem */
@@ -1429,6 +1470,7 @@ struct btrfs_fs_info {
 	struct btrfs_root *csum_root;
 	struct btrfs_root *quota_root;
 	struct btrfs_root *uuid_root;
+	struct btrfs_root *free_space_root;
 
 	/* the log root tree is a directory of all the other log roots */
 	struct btrfs_root *log_root_tree;
@@ -1816,6 +1858,8 @@ struct btrfs_fs_info {
 	 * and will be latter freed. Protected by fs_info->chunk_mutex.
 	 */
 	struct list_head pinned_chunks;
+
+	int creating_free_space_tree;
 };
 
 struct btrfs_subvolume_writers {
@@ -2092,6 +2136,27 @@ struct btrfs_ioctl_defrag_range_args {
  */
 #define BTRFS_BLOCK_GROUP_ITEM_KEY 192
 
+/*
+ * Every block group is represented in the free space tree by a free space info
+ * item, which stores some accounting information. It is keyed on
+ * (block_group_start, FREE_SPACE_INFO, block_group_length).
+ */
+#define BTRFS_FREE_SPACE_INFO_KEY 198
+
+/*
+ * A free space extent tracks an extent of space that is free in a block group.
+ * It is keyed on (start, FREE_SPACE_EXTENT, length).
+ */
+#define BTRFS_FREE_SPACE_EXTENT_KEY 199
+
+/*
+ * When a block group becomes very fragmented, we convert it to use bitmaps
+ * instead of extents. A free space bitmap is keyed on
+ * (start, FREE_SPACE_BITMAP, length); the corresponding item is a bitmap with
+ * (length / sectorsize) bits.
+ */
+#define BTRFS_FREE_SPACE_BITMAP_KEY 200
+
 #define BTRFS_DEV_EXTENT_KEY	204
 #define BTRFS_DEV_ITEM_KEY	216
 #define BTRFS_CHUNK_ITEM_KEY	228
@@ -2184,6 +2249,7 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_MOUNT_RESCAN_UUID_TREE	(1 << 23)
 #define BTRFS_MOUNT_FRAGMENT_DATA	(1 << 24)
 #define BTRFS_MOUNT_FRAGMENT_METADATA	(1 << 25)
+#define BTRFS_MOUNT_FREE_SPACE_TREE	(1 << 26)
 
 #define BTRFS_DEFAULT_COMMIT_INTERVAL	(30)
 #define BTRFS_DEFAULT_MAX_INLINE	(8192)
@@ -2506,6 +2572,11 @@ BTRFS_SETGET_FUNCS(disk_block_group_flags,
 BTRFS_SETGET_STACK_FUNCS(block_group_flags,
 			struct btrfs_block_group_item, flags, 64);
 
+/* struct btrfs_free_space_info */
+BTRFS_SETGET_FUNCS(free_space_extent_count, struct btrfs_free_space_info,
+		   extent_count, 32);
+BTRFS_SETGET_FUNCS(free_space_flags, struct btrfs_free_space_info, flags, 32);
+
 /* struct btrfs_inode_ref */
 BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
 BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64);
@@ -3570,9 +3641,13 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
 int __get_raid_index(u64 flags);
 int btrfs_start_write_no_snapshoting(struct btrfs_root *root);
 void btrfs_end_write_no_snapshoting(struct btrfs_root *root);
+void btrfs_wait_for_snapshot_creation(struct btrfs_root *root);
 void check_system_chunk(struct btrfs_trans_handle *trans,
 			struct btrfs_root *root,
 			const u64 type);
+u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
+		       struct btrfs_fs_info *info, u64 start, u64 end);
+
 /* ctree.c */
 int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key,
 		     int level, int *slot);
@@ -3737,6 +3812,7 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info)
 	kfree(fs_info->csum_root);
 	kfree(fs_info->quota_root);
 	kfree(fs_info->uuid_root);
+	kfree(fs_info->free_space_root);
 	kfree(fs_info->super_copy);
 	kfree(fs_info->super_for_commit);
 	security_free_mnt_opts(&fs_info->security_opts);
@@ -3906,7 +3982,6 @@ void btrfs_extent_item_to_extent_map(struct inode *inode,
 /* inode.c */
 struct btrfs_delalloc_work {
 	struct inode *inode;
-	int wait;
 	int delay_iput;
 	struct completion completion;
 	struct list_head list;
@@ -3914,7 +3989,7 @@ struct btrfs_delalloc_work {
 };
 
 struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
-						    int wait, int delay_iput);
+						    int delay_iput);
 void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
 
 struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
@@ -4024,7 +4099,8 @@ void btrfs_get_block_group_info(struct list_head *groups_list,
 				struct btrfs_ioctl_space_info *space);
 void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
 			       struct btrfs_ioctl_balance_args *bargs);
-
+ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen,
+			   struct file *dst_file, u64 dst_loff);
 
 /* file.c */
 int btrfs_auto_defrag_init(void);
@@ -4055,6 +4131,11 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode,
 		      loff_t pos, size_t write_bytes,
 		      struct extent_state **cached);
 int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
+ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in,
+			      struct file *file_out, loff_t pos_out,
+			      size_t len, unsigned int flags);
+int btrfs_clone_file_range(struct file *file_in, loff_t pos_in,
+			   struct file *file_out, loff_t pos_out, u64 len);
 
 /* tree-defrag.c */
 int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
@@ -4247,16 +4328,98 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info,
 	}
 }
 
+#define btrfs_clear_fs_incompat(__fs_info, opt) \
+	__btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
+
+/* Clear incompat feature bit @flag in fs_info->super_copy if it is set. */
+static inline void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info,
+					     u64 flag)
+{
+	struct btrfs_super_block *sb = fs_info->super_copy;
+	u64 cur;
+
+	/* Unlocked check first: nothing to do when the flag is not set. */
+	if (!(btrfs_super_incompat_flags(sb) & flag))
+		return;
+
+	spin_lock(&fs_info->super_lock);
+	/* Re-read under super_lock before modifying. */
+	cur = btrfs_super_incompat_flags(sb);
+	if (cur & flag) {
+		btrfs_set_super_incompat_flags(sb, cur & ~flag);
+		btrfs_info(fs_info, "clearing %llu feature flag", flag);
+	}
+	spin_unlock(&fs_info->super_lock);
+}
+
 #define btrfs_fs_incompat(fs_info, opt) \
 	__btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
 
-static inline int __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag)
+static inline bool __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag)
 {
 	struct btrfs_super_block *disk_super;
 	disk_super = fs_info->super_copy;
 	return !!(btrfs_super_incompat_flags(disk_super) & flag);
 }
 
+#define btrfs_set_fs_compat_ro(__fs_info, opt) \
+	__btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt)
+
+/* Set compat_ro feature bit @flag in fs_info->super_copy unless already set. */
+static inline void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info,
+					    u64 flag)
+{
+	struct btrfs_super_block *sb = fs_info->super_copy;
+	u64 cur;
+
+	/* Unlocked check first: nothing to do when the flag is already set. */
+	if (btrfs_super_compat_ro_flags(sb) & flag)
+		return;
+
+	spin_lock(&fs_info->super_lock);
+	/* Re-read under super_lock before modifying. */
+	cur = btrfs_super_compat_ro_flags(sb);
+	if (!(cur & flag)) {
+		btrfs_set_super_compat_ro_flags(sb, cur | flag);
+		btrfs_info(fs_info, "setting %llu ro feature flag", flag);
+	}
+	spin_unlock(&fs_info->super_lock);
+}
+
+#define btrfs_clear_fs_compat_ro(__fs_info, opt) \
+	__btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt)
+
+/* Clear compat_ro feature bit @flag in fs_info->super_copy if it is set. */
+static inline void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info,
+					      u64 flag)
+{
+	struct btrfs_super_block *sb = fs_info->super_copy;
+	u64 cur;
+
+	/* Unlocked check first: nothing to do when the flag is not set. */
+	if (!(btrfs_super_compat_ro_flags(sb) & flag))
+		return;
+
+	spin_lock(&fs_info->super_lock);
+	/* Re-read under super_lock before modifying. */
+	cur = btrfs_super_compat_ro_flags(sb);
+	if (cur & flag) {
+		btrfs_set_super_compat_ro_flags(sb, cur & ~flag);
+		btrfs_info(fs_info, "clearing %llu ro feature flag", flag);
+	}
+	spin_unlock(&fs_info->super_lock);
+}
+
+#define btrfs_fs_compat_ro(fs_info, opt) \
+	__btrfs_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt)
+
+/* Test whether @flag is set in the compat_ro flags of fs_info->super_copy. */
+static inline bool __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag)
+{
+	struct btrfs_super_block *disk_super = fs_info->super_copy;
+	return !!(btrfs_super_compat_ro_flags(disk_super) & flag);
+}
+
 /*
  * Call btrfs_abort_transaction as early as possible when an error condition is
  * detected, that way the exact line number is reported.
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 02b934d0e..b57daa895 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -54,16 +54,11 @@ static inline void btrfs_init_delayed_node(
 	delayed_node->root = root;
 	delayed_node->inode_id = inode_id;
 	atomic_set(&delayed_node->refs, 0);
-	delayed_node->count = 0;
-	delayed_node->flags = 0;
 	delayed_node->ins_root = RB_ROOT;
 	delayed_node->del_root = RB_ROOT;
 	mutex_init(&delayed_node->mutex);
-	delayed_node->index_cnt = 0;
 	INIT_LIST_HEAD(&delayed_node->n_list);
 	INIT_LIST_HEAD(&delayed_node->p_list);
-	delayed_node->bytes_reserved = 0;
-	memset(&delayed_node->inode_item, 0, sizeof(delayed_node->inode_item));
 }
 
 static inline int btrfs_is_continuous_delayed_item(
@@ -132,7 +127,7 @@ again:
 	if (node)
 		return node;
 
-	node = kmem_cache_alloc(delayed_node_cache, GFP_NOFS);
+	node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS);
 	if (!node)
 		return ERR_PTR(-ENOMEM);
 	btrfs_init_delayed_node(node, root, ino);
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index e06dd75ad..914ac13bd 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -493,12 +493,12 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
 				memcpy(&existing_ref->extent_op->key,
 				       &ref->extent_op->key,
 				       sizeof(ref->extent_op->key));
-				existing_ref->extent_op->update_key = 1;
+				existing_ref->extent_op->update_key = true;
 			}
 			if (ref->extent_op->update_flags) {
 				existing_ref->extent_op->flags_to_set |=
 					ref->extent_op->flags_to_set;
-				existing_ref->extent_op->update_flags = 1;
+				existing_ref->extent_op->update_flags = true;
 			}
 			btrfs_free_delayed_extent_op(ref->extent_op);
 		}
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
index 00ed02cbf..c24b653c7 100644
--- a/fs/btrfs/delayed-ref.h
+++ b/fs/btrfs/delayed-ref.h
@@ -75,11 +75,11 @@ struct btrfs_delayed_ref_node {
 
 struct btrfs_delayed_extent_op {
 	struct btrfs_disk_key key;
+	u8 level;
+	bool update_key;
+	bool update_flags;
+	bool is_data;
 	u64 flags_to_set;
-	int level;
-	unsigned int update_key:1;
-	unsigned int update_flags:1;
-	unsigned int is_data:1;
 };
 
 /*
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 1e668fb7d..cbb7dbfb3 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -614,7 +614,7 @@ static void btrfs_dev_replace_update_device_in_mapping_tree(
 		em = lookup_extent_mapping(em_tree, start, (u64)-1);
 		if (!em)
 			break;
-		map = (struct map_lookup *)em->bdev;
+		map = em->map_lookup;
 		for (i = 0; i < map->num_stripes; i++)
 			if (srcdev == map->stripes[i].dev)
 				map->stripes[i].dev = tgtdev;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 41fb43183..4545e2e2a 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -42,6 +42,7 @@
 #include "locking.h"
 #include "tree-log.h"
 #include "free-space-cache.h"
+#include "free-space-tree.h"
 #include "inode-map.h"
 #include "check-integrity.h"
 #include "rcu-string.h"
@@ -54,6 +55,12 @@
 #include <asm/cpufeature.h>
 #endif
 
+#define BTRFS_SUPER_FLAG_SUPP	(BTRFS_HEADER_FLAG_WRITTEN |\
+				 BTRFS_HEADER_FLAG_RELOC |\
+				 BTRFS_SUPER_FLAG_ERROR |\
+				 BTRFS_SUPER_FLAG_SEEDING |\
+				 BTRFS_SUPER_FLAG_METADUMP)
+
 static const struct extent_io_ops btree_extent_io_ops;
 static void end_workqueue_fn(struct btrfs_work *work);
 static void free_fs_root(struct btrfs_root *root);
@@ -175,6 +182,7 @@ static struct btrfs_lockdep_keyset {
 	{ .id = BTRFS_TREE_RELOC_OBJECTID,	.name_stem = "treloc"	},
 	{ .id = BTRFS_DATA_RELOC_TREE_OBJECTID,	.name_stem = "dreloc"	},
 	{ .id = BTRFS_UUID_TREE_OBJECTID,	.name_stem = "uuid"	},
+	{ .id = BTRFS_FREE_SPACE_TREE_OBJECTID,	.name_stem = "free-space" },
 	{ .id = 0,				.name_stem = "tree"	},
 };
 
@@ -362,7 +370,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
 	}
 
 	lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1,
-			 0, &cached_state);
+			 &cached_state);
 	if (extent_buffer_uptodate(eb) &&
 	    btrfs_header_generation(eb) == parent_transid) {
 		ret = 0;
@@ -923,7 +931,7 @@ static int check_async_write(struct inode *inode, unsigned long bio_flags)
 	if (bio_flags & EXTENT_BIO_TREE_LOG)
 		return 0;
 #ifdef CONFIG_X86
-	if (cpu_has_xmm4_2)
+	if (static_cpu_has_safe(X86_FEATURE_XMM4_2))
 		return 0;
 #endif
 	return 1;
@@ -1665,6 +1673,9 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info,
 	if (location->objectid == BTRFS_UUID_TREE_OBJECTID)
 		return fs_info->uuid_root ? fs_info->uuid_root :
 					    ERR_PTR(-ENOENT);
+	if (location->objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
+		return fs_info->free_space_root ? fs_info->free_space_root :
+						  ERR_PTR(-ENOENT);
 again:
 	root = btrfs_lookup_fs_root(fs_info, location->objectid);
 	if (root) {
@@ -2165,6 +2176,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
 	free_root_extent_buffers(info->uuid_root);
 	if (chunk_root)
 		free_root_extent_buffers(info->chunk_root);
+	free_root_extent_buffers(info->free_space_root);
 }
 
 void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
@@ -2465,6 +2477,15 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info,
 		fs_info->uuid_root = root;
 	}
 
+	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+		location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID;
+		root = btrfs_read_tree_root(tree_root, &location);
+		if (IS_ERR(root))
+			return PTR_ERR(root);
+		set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
+		fs_info->free_space_root = root;
+	}
+
 	return 0;
 }
 
@@ -2745,26 +2766,6 @@ int open_ctree(struct super_block *sb,
 		goto fail_alloc;
 	}
 
-	/*
-	 * Leafsize and nodesize were always equal, this is only a sanity check.
-	 */
-	if (le32_to_cpu(disk_super->__unused_leafsize) !=
-	    btrfs_super_nodesize(disk_super)) {
-		printk(KERN_ERR "BTRFS: couldn't mount because metadata "
-		       "blocksizes don't match.  node %d leaf %d\n",
-		       btrfs_super_nodesize(disk_super),
-		       le32_to_cpu(disk_super->__unused_leafsize));
-		err = -EINVAL;
-		goto fail_alloc;
-	}
-	if (btrfs_super_nodesize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) {
-		printk(KERN_ERR "BTRFS: couldn't mount because metadata "
-		       "blocksize (%d) was too large\n",
-		       btrfs_super_nodesize(disk_super));
-		err = -EINVAL;
-		goto fail_alloc;
-	}
-
 	features = btrfs_super_incompat_flags(disk_super);
 	features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF;
 	if (tree_root->fs_info->compress_type == BTRFS_COMPRESS_LZO)
@@ -2827,7 +2828,7 @@ int open_ctree(struct super_block *sb,
 
 	fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super);
 	fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages,
-				    4 * 1024 * 1024 / PAGE_CACHE_SIZE);
+				    SZ_4M / PAGE_CACHE_SIZE);
 
 	tree_root->nodesize = nodesize;
 	tree_root->sectorsize = sectorsize;
@@ -2836,17 +2837,6 @@ int open_ctree(struct super_block *sb,
 	sb->s_blocksize = sectorsize;
 	sb->s_blocksize_bits = blksize_bits(sectorsize);
 
-	if (btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
-		printk(KERN_ERR "BTRFS: valid FS not found on %s\n", sb->s_id);
-		goto fail_sb_buffer;
-	}
-
-	if (sectorsize != PAGE_SIZE) {
-		printk(KERN_ERR "BTRFS: incompatible sector size (%lu) "
-		       "found on %s\n", (unsigned long)sectorsize, sb->s_id);
-		goto fail_sb_buffer;
-	}
-
 	mutex_lock(&fs_info->chunk_mutex);
 	ret = btrfs_read_sys_array(tree_root);
 	mutex_unlock(&fs_info->chunk_mutex);
@@ -3081,6 +3071,18 @@ retry_root_backup:
 	if (sb->s_flags & MS_RDONLY)
 		return 0;
 
+	if (btrfs_test_opt(tree_root, FREE_SPACE_TREE) &&
+	    !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+		pr_info("BTRFS: creating free space tree\n");
+		ret = btrfs_create_free_space_tree(fs_info);
+		if (ret) {
+			pr_warn("BTRFS: failed to create free space tree %d\n",
+				ret);
+			close_ctree(tree_root);
+			return ret;
+		}
+	}
+
 	down_read(&fs_info->cleanup_work_sem);
 	if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) ||
 	    (ret = btrfs_orphan_cleanup(fs_info->tree_root))) {
@@ -3106,6 +3108,18 @@ retry_root_backup:
 
 	btrfs_qgroup_rescan_resume(fs_info);
 
+	if (btrfs_test_opt(tree_root, CLEAR_CACHE) &&
+	    btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
+		pr_info("BTRFS: clearing free space tree\n");
+		ret = btrfs_clear_free_space_tree(fs_info);
+		if (ret) {
+			pr_warn("BTRFS: failed to clear free space tree %d\n",
+				ret);
+			close_ctree(tree_root);
+			return ret;
+		}
+	}
+
 	if (!fs_info->uuid_root) {
 		pr_info("BTRFS: creating UUID tree\n");
 		ret = btrfs_create_uuid_tree(fs_info);
@@ -3932,11 +3946,6 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
 	return !ret;
 }
 
-int btrfs_set_buffer_uptodate(struct extent_buffer *buf)
-{
-	return set_extent_buffer_uptodate(buf);
-}
-
 void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 {
 	struct btrfs_root *root;
@@ -3992,7 +4001,6 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root,
 		balance_dirty_pages_ratelimited(
 				   root->fs_info->btree_inode->i_mapping);
 	}
-	return;
 }
 
 void btrfs_btree_balance_dirty(struct btrfs_root *root)
@@ -4015,8 +4023,17 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
 			      int read_only)
 {
 	struct btrfs_super_block *sb = fs_info->super_copy;
+	u64 nodesize = btrfs_super_nodesize(sb);
+	u64 sectorsize = btrfs_super_sectorsize(sb);
 	int ret = 0;
 
+	if (btrfs_super_magic(sb) != BTRFS_MAGIC) {
+		printk(KERN_ERR "BTRFS: no valid FS found\n");
+		ret = -EINVAL;
+	}
+	if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP)
+		printk(KERN_WARNING "BTRFS: unrecognized super flag: %llu\n",
+				btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP);
 	if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) {
 		printk(KERN_ERR "BTRFS: tree_root level too big: %d >= %d\n",
 				btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
@@ -4034,31 +4051,46 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info,
 	}
 
 	/*
-	 * The common minimum, we don't know if we can trust the nodesize/sectorsize
-	 * items yet, they'll be verified later. Issue just a warning.
+	 * Check sectorsize and nodesize first; the other checks depend on them.
+	 * Check against all possible sectorsizes (4K, 8K, 16K, 32K, 64K) here.
 	 */
-	if (!IS_ALIGNED(btrfs_super_root(sb), 4096))
+	if (!is_power_of_2(sectorsize) || sectorsize < 4096 ||
+	    sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) {
+		printk(KERN_ERR "BTRFS: invalid sectorsize %llu\n", sectorsize);
+		ret = -EINVAL;
+	}
+	/* Only a sectorsize equal to the page size is supported so far */
+	if (sectorsize != PAGE_CACHE_SIZE) {
+		printk(KERN_ERR "BTRFS: sectorsize %llu not supported yet, only support %lu\n",
+				sectorsize, PAGE_CACHE_SIZE);
+		ret = -EINVAL;
+	}
+	if (!is_power_of_2(nodesize) || nodesize < sectorsize ||
+	    nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) {
+		printk(KERN_ERR "BTRFS: invalid nodesize %llu\n", nodesize);
+		ret = -EINVAL;
+	}
+	if (nodesize != le32_to_cpu(sb->__unused_leafsize)) {
+		printk(KERN_ERR "BTRFS: invalid leafsize %u, should be %llu\n",
+				le32_to_cpu(sb->__unused_leafsize),
+				nodesize);
+		ret = -EINVAL;
+	}
+
+	/* Root alignment check */
+	if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) {
 		printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n",
 				btrfs_super_root(sb));
-	if (!IS_ALIGNED(btrfs_super_chunk_root(sb), 4096))
+		ret = -EINVAL;
+	}
+	if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) {
 		printk(KERN_WARNING "BTRFS: chunk_root block unaligned: %llu\n",
 				btrfs_super_chunk_root(sb));
-	if (!IS_ALIGNED(btrfs_super_log_root(sb), 4096))
-		printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n",
-				btrfs_super_log_root(sb));
-
-	/*
-	 * Check the lower bound, the alignment and other constraints are
-	 * checked later.
-	 */
-	if (btrfs_super_nodesize(sb) < 4096) {
-		printk(KERN_ERR "BTRFS: nodesize too small: %u < 4096\n",
-				btrfs_super_nodesize(sb));
 		ret = -EINVAL;
 	}
-	if (btrfs_super_sectorsize(sb) < 4096) {
-		printk(KERN_ERR "BTRFS: sectorsize too small: %u < 4096\n",
-				btrfs_super_sectorsize(sb));
+	if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) {
+		printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n",
+				btrfs_super_log_root(sb));
 		ret = -EINVAL;
 	}
 
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index adeb31830..8e79d0070 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -19,7 +19,7 @@
 #ifndef __DISKIO__
 #define __DISKIO__
 
-#define BTRFS_SUPER_INFO_OFFSET (64 * 1024)
+#define BTRFS_SUPER_INFO_OFFSET SZ_64K
 #define BTRFS_SUPER_INFO_SIZE 4096
 
 #define BTRFS_SUPER_MIRROR_MAX	 3
@@ -35,7 +35,7 @@ enum btrfs_wq_endio_type {
 
 static inline u64 btrfs_sb_offset(int mirror)
 {
-	u64 start = 16 * 1024;
+	u64 start = SZ_16K;
 	if (mirror)
 		return start << (BTRFS_SUPER_MIRROR_SHIFT * mirror);
 	return BTRFS_SUPER_INFO_OFFSET;
@@ -116,7 +116,6 @@ static inline void btrfs_put_fs_root(struct btrfs_root *root)
 void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
 int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
 			  int atomic);
-int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
 int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid);
 u32 btrfs_csum_data(char *data, u32 seed, size_t len);
 void btrfs_csum_final(u32 crc, char *result);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 2368cac11..e2287c7c1 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -33,6 +33,7 @@
 #include "raid56.h"
 #include "locking.h"
 #include "free-space-cache.h"
+#include "free-space-tree.h"
 #include "math.h"
 #include "sysfs.h"
 #include "qgroup.h"
@@ -357,8 +358,8 @@ static void fragment_free_space(struct btrfs_root *root,
  * we need to check the pinned_extents for any extents that can't be used yet
  * since their free space will be released as soon as the transaction commits.
  */
-static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
-			      struct btrfs_fs_info *info, u64 start, u64 end)
+u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
+		       struct btrfs_fs_info *info, u64 start, u64 end)
 {
 	u64 extent_start, extent_end, size, total_added = 0;
 	int ret;
@@ -395,11 +396,10 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
 	return total_added;
 }
 
-static noinline void caching_thread(struct btrfs_work *work)
+static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl)
 {
 	struct btrfs_block_group_cache *block_group;
 	struct btrfs_fs_info *fs_info;
-	struct btrfs_caching_control *caching_ctl;
 	struct btrfs_root *extent_root;
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
@@ -407,17 +407,16 @@ static noinline void caching_thread(struct btrfs_work *work)
 	u64 total_found = 0;
 	u64 last = 0;
 	u32 nritems;
-	int ret = -ENOMEM;
+	int ret;
 	bool wakeup = true;
 
-	caching_ctl = container_of(work, struct btrfs_caching_control, work);
 	block_group = caching_ctl->block_group;
 	fs_info = block_group->fs_info;
 	extent_root = fs_info->extent_root;
 
 	path = btrfs_alloc_path();
 	if (!path)
-		goto out;
+		return -ENOMEM;
 
 	last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
 
@@ -438,20 +437,16 @@ static noinline void caching_thread(struct btrfs_work *work)
 	 */
 	path->skip_locking = 1;
 	path->search_commit_root = 1;
-	path->reada = 1;
+	path->reada = READA_FORWARD;
 
 	key.objectid = last;
 	key.offset = 0;
 	key.type = BTRFS_EXTENT_ITEM_KEY;
-again:
-	mutex_lock(&caching_ctl->mutex);
-	/* need to make sure the commit_root doesn't disappear */
-	down_read(&fs_info->commit_root_sem);
 
 next:
 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 	if (ret < 0)
-		goto err;
+		goto out;
 
 	leaf = path->nodes[0];
 	nritems = btrfs_header_nritems(leaf);
@@ -477,12 +472,14 @@ next:
 				up_read(&fs_info->commit_root_sem);
 				mutex_unlock(&caching_ctl->mutex);
 				cond_resched();
-				goto again;
+				mutex_lock(&caching_ctl->mutex);
+				down_read(&fs_info->commit_root_sem);
+				goto next;
 			}
 
 			ret = btrfs_next_leaf(extent_root, path);
 			if (ret < 0)
-				goto err;
+				goto out;
 			if (ret)
 				break;
 			leaf = path->nodes[0];
@@ -521,7 +518,7 @@ next:
 			else
 				last = key.objectid + key.offset;
 
-			if (total_found > (1024 * 1024 * 2)) {
+			if (total_found > CACHING_CTL_WAKE_UP) {
 				total_found = 0;
 				if (wakeup)
 					wake_up(&caching_ctl->wait);
@@ -534,9 +531,37 @@ next:
 	total_found += add_new_free_space(block_group, fs_info, last,
 					  block_group->key.objectid +
 					  block_group->key.offset);
+	caching_ctl->progress = (u64)-1;
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static noinline void caching_thread(struct btrfs_work *work)
+{
+	struct btrfs_block_group_cache *block_group;
+	struct btrfs_fs_info *fs_info;
+	struct btrfs_caching_control *caching_ctl;
+	struct btrfs_root *extent_root;
+	int ret;
+
+	caching_ctl = container_of(work, struct btrfs_caching_control, work);
+	block_group = caching_ctl->block_group;
+	fs_info = block_group->fs_info;
+	extent_root = fs_info->extent_root;
+
+	mutex_lock(&caching_ctl->mutex);
+	down_read(&fs_info->commit_root_sem);
+
+	if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+		ret = load_free_space_tree(caching_ctl);
+	else
+		ret = load_extent_tree_free(caching_ctl);
+
 	spin_lock(&block_group->lock);
 	block_group->caching_ctl = NULL;
-	block_group->cached = BTRFS_CACHE_FINISHED;
+	block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED;
 	spin_unlock(&block_group->lock);
 
 #ifdef CONFIG_BTRFS_DEBUG
@@ -555,20 +580,11 @@ next:
 #endif
 
 	caching_ctl->progress = (u64)-1;
-err:
-	btrfs_free_path(path);
-	up_read(&fs_info->commit_root_sem);
-
-	free_excluded_extents(extent_root, block_group);
 
+	up_read(&fs_info->commit_root_sem);
+	free_excluded_extents(fs_info->extent_root, block_group);
 	mutex_unlock(&caching_ctl->mutex);
-out:
-	if (ret) {
-		spin_lock(&block_group->lock);
-		block_group->caching_ctl = NULL;
-		block_group->cached = BTRFS_CACHE_ERROR;
-		spin_unlock(&block_group->lock);
-	}
+
 	wake_up(&caching_ctl->wait);
 
 	put_caching_control(caching_ctl);
@@ -680,8 +696,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache,
 		}
 	} else {
 		/*
-		 * We are not going to do the fast caching, set cached to the
-		 * appropriate value and wakeup any waiters.
+		 * We're either using the free space tree or no caching at all.
+		 * Set cached to the appropriate value and wakeup any waiters.
 		 */
 		spin_lock(&cache->lock);
 		if (load_cache_only) {
@@ -2115,7 +2131,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
-	path->reada = 1;
+	path->reada = READA_FORWARD;
 	path->leave_spinning = 1;
 	/* this will setup the path even if it fails to insert the back ref */
 	ret = insert_inline_extent_backref(trans, fs_info->extent_root, path,
@@ -2141,7 +2157,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(leaf);
 	btrfs_release_path(path);
 
-	path->reada = 1;
+	path->reada = READA_FORWARD;
 	path->leave_spinning = 1;
 	/* now insert the actual backref */
 	ret = insert_extent_backref(trans, root->fs_info->extent_root,
@@ -2254,7 +2270,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
 	}
 
 again:
-	path->reada = 1;
+	path->reada = READA_FORWARD;
 	path->leave_spinning = 1;
 	ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
 				path, 0, 1);
@@ -2910,6 +2926,9 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 	if (trans->aborted)
 		return 0;
 
+	if (root->fs_info->creating_free_space_tree)
+		return 0;
+
 	if (root == root->fs_info->extent_root)
 		root = root->fs_info->tree_root;
 
@@ -2988,9 +3007,9 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 
 	extent_op->flags_to_set = flags;
-	extent_op->update_flags = 1;
-	extent_op->update_key = 0;
-	extent_op->is_data = is_data ? 1 : 0;
+	extent_op->update_flags = true;
+	extent_op->update_key = false;
+	extent_op->is_data = is_data ? true : false;
 	extent_op->level = level;
 
 	ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
@@ -3328,7 +3347,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group,
 	 * If this block group is smaller than 100 megs don't bother caching the
 	 * block group.
 	 */
-	if (block_group->key.offset < (100 * 1024 * 1024)) {
+	if (block_group->key.offset < (100 * SZ_1M)) {
 		spin_lock(&block_group->lock);
 		block_group->disk_cache_state = BTRFS_DC_WRITTEN;
 		spin_unlock(&block_group->lock);
@@ -3428,7 +3447,7 @@ again:
 	 * taking up quite a bit since it's not folded into the other space
 	 * cache.
 	 */
-	num_pages = div_u64(block_group->key.offset, 256 * 1024 * 1024);
+	num_pages = div_u64(block_group->key.offset, SZ_256M);
 	if (!num_pages)
 		num_pages = 1;
 
@@ -3684,11 +3703,21 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 
 	/*
-	 * We don't need the lock here since we are protected by the transaction
-	 * commit.  We want to do the cache_save_setup first and then run the
+	 * Even though we are in the critical section of the transaction commit,
+	 * we can still have concurrent tasks adding elements to this
+	 * transaction's list of dirty block groups. These tasks correspond to
+	 * endio free space workers started when writeback finishes for a
+	 * space cache, which run inode.c:btrfs_finish_ordered_io(), and can
+	 * allocate new block groups as a result of COWing nodes of the root
+	 * tree when updating the free space inode. The writeback for the space
+	 * caches is triggered by an earlier call to
+	 * btrfs_start_dirty_block_groups() and iterations of the following
+	 * loop.
+	 * Also we want to do the cache_save_setup first and then run the
 	 * delayed refs to make sure we have the best chance at doing this all
 	 * in one shot.
 	 */
+	spin_lock(&cur_trans->dirty_bgs_lock);
 	while (!list_empty(&cur_trans->dirty_bgs)) {
 		cache = list_first_entry(&cur_trans->dirty_bgs,
 					 struct btrfs_block_group_cache,
@@ -3700,11 +3729,13 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 		 * finish and then do it all again
 		 */
 		if (!list_empty(&cache->io_list)) {
+			spin_unlock(&cur_trans->dirty_bgs_lock);
 			list_del_init(&cache->io_list);
 			btrfs_wait_cache_io(root, trans, cache,
 					    &cache->io_ctl, path,
 					    cache->key.objectid);
 			btrfs_put_block_group(cache);
+			spin_lock(&cur_trans->dirty_bgs_lock);
 		}
 
 		/*
@@ -3712,6 +3743,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 		 * on any pending IO
 		 */
 		list_del_init(&cache->dirty_list);
+		spin_unlock(&cur_trans->dirty_bgs_lock);
 		should_put = 1;
 
 		cache_save_setup(cache, trans, path);
@@ -3736,6 +3768,25 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 		}
 		if (!ret) {
 			ret = write_one_cache_group(trans, root, path, cache);
+			/*
+			 * One of the free space endio workers might have
+			 * created a new block group while updating a free space
+			 * cache's inode (at inode.c:btrfs_finish_ordered_io())
+			 * and hasn't released its transaction handle yet, in
+			 * which case the new block group is still attached to
+			 * its transaction handle and its creation has not
+			 * finished yet (no block group item in the extent tree
+			 * yet, etc). If this is the case, wait for all free
+			 * space endio workers to finish and retry. This is a
+			 * very rare case so no need for a more efficient and
+			 * complex approach.
+			 */
+			if (ret == -ENOENT) {
+				wait_event(cur_trans->writer_wait,
+				   atomic_read(&cur_trans->num_writers) == 1);
+				ret = write_one_cache_group(trans, root, path,
+							    cache);
+			}
 			if (ret)
 				btrfs_abort_transaction(trans, root, ret);
 		}
@@ -3743,7 +3794,9 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
 		/* if its not on the io list, we need to put the block group */
 		if (should_put)
 			btrfs_put_block_group(cache);
+		spin_lock(&cur_trans->dirty_bgs_lock);
 	}
+	spin_unlock(&cur_trans->dirty_bgs_lock);
 
 	while (!list_empty(io)) {
 		cache = list_first_entry(io, struct btrfs_block_group_cache,
@@ -4242,14 +4295,13 @@ static int should_alloc_chunk(struct btrfs_root *root,
 	 */
 	if (force == CHUNK_ALLOC_LIMITED) {
 		thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
-		thresh = max_t(u64, 64 * 1024 * 1024,
-			       div_factor_fine(thresh, 1));
+		thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
 
 		if (num_bytes - num_allocated < thresh)
 			return 1;
 	}
 
-	if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))
+	if (num_allocated + SZ_2M < div_factor(num_bytes, 8))
 		return 0;
 	return 1;
 }
@@ -4449,7 +4501,7 @@ out:
 	 * transaction.
 	 */
 	if (trans->can_flush_pending_bgs &&
-	    trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) {
+	    trans->chunk_bytes_reserved >= (u64)SZ_2M) {
 		btrfs_create_pending_block_groups(trans, trans->root);
 		btrfs_trans_release_chunk_metadata(trans);
 	}
@@ -4547,7 +4599,7 @@ static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim)
 	return nr;
 }
 
-#define EXTENT_SIZE_PER_ITEM	(256 * 1024)
+#define EXTENT_SIZE_PER_ITEM	SZ_256K
 
 /*
  * shrink metadata reservation for delalloc
@@ -4752,8 +4804,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
 	u64 expected;
 	u64 to_reclaim;
 
-	to_reclaim = min_t(u64, num_online_cpus() * 1024 * 1024,
-				16 * 1024 * 1024);
+	to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M);
 	spin_lock(&space_info->lock);
 	if (can_overcommit(root, space_info, to_reclaim,
 			   BTRFS_RESERVE_FLUSH_ALL)) {
@@ -4764,8 +4815,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root,
 	used = space_info->bytes_used + space_info->bytes_reserved +
 	       space_info->bytes_pinned + space_info->bytes_readonly +
 	       space_info->bytes_may_use;
-	if (can_overcommit(root, space_info, 1024 * 1024,
-			   BTRFS_RESERVE_FLUSH_ALL))
+	if (can_overcommit(root, space_info, SZ_1M, BTRFS_RESERVE_FLUSH_ALL))
 		expected = div_factor_fine(space_info->total_bytes, 95);
 	else
 		expected = div_factor_fine(space_info->total_bytes, 90);
@@ -5321,7 +5371,7 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
 	spin_lock(&sinfo->lock);
 	spin_lock(&block_rsv->lock);
 
-	block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024);
+	block_rsv->size = min_t(u64, num_bytes, SZ_512M);
 
 	num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
 		    sinfo->bytes_reserved + sinfo->bytes_readonly +
@@ -6225,11 +6275,11 @@ fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info,
 		return ret;
 
 	if (ssd)
-		*empty_cluster = 2 * 1024 * 1024;
+		*empty_cluster = SZ_2M;
 	if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
 		ret = &root->fs_info->meta_alloc_cluster;
 		if (!ssd)
-			*empty_cluster = 64 * 1024;
+			*empty_cluster = SZ_64K;
 	} else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) {
 		ret = &root->fs_info->data_alloc_cluster;
 	}
@@ -6441,7 +6491,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
-	path->reada = 1;
+	path->reada = READA_FORWARD;
 	path->leave_spinning = 1;
 
 	is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
@@ -6664,6 +6714,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 			}
 		}
 
+		ret = add_to_free_space_tree(trans, root->fs_info, bytenr,
+					     num_bytes);
+		if (ret) {
+			btrfs_abort_transaction(trans, extent_root, ret);
+			goto out;
+		}
+
 		ret = update_block_group(trans, root, bytenr, num_bytes, 0);
 		if (ret) {
 			btrfs_abort_transaction(trans, extent_root, ret);
@@ -7675,6 +7732,11 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 	btrfs_free_path(path);
 
+	ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
+					  ins->offset);
+	if (ret)
+		return ret;
+
 	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
 	if (ret) { /* -ENOENT, logic error */
 		btrfs_err(fs_info, "update block group failed for %llu %llu",
@@ -7755,6 +7817,11 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(leaf);
 	btrfs_free_path(path);
 
+	ret = remove_from_free_space_tree(trans, fs_info, ins->objectid,
+					  num_bytes);
+	if (ret)
+		return ret;
+
 	ret = update_block_group(trans, root, ins->objectid, root->nodesize,
 				 1);
 	if (ret) { /* -ENOENT, logic error */
@@ -7837,7 +7904,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 	clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
 
 	btrfs_set_lock_blocking(buf);
-	btrfs_set_buffer_uptodate(buf);
+	set_extent_buffer_uptodate(buf);
 
 	if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
 		buf->log_index = root->log_transid % 2;
@@ -7983,12 +8050,9 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
 		else
 			memset(&extent_op->key, 0, sizeof(extent_op->key));
 		extent_op->flags_to_set = flags;
-		if (skinny_metadata)
-			extent_op->update_key = 0;
-		else
-			extent_op->update_key = 1;
-		extent_op->update_flags = 1;
-		extent_op->is_data = 0;
+		extent_op->update_key = skinny_metadata ? false : true;
+		extent_op->update_flags = true;
+		extent_op->is_data = false;
 		extent_op->level = level;
 
 		ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
@@ -9127,7 +9191,7 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force)
 	if ((sinfo->flags &
 	     (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
 	    !force)
-		min_allocable_bytes = 1 * 1024 * 1024;
+		min_allocable_bytes = SZ_1M;
 	else
 		min_allocable_bytes = 0;
 
@@ -9659,6 +9723,8 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
 	cache->full_stripe_len = btrfs_full_stripe_len(root,
 					       &root->fs_info->mapping_tree,
 					       start);
+	set_free_space_tree_thresholds(cache);
+
 	atomic_set(&cache->count, 1);
 	spin_lock_init(&cache->lock);
 	init_rwsem(&cache->data_rwsem);
@@ -9670,6 +9736,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size)
 	INIT_LIST_HEAD(&cache->io_list);
 	btrfs_init_free_space_ctl(cache);
 	atomic_set(&cache->trimming, 0);
+	mutex_init(&cache->free_space_lock);
 
 	return cache;
 }
@@ -9694,7 +9761,7 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-	path->reada = 1;
+	path->reada = READA_FORWARD;
 
 	cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
 	if (btrfs_test_opt(root, SPACE_CACHE) &&
@@ -9880,6 +9947,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
 					       key.objectid, key.offset);
 		if (ret)
 			btrfs_abort_transaction(trans, extent_root, ret);
+		add_block_group_free_space(trans, root->fs_info, block_group);
+		/* already aborted the transaction if it failed. */
 next:
 		list_del_init(&block_group->bg_list);
 	}
@@ -9910,6 +9979,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
 	cache->flags = type;
 	cache->last_byte_to_unpin = (u64)-1;
 	cache->cached = BTRFS_CACHE_FINISHED;
+	cache->needs_free_space = 1;
 	ret = exclude_super_stripes(root, cache);
 	if (ret) {
 		/*
@@ -10280,6 +10350,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 
 	unlock_chunks(root);
 
+	ret = remove_block_group_free_space(trans, root->fs_info, block_group);
+	if (ret)
+		goto out;
+
 	btrfs_put_block_group(block_group);
 	btrfs_put_block_group(block_group);
 
@@ -10328,7 +10402,7 @@ btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info,
 	 * more device items and remove one chunk item), but this is done at
 	 * btrfs_remove_chunk() through a call to check_system_chunk().
 	 */
-	map = (struct map_lookup *)em->bdev;
+	map = em->map_lookup;
 	num_items = 3 + map->num_stripes;
 	free_extent_map(em);
 
@@ -10515,7 +10589,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
 
 	disk_super = fs_info->super_copy;
 	if (!btrfs_super_root(disk_super))
-		return 1;
+		return -EINVAL;
 
 	features = btrfs_super_incompat_flags(disk_super);
 	if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
@@ -10745,3 +10819,23 @@ int btrfs_start_write_no_snapshoting(struct btrfs_root *root)
 	}
 	return 1;
 }
+
+static int wait_snapshoting_atomic_t(atomic_t *a)
+{
+	schedule();
+	return 0;
+}
+
+void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
+{
+	while (true) {
+		int ret;
+
+		ret = btrfs_start_write_no_snapshoting(root);
+		if (ret)
+			break;
+		wait_on_atomic_t(&root->will_be_snapshoted,
+				 wait_snapshoting_atomic_t,
+				 TASK_UNINTERRUPTIBLE);
+	}
+}
diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h
deleted file mode 100644
index e69de29bb..000000000
--- a/fs/btrfs/extent-tree.h
+++ /dev/null
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 9abe18763..392592dc7 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1285,20 +1285,6 @@ search_again:
 }
 
 /* wrappers around set/clear extent bit */
-int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
-		     gfp_t mask)
-{
-	return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL,
-			      NULL, mask);
-}
-
-int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
-		    unsigned bits, gfp_t mask)
-{
-	return set_extent_bit(tree, start, end, bits, NULL,
-			      NULL, mask);
-}
-
 int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 			   unsigned bits, gfp_t mask,
 			   struct extent_changeset *changeset)
@@ -1323,17 +1309,6 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 				  cached, mask, NULL);
 }
 
-int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
-		      unsigned bits, gfp_t mask)
-{
-	int wake = 0;
-
-	if (bits & EXTENT_LOCKED)
-		wake = 1;
-
-	return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask);
-}
-
 int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 			     unsigned bits, gfp_t mask,
 			     struct extent_changeset *changeset)
@@ -1348,63 +1323,18 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 				  changeset);
 }
 
-int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
-			struct extent_state **cached_state, gfp_t mask)
-{
-	return set_extent_bit(tree, start, end,
-			      EXTENT_DELALLOC | EXTENT_UPTODATE,
-			      NULL, cached_state, mask);
-}
-
-int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
-		      struct extent_state **cached_state, gfp_t mask)
-{
-	return set_extent_bit(tree, start, end,
-			      EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,
-			      NULL, cached_state, mask);
-}
-
-int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
-		       gfp_t mask)
-{
-	return clear_extent_bit(tree, start, end,
-				EXTENT_DIRTY | EXTENT_DELALLOC |
-				EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);
-}
-
-int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
-		     gfp_t mask)
-{
-	return set_extent_bit(tree, start, end, EXTENT_NEW, NULL,
-			      NULL, mask);
-}
-
-int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
-			struct extent_state **cached_state, gfp_t mask)
-{
-	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL,
-			      cached_state, mask);
-}
-
-int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
-			  struct extent_state **cached_state, gfp_t mask)
-{
-	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,
-				cached_state, mask);
-}
-
 /*
  * either insert or lock state struct between start and end use mask to tell
  * us if waiting is desired.
  */
 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
-		     unsigned bits, struct extent_state **cached_state)
+		     struct extent_state **cached_state)
 {
 	int err;
 	u64 failed_start;
 
 	while (1) {
-		err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits,
+		err = __set_extent_bit(tree, start, end, EXTENT_LOCKED,
 				       EXTENT_LOCKED, &failed_start,
 				       cached_state, GFP_NOFS, NULL);
 		if (err == -EEXIST) {
@@ -1417,11 +1347,6 @@ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 	return err;
 }
 
-int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
-{
-	return lock_extent_bits(tree, start, end, 0, NULL);
-}
-
 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
 {
 	int err;
@@ -1438,20 +1363,7 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
 	return 1;
 }
 
-int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
-			 struct extent_state **cached, gfp_t mask)
-{
-	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,
-				mask);
-}
-
-int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
-{
-	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,
-				GFP_NOFS);
-}
-
-int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
+void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
 {
 	unsigned long index = start >> PAGE_CACHE_SHIFT;
 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
@@ -1464,10 +1376,9 @@ int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end)
 		page_cache_release(page);
 		index++;
 	}
-	return 0;
 }
 
-int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
+void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
 {
 	unsigned long index = start >> PAGE_CACHE_SHIFT;
 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
@@ -1481,13 +1392,12 @@ int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
 		page_cache_release(page);
 		index++;
 	}
-	return 0;
 }
 
 /*
  * helper function to set both pages and extents in the tree writeback
  */
-static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
+static void set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
 {
 	unsigned long index = start >> PAGE_CACHE_SHIFT;
 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
@@ -1500,7 +1410,6 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end)
 		page_cache_release(page);
 		index++;
 	}
-	return 0;
 }
 
 /* find the first state struct with 'bits' set after 'start', and
@@ -1800,7 +1709,7 @@ again:
 	BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */
 
 	/* step three, lock the state bits for the whole range */
-	lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state);
+	lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state);
 
 	/* then test to make sure it is all still delalloc */
 	ret = test_range_bit(tree, delalloc_start, delalloc_end,
@@ -1820,7 +1729,7 @@ out_failed:
 	return found;
 }
 
-int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
+void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
 				 struct page *locked_page,
 				 unsigned clear_bits,
 				 unsigned long page_ops)
@@ -1835,7 +1744,7 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
 
 	clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
 	if (page_ops == 0)
-		return 0;
+		return;
 
 	if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0)
 		mapping_set_error(inode->i_mapping, -EIO);
@@ -1869,7 +1778,6 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
 		index += ret;
 		cond_resched();
 	}
-	return 0;
 }
 
 /*
@@ -2516,7 +2424,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
 
 /* lots and lots of room for performance fixes in the end_bio funcs */
 
-int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
+void end_extent_writepage(struct page *page, int err, u64 start, u64 end)
 {
 	int uptodate = (err == 0);
 	struct extent_io_tree *tree;
@@ -2537,7 +2445,6 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end)
 		ret = ret < 0 ? ret : -EIO;
 		mapping_set_error(page->mapping, ret);
 	}
-	return 0;
 }
 
 /*
@@ -2579,9 +2486,7 @@ static void end_bio_extent_writepage(struct bio *bio)
 		start = page_offset(page);
 		end = start + bvec->bv_offset + bvec->bv_len - 1;
 
-		if (end_extent_writepage(page, bio->bi_error, start, end))
-			continue;
-
+		end_extent_writepage(page, bio->bi_error, start, end);
 		end_page_writeback(page);
 	}
 
@@ -2992,12 +2897,11 @@ static int __do_readpage(struct extent_io_tree *tree,
 	struct block_device *bdev;
 	int ret;
 	int nr = 0;
-	int parent_locked = *bio_flags & EXTENT_BIO_PARENT_LOCKED;
 	size_t pg_offset = 0;
 	size_t iosize;
 	size_t disk_io_size;
 	size_t blocksize = inode->i_sb->s_blocksize;
-	unsigned long this_bio_flag = *bio_flags & EXTENT_BIO_PARENT_LOCKED;
+	unsigned long this_bio_flag = 0;
 
 	set_page_extent_mapped(page);
 
@@ -3037,18 +2941,16 @@ static int __do_readpage(struct extent_io_tree *tree,
 			kunmap_atomic(userpage);
 			set_extent_uptodate(tree, cur, cur + iosize - 1,
 					    &cached, GFP_NOFS);
-			if (!parent_locked)
-				unlock_extent_cached(tree, cur,
-						     cur + iosize - 1,
-						     &cached, GFP_NOFS);
+			unlock_extent_cached(tree, cur,
+					     cur + iosize - 1,
+					     &cached, GFP_NOFS);
 			break;
 		}
 		em = __get_extent_map(inode, page, pg_offset, cur,
 				      end - cur + 1, get_extent, em_cached);
 		if (IS_ERR_OR_NULL(em)) {
 			SetPageError(page);
-			if (!parent_locked)
-				unlock_extent(tree, cur, end);
+			unlock_extent(tree, cur, end);
 			break;
 		}
 		extent_offset = cur - em->start;
@@ -3133,12 +3035,9 @@ static int __do_readpage(struct extent_io_tree *tree,
 
 			set_extent_uptodate(tree, cur, cur + iosize - 1,
 					    &cached, GFP_NOFS);
-			if (parent_locked)
-				free_extent_state(cached);
-			else
-				unlock_extent_cached(tree, cur,
-						     cur + iosize - 1,
-						     &cached, GFP_NOFS);
+			unlock_extent_cached(tree, cur,
+					     cur + iosize - 1,
+					     &cached, GFP_NOFS);
 			cur = cur + iosize;
 			pg_offset += iosize;
 			continue;
@@ -3147,8 +3046,7 @@ static int __do_readpage(struct extent_io_tree *tree,
 		if (test_range_bit(tree, cur, cur_end,
 				   EXTENT_UPTODATE, 1, NULL)) {
 			check_page_uptodate(tree, page);
-			if (!parent_locked)
-				unlock_extent(tree, cur, cur + iosize - 1);
+			unlock_extent(tree, cur, cur + iosize - 1);
 			cur = cur + iosize;
 			pg_offset += iosize;
 			continue;
@@ -3158,8 +3056,7 @@ static int __do_readpage(struct extent_io_tree *tree,
 		 */
 		if (block_start == EXTENT_MAP_INLINE) {
 			SetPageError(page);
-			if (!parent_locked)
-				unlock_extent(tree, cur, cur + iosize - 1);
+			unlock_extent(tree, cur, cur + iosize - 1);
 			cur = cur + iosize;
 			pg_offset += iosize;
 			continue;
@@ -3178,8 +3075,7 @@ static int __do_readpage(struct extent_io_tree *tree,
 			*bio_flags = this_bio_flag;
 		} else {
 			SetPageError(page);
-			if (!parent_locked)
-				unlock_extent(tree, cur, cur + iosize - 1);
+			unlock_extent(tree, cur, cur + iosize - 1);
 		}
 		cur = cur + iosize;
 		pg_offset += iosize;
@@ -3308,20 +3204,6 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
 	return ret;
 }
 
-int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page,
-				 get_extent_t *get_extent, int mirror_num)
-{
-	struct bio *bio = NULL;
-	unsigned long bio_flags = EXTENT_BIO_PARENT_LOCKED;
-	int ret;
-
-	ret = __do_readpage(tree, page, get_extent, NULL, &bio, mirror_num,
-			    &bio_flags, READ, NULL);
-	if (bio)
-		ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
-	return ret;
-}
-
 static noinline void update_nr_written(struct page *page,
 				      struct writeback_control *wbc,
 				      unsigned long nr_written)
@@ -4326,7 +4208,7 @@ int extent_invalidatepage(struct extent_io_tree *tree,
 	if (start > end)
 		return 0;
 
-	lock_extent_bits(tree, start, end, 0, &cached_state);
+	lock_extent_bits(tree, start, end, &cached_state);
 	wait_on_page_writeback(page);
 	clear_extent_bit(tree, start, end,
 			 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
@@ -4387,7 +4269,7 @@ int try_release_extent_mapping(struct extent_map_tree *map,
 	u64 end = start + PAGE_CACHE_SIZE - 1;
 
 	if (gfpflags_allow_blocking(mask) &&
-	    page->mapping->host->i_size > 16 * 1024 * 1024) {
+	    page->mapping->host->i_size > SZ_16M) {
 		u64 len;
 		while (start <= end) {
 			len = end - start + 1;
@@ -4536,7 +4418,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		last_for_get_extent = isize;
 	}
 
-	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, 0,
+	lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1,
 			 &cached_state);
 
 	em = get_extent_skip_holes(inode, start, last_for_get_extent,
@@ -4797,24 +4679,14 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
 	return new;
 }
 
-struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
-						u64 start)
+struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+						  u64 start, unsigned long len)
 {
 	struct extent_buffer *eb;
-	unsigned long len;
 	unsigned long num_pages;
 	unsigned long i;
 
-	if (!fs_info) {
-		/*
-		 * Called only from tests that don't always have a fs_info
-		 * available, but we know that nodesize is 4096
-		 */
-		len = 4096;
-	} else {
-		len = fs_info->tree_root->nodesize;
-	}
-	num_pages = num_extent_pages(0, len);
+	num_pages = num_extent_pages(start, len);
 
 	eb = __alloc_extent_buffer(fs_info, start, len);
 	if (!eb)
@@ -4837,6 +4709,24 @@ err:
 	return NULL;
 }
 
+struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+						u64 start)
+{
+	unsigned long len;
+
+	if (!fs_info) {
+		/*
+		 * Called only from tests that don't always have a fs_info
+		 * available, but we know that nodesize is 4096
+		 */
+		len = 4096;
+	} else {
+		len = fs_info->tree_root->nodesize;
+	}
+
+	return __alloc_dummy_extent_buffer(fs_info, start, len);
+}
+
 static void check_buffer_tree_ref(struct extent_buffer *eb)
 {
 	int refs;
@@ -5227,7 +5117,7 @@ int set_extent_buffer_dirty(struct extent_buffer *eb)
 	return was_dirty;
 }
 
-int clear_extent_buffer_uptodate(struct extent_buffer *eb)
+void clear_extent_buffer_uptodate(struct extent_buffer *eb)
 {
 	unsigned long i;
 	struct page *page;
@@ -5240,10 +5130,9 @@ int clear_extent_buffer_uptodate(struct extent_buffer *eb)
 		if (page)
 			ClearPageUptodate(page);
 	}
-	return 0;
 }
 
-int set_extent_buffer_uptodate(struct extent_buffer *eb)
+void set_extent_buffer_uptodate(struct extent_buffer *eb)
 {
 	unsigned long i;
 	struct page *page;
@@ -5255,7 +5144,6 @@ int set_extent_buffer_uptodate(struct extent_buffer *eb)
 		page = eb->pages[i];
 		SetPageUptodate(page);
 	}
-	return 0;
 }
 
 int extent_buffer_uptodate(struct extent_buffer *eb)
@@ -5594,6 +5482,155 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
 	}
 }
 
+/*
+ * The extent buffer bitmap operations are done with byte granularity because
+ * bitmap items are not guaranteed to be aligned to a word and therefore a
+ * single word in a bitmap may straddle two pages in the extent buffer.
+ */
+#define BIT_BYTE(nr) ((nr) / BITS_PER_BYTE)	/* index of the byte holding bit nr */
+#define BYTE_MASK ((1 << BITS_PER_BYTE) - 1)	/* all bits of one byte set */
+#define BITMAP_FIRST_BYTE_MASK(start) \
+	((BYTE_MASK << ((start) & (BITS_PER_BYTE - 1))) & BYTE_MASK)	/* bits from start's in-byte position upwards */
+#define BITMAP_LAST_BYTE_MASK(nbits) \
+	(BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1)))	/* low nbits % BITS_PER_BYTE bits; whole byte if that is 0 */
+
+/*
+ * eb_bitmap_offset() - calculate the page and offset of the byte containing the
+ * given bit number
+ * @eb: the extent buffer
+ * @start: byte offset of the bitmap item in the extent buffer
+ * @nr: bit number, counted from the first bit of the bitmap item
+ * @page_index: return index of the page in the extent buffer that contains the
+ * given bit number
+ * @page_offset: return byte offset into the page given by page_index
+ *
+ * This helper hides the ugliness of finding the byte in an extent buffer which
+ * contains a given bit.
+ */
+static inline void eb_bitmap_offset(struct extent_buffer *eb,
+				    unsigned long start, unsigned long nr,
+				    unsigned long *page_index,
+				    size_t *page_offset)
+{
+	size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1);	/* eb->start modulo the page size */
+	size_t byte_offset = BIT_BYTE(nr);	/* whole bytes preceding bit nr */
+	size_t offset;
+
+	/*
+	 * The byte we want is the offset of the extent buffer + the offset of
+	 * the bitmap item in the extent buffer + the offset of the byte in the
+	 * bitmap item.
+	 */
+	offset = start_offset + start + byte_offset;
+
+	*page_index = offset >> PAGE_CACHE_SHIFT;
+	*page_offset = offset & (PAGE_CACHE_SIZE - 1);
+}
+
+/**
+ * extent_buffer_test_bit - determine whether a bit in a bitmap item is set
+ * @eb: the extent buffer
+ * @start: byte offset of the bitmap item in the extent buffer
+ * @nr: bit number to test, counted from the first bit of the bitmap item
+ */
+int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start,
+			   unsigned long nr)
+{
+	char *kaddr;
+	struct page *page;
+	unsigned long i;
+	size_t offset;
+
+	eb_bitmap_offset(eb, start, nr, &i, &offset);	/* page index and in-page byte for nr */
+	page = eb->pages[i];
+	WARN_ON(!PageUptodate(page));
+	kaddr = page_address(page);
+	return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));	/* 1 if the bit is set, else 0 */
+}
+
+/**
+ * extent_buffer_bitmap_set - set an area of a bitmap
+ * @eb: the extent buffer
+ * @start: byte offset of the bitmap item in the extent buffer
+ * @pos: bit number of the first bit, counted from the start of the bitmap item
+ * @len: number of bits to set
+ */
+void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
+			      unsigned long pos, unsigned long len)
+{
+	char *kaddr;
+	struct page *page;
+	unsigned long i;
+	size_t offset;
+	const unsigned int size = pos + len;	/* one past the last bit to set */
+	int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE);	/* bits from pos to the end of its byte */
+	unsigned int mask_to_set = BITMAP_FIRST_BYTE_MASK(pos);
+
+	eb_bitmap_offset(eb, start, pos, &i, &offset);
+	page = eb->pages[i];
+	WARN_ON(!PageUptodate(page));
+	kaddr = page_address(page);
+
+	while (len >= bits_to_set) {	/* the range reaches the end of the current byte */
+		kaddr[offset] |= mask_to_set;
+		len -= bits_to_set;
+		bits_to_set = BITS_PER_BYTE;	/* every later byte starts at its bit 0 */
+		mask_to_set = ~0U;
+		if (++offset >= PAGE_CACHE_SIZE && len > 0) {	/* page exhausted and bits remain */
+			offset = 0;
+			page = eb->pages[++i];
+			WARN_ON(!PageUptodate(page));
+			kaddr = page_address(page);
+		}
+	}
+	if (len) {	/* the range ends inside the current byte */
+		mask_to_set &= BITMAP_LAST_BYTE_MASK(size);
+		kaddr[offset] |= mask_to_set;
+	}
+}
+
+
+/**
+ * extent_buffer_bitmap_clear - clear an area of a bitmap
+ * @eb: the extent buffer
+ * @start: byte offset of the bitmap item in the extent buffer
+ * @pos: bit number of the first bit, counted from the start of the bitmap item
+ * @len: number of bits to clear
+ */
+void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
+				unsigned long pos, unsigned long len)
+{
+	char *kaddr;
+	struct page *page;
+	unsigned long i;
+	size_t offset;
+	const unsigned int size = pos + len;	/* one past the last bit to clear */
+	int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE);	/* bits from pos to the end of its byte */
+	unsigned int mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos);
+
+	eb_bitmap_offset(eb, start, pos, &i, &offset);
+	page = eb->pages[i];
+	WARN_ON(!PageUptodate(page));
+	kaddr = page_address(page);
+
+	while (len >= bits_to_clear) {	/* the range reaches the end of the current byte */
+		kaddr[offset] &= ~mask_to_clear;
+		len -= bits_to_clear;
+		bits_to_clear = BITS_PER_BYTE;	/* every later byte starts at its bit 0 */
+		mask_to_clear = ~0U;
+		if (++offset >= PAGE_CACHE_SIZE && len > 0) {	/* page exhausted and bits remain */
+			offset = 0;
+			page = eb->pages[++i];
+			WARN_ON(!PageUptodate(page));
+			kaddr = page_address(page);
+		}
+	}
+	if (len) {	/* the range ends inside the current byte */
+		mask_to_clear &= BITMAP_LAST_BYTE_MASK(size);
+		kaddr[offset] &= ~mask_to_clear;
+	}
+}
+
 static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len)
 {
 	unsigned long distance = (src > dst) ? src - dst : dst - src;
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index f4c1ae118..880d5292e 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -29,7 +29,6 @@
  */
 #define EXTENT_BIO_COMPRESSED 1
 #define EXTENT_BIO_TREE_LOG 2
-#define EXTENT_BIO_PARENT_LOCKED 4
 #define EXTENT_BIO_FLAG_SHIFT 16
 
 /* these are bit numbers for test/set bit */
@@ -199,17 +198,17 @@ int try_release_extent_mapping(struct extent_map_tree *map,
 			       struct extent_io_tree *tree, struct page *page,
 			       gfp_t mask);
 int try_release_extent_buffer(struct page *page);
-int lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
 int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
-		     unsigned bits, struct extent_state **cached);
-int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end);
-int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
-			 struct extent_state **cached, gfp_t mask);
+		     struct extent_state **cached);
+
+static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
+{
+	return lock_extent_bits(tree, start, end, NULL);	/* cached = NULL */
+}
+
 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
 			  get_extent_t *get_extent, int mirror_num);
-int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page,
-				 get_extent_t *get_extent, int mirror_num);
 int __init extent_io_init(void);
 void extent_io_exit(void);
 
@@ -221,39 +220,105 @@ void free_extent_state(struct extent_state *state);
 int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
 		   unsigned bits, int filled,
 		   struct extent_state *cached_state);
-int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
-		      unsigned bits, gfp_t mask);
 int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 			     unsigned bits, gfp_t mask,
 			     struct extent_changeset *changeset);
 int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 		     unsigned bits, int wake, int delete,
 		     struct extent_state **cached, gfp_t mask);
-int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
-		    unsigned bits, gfp_t mask);
+
+static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end)
+{
+	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL,	/* wake = 1, delete = 0, cached = NULL */
+				GFP_NOFS);
+}
+
+static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start,
+		u64 end, struct extent_state **cached, gfp_t mask)
+{
+	return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached,	/* wake = 1, delete = 0 */
+				mask);
+}
+
+static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start,
+		u64 end, unsigned bits, gfp_t mask)
+{
+	int wake = 0;
+
+	if (bits & EXTENT_LOCKED)	/* pass wake = 1 only when the lock bit is among those cleared */
+		wake = 1;
+
+	return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask);	/* delete = 0, cached = NULL */
+}
+
 int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 			   unsigned bits, gfp_t mask,
 			   struct extent_changeset *changeset);
 int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 		   unsigned bits, u64 *failed_start,
 		   struct extent_state **cached_state, gfp_t mask);
-int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
-			struct extent_state **cached_state, gfp_t mask);
-int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
-			  struct extent_state **cached_state, gfp_t mask);
-int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
-		   gfp_t mask);
-int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
-		     gfp_t mask);
-int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
-		       gfp_t mask);
+
+static inline int set_extent_bits(struct extent_io_tree *tree, u64 start,
+		u64 end, unsigned bits, gfp_t mask)
+{
+	return set_extent_bit(tree, start, end, bits, NULL, NULL, mask);	/* failed_start = NULL, cached_state = NULL */
+}
+
+static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start,
+		u64 end, struct extent_state **cached_state, gfp_t mask)
+{
+	return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0,	/* wake = 0, delete = 0 */
+				cached_state, mask);
+}
+
+static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start,
+		u64 end, gfp_t mask)
+{
+	return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL,	/* failed_start = NULL */
+			      NULL, mask);	/* cached_state = NULL */
+}
+
+static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start,
+		u64 end, gfp_t mask)
+{
+	return clear_extent_bit(tree, start, end,
+				EXTENT_DIRTY | EXTENT_DELALLOC |	/* clears more than just the dirty bit */
+				EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask);	/* wake = 0, delete = 0, cached = NULL */
+}
+
 int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
 		       unsigned bits, unsigned clear_bits,
 		       struct extent_state **cached_state, gfp_t mask);
-int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
-			struct extent_state **cached_state, gfp_t mask);
-int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end,
-		      struct extent_state **cached_state, gfp_t mask);
+
+static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start,
+		u64 end, struct extent_state **cached_state, gfp_t mask)
+{
+	return set_extent_bit(tree, start, end,
+			      EXTENT_DELALLOC | EXTENT_UPTODATE,	/* both bits are set together */
+			      NULL, cached_state, mask);	/* failed_start = NULL */
+}
+
+static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start,
+		u64 end, struct extent_state **cached_state, gfp_t mask)
+{
+	return set_extent_bit(tree, start, end,
+			      EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG,	/* delalloc bits plus the defrag bit */
+			      NULL, cached_state, mask);	/* failed_start = NULL */
+}
+
+static inline int set_extent_new(struct extent_io_tree *tree, u64 start,
+		u64 end, gfp_t mask)
+{
+	return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL, mask);	/* failed_start = NULL, cached_state = NULL */
+}
+
+static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start,
+		u64 end, struct extent_state **cached_state, gfp_t mask)
+{
+	return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL,	/* failed_start = NULL */
+			      cached_state, mask);
+}
+
 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
 			  u64 *start_ret, u64 *end_ret, unsigned bits,
 			  struct extent_state **cached_state);
@@ -282,8 +347,10 @@ void set_page_extent_mapped(struct page *page);
 
 struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
 					  u64 start);
+struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
+						  u64 start, unsigned long len);
 struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info,
-		u64 start);
+						u64 start);
 struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src);
 struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
 					 u64 start);
@@ -328,19 +395,25 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
 			   unsigned long src_offset, unsigned long len);
 void memset_extent_buffer(struct extent_buffer *eb, char c,
 			  unsigned long start, unsigned long len);
+int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start,
+			   unsigned long pos);
+void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start,
+			      unsigned long pos, unsigned long len);
+void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start,
+				unsigned long pos, unsigned long len);
 void clear_extent_buffer_dirty(struct extent_buffer *eb);
 int set_extent_buffer_dirty(struct extent_buffer *eb);
-int set_extent_buffer_uptodate(struct extent_buffer *eb);
-int clear_extent_buffer_uptodate(struct extent_buffer *eb);
+void set_extent_buffer_uptodate(struct extent_buffer *eb);
+void clear_extent_buffer_uptodate(struct extent_buffer *eb);
 int extent_buffer_uptodate(struct extent_buffer *eb);
 int extent_buffer_under_io(struct extent_buffer *eb);
 int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
 		      unsigned long min_len, char **map,
 		      unsigned long *map_start,
 		      unsigned long *map_len);
-int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
-int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
-int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
+void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
+void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
+void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
 				 struct page *locked_page,
 				 unsigned bits_to_clear,
 				 unsigned long page_ops);
@@ -357,7 +430,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical,
 		      int mirror_num);
 int clean_io_failure(struct inode *inode, u64 start, struct page *page,
 		     unsigned int pg_offset);
-int end_extent_writepage(struct page *page, int err, u64 start, u64 end);
+void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
 int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb,
 			 int mirror_num);
 
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index 6a98bddd8..84fb56d5c 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -76,7 +76,7 @@ void free_extent_map(struct extent_map *em)
 		WARN_ON(extent_map_in_tree(em));
 		WARN_ON(!list_empty(&em->list));
 		if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
-			kfree(em->bdev);
+			kfree(em->map_lookup);
 		kmem_cache_free(extent_map_cache, em);
 	}
 }
diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
index b2991fd85..eb8b8fae0 100644
--- a/fs/btrfs/extent_map.h
+++ b/fs/btrfs/extent_map.h
@@ -32,7 +32,15 @@ struct extent_map {
 	u64 block_len;
 	u64 generation;
 	unsigned long flags;
-	struct block_device *bdev;
+	union {
+		struct block_device *bdev;
+
+		/*
+		 * used for chunk mappings
+		 * flags & EXTENT_FLAG_FS_MAPPING must be set
+		 */
+		struct map_lookup *map_lookup;
+	};
 	atomic_t refs;
 	unsigned int compress_type;
 	struct list_head list;
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 58ece6558..a67e1c828 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -202,7 +202,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
 	}
 
 	if (bio->bi_iter.bi_size > PAGE_CACHE_SIZE * 8)
-		path->reada = 2;
+		path->reada = READA_FORWARD;
 
 	WARN_ON(bio->bi_vcnt <= 0);
 
@@ -328,7 +328,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
 
 	if (search_commit) {
 		path->skip_locking = 1;
-		path->reada = 2;
+		path->reada = READA_FORWARD;
 		path->search_commit_root = 1;
 	}
 
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 0f09526aa..098bb8f69 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -406,8 +406,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
 /* simple helper to fault in pages and copy.  This should go away
  * and be replaced with calls into generic code.
  */
-static noinline int btrfs_copy_from_user(loff_t pos, int num_pages,
-					 size_t write_bytes,
+static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
 					 struct page **prepared_pages,
 					 struct iov_iter *i)
 {
@@ -1394,7 +1393,7 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages,
 	if (start_pos < inode->i_size) {
 		struct btrfs_ordered_extent *ordered;
 		lock_extent_bits(&BTRFS_I(inode)->io_tree,
-				 start_pos, last_pos, 0, cached_state);
+				 start_pos, last_pos, cached_state);
 		ordered = btrfs_lookup_ordered_range(inode, start_pos,
 						     last_pos - start_pos + 1);
 		if (ordered &&
@@ -1588,8 +1587,7 @@ again:
 			ret = 0;
 		}
 
-		copied = btrfs_copy_from_user(pos, num_pages,
-					   write_bytes, pages, i);
+		copied = btrfs_copy_from_user(pos, write_bytes, pages, i);
 
 		/*
 		 * if we have trouble faulting in the pages, fall
@@ -1764,17 +1762,17 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
 	loff_t pos;
 	size_t count;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 	err = generic_write_checks(iocb, from);
 	if (err <= 0) {
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 		return err;
 	}
 
 	current->backing_dev_info = inode_to_bdi(inode);
 	err = file_remove_privs(file);
 	if (err) {
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 		goto out;
 	}
 
@@ -1785,7 +1783,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
 	 * to stop this write operation to ensure FS consistency.
 	 */
 	if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) {
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 		err = -EROFS;
 		goto out;
 	}
@@ -1806,7 +1804,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
 		end_pos = round_up(pos + count, root->sectorsize);
 		err = btrfs_cont_expand(inode, i_size_read(inode), end_pos);
 		if (err) {
-			mutex_unlock(&inode->i_mutex);
+			inode_unlock(inode);
 			goto out;
 		}
 	}
@@ -1822,7 +1820,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
 			iocb->ki_pos = pos + num_written;
 	}
 
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 
 	/*
 	 * We also have to set last_sub_trans to the current log transid,
@@ -1911,7 +1909,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	if (ret)
 		return ret;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 	atomic_inc(&root->log_batch);
 	full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
 			     &BTRFS_I(inode)->runtime_flags);
@@ -1963,7 +1961,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 		ret = start_ordered_ops(inode, start, end);
 	}
 	if (ret) {
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 		goto out;
 	}
 	atomic_inc(&root->log_batch);
@@ -2009,7 +2007,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 		 */
 		clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
 			  &BTRFS_I(inode)->runtime_flags);
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 		goto out;
 	}
 
@@ -2033,7 +2031,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	trans = btrfs_start_transaction(root, 0);
 	if (IS_ERR(trans)) {
 		ret = PTR_ERR(trans);
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 		goto out;
 	}
 	trans->sync = true;
@@ -2056,7 +2054,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	 * file again, but that will end up using the synchronization
 	 * inside btrfs_sync_log to keep things safe.
 	 */
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 
 	/*
 	 * If any of the ordered extents had an error, just return it to user
@@ -2305,7 +2303,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 	if (ret)
 		return ret;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 	ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE);
 	ret = find_first_non_hole(inode, &offset, &len);
 	if (ret < 0)
@@ -2345,7 +2343,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 		truncated_page = true;
 		ret = btrfs_truncate_page(inode, offset, 0, 0);
 		if (ret) {
-			mutex_unlock(&inode->i_mutex);
+			inode_unlock(inode);
 			return ret;
 		}
 	}
@@ -2398,7 +2396,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 		truncate_pagecache_range(inode, lockstart, lockend);
 
 		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-				 0, &cached_state);
+				 &cached_state);
 		ordered = btrfs_lookup_first_ordered_extent(inode, lockend);
 
 		/*
@@ -2421,7 +2419,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 		ret = btrfs_wait_ordered_range(inode, lockstart,
 					       lockend - lockstart + 1);
 		if (ret) {
-			mutex_unlock(&inode->i_mutex);
+			inode_unlock(inode);
 			return ret;
 		}
 	}
@@ -2576,7 +2574,7 @@ out_only_mutex:
 			ret = btrfs_end_transaction(trans, root);
 		}
 	}
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	if (ret && !err)
 		err = ret;
 	return err;
@@ -2660,7 +2658,7 @@ static long btrfs_fallocate(struct file *file, int mode,
 	if (ret < 0)
 		return ret;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 	ret = inode_newsize_ok(inode, alloc_end);
 	if (ret)
 		goto out;
@@ -2705,7 +2703,7 @@ static long btrfs_fallocate(struct file *file, int mode,
 		 * transaction
 		 */
 		lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
-				 locked_end, 0, &cached_state);
+				 locked_end, &cached_state);
 		ordered = btrfs_lookup_first_ordered_extent(inode,
 							    alloc_end - 1);
 		if (ordered &&
@@ -2818,7 +2816,7 @@ out:
 	 * So this is completely used as cleanup.
 	 */
 	btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start);
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	/* Let go of our reservation. */
 	btrfs_free_reserved_data_space(inode, alloc_start,
 				       alloc_end - alloc_start);
@@ -2852,7 +2850,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence)
 	lockend--;
 	len = lockend - lockstart + 1;
 
-	lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0,
+	lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
 			 &cached_state);
 
 	while (start < inode->i_size) {
@@ -2894,7 +2892,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
 	struct inode *inode = file->f_mapping->host;
 	int ret;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 	switch (whence) {
 	case SEEK_END:
 	case SEEK_CUR:
@@ -2903,20 +2901,20 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
 	case SEEK_DATA:
 	case SEEK_HOLE:
 		if (offset >= i_size_read(inode)) {
-			mutex_unlock(&inode->i_mutex);
+			inode_unlock(inode);
 			return -ENXIO;
 		}
 
 		ret = find_desired_extent(inode, &offset, whence);
 		if (ret) {
-			mutex_unlock(&inode->i_mutex);
+			inode_unlock(inode);
 			return ret;
 		}
 	}
 
 	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
 out:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	return offset;
 }
 
@@ -2934,6 +2932,9 @@ const struct file_operations btrfs_file_operations = {
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= btrfs_ioctl,
 #endif
+	.copy_file_range = btrfs_copy_file_range,
+	.clone_file_range = btrfs_clone_file_range,
+	.dedupe_file_range = btrfs_dedupe_file_range,
 };
 
 void btrfs_auto_defrag_exit(void)
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index cfe99bec4..8f835bfa1 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -30,7 +30,7 @@
 #include "volumes.h"
 
 #define BITS_PER_BITMAP		(PAGE_CACHE_SIZE * 8)
-#define MAX_CACHE_BYTES_PER_GIG	(32 * 1024)
+#define MAX_CACHE_BYTES_PER_GIG	SZ_32K
 
 struct btrfs_trim_range {
 	u64 start;
@@ -1086,14 +1086,11 @@ write_pinned_extent_entries(struct btrfs_root *root,
 static noinline_for_stack int
 write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list)
 {
-	struct list_head *pos, *n;
+	struct btrfs_free_space *entry, *next;
 	int ret;
 
 	/* Write out the bitmaps */
-	list_for_each_safe(pos, n, bitmap_list) {
-		struct btrfs_free_space *entry =
-			list_entry(pos, struct btrfs_free_space, list);
-
+	list_for_each_entry_safe(entry, next, bitmap_list, list) {
 		ret = io_ctl_add_bitmap(io_ctl, entry->bitmap);
 		if (ret)
 			return -ENOSPC;
@@ -1119,13 +1116,10 @@ static int flush_dirty_cache(struct inode *inode)
 static void noinline_for_stack
 cleanup_bitmap_list(struct list_head *bitmap_list)
 {
-	struct list_head *pos, *n;
+	struct btrfs_free_space *entry, *next;
 
-	list_for_each_safe(pos, n, bitmap_list) {
-		struct btrfs_free_space *entry =
-			list_entry(pos, struct btrfs_free_space, list);
+	list_for_each_entry_safe(entry, next, bitmap_list, list)
 		list_del_init(&entry->list);
-	}
 }
 
 static void noinline_for_stack
@@ -1261,7 +1255,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 		goto out;
 
 	lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1,
-			 0, &cached_state);
+			 &cached_state);
 
 	io_ctl_set_generation(io_ctl, trans->transid);
 
@@ -1656,11 +1650,10 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
 	 * at or below 32k, so we need to adjust how much memory we allow to be
 	 * used by extent based free space tracking
 	 */
-	if (size < 1024 * 1024 * 1024)
+	if (size < SZ_1G)
 		max_bytes = MAX_CACHE_BYTES_PER_GIG;
 	else
-		max_bytes = MAX_CACHE_BYTES_PER_GIG *
-			div_u64(size, 1024 * 1024 * 1024);
+		max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G);
 
 	/*
 	 * we want to account for 1 more bitmap than what we have so we can make
@@ -2016,7 +2009,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
 	return true;
 }
 
-static struct btrfs_free_space_op free_space_op = {
+static const struct btrfs_free_space_op free_space_op = {
 	.recalc_thresholds	= recalculate_thresholds,
 	.use_bitmap		= use_bitmap,
 };
@@ -2489,8 +2482,7 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group)
 	 * track of free space, and if we pass 1/2 of that we want to
 	 * start converting things over to using bitmaps
 	 */
-	ctl->extents_thresh = ((1024 * 32) / 2) /
-				sizeof(struct btrfs_free_space);
+	ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space);
 }
 
 /*
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index f251865eb..33178c490 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -37,7 +37,7 @@ struct btrfs_free_space_ctl {
 	int total_bitmaps;
 	int unit;
 	u64 start;
-	struct btrfs_free_space_op *op;
+	const struct btrfs_free_space_op *op;
 	void *private;
 	struct mutex cache_writeout_mutex;
 	struct list_head trimming_ranges;
diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c
new file mode 100644
index 000000000..53dbeaf6c
--- /dev/null
+++ b/fs/btrfs/free-space-tree.c
@@ -0,0 +1,1605 @@
+/*
+ * Copyright (C) 2015 Facebook.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#include <linux/kernel.h>
+#include <linux/vmalloc.h>
+#include "ctree.h"
+#include "disk-io.h"
+#include "locking.h"
+#include "free-space-tree.h"
+#include "transaction.h"
+
+static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
+					struct btrfs_fs_info *fs_info,
+					struct btrfs_block_group_cache *block_group,
+					struct btrfs_path *path);
+
+void set_free_space_tree_thresholds(struct btrfs_block_group_cache *cache)
+{
+	u32 bitmap_range;
+	size_t bitmap_size;
+	u64 num_bitmaps, total_bitmap_size;
+
+	/*
+	 * We convert to bitmaps when the disk space required for using extents
+	 * exceeds that required for using bitmaps.
+	 */
+	bitmap_range = cache->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS;
+	num_bitmaps = div_u64(cache->key.offset + bitmap_range - 1,
+			      bitmap_range);
+	bitmap_size = sizeof(struct btrfs_item) + BTRFS_FREE_SPACE_BITMAP_SIZE;
+	total_bitmap_size = num_bitmaps * bitmap_size;
+	cache->bitmap_high_thresh = div_u64(total_bitmap_size,
+					    sizeof(struct btrfs_item));
+
+	/*
+	 * We allow for a small buffer between the high threshold and low
+	 * threshold to avoid thrashing back and forth between the two formats.
+	 */
+	if (cache->bitmap_high_thresh > 100)
+		cache->bitmap_low_thresh = cache->bitmap_high_thresh - 100;
+	else
+		cache->bitmap_low_thresh = 0;
+}
+
+static int add_new_free_space_info(struct btrfs_trans_handle *trans,
+				   struct btrfs_fs_info *fs_info,
+				   struct btrfs_block_group_cache *block_group,
+				   struct btrfs_path *path)
+{
+	struct btrfs_root *root = fs_info->free_space_root;
+	struct btrfs_free_space_info *info;
+	struct btrfs_key key;
+	struct extent_buffer *leaf;
+	int ret;
+
+	key.objectid = block_group->key.objectid;
+	key.type = BTRFS_FREE_SPACE_INFO_KEY;
+	key.offset = block_group->key.offset;
+
+	ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*info));
+	if (ret)
+		goto out;
+
+	leaf = path->nodes[0];
+	info = btrfs_item_ptr(leaf, path->slots[0],
+			      struct btrfs_free_space_info);
+	btrfs_set_free_space_extent_count(leaf, info, 0);
+	btrfs_set_free_space_flags(leaf, info, 0);
+	btrfs_mark_buffer_dirty(leaf);
+
+	ret = 0;
+out:
+	btrfs_release_path(path);
+	return ret;
+}
+
+/* Find a block group's FREE_SPACE_INFO item; ERR_PTR(-ENOENT) if missing. */
+struct btrfs_free_space_info *
+search_free_space_info(struct btrfs_trans_handle *trans,
+		       struct btrfs_fs_info *fs_info,
+		       struct btrfs_block_group_cache *block_group,
+		       struct btrfs_path *path, int cow)
+{
+	struct btrfs_root *root = fs_info->free_space_root;
+	struct btrfs_key key;
+	int ret;
+
+	key.objectid = block_group->key.objectid;
+	key.type = BTRFS_FREE_SPACE_INFO_KEY;
+	key.offset = block_group->key.offset;
+
+	ret = btrfs_search_slot(trans, root, &key, path, 0, cow);
+	if (ret < 0)
+		return ERR_PTR(ret);
+	if (ret != 0) {
+		btrfs_warn(fs_info, "missing free space info for %llu",
+			   block_group->key.objectid);
+		ASSERT(0);
+		return ERR_PTR(-ENOENT);
+	}
+	return btrfs_item_ptr(path->nodes[0], path->slots[0],
+			      struct btrfs_free_space_info);
+}
+
+/*
+ * Like btrfs_search_slot(), but positions the path at the greatest key
+ * that is less than the passed key.
+ *
+ * The passed key itself is expected not to exist in the tree, and the slot
+ * returned by the search is expected not to be the first one in its leaf;
+ * either case trips an assertion and returns -EIO. Returns 0 on success or
+ * the negative error from btrfs_search_slot().
+ */
+static int btrfs_search_prev_slot(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root,
+				  struct btrfs_key *key, struct btrfs_path *p,
+				  int ins_len, int cow)
+{
+	int ret = btrfs_search_slot(trans, root, key, p, ins_len, cow);
+
+	if (ret < 0)
+		return ret;
+
+	/* An exact match is unexpected; slot 0 has no predecessor here. */
+	if (ret == 0 || p->slots[0] == 0) {
+		ASSERT(0);
+		return -EIO;
+	}
+
+	p->slots[0]--;
+	return 0;
+}
+
+static inline u32 free_space_bitmap_size(u64 size, u32 sectorsize)
+{
+	return DIV_ROUND_UP((u32)div_u64(size, sectorsize), BITS_PER_BYTE);
+}
+
+static unsigned long *alloc_bitmap(u32 bitmap_size)
+{
+	void *mem;
+
+	/*
+	 * The allocation size varies, observed numbers were < 4K up to 16K.
+	 * Using vmalloc unconditionally would be too heavy, we'll try
+	 * contiguous allocations first.
+	 */
+	if  (bitmap_size <= PAGE_SIZE)
+		return kzalloc(bitmap_size, GFP_NOFS);
+
+	mem = kzalloc(bitmap_size, GFP_NOFS | __GFP_NOWARN);
+	if (mem)
+		return mem;
+
+	return __vmalloc(bitmap_size, GFP_NOFS | __GFP_HIGHMEM | __GFP_ZERO,
+			 PAGE_KERNEL);
+}
+
+/* Replace a block group's free space extent items with bitmap items. */
+int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
+				  struct btrfs_fs_info *fs_info,
+				  struct btrfs_block_group_cache *block_group,
+				  struct btrfs_path *path)
+{
+	struct btrfs_root *root = fs_info->free_space_root;
+	struct btrfs_free_space_info *info;
+	struct btrfs_key key, found_key;
+	struct extent_buffer *leaf;
+	unsigned long *bitmap;
+	char *bitmap_cursor;
+	u64 start, end, bitmap_range, i;
+	u32 bitmap_size, flags, expected_extent_count, extent_count = 0;
+	int done = 0, nr, ret;
+
+	bitmap_size = free_space_bitmap_size(block_group->key.offset,
+					     block_group->sectorsize);
+	bitmap = alloc_bitmap(bitmap_size);
+	if (!bitmap) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	start = block_group->key.objectid;
+	end = block_group->key.objectid + block_group->key.offset;
+
+	key.objectid = end - 1;
+	key.type = (u8)-1;
+	key.offset = (u64)-1;
+
+	while (!done) {
+		ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
+		if (ret)
+			goto out;
+
+		leaf = path->nodes[0];
+		nr = 0;
+		path->slots[0]++;
+		while (path->slots[0] > 0) {
+			btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1);
+
+			if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) {
+				ASSERT(found_key.objectid == block_group->key.objectid);
+				ASSERT(found_key.offset == block_group->key.offset);
+				done = 1;
+				break;
+			} else if (found_key.type == BTRFS_FREE_SPACE_EXTENT_KEY) {
+				u64 first, last;
+
+				ASSERT(found_key.objectid >= start);
+				ASSERT(found_key.objectid < end);
+				ASSERT(found_key.objectid + found_key.offset <= end);
+
+				first = div_u64(found_key.objectid - start,
+						block_group->sectorsize);
+				last = div_u64(found_key.objectid + found_key.offset - start,
+					       block_group->sectorsize);
+				bitmap_set(bitmap, first, last - first);
+
+				extent_count++;
+				nr++;
+				path->slots[0]--;
+			} else {
+				/* Slot is not advanced here: fail, don't spin. */
+				ASSERT(0);
+				ret = -EIO;
+				goto out;
+			}
+		}
+
+		ret = btrfs_del_items(trans, root, path, path->slots[0], nr);
+		if (ret)
+			goto out;
+		btrfs_release_path(path);
+	}
+
+	info = search_free_space_info(trans, fs_info, block_group, path, 1);
+	if (IS_ERR(info)) {
+		ret = PTR_ERR(info);
+		goto out;
+	}
+	leaf = path->nodes[0];
+	flags = btrfs_free_space_flags(leaf, info);
+	flags |= BTRFS_FREE_SPACE_USING_BITMAPS;
+	btrfs_set_free_space_flags(leaf, info, flags);
+	expected_extent_count = btrfs_free_space_extent_count(leaf, info);
+	btrfs_mark_buffer_dirty(leaf);
+	btrfs_release_path(path);
+
+	if (extent_count != expected_extent_count) {
+		btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u",
+			  block_group->key.objectid, extent_count,
+			  expected_extent_count);
+		ASSERT(0);
+		ret = -EIO;
+		goto out;
+	}
+
+	bitmap_cursor = (char *)bitmap;
+	bitmap_range = block_group->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS;
+	i = start;
+	while (i < end) {
+		unsigned long ptr;
+		u64 extent_size;
+		u32 data_size;
+
+		extent_size = min(end - i, bitmap_range);
+		data_size = free_space_bitmap_size(extent_size,
+						   block_group->sectorsize);
+
+		key.objectid = i;
+		key.type = BTRFS_FREE_SPACE_BITMAP_KEY;
+		key.offset = extent_size;
+
+		ret = btrfs_insert_empty_item(trans, root, path, &key,
+					      data_size);
+		if (ret)
+			goto out;
+
+		leaf = path->nodes[0];
+		ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+		write_extent_buffer(leaf, bitmap_cursor, ptr, data_size);
+		btrfs_mark_buffer_dirty(leaf);
+		btrfs_release_path(path);
+
+		i += extent_size;
+		bitmap_cursor += data_size;
+	}
+
+	ret = 0;
+out:
+	kvfree(bitmap);
+	if (ret)
+		btrfs_abort_transaction(trans, root, ret);
+	return ret;
+}
+
+/* Replace a block group's free space bitmap items with extent items. */
+int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
+				  struct btrfs_fs_info *fs_info,
+				  struct btrfs_block_group_cache *block_group,
+				  struct btrfs_path *path)
+{
+	struct btrfs_root *root = fs_info->free_space_root;
+	struct btrfs_free_space_info *info;
+	struct btrfs_key key, found_key;
+	struct extent_buffer *leaf;
+	unsigned long *bitmap;
+	u64 start, end, offset;
+	u64 extent_start = 0;	/* Initialize to silence GCC. */
+	u32 bitmap_size, flags, expected_extent_count, extent_count = 0;
+	int prev_bit = 0, bit, bitnr;
+	int done = 0, nr, ret;
+
+	bitmap_size = free_space_bitmap_size(block_group->key.offset,
+					     block_group->sectorsize);
+	bitmap = alloc_bitmap(bitmap_size);
+	if (!bitmap) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	start = block_group->key.objectid;
+	end = block_group->key.objectid + block_group->key.offset;
+
+	key.objectid = end - 1;
+	key.type = (u8)-1;
+	key.offset = (u64)-1;
+
+	while (!done) {
+		ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
+		if (ret)
+			goto out;
+
+		leaf = path->nodes[0];
+		nr = 0;
+		path->slots[0]++;
+		while (path->slots[0] > 0) {
+			btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1);
+
+			if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) {
+				ASSERT(found_key.objectid == block_group->key.objectid);
+				ASSERT(found_key.offset == block_group->key.offset);
+				done = 1;
+				break;
+			} else if (found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) {
+				unsigned long ptr;
+				char *bitmap_cursor;
+				u32 bitmap_pos, data_size;
+
+				ASSERT(found_key.objectid >= start);
+				ASSERT(found_key.objectid < end);
+				ASSERT(found_key.objectid + found_key.offset <= end);
+
+				bitmap_pos = div_u64(found_key.objectid - start,
+						     block_group->sectorsize *
+						     BITS_PER_BYTE);
+				bitmap_cursor = ((char *)bitmap) + bitmap_pos;
+				data_size = free_space_bitmap_size(found_key.offset,
+								   block_group->sectorsize);
+
+				ptr = btrfs_item_ptr_offset(leaf, path->slots[0] - 1);
+				read_extent_buffer(leaf, bitmap_cursor, ptr,
+						   data_size);
+
+				nr++;
+				path->slots[0]--;
+			} else {
+				/* Slot is not advanced here: fail, don't spin. */
+				ASSERT(0);
+				ret = -EIO;
+				goto out;
+			}
+		}
+
+		ret = btrfs_del_items(trans, root, path, path->slots[0], nr);
+		if (ret)
+			goto out;
+		btrfs_release_path(path);
+	}
+
+	info = search_free_space_info(trans, fs_info, block_group, path, 1);
+	if (IS_ERR(info)) {
+		ret = PTR_ERR(info);
+		goto out;
+	}
+	leaf = path->nodes[0];
+	flags = btrfs_free_space_flags(leaf, info);
+	flags &= ~BTRFS_FREE_SPACE_USING_BITMAPS;
+	btrfs_set_free_space_flags(leaf, info, flags);
+	expected_extent_count = btrfs_free_space_extent_count(leaf, info);
+	btrfs_mark_buffer_dirty(leaf);
+	btrfs_release_path(path);
+
+	offset = start;
+	bitnr = 0;
+	while (offset < end) {
+		bit = !!test_bit(bitnr, bitmap);
+		if (prev_bit == 0 && bit == 1) {
+			extent_start = offset;
+		} else if (prev_bit == 1 && bit == 0) {
+			key.objectid = extent_start;
+			key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
+			key.offset = offset - extent_start;
+
+			ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+			if (ret)
+				goto out;
+			btrfs_release_path(path);
+
+			extent_count++;
+		}
+		prev_bit = bit;
+		offset += block_group->sectorsize;
+		bitnr++;
+	}
+	if (prev_bit == 1) {
+		key.objectid = extent_start;
+		key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
+		key.offset = end - extent_start;
+
+		ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+		if (ret)
+			goto out;
+		btrfs_release_path(path);
+
+		extent_count++;
+	}
+
+	if (extent_count != expected_extent_count) {
+		btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u",
+			  block_group->key.objectid, extent_count,
+			  expected_extent_count);
+		ASSERT(0);
+		ret = -EIO;
+		goto out;
+	}
+
+	ret = 0;
+out:
+	kvfree(bitmap);
+	if (ret)
+		btrfs_abort_transaction(trans, root, ret);
+	return ret;
+}
+
+static int update_free_space_extent_count(struct btrfs_trans_handle *trans,
+					  struct btrfs_fs_info *fs_info,
+					  struct btrfs_block_group_cache *block_group,
+					  struct btrfs_path *path,
+					  int new_extents)
+{
+	struct btrfs_free_space_info *info;
+	u32 flags;
+	u32 extent_count;
+	int ret = 0;
+
+	if (new_extents == 0)
+		return 0;
+
+	info = search_free_space_info(trans, fs_info, block_group, path, 1);
+	if (IS_ERR(info)) {
+		ret = PTR_ERR(info);
+		goto out;
+	}
+	flags = btrfs_free_space_flags(path->nodes[0], info);
+	extent_count = btrfs_free_space_extent_count(path->nodes[0], info);
+
+	extent_count += new_extents;
+	btrfs_set_free_space_extent_count(path->nodes[0], info, extent_count);
+	btrfs_mark_buffer_dirty(path->nodes[0]);
+	btrfs_release_path(path);
+
+	if (!(flags & BTRFS_FREE_SPACE_USING_BITMAPS) &&
+	    extent_count > block_group->bitmap_high_thresh) {
+		ret = convert_free_space_to_bitmaps(trans, fs_info, block_group,
+						    path);
+	} else if ((flags & BTRFS_FREE_SPACE_USING_BITMAPS) &&
+		   extent_count < block_group->bitmap_low_thresh) {
+		ret = convert_free_space_to_extents(trans, fs_info, block_group,
+						    path);
+	}
+
+out:
+	return ret;
+}
+
+int free_space_test_bit(struct btrfs_block_group_cache *block_group,
+			struct btrfs_path *path, u64 offset)
+{
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+	u64 found_start, found_end;
+	unsigned long ptr, i;
+
+	leaf = path->nodes[0];
+	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+	ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY);
+
+	found_start = key.objectid;
+	found_end = key.objectid + key.offset;
+	ASSERT(offset >= found_start && offset < found_end);
+
+	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+	i = div_u64(offset - found_start, block_group->sectorsize);
+	return !!extent_buffer_test_bit(leaf, ptr, i);
+}
+
+static void free_space_set_bits(struct btrfs_block_group_cache *block_group,
+				struct btrfs_path *path, u64 *start, u64 *size,
+				int bit)
+{
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+	u64 end = *start + *size;
+	u64 found_start, found_end;
+	unsigned long ptr, first, last;
+
+	leaf = path->nodes[0];
+	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
+	ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY);
+
+	found_start = key.objectid;
+	found_end = key.objectid + key.offset;
+	ASSERT(*start >= found_start && *start < found_end);
+	ASSERT(end > found_start);
+
+	if (end > found_end)
+		end = found_end;
+
+	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+	first = div_u64(*start - found_start, block_group->sectorsize);
+	last = div_u64(end - found_start, block_group->sectorsize);
+	if (bit)
+		extent_buffer_bitmap_set(leaf, ptr, first, last - first);
+	else
+		extent_buffer_bitmap_clear(leaf, ptr, first, last - first);
+	btrfs_mark_buffer_dirty(leaf);
+
+	*size -= end - *start;
+	*start = end;
+}
+
+/*
+ * We can't use btrfs_next_item() in modify_free_space_bitmap() because
+ * btrfs_next_leaf() doesn't get the path for writing. We can forgo the fancy
+ * tree walking in btrfs_next_leaf() anyways because we know exactly what we're
+ * looking for.
+ */
+static int free_space_next_bitmap(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root, struct btrfs_path *p)
+{
+	struct btrfs_key key;
+
+	if (p->slots[0] + 1 < btrfs_header_nritems(p->nodes[0])) {
+		p->slots[0]++;
+		return 0;
+	}
+
+	btrfs_item_key_to_cpu(p->nodes[0], &key, p->slots[0]);
+	btrfs_release_path(p);
+
+	key.objectid += key.offset;
+	key.type = (u8)-1;
+	key.offset = (u64)-1;
+
+	return btrfs_search_prev_slot(trans, root, &key, p, 0, 1);
+}
+
+/*
+ * If remove is 1, then we are removing free space, thus clearing bits in the
+ * bitmap. If remove is 0, then we are adding free space, thus setting bits in
+ * the bitmap.
+ */
+static int modify_free_space_bitmap(struct btrfs_trans_handle *trans,
+				    struct btrfs_fs_info *fs_info,
+				    struct btrfs_block_group_cache *block_group,
+				    struct btrfs_path *path,
+				    u64 start, u64 size, int remove)
+{
+	struct btrfs_root *root = fs_info->free_space_root;
+	struct btrfs_key key;
+	u64 end = start + size;
+	u64 cur_start, cur_size;
+	int prev_bit, next_bit;
+	int new_extents;
+	int ret;
+
+	/*
+	 * Read the bit for the block immediately before the extent of space if
+	 * that block is within the block group.
+	 */
+	if (start > block_group->key.objectid) {
+		u64 prev_block = start - block_group->sectorsize;
+
+		key.objectid = prev_block;
+		key.type = (u8)-1;
+		key.offset = (u64)-1;
+
+		ret = btrfs_search_prev_slot(trans, root, &key, path, 0, 1);
+		if (ret)
+			goto out;
+
+		prev_bit = free_space_test_bit(block_group, path, prev_block);
+
+		/* The previous block may have been in the previous bitmap. */
+		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+		if (start >= key.objectid + key.offset) {
+			ret = free_space_next_bitmap(trans, root, path);
+			if (ret)
+				goto out;
+		}
+	} else {
+		key.objectid = start;
+		key.type = (u8)-1;
+		key.offset = (u64)-1;
+
+		ret = btrfs_search_prev_slot(trans, root, &key, path, 0, 1);
+		if (ret)
+			goto out;
+
+		prev_bit = -1;
+	}
+
+	/*
+	 * Iterate over all of the bitmaps overlapped by the extent of space,
+	 * clearing/setting bits as required.
+	 */
+	cur_start = start;
+	cur_size = size;
+	while (1) {
+		free_space_set_bits(block_group, path, &cur_start, &cur_size,
+				    !remove);
+		if (cur_size == 0)
+			break;
+		ret = free_space_next_bitmap(trans, root, path);
+		if (ret)
+			goto out;
+	}
+
+	/*
+	 * Read the bit for the block immediately after the extent of space if
+	 * that block is within the block group.
+	 */
+	if (end < block_group->key.objectid + block_group->key.offset) {
+		/* The next block may be in the next bitmap. */
+		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+		if (end >= key.objectid + key.offset) {
+			ret = free_space_next_bitmap(trans, root, path);
+			if (ret)
+				goto out;
+		}
+
+		next_bit = free_space_test_bit(block_group, path, end);
+	} else {
+		next_bit = -1;
+	}
+
+	if (remove) {
+		new_extents = -1;
+		if (prev_bit == 1) {
+			/* Leftover on the left. */
+			new_extents++;
+		}
+		if (next_bit == 1) {
+			/* Leftover on the right. */
+			new_extents++;
+		}
+	} else {
+		new_extents = 1;
+		if (prev_bit == 1) {
+			/* Merging with neighbor on the left. */
+			new_extents--;
+		}
+		if (next_bit == 1) {
+			/* Merging with neighbor on the right. */
+			new_extents--;
+		}
+	}
+
+	btrfs_release_path(path);
+	ret = update_free_space_extent_count(trans, fs_info, block_group, path,
+					     new_extents);
+
+out:
+	return ret;
+}
+
+static int remove_free_space_extent(struct btrfs_trans_handle *trans,
+				    struct btrfs_fs_info *fs_info,
+				    struct btrfs_block_group_cache *block_group,
+				    struct btrfs_path *path,
+				    u64 start, u64 size)
+{
+	struct btrfs_root *root = fs_info->free_space_root;
+	struct btrfs_key key;
+	u64 found_start, found_end;
+	u64 end = start + size;
+	int new_extents = -1;
+	int ret;
+
+	key.objectid = start;
+	key.type = (u8)-1;
+	key.offset = (u64)-1;
+
+	ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
+	if (ret)
+		goto out;
+
+	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+	ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY);
+
+	found_start = key.objectid;
+	found_end = key.objectid + key.offset;
+	ASSERT(start >= found_start && end <= found_end);
+
+	/*
+	 * Okay, now that we've found the free space extent which contains the
+	 * free space that we are removing, there are four cases:
+	 *
+	 * 1. We're using the whole extent: delete the key we found and
+	 * decrement the free space extent count.
+	 * 2. We are using part of the extent starting at the beginning: delete
+	 * the key we found and insert a new key representing the leftover at
+	 * the end. There is no net change in the number of extents.
+	 * 3. We are using part of the extent ending at the end: delete the key
+	 * we found and insert a new key representing the leftover at the
+	 * beginning. There is no net change in the number of extents.
+	 * 4. We are using part of the extent in the middle: delete the key we
+	 * found and insert two new keys representing the leftovers on each
+	 * side. Where we used to have one extent, we now have two, so increment
+	 * the extent count. We may need to convert the block group to bitmaps
+	 * as a result.
+	 */
+
+	/* Delete the existing key (cases 1-4). */
+	ret = btrfs_del_item(trans, root, path);
+	if (ret)
+		goto out;
+
+	/* Add a key for leftovers at the beginning (cases 3 and 4). */
+	if (start > found_start) {
+		key.objectid = found_start;
+		key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
+		key.offset = start - found_start;
+
+		btrfs_release_path(path);
+		ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+		if (ret)
+			goto out;
+		new_extents++;
+	}
+
+	/* Add a key for leftovers at the end (cases 2 and 4). */
+	if (end < found_end) {
+		key.objectid = end;
+		key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
+		key.offset = found_end - end;
+
+		btrfs_release_path(path);
+		ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
+		if (ret)
+			goto out;
+		new_extents++;
+	}
+
+	btrfs_release_path(path);
+	ret = update_free_space_extent_count(trans, fs_info, block_group, path,
+					     new_extents);
+
+out:
+	return ret;
+}
+
+int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
+				  struct btrfs_fs_info *fs_info,
+				  struct btrfs_block_group_cache *block_group,
+				  struct btrfs_path *path, u64 start, u64 size)
+{
+	struct btrfs_free_space_info *info;
+	u32 flags;
+	int ret;
+
+	if (block_group->needs_free_space) {
+		ret = __add_block_group_free_space(trans, fs_info, block_group,
+						   path);
+		if (ret)
+			return ret;
+	}
+
+	info = search_free_space_info(NULL, fs_info, block_group, path, 0);
+	if (IS_ERR(info))
+		return PTR_ERR(info);
+	flags = btrfs_free_space_flags(path->nodes[0], info);
+	btrfs_release_path(path);
+
+	if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
+		return modify_free_space_bitmap(trans, fs_info, block_group,
+						path, start, size, 1);
+	} else {
+		return remove_free_space_extent(trans, fs_info, block_group,
+						path, start, size);
+	}
+}
+
+int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
+				struct btrfs_fs_info *fs_info,
+				u64 start, u64 size)
+{
+	struct btrfs_block_group_cache *block_group;
+	struct btrfs_path *path;
+	int ret;
+
+	if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+		return 0;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	block_group = btrfs_lookup_block_group(fs_info, start);
+	if (!block_group) {
+		ASSERT(0);
+		ret = -ENOENT;
+		goto out;
+	}
+
+	mutex_lock(&block_group->free_space_lock);
+	ret = __remove_from_free_space_tree(trans, fs_info, block_group, path,
+					    start, size);
+	mutex_unlock(&block_group->free_space_lock);
+
+	btrfs_put_block_group(block_group);
+out:
+	btrfs_free_path(path);
+	if (ret)
+		btrfs_abort_transaction(trans, fs_info->free_space_root, ret);
+	return ret;
+}
+
+static int add_free_space_extent(struct btrfs_trans_handle *trans,
+				 struct btrfs_fs_info *fs_info,
+				 struct btrfs_block_group_cache *block_group,
+				 struct btrfs_path *path,
+				 u64 start, u64 size)
+{
+	struct btrfs_root *root = fs_info->free_space_root;
+	struct btrfs_key key, new_key;
+	u64 found_start, found_end;
+	u64 end = start + size;
+	int new_extents = 1;
+	int ret;
+
+	/*
+	 * We are adding a new extent of free space, but we need to merge
+	 * extents. There are four cases here:
+	 *
+	 * 1. The new extent does not have any immediate neighbors to merge
+	 * with: add the new key and increment the free space extent count. We
+	 * may need to convert the block group to bitmaps as a result.
+	 * 2. The new extent has an immediate neighbor before it: remove the
+	 * previous key and insert a new key combining both of them. There is no
+	 * net change in the number of extents.
+	 * 3. The new extent has an immediate neighbor after it: remove the next
+	 * key and insert a new key combining both of them. There is no net
+	 * change in the number of extents.
+	 * 4. The new extent has immediate neighbors on both sides: remove both
+	 * of the keys and insert a new key combining all of them. Where we used
+	 * to have two extents, we now have one, so decrement the extent count.
+	 */
+
+	new_key.objectid = start;
+	new_key.type = BTRFS_FREE_SPACE_EXTENT_KEY;
+	new_key.offset = size;
+
+	/* Search for a neighbor on the left. */
+	if (start == block_group->key.objectid)
+		goto right;
+	key.objectid = start - 1;
+	key.type = (u8)-1;
+	key.offset = (u64)-1;
+
+	ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
+	if (ret)
+		goto out;
+
+	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+	if (key.type != BTRFS_FREE_SPACE_EXTENT_KEY) {
+		ASSERT(key.type == BTRFS_FREE_SPACE_INFO_KEY);
+		btrfs_release_path(path);
+		goto right;
+	}
+
+	found_start = key.objectid;
+	found_end = key.objectid + key.offset;
+	ASSERT(found_start >= block_group->key.objectid &&
+	       found_end > block_group->key.objectid);
+	ASSERT(found_start < start && found_end <= start);
+
+	/*
+	 * Delete the neighbor on the left and absorb it into the new key (cases
+	 * 2 and 4).
+	 */
+	if (found_end == start) {
+		ret = btrfs_del_item(trans, root, path);
+		if (ret)
+			goto out;
+		new_key.objectid = found_start;
+		new_key.offset += key.offset;
+		new_extents--;
+	}
+	btrfs_release_path(path);
+
+right:
+	/* Search for a neighbor on the right. */
+	if (end == block_group->key.objectid + block_group->key.offset)
+		goto insert;
+	key.objectid = end;
+	key.type = (u8)-1;
+	key.offset = (u64)-1;
+
+	ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
+	if (ret)
+		goto out;
+
+	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+	if (key.type != BTRFS_FREE_SPACE_EXTENT_KEY) {
+		ASSERT(key.type == BTRFS_FREE_SPACE_INFO_KEY);
+		btrfs_release_path(path);
+		goto insert;
+	}
+
+	found_start = key.objectid;
+	found_end = key.objectid + key.offset;
+	ASSERT(found_start >= block_group->key.objectid &&
+	       found_end > block_group->key.objectid);
+	ASSERT((found_start < start && found_end <= start) ||
+	       (found_start >= end && found_end > end));
+
+	/*
+	 * Delete the neighbor on the right and absorb it into the new key
+	 * (cases 3 and 4).
+	 */
+	if (found_start == end) {
+		ret = btrfs_del_item(trans, root, path);
+		if (ret)
+			goto out;
+		new_key.offset += key.offset;
+		new_extents--;
+	}
+	btrfs_release_path(path);
+
+insert:
+	/* Insert the new key (cases 1-4). */
+	ret = btrfs_insert_empty_item(trans, root, path, &new_key, 0);
+	if (ret)
+		goto out;
+
+	btrfs_release_path(path);
+	ret = update_free_space_extent_count(trans, fs_info, block_group, path,
+					     new_extents);
+
+out:
+	return ret;
+}
+
+int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
+			     struct btrfs_fs_info *fs_info,
+			     struct btrfs_block_group_cache *block_group,
+			     struct btrfs_path *path, u64 start, u64 size)
+{
+	struct btrfs_free_space_info *info;
+	u32 flags;
+	int ret;
+
+	if (block_group->needs_free_space) {
+		ret = __add_block_group_free_space(trans, fs_info, block_group,
+						   path);
+		if (ret)
+			return ret;
+	}
+
+	info = search_free_space_info(NULL, fs_info, block_group, path, 0);
+	if (IS_ERR(info))
+		return PTR_ERR(info);
+	flags = btrfs_free_space_flags(path->nodes[0], info);
+	btrfs_release_path(path);
+
+	if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
+		return modify_free_space_bitmap(trans, fs_info, block_group,
+						path, start, size, 0);
+	} else {
+		return add_free_space_extent(trans, fs_info, block_group, path,
+					     start, size);
+	}
+}
+
+int add_to_free_space_tree(struct btrfs_trans_handle *trans,
+			   struct btrfs_fs_info *fs_info,
+			   u64 start, u64 size)
+{
+	struct btrfs_block_group_cache *block_group;
+	struct btrfs_path *path;
+	int ret;
+
+	if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+		return 0;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	block_group = btrfs_lookup_block_group(fs_info, start);
+	if (!block_group) {
+		ASSERT(0);
+		ret = -ENOENT;
+		goto out;
+	}
+
+	mutex_lock(&block_group->free_space_lock);
+	ret = __add_to_free_space_tree(trans, fs_info, block_group, path, start,
+				       size);
+	mutex_unlock(&block_group->free_space_lock);
+
+	btrfs_put_block_group(block_group);
+out:
+	btrfs_free_path(path);
+	if (ret)
+		btrfs_abort_transaction(trans, fs_info->free_space_root, ret);
+	return ret;
+}
+
+/*
+ * Populate the free space tree by walking the extent tree. Operations on the
+ * extent tree that happen as a result of writes to the free space tree will go
+ * through the normal add/remove hooks.
+ */
+static int populate_free_space_tree(struct btrfs_trans_handle *trans,
+				    struct btrfs_fs_info *fs_info,
+				    struct btrfs_block_group_cache *block_group)
+{
+	struct btrfs_root *extent_root = fs_info->extent_root;
+	struct btrfs_path *path, *path2;
+	struct btrfs_key key;
+	u64 start, end;
+	int ret;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+	path->reada = 1;
+
+	path2 = btrfs_alloc_path();
+	if (!path2) {
+		btrfs_free_path(path);
+		return -ENOMEM;
+	}
+
+	ret = add_new_free_space_info(trans, fs_info, block_group, path2);
+	if (ret)
+		goto out;
+
+	mutex_lock(&block_group->free_space_lock);
+
+	/*
+	 * Iterate through all of the extent and metadata items in this block
+	 * group, adding the free space between them and the free space at the
+	 * end. Note that EXTENT_ITEM and METADATA_ITEM are less than
+	 * BLOCK_GROUP_ITEM, so an extent may precede the block group that it's
+	 * contained in.
+	 */
+	key.objectid = block_group->key.objectid;
+	key.type = BTRFS_EXTENT_ITEM_KEY;
+	key.offset = 0;
+
+	ret = btrfs_search_slot_for_read(extent_root, &key, path, 1, 0);
+	if (ret < 0)
+		goto out_locked;
+	ASSERT(ret == 0);
+
+	start = block_group->key.objectid;
+	end = block_group->key.objectid + block_group->key.offset;
+	while (1) {
+		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+
+		if (key.type == BTRFS_EXTENT_ITEM_KEY ||
+		    key.type == BTRFS_METADATA_ITEM_KEY) {
+			if (key.objectid >= end)
+				break;
+
+			if (start < key.objectid) {
+				ret = __add_to_free_space_tree(trans, fs_info,
+							       block_group,
+							       path2, start,
+							       key.objectid -
+							       start);
+				if (ret)
+					goto out_locked;
+			}
+			start = key.objectid;
+			if (key.type == BTRFS_METADATA_ITEM_KEY)
+				start += fs_info->tree_root->nodesize;
+			else
+				start += key.offset;
+		} else if (key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
+			if (key.objectid != block_group->key.objectid)
+				break;
+		}
+
+		ret = btrfs_next_item(extent_root, path);
+		if (ret < 0)
+			goto out_locked;
+		if (ret)
+			break;
+	}
+	if (start < end) {
+		ret = __add_to_free_space_tree(trans, fs_info, block_group,
+					       path2, start, end - start);
+		if (ret)
+			goto out_locked;
+	}
+
+	ret = 0;
+out_locked:
+	mutex_unlock(&block_group->free_space_lock);
+out:
+	btrfs_free_path(path2);
+	btrfs_free_path(path);
+	return ret;
+}
+
+int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *tree_root = fs_info->tree_root;
+	struct btrfs_root *free_space_root;
+	struct btrfs_block_group_cache *block_group;
+	struct rb_node *node;
+	int ret;
+	/* Create and populate the free space tree inside one transaction. */
+	trans = btrfs_start_transaction(tree_root, 0);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+	/* The flag stays set while building; both exit paths clear it. */
+	fs_info->creating_free_space_tree = 1;
+	free_space_root = btrfs_create_tree(trans, fs_info,
+					    BTRFS_FREE_SPACE_TREE_OBJECTID);
+	if (IS_ERR(free_space_root)) {
+		ret = PTR_ERR(free_space_root);
+		goto abort;
+	}
+	fs_info->free_space_root = free_space_root;
+	/* Walk block_group_cache_tree and populate the tree per group. */
+	node = rb_first(&fs_info->block_group_cache_tree);
+	while (node) {
+		block_group = rb_entry(node, struct btrfs_block_group_cache,
+				       cache_node);
+		ret = populate_free_space_tree(trans, fs_info, block_group);
+		if (ret)
+			goto abort;
+		node = rb_next(node);
+	}
+	/* Every group succeeded: set the compat_ro bit, then commit. */
+	btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE);
+	fs_info->creating_free_space_tree = 0;
+
+	ret = btrfs_commit_transaction(trans, tree_root);
+	if (ret)
+		return ret;
+
+	return 0;
+	/* Error path: abort and end the transaction, return the error. */
+abort:
+	fs_info->creating_free_space_tree = 0;
+	btrfs_abort_transaction(trans, tree_root, ret);
+	btrfs_end_transaction(trans, tree_root);
+	return ret;
+}
+
+static int clear_free_space_tree(struct btrfs_trans_handle *trans,
+				 struct btrfs_root *root)
+{
+	struct btrfs_path *path;
+	struct btrfs_key key;
+	int nr;
+	int ret;
+	/* Delete every item in root, one whole leaf per loop iteration. */
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	path->leave_spinning = 1;
+	/* All-zero key: the lowest possible search key. */
+	key.objectid = 0;
+	key.type = 0;
+	key.offset = 0;
+
+	while (1) {
+		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+		if (ret < 0)
+			goto out;
+		/* A leaf with no items means there is nothing left to delete. */
+		nr = btrfs_header_nritems(path->nodes[0]);
+		if (!nr)
+			break;
+		/* Remove all nr items of this leaf, starting at slot 0. */
+		path->slots[0] = 0;
+		ret = btrfs_del_items(trans, root, path, 0, nr);
+		if (ret)
+			goto out;
+
+		btrfs_release_path(path);
+	}
+
+	ret = 0;
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *tree_root = fs_info->tree_root;
+	struct btrfs_root *free_space_root = fs_info->free_space_root;
+	int ret;
+	/* Delete the free space tree and its root within one transaction. */
+	trans = btrfs_start_transaction(tree_root, 0);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+	/* Drop the compat_ro bit and unhook the root from fs_info up front. */
+	btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE);
+	fs_info->free_space_root = NULL;
+	/* Empty the tree first, then delete its root via tree_root. */
+	ret = clear_free_space_tree(trans, free_space_root);
+	if (ret)
+		goto abort;
+
+	ret = btrfs_del_root(trans, tree_root, &free_space_root->root_key);
+	if (ret)
+		goto abort;
+
+	list_del(&free_space_root->dirty_list);
+	/* Clean the root node under its tree lock, then free that block. */
+	btrfs_tree_lock(free_space_root->node);
+	clean_tree_block(trans, tree_root->fs_info, free_space_root->node);
+	btrfs_tree_unlock(free_space_root->node);
+	btrfs_free_tree_block(trans, free_space_root, free_space_root->node,
+			      0, 1);
+	/* Release the root's extent buffers and the root structure itself. */
+	free_extent_buffer(free_space_root->node);
+	free_extent_buffer(free_space_root->commit_root);
+	kfree(free_space_root);
+
+	ret = btrfs_commit_transaction(trans, tree_root);
+	if (ret)
+		return ret;
+
+	return 0;
+	/* NOTE(review): free_space_root is not freed on this path; confirm. */
+abort:
+	btrfs_abort_transaction(trans, tree_root, ret);
+	btrfs_end_transaction(trans, tree_root);
+	return ret;
+}
+
+/*
+ * Insert the free space info item for block_group, then add the group's whole
+ * range (start key.objectid, length key.offset) to the free space tree.
+ */
+static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
+					struct btrfs_fs_info *fs_info,
+					struct btrfs_block_group_cache *block_group,
+					struct btrfs_path *path)
+{
+	int ret;
+
+	block_group->needs_free_space = 0;	/* cleared even if a step below fails */
+
+	ret = add_new_free_space_info(trans, fs_info, block_group, path);
+	if (ret)
+		return ret;
+
+	return __add_to_free_space_tree(trans, fs_info, block_group, path,
+					block_group->key.objectid,
+					block_group->key.offset);
+}
+
+int add_block_group_free_space(struct btrfs_trans_handle *trans,
+			       struct btrfs_fs_info *fs_info,
+			       struct btrfs_block_group_cache *block_group)
+{
+	struct btrfs_path *path = NULL;
+	int ret = 0;
+	/* Add block_group to the free space tree if that is still pending. */
+	if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+		return 0;
+	/* needs_free_space is checked with free_space_lock held. */
+	mutex_lock(&block_group->free_space_lock);
+	if (!block_group->needs_free_space)
+		goto out;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = __add_block_group_free_space(trans, fs_info, block_group, path);
+	/* path may be NULL here (early exit or failed allocation). */
+out:
+	btrfs_free_path(path);
+	mutex_unlock(&block_group->free_space_lock);
+	if (ret)	/* any failure aborts the transaction */
+		btrfs_abort_transaction(trans, fs_info->free_space_root, ret);
+	return ret;
+}
+
+int remove_block_group_free_space(struct btrfs_trans_handle *trans,
+				  struct btrfs_fs_info *fs_info,
+				  struct btrfs_block_group_cache *block_group)
+{
+	struct btrfs_root *root = fs_info->free_space_root;
+	struct btrfs_path *path;
+	struct btrfs_key key, found_key;
+	struct extent_buffer *leaf;
+	u64 start, end;
+	int done = 0, nr;
+	int ret;
+	/* Delete every free space tree item that belongs to block_group. */
+	if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
+		return 0;
+
+	if (block_group->needs_free_space) {
+		/* We never added this block group to the free space tree. */
+		return 0;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	start = block_group->key.objectid;
+	end = block_group->key.objectid + block_group->key.offset;
+	/* Start from the largest key that can fall inside the block group. */
+	key.objectid = end - 1;
+	key.type = (u8)-1;
+	key.offset = (u64)-1;
+	/* Each pass removes a run of items from one leaf; the info item ends it. */
+	while (!done) {
+		ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1);
+		if (ret)
+			goto out;
+		/* Scan the leaf backwards from the found slot, counting into nr. */
+		leaf = path->nodes[0];
+		nr = 0;
+		path->slots[0]++;
+		while (path->slots[0] > 0) {
+			btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1);
+
+			if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) {
+				ASSERT(found_key.objectid == block_group->key.objectid);
+				ASSERT(found_key.offset == block_group->key.offset);
+				done = 1;
+				nr++;
+				path->slots[0]--;
+				break;
+			} else if (found_key.type == BTRFS_FREE_SPACE_EXTENT_KEY ||
+				   found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) {
+				ASSERT(found_key.objectid >= start);
+				ASSERT(found_key.objectid < end);
+				ASSERT(found_key.objectid + found_key.offset <= end);
+				nr++;
+				path->slots[0]--;
+			} else {
+				ASSERT(0); /* NOTE(review): slot is not advanced on this branch */
+			}
+		}
+		/* slots[0] now indexes the lowest counted item; delete nr from it. */
+		ret = btrfs_del_items(trans, root, path, path->slots[0], nr);
+		if (ret)
+			goto out;
+		btrfs_release_path(path);
+	}
+
+	ret = 0;
+out:
+	btrfs_free_path(path);
+	if (ret)
+		btrfs_abort_transaction(trans, root, ret);
+	return ret;
+}
+
+static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl,
+				   struct btrfs_path *path,
+				   u32 expected_extent_count)
+{
+	struct btrfs_block_group_cache *block_group;
+	struct btrfs_fs_info *fs_info;
+	struct btrfs_root *root;
+	struct btrfs_key key;
+	int prev_bit = 0, bit;
+	/* Initialize to silence GCC. */
+	u64 extent_start = 0;
+	u64 end, offset;
+	u64 total_found = 0;
+	u32 extent_count = 0;
+	int ret;
+	/* Pass each run of set bits in the bitmap items to add_new_free_space(). */
+	block_group = caching_ctl->block_group;
+	fs_info = block_group->fs_info;
+	root = fs_info->free_space_root;
+
+	end = block_group->key.objectid + block_group->key.offset;
+	/* Step through the items that follow path's current position. */
+	while (1) {
+		ret = btrfs_next_item(root, path);
+		if (ret < 0)
+			goto out;
+		if (ret)
+			break;
+
+		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+		/* Reaching another FREE_SPACE_INFO item ends the scan. */
+		if (key.type == BTRFS_FREE_SPACE_INFO_KEY)
+			break;
+
+		ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY);
+		ASSERT(key.objectid < end && key.objectid + key.offset <= end);
+
+		caching_ctl->progress = key.objectid;
+		/* One bit per sectorsize step: 0->1 opens a run, 1->0 closes it. */
+		offset = key.objectid;
+		while (offset < key.objectid + key.offset) {
+			bit = free_space_test_bit(block_group, path, offset);
+			if (prev_bit == 0 && bit == 1) {
+				extent_start = offset;
+			} else if (prev_bit == 1 && bit == 0) {
+				total_found += add_new_free_space(block_group,
+								  fs_info,
+								  extent_start,
+								  offset);
+				if (total_found > CACHING_CTL_WAKE_UP) {
+					total_found = 0;
+					wake_up(&caching_ctl->wait);
+				}
+				extent_count++;
+			}
+			prev_bit = bit;
+			offset += block_group->sectorsize;
+		}
+	}
+	if (prev_bit == 1) {	/* a run was still open when the scan ended */
+		total_found += add_new_free_space(block_group, fs_info,
+						  extent_start, end);
+		extent_count++;
+	}
+	/* The number of runs found must equal the count passed by the caller. */
+	if (extent_count != expected_extent_count) {
+		btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u",
+			  block_group->key.objectid, extent_count,
+			  expected_extent_count);
+		ASSERT(0);
+		ret = -EIO;
+		goto out;
+	}
+
+	caching_ctl->progress = (u64)-1;
+
+	ret = 0;
+out:
+	return ret;
+}
+
+static int load_free_space_extents(struct btrfs_caching_control *caching_ctl,
+				   struct btrfs_path *path,
+				   u32 expected_extent_count)
+{
+	struct btrfs_block_group_cache *block_group;
+	struct btrfs_fs_info *fs_info;
+	struct btrfs_root *root;
+	struct btrfs_key key;
+	u64 end;
+	u64 total_found = 0;
+	u32 extent_count = 0;
+	int ret;
+	/* Pass each free space extent item's range to add_new_free_space(). */
+	block_group = caching_ctl->block_group;
+	fs_info = block_group->fs_info;
+	root = fs_info->free_space_root;
+
+	end = block_group->key.objectid + block_group->key.offset;
+	/* Step through the items that follow path's current position. */
+	while (1) {
+		ret = btrfs_next_item(root, path);
+		if (ret < 0)
+			goto out;
+		if (ret)
+			break;
+
+		btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+		/* Reaching another FREE_SPACE_INFO item ends the scan. */
+		if (key.type == BTRFS_FREE_SPACE_INFO_KEY)
+			break;
+
+		ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY);
+		ASSERT(key.objectid < end && key.objectid + key.offset <= end);
+
+		caching_ctl->progress = key.objectid;
+		/* The key itself encodes the range: objectid is start, offset is length. */
+		total_found += add_new_free_space(block_group, fs_info,
+						  key.objectid,
+						  key.objectid + key.offset);
+		if (total_found > CACHING_CTL_WAKE_UP) {
+			total_found = 0;
+			wake_up(&caching_ctl->wait);
+		}
+		extent_count++;
+	}
+	/* The number of items seen must equal the count passed by the caller. */
+	if (extent_count != expected_extent_count) {
+		btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u",
+			  block_group->key.objectid, extent_count,
+			  expected_extent_count);
+		ASSERT(0);
+		ret = -EIO;
+		goto out;
+	}
+
+	caching_ctl->progress = (u64)-1;
+
+	ret = 0;
+out:
+	return ret;
+}
+
+int load_free_space_tree(struct btrfs_caching_control *caching_ctl)
+{
+	struct btrfs_block_group_cache *block_group;
+	struct btrfs_fs_info *fs_info;
+	struct btrfs_free_space_info *info;
+	struct btrfs_path *path;
+	u32 extent_count, flags;
+	int ret;
+	/* Load the block group's free space from its free space tree items. */
+	block_group = caching_ctl->block_group;
+	fs_info = block_group->fs_info;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	/*
+	 * Just like caching_thread() doesn't want to deadlock on the extent
+	 * tree, we don't want to deadlock on the free space tree.
+	 */
+	path->skip_locking = 1;
+	path->search_commit_root = 1;
+	path->reada = READA_FORWARD;	/* the loaders walk forward with btrfs_next_item() */
+
+	info = search_free_space_info(NULL, fs_info, block_group, path, 0);
+	if (IS_ERR(info)) {
+		ret = PTR_ERR(info);
+		goto out;
+	}
+	extent_count = btrfs_free_space_extent_count(path->nodes[0], info);
+	flags = btrfs_free_space_flags(path->nodes[0], info);
+
+	/*
+	 * We left path pointing to the free space info item, so now
+	 * load_free_space_foo can just iterate through the free space tree from
+	 * there.
+	 */
+	if (flags & BTRFS_FREE_SPACE_USING_BITMAPS)
+		ret = load_free_space_bitmaps(caching_ctl, path, extent_count);
+	else
+		ret = load_free_space_extents(caching_ctl, path, extent_count);
+	/* Returns 0, or the negative error from the search or the loader. */
+out:
+	btrfs_free_path(path);
+	return ret;
+}
diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h
new file mode 100644
index 000000000..54ffced3b
--- /dev/null
+++ b/fs/btrfs/free-space-tree.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (C) 2015 Facebook.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef __BTRFS_FREE_SPACE_TREE
+#define __BTRFS_FREE_SPACE_TREE
+
+/*
+ * Default size in bytes for new free space bitmap items. A block group's last
+ * bitmap may be truncated, and none of the free space tree code assumes that
+ * existing bitmaps are this size.
+ */
+#define BTRFS_FREE_SPACE_BITMAP_SIZE 256
+#define BTRFS_FREE_SPACE_BITMAP_BITS (BTRFS_FREE_SPACE_BITMAP_SIZE * BITS_PER_BYTE)
+
+void set_free_space_tree_thresholds(struct btrfs_block_group_cache *block_group);
+int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info);
+int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info);
+int load_free_space_tree(struct btrfs_caching_control *caching_ctl);
+int add_block_group_free_space(struct btrfs_trans_handle *trans,
+			       struct btrfs_fs_info *fs_info,
+			       struct btrfs_block_group_cache *block_group);
+int remove_block_group_free_space(struct btrfs_trans_handle *trans,
+				  struct btrfs_fs_info *fs_info,
+				  struct btrfs_block_group_cache *block_group);
+int add_to_free_space_tree(struct btrfs_trans_handle *trans,
+			   struct btrfs_fs_info *fs_info,
+			   u64 start, u64 size);
+int remove_from_free_space_tree(struct btrfs_trans_handle *trans,
+				struct btrfs_fs_info *fs_info,
+				u64 start, u64 size);
+
+/* Exposed for testing; these take the block group and path explicitly. */
+struct btrfs_free_space_info *
+search_free_space_info(struct btrfs_trans_handle *trans,
+		       struct btrfs_fs_info *fs_info,
+		       struct btrfs_block_group_cache *block_group,
+		       struct btrfs_path *path, int cow);
+int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
+			     struct btrfs_fs_info *fs_info,
+			     struct btrfs_block_group_cache *block_group,
+			     struct btrfs_path *path, u64 start, u64 size);
+int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
+				  struct btrfs_fs_info *fs_info,
+				  struct btrfs_block_group_cache *block_group,
+				  struct btrfs_path *path, u64 start, u64 size);
+int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans,
+				  struct btrfs_fs_info *fs_info,
+				  struct btrfs_block_group_cache *block_group,
+				  struct btrfs_path *path);
+int convert_free_space_to_extents(struct btrfs_trans_handle *trans,
+				  struct btrfs_fs_info *fs_info,
+				  struct btrfs_block_group_cache *block_group,
+				  struct btrfs_path *path);
+int free_space_test_bit(struct btrfs_block_group_cache *block_group,
+			struct btrfs_path *path, u64 offset);
+
+#endif /* __BTRFS_FREE_SPACE_TREE */
diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c
index 07573dc16..e50316c4a 100644
--- a/fs/btrfs/inode-map.c
+++ b/fs/btrfs/inode-map.c
@@ -48,7 +48,7 @@ static int caching_kthread(void *data)
 	/* Since the commit root is read-only, we can safely skip locking. */
 	path->skip_locking = 1;
 	path->search_commit_root = 1;
-	path->reada = 2;
+	path->reada = READA_FORWARD;
 
 	key.objectid = BTRFS_FIRST_FREE_OBJECTID;
 	key.offset = 0;
@@ -282,7 +282,7 @@ void btrfs_unpin_free_ino(struct btrfs_root *root)
 	}
 }
 
-#define INIT_THRESHOLD	(((1024 * 32) / 2) / sizeof(struct btrfs_free_space))
+#define INIT_THRESHOLD	((SZ_32K / 2) / sizeof(struct btrfs_free_space))
 #define INODES_PER_BITMAP (PAGE_CACHE_SIZE * 8)
 
 /*
@@ -334,7 +334,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl,
 	return true;
 }
 
-static struct btrfs_free_space_op free_ino_op = {
+static const struct btrfs_free_space_op free_ino_op = {
 	.recalc_thresholds	= recalculate_thresholds,
 	.use_bitmap		= use_bitmap,
 };
@@ -356,7 +356,7 @@ static bool pinned_use_bitmap(struct btrfs_free_space_ctl *ctl,
 	return false;
 }
 
-static struct btrfs_free_space_op pinned_free_ino_op = {
+static const struct btrfs_free_space_op pinned_free_ino_op = {
 	.recalc_thresholds	= pinned_recalc_thresholds,
 	.use_bitmap		= pinned_use_bitmap,
 };
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 4bc9dbf29..d96f5cf38 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -66,6 +66,13 @@ struct btrfs_iget_args {
 	struct btrfs_root *root;
 };
 
+struct btrfs_dio_data {
+	u64 outstanding_extents;
+	u64 reserve;
+	u64 unsubmitted_oe_range_start;
+	u64 unsubmitted_oe_range_end;
+};
+
 static const struct inode_operations btrfs_dir_inode_operations;
 static const struct inode_operations btrfs_symlink_inode_operations;
 static const struct inode_operations btrfs_dir_ro_inode_operations;
@@ -74,17 +81,16 @@ static const struct inode_operations btrfs_file_inode_operations;
 static const struct address_space_operations btrfs_aops;
 static const struct address_space_operations btrfs_symlink_aops;
 static const struct file_operations btrfs_dir_file_operations;
-static struct extent_io_ops btrfs_extent_io_ops;
+static const struct extent_io_ops btrfs_extent_io_ops;
 
 static struct kmem_cache *btrfs_inode_cachep;
-static struct kmem_cache *btrfs_delalloc_work_cachep;
 struct kmem_cache *btrfs_trans_handle_cachep;
 struct kmem_cache *btrfs_transaction_cachep;
 struct kmem_cache *btrfs_path_cachep;
 struct kmem_cache *btrfs_free_space_cachep;
 
 #define S_SHIFT 12
-static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
+static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = {
 	[S_IFREG >> S_SHIFT]	= BTRFS_FT_REG_FILE,
 	[S_IFDIR >> S_SHIFT]	= BTRFS_FT_DIR,
 	[S_IFCHR >> S_SHIFT]	= BTRFS_FT_CHRDEV,
@@ -414,15 +420,15 @@ static noinline void compress_file_range(struct inode *inode,
 	unsigned long nr_pages_ret = 0;
 	unsigned long total_compressed = 0;
 	unsigned long total_in = 0;
-	unsigned long max_compressed = 128 * 1024;
-	unsigned long max_uncompressed = 128 * 1024;
+	unsigned long max_compressed = SZ_128K;
+	unsigned long max_uncompressed = SZ_128K;
 	int i;
 	int will_compress;
 	int compress_type = root->fs_info->compress_type;
 	int redirty = 0;
 
 	/* if this is a small write inside eof, kick off a defrag */
-	if ((end - start + 1) < 16 * 1024 &&
+	if ((end - start + 1) < SZ_16K &&
 	    (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
 		btrfs_add_inode_defrag(NULL, inode);
 
@@ -430,7 +436,7 @@ static noinline void compress_file_range(struct inode *inode,
 again:
 	will_compress = 0;
 	nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1;
-	nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE);
+	nr_pages = min_t(unsigned long, nr_pages, SZ_128K / PAGE_CACHE_SIZE);
 
 	/*
 	 * we don't want to send crud past the end of i_size through
@@ -944,7 +950,7 @@ static noinline int cow_file_range(struct inode *inode,
 	disk_num_bytes = num_bytes;
 
 	/* if this is a small write inside eof, kick off defrag */
-	if (num_bytes < 64 * 1024 &&
+	if (num_bytes < SZ_64K &&
 	    (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
 		btrfs_add_inode_defrag(NULL, inode);
 
@@ -1107,7 +1113,7 @@ static noinline void async_cow_submit(struct btrfs_work *work)
 	 * atomic_sub_return implies a barrier for waitqueue_active
 	 */
 	if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) <
-	    5 * 1024 * 1024 &&
+	    5 * SZ_1M &&
 	    waitqueue_active(&root->fs_info->async_submit_wait))
 		wake_up(&root->fs_info->async_submit_wait);
 
@@ -1132,7 +1138,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	unsigned long nr_pages;
 	u64 cur_end;
-	int limit = 10 * 1024 * 1024;
+	int limit = 10 * SZ_1M;
 
 	clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED,
 			 1, 0, NULL, GFP_NOFS);
@@ -1148,7 +1154,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
 		    !btrfs_test_opt(root, FORCE_COMPRESS))
 			cur_end = end;
 		else
-			cur_end = min(end, start + 512 * 1024 - 1);
+			cur_end = min(end, start + SZ_512K - 1);
 
 		async_cow->end = cur_end;
 		INIT_LIST_HEAD(&async_cow->extents);
@@ -1989,7 +1995,7 @@ again:
 	page_start = page_offset(page);
 	page_end = page_offset(page) + PAGE_CACHE_SIZE - 1;
 
-	lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0,
+	lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end,
 			 &cached_state);
 
 	/* already ordered? We're done */
@@ -2482,7 +2488,7 @@ static noinline int relink_extent_backref(struct btrfs_path *path,
 	lock_start = backref->file_pos;
 	lock_end = backref->file_pos + backref->num_bytes - 1;
 	lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end,
-			 0, &cached);
+			 &cached);
 
 	ordered = btrfs_lookup_first_ordered_extent(inode, lock_end);
 	if (ordered) {
@@ -2874,7 +2880,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 
 	lock_extent_bits(io_tree, ordered_extent->file_offset,
 			 ordered_extent->file_offset + ordered_extent->len - 1,
-			 0, &cached_state);
+			 &cached_state);
 
 	ret = test_range_bit(io_tree, ordered_extent->file_offset,
 			ordered_extent->file_offset + ordered_extent->len - 1,
@@ -3106,52 +3112,46 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
 				      start, (size_t)(end - start + 1));
 }
 
-struct delayed_iput {
-	struct list_head list;
-	struct inode *inode;
-};
-
-/* JDM: If this is fs-wide, why can't we add a pointer to
- * btrfs_inode instead and avoid the allocation? */
 void btrfs_add_delayed_iput(struct inode *inode)
 {
 	struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
-	struct delayed_iput *delayed;
+	struct btrfs_inode *binode = BTRFS_I(inode);
 
 	if (atomic_add_unless(&inode->i_count, -1, 1))
 		return;
 
-	delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL);
-	delayed->inode = inode;
-
 	spin_lock(&fs_info->delayed_iput_lock);
-	list_add_tail(&delayed->list, &fs_info->delayed_iputs);
+	if (binode->delayed_iput_count == 0) {
+		ASSERT(list_empty(&binode->delayed_iput));
+		list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs);
+	} else {
+		binode->delayed_iput_count++;
+	}
 	spin_unlock(&fs_info->delayed_iput_lock);
 }
 
 void btrfs_run_delayed_iputs(struct btrfs_root *root)
 {
-	LIST_HEAD(list);
 	struct btrfs_fs_info *fs_info = root->fs_info;
-	struct delayed_iput *delayed;
-	int empty;
 
 	spin_lock(&fs_info->delayed_iput_lock);
-	empty = list_empty(&fs_info->delayed_iputs);
-	spin_unlock(&fs_info->delayed_iput_lock);
-	if (empty)
-		return;
-
-	spin_lock(&fs_info->delayed_iput_lock);
-	list_splice_init(&fs_info->delayed_iputs, &list);
-	spin_unlock(&fs_info->delayed_iput_lock);
-
-	while (!list_empty(&list)) {
-		delayed = list_entry(list.next, struct delayed_iput, list);
-		list_del(&delayed->list);
-		iput(delayed->inode);
-		kfree(delayed);
+	while (!list_empty(&fs_info->delayed_iputs)) {
+		struct btrfs_inode *inode;
+
+		inode = list_first_entry(&fs_info->delayed_iputs,
+				struct btrfs_inode, delayed_iput);
+		if (inode->delayed_iput_count) {
+			inode->delayed_iput_count--;
+			list_move_tail(&inode->delayed_iput,
+					&fs_info->delayed_iputs);
+		} else {
+			list_del_init(&inode->delayed_iput);
+		}
+		spin_unlock(&fs_info->delayed_iput_lock);
+		iput(&inode->vfs_inode);
+		spin_lock(&fs_info->delayed_iput_lock);
 	}
+	spin_unlock(&fs_info->delayed_iput_lock);
 }
 
 /*
@@ -3347,7 +3347,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 		ret = -ENOMEM;
 		goto out;
 	}
-	path->reada = -1;
+	path->reada = READA_BACK;
 
 	key.objectid = BTRFS_ORPHAN_OBJECTID;
 	key.type = BTRFS_ORPHAN_ITEM_KEY;
@@ -3546,10 +3546,10 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf,
 	int scanned = 0;
 
 	if (!xattr_access) {
-		xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS,
-					strlen(POSIX_ACL_XATTR_ACCESS));
-		xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT,
-					strlen(POSIX_ACL_XATTR_DEFAULT));
+		xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS,
+					strlen(XATTR_NAME_POSIX_ACL_ACCESS));
+		xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT,
+					strlen(XATTR_NAME_POSIX_ACL_DEFAULT));
 	}
 
 	slot++;
@@ -3770,6 +3770,7 @@ cache_acl:
 		break;
 	case S_IFLNK:
 		inode->i_op = &btrfs_symlink_inode_operations;
+		inode_nohighmem(inode);
 		inode->i_mapping->a_ops = &btrfs_symlink_aops;
 		break;
 	default:
@@ -4313,7 +4314,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-	path->reada = -1;
+	path->reada = READA_BACK;
 
 	/*
 	 * We want to drop from the next block forward in case this new size is
@@ -4344,7 +4345,7 @@ search_again:
 	 * up a huge file in a single leaf.  Most of the time that
 	 * bytes_deleted is > 0, it will be huge by the time we get here
 	 */
-	if (be_nice && bytes_deleted > 32 * 1024 * 1024) {
+	if (be_nice && bytes_deleted > SZ_32M) {
 		if (btrfs_should_end_transaction(trans, root)) {
 			err = -EAGAIN;
 			goto error;
@@ -4587,7 +4588,7 @@ error:
 
 	btrfs_free_path(path);
 
-	if (be_nice && bytes_deleted > 32 * 1024 * 1024) {
+	if (be_nice && bytes_deleted > SZ_32M) {
 		unsigned long updates = trans->delayed_ref_updates;
 		if (updates) {
 			trans->delayed_ref_updates = 0;
@@ -4664,7 +4665,7 @@ again:
 	}
 	wait_on_page_writeback(page);
 
-	lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
+	lock_extent_bits(io_tree, page_start, page_end, &cached_state);
 	set_page_extent_mapped(page);
 
 	ordered = btrfs_lookup_ordered_extent(inode, page_start);
@@ -4795,7 +4796,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size)
 	while (1) {
 		struct btrfs_ordered_extent *ordered;
 
-		lock_extent_bits(io_tree, hole_start, block_end - 1, 0,
+		lock_extent_bits(io_tree, hole_start, block_end - 1,
 				 &cached_state);
 		ordered = btrfs_lookup_ordered_range(inode, hole_start,
 						     block_end - hole_start);
@@ -4871,26 +4872,6 @@ next:
 	return err;
 }
 
-static int wait_snapshoting_atomic_t(atomic_t *a)
-{
-	schedule();
-	return 0;
-}
-
-static void wait_for_snapshot_creation(struct btrfs_root *root)
-{
-	while (true) {
-		int ret;
-
-		ret = btrfs_start_write_no_snapshoting(root);
-		if (ret)
-			break;
-		wait_on_atomic_t(&root->will_be_snapshoted,
-				 wait_snapshoting_atomic_t,
-				 TASK_UNINTERRUPTIBLE);
-	}
-}
-
 static int btrfs_setsize(struct inode *inode, struct iattr *attr)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -4922,7 +4903,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
 		 * truncation, it must capture all writes that happened before
 		 * this truncation.
 		 */
-		wait_for_snapshot_creation(root);
+		btrfs_wait_for_snapshot_creation(root);
 		ret = btrfs_cont_expand(inode, oldsize, newsize);
 		if (ret) {
 			btrfs_end_write_no_snapshoting(root);
@@ -5107,7 +5088,7 @@ static void evict_inode_truncate_pages(struct inode *inode)
 		end = state->end;
 		spin_unlock(&io_tree->lock);
 
-		lock_extent_bits(io_tree, start, end, 0, &cached_state);
+		lock_extent_bits(io_tree, start, end, &cached_state);
 
 		/*
 		 * If still has DELALLOC flag, the extent didn't reach disk,
@@ -5300,7 +5281,6 @@ void btrfs_evict_inode(struct inode *inode)
 no_delete:
 	btrfs_remove_delayed_node(inode);
 	clear_inode(inode);
-	return;
 }
 
 /*
@@ -5750,7 +5730,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx)
 	if (!path)
 		return -ENOMEM;
 
-	path->reada = 1;
+	path->reada = READA_FORWARD;
 
 	if (key_type == BTRFS_DIR_INDEX_KEY) {
 		INIT_LIST_HEAD(&ins_list);
@@ -6697,7 +6677,7 @@ static int merge_extent_mapping(struct extent_map_tree *em_tree,
 }
 
 static noinline int uncompress_inline(struct btrfs_path *path,
-				      struct inode *inode, struct page *page,
+				      struct page *page,
 				      size_t pg_offset, u64 extent_offset,
 				      struct btrfs_file_extent_item *item)
 {
@@ -6794,7 +6774,7 @@ again:
 		 * Chances are we'll be called again, so go ahead and do
 		 * readahead
 		 */
-		path->reada = 1;
+		path->reada = READA_FORWARD;
 	}
 
 	ret = btrfs_lookup_file_extent(trans, root, path,
@@ -6893,8 +6873,7 @@ next:
 		if (create == 0 && !PageUptodate(page)) {
 			if (btrfs_file_extent_compression(leaf, item) !=
 			    BTRFS_COMPRESS_NONE) {
-				ret = uncompress_inline(path, inode, page,
-							pg_offset,
+				ret = uncompress_inline(path, page, pg_offset,
 							extent_offset, item);
 				if (ret) {
 					err = ret;
@@ -7149,21 +7128,41 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
 	if (ret)
 		return ERR_PTR(ret);
 
-	em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
-			      ins.offset, ins.offset, ins.offset, 0);
-	if (IS_ERR(em)) {
-		btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
-		return em;
-	}
-
+	/*
+	 * Create the ordered extent before the extent map. This is to avoid
+	 * races with the fast fsync path that would lead to it logging file
+	 * extent items that point to disk extents that were not yet written to.
+	 * The fast fsync path collects ordered extents into a local list and
+	 * then collects all the new extent maps, so we must create the ordered
+	 * extent first and make sure the fast fsync path collects any new
+	 * ordered extents after collecting new extent maps as well.
+	 * The fsync path simply can not rely on inode_dio_wait() because it
+	 * causes deadlock with AIO.
+	 */
 	ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
 					   ins.offset, ins.offset, 0);
 	if (ret) {
 		btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
-		free_extent_map(em);
 		return ERR_PTR(ret);
 	}
 
+	em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
+			      ins.offset, ins.offset, ins.offset, 0);
+	if (IS_ERR(em)) {
+		struct btrfs_ordered_extent *oe;
+
+		btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
+		oe = btrfs_lookup_ordered_extent(inode, start);
+		ASSERT(oe);
+		if (WARN_ON(!oe))
+			return em;
+		set_bit(BTRFS_ORDERED_IOERR, &oe->flags);
+		set_bit(BTRFS_ORDERED_IO_DONE, &oe->flags);
+		btrfs_remove_ordered_extent(inode, oe);
+		/* Once for our lookup and once for the ordered extents tree. */
+		btrfs_put_ordered_extent(oe);
+		btrfs_put_ordered_extent(oe);
+	}
 	return em;
 }
 
@@ -7390,7 +7389,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
 
 	while (1) {
 		lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
-				 0, cached_state);
+				 cached_state);
 		/*
 		 * We're concerned with the entire range that we're going to be
 		 * doing DIO to, so we need to make sure theres no ordered
@@ -7418,25 +7417,21 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend,
 			btrfs_start_ordered_extent(inode, ordered, 1);
 			btrfs_put_ordered_extent(ordered);
 		} else {
-			/* Screw you mmap */
-			ret = btrfs_fdatawrite_range(inode, lockstart, lockend);
-			if (ret)
-				break;
-			ret = filemap_fdatawait_range(inode->i_mapping,
-						      lockstart,
-						      lockend);
-			if (ret)
-				break;
-
 			/*
-			 * If we found a page that couldn't be invalidated just
-			 * fall back to buffered.
+			 * We could trigger writeback for this range (and wait
+			 * for it to complete) and then invalidate the pages for
+			 * this range (through invalidate_inode_pages2_range()),
+			 * but that can lead us to a deadlock with a concurrent
+			 * call to readpages() (a buffered read or a defrag call
+			 * triggered a readahead) on a page lock due to an
+			 * ordered dio extent we created before but did not have
+			 * yet a corresponding bio submitted (whence it can not
+			 * complete), which makes readpages() wait for that
+			 * ordered extent to complete while holding a lock on
+			 * that page.
 			 */
-			ret = invalidate_inode_pages2_range(inode->i_mapping,
-					lockstart >> PAGE_CACHE_SHIFT,
-					lockend >> PAGE_CACHE_SHIFT);
-			if (ret)
-				break;
+			ret = -ENOTBLK;
+			break;
 		}
 
 		cond_resched();
@@ -7492,11 +7487,6 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start,
 	return em;
 }
 
-struct btrfs_dio_data {
-	u64 outstanding_extents;
-	u64 reserve;
-};
-
 static void adjust_dio_outstanding_extents(struct inode *inode,
 					   struct btrfs_dio_data *dio_data,
 					   const u64 len)
@@ -7680,6 +7670,7 @@ unlock:
 		btrfs_free_reserved_data_space(inode, start, len);
 		WARN_ON(dio_data->reserve < len);
 		dio_data->reserve -= len;
+		dio_data->unsubmitted_oe_range_end = start + len;
 		current->journal_info = dio_data;
 	}
 
@@ -8003,22 +7994,22 @@ static void btrfs_endio_direct_read(struct bio *bio)
 	bio_put(bio);
 }
 
-static void btrfs_endio_direct_write(struct bio *bio)
+static void btrfs_endio_direct_write_update_ordered(struct inode *inode,
+						    const u64 offset,
+						    const u64 bytes,
+						    const int uptodate)
 {
-	struct btrfs_dio_private *dip = bio->bi_private;
-	struct inode *inode = dip->inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_ordered_extent *ordered = NULL;
-	u64 ordered_offset = dip->logical_offset;
-	u64 ordered_bytes = dip->bytes;
-	struct bio *dio_bio;
+	u64 ordered_offset = offset;
+	u64 ordered_bytes = bytes;
 	int ret;
 
 again:
 	ret = btrfs_dec_test_first_ordered_pending(inode, &ordered,
 						   &ordered_offset,
 						   ordered_bytes,
-						   !bio->bi_error);
+						   uptodate);
 	if (!ret)
 		goto out_test;
 
@@ -8031,13 +8022,22 @@ out_test:
 	 * our bio might span multiple ordered extents.  If we haven't
 	 * completed the accounting for the whole dio, go back and try again
 	 */
-	if (ordered_offset < dip->logical_offset + dip->bytes) {
-		ordered_bytes = dip->logical_offset + dip->bytes -
-			ordered_offset;
+	if (ordered_offset < offset + bytes) {
+		ordered_bytes = offset + bytes - ordered_offset;
 		ordered = NULL;
 		goto again;
 	}
-	dio_bio = dip->dio_bio;
+}
+
+static void btrfs_endio_direct_write(struct bio *bio)
+{
+	struct btrfs_dio_private *dip = bio->bi_private;
+	struct bio *dio_bio = dip->dio_bio;
+
+	btrfs_endio_direct_write_update_ordered(dip->inode,
+						dip->logical_offset,
+						dip->bytes,
+						!bio->bi_error);
 
 	kfree(dip);
 
@@ -8346,6 +8346,21 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
 		dip->subio_endio = btrfs_subio_endio_read;
 	}
 
+	/*
+	 * Reset the range for unsubmitted ordered extents (to a 0 length range)
+	 * even if we fail to submit a bio, because in such a case we do the
+	 * corresponding error handling below and it must not be done a second
+	 * time by btrfs_direct_IO().
+	 */
+	if (write) {
+		struct btrfs_dio_data *dio_data = current->journal_info;
+
+		dio_data->unsubmitted_oe_range_end = dip->logical_offset +
+			dip->bytes;
+		dio_data->unsubmitted_oe_range_start =
+			dio_data->unsubmitted_oe_range_end;
+	}
+
 	ret = btrfs_submit_direct_hook(rw, dip, skip_sum);
 	if (!ret)
 		return;
@@ -8374,24 +8389,15 @@ free_ordered:
 		dip = NULL;
 		io_bio = NULL;
 	} else {
-		if (write) {
-			struct btrfs_ordered_extent *ordered;
-
-			ordered = btrfs_lookup_ordered_extent(inode,
-							      file_offset);
-			set_bit(BTRFS_ORDERED_IOERR, &ordered->flags);
-			/*
-			 * Decrements our ref on the ordered extent and removes
-			 * the ordered extent from the inode's ordered tree,
-			 * doing all the proper resource cleanup such as for the
-			 * reserved space and waking up any waiters for this
-			 * ordered extent (through btrfs_remove_ordered_extent).
-			 */
-			btrfs_finish_ordered_io(ordered);
-		} else {
+		if (write)
+			btrfs_endio_direct_write_update_ordered(inode,
+						file_offset,
+						dio_bio->bi_iter.bi_size,
+						0);
+		else
 			unlock_extent(&BTRFS_I(inode)->io_tree, file_offset,
 			      file_offset + dio_bio->bi_iter.bi_size - 1);
-		}
+
 		dio_bio->bi_error = -EIO;
 		/*
 		 * Releases and cleans up our dio_bio, no need to bio_put()
@@ -8475,7 +8481,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 		 * not unlock the i_mutex at this case.
 		 */
 		if (offset + count <= inode->i_size) {
-			mutex_unlock(&inode->i_mutex);
+			inode_unlock(inode);
 			relock = true;
 		}
 		ret = btrfs_delalloc_reserve_space(inode, offset, count);
@@ -8491,6 +8497,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 		 * originally calculated.  Abuse current->journal_info for this.
 		 */
 		dio_data.reserve = round_up(count, root->sectorsize);
+		dio_data.unsubmitted_oe_range_start = (u64)offset;
+		dio_data.unsubmitted_oe_range_end = (u64)offset;
 		current->journal_info = &dio_data;
 	} else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK,
 				     &BTRFS_I(inode)->runtime_flags)) {
@@ -8509,6 +8517,19 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 			if (dio_data.reserve)
 				btrfs_delalloc_release_space(inode, offset,
 							     dio_data.reserve);
+			/*
+			 * On error we might have left some ordered extents
+			 * without submitting corresponding bios for them, so
+			 * clean them up to avoid other tasks getting them
+			 * and waiting for them to complete forever.
+			 */
+			if (dio_data.unsubmitted_oe_range_start <
+			    dio_data.unsubmitted_oe_range_end)
+				btrfs_endio_direct_write_update_ordered(inode,
+					dio_data.unsubmitted_oe_range_start,
+					dio_data.unsubmitted_oe_range_end -
+					dio_data.unsubmitted_oe_range_start,
+					0);
 		} else if (ret >= 0 && (size_t)ret < count)
 			btrfs_delalloc_release_space(inode, offset,
 						     count - (size_t)ret);
@@ -8517,7 +8538,7 @@ out:
 	if (wakeup)
 		inode_dio_end(inode);
 	if (relock)
-		mutex_lock(&inode->i_mutex);
+		inode_lock(inode);
 
 	return ret;
 }
@@ -8639,7 +8660,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
 	}
 
 	if (!inode_evicting)
-		lock_extent_bits(tree, page_start, page_end, 0, &cached_state);
+		lock_extent_bits(tree, page_start, page_end, &cached_state);
 	ordered = btrfs_lookup_ordered_extent(inode, page_start);
 	if (ordered) {
 		/*
@@ -8677,7 +8698,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
 		btrfs_put_ordered_extent(ordered);
 		if (!inode_evicting) {
 			cached_state = NULL;
-			lock_extent_bits(tree, page_start, page_end, 0,
+			lock_extent_bits(tree, page_start, page_end,
 					 &cached_state);
 		}
 	}
@@ -8775,7 +8796,7 @@ again:
 	}
 	wait_on_page_writeback(page);
 
-	lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state);
+	lock_extent_bits(io_tree, page_start, page_end, &cached_state);
 	set_page_extent_mapped(page);
 
 	/*
@@ -9049,6 +9070,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	ei->dir_index = 0;
 	ei->last_unlink_trans = 0;
 	ei->last_log_commit = 0;
+	ei->delayed_iput_count = 0;
 
 	spin_lock_init(&ei->lock);
 	ei->outstanding_extents = 0;
@@ -9073,6 +9095,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
 	mutex_init(&ei->delalloc_mutex);
 	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
 	INIT_LIST_HEAD(&ei->delalloc_inodes);
+	INIT_LIST_HEAD(&ei->delayed_iput);
 	RB_CLEAR_NODE(&ei->rb_node);
 
 	return inode;
@@ -9177,15 +9200,14 @@ void btrfs_destroy_cachep(void)
 		kmem_cache_destroy(btrfs_path_cachep);
 	if (btrfs_free_space_cachep)
 		kmem_cache_destroy(btrfs_free_space_cachep);
-	if (btrfs_delalloc_work_cachep)
-		kmem_cache_destroy(btrfs_delalloc_work_cachep);
 }
 
 int btrfs_init_cachep(void)
 {
 	btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
 			sizeof(struct btrfs_inode), 0,
-			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
+			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
+			init_once);
 	if (!btrfs_inode_cachep)
 		goto fail;
 
@@ -9213,13 +9235,6 @@ int btrfs_init_cachep(void)
 	if (!btrfs_free_space_cachep)
 		goto fail;
 
-	btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work",
-			sizeof(struct btrfs_delalloc_work), 0,
-			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
-			NULL);
-	if (!btrfs_delalloc_work_cachep)
-		goto fail;
-
 	return 0;
 fail:
 	btrfs_destroy_cachep();
@@ -9443,14 +9458,10 @@ static void btrfs_run_delalloc_work(struct btrfs_work *work)
 	delalloc_work = container_of(work, struct btrfs_delalloc_work,
 				     work);
 	inode = delalloc_work->inode;
-	if (delalloc_work->wait) {
-		btrfs_wait_ordered_range(inode, 0, (u64)-1);
-	} else {
+	filemap_flush(inode->i_mapping);
+	if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
+				&BTRFS_I(inode)->runtime_flags))
 		filemap_flush(inode->i_mapping);
-		if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
-			     &BTRFS_I(inode)->runtime_flags))
-			filemap_flush(inode->i_mapping);
-	}
 
 	if (delalloc_work->delay_iput)
 		btrfs_add_delayed_iput(inode);
@@ -9460,18 +9471,17 @@ static void btrfs_run_delalloc_work(struct btrfs_work *work)
 }
 
 struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
-						    int wait, int delay_iput)
+						    int delay_iput)
 {
 	struct btrfs_delalloc_work *work;
 
-	work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS);
+	work = kmalloc(sizeof(*work), GFP_NOFS);
 	if (!work)
 		return NULL;
 
 	init_completion(&work->completion);
 	INIT_LIST_HEAD(&work->list);
 	work->inode = inode;
-	work->wait = wait;
 	work->delay_iput = delay_iput;
 	WARN_ON_ONCE(!inode);
 	btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
@@ -9483,7 +9493,7 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
 void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work)
 {
 	wait_for_completion(&work->completion);
-	kmem_cache_free(btrfs_delalloc_work_cachep, work);
+	kfree(work);
 }
 
 /*
@@ -9519,7 +9529,7 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput,
 		}
 		spin_unlock(&root->delalloc_lock);
 
-		work = btrfs_alloc_delalloc_work(inode, 0, delay_iput);
+		work = btrfs_alloc_delalloc_work(inode, delay_iput);
 		if (!work) {
 			if (delay_iput)
 				btrfs_add_delayed_iput(inode);
@@ -9696,10 +9706,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	if (err)
 		goto out_unlock_inode;
 
-	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
-	if (err)
-		goto out_unlock_inode;
-
 	path = btrfs_alloc_path();
 	if (!path) {
 		err = -ENOMEM;
@@ -9732,10 +9738,18 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	btrfs_free_path(path);
 
 	inode->i_op = &btrfs_symlink_inode_operations;
+	inode_nohighmem(inode);
 	inode->i_mapping->a_ops = &btrfs_symlink_aops;
 	inode_set_bytes(inode, name_len);
 	btrfs_i_size_write(inode, name_len);
 	err = btrfs_update_inode(trans, root, inode);
+	/*
+	 * Last step, add directory indexes for our symlink inode. This is the
+	 * last step to avoid extra cleanup of these indexes if an error happens
+	 * elsewhere above.
+	 */
+	if (!err)
+		err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
 	if (err) {
 		drop_inode = 1;
 		goto out_unlock_inode;
@@ -9786,7 +9800,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
 			}
 		}
 
-		cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
+		cur_bytes = min_t(u64, num_bytes, SZ_256M);
 		cur_bytes = max(cur_bytes, min_size);
 		/*
 		 * If we are severely fragmented we could end up with really
@@ -10021,7 +10035,7 @@ static const struct inode_operations btrfs_dir_inode_operations = {
 	.setattr	= btrfs_setattr,
 	.mknod		= btrfs_mknod,
 	.setxattr	= btrfs_setxattr,
-	.getxattr	= btrfs_getxattr,
+	.getxattr	= generic_getxattr,
 	.listxattr	= btrfs_listxattr,
 	.removexattr	= btrfs_removexattr,
 	.permission	= btrfs_permission,
@@ -10050,7 +10064,7 @@ static const struct file_operations btrfs_dir_file_operations = {
 	.fsync		= btrfs_sync_file,
 };
 
-static struct extent_io_ops btrfs_extent_io_ops = {
+static const struct extent_io_ops btrfs_extent_io_ops = {
 	.fill_delalloc = run_delalloc_range,
 	.submit_bio_hook = btrfs_submit_bio_hook,
 	.merge_bio_hook = btrfs_merge_bio_hook,
@@ -10098,7 +10112,7 @@ static const struct inode_operations btrfs_file_inode_operations = {
 	.getattr	= btrfs_getattr,
 	.setattr	= btrfs_setattr,
 	.setxattr	= btrfs_setxattr,
-	.getxattr	= btrfs_getxattr,
+	.getxattr	= generic_getxattr,
 	.listxattr      = btrfs_listxattr,
 	.removexattr	= btrfs_removexattr,
 	.permission	= btrfs_permission,
@@ -10112,7 +10126,7 @@ static const struct inode_operations btrfs_special_inode_operations = {
 	.setattr	= btrfs_setattr,
 	.permission	= btrfs_permission,
 	.setxattr	= btrfs_setxattr,
-	.getxattr	= btrfs_getxattr,
+	.getxattr	= generic_getxattr,
 	.listxattr	= btrfs_listxattr,
 	.removexattr	= btrfs_removexattr,
 	.get_acl	= btrfs_get_acl,
@@ -10121,13 +10135,12 @@ static const struct inode_operations btrfs_special_inode_operations = {
 };
 static const struct inode_operations btrfs_symlink_inode_operations = {
 	.readlink	= generic_readlink,
-	.follow_link	= page_follow_link_light,
-	.put_link	= page_put_link,
+	.get_link	= page_get_link,
 	.getattr	= btrfs_getattr,
 	.setattr	= btrfs_setattr,
 	.permission	= btrfs_permission,
 	.setxattr	= btrfs_setxattr,
-	.getxattr	= btrfs_getxattr,
+	.getxattr	= generic_getxattr,
 	.listxattr	= btrfs_listxattr,
 	.removexattr	= btrfs_removexattr,
 	.update_time	= btrfs_update_time,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index f07d01bc4..48aee9846 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -240,7 +240,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 	if (ret)
 		return ret;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	ip_oldflags = ip->flags;
 	i_oldflags = inode->i_flags;
@@ -358,7 +358,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
 	}
 
  out_unlock:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	mnt_drop_write_file(file);
 	return ret;
 }
@@ -659,22 +659,28 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
 	if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state))
 		return -EINVAL;
 
+	pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
+	if (!pending_snapshot)
+		return -ENOMEM;
+
+	pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item),
+			GFP_NOFS);
+	pending_snapshot->path = btrfs_alloc_path();
+	if (!pending_snapshot->root_item || !pending_snapshot->path) {
+		ret = -ENOMEM;
+		goto free_pending;
+	}
+
 	atomic_inc(&root->will_be_snapshoted);
 	smp_mb__after_atomic();
 	btrfs_wait_for_no_snapshoting_writes(root);
 
 	ret = btrfs_start_delalloc_inodes(root, 0);
 	if (ret)
-		goto out;
+		goto dec_and_free;
 
 	btrfs_wait_ordered_extents(root, -1);
 
-	pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
-	if (!pending_snapshot) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
 	btrfs_init_block_rsv(&pending_snapshot->block_rsv,
 			     BTRFS_BLOCK_RSV_TEMP);
 	/*
@@ -690,7 +696,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
 					&pending_snapshot->qgroup_reserved,
 					false);
 	if (ret)
-		goto free;
+		goto dec_and_free;
 
 	pending_snapshot->dentry = dentry;
 	pending_snapshot->root = root;
@@ -741,11 +747,14 @@ fail:
 	btrfs_subvolume_release_metadata(BTRFS_I(dir)->root,
 					 &pending_snapshot->block_rsv,
 					 pending_snapshot->qgroup_reserved);
-free:
-	kfree(pending_snapshot);
-out:
+dec_and_free:
 	if (atomic_dec_and_test(&root->will_be_snapshoted))
 		wake_up_atomic_t(&root->will_be_snapshoted);
+free_pending:
+	kfree(pending_snapshot->root_item);
+	btrfs_free_path(pending_snapshot->path);
+	kfree(pending_snapshot);
+
 	return ret;
 }
 
@@ -872,7 +881,7 @@ out_up_read:
 out_dput:
 	dput(dentry);
 out_unlock:
-	mutex_unlock(&dir->i_mutex);
+	inode_unlock(dir);
 	return error;
 }
 
@@ -996,7 +1005,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start)
 		u64 end = start + len - 1;
 
 		/* get the big lock and read metadata off disk */
-		lock_extent_bits(io_tree, start, end, 0, &cached);
+		lock_extent_bits(io_tree, start, end, &cached);
 		em = btrfs_get_extent(inode, NULL, 0, start, len, 0);
 		unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS);
 
@@ -1020,7 +1029,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
 	if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
 		ret = false;
 	else if ((em->block_start + em->block_len == next->block_start) &&
-		 (em->block_len > 128 * 1024 && next->block_len > 128 * 1024))
+		 (em->block_len > SZ_128K && next->block_len > SZ_128K))
 		ret = false;
 
 	free_extent_map(next);
@@ -1144,7 +1153,7 @@ again:
 		page_end = page_start + PAGE_CACHE_SIZE - 1;
 		while (1) {
 			lock_extent_bits(tree, page_start, page_end,
-					 0, &cached_state);
+					 &cached_state);
 			ordered = btrfs_lookup_ordered_extent(inode,
 							      page_start);
 			unlock_extent_cached(tree, page_start, page_end,
@@ -1204,7 +1213,7 @@ again:
 	page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE;
 
 	lock_extent_bits(&BTRFS_I(inode)->io_tree,
-			 page_start, page_end - 1, 0, &cached_state);
+			 page_start, page_end - 1, &cached_state);
 	clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start,
 			  page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC |
 			  EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0,
@@ -1266,9 +1275,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 	int defrag_count = 0;
 	int compress_type = BTRFS_COMPRESS_ZLIB;
 	u32 extent_thresh = range->extent_thresh;
-	unsigned long max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT;
+	unsigned long max_cluster = SZ_256K >> PAGE_CACHE_SHIFT;
 	unsigned long cluster = max_cluster;
-	u64 new_align = ~((u64)128 * 1024 - 1);
+	u64 new_align = ~((u64)SZ_128K - 1);
 	struct page **pages = NULL;
 
 	if (isize == 0)
@@ -1285,7 +1294,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 	}
 
 	if (extent_thresh == 0)
-		extent_thresh = 256 * 1024;
+		extent_thresh = SZ_256K;
 
 	/*
 	 * if we were not given a file, allocate a readahead
@@ -1317,7 +1326,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 
 	if (newer_than) {
 		ret = find_new_extents(root, inode, newer_than,
-				       &newer_off, 64 * 1024);
+				       &newer_off, SZ_64K);
 		if (!ret) {
 			range->start = newer_off;
 			/*
@@ -1384,18 +1393,18 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 			ra_index += cluster;
 		}
 
-		mutex_lock(&inode->i_mutex);
+		inode_lock(inode);
 		if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
 			BTRFS_I(inode)->force_compress = compress_type;
 		ret = cluster_pages_for_defrag(inode, pages, i, cluster);
 		if (ret < 0) {
-			mutex_unlock(&inode->i_mutex);
+			inode_unlock(inode);
 			goto out_ra;
 		}
 
 		defrag_count += ret;
 		balance_dirty_pages_ratelimited(inode->i_mapping);
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 
 		if (newer_than) {
 			if (newer_off == (u64)-1)
@@ -1407,9 +1416,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 			newer_off = max(newer_off + 1,
 					(u64)i << PAGE_CACHE_SHIFT);
 
-			ret = find_new_extents(root, inode,
-					       newer_than, &newer_off,
-					       64 * 1024);
+			ret = find_new_extents(root, inode, newer_than,
+					       &newer_off, SZ_64K);
 			if (!ret) {
 				range->start = newer_off;
 				i = (newer_off & new_align) >> PAGE_CACHE_SHIFT;
@@ -1457,9 +1465,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 
 out_ra:
 	if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
-		mutex_lock(&inode->i_mutex);
+		inode_lock(inode);
 		BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE;
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 	}
 	if (!file)
 		kfree(ra);
@@ -1575,7 +1583,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
 		new_size = old_size + new_size;
 	}
 
-	if (new_size < 256 * 1024 * 1024) {
+	if (new_size < SZ_256M) {
 		ret = -EINVAL;
 		goto out_free;
 	}
@@ -2164,7 +2172,7 @@ static noinline int btrfs_ioctl_tree_search_v2(struct file *file,
 	struct inode *inode;
 	int ret;
 	size_t buf_size;
-	const size_t buf_limit = 16 * 1024 * 1024;
+	const size_t buf_limit = SZ_16M;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
@@ -2422,7 +2430,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 		goto out_dput;
 	}
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	/*
 	 * Don't allow to delete a subvolume with send in progress. This is
@@ -2535,7 +2543,7 @@ out_up_write:
 		spin_unlock(&dest->root_item_lock);
 	}
 out_unlock_inode:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	if (!err) {
 		d_invalidate(dentry);
 		btrfs_invalidate_inodes(dest);
@@ -2551,7 +2559,7 @@ out_unlock_inode:
 out_dput:
 	dput(dentry);
 out_unlock_dir:
-	mutex_unlock(&dir->i_mutex);
+	inode_unlock(dir);
 out_drop_write:
 	mnt_drop_write_file(file);
 out:
@@ -2871,8 +2879,8 @@ static int lock_extent_range(struct inode *inode, u64 off, u64 len,
 
 static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2)
 {
-	mutex_unlock(&inode1->i_mutex);
-	mutex_unlock(&inode2->i_mutex);
+	inode_unlock(inode1);
+	inode_unlock(inode2);
 }
 
 static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2)
@@ -2880,8 +2888,8 @@ static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2)
 	if (inode1 < inode2)
 		swap(inode1, inode2);
 
-	mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
-	mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
+	inode_lock_nested(inode1, I_MUTEX_PARENT);
+	inode_lock_nested(inode2, I_MUTEX_CHILD);
 }
 
 static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
@@ -3003,7 +3011,7 @@ static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst,
 		flush_dcache_page(dst_page);
 
 		if (memcmp(addr, dst_addr, cmp_len))
-			ret = BTRFS_SAME_DATA_DIFFERS;
+			ret = -EBADE;
 
 		kunmap_atomic(addr);
 		kunmap_atomic(dst_addr);
@@ -3055,7 +3063,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen,
 		return 0;
 
 	if (same_inode) {
-		mutex_lock(&src->i_mutex);
+		inode_lock(src);
 
 		ret = extent_same_check_offsets(src, loff, &len, olen);
 		if (ret)
@@ -3162,62 +3170,25 @@ again:
 	btrfs_cmp_data_free(&cmp);
 out_unlock:
 	if (same_inode)
-		mutex_unlock(&src->i_mutex);
+		inode_unlock(src);
 	else
 		btrfs_double_inode_unlock(src, dst);
 
 	return ret;
 }
 
-#define BTRFS_MAX_DEDUPE_LEN	(16 * 1024 * 1024)
+#define BTRFS_MAX_DEDUPE_LEN	SZ_16M
 
-static long btrfs_ioctl_file_extent_same(struct file *file,
-			struct btrfs_ioctl_same_args __user *argp)
+ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen,
+				struct file *dst_file, u64 dst_loff)
 {
-	struct btrfs_ioctl_same_args *same = NULL;
-	struct btrfs_ioctl_same_extent_info *info;
-	struct inode *src = file_inode(file);
-	u64 off;
-	u64 len;
-	int i;
-	int ret;
-	unsigned long size;
+	struct inode *src = file_inode(src_file);
+	struct inode *dst = file_inode(dst_file);
 	u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
-	bool is_admin = capable(CAP_SYS_ADMIN);
-	u16 count;
-
-	if (!(file->f_mode & FMODE_READ))
-		return -EINVAL;
+	ssize_t res;
 
-	ret = mnt_want_write_file(file);
-	if (ret)
-		return ret;
-
-	if (get_user(count, &argp->dest_count)) {
-		ret = -EFAULT;
-		goto out;
-	}
-
-	size = offsetof(struct btrfs_ioctl_same_args __user, info[count]);
-
-	same = memdup_user(argp, size);
-
-	if (IS_ERR(same)) {
-		ret = PTR_ERR(same);
-		same = NULL;
-		goto out;
-	}
-
-	off = same->logical_offset;
-	len = same->length;
-
-	/*
-	 * Limit the total length we will dedupe for each operation.
-	 * This is intended to bound the total time spent in this
-	 * ioctl to something sane.
-	 */
-	if (len > BTRFS_MAX_DEDUPE_LEN)
-		len = BTRFS_MAX_DEDUPE_LEN;
+	if (olen > BTRFS_MAX_DEDUPE_LEN)
+		olen = BTRFS_MAX_DEDUPE_LEN;
 
 	if (WARN_ON_ONCE(bs < PAGE_CACHE_SIZE)) {
 		/*
@@ -3225,58 +3196,13 @@ static long btrfs_ioctl_file_extent_same(struct file *file,
 		 * result, btrfs_cmp_data() won't correctly handle
 		 * this situation without an update.
 		 */
-		ret = -EINVAL;
-		goto out;
-	}
-
-	ret = -EISDIR;
-	if (S_ISDIR(src->i_mode))
-		goto out;
-
-	ret = -EACCES;
-	if (!S_ISREG(src->i_mode))
-		goto out;
-
-	/* pre-format output fields to sane values */
-	for (i = 0; i < count; i++) {
-		same->info[i].bytes_deduped = 0ULL;
-		same->info[i].status = 0;
-	}
-
-	for (i = 0, info = same->info; i < count; i++, info++) {
-		struct inode *dst;
-		struct fd dst_file = fdget(info->fd);
-		if (!dst_file.file) {
-			info->status = -EBADF;
-			continue;
-		}
-		dst = file_inode(dst_file.file);
-
-		if (!(is_admin || (dst_file.file->f_mode & FMODE_WRITE))) {
-			info->status = -EINVAL;
-		} else if (file->f_path.mnt != dst_file.file->f_path.mnt) {
-			info->status = -EXDEV;
-		} else if (S_ISDIR(dst->i_mode)) {
-			info->status = -EISDIR;
-		} else if (!S_ISREG(dst->i_mode)) {
-			info->status = -EACCES;
-		} else {
-			info->status = btrfs_extent_same(src, off, len, dst,
-							info->logical_offset);
-			if (info->status == 0)
-				info->bytes_deduped += len;
-		}
-		fdput(dst_file);
+		return -EINVAL;
 	}
 
-	ret = copy_to_user(argp, same, size);
-	if (ret)
-		ret = -EFAULT;
-
-out:
-	mnt_drop_write_file(file);
-	kfree(same);
-	return ret;
+	res = btrfs_extent_same(src, loff, olen, dst, dst_loff);
+	if (res)
+		return res;
+	return olen;
 }
 
 static int clone_finish_inode_update(struct btrfs_trans_handle *trans,
@@ -3551,7 +3477,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode,
 		return ret;
 	}
 
-	path->reada = 2;
+	path->reada = READA_FORWARD;
 	/* clone data */
 	key.objectid = btrfs_ino(src);
 	key.type = BTRFS_EXTENT_DATA_KEY;
@@ -3852,17 +3778,16 @@ out:
 	return ret;
 }
 
-static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
-				       u64 off, u64 olen, u64 destoff)
+static noinline int btrfs_clone_files(struct file *file, struct file *file_src,
+					u64 off, u64 olen, u64 destoff)
 {
 	struct inode *inode = file_inode(file);
+	struct inode *src = file_inode(file_src);
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct fd src_file;
-	struct inode *src;
 	int ret;
 	u64 len = olen;
 	u64 bs = root->fs_info->sb->s_blocksize;
-	int same_inode = 0;
+	int same_inode = src == inode;
 
 	/*
 	 * TODO:
@@ -3875,54 +3800,25 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
 	 *   be either compressed or non-compressed.
 	 */
 
-	/* the destination must be opened for writing */
-	if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND))
-		return -EINVAL;
-
 	if (btrfs_root_readonly(root))
 		return -EROFS;
 
-	ret = mnt_want_write_file(file);
-	if (ret)
-		return ret;
-
-	src_file = fdget(srcfd);
-	if (!src_file.file) {
-		ret = -EBADF;
-		goto out_drop_write;
-	}
-
-	ret = -EXDEV;
-	if (src_file.file->f_path.mnt != file->f_path.mnt)
-		goto out_fput;
-
-	src = file_inode(src_file.file);
-
-	ret = -EINVAL;
-	if (src == inode)
-		same_inode = 1;
-
-	/* the src must be open for reading */
-	if (!(src_file.file->f_mode & FMODE_READ))
-		goto out_fput;
+	if (file_src->f_path.mnt != file->f_path.mnt ||
+	    src->i_sb != inode->i_sb)
+		return -EXDEV;
 
 	/* don't make the dst file partly checksummed */
 	if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
 	    (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
-		goto out_fput;
+		return -EINVAL;
 
-	ret = -EISDIR;
 	if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
-		goto out_fput;
-
-	ret = -EXDEV;
-	if (src->i_sb != inode->i_sb)
-		goto out_fput;
+		return -EISDIR;
 
 	if (!same_inode) {
 		btrfs_double_inode_lock(src, inode);
 	} else {
-		mutex_lock(&src->i_mutex);
+		inode_lock(src);
 	}
 
 	/* determine range to clone */
@@ -3999,22 +3895,26 @@ out_unlock:
 	if (!same_inode)
 		btrfs_double_inode_unlock(src, inode);
 	else
-		mutex_unlock(&src->i_mutex);
-out_fput:
-	fdput(src_file);
-out_drop_write:
-	mnt_drop_write_file(file);
+		inode_unlock(src);
 	return ret;
 }
 
-static long btrfs_ioctl_clone_range(struct file *file, void __user *argp)
+ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in,
+			      struct file *file_out, loff_t pos_out,
+			      size_t len, unsigned int flags)
 {
-	struct btrfs_ioctl_clone_range_args args;
+	ssize_t ret;
 
-	if (copy_from_user(&args, argp, sizeof(args)))
-		return -EFAULT;
-	return btrfs_ioctl_clone(file, args.src_fd, args.src_offset,
-				 args.src_length, args.dest_offset);
+	ret = btrfs_clone_files(file_out, file_in, pos_in, len, pos_out);
+	if (ret == 0)
+		ret = len;
+	return ret;
+}
+
+int btrfs_clone_file_range(struct file *src_file, loff_t off,
+		struct file *dst_file, loff_t destoff, u64 len)
+{
+	return btrfs_clone_files(dst_file, src_file, off, len, destoff);
 }
 
 /*
@@ -4226,7 +4126,7 @@ static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg)
 		return -ENOMEM;
 
 	space_args.total_spaces = 0;
-	dest = kmalloc(alloc_size, GFP_NOFS);
+	dest = kmalloc(alloc_size, GFP_KERNEL);
 	if (!dest)
 		return -ENOMEM;
 	dest_orig = dest;
@@ -4603,7 +4503,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root,
 		goto out;
 	}
 
-	size = min_t(u32, loi->size, 64 * 1024);
+	size = min_t(u32, loi->size, SZ_64K);
 	inodes = init_data_container(size);
 	if (IS_ERR(inodes)) {
 		ret = PTR_ERR(inodes);
@@ -4752,7 +4652,7 @@ locked:
 		goto out_bargs;
 	}
 
-	bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
+	bctl = kzalloc(sizeof(*bctl), GFP_KERNEL);
 	if (!bctl) {
 		ret = -ENOMEM;
 		goto out_bargs;
@@ -4838,7 +4738,7 @@ static long btrfs_ioctl_balance_progress(struct btrfs_root *root,
 		goto out;
 	}
 
-	bargs = kzalloc(sizeof(*bargs), GFP_NOFS);
+	bargs = kzalloc(sizeof(*bargs), GFP_KERNEL);
 	if (!bargs) {
 		ret = -ENOMEM;
 		goto out;
@@ -5098,7 +4998,7 @@ static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg)
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	qsa = kzalloc(sizeof(*qsa), GFP_NOFS);
+	qsa = kzalloc(sizeof(*qsa), GFP_KERNEL);
 	if (!qsa)
 		return -ENOMEM;
 
@@ -5228,7 +5128,7 @@ static long btrfs_ioctl_set_received_subvol_32(struct file *file,
 		goto out;
 	}
 
-	args64 = kmalloc(sizeof(*args64), GFP_NOFS);
+	args64 = kmalloc(sizeof(*args64), GFP_KERNEL);
 	if (!args64) {
 		ret = -ENOMEM;
 		goto out;
@@ -5365,7 +5265,7 @@ out_unlock:
 static int btrfs_ioctl_get_supported_features(struct file *file,
 					      void __user *arg)
 {
-	static struct btrfs_ioctl_feature_flags features[3] = {
+	static const struct btrfs_ioctl_feature_flags features[3] = {
 		INIT_FEATURE_FLAGS(SUPP),
 		INIT_FEATURE_FLAGS(SAFE_SET),
 		INIT_FEATURE_FLAGS(SAFE_CLEAR)
@@ -5564,10 +5464,6 @@ long btrfs_ioctl(struct file *file, unsigned int
 		return btrfs_ioctl_dev_info(root, argp);
 	case BTRFS_IOC_BALANCE:
 		return btrfs_ioctl_balance(file, NULL);
-	case BTRFS_IOC_CLONE:
-		return btrfs_ioctl_clone(file, arg, 0, 0, 0);
-	case BTRFS_IOC_CLONE_RANGE:
-		return btrfs_ioctl_clone_range(file, argp);
 	case BTRFS_IOC_TRANS_START:
 		return btrfs_ioctl_trans_start(file);
 	case BTRFS_IOC_TRANS_END:
@@ -5645,8 +5541,6 @@ long btrfs_ioctl(struct file *file, unsigned int
 		return btrfs_ioctl_get_fslabel(file, argp);
 	case BTRFS_IOC_SET_FSLABEL:
 		return btrfs_ioctl_set_fslabel(file, argp);
-	case BTRFS_IOC_FILE_EXTENT_SAME:
-		return btrfs_ioctl_file_extent_same(file, argp);
 	case BTRFS_IOC_GET_SUPPORTED_FEATURES:
 		return btrfs_ioctl_get_supported_features(file, argp);
 	case BTRFS_IOC_GET_FEATURES:
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 8077461fc..d13128c70 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -56,7 +56,6 @@ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw)
 		atomic_dec(&eb->spinning_readers);
 		read_unlock(&eb->lock);
 	}
-	return;
 }
 
 /*
@@ -96,7 +95,6 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw)
 		    waitqueue_active(&eb->read_lock_wq))
 			wake_up(&eb->read_lock_wq);
 	}
-	return;
 }
 
 /*
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 1a33d3eb3..55161369f 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -503,7 +503,6 @@ static void cache_rbio(struct btrfs_raid_bio *rbio)
 	}
 
 	spin_unlock_irqrestore(&table->cache_lock, flags);
-	return;
 }
 
 /*
@@ -610,13 +609,28 @@ static int rbio_can_merge(struct btrfs_raid_bio *last,
 	return 1;
 }
 
+static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe,
+				  int index)
+{
+	return stripe * rbio->stripe_npages + index;
+}
+
+/*
+ * these are just the pages from the rbio array, not from anything
+ * the FS sent down to us
+ */
+static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe,
+				     int index)
+{
+	return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)];
+}
+
 /*
  * helper to index into the pstripe
  */
 static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
 {
-	index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
-	return rbio->stripe_pages[index];
+	return rbio_stripe_page(rbio, rbio->nr_data, index);
 }
 
 /*
@@ -627,10 +641,7 @@ static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
 {
 	if (rbio->nr_data + 1 == rbio->real_stripes)
 		return NULL;
-
-	index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
-		PAGE_CACHE_SHIFT;
-	return rbio->stripe_pages[index];
+	return rbio_stripe_page(rbio, rbio->nr_data + 1, index);
 }
 
 /*
@@ -890,6 +901,7 @@ static void raid_write_end_io(struct bio *bio)
 {
 	struct btrfs_raid_bio *rbio = bio->bi_private;
 	int err = bio->bi_error;
+	int max_errors;
 
 	if (err)
 		fail_bio_stripe(rbio, bio);
@@ -902,11 +914,12 @@ static void raid_write_end_io(struct bio *bio)
 	err = 0;
 
 	/* OK, we have read all the stripes we need to. */
-	if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
+	max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
+		     0 : rbio->bbio->max_errors;
+	if (atomic_read(&rbio->error) > max_errors)
 		err = -EIO;
 
 	rbio_orig_end_io(rbio, err);
-	return;
 }
 
 /*
@@ -949,8 +962,7 @@ static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
  */
 static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
 {
-	unsigned long nr = stripe_len * nr_stripes;
-	return DIV_ROUND_UP(nr, PAGE_CACHE_SIZE);
+	return DIV_ROUND_UP(stripe_len, PAGE_CACHE_SIZE) * nr_stripes;
 }
 
 /*
@@ -968,8 +980,8 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
 	void *p;
 
 	rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2 +
-		       DIV_ROUND_UP(stripe_npages, BITS_PER_LONG / 8),
-			GFP_NOFS);
+		       DIV_ROUND_UP(stripe_npages, BITS_PER_LONG) *
+		       sizeof(long), GFP_NOFS);
 	if (!rbio)
 		return ERR_PTR(-ENOMEM);
 
@@ -1023,18 +1035,17 @@ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
 		if (!page)
 			return -ENOMEM;
 		rbio->stripe_pages[i] = page;
-		ClearPageUptodate(page);
 	}
 	return 0;
 }
 
-/* allocate pages for just the p/q stripes */
+/* only allocate pages for p/q stripes */
 static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
 {
 	int i;
 	struct page *page;
 
-	i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
+	i = rbio_stripe_page_index(rbio, rbio->nr_data, 0);
 
 	for (; i < rbio->nr_pages; i++) {
 		if (rbio->stripe_pages[i])
@@ -1123,18 +1134,6 @@ static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
 }
 
 /*
- * these are just the pages from the rbio array, not from anything
- * the FS sent down to us
- */
-static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page)
-{
-	int index;
-	index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
-	index += page;
-	return rbio->stripe_pages[index];
-}
-
-/*
  * helper function to walk our bio list and populate the bio_pages array with
  * the result.  This seems expensive, but it is faster than constantly
  * searching through the bio list as we setup the IO in finish_rmw or stripe
@@ -1177,7 +1176,6 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
 {
 	struct btrfs_bio *bbio = rbio->bbio;
 	void *pointers[rbio->real_stripes];
-	int stripe_len = rbio->stripe_len;
 	int nr_data = rbio->nr_data;
 	int stripe;
 	int pagenr;
@@ -1185,7 +1183,6 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
 	int q_stripe = -1;
 	struct bio_list bio_list;
 	struct bio *bio;
-	int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
 	int ret;
 
 	bio_list_init(&bio_list);
@@ -1228,7 +1225,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
 	else
 		clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
 
-	for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
+	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
 		struct page *p;
 		/* first collect one page from each data stripe */
 		for (stripe = 0; stripe < nr_data; stripe++) {
@@ -1270,7 +1267,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
 	 * everything else.
 	 */
 	for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
-		for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
+		for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
 			struct page *page;
 			if (stripe < rbio->nr_data) {
 				page = page_in_rbio(rbio, stripe, pagenr, 1);
@@ -1294,7 +1291,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
 		if (!bbio->tgtdev_map[stripe])
 			continue;
 
-		for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
+		for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
 			struct page *page;
 			if (stripe < rbio->nr_data) {
 				page = page_in_rbio(rbio, stripe, pagenr, 1);
@@ -1508,7 +1505,6 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
 	int bios_to_read = 0;
 	struct bio_list bio_list;
 	int ret;
-	int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
 	int pagenr;
 	int stripe;
 	struct bio *bio;
@@ -1527,7 +1523,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
 	 * stripe
 	 */
 	for (stripe = 0; stripe < rbio->nr_data; stripe++) {
-		for (pagenr = 0; pagenr < nr_pages; pagenr++) {
+		for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
 			struct page *page;
 			/*
 			 * we want to find all the pages missing from
@@ -1803,7 +1799,6 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
 	int pagenr, stripe;
 	void **pointers;
 	int faila = -1, failb = -1;
-	int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
 	struct page *page;
 	int err;
 	int i;
@@ -1826,7 +1821,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
 
 	index_rbio_pages(rbio);
 
-	for (pagenr = 0; pagenr < nr_pages; pagenr++) {
+	for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
 		/*
 		 * Now we just use bitmap to mark the horizontal stripes in
 		 * which we have data when doing parity scrub.
@@ -1937,7 +1932,7 @@ pstripe:
 		 * other endio functions will fiddle the uptodate bits
 		 */
 		if (rbio->operation == BTRFS_RBIO_WRITE) {
-			for (i = 0;  i < nr_pages; i++) {
+			for (i = 0;  i < rbio->stripe_npages; i++) {
 				if (faila != -1) {
 					page = rbio_stripe_page(rbio, faila, i);
 					SetPageUptodate(page);
@@ -2033,7 +2028,6 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
 	int bios_to_read = 0;
 	struct bio_list bio_list;
 	int ret;
-	int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE);
 	int pagenr;
 	int stripe;
 	struct bio *bio;
@@ -2057,7 +2051,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
 			continue;
 		}
 
-		for (pagenr = 0; pagenr < nr_pages; pagenr++) {
+		for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
 			struct page *p;
 
 			/*
@@ -2281,37 +2275,11 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
 			if (!page)
 				return -ENOMEM;
 			rbio->stripe_pages[index] = page;
-			ClearPageUptodate(page);
 		}
 	}
 	return 0;
 }
 
-/*
- * end io function used by finish_rmw.  When we finally
- * get here, we've written a full stripe
- */
-static void raid_write_parity_end_io(struct bio *bio)
-{
-	struct btrfs_raid_bio *rbio = bio->bi_private;
-	int err = bio->bi_error;
-
-	if (bio->bi_error)
-		fail_bio_stripe(rbio, bio);
-
-	bio_put(bio);
-
-	if (!atomic_dec_and_test(&rbio->stripes_pending))
-		return;
-
-	err = 0;
-
-	if (atomic_read(&rbio->error))
-		err = -EIO;
-
-	rbio_orig_end_io(rbio, err);
-}
-
 static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
 					 int need_check)
 {
@@ -2464,7 +2432,7 @@ submit_write:
 			break;
 
 		bio->bi_private = rbio;
-		bio->bi_end_io = raid_write_parity_end_io;
+		bio->bi_end_io = raid_write_end_io;
 		submit_bio(WRITE, bio);
 	}
 	return;
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index b4ca5454e..2bd001145 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -575,7 +575,8 @@ static int is_cowonly_root(u64 root_objectid)
 	    root_objectid == BTRFS_TREE_LOG_OBJECTID ||
 	    root_objectid == BTRFS_CSUM_TREE_OBJECTID ||
 	    root_objectid == BTRFS_UUID_TREE_OBJECTID ||
-	    root_objectid == BTRFS_QUOTA_TREE_OBJECTID)
+	    root_objectid == BTRFS_QUOTA_TREE_OBJECTID ||
+	    root_objectid == BTRFS_FREE_SPACE_TREE_OBJECTID)
 		return 1;
 	return 0;
 }
@@ -708,8 +709,8 @@ struct backref_node *build_backref_tree(struct reloc_control *rc,
 		err = -ENOMEM;
 		goto out;
 	}
-	path1->reada = 1;
-	path2->reada = 2;
+	path1->reada = READA_FORWARD;
+	path2->reada = READA_FORWARD;
 
 	node = alloc_backref_node(cache);
 	if (!node) {
@@ -2130,7 +2131,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-	path->reada = 1;
+	path->reada = READA_FORWARD;
 
 	reloc_root = root->reloc_root;
 	root_item = &reloc_root->root_item;
@@ -3030,7 +3031,7 @@ int prealloc_file_extent_cluster(struct inode *inode,
 	int ret = 0;
 
 	BUG_ON(cluster->start != cluster->boundary[0]);
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	ret = btrfs_check_data_free_space(inode, cluster->start,
 					  cluster->end + 1 - cluster->start);
@@ -3057,7 +3058,7 @@ int prealloc_file_extent_cluster(struct inode *inode,
 	btrfs_free_reserved_data_space(inode, cluster->start,
 				       cluster->end + 1 - cluster->start);
 out:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	return ret;
 }
 
@@ -3527,7 +3528,7 @@ static int find_data_references(struct reloc_control *rc,
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-	path->reada = 1;
+	path->reada = READA_FORWARD;
 
 	root = read_fs_root(rc->extent_root->fs_info, ref_root);
 	if (IS_ERR(root)) {
@@ -3917,7 +3918,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-	path->reada = 1;
+	path->reada = READA_FORWARD;
 
 	ret = prepare_to_relocate(rc);
 	if (ret) {
@@ -4343,7 +4344,7 @@ int btrfs_recover_relocation(struct btrfs_root *root)
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-	path->reada = -1;
+	path->reada = READA_BACK;
 
 	key.objectid = BTRFS_TREE_RELOC_OBJECTID;
 	key.type = BTRFS_ROOT_ITEM_KEY;
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index b091d94ce..92bf5ee73 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -1514,8 +1514,6 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 
 	if (sblock->no_io_error_seen)
 		scrub_recheck_block_checksum(sblock);
-
-	return;
 }
 
 static inline int scrub_check_fsid(u8 fsid[],
@@ -2815,7 +2813,7 @@ out:
 
 static inline int scrub_calc_parity_bitmap_len(int nsectors)
 {
-	return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * (BITS_PER_LONG / 8);
+	return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long);
 }
 
 static void scrub_parity_get(struct scrub_parity *sparity)
@@ -3460,7 +3458,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
 		return ret;
 	}
 
-	map = (struct map_lookup *)em->bdev;
+	map = em->map_lookup;
 	if (em->start != chunk_offset)
 		goto out;
 
@@ -3507,7 +3505,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 	if (!path)
 		return -ENOMEM;
 
-	path->reada = 2;
+	path->reada = READA_FORWARD;
 	path->search_commit_root = 1;
 	path->skip_locking = 1;
 
@@ -3735,27 +3733,27 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
 	if (fs_info->scrub_workers_refcnt == 0) {
 		if (is_dev_replace)
 			fs_info->scrub_workers =
-				btrfs_alloc_workqueue("btrfs-scrub", flags,
+				btrfs_alloc_workqueue("scrub", flags,
 						      1, 4);
 		else
 			fs_info->scrub_workers =
-				btrfs_alloc_workqueue("btrfs-scrub", flags,
+				btrfs_alloc_workqueue("scrub", flags,
 						      max_active, 4);
 		if (!fs_info->scrub_workers)
 			goto fail_scrub_workers;
 
 		fs_info->scrub_wr_completion_workers =
-			btrfs_alloc_workqueue("btrfs-scrubwrc", flags,
+			btrfs_alloc_workqueue("scrubwrc", flags,
 					      max_active, 2);
 		if (!fs_info->scrub_wr_completion_workers)
 			goto fail_scrub_wr_completion_workers;
 
 		fs_info->scrub_nocow_workers =
-			btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0);
+			btrfs_alloc_workqueue("scrubnc", flags, 1, 0);
 		if (!fs_info->scrub_nocow_workers)
 			goto fail_scrub_nocow_workers;
 		fs_info->scrub_parity_workers =
-			btrfs_alloc_workqueue("btrfs-scrubparity", flags,
+			btrfs_alloc_workqueue("scrubparity", flags,
 					      max_active, 2);
 		if (!fs_info->scrub_parity_workers)
 			goto fail_scrub_parity_workers;
@@ -4211,7 +4209,7 @@ static int check_extent_to_block(struct inode *inode, u64 start, u64 len,
 
 	io_tree = &BTRFS_I(inode)->io_tree;
 
-	lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
+	lock_extent_bits(io_tree, lockstart, lockend, &cached_state);
 	ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
 	if (ordered) {
 		btrfs_put_ordered_extent(ordered);
@@ -4281,7 +4279,7 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
 		return PTR_ERR(inode);
 
 	/* Avoid truncate/dio/punch hole.. */
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 	inode_dio_wait(inode);
 
 	physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
@@ -4360,7 +4358,7 @@ next_page:
 	}
 	ret = COPY_COMPLETE;
 out:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	iput(inode);
 	return ret;
 }
diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h
index 48d425aef..02e00166c 100644
--- a/fs/btrfs/send.h
+++ b/fs/btrfs/send.h
@@ -22,8 +22,8 @@
 #define BTRFS_SEND_STREAM_MAGIC "btrfs-stream"
 #define BTRFS_SEND_STREAM_VERSION 1
 
-#define BTRFS_SEND_BUF_SIZE (1024 * 64)
-#define BTRFS_SEND_READ_SIZE (1024 * 48)
+#define BTRFS_SEND_BUF_SIZE SZ_64K
+#define BTRFS_SEND_READ_SIZE (48 * SZ_1K)
 
 enum btrfs_tlv_type {
 	BTRFS_TLV_U8,
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index fe609b81d..d41e09fe8 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -295,10 +295,11 @@ enum {
 	Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress,
 	Opt_compress_type, Opt_compress_force, Opt_compress_force_type,
 	Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard,
-	Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed,
-	Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache,
-	Opt_no_space_cache, Opt_recovery, Opt_skip_balance,
-	Opt_check_integrity, Opt_check_integrity_including_extent_data,
+	Opt_space_cache, Opt_space_cache_version, Opt_clear_cache,
+	Opt_user_subvol_rm_allowed, Opt_enospc_debug, Opt_subvolrootid,
+	Opt_defrag, Opt_inode_cache, Opt_no_space_cache, Opt_recovery,
+	Opt_skip_balance, Opt_check_integrity,
+	Opt_check_integrity_including_extent_data,
 	Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree,
 	Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard,
 	Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow,
@@ -309,7 +310,7 @@ enum {
 	Opt_err,
 };
 
-static match_table_t tokens = {
+static const match_table_t tokens = {
 	{Opt_degraded, "degraded"},
 	{Opt_subvol, "subvol=%s"},
 	{Opt_subvolid, "subvolid=%s"},
@@ -340,6 +341,7 @@ static match_table_t tokens = {
 	{Opt_discard, "discard"},
 	{Opt_nodiscard, "nodiscard"},
 	{Opt_space_cache, "space_cache"},
+	{Opt_space_cache_version, "space_cache=%s"},
 	{Opt_clear_cache, "clear_cache"},
 	{Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"},
 	{Opt_enospc_debug, "enospc_debug"},
@@ -381,9 +383,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 	int ret = 0;
 	char *compress_type;
 	bool compress_force = false;
+	enum btrfs_compression_type saved_compress_type;
+	bool saved_compress_force;
+	int no_compress = 0;
 
 	cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
-	if (cache_gen)
+	if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE))
+		btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE);
+	else if (cache_gen)
 		btrfs_set_opt(info->mount_opt, SPACE_CACHE);
 
 	if (!options)
@@ -458,6 +465,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 			/* Fallthrough */
 		case Opt_compress:
 		case Opt_compress_type:
+			saved_compress_type = btrfs_test_opt(root, COMPRESS) ?
+				info->compress_type : BTRFS_COMPRESS_NONE;
+			saved_compress_force =
+				btrfs_test_opt(root, FORCE_COMPRESS);
 			if (token == Opt_compress ||
 			    token == Opt_compress_force ||
 			    strcmp(args[0].from, "zlib") == 0) {
@@ -466,6 +477,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 				btrfs_set_opt(info->mount_opt, COMPRESS);
 				btrfs_clear_opt(info->mount_opt, NODATACOW);
 				btrfs_clear_opt(info->mount_opt, NODATASUM);
+				no_compress = 0;
 			} else if (strcmp(args[0].from, "lzo") == 0) {
 				compress_type = "lzo";
 				info->compress_type = BTRFS_COMPRESS_LZO;
@@ -473,25 +485,21 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 				btrfs_clear_opt(info->mount_opt, NODATACOW);
 				btrfs_clear_opt(info->mount_opt, NODATASUM);
 				btrfs_set_fs_incompat(info, COMPRESS_LZO);
+				no_compress = 0;
 			} else if (strncmp(args[0].from, "no", 2) == 0) {
 				compress_type = "no";
 				btrfs_clear_opt(info->mount_opt, COMPRESS);
 				btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
 				compress_force = false;
+				no_compress++;
 			} else {
 				ret = -EINVAL;
 				goto out;
 			}
 
 			if (compress_force) {
-				btrfs_set_and_info(root, FORCE_COMPRESS,
-						   "force %s compression",
-						   compress_type);
+				btrfs_set_opt(info->mount_opt, FORCE_COMPRESS);
 			} else {
-				if (!btrfs_test_opt(root, COMPRESS))
-					btrfs_info(root->fs_info,
-						   "btrfs: use %s compression",
-						   compress_type);
 				/*
 				 * If we remount from compress-force=xxx to
 				 * compress=xxx, we need clear FORCE_COMPRESS
@@ -500,6 +508,17 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 				 */
 				btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS);
 			}
+			if ((btrfs_test_opt(root, COMPRESS) &&
+			     (info->compress_type != saved_compress_type ||
+			      compress_force != saved_compress_force)) ||
+			    (!btrfs_test_opt(root, COMPRESS) &&
+			     no_compress == 1)) {
+				btrfs_info(root->fs_info,
+					   "%s %s compression",
+					   (compress_force) ? "force" : "use",
+					   compress_type);
+			}
+			compress_force = false;
 			break;
 		case Opt_ssd:
 			btrfs_set_and_info(root, SSD,
@@ -617,15 +636,35 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 					     "turning off discard");
 			break;
 		case Opt_space_cache:
-			btrfs_set_and_info(root, SPACE_CACHE,
-					   "enabling disk space caching");
+		case Opt_space_cache_version:
+			if (token == Opt_space_cache ||
+			    strcmp(args[0].from, "v1") == 0) {
+				btrfs_clear_opt(root->fs_info->mount_opt,
+						FREE_SPACE_TREE);
+				btrfs_set_and_info(root, SPACE_CACHE,
+						   "enabling disk space caching");
+			} else if (strcmp(args[0].from, "v2") == 0) {
+				btrfs_clear_opt(root->fs_info->mount_opt,
+						SPACE_CACHE);
+				btrfs_set_and_info(root, FREE_SPACE_TREE,
+						   "enabling free space tree");
+			} else {
+				ret = -EINVAL;
+				goto out;
+			}
 			break;
 		case Opt_rescan_uuid_tree:
 			btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE);
 			break;
 		case Opt_no_space_cache:
-			btrfs_clear_and_info(root, SPACE_CACHE,
-					     "disabling disk space caching");
+			if (btrfs_test_opt(root, SPACE_CACHE)) {
+				btrfs_clear_and_info(root, SPACE_CACHE,
+						     "disabling disk space caching");
+			}
+			if (btrfs_test_opt(root, FREE_SPACE_TREE)) {
+				btrfs_clear_and_info(root, FREE_SPACE_TREE,
+						     "disabling free space tree");
+			}
 			break;
 		case Opt_inode_cache:
 			btrfs_set_pending_and_info(info, INODE_MAP_CACHE,
@@ -754,8 +793,17 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 		}
 	}
 out:
+	if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE) &&
+	    !btrfs_test_opt(root, FREE_SPACE_TREE) &&
+	    !btrfs_test_opt(root, CLEAR_CACHE)) {
+		btrfs_err(root->fs_info, "cannot disable free space tree");
+		ret = -EINVAL;
+
+	}
 	if (!ret && btrfs_test_opt(root, SPACE_CACHE))
 		btrfs_info(root->fs_info, "disk space caching is enabled");
+	if (!ret && btrfs_test_opt(root, FREE_SPACE_TREE))
+		btrfs_info(root->fs_info, "using free space tree");
 	kfree(orig);
 	return ret;
 }
@@ -1162,6 +1210,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
 		seq_puts(seq, ",noacl");
 	if (btrfs_test_opt(root, SPACE_CACHE))
 		seq_puts(seq, ",space_cache");
+	else if (btrfs_test_opt(root, FREE_SPACE_TREE))
+		seq_puts(seq, ",space_cache=v2");
 	else
 		seq_puts(seq, ",nospace_cache");
 	if (btrfs_test_opt(root, RESCAN_UUID_TREE))
@@ -1514,9 +1564,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags,
 		if ((flags ^ s->s_flags) & MS_RDONLY)
 			error = -EBUSY;
 	} else {
-		char b[BDEVNAME_SIZE];
-
-		strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
+		snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
 		btrfs_sb(s)->bdev_holder = fs_type;
 		error = btrfs_fill_super(s, fs_devices, data,
 					 flags & MS_SILENT ? 1 : 0);
@@ -1865,7 +1913,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes)
 		 * btrfs starts at an offset of at least 1MB when doing chunk
 		 * allocation.
 		 */
-		skip_space = 1024 * 1024;
+		skip_space = SZ_1M;
 
 		/* user can set the offset in fs_info->alloc_start. */
 		if (fs_info->alloc_start &&
@@ -2249,6 +2297,9 @@ static int btrfs_run_sanity_tests(void)
 	if (ret)
 		goto out;
 	ret = btrfs_test_qgroups();
+	if (ret)
+		goto out;
+	ret = btrfs_test_free_space_tree();
 out:
 	btrfs_destroy_test_fs();
 	return ret;
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index e0ac85949..539e7b5e3 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -202,6 +202,7 @@ BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF);
 BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56);
 BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA);
 BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES);
+BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE);
 
 static struct attribute *btrfs_supported_feature_attrs[] = {
 	BTRFS_FEAT_ATTR_PTR(mixed_backref),
@@ -213,6 +214,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = {
 	BTRFS_FEAT_ATTR_PTR(raid56),
 	BTRFS_FEAT_ATTR_PTR(skinny_metadata),
 	BTRFS_FEAT_ATTR_PTR(no_holes),
+	BTRFS_FEAT_ATTR_PTR(free_space_tree),
 	NULL
 };
 
@@ -780,6 +782,39 @@ failure:
 	return error;
 }
 
+
+/*
+ * Change per-fs features in /sys/fs/btrfs/UUID/features to match current
+ * values in superblock. Call after any changes to incompat/compat_ro flags
+ */
+void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info,
+		u64 bit, enum btrfs_feature_set set)
+{
+	struct btrfs_fs_devices *fs_devs;
+	struct kobject *fsid_kobj;
+	u64 features;
+	int ret;
+
+	if (!fs_info)
+		return;
+
+	features = get_features(fs_info, set);
+	ASSERT(bit & supported_feature_masks[set]);
+
+	fs_devs = fs_info->fs_devices;
+	fsid_kobj = &fs_devs->fsid_kobj;
+
+	if (!fsid_kobj->state_initialized)
+		return;
+
+	/*
+	 * FIXME: this is too heavy to update just one value, ideally we'd like
+	 * to use sysfs_update_group but some refactoring is needed first.
+	 */
+	sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group);
+	ret = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group);
+}
+
 static int btrfs_init_debugfs(void)
 {
 #ifdef CONFIG_DEBUG_FS
diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h
index 9c0952212..d7da1a4c2 100644
--- a/fs/btrfs/sysfs.h
+++ b/fs/btrfs/sysfs.h
@@ -56,7 +56,7 @@ static struct btrfs_feature_attr btrfs_attr_##_name = {			     \
 #define BTRFS_FEAT_ATTR_COMPAT(name, feature) \
 	BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature)
 #define BTRFS_FEAT_ATTR_COMPAT_RO(name, feature) \
-	BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT, feature)
+	BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT_RO, feature)
 #define BTRFS_FEAT_ATTR_INCOMPAT(name, feature) \
 	BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature)
 
@@ -90,4 +90,7 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs,
 				struct kobject *parent);
 int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs);
 void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs);
+void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info,
+		u64 bit, enum btrfs_feature_set set);
+
 #endif /* _BTRFS_SYSFS_H_ */
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 9626252ee..0e1e61a7e 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -21,6 +21,9 @@
 #include <linux/magic.h>
 #include "btrfs-tests.h"
 #include "../ctree.h"
+#include "../free-space-cache.h"
+#include "../free-space-tree.h"
+#include "../transaction.h"
 #include "../volumes.h"
 #include "../disk-io.h"
 #include "../qgroup.h"
@@ -79,18 +82,18 @@ void btrfs_destroy_test_fs(void)
 struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void)
 {
 	struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info),
-						GFP_NOFS);
+						GFP_KERNEL);
 
 	if (!fs_info)
 		return fs_info;
 	fs_info->fs_devices = kzalloc(sizeof(struct btrfs_fs_devices),
-				      GFP_NOFS);
+				      GFP_KERNEL);
 	if (!fs_info->fs_devices) {
 		kfree(fs_info);
 		return NULL;
 	}
 	fs_info->super_copy = kzalloc(sizeof(struct btrfs_super_block),
-				      GFP_NOFS);
+				      GFP_KERNEL);
 	if (!fs_info->super_copy) {
 		kfree(fs_info->fs_devices);
 		kfree(fs_info);
@@ -122,6 +125,9 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void)
 	INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
 	INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
 	INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
+	extent_io_tree_init(&fs_info->freed_extents[0], NULL);
+	extent_io_tree_init(&fs_info->freed_extents[1], NULL);
+	fs_info->pinned_extents = &fs_info->freed_extents[0];
 	return fs_info;
 }
 
@@ -169,3 +175,55 @@ void btrfs_free_dummy_root(struct btrfs_root *root)
 	kfree(root);
 }
 
+struct btrfs_block_group_cache *
+btrfs_alloc_dummy_block_group(unsigned long length)
+{
+	struct btrfs_block_group_cache *cache;
+
+	cache = kzalloc(sizeof(*cache), GFP_KERNEL);
+	if (!cache)
+		return NULL;
+	cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
+					GFP_KERNEL);
+	if (!cache->free_space_ctl) {
+		kfree(cache);
+		return NULL;
+	}
+	cache->fs_info = btrfs_alloc_dummy_fs_info();
+	if (!cache->fs_info) {
+		kfree(cache->free_space_ctl);
+		kfree(cache);
+		return NULL;
+	}
+
+	cache->key.objectid = 0;
+	cache->key.offset = length;
+	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+	cache->sectorsize = 4096;
+	cache->full_stripe_len = 4096;
+
+	INIT_LIST_HEAD(&cache->list);
+	INIT_LIST_HEAD(&cache->cluster_list);
+	INIT_LIST_HEAD(&cache->bg_list);
+	btrfs_init_free_space_ctl(cache);
+	mutex_init(&cache->free_space_lock);
+
+	return cache;
+}
+
+void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache)
+{
+	if (!cache)
+		return;
+	__btrfs_remove_free_space_cache(cache->free_space_ctl);
+	kfree(cache->free_space_ctl);
+	kfree(cache);
+}
+
+void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans)
+{
+	memset(trans, 0, sizeof(*trans));
+	trans->transid = 1;
+	INIT_LIST_HEAD(&trans->qgroup_ref_list);
+	trans->type = __TRANS_DUMMY;
+}
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
index fd3954224..054b8c73c 100644
--- a/fs/btrfs/tests/btrfs-tests.h
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -24,17 +24,23 @@
 #define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__)
 
 struct btrfs_root;
+struct btrfs_trans_handle;
 
 int btrfs_test_free_space_cache(void);
 int btrfs_test_extent_buffer_operations(void);
 int btrfs_test_extent_io(void);
 int btrfs_test_inodes(void);
 int btrfs_test_qgroups(void);
+int btrfs_test_free_space_tree(void);
 int btrfs_init_test_fs(void);
 void btrfs_destroy_test_fs(void);
 struct inode *btrfs_new_test_inode(void);
 struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void);
 void btrfs_free_dummy_root(struct btrfs_root *root);
+struct btrfs_block_group_cache *
+btrfs_alloc_dummy_block_group(unsigned long length);
+void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache);
+void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans);
 #else
 static inline int btrfs_test_free_space_cache(void)
 {
@@ -63,6 +69,10 @@ static inline int btrfs_test_qgroups(void)
 {
 	return 0;
 }
+static inline int btrfs_test_free_space_tree(void)
+{
+	return 0;
+}
 #endif
 
 #endif
diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c
index 9e9f23681..669b58201 100644
--- a/fs/btrfs/tests/extent-io-tests.c
+++ b/fs/btrfs/tests/extent-io-tests.c
@@ -18,6 +18,8 @@
 
 #include <linux/pagemap.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/sizes.h>
 #include "btrfs-tests.h"
 #include "../extent_io.h"
 
@@ -70,12 +72,14 @@ static int test_find_delalloc(void)
 	struct page *page;
 	struct page *locked_page = NULL;
 	unsigned long index = 0;
-	u64 total_dirty = 256 * 1024 * 1024;
-	u64 max_bytes = 128 * 1024 * 1024;
+	u64 total_dirty = SZ_256M;
+	u64 max_bytes = SZ_128M;
 	u64 start, end, test_start;
 	u64 found;
 	int ret = -EINVAL;
 
+	test_msg("Running find delalloc tests\n");
+
 	inode = btrfs_new_test_inode();
 	if (!inode) {
 		test_msg("Failed to allocate test inode\n");
@@ -90,7 +94,7 @@ static int test_find_delalloc(void)
 	 * test.
 	 */
 	for (index = 0; index < (total_dirty >> PAGE_CACHE_SHIFT); index++) {
-		page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
+		page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL);
 		if (!page) {
 			test_msg("Failed to allocate test page\n");
 			ret = -ENOMEM;
@@ -109,7 +113,7 @@ static int test_find_delalloc(void)
 	 * |--- delalloc ---|
 	 * |---  search  ---|
 	 */
-	set_extent_delalloc(&tmp, 0, 4095, NULL, GFP_NOFS);
+	set_extent_delalloc(&tmp, 0, 4095, NULL, GFP_KERNEL);
 	start = 0;
 	end = 0;
 	found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
@@ -133,14 +137,14 @@ static int test_find_delalloc(void)
 	 * |--- delalloc ---|
 	 *           |--- search ---|
 	 */
-	test_start = 64 * 1024 * 1024;
+	test_start = SZ_64M;
 	locked_page = find_lock_page(inode->i_mapping,
 				     test_start >> PAGE_CACHE_SHIFT);
 	if (!locked_page) {
 		test_msg("Couldn't find the locked page\n");
 		goto out_bits;
 	}
-	set_extent_delalloc(&tmp, 4096, max_bytes - 1, NULL, GFP_NOFS);
+	set_extent_delalloc(&tmp, 4096, max_bytes - 1, NULL, GFP_KERNEL);
 	start = test_start;
 	end = 0;
 	found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
@@ -195,7 +199,7 @@ static int test_find_delalloc(void)
 	 *
 	 * We are re-using our test_start from above since it works out well.
 	 */
-	set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL, GFP_NOFS);
+	set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL, GFP_KERNEL);
 	start = test_start;
 	end = 0;
 	found = find_lock_delalloc_range(inode, &tmp, locked_page, &start,
@@ -220,8 +224,8 @@ static int test_find_delalloc(void)
 	 * Now to test where we run into a page that is no longer dirty in the
 	 * range we want to find.
 	 */
-	page = find_get_page(inode->i_mapping, (max_bytes + (1 * 1024 * 1024))
-			     >> PAGE_CACHE_SHIFT);
+	page = find_get_page(inode->i_mapping,
+			     (max_bytes + SZ_1M) >> PAGE_CACHE_SHIFT);
 	if (!page) {
 		test_msg("Couldn't find our page\n");
 		goto out_bits;
@@ -258,7 +262,7 @@ static int test_find_delalloc(void)
 	}
 	ret = 0;
 out_bits:
-	clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1, GFP_NOFS);
+	clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1, GFP_KERNEL);
 out:
 	if (locked_page)
 		page_cache_release(locked_page);
@@ -268,8 +272,139 @@ out:
 	return ret;
 }
 
+static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb,
+			     unsigned long len)
+{
+	unsigned long i, x;
+
+	memset(bitmap, 0, len);
+	memset_extent_buffer(eb, 0, 0, len);
+	if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) {
+		test_msg("Bitmap was not zeroed\n");
+		return -EINVAL;
+	}
+
+	bitmap_set(bitmap, 0, len * BITS_PER_BYTE);
+	extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE);
+	if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) {
+		test_msg("Setting all bits failed\n");
+		return -EINVAL;
+	}
+
+	bitmap_clear(bitmap, 0, len * BITS_PER_BYTE);
+	extent_buffer_bitmap_clear(eb, 0, 0, len * BITS_PER_BYTE);
+	if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) {
+		test_msg("Clearing all bits failed\n");
+		return -EINVAL;
+	}
+
+	bitmap_set(bitmap, (PAGE_CACHE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE,
+		   sizeof(long) * BITS_PER_BYTE);
+	extent_buffer_bitmap_set(eb, PAGE_CACHE_SIZE - sizeof(long) / 2, 0,
+				 sizeof(long) * BITS_PER_BYTE);
+	if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) {
+		test_msg("Setting straddling pages failed\n");
+		return -EINVAL;
+	}
+
+	bitmap_set(bitmap, 0, len * BITS_PER_BYTE);
+	bitmap_clear(bitmap,
+		     (PAGE_CACHE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE,
+		     sizeof(long) * BITS_PER_BYTE);
+	extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE);
+	extent_buffer_bitmap_clear(eb, PAGE_CACHE_SIZE - sizeof(long) / 2, 0,
+				   sizeof(long) * BITS_PER_BYTE);
+	if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) {
+		test_msg("Clearing straddling pages failed\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * Generate a wonky pseudo-random bit pattern for the sake of not using
+	 * something repetitive that could miss some hypothetical off-by-n bug.
+	 */
+	x = 0;
+	for (i = 0; i < len / sizeof(long); i++) {
+		x = (0x19660dULL * (u64)x + 0x3c6ef35fULL) & 0xffffffffUL;
+		bitmap[i] = x;
+	}
+	write_extent_buffer(eb, bitmap, 0, len);
+
+	for (i = 0; i < len * BITS_PER_BYTE; i++) {
+		int bit, bit1;
+
+		bit = !!test_bit(i, bitmap);
+		bit1 = !!extent_buffer_test_bit(eb, 0, i);
+		if (bit1 != bit) {
+			test_msg("Testing bit pattern failed\n");
+			return -EINVAL;
+		}
+
+		bit1 = !!extent_buffer_test_bit(eb, i / BITS_PER_BYTE,
+						i % BITS_PER_BYTE);
+		if (bit1 != bit) {
+			test_msg("Testing bit pattern with offset failed\n");
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+static int test_eb_bitmaps(void)
+{
+	unsigned long len = PAGE_CACHE_SIZE * 4;
+	unsigned long *bitmap;
+	struct extent_buffer *eb;
+	int ret;
+
+	test_msg("Running extent buffer bitmap tests\n");
+
+	bitmap = kmalloc(len, GFP_KERNEL);
+	if (!bitmap) {
+		test_msg("Couldn't allocate test bitmap\n");
+		return -ENOMEM;
+	}
+
+	eb = __alloc_dummy_extent_buffer(NULL, 0, len);
+	if (!eb) {
+		test_msg("Couldn't allocate test extent buffer\n");
+		kfree(bitmap);
+		return -ENOMEM;
+	}
+
+	ret = __test_eb_bitmaps(bitmap, eb, len);
+	if (ret)
+		goto out;
+
+	/* Do it over again with an extent buffer which isn't page-aligned. */
+	free_extent_buffer(eb);
+	eb = __alloc_dummy_extent_buffer(NULL, PAGE_CACHE_SIZE / 2, len);
+	if (!eb) {
+		test_msg("Couldn't allocate test extent buffer\n");
+		kfree(bitmap);
+		return -ENOMEM;
+	}
+
+	ret = __test_eb_bitmaps(bitmap, eb, len);
+out:
+	free_extent_buffer(eb);
+	kfree(bitmap);
+	return ret;
+}
+
 int btrfs_test_extent_io(void)
 {
-	test_msg("Running find delalloc tests\n");
-	return test_find_delalloc();
+	int ret;
+
+	test_msg("Running extent I/O tests\n");
+
+	ret = test_find_delalloc();
+	if (ret)
+		goto out;
+
+	ret = test_eb_bitmaps();
+out:
+	test_msg("Extent I/O tests finished\n");
+	return ret;
 }
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
index 8b72b005b..c9ad97b1e 100644
--- a/fs/btrfs/tests/free-space-tests.c
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -23,41 +23,6 @@
 #include "../free-space-cache.h"
 
 #define BITS_PER_BITMAP		(PAGE_CACHE_SIZE * 8)
-static struct btrfs_block_group_cache *init_test_block_group(void)
-{
-	struct btrfs_block_group_cache *cache;
-
-	cache = kzalloc(sizeof(*cache), GFP_NOFS);
-	if (!cache)
-		return NULL;
-	cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
-					GFP_NOFS);
-	if (!cache->free_space_ctl) {
-		kfree(cache);
-		return NULL;
-	}
-	cache->fs_info = btrfs_alloc_dummy_fs_info();
-	if (!cache->fs_info) {
-		kfree(cache->free_space_ctl);
-		kfree(cache);
-		return NULL;
-	}
-
-	cache->key.objectid = 0;
-	cache->key.offset = 1024 * 1024 * 1024;
-	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
-	cache->sectorsize = 4096;
-	cache->full_stripe_len = 4096;
-
-	spin_lock_init(&cache->lock);
-	INIT_LIST_HEAD(&cache->list);
-	INIT_LIST_HEAD(&cache->cluster_list);
-	INIT_LIST_HEAD(&cache->bg_list);
-
-	btrfs_init_free_space_ctl(cache);
-
-	return cache;
-}
 
 /*
  * This test just does basic sanity checking, making sure we can add an exten
@@ -71,59 +36,59 @@ static int test_extents(struct btrfs_block_group_cache *cache)
 	test_msg("Running extent only tests\n");
 
 	/* First just make sure we can remove an entire entry */
-	ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
+	ret = btrfs_add_free_space(cache, 0, SZ_4M);
 	if (ret) {
 		test_msg("Error adding initial extents %d\n", ret);
 		return ret;
 	}
 
-	ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024);
+	ret = btrfs_remove_free_space(cache, 0, SZ_4M);
 	if (ret) {
 		test_msg("Error removing extent %d\n", ret);
 		return ret;
 	}
 
-	if (test_check_exists(cache, 0, 4 * 1024 * 1024)) {
+	if (test_check_exists(cache, 0, SZ_4M)) {
 		test_msg("Full remove left some lingering space\n");
 		return -1;
 	}
 
 	/* Ok edge and middle cases now */
-	ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
+	ret = btrfs_add_free_space(cache, 0, SZ_4M);
 	if (ret) {
 		test_msg("Error adding half extent %d\n", ret);
 		return ret;
 	}
 
-	ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 1 * 1024 * 1024);
+	ret = btrfs_remove_free_space(cache, 3 * SZ_1M, SZ_1M);
 	if (ret) {
 		test_msg("Error removing tail end %d\n", ret);
 		return ret;
 	}
 
-	ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
+	ret = btrfs_remove_free_space(cache, 0, SZ_1M);
 	if (ret) {
 		test_msg("Error removing front end %d\n", ret);
 		return ret;
 	}
 
-	ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096);
+	ret = btrfs_remove_free_space(cache, SZ_2M, 4096);
 	if (ret) {
 		test_msg("Error removing middle piece %d\n", ret);
 		return ret;
 	}
 
-	if (test_check_exists(cache, 0, 1 * 1024 * 1024)) {
+	if (test_check_exists(cache, 0, SZ_1M)) {
 		test_msg("Still have space at the front\n");
 		return -1;
 	}
 
-	if (test_check_exists(cache, 2 * 1024 * 1024, 4096)) {
+	if (test_check_exists(cache, SZ_2M, 4096)) {
 		test_msg("Still have space in the middle\n");
 		return -1;
 	}
 
-	if (test_check_exists(cache, 3 * 1024 * 1024, 1 * 1024 * 1024)) {
+	if (test_check_exists(cache, 3 * SZ_1M, SZ_1M)) {
 		test_msg("Still have space at the end\n");
 		return -1;
 	}
@@ -141,30 +106,30 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache)
 
 	test_msg("Running bitmap only tests\n");
 
-	ret = test_add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1);
+	ret = test_add_free_space_entry(cache, 0, SZ_4M, 1);
 	if (ret) {
 		test_msg("Couldn't create a bitmap entry %d\n", ret);
 		return ret;
 	}
 
-	ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024);
+	ret = btrfs_remove_free_space(cache, 0, SZ_4M);
 	if (ret) {
 		test_msg("Error removing bitmap full range %d\n", ret);
 		return ret;
 	}
 
-	if (test_check_exists(cache, 0, 4 * 1024 * 1024)) {
+	if (test_check_exists(cache, 0, SZ_4M)) {
 		test_msg("Left some space in bitmap\n");
 		return -1;
 	}
 
-	ret = test_add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1);
+	ret = test_add_free_space_entry(cache, 0, SZ_4M, 1);
 	if (ret) {
 		test_msg("Couldn't add to our bitmap entry %d\n", ret);
 		return ret;
 	}
 
-	ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 2 * 1024 * 1024);
+	ret = btrfs_remove_free_space(cache, SZ_1M, SZ_2M);
 	if (ret) {
 		test_msg("Couldn't remove middle chunk %d\n", ret);
 		return ret;
@@ -177,23 +142,21 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache)
 	next_bitmap_offset = (u64)(BITS_PER_BITMAP * 4096);
 
 	/* Test a bit straddling two bitmaps */
-	ret = test_add_free_space_entry(cache, next_bitmap_offset -
-				   (2 * 1024 * 1024), 4 * 1024 * 1024, 1);
+	ret = test_add_free_space_entry(cache, next_bitmap_offset - SZ_2M,
+					SZ_4M, 1);
 	if (ret) {
 		test_msg("Couldn't add space that straddles two bitmaps %d\n",
 				ret);
 		return ret;
 	}
 
-	ret = btrfs_remove_free_space(cache, next_bitmap_offset -
-				      (1 * 1024 * 1024), 2 * 1024 * 1024);
+	ret = btrfs_remove_free_space(cache, next_bitmap_offset - SZ_1M, SZ_2M);
 	if (ret) {
 		test_msg("Couldn't remove overlapping space %d\n", ret);
 		return ret;
 	}
 
-	if (test_check_exists(cache, next_bitmap_offset - (1 * 1024 * 1024),
-			 2 * 1024 * 1024)) {
+	if (test_check_exists(cache, next_bitmap_offset - SZ_1M, SZ_2M)) {
 		test_msg("Left some space when removing overlapping\n");
 		return -1;
 	}
@@ -216,43 +179,43 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
 	 * bitmap, but the free space completely in the extent and then
 	 * completely in the bitmap.
 	 */
-	ret = test_add_free_space_entry(cache, 4 * 1024 * 1024, 1 * 1024 * 1024, 1);
+	ret = test_add_free_space_entry(cache, SZ_4M, SZ_1M, 1);
 	if (ret) {
 		test_msg("Couldn't create bitmap entry %d\n", ret);
 		return ret;
 	}
 
-	ret = test_add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
+	ret = test_add_free_space_entry(cache, 0, SZ_1M, 0);
 	if (ret) {
 		test_msg("Couldn't add extent entry %d\n", ret);
 		return ret;
 	}
 
-	ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
+	ret = btrfs_remove_free_space(cache, 0, SZ_1M);
 	if (ret) {
 		test_msg("Couldn't remove extent entry %d\n", ret);
 		return ret;
 	}
 
-	if (test_check_exists(cache, 0, 1 * 1024 * 1024)) {
+	if (test_check_exists(cache, 0, SZ_1M)) {
 		test_msg("Left remnants after our remove\n");
 		return -1;
 	}
 
 	/* Now to add back the extent entry and remove from the bitmap */
-	ret = test_add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
+	ret = test_add_free_space_entry(cache, 0, SZ_1M, 0);
 	if (ret) {
 		test_msg("Couldn't re-add extent entry %d\n", ret);
 		return ret;
 	}
 
-	ret = btrfs_remove_free_space(cache, 4 * 1024 * 1024, 1 * 1024 * 1024);
+	ret = btrfs_remove_free_space(cache, SZ_4M, SZ_1M);
 	if (ret) {
 		test_msg("Couldn't remove from bitmap %d\n", ret);
 		return ret;
 	}
 
-	if (test_check_exists(cache, 4 * 1024 * 1024, 1 * 1024 * 1024)) {
+	if (test_check_exists(cache, SZ_4M, SZ_1M)) {
 		test_msg("Left remnants in the bitmap\n");
 		return -1;
 	}
@@ -261,19 +224,19 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
 	 * Ok so a little more evil, extent entry and bitmap at the same offset,
 	 * removing an overlapping chunk.
 	 */
-	ret = test_add_free_space_entry(cache, 1 * 1024 * 1024, 4 * 1024 * 1024, 1);
+	ret = test_add_free_space_entry(cache, SZ_1M, SZ_4M, 1);
 	if (ret) {
 		test_msg("Couldn't add to a bitmap %d\n", ret);
 		return ret;
 	}
 
-	ret = btrfs_remove_free_space(cache, 512 * 1024, 3 * 1024 * 1024);
+	ret = btrfs_remove_free_space(cache, SZ_512K, 3 * SZ_1M);
 	if (ret) {
 		test_msg("Couldn't remove overlapping space %d\n", ret);
 		return ret;
 	}
 
-	if (test_check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) {
+	if (test_check_exists(cache, SZ_512K, 3 * SZ_1M)) {
 		test_msg("Left over pieces after removing overlapping\n");
 		return -1;
 	}
@@ -281,25 +244,25 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
 	__btrfs_remove_free_space_cache(cache->free_space_ctl);
 
 	/* Now with the extent entry offset into the bitmap */
-	ret = test_add_free_space_entry(cache, 4 * 1024 * 1024, 4 * 1024 * 1024, 1);
+	ret = test_add_free_space_entry(cache, SZ_4M, SZ_4M, 1);
 	if (ret) {
 		test_msg("Couldn't add space to the bitmap %d\n", ret);
 		return ret;
 	}
 
-	ret = test_add_free_space_entry(cache, 2 * 1024 * 1024, 2 * 1024 * 1024, 0);
+	ret = test_add_free_space_entry(cache, SZ_2M, SZ_2M, 0);
 	if (ret) {
 		test_msg("Couldn't add extent to the cache %d\n", ret);
 		return ret;
 	}
 
-	ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 4 * 1024 * 1024);
+	ret = btrfs_remove_free_space(cache, 3 * SZ_1M, SZ_4M);
 	if (ret) {
 		test_msg("Problem removing overlapping space %d\n", ret);
 		return ret;
 	}
 
-	if (test_check_exists(cache, 3 * 1024 * 1024, 4 * 1024 * 1024)) {
+	if (test_check_exists(cache, 3 * SZ_1M, SZ_4M)) {
 		test_msg("Left something behind when removing space");
 		return -1;
 	}
@@ -315,29 +278,26 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
 	 *        [ del ]
 	 */
 	__btrfs_remove_free_space_cache(cache->free_space_ctl);
-	ret = test_add_free_space_entry(cache, bitmap_offset + 4 * 1024 * 1024,
-				   4 * 1024 * 1024, 1);
+	ret = test_add_free_space_entry(cache, bitmap_offset + SZ_4M, SZ_4M, 1);
 	if (ret) {
 		test_msg("Couldn't add bitmap %d\n", ret);
 		return ret;
 	}
 
-	ret = test_add_free_space_entry(cache, bitmap_offset - 1 * 1024 * 1024,
-				   5 * 1024 * 1024, 0);
+	ret = test_add_free_space_entry(cache, bitmap_offset - SZ_1M,
+					5 * SZ_1M, 0);
 	if (ret) {
 		test_msg("Couldn't add extent entry %d\n", ret);
 		return ret;
 	}
 
-	ret = btrfs_remove_free_space(cache, bitmap_offset + 1 * 1024 * 1024,
-				      5 * 1024 * 1024);
+	ret = btrfs_remove_free_space(cache, bitmap_offset + SZ_1M, 5 * SZ_1M);
 	if (ret) {
 		test_msg("Failed to free our space %d\n", ret);
 		return ret;
 	}
 
-	if (test_check_exists(cache, bitmap_offset + 1 * 1024 * 1024,
-			 5 * 1024 * 1024)) {
+	if (test_check_exists(cache, bitmap_offset + SZ_1M, 5 * SZ_1M)) {
 		test_msg("Left stuff over\n");
 		return -1;
 	}
@@ -350,19 +310,19 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
 	 * to return -EAGAIN back from btrfs_remove_extent, make sure this
 	 * doesn't happen.
 	 */
-	ret = test_add_free_space_entry(cache, 1 * 1024 * 1024, 2 * 1024 * 1024, 1);
+	ret = test_add_free_space_entry(cache, SZ_1M, SZ_2M, 1);
 	if (ret) {
 		test_msg("Couldn't add bitmap entry %d\n", ret);
 		return ret;
 	}
 
-	ret = test_add_free_space_entry(cache, 3 * 1024 * 1024, 1 * 1024 * 1024, 0);
+	ret = test_add_free_space_entry(cache, 3 * SZ_1M, SZ_1M, 0);
 	if (ret) {
 		test_msg("Couldn't add extent entry %d\n", ret);
 		return ret;
 	}
 
-	ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 3 * 1024 * 1024);
+	ret = btrfs_remove_free_space(cache, SZ_1M, 3 * SZ_1M);
 	if (ret) {
 		test_msg("Error removing bitmap and extent overlapping %d\n", ret);
 		return ret;
@@ -445,9 +405,11 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
 	int ret;
 	u64 offset;
 	u64 max_extent_size;
-
-	bool (*use_bitmap_op)(struct btrfs_free_space_ctl *,
-			      struct btrfs_free_space *);
+	const struct btrfs_free_space_op test_free_space_ops = {
+		.recalc_thresholds = cache->free_space_ctl->op->recalc_thresholds,
+		.use_bitmap = test_use_bitmap,
+	};
+	const struct btrfs_free_space_op *orig_free_space_ops;
 
 	test_msg("Running space stealing from bitmap to extent\n");
 
@@ -469,22 +431,21 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
 	 * that forces use of bitmaps as soon as we have at least 1
 	 * extent entry.
 	 */
-	use_bitmap_op = cache->free_space_ctl->op->use_bitmap;
-	cache->free_space_ctl->op->use_bitmap = test_use_bitmap;
+	orig_free_space_ops = cache->free_space_ctl->op;
+	cache->free_space_ctl->op = &test_free_space_ops;
 
 	/*
 	 * Extent entry covering free space range [128Mb - 256Kb, 128Mb - 128Kb[
 	 */
-	ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 - 256 * 1024,
-					128 * 1024, 0);
+	ret = test_add_free_space_entry(cache, SZ_128M - SZ_256K, SZ_128K, 0);
 	if (ret) {
 		test_msg("Couldn't add extent entry %d\n", ret);
 		return ret;
 	}
 
 	/* Bitmap entry covering free space range [128Mb + 512Kb, 256Mb[ */
-	ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 + 512 * 1024,
-					128 * 1024 * 1024 - 512 * 1024, 1);
+	ret = test_add_free_space_entry(cache, SZ_128M + SZ_512K,
+					SZ_128M - SZ_512K, 1);
 	if (ret) {
 		test_msg("Couldn't add bitmap entry %d\n", ret);
 		return ret;
@@ -502,21 +463,19 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
 	 * [128Mb + 512Kb, 128Mb + 768Kb[
 	 */
 	ret = btrfs_remove_free_space(cache,
-				      128 * 1024 * 1024 + 768 * 1024,
-				      128 * 1024 * 1024 - 768 * 1024);
+				      SZ_128M + 768 * SZ_1K,
+				      SZ_128M - 768 * SZ_1K);
 	if (ret) {
 		test_msg("Failed to free part of bitmap space %d\n", ret);
 		return ret;
 	}
 
 	/* Confirm that only those 2 ranges are marked as free. */
-	if (!test_check_exists(cache, 128 * 1024 * 1024 - 256 * 1024,
-			       128 * 1024)) {
+	if (!test_check_exists(cache, SZ_128M - SZ_256K, SZ_128K)) {
 		test_msg("Free space range missing\n");
 		return -ENOENT;
 	}
-	if (!test_check_exists(cache, 128 * 1024 * 1024 + 512 * 1024,
-			       256 * 1024)) {
+	if (!test_check_exists(cache, SZ_128M + SZ_512K, SZ_256K)) {
 		test_msg("Free space range missing\n");
 		return -ENOENT;
 	}
@@ -525,8 +484,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
 	 * Confirm that the bitmap range [128Mb + 768Kb, 256Mb[ isn't marked
 	 * as free anymore.
 	 */
-	if (test_check_exists(cache, 128 * 1024 * 1024 + 768 * 1024,
-			      128 * 1024 * 1024 - 768 * 1024)) {
+	if (test_check_exists(cache, SZ_128M + 768 * SZ_1K,
+			      SZ_128M - 768 * SZ_1K)) {
 		test_msg("Bitmap region not removed from space cache\n");
 		return -EINVAL;
 	}
@@ -535,8 +494,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
 	 * Confirm that the region [128Mb + 256Kb, 128Mb + 512Kb[, which is
 	 * covered by the bitmap, isn't marked as free.
 	 */
-	if (test_check_exists(cache, 128 * 1024 * 1024 + 256 * 1024,
-			      256 * 1024)) {
+	if (test_check_exists(cache, SZ_128M + SZ_256K, SZ_256K)) {
 		test_msg("Invalid bitmap region marked as free\n");
 		return -EINVAL;
 	}
@@ -545,8 +503,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
 	 * Confirm that the region [128Mb, 128Mb + 256Kb[, which is covered
 	 * by the bitmap too, isn't marked as free either.
 	 */
-	if (test_check_exists(cache, 128 * 1024 * 1024,
-			      256 * 1024)) {
+	if (test_check_exists(cache, SZ_128M, SZ_256K)) {
 		test_msg("Invalid bitmap region marked as free\n");
 		return -EINVAL;
 	}
@@ -556,13 +513,13 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
 	 * lets make sure the free space cache marks it as free in the bitmap,
 	 * and doesn't insert a new extent entry to represent this region.
 	 */
-	ret = btrfs_add_free_space(cache, 128 * 1024 * 1024, 512 * 1024);
+	ret = btrfs_add_free_space(cache, SZ_128M, SZ_512K);
 	if (ret) {
 		test_msg("Error adding free space: %d\n", ret);
 		return ret;
 	}
 	/* Confirm the region is marked as free. */
-	if (!test_check_exists(cache, 128 * 1024 * 1024, 512 * 1024)) {
+	if (!test_check_exists(cache, SZ_128M, SZ_512K)) {
 		test_msg("Bitmap region not marked as free\n");
 		return -ENOENT;
 	}
@@ -581,8 +538,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
 	 * The goal is to test that the bitmap entry space stealing doesn't
 	 * steal this space region.
 	 */
-	ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 + 16 * 1024 * 1024,
-				   4096);
+	ret = btrfs_add_free_space(cache, SZ_128M + SZ_16M, 4096);
 	if (ret) {
 		test_msg("Error adding free space: %d\n", ret);
 		return ret;
@@ -601,15 +557,13 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
 	 * expand the range covered by the existing extent entry that represents
 	 * the free space [128Mb - 256Kb, 128Mb - 128Kb[.
 	 */
-	ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 - 128 * 1024,
-				   128 * 1024);
+	ret = btrfs_add_free_space(cache, SZ_128M - SZ_128K, SZ_128K);
 	if (ret) {
 		test_msg("Error adding free space: %d\n", ret);
 		return ret;
 	}
 	/* Confirm the region is marked as free. */
-	if (!test_check_exists(cache, 128 * 1024 * 1024 - 128 * 1024,
-			       128 * 1024)) {
+	if (!test_check_exists(cache, SZ_128M - SZ_128K, SZ_128K)) {
 		test_msg("Extent region not marked as free\n");
 		return -ENOENT;
 	}
@@ -637,21 +591,20 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
 	 * that represents the 1Mb free space, and therefore we're able to
 	 * allocate the whole free space at once.
 	 */
-	if (!test_check_exists(cache, 128 * 1024 * 1024 - 256 * 1024,
-			       1 * 1024 * 1024)) {
+	if (!test_check_exists(cache, SZ_128M - SZ_256K, SZ_1M)) {
 		test_msg("Expected region not marked as free\n");
 		return -ENOENT;
 	}
 
-	if (cache->free_space_ctl->free_space != (1 * 1024 * 1024 + 4096)) {
+	if (cache->free_space_ctl->free_space != (SZ_1M + 4096)) {
 		test_msg("Cache free space is not 1Mb + 4Kb\n");
 		return -EINVAL;
 	}
 
 	offset = btrfs_find_space_for_alloc(cache,
-					    0, 1 * 1024 * 1024, 0,
+					    0, SZ_1M, 0,
 					    &max_extent_size);
-	if (offset != (128 * 1024 * 1024 - 256 * 1024)) {
+	if (offset != (SZ_128M - SZ_256K)) {
 		test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n",
 			 offset);
 		return -EINVAL;
@@ -670,7 +623,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
 	offset = btrfs_find_space_for_alloc(cache,
 					    0, 4096, 0,
 					    &max_extent_size);
-	if (offset != (128 * 1024 * 1024 + 16 * 1024 * 1024)) {
+	if (offset != (SZ_128M + SZ_16M)) {
 		test_msg("Failed to allocate 4Kb from space cache, returned offset is: %llu\n",
 			 offset);
 		return -EINVAL;
@@ -691,16 +644,14 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
 	/*
 	 * Extent entry covering free space range [128Mb + 128Kb, 128Mb + 256Kb[
 	 */
-	ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 + 128 * 1024,
-					128 * 1024, 0);
+	ret = test_add_free_space_entry(cache, SZ_128M + SZ_128K, SZ_128K, 0);
 	if (ret) {
 		test_msg("Couldn't add extent entry %d\n", ret);
 		return ret;
 	}
 
 	/* Bitmap entry covering free space range [0, 128Mb - 512Kb[ */
-	ret = test_add_free_space_entry(cache, 0,
-					128 * 1024 * 1024 - 512 * 1024, 1);
+	ret = test_add_free_space_entry(cache, 0, SZ_128M - SZ_512K, 1);
 	if (ret) {
 		test_msg("Couldn't add bitmap entry %d\n", ret);
 		return ret;
@@ -717,22 +668,18 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
 	 * [128Mb + 128b, 128Mb + 256Kb[
 	 * [128Mb - 768Kb, 128Mb - 512Kb[
 	 */
-	ret = btrfs_remove_free_space(cache,
-				      0,
-				      128 * 1024 * 1024 - 768 * 1024);
+	ret = btrfs_remove_free_space(cache, 0, SZ_128M - 768 * SZ_1K);
 	if (ret) {
 		test_msg("Failed to free part of bitmap space %d\n", ret);
 		return ret;
 	}
 
 	/* Confirm that only those 2 ranges are marked as free. */
-	if (!test_check_exists(cache, 128 * 1024 * 1024 + 128 * 1024,
-			       128 * 1024)) {
+	if (!test_check_exists(cache, SZ_128M + SZ_128K, SZ_128K)) {
 		test_msg("Free space range missing\n");
 		return -ENOENT;
 	}
-	if (!test_check_exists(cache, 128 * 1024 * 1024 - 768 * 1024,
-			       256 * 1024)) {
+	if (!test_check_exists(cache, SZ_128M - 768 * SZ_1K, SZ_256K)) {
 		test_msg("Free space range missing\n");
 		return -ENOENT;
 	}
@@ -741,8 +688,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
 	 * Confirm that the bitmap range [0, 128Mb - 768Kb[ isn't marked
 	 * as free anymore.
 	 */
-	if (test_check_exists(cache, 0,
-			      128 * 1024 * 1024 - 768 * 1024)) {
+	if (test_check_exists(cache, 0, SZ_128M - 768 * SZ_1K)) {
 		test_msg("Bitmap region not removed from space cache\n");
 		return -EINVAL;
 	}
@@ -751,8 +697,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
 	 * Confirm that the region [128Mb - 512Kb, 128Mb[, which is
 	 * covered by the bitmap, isn't marked as free.
 	 */
-	if (test_check_exists(cache, 128 * 1024 * 1024 - 512 * 1024,
-			      512 * 1024)) {
+	if (test_check_exists(cache, SZ_128M - SZ_512K, SZ_512K)) {
 		test_msg("Invalid bitmap region marked as free\n");
 		return -EINVAL;
 	}
@@ -762,15 +707,13 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
 	 * lets make sure the free space cache marks it as free in the bitmap,
 	 * and doesn't insert a new extent entry to represent this region.
 	 */
-	ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 - 512 * 1024,
-				   512 * 1024);
+	ret = btrfs_add_free_space(cache, SZ_128M - SZ_512K, SZ_512K);
 	if (ret) {
 		test_msg("Error adding free space: %d\n", ret);
 		return ret;
 	}
 	/* Confirm the region is marked as free. */
-	if (!test_check_exists(cache, 128 * 1024 * 1024 - 512 * 1024,
-			       512 * 1024)) {
+	if (!test_check_exists(cache, SZ_128M - SZ_512K, SZ_512K)) {
 		test_msg("Bitmap region not marked as free\n");
 		return -ENOENT;
 	}
@@ -789,7 +732,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
 	 * The goal is to test that the bitmap entry space stealing doesn't
 	 * steal this space region.
 	 */
-	ret = btrfs_add_free_space(cache, 32 * 1024 * 1024, 8192);
+	ret = btrfs_add_free_space(cache, SZ_32M, 8192);
 	if (ret) {
 		test_msg("Error adding free space: %d\n", ret);
 		return ret;
@@ -800,13 +743,13 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
 	 * expand the range covered by the existing extent entry that represents
 	 * the free space [128Mb + 128Kb, 128Mb + 256Kb[.
 	 */
-	ret = btrfs_add_free_space(cache, 128 * 1024 * 1024, 128 * 1024);
+	ret = btrfs_add_free_space(cache, SZ_128M, SZ_128K);
 	if (ret) {
 		test_msg("Error adding free space: %d\n", ret);
 		return ret;
 	}
 	/* Confirm the region is marked as free. */
-	if (!test_check_exists(cache, 128 * 1024 * 1024, 128 * 1024)) {
+	if (!test_check_exists(cache, SZ_128M, SZ_128K)) {
 		test_msg("Extent region not marked as free\n");
 		return -ENOENT;
 	}
@@ -834,21 +777,19 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
 	 * that represents the 1Mb free space, and therefore we're able to
 	 * allocate the whole free space at once.
 	 */
-	if (!test_check_exists(cache, 128 * 1024 * 1024 - 768 * 1024,
-			       1 * 1024 * 1024)) {
+	if (!test_check_exists(cache, SZ_128M - 768 * SZ_1K, SZ_1M)) {
 		test_msg("Expected region not marked as free\n");
 		return -ENOENT;
 	}
 
-	if (cache->free_space_ctl->free_space != (1 * 1024 * 1024 + 8192)) {
+	if (cache->free_space_ctl->free_space != (SZ_1M + 8192)) {
 		test_msg("Cache free space is not 1Mb + 8Kb\n");
 		return -EINVAL;
 	}
 
-	offset = btrfs_find_space_for_alloc(cache,
-					    0, 1 * 1024 * 1024, 0,
+	offset = btrfs_find_space_for_alloc(cache, 0, SZ_1M, 0,
 					    &max_extent_size);
-	if (offset != (128 * 1024 * 1024 - 768 * 1024)) {
+	if (offset != (SZ_128M - 768 * SZ_1K)) {
 		test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n",
 			 offset);
 		return -EINVAL;
@@ -867,7 +808,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
 	offset = btrfs_find_space_for_alloc(cache,
 					    0, 8192, 0,
 					    &max_extent_size);
-	if (offset != (32 * 1024 * 1024)) {
+	if (offset != SZ_32M) {
 		test_msg("Failed to allocate 8Kb from space cache, returned offset is: %llu\n",
 			 offset);
 		return -EINVAL;
@@ -877,7 +818,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache)
 	if (ret)
 		return ret;
 
-	cache->free_space_ctl->op->use_bitmap = use_bitmap_op;
+	cache->free_space_ctl->op = orig_free_space_ops;
 	__btrfs_remove_free_space_cache(cache->free_space_ctl);
 
 	return 0;
@@ -891,7 +832,7 @@ int btrfs_test_free_space_cache(void)
 
 	test_msg("Running btrfs free space cache tests\n");
 
-	cache = init_test_block_group();
+	cache = btrfs_alloc_dummy_block_group(1024 * 1024 * 1024);
 	if (!cache) {
 		test_msg("Couldn't run the tests\n");
 		return 0;
@@ -922,9 +863,7 @@ int btrfs_test_free_space_cache(void)
 
 	ret = test_steal_space_from_bitmap_to_extent(cache);
 out:
-	__btrfs_remove_free_space_cache(cache->free_space_ctl);
-	kfree(cache->free_space_ctl);
-	kfree(cache);
+	btrfs_free_dummy_block_group(cache);
 	btrfs_free_dummy_root(root);
 	test_msg("Free space cache tests finished\n");
 	return ret;
diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c
new file mode 100644
index 000000000..d05fe1ab4
--- /dev/null
+++ b/fs/btrfs/tests/free-space-tree-tests.c
@@ -0,0 +1,571 @@
+/*
+ * Copyright (C) 2015 Facebook.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include "btrfs-tests.h"
+#include "../ctree.h"
+#include "../disk-io.h"
+#include "../free-space-tree.h"
+#include "../transaction.h"
+
+struct free_space_extent {
+	u64 start, length;	/* expected free run: offset, size */
+};
+
+/*
+ * The test cases align their operations to this in order to hit some of the
+ * edge cases in the bitmap code. NOTE(review): 4096 presumably = sectorsize.
+ */
+#define BITMAP_RANGE (BTRFS_FREE_SPACE_BITMAP_BITS * 4096)
+
+/* Check that the free space items of @cache match @extents exactly. */
+static int __check_free_space_extents(struct btrfs_trans_handle *trans,
+				      struct btrfs_fs_info *fs_info,
+				      struct btrfs_block_group_cache *cache,
+				      struct btrfs_path *path,
+				      struct free_space_extent *extents,
+				      unsigned int num_extents)
+{
+	struct btrfs_free_space_info *info;
+	struct btrfs_key key;
+	int prev_bit = 0, bit;
+	u64 extent_start = 0, offset, end;
+	u32 flags, extent_count;
+	unsigned int i;
+	int ret;
+
+	info = search_free_space_info(trans, fs_info, cache, path, 0);
+	if (IS_ERR(info)) {
+		test_msg("Could not find free space info\n");
+		ret = PTR_ERR(info);
+		goto out;
+	}
+	flags = btrfs_free_space_flags(path->nodes[0], info);
+	extent_count = btrfs_free_space_extent_count(path->nodes[0], info);
+
+	if (extent_count != num_extents) {
+		test_msg("Extent count is wrong\n");
+		ret = -EINVAL;
+		goto out;
+	}
+	if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
+		if (path->slots[0] != 0)
+			goto invalid;
+		end = cache->key.objectid + cache->key.offset;
+		i = 0;
+		while (++path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
+			btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+			if (key.type != BTRFS_FREE_SPACE_BITMAP_KEY)
+				goto invalid;
+			offset = key.objectid;
+			while (offset < key.objectid + key.offset) {
+				bit = free_space_test_bit(cache, path, offset);
+				if (prev_bit == 0 && bit == 1) {
+					extent_start = offset;
+				} else if (prev_bit == 1 && bit == 0) {
+					if (i >= num_extents ||
+					    extent_start != extents[i].start ||
+					    offset - extent_start != extents[i].length)
+						goto invalid;
+					i++;
+				}
+				prev_bit = bit;
+				offset += cache->sectorsize;
+			}
+		}
+		/* A free run still open here ends at the block group's end. */
+		if (prev_bit == 1) {
+			if (i >= num_extents ||
+			    extent_start != extents[i].start ||
+			    end - extent_start != extents[i].length)
+				goto invalid;
+			i++;
+		}
+		if (i != num_extents)
+			goto invalid;
+	} else {
+		if (btrfs_header_nritems(path->nodes[0]) != num_extents + 1 ||
+		    path->slots[0] != 0)
+			goto invalid;
+		for (i = 0; i < num_extents; i++) {
+			path->slots[0]++;
+			btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+			if (key.type != BTRFS_FREE_SPACE_EXTENT_KEY ||
+			    key.objectid != extents[i].start ||
+			    key.offset != extents[i].length)
+				goto invalid;
+		}
+	}
+
+	ret = 0;
+out:
+	btrfs_release_path(path);
+	return ret;
+invalid:
+	test_msg("Free space tree is invalid\n");
+	ret = -EINVAL;
+	goto out;
+}
+
+/* Check @extents in the current format, then again in the opposite one. */
+static int check_free_space_extents(struct btrfs_trans_handle *trans,
+				    struct btrfs_fs_info *fs_info,
+				    struct btrfs_block_group_cache *cache,
+				    struct btrfs_path *path,
+				    struct free_space_extent *extents,
+				    unsigned int num_extents)
+{
+	struct btrfs_free_space_info *fsi;
+	u32 fmt_flags;
+	int err;
+
+	fsi = search_free_space_info(trans, fs_info, cache, path, 0);
+	if (IS_ERR(fsi)) {
+		test_msg("Could not find free space info\n");
+		btrfs_release_path(path);
+		return PTR_ERR(fsi);
+	}
+	fmt_flags = btrfs_free_space_flags(path->nodes[0], fsi);
+	btrfs_release_path(path);
+
+	err = __check_free_space_extents(trans, fs_info, cache, path, extents,
+					 num_extents);
+	if (err)
+		return err;
+
+	/* Flip it to the other format and check that for good measure. */
+	if (fmt_flags & BTRFS_FREE_SPACE_USING_BITMAPS) {
+		err = convert_free_space_to_extents(trans, fs_info, cache, path);
+		if (err)
+			test_msg("Could not convert to extents\n");
+	} else {
+		err = convert_free_space_to_bitmaps(trans, fs_info, cache, path);
+		if (err)
+			test_msg("Could not convert to bitmaps\n");
+	}
+	if (err)
+		return err;
+
+	return __check_free_space_extents(trans, fs_info, cache, path, extents,
+					  num_extents);
+}
+
+/* With nothing removed, the whole block group must be one free extent. */
+static int test_empty_block_group(struct btrfs_trans_handle *trans,
+				  struct btrfs_fs_info *fs_info,
+				  struct btrfs_block_group_cache *cache,
+				  struct btrfs_path *path)
+{
+	struct free_space_extent whole = {
+		.start = cache->key.objectid, .length = cache->key.offset,
+	};
+
+	return check_free_space_extents(trans, fs_info, cache, path, &whole, 1);
+}
+
+/* Remove the whole range; no free space extent may be left. */
+static int test_remove_all(struct btrfs_trans_handle *trans,
+			   struct btrfs_fs_info *fs_info,
+			   struct btrfs_block_group_cache *cache,
+			   struct btrfs_path *path)
+{
+	struct free_space_extent none[] = {};
+	int err = __remove_from_free_space_tree(trans, fs_info, cache, path,
+						cache->key.objectid,
+						cache->key.offset);
+
+	if (err) {
+		test_msg("Could not remove free space\n");
+		return err;
+	}
+
+	return check_free_space_extents(trans, fs_info, cache, path,
+					none, ARRAY_SIZE(none));
+}
+
+/* Remove the first BITMAP_RANGE; the rest must stay one free extent. */
+static int test_remove_beginning(struct btrfs_trans_handle *trans,
+				 struct btrfs_fs_info *fs_info,
+				 struct btrfs_block_group_cache *cache,
+				 struct btrfs_path *path)
+{
+	const u64 bg_start = cache->key.objectid;
+	struct free_space_extent tail = {
+		.start = bg_start + BITMAP_RANGE,
+		.length = cache->key.offset - BITMAP_RANGE,
+	};
+	int err;
+
+	err = __remove_from_free_space_tree(trans, fs_info, cache, path,
+					    bg_start, BITMAP_RANGE);
+	if (err) {
+		test_msg("Could not remove free space\n");
+		return err;
+	}
+
+	return check_free_space_extents(trans, fs_info, cache, path, &tail, 1);
+}
+
+/* Remove the last BITMAP_RANGE; the rest must stay one free extent. */
+static int test_remove_end(struct btrfs_trans_handle *trans,
+			   struct btrfs_fs_info *fs_info,
+			   struct btrfs_block_group_cache *cache,
+			   struct btrfs_path *path)
+{
+	const u64 kept = cache->key.offset - BITMAP_RANGE;
+	struct free_space_extent head = {
+		.start = cache->key.objectid, .length = kept,
+	};
+	int err;
+
+	err = __remove_from_free_space_tree(trans, fs_info, cache, path,
+					    cache->key.objectid + kept,
+					    BITMAP_RANGE);
+	if (err) {
+		test_msg("Could not remove free space\n");
+		return err;
+	}
+
+	return check_free_space_extents(trans, fs_info, cache, path, &head, 1);
+}
+
+/* Remove the second BITMAP_RANGE; the free space must split in two. */
+static int test_remove_middle(struct btrfs_trans_handle *trans,
+			      struct btrfs_fs_info *fs_info,
+			      struct btrfs_block_group_cache *cache,
+			      struct btrfs_path *path)
+{
+	struct free_space_extent expected[] = {
+		{ cache->key.objectid, BITMAP_RANGE },
+		{ cache->key.objectid + 2 * BITMAP_RANGE,
+		  cache->key.offset - 2 * BITMAP_RANGE },
+	};
+	int err = __remove_from_free_space_tree(trans, fs_info, cache, path,
+						cache->key.objectid + BITMAP_RANGE,
+						BITMAP_RANGE);
+
+	if (err) {
+		test_msg("Could not remove free space\n");
+		return err;
+	}
+
+	return check_free_space_extents(trans, fs_info, cache, path,
+					expected, ARRAY_SIZE(expected));
+}
+
+/*
+ * Free two adjacent ranges, lower one first; they must merge into one extent.
+ */
+static int test_merge_left(struct btrfs_trans_handle *trans,
+			   struct btrfs_fs_info *fs_info,
+			   struct btrfs_block_group_cache *cache,
+			   struct btrfs_path *path)
+{
+	const u64 bg_start = cache->key.objectid;
+	const u64 add_at[] = { bg_start, bg_start + BITMAP_RANGE };
+	struct free_space_extent merged = {
+		.start = bg_start,
+		.length = 2 * BITMAP_RANGE,
+	};
+	unsigned int n;
+	int err;
+
+	err = __remove_from_free_space_tree(trans, fs_info, cache, path,
+					    bg_start, cache->key.offset);
+	if (err) {
+		test_msg("Could not remove free space\n");
+		return err;
+	}
+
+	for (n = 0; n < ARRAY_SIZE(add_at); n++) {
+		err = __add_to_free_space_tree(trans, fs_info, cache, path,
+					       add_at[n], BITMAP_RANGE);
+		if (err) {
+			test_msg("Could not add free space\n");
+			return err;
+		}
+	}
+
+	return check_free_space_extents(trans, fs_info, cache, path,
+					&merged, 1);
+}
+
+/* Free two adjacent ranges, upper one first; they must merge into one. */
+static int test_merge_right(struct btrfs_trans_handle *trans,
+			    struct btrfs_fs_info *fs_info,
+			    struct btrfs_block_group_cache *cache,
+			    struct btrfs_path *path)
+{
+	const u64 bg_start = cache->key.objectid;
+	const u64 add_at[] = {
+		bg_start + 2 * BITMAP_RANGE,
+		bg_start + BITMAP_RANGE,
+	};
+	struct free_space_extent merged = {
+		.start = bg_start + BITMAP_RANGE,
+		.length = 2 * BITMAP_RANGE,
+	};
+	unsigned int n;
+	int err;
+
+	err = __remove_from_free_space_tree(trans, fs_info, cache, path,
+					    bg_start, cache->key.offset);
+	if (err) {
+		test_msg("Could not remove free space\n");
+		return err;
+	}
+
+	for (n = 0; n < ARRAY_SIZE(add_at); n++) {
+		err = __add_to_free_space_tree(trans, fs_info, cache, path,
+					       add_at[n], BITMAP_RANGE);
+		if (err) {
+			test_msg("Could not add free space\n");
+			return err;
+		}
+	}
+
+	return check_free_space_extents(trans, fs_info, cache, path,
+					&merged, 1);
+}
+
+/*
+ * Free the first and third BITMAP_RANGE of an emptied block group, then the
+ * one between them. The last addition has free space on both sides, and all
+ * three ranges must end up as a single extent.
+ */
+static int test_merge_both(struct btrfs_trans_handle *trans,
+			   struct btrfs_fs_info *fs_info,
+			   struct btrfs_block_group_cache *cache,
+			   struct btrfs_path *path)
+{
+	const u64 bg_start = cache->key.objectid;
+	const u64 add_at[] = {
+		bg_start,
+		bg_start + 2 * BITMAP_RANGE,
+		bg_start + BITMAP_RANGE,
+	};
+	struct free_space_extent merged = {
+		.start = bg_start,
+		.length = 3 * BITMAP_RANGE,
+	};
+	unsigned int n;
+	int err;
+
+	/* Start with no free space at all in the block group. */
+	err = __remove_from_free_space_tree(trans, fs_info, cache, path,
+					    bg_start, cache->key.offset);
+	if (err) {
+		test_msg("Could not remove free space\n");
+		return err;
+	}
+
+	/* Order matters: the middle range goes in last. */
+	for (n = 0; n < ARRAY_SIZE(add_at); n++) {
+		err = __add_to_free_space_tree(trans, fs_info, cache, path,
+					       add_at[n], BITMAP_RANGE);
+		if (err) {
+			test_msg("Could not add free space\n");
+			return err;
+		}
+	}
+
+	return check_free_space_extents(trans, fs_info, cache, path,
+					&merged, 1);
+}
+
+/*
+ * Free three BITMAP_RANGE pieces of an emptied block group that do not touch
+ * each other, adding the middle one last. Nothing may be merged: each piece
+ * must show up as its own extent, listed here in ascending order of start
+ * offset.
+ */
+static int test_merge_none(struct btrfs_trans_handle *trans,
+			   struct btrfs_fs_info *fs_info,
+			   struct btrfs_block_group_cache *cache,
+			   struct btrfs_path *path)
+{
+	const u64 bg_start = cache->key.objectid;
+	const u64 add_at[] = {
+		bg_start,
+		bg_start + 4 * BITMAP_RANGE,
+		bg_start + 2 * BITMAP_RANGE,
+	};
+	struct free_space_extent expected[] = {
+		{ bg_start, BITMAP_RANGE },
+		{ bg_start + 2 * BITMAP_RANGE, BITMAP_RANGE },
+		{ bg_start + 4 * BITMAP_RANGE, BITMAP_RANGE },
+	};
+	unsigned int n;
+	int err;
+
+	/* Start with no free space at all in the block group. */
+	err = __remove_from_free_space_tree(trans, fs_info, cache, path,
+					    bg_start, cache->key.offset);
+	if (err) {
+		test_msg("Could not remove free space\n");
+		return err;
+	}
+
+	/* Each addition leaves a BITMAP_RANGE gap to its neighbours. */
+	for (n = 0; n < ARRAY_SIZE(add_at); n++) {
+		err = __add_to_free_space_tree(trans, fs_info, cache, path,
+					       add_at[n], BITMAP_RANGE);
+		if (err) {
+			test_msg("Could not add free space\n");
+			return err;
+		}
+	}
+
+	return check_free_space_extents(trans, fs_info, cache, path,
+					expected, ARRAY_SIZE(expected));
+}
+
+typedef int (*test_func_t)(struct btrfs_trans_handle *,
+			   struct btrfs_fs_info *,
+			   struct btrfs_block_group_cache *,
+			   struct btrfs_path *);	/* non-zero = failed */
+
+/* Run @test_func on a dummy block group, in bitmap format if @bitmaps. */
+static int run_test(test_func_t test_func, int bitmaps)
+{
+	struct btrfs_root *root = NULL;
+	struct btrfs_block_group_cache *cache = NULL;
+	struct btrfs_trans_handle trans;
+	struct btrfs_path *path = NULL;
+	int ret;
+
+	root = btrfs_alloc_dummy_root();
+	if (IS_ERR(root)) {
+		test_msg("Couldn't allocate dummy root\n");
+		/* Nothing is held yet, and root is an error pointer. */
+		return PTR_ERR(root);
+	}
+
+	root->fs_info = btrfs_alloc_dummy_fs_info();
+	if (!root->fs_info) {
+		test_msg("Couldn't allocate dummy fs info\n");
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	btrfs_set_super_compat_ro_flags(root->fs_info->super_copy,
+					BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE);
+	root->fs_info->free_space_root = root;
+	root->fs_info->tree_root = root;
+
+	root->node = alloc_test_extent_buffer(root->fs_info, 4096);
+	if (!root->node) {
+		test_msg("Couldn't allocate dummy buffer\n");
+		ret = -ENOMEM;
+		goto out;
+	}
+	btrfs_set_header_level(root->node, 0);
+	btrfs_set_header_nritems(root->node, 0);
+	root->alloc_bytenr += 8192;
+
+	cache = btrfs_alloc_dummy_block_group(8 * BITMAP_RANGE);
+	if (!cache) {
+		test_msg("Couldn't allocate dummy block group cache\n");
+		ret = -ENOMEM;
+		goto out;
+	}
+	cache->bitmap_low_thresh = 0;
+	cache->bitmap_high_thresh = (u32)-1;
+	cache->needs_free_space = 1;
+
+	btrfs_init_dummy_trans(&trans);
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		test_msg("Couldn't allocate path\n");
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = add_block_group_free_space(&trans, root->fs_info, cache);
+	if (ret) {
+		test_msg("Could not add block group free space\n");
+		goto out;
+	}
+
+	if (bitmaps) {
+		ret = convert_free_space_to_bitmaps(&trans, root->fs_info,
+						    cache, path);
+		if (ret) {
+			test_msg("Could not convert block group to bitmaps\n");
+			goto out;
+		}
+	}
+
+	ret = test_func(&trans, root->fs_info, cache, path);
+	if (ret)
+		goto out;
+
+	ret = remove_block_group_free_space(&trans, root->fs_info, cache);
+	if (ret) {
+		test_msg("Could not remove block group free space\n");
+		goto out;
+	}
+
+	/* ret is 0 here; it only changes if the tree still holds items. */
+	if (btrfs_header_nritems(root->node) != 0) {
+		test_msg("Free space tree has leftover items\n");
+		ret = -EINVAL;
+	}
+out:
+	btrfs_free_path(path);
+	btrfs_free_dummy_block_group(cache);
+	btrfs_free_dummy_root(root);
+	return ret;
+}
+
+/* Run @test_func without, then (if that passed) with, bitmap conversion. */
+static int run_test_both_formats(test_func_t test_func)
+{
+	int err = run_test(test_func, 0);
+
+	if (!err)
+		err = run_test(test_func, 1);
+	return err;
+}
+
+/* Run every test case in both formats; stop at the first failure. */
+int btrfs_test_free_space_tree(void)
+{
+	test_func_t cases[] = {
+		test_empty_block_group,
+		test_remove_all,
+		test_remove_beginning,
+		test_remove_end,
+		test_remove_middle,
+		test_merge_left,
+		test_merge_right,
+		test_merge_both,
+		test_merge_none,
+	};
+	int idx, ret = 0;
+
+	test_msg("Running free space tree tests\n");
+	for (idx = 0; !ret && idx < ARRAY_SIZE(cases); idx++) {
+		ret = run_test_both_formats(cases[idx]);
+		/* %pf prints the symbol name behind the function pointer. */
+		if (ret)
+			test_msg("%pf failed\n", cases[idx]);
+	}
+
+	return ret;
+}
diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c
index 054fc0d97..e2d3da02d 100644
--- a/fs/btrfs/tests/inode-tests.c
+++ b/fs/btrfs/tests/inode-tests.c
@@ -100,7 +100,7 @@ static void insert_inode_item_key(struct btrfs_root *root)
 static void setup_file_extents(struct btrfs_root *root)
 {
 	int slot = 0;
-	u64 disk_bytenr = 1 * 1024 * 1024;
+	u64 disk_bytenr = SZ_1M;
 	u64 offset = 0;
 
 	/* First we want a hole */
@@ -974,7 +974,7 @@ static int test_extent_accounting(void)
 			       (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095,
 			       EXTENT_DELALLOC | EXTENT_DIRTY |
 			       EXTENT_UPTODATE | EXTENT_DO_ACCOUNTING, 0, 0,
-			       NULL, GFP_NOFS);
+			       NULL, GFP_KERNEL);
 	if (ret) {
 		test_msg("clear_extent_bit returned %d\n", ret);
 		goto out;
@@ -1045,7 +1045,7 @@ static int test_extent_accounting(void)
 			       BTRFS_MAX_EXTENT_SIZE+8191,
 			       EXTENT_DIRTY | EXTENT_DELALLOC |
 			       EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0,
-			       NULL, GFP_NOFS);
+			       NULL, GFP_KERNEL);
 	if (ret) {
 		test_msg("clear_extent_bit returned %d\n", ret);
 		goto out;
@@ -1079,7 +1079,7 @@ static int test_extent_accounting(void)
 	ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
 			       EXTENT_DIRTY | EXTENT_DELALLOC |
 			       EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0,
-			       NULL, GFP_NOFS);
+			       NULL, GFP_KERNEL);
 	if (ret) {
 		test_msg("clear_extent_bit returned %d\n", ret);
 		goto out;
@@ -1096,7 +1096,7 @@ out:
 		clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1,
 				 EXTENT_DIRTY | EXTENT_DELALLOC |
 				 EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0,
-				 NULL, GFP_NOFS);
+				 NULL, GFP_KERNEL);
 	iput(inode);
 	btrfs_free_dummy_root(root);
 	return ret;
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index 846d277b1..8ea5d34bc 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -23,14 +23,6 @@
 #include "../qgroup.h"
 #include "../backref.h"
 
-static void init_dummy_trans(struct btrfs_trans_handle *trans)
-{
-	memset(trans, 0, sizeof(*trans));
-	trans->transid = 1;
-	INIT_LIST_HEAD(&trans->qgroup_ref_list);
-	trans->type = __TRANS_DUMMY;
-}
-
 static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr,
 				  u64 num_bytes, u64 parent, u64 root_objectid)
 {
@@ -44,7 +36,7 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr,
 	u32 size = sizeof(*item) + sizeof(*iref) + sizeof(*block_info);
 	int ret;
 
-	init_dummy_trans(&trans);
+	btrfs_init_dummy_trans(&trans);
 
 	ins.objectid = bytenr;
 	ins.type = BTRFS_EXTENT_ITEM_KEY;
@@ -94,7 +86,7 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes,
 	u64 refs;
 	int ret;
 
-	init_dummy_trans(&trans);
+	btrfs_init_dummy_trans(&trans);
 
 	key.objectid = bytenr;
 	key.type = BTRFS_EXTENT_ITEM_KEY;
@@ -144,7 +136,7 @@ static int remove_extent_item(struct btrfs_root *root, u64 bytenr,
 	struct btrfs_path *path;
 	int ret;
 
-	init_dummy_trans(&trans);
+	btrfs_init_dummy_trans(&trans);
 
 	key.objectid = bytenr;
 	key.type = BTRFS_EXTENT_ITEM_KEY;
@@ -178,7 +170,7 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr,
 	u64 refs;
 	int ret;
 
-	init_dummy_trans(&trans);
+	btrfs_init_dummy_trans(&trans);
 
 	key.objectid = bytenr;
 	key.type = BTRFS_EXTENT_ITEM_KEY;
@@ -232,7 +224,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root)
 	struct ulist *new_roots = NULL;
 	int ret;
 
-	init_dummy_trans(&trans);
+	btrfs_init_dummy_trans(&trans);
 
 	test_msg("Qgroup basic add\n");
 	ret = btrfs_create_qgroup(NULL, fs_info, 5);
@@ -326,7 +318,7 @@ static int test_multiple_refs(struct btrfs_root *root)
 	struct ulist *new_roots = NULL;
 	int ret;
 
-	init_dummy_trans(&trans);
+	btrfs_init_dummy_trans(&trans);
 
 	test_msg("Qgroup multiple refs test\n");
 
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index be8eae80f..b6031ce47 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -75,6 +75,23 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
 			list_del_init(&em->list);
 			free_extent_map(em);
 		}
+		/*
+		 * If any block groups are found in ->deleted_bgs then it's
+		 * because the transaction was aborted and a commit did not
+		 * happen (things failed before writing the new superblock
+		 * and calling btrfs_finish_extent_commit()), so we can not
+		 * discard the physical locations of the block groups.
+		 */
+		while (!list_empty(&transaction->deleted_bgs)) {
+			struct btrfs_block_group_cache *cache;
+
+			cache = list_first_entry(&transaction->deleted_bgs,
+						 struct btrfs_block_group_cache,
+						 bg_list);
+			list_del_init(&cache->bg_list);
+			btrfs_put_block_group_trimming(cache);
+			btrfs_put_block_group(cache);
+		}
 		kmem_cache_free(btrfs_transaction_cachep, transaction);
 	}
 }
@@ -634,17 +651,20 @@ struct btrfs_trans_handle *btrfs_start_transaction_lflush(
 
 struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root)
 {
-	return start_transaction(root, 0, TRANS_JOIN, 0);
+	return start_transaction(root, 0, TRANS_JOIN,
+				 BTRFS_RESERVE_NO_FLUSH);
 }
 
 struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root)
 {
-	return start_transaction(root, 0, TRANS_JOIN_NOLOCK, 0);
+	return start_transaction(root, 0, TRANS_JOIN_NOLOCK,
+				 BTRFS_RESERVE_NO_FLUSH);
 }
 
 struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root)
 {
-	return start_transaction(root, 0, TRANS_USERSPACE, 0);
+	return start_transaction(root, 0, TRANS_USERSPACE,
+				 BTRFS_RESERVE_NO_FLUSH);
 }
 
 /*
@@ -662,7 +682,8 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root
  */
 struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root)
 {
-	return start_transaction(root, 0, TRANS_ATTACH, 0);
+	return start_transaction(root, 0, TRANS_ATTACH,
+				 BTRFS_RESERVE_NO_FLUSH);
 }
 
 /*
@@ -677,7 +698,8 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root)
 {
 	struct btrfs_trans_handle *trans;
 
-	trans = start_transaction(root, 0, TRANS_ATTACH, 0);
+	trans = start_transaction(root, 0, TRANS_ATTACH,
+				  BTRFS_RESERVE_NO_FLUSH);
 	if (IS_ERR(trans) && PTR_ERR(trans) == -ENOENT)
 		btrfs_wait_for_commit(root, 0);
 
@@ -1319,17 +1341,11 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 	u64 root_flags;
 	uuid_le new_uuid;
 
-	path = btrfs_alloc_path();
-	if (!path) {
-		pending->error = -ENOMEM;
-		return 0;
-	}
+	ASSERT(pending->path);
+	path = pending->path;
 
-	new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS);
-	if (!new_root_item) {
-		pending->error = -ENOMEM;
-		goto root_item_alloc_fail;
-	}
+	ASSERT(pending->root_item);
+	new_root_item = pending->root_item;
 
 	pending->error = btrfs_find_free_objectid(tree_root, &objectid);
 	if (pending->error)
@@ -1562,8 +1578,10 @@ clear_skip_qgroup:
 	btrfs_clear_skip_qgroup(trans);
 no_free_objectid:
 	kfree(new_root_item);
-root_item_alloc_fail:
+	pending->root_item = NULL;
 	btrfs_free_path(path);
+	pending->path = NULL;
+
 	return ret;
 }
 
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 64c8221b6..72be51f7c 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -137,8 +137,10 @@ struct btrfs_pending_snapshot {
 	struct dentry *dentry;
 	struct inode *dir;
 	struct btrfs_root *root;
+	struct btrfs_root_item *root_item;
 	struct btrfs_root *snap;
 	struct btrfs_qgroup_inherit *inherit;
+	struct btrfs_path *path;
 	/* block reservation for the operation */
 	struct btrfs_block_rsv block_rsv;
 	u64 qgroup_reserved;
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index f31db4325..cb6508912 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -89,6 +89,12 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 	btrfs_release_path(path);
+	/*
+	 * We don't need a lock on a leaf. btrfs_realloc_node() will lock all
+	 * leaves from path->nodes[1], so set lowest_level to 1 to avoid a later
+	 * deadlock (attempting to write lock an already write locked leaf).
+	 */
+	path->lowest_level = 1;
 	wret = btrfs_search_slot(trans, root, &key, path, 0, 1);
 
 	if (wret < 0) {
@@ -99,9 +105,12 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 		ret = 0;
 		goto out;
 	}
-	path->slots[1] = btrfs_header_nritems(path->nodes[1]);
-	next_key_ret = btrfs_find_next_key(root, path, &key, 1,
-					   min_trans);
+	/*
+	 * The node at level 1 must always be locked when our path has
+	 * keep_locks set and lowest_level is 1, regardless of the value of
+	 * path->slots[1].
+	 */
+	BUG_ON(path->locks[1] == 0);
 	ret = btrfs_realloc_node(trans, root,
 				 path->nodes[1], 0,
 				 &last_ret,
@@ -110,6 +119,18 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
 		WARN_ON(ret == -EAGAIN);
 		goto out;
 	}
+	/*
+	 * Now that we reallocated the node we can find the next key. Note that
+	 * btrfs_find_next_key() can release our path and do another search
+	 * without COWing, this is because even with path->keep_locks = 1,
+	 * btrfs_search_slot() / ctree.c:unlock_up() does not keep a lock on a
+	 * node when path->slots[node_level - 1] does not point to the last
+	 * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore
+	 * we search for the next key after reallocating our node.
+	 */
+	path->slots[1] = btrfs_header_nritems(path->nodes[1]);
+	next_key_ret = btrfs_find_next_key(root, path, &key, 1,
+					   min_trans);
 	if (next_key_ret == 0) {
 		memcpy(&root->defrag_progress, &key, sizeof(key));
 		ret = -EAGAIN;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 323e12cc9..978c3a810 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4127,7 +4127,9 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 				     struct inode *inode,
 				     struct btrfs_path *path,
 				     struct list_head *logged_list,
-				     struct btrfs_log_ctx *ctx)
+				     struct btrfs_log_ctx *ctx,
+				     const u64 start,
+				     const u64 end)
 {
 	struct extent_map *em, *n;
 	struct list_head extents;
@@ -4166,7 +4168,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans,
 	}
 
 	list_sort(NULL, &extents, extent_cmp);
-
+	/*
+	 * Collect any new ordered extents within the range. This is to
+	 * prevent logging file extent items without waiting for the disk
+	 * location they point to being written. We do this only to deal
+	 * with races against concurrent lockless direct IO writes.
+	 */
+	btrfs_get_logged_extents(inode, logged_list, start, end);
 process:
 	while (!list_empty(&extents)) {
 		em = list_entry(extents.next, struct extent_map, list);
@@ -4701,7 +4709,7 @@ log_extents:
 			goto out_unlock;
 		}
 		ret = btrfs_log_changed_extents(trans, root, inode, dst_path,
-						&logged_list, ctx);
+						&logged_list, ctx, start, end);
 		if (ret) {
 			err = ret;
 			goto out_unlock;
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 9c62a6f97..366b33594 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -108,7 +108,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
 	},
 };
 
-const u64 const btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
+const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = {
 	[BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10,
 	[BTRFS_RAID_RAID1]  = BTRFS_BLOCK_GROUP_RAID1,
 	[BTRFS_RAID_DUP]    = BTRFS_BLOCK_GROUP_DUP,
@@ -125,6 +125,7 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root);
 static void __btrfs_reset_dev_stats(struct btrfs_device *dev);
 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
 static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
+static void btrfs_close_one_device(struct btrfs_device *device);
 
 DEFINE_MUTEX(uuid_mutex);
 static LIST_HEAD(fs_uuids);
@@ -1103,7 +1104,7 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start,
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-	path->reada = 2;
+	path->reada = READA_FORWARD;
 
 	key.objectid = device->devid;
 	key.offset = start;
@@ -1183,7 +1184,7 @@ again:
 		struct map_lookup *map;
 		int i;
 
-		map = (struct map_lookup *)em->bdev;
+		map = em->map_lookup;
 		for (i = 0; i < map->num_stripes; i++) {
 			u64 end;
 
@@ -1281,7 +1282,7 @@ again:
 		goto out;
 	}
 
-	path->reada = 2;
+	path->reada = READA_FORWARD;
 	path->search_commit_root = 1;
 	path->skip_locking = 1;
 
@@ -1643,7 +1644,6 @@ static void update_dev_time(char *path_name)
 		return;
 	file_update_time(filp);
 	filp_close(filp, NULL);
-	return;
 }
 
 static int btrfs_rm_dev_item(struct btrfs_root *root,
@@ -2756,7 +2756,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans,
 			free_extent_map(em);
 		return -EINVAL;
 	}
-	map = (struct map_lookup *)em->bdev;
+	map = em->map_lookup;
 	lock_chunks(root->fs_info->chunk_root);
 	check_system_chunk(trans, extent_root, map->type);
 	unlock_chunks(root->fs_info->chunk_root);
@@ -3407,7 +3407,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info)
 	list_for_each_entry(device, devices, dev_list) {
 		old_size = btrfs_device_get_total_bytes(device);
 		size_to_free = div_factor(old_size, 1);
-		size_to_free = min(size_to_free, (u64)1 * 1024 * 1024);
+		size_to_free = min_t(u64, size_to_free, SZ_1M);
 		if (!device->writeable ||
 		    btrfs_device_get_total_bytes(device) -
 		    btrfs_device_get_bytes_used(device) > size_to_free ||
@@ -3724,14 +3724,6 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 		goto out;
 	}
 
-	/* allow dup'ed data chunks only in mixed mode */
-	if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
-	    (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) {
-		btrfs_err(fs_info, "dup for data is not allowed");
-		ret = -EINVAL;
-		goto out;
-	}
-
 	/* allow to reduce meta or sys integrity only if force set */
 	allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
 			BTRFS_BLOCK_GROUP_RAID10 |
@@ -3757,6 +3749,13 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 		}
 	} while (read_seqretry(&fs_info->profiles_lock, seq));
 
+	if (btrfs_get_num_tolerated_disk_barrier_failures(bctl->meta.target) <
+		btrfs_get_num_tolerated_disk_barrier_failures(bctl->data.target)) {
+		btrfs_warn(fs_info,
+	"metadata profile 0x%llx has lower redundancy than data profile 0x%llx",
+			bctl->meta.target, bctl->data.target);
+	}
+
 	if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
 		fs_info->num_tolerated_disk_barrier_failures = min(
 			btrfs_calc_num_tolerated_disk_barrier_failures(fs_info),
@@ -4269,7 +4268,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
 	if (!path)
 		return -ENOMEM;
 
-	path->reada = 2;
+	path->reada = READA_FORWARD;
 
 	lock_chunks(root);
 
@@ -4461,7 +4460,7 @@ static int btrfs_cmp_device_info(const void *a, const void *b)
 static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target)
 {
 	/* TODO allow them to set a preferred stripe size */
-	return 64 * 1024;
+	return SZ_64K;
 }
 
 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
@@ -4529,21 +4528,21 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 	ncopies = btrfs_raid_array[index].ncopies;
 
 	if (type & BTRFS_BLOCK_GROUP_DATA) {
-		max_stripe_size = 1024 * 1024 * 1024;
+		max_stripe_size = SZ_1G;
 		max_chunk_size = 10 * max_stripe_size;
 		if (!devs_max)
 			devs_max = BTRFS_MAX_DEVS(info->chunk_root);
 	} else if (type & BTRFS_BLOCK_GROUP_METADATA) {
 		/* for larger filesystems, use larger metadata chunks */
-		if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024)
-			max_stripe_size = 1024 * 1024 * 1024;
+		if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
+			max_stripe_size = SZ_1G;
 		else
-			max_stripe_size = 256 * 1024 * 1024;
+			max_stripe_size = SZ_256M;
 		max_chunk_size = max_stripe_size;
 		if (!devs_max)
 			devs_max = BTRFS_MAX_DEVS(info->chunk_root);
 	} else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
-		max_stripe_size = 32 * 1024 * 1024;
+		max_stripe_size = SZ_32M;
 		max_chunk_size = 2 * max_stripe_size;
 		if (!devs_max)
 			devs_max = BTRFS_MAX_DEVS_SYS_CHUNK;
@@ -4720,7 +4719,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
 		goto error;
 	}
 	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
-	em->bdev = (struct block_device *)map;
+	em->map_lookup = map;
 	em->start = start;
 	em->len = num_bytes;
 	em->block_start = 0;
@@ -4794,7 +4793,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
 	u64 dev_offset;
 	u64 stripe_size;
 	int i = 0;
-	int ret;
+	int ret = 0;
 
 	em_tree = &extent_root->fs_info->mapping_tree.map_tree;
 	read_lock(&em_tree->lock);
@@ -4815,7 +4814,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
 		return -EINVAL;
 	}
 
-	map = (struct map_lookup *)em->bdev;
+	map = em->map_lookup;
 	item_size = btrfs_chunk_item_size(map->num_stripes);
 	stripe_size = em->orig_block_len;
 
@@ -4825,20 +4824,32 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
 		goto out;
 	}
 
+	/*
+	 * Take the device list mutex to prevent races with the final phase of
+	 * a device replace operation that replaces the device object associated
+	 * with the map's stripes, because the device object's id can change
+	 * at any time during that final phase of the device replace operation
+	 * (dev-replace.c:btrfs_dev_replace_finishing()).
+	 */
+	mutex_lock(&chunk_root->fs_info->fs_devices->device_list_mutex);
 	for (i = 0; i < map->num_stripes; i++) {
 		device = map->stripes[i].dev;
 		dev_offset = map->stripes[i].physical;
 
 		ret = btrfs_update_device(trans, device);
 		if (ret)
-			goto out;
+			break;
 		ret = btrfs_alloc_dev_extent(trans, device,
 					     chunk_root->root_key.objectid,
 					     BTRFS_FIRST_CHUNK_TREE_OBJECTID,
 					     chunk_offset, dev_offset,
 					     stripe_size);
 		if (ret)
-			goto out;
+			break;
+	}
+	if (ret) {
+		mutex_unlock(&chunk_root->fs_info->fs_devices->device_list_mutex);
+		goto out;
 	}
 
 	stripe = &chunk->stripe;
@@ -4851,6 +4862,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
 		memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
 		stripe++;
 	}
+	mutex_unlock(&chunk_root->fs_info->fs_devices->device_list_mutex);
 
 	btrfs_set_stack_chunk_length(chunk, chunk_size);
 	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
@@ -4957,7 +4969,7 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset)
 	if (!em)
 		return 1;
 
-	map = (struct map_lookup *)em->bdev;
+	map = em->map_lookup;
 	for (i = 0; i < map->num_stripes; i++) {
 		if (map->stripes[i].dev->missing) {
 			miss_ndevs++;
@@ -5037,7 +5049,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 		return 1;
 	}
 
-	map = (struct map_lookup *)em->bdev;
+	map = em->map_lookup;
 	if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1))
 		ret = map->num_stripes;
 	else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
@@ -5073,7 +5085,7 @@ unsigned long btrfs_full_stripe_len(struct btrfs_root *root,
 	BUG_ON(!em);
 
 	BUG_ON(em->start > logical || em->start + em->len < logical);
-	map = (struct map_lookup *)em->bdev;
+	map = em->map_lookup;
 	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
 		len = map->stripe_len * nr_data_stripes(map);
 	free_extent_map(em);
@@ -5094,7 +5106,7 @@ int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree,
 	BUG_ON(!em);
 
 	BUG_ON(em->start > logical || em->start + em->len < logical);
-	map = (struct map_lookup *)em->bdev;
+	map = em->map_lookup;
 	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
 		ret = 1;
 	free_extent_map(em);
@@ -5253,7 +5265,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		return -EINVAL;
 	}
 
-	map = (struct map_lookup *)em->bdev;
+	map = em->map_lookup;
 	offset = logical - em->start;
 
 	stripe_len = map->stripe_len;
@@ -5367,35 +5379,33 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 		 * target drive.
 		 */
 		for (i = 0; i < tmp_num_stripes; i++) {
-			if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) {
-				/*
-				 * In case of DUP, in order to keep it
-				 * simple, only add the mirror with the
-				 * lowest physical address
-				 */
-				if (found &&
-				    physical_of_found <=
-				     tmp_bbio->stripes[i].physical)
-					continue;
-				index_srcdev = i;
-				found = 1;
-				physical_of_found =
-					tmp_bbio->stripes[i].physical;
-			}
+			if (tmp_bbio->stripes[i].dev->devid != srcdev_devid)
+				continue;
+
+			/*
+			 * In case of DUP, in order to keep it simple, only add
+			 * the mirror with the lowest physical address
+			 */
+			if (found &&
+			    physical_of_found <= tmp_bbio->stripes[i].physical)
+				continue;
+
+			index_srcdev = i;
+			found = 1;
+			physical_of_found = tmp_bbio->stripes[i].physical;
 		}
 
-		if (found) {
-			mirror_num = index_srcdev + 1;
-			patch_the_first_stripe_for_dev_replace = 1;
-			physical_to_patch_in_first_stripe = physical_of_found;
-		} else {
+		btrfs_put_bbio(tmp_bbio);
+
+		if (!found) {
 			WARN_ON(1);
 			ret = -EIO;
-			btrfs_put_bbio(tmp_bbio);
 			goto out;
 		}
 
-		btrfs_put_bbio(tmp_bbio);
+		mirror_num = index_srcdev + 1;
+		patch_the_first_stripe_for_dev_replace = 1;
+		physical_to_patch_in_first_stripe = physical_of_found;
 	} else if (mirror_num > map->num_stripes) {
 		mirror_num = 0;
 	}
@@ -5795,7 +5805,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree,
 		free_extent_map(em);
 		return -EIO;
 	}
-	map = (struct map_lookup *)em->bdev;
+	map = em->map_lookup;
 
 	length = em->len;
 	rmap_len = map->stripe_len;
@@ -6058,7 +6068,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 	bbio->fs_info = root->fs_info;
 	atomic_set(&bbio->stripes_pending, bbio->num_stripes);
 
-	if (bbio->raid_map) {
+	if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
+	    ((rw & WRITE) || (mirror_num > 1))) {
 		/* In this case, map_length has been set to the length of
 		   a single stripe; not the whole write */
 		if (rw & WRITE) {
@@ -6199,6 +6210,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 	struct extent_map *em;
 	u64 logical;
 	u64 length;
+	u64 stripe_len;
 	u64 devid;
 	u8 uuid[BTRFS_UUID_SIZE];
 	int num_stripes;
@@ -6207,6 +6219,37 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 
 	logical = key->offset;
 	length = btrfs_chunk_length(leaf, chunk);
+	stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
+	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+	/* Validation check */
+	if (!num_stripes) {
+		btrfs_err(root->fs_info, "invalid chunk num_stripes: %u",
+			  num_stripes);
+		return -EIO;
+	}
+	if (!IS_ALIGNED(logical, root->sectorsize)) {
+		btrfs_err(root->fs_info,
+			  "invalid chunk logical %llu", logical);
+		return -EIO;
+	}
+	if (!length || !IS_ALIGNED(length, root->sectorsize)) {
+		btrfs_err(root->fs_info,
+			"invalid chunk length %llu", length);
+		return -EIO;
+	}
+	if (!is_power_of_2(stripe_len)) {
+		btrfs_err(root->fs_info, "invalid chunk stripe length: %llu",
+			  stripe_len);
+		return -EIO;
+	}
+	if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) &
+	    btrfs_chunk_type(leaf, chunk)) {
+		btrfs_err(root->fs_info, "unrecognized chunk type: %llu",
+			  ~(BTRFS_BLOCK_GROUP_TYPE_MASK |
+			    BTRFS_BLOCK_GROUP_PROFILE_MASK) &
+			  btrfs_chunk_type(leaf, chunk));
+		return -EIO;
+	}
 
 	read_lock(&map_tree->map_tree.lock);
 	em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
@@ -6223,7 +6266,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 	em = alloc_extent_map();
 	if (!em)
 		return -ENOMEM;
-	num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
 	map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
 	if (!map) {
 		free_extent_map(em);
@@ -6231,7 +6273,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 	}
 
 	set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
-	em->bdev = (struct block_device *)map;
+	em->map_lookup = map;
 	em->start = logical;
 	em->len = length;
 	em->orig_start = 0;
@@ -6466,11 +6508,11 @@ int btrfs_read_sys_array(struct btrfs_root *root)
 	sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET);
 	if (!sb)
 		return -ENOMEM;
-	btrfs_set_buffer_uptodate(sb);
+	set_extent_buffer_uptodate(sb);
 	btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
 	/*
 	 * The sb extent buffer is artifical and just used to read the system array.
-	 * btrfs_set_buffer_uptodate() call does not properly mark all it's
+	 * set_extent_buffer_uptodate() call does not properly mark all it's
 	 * pages up-to-date when the page is larger: extent does not cover the
 	 * whole page and consequently check_page_uptodate does not find all
 	 * the page's extents up-to-date (the hole beyond sb),
@@ -6529,6 +6571,9 @@ int btrfs_read_sys_array(struct btrfs_root *root)
 			if (ret)
 				break;
 		} else {
+			printk(KERN_ERR
+		"BTRFS: unexpected item type %u in sys_array at offset %u\n",
+				(u32)key.type, cur_offset);
 			ret = -EIO;
 			break;
 		}
@@ -6930,7 +6975,7 @@ void btrfs_update_commit_device_bytes_used(struct btrfs_root *root,
 	/* In order to kick the device replace finish process */
 	lock_chunks(root);
 	list_for_each_entry(em, &transaction->pending_chunks, list) {
-		map = (struct map_lookup *)em->bdev;
+		map = em->map_lookup;
 
 		for (i = 0; i < map->num_stripes; i++) {
 			dev = map->stripes[i].dev;
@@ -6958,7 +7003,7 @@ void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info)
 	}
 }
 
-void btrfs_close_one_device(struct btrfs_device *device)
+static void btrfs_close_one_device(struct btrfs_device *device)
 {
 	struct btrfs_fs_devices *fs_devices = device->fs_devices;
 	struct btrfs_device *new_device;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index d5c84f6b1..1939ebde6 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -26,7 +26,7 @@
 
 extern struct mutex uuid_mutex;
 
-#define BTRFS_STRIPE_LEN	(64 * 1024)
+#define BTRFS_STRIPE_LEN	SZ_64K
 
 struct buffer_head;
 struct btrfs_pending_bios {
@@ -566,6 +566,5 @@ static inline void unlock_chunks(struct btrfs_root *root)
 struct list_head *btrfs_get_fs_uuids(void);
 void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info);
 void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info);
-void btrfs_close_one_device(struct btrfs_device *device);
 
 #endif
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index 1fcd7b6e7..6c68d6356 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -126,7 +126,7 @@ static int do_setxattr(struct btrfs_trans_handle *trans,
 	 * locks the inode's i_mutex before calling setxattr or removexattr.
 	 */
 	if (flags & XATTR_REPLACE) {
-		ASSERT(mutex_is_locked(&inode->i_mutex));
+		ASSERT(inode_is_locked(inode));
 		di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode),
 					name, name_len, 0);
 		if (!di)
@@ -283,7 +283,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
-	path->reada = 2;
+	path->reada = READA_FORWARD;
 
 	/* search for our xattrs */
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
@@ -351,137 +351,89 @@ err:
 	return ret;
 }
 
-/*
- * List of handlers for synthetic system.* attributes.  All real ondisk
- * attributes are handled directly.
- */
-const struct xattr_handler *btrfs_xattr_handlers[] = {
-#ifdef CONFIG_BTRFS_FS_POSIX_ACL
-	&posix_acl_access_xattr_handler,
-	&posix_acl_default_xattr_handler,
-#endif
-	NULL,
-};
-
-/*
- * Check if the attribute is in a supported namespace.
- *
- * This is applied after the check for the synthetic attributes in the system
- * namespace.
- */
-static int btrfs_is_valid_xattr(const char *name)
+static int btrfs_xattr_handler_get(const struct xattr_handler *handler,
+				   struct dentry *dentry, const char *name,
+				   void *buffer, size_t size)
 {
-	int len = strlen(name);
-	int prefixlen = 0;
-
-	if (!strncmp(name, XATTR_SECURITY_PREFIX,
-			XATTR_SECURITY_PREFIX_LEN))
-		prefixlen = XATTR_SECURITY_PREFIX_LEN;
-	else if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
-		prefixlen = XATTR_SYSTEM_PREFIX_LEN;
-	else if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN))
-		prefixlen = XATTR_TRUSTED_PREFIX_LEN;
-	else if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN))
-		prefixlen = XATTR_USER_PREFIX_LEN;
-	else if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
-		prefixlen = XATTR_BTRFS_PREFIX_LEN;
-	else
-		return -EOPNOTSUPP;
-
-	/*
-	 * The name cannot consist of just prefix
-	 */
-	if (len <= prefixlen)
-		return -EINVAL;
+	struct inode *inode = d_inode(dentry);
 
-	return 0;
+	name = xattr_full_name(handler, name);
+	return __btrfs_getxattr(inode, name, buffer, size);
 }
 
-ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
-		       void *buffer, size_t size)
+static int btrfs_xattr_handler_set(const struct xattr_handler *handler,
+				   struct dentry *dentry, const char *name,
+				   const void *buffer, size_t size,
+				   int flags)
 {
-	int ret;
+	struct inode *inode = d_inode(dentry);
 
-	/*
-	 * If this is a request for a synthetic attribute in the system.*
-	 * namespace use the generic infrastructure to resolve a handler
-	 * for it via sb->s_xattr.
-	 */
-	if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
-		return generic_getxattr(dentry, name, buffer, size);
+	name = xattr_full_name(handler, name);
+	return __btrfs_setxattr(NULL, inode, name, buffer, size, flags);
+}
 
-	ret = btrfs_is_valid_xattr(name);
-	if (ret)
-		return ret;
-	return __btrfs_getxattr(d_inode(dentry), name, buffer, size);
+static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler,
+					struct dentry *dentry,
+					const char *name, const void *value,
+					size_t size, int flags)
+{
+	name = xattr_full_name(handler, name);
+	return btrfs_set_prop(d_inode(dentry), name, value, size, flags);
 }
 
+static const struct xattr_handler btrfs_security_xattr_handler = {
+	.prefix = XATTR_SECURITY_PREFIX,
+	.get = btrfs_xattr_handler_get,
+	.set = btrfs_xattr_handler_set,
+};
+
+static const struct xattr_handler btrfs_trusted_xattr_handler = {
+	.prefix = XATTR_TRUSTED_PREFIX,
+	.get = btrfs_xattr_handler_get,
+	.set = btrfs_xattr_handler_set,
+};
+
+static const struct xattr_handler btrfs_user_xattr_handler = {
+	.prefix = XATTR_USER_PREFIX,
+	.get = btrfs_xattr_handler_get,
+	.set = btrfs_xattr_handler_set,
+};
+
+static const struct xattr_handler btrfs_btrfs_xattr_handler = {
+	.prefix = XATTR_BTRFS_PREFIX,
+	.get = btrfs_xattr_handler_get,
+	.set = btrfs_xattr_handler_set_prop,
+};
+
+const struct xattr_handler *btrfs_xattr_handlers[] = {
+	&btrfs_security_xattr_handler,
+#ifdef CONFIG_BTRFS_FS_POSIX_ACL
+	&posix_acl_access_xattr_handler,
+	&posix_acl_default_xattr_handler,
+#endif
+	&btrfs_trusted_xattr_handler,
+	&btrfs_user_xattr_handler,
+	&btrfs_btrfs_xattr_handler,
+	NULL,
+};
+
 int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 		   size_t size, int flags)
 {
 	struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root;
-	int ret;
 
-	/*
-	 * The permission on security.* and system.* is not checked
-	 * in permission().
-	 */
 	if (btrfs_root_readonly(root))
 		return -EROFS;
-
-	/*
-	 * If this is a request for a synthetic attribute in the system.*
-	 * namespace use the generic infrastructure to resolve a handler
-	 * for it via sb->s_xattr.
-	 */
-	if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
-		return generic_setxattr(dentry, name, value, size, flags);
-
-	ret = btrfs_is_valid_xattr(name);
-	if (ret)
-		return ret;
-
-	if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
-		return btrfs_set_prop(d_inode(dentry), name,
-				      value, size, flags);
-
-	if (size == 0)
-		value = "";  /* empty EA, do not remove */
-
-	return __btrfs_setxattr(NULL, d_inode(dentry), name, value, size,
-				flags);
+	return generic_setxattr(dentry, name, value, size, flags);
 }
 
 int btrfs_removexattr(struct dentry *dentry, const char *name)
 {
 	struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root;
-	int ret;
 
-	/*
-	 * The permission on security.* and system.* is not checked
-	 * in permission().
-	 */
 	if (btrfs_root_readonly(root))
 		return -EROFS;
-
-	/*
-	 * If this is a request for a synthetic attribute in the system.*
-	 * namespace use the generic infrastructure to resolve a handler
-	 * for it via sb->s_xattr.
-	 */
-	if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN))
-		return generic_removexattr(dentry, name);
-
-	ret = btrfs_is_valid_xattr(name);
-	if (ret)
-		return ret;
-
-	if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN))
-		return btrfs_set_prop(d_inode(dentry), name,
-				      NULL, 0, XATTR_REPLACE);
-
-	return __btrfs_setxattr(NULL, d_inode(dentry), name, NULL, 0,
-				XATTR_REPLACE);
+	return generic_removexattr(dentry, name);
 }
 
 static int btrfs_initxattrs(struct inode *inode,
@@ -494,7 +446,7 @@ static int btrfs_initxattrs(struct inode *inode,
 
 	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
 		name = kmalloc(XATTR_SECURITY_PREFIX_LEN +
-			       strlen(xattr->name) + 1, GFP_NOFS);
+			       strlen(xattr->name) + 1, GFP_KERNEL);
 		if (!name) {
 			err = -ENOMEM;
 			break;
diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h
index 5049608d1..96807b3d2 100644
--- a/fs/btrfs/xattr.h
+++ b/fs/btrfs/xattr.h
@@ -28,8 +28,6 @@ extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name,
 extern int __btrfs_setxattr(struct btrfs_trans_handle *trans,
 			    struct inode *inode, const char *name,
 			    const void *value, size_t size, int flags);
-extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name,
-		void *buffer, size_t size);
 extern int btrfs_setxattr(struct dentry *dentry, const char *name,
 		const void *value, size_t size, int flags);
 extern int btrfs_removexattr(struct dentry *dentry, const char *name);
diff --git a/fs/buffer.c b/fs/buffer.c
index 4f4cd959d..e1632abb4 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -134,13 +134,10 @@ __clear_page_buffers(struct page *page)
 
 static void buffer_io_error(struct buffer_head *bh, char *msg)
 {
-	char b[BDEVNAME_SIZE];
-
 	if (!test_bit(BH_Quiet, &bh->b_state))
 		printk_ratelimited(KERN_ERR
-			"Buffer I/O error on dev %s, logical block %llu%s\n",
-			bdevname(bh->b_bdev, b),
-			(unsigned long long)bh->b_blocknr, msg);
+			"Buffer I/O error on dev %pg, logical block %llu%s\n",
+			bh->b_bdev, (unsigned long long)bh->b_blocknr, msg);
 }
 
 /*
@@ -237,15 +234,13 @@ __find_get_block_slow(struct block_device *bdev, sector_t block)
 	 * elsewhere, don't buffer_error if we had some unmapped buffers
 	 */
 	if (all_mapped) {
-		char b[BDEVNAME_SIZE];
-
 		printk("__find_get_block_slow() failed. "
 			"block=%llu, b_blocknr=%llu\n",
 			(unsigned long long)block,
 			(unsigned long long)bh->b_blocknr);
 		printk("b_state=0x%08lx, b_size=%zu\n",
 			bh->b_state, bh->b_size);
-		printk("device %s blocksize: %d\n", bdevname(bdev, b),
+		printk("device %pg blocksize: %d\n", bdev,
 			1 << bd_inode->i_blkbits);
 	}
 out_unlock:
@@ -531,10 +526,8 @@ repeat:
 
 static void do_thaw_one(struct super_block *sb, void *unused)
 {
-	char b[BDEVNAME_SIZE];
 	while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb))
-		printk(KERN_WARNING "Emergency Thaw on %s\n",
-		       bdevname(sb->s_bdev, b));
+		printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev);
 }
 
 static void do_thaw_all(struct work_struct *work)
@@ -1074,12 +1067,10 @@ grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp)
 	 * pagecache index.  (this comparison is done using sector_t types).
 	 */
 	if (unlikely(index != block >> sizebits)) {
-		char b[BDEVNAME_SIZE];
-
 		printk(KERN_ERR "%s: requested out-of-range block %llu for "
-			"device %s\n",
+			"device %pg\n",
 			__func__, (unsigned long long)block,
-			bdevname(bdev, b));
+			bdev);
 		return -EIO;
 	}
 
diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c
index f601def05..452e98dd7 100644
--- a/fs/cachefiles/daemon.c
+++ b/fs/cachefiles/daemon.c
@@ -226,15 +226,9 @@ static ssize_t cachefiles_daemon_write(struct file *file,
 		return -EOPNOTSUPP;
 
 	/* drag the command string into the kernel so we can parse it */
-	data = kmalloc(datalen + 1, GFP_KERNEL);
-	if (!data)
-		return -ENOMEM;
-
-	ret = -EFAULT;
-	if (copy_from_user(data, _data, datalen) != 0)
-		goto error;
-
-	data[datalen] = '\0';
+	data = memdup_user_nul(_data, datalen);
+	if (IS_ERR(data))
+		return PTR_ERR(data);
 
 	ret = -EINVAL;
 	if (memchr(data, '\0', datalen))
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index afa023dde..675a3332d 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -446,7 +446,7 @@ static int cachefiles_attr_changed(struct fscache_object *_object)
 		return 0;
 
 	cachefiles_begin_secure(cache, &saved_cred);
-	mutex_lock(&d_inode(object->backer)->i_mutex);
+	inode_lock(d_inode(object->backer));
 
 	/* if there's an extension to a partial page at the end of the backing
 	 * file, we need to discard the partial page so that we pick up new
@@ -465,7 +465,7 @@ static int cachefiles_attr_changed(struct fscache_object *_object)
 	ret = notify_change(object->backer, &newattrs, NULL);
 
 truncate_failed:
-	mutex_unlock(&d_inode(object->backer)->i_mutex);
+	inode_unlock(d_inode(object->backer));
 	cachefiles_end_secure(cache, saved_cred);
 
 	if (ret == -EIO) {
diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index c4b893453..1c2334c16 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -295,7 +295,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
 				cachefiles_mark_object_buried(cache, rep, why);
 		}
 
-		mutex_unlock(&d_inode(dir)->i_mutex);
+		inode_unlock(d_inode(dir));
 
 		if (ret == -EIO)
 			cachefiles_io_error(cache, "Unlink failed");
@@ -306,7 +306,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache,
 
 	/* directories have to be moved to the graveyard */
 	_debug("move stale object to graveyard");
-	mutex_unlock(&d_inode(dir)->i_mutex);
+	inode_unlock(d_inode(dir));
 
 try_again:
 	/* first step is to make up a grave dentry in the graveyard */
@@ -423,13 +423,13 @@ int cachefiles_delete_object(struct cachefiles_cache *cache,
 
 	dir = dget_parent(object->dentry);
 
-	mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT);
+	inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
 
 	if (test_bit(FSCACHE_OBJECT_KILLED_BY_CACHE, &object->fscache.flags)) {
 		/* object allocation for the same key preemptively deleted this
 		 * object's file so that it could create its own file */
 		_debug("object preemptively buried");
-		mutex_unlock(&d_inode(dir)->i_mutex);
+		inode_unlock(d_inode(dir));
 		ret = 0;
 	} else {
 		/* we need to check that our parent is _still_ our parent - it
@@ -442,7 +442,7 @@ int cachefiles_delete_object(struct cachefiles_cache *cache,
 			/* it got moved, presumably by cachefilesd culling it,
 			 * so it's no longer in the key path and we can ignore
 			 * it */
-			mutex_unlock(&d_inode(dir)->i_mutex);
+			inode_unlock(d_inode(dir));
 			ret = 0;
 		}
 	}
@@ -501,7 +501,7 @@ lookup_again:
 	/* search the current directory for the element name */
 	_debug("lookup '%s'", name);
 
-	mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT);
+	inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
 
 	start = jiffies;
 	next = lookup_one_len(name, dir, nlen);
@@ -585,7 +585,7 @@ lookup_again:
 	/* process the next component */
 	if (key) {
 		_debug("advance");
-		mutex_unlock(&d_inode(dir)->i_mutex);
+		inode_unlock(d_inode(dir));
 		dput(dir);
 		dir = next;
 		next = NULL;
@@ -623,7 +623,7 @@ lookup_again:
 	/* note that we're now using this object */
 	ret = cachefiles_mark_object_active(cache, object);
 
-	mutex_unlock(&d_inode(dir)->i_mutex);
+	inode_unlock(d_inode(dir));
 	dput(dir);
 	dir = NULL;
 
@@ -705,7 +705,7 @@ lookup_error:
 		cachefiles_io_error(cache, "Lookup failed");
 	next = NULL;
 error:
-	mutex_unlock(&d_inode(dir)->i_mutex);
+	inode_unlock(d_inode(dir));
 	dput(next);
 error_out2:
 	dput(dir);
@@ -729,7 +729,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
 	_enter(",,%s", dirname);
 
 	/* search the current directory for the element name */
-	mutex_lock(&d_inode(dir)->i_mutex);
+	inode_lock(d_inode(dir));
 
 	start = jiffies;
 	subdir = lookup_one_len(dirname, dir, strlen(dirname));
@@ -768,7 +768,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
 		       d_backing_inode(subdir)->i_ino);
 	}
 
-	mutex_unlock(&d_inode(dir)->i_mutex);
+	inode_unlock(d_inode(dir));
 
 	/* we need to make sure the subdir is a directory */
 	ASSERT(d_backing_inode(subdir));
@@ -800,19 +800,19 @@ check_error:
 	return ERR_PTR(ret);
 
 mkdir_error:
-	mutex_unlock(&d_inode(dir)->i_mutex);
+	inode_unlock(d_inode(dir));
 	dput(subdir);
 	pr_err("mkdir %s failed with error %d\n", dirname, ret);
 	return ERR_PTR(ret);
 
 lookup_error:
-	mutex_unlock(&d_inode(dir)->i_mutex);
+	inode_unlock(d_inode(dir));
 	ret = PTR_ERR(subdir);
 	pr_err("Lookup %s failed with error %d\n", dirname, ret);
 	return ERR_PTR(ret);
 
 nomem_d_alloc:
-	mutex_unlock(&d_inode(dir)->i_mutex);
+	inode_unlock(d_inode(dir));
 	_leave(" = -ENOMEM");
 	return ERR_PTR(-ENOMEM);
 }
@@ -837,7 +837,7 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache,
 	//       dir, filename);
 
 	/* look up the victim */
-	mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT);
+	inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
 
 	start = jiffies;
 	victim = lookup_one_len(filename, dir, strlen(filename));
@@ -852,7 +852,7 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache,
 	 * at the netfs's request whilst the cull was in progress
 	 */
 	if (d_is_negative(victim)) {
-		mutex_unlock(&d_inode(dir)->i_mutex);
+		inode_unlock(d_inode(dir));
 		dput(victim);
 		_leave(" = -ENOENT [absent]");
 		return ERR_PTR(-ENOENT);
@@ -881,13 +881,13 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache,
 
 object_in_use:
 	read_unlock(&cache->active_lock);
-	mutex_unlock(&d_inode(dir)->i_mutex);
+	inode_unlock(d_inode(dir));
 	dput(victim);
 	//_leave(" = -EBUSY [in use]");
 	return ERR_PTR(-EBUSY);
 
 lookup_error:
-	mutex_unlock(&d_inode(dir)->i_mutex);
+	inode_unlock(d_inode(dir));
 	ret = PTR_ERR(victim);
 	if (ret == -ENOENT) {
 		/* file or dir now absent - probably retired by netfs */
@@ -947,7 +947,7 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir,
 	return 0;
 
 error_unlock:
-	mutex_unlock(&d_inode(dir)->i_mutex);
+	inode_unlock(d_inode(dir));
 error:
 	dput(victim);
 	if (ret == -ENOENT) {
@@ -982,7 +982,7 @@ int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir,
 	if (IS_ERR(victim))
 		return PTR_ERR(victim);
 
-	mutex_unlock(&d_inode(dir)->i_mutex);
+	inode_unlock(d_inode(dir));
 	dput(victim);
 	//_leave(" = 0");
 	return 0;
diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c
index 8f84646f1..f19708487 100644
--- a/fs/ceph/acl.c
+++ b/fs/ceph/acl.c
@@ -49,10 +49,10 @@ struct posix_acl *ceph_get_acl(struct inode *inode, int type)
 
 	switch (type) {
 	case ACL_TYPE_ACCESS:
-		name = POSIX_ACL_XATTR_ACCESS;
+		name = XATTR_NAME_POSIX_ACL_ACCESS;
 		break;
 	case ACL_TYPE_DEFAULT:
-		name = POSIX_ACL_XATTR_DEFAULT;
+		name = XATTR_NAME_POSIX_ACL_DEFAULT;
 		break;
 	default:
 		BUG();
@@ -92,7 +92,7 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 
 	switch (type) {
 	case ACL_TYPE_ACCESS:
-		name = POSIX_ACL_XATTR_ACCESS;
+		name = XATTR_NAME_POSIX_ACL_ACCESS;
 		if (acl) {
 			ret = posix_acl_equiv_mode(acl, &new_mode);
 			if (ret < 0)
@@ -106,7 +106,7 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 			ret = acl ? -EINVAL : 0;
 			goto out;
 		}
-		name = POSIX_ACL_XATTR_DEFAULT;
+		name = XATTR_NAME_POSIX_ACL_DEFAULT;
 		break;
 	default:
 		ret = -EINVAL;
@@ -202,11 +202,11 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
 	ceph_pagelist_encode_32(pagelist, acl && default_acl ? 2 : 1);
 
 	if (acl) {
-		size_t len = strlen(POSIX_ACL_XATTR_ACCESS);
+		size_t len = strlen(XATTR_NAME_POSIX_ACL_ACCESS);
 		err = ceph_pagelist_reserve(pagelist, len + val_size1 + 8);
 		if (err)
 			goto out_err;
-		ceph_pagelist_encode_string(pagelist, POSIX_ACL_XATTR_ACCESS,
+		ceph_pagelist_encode_string(pagelist, XATTR_NAME_POSIX_ACL_ACCESS,
 					    len);
 		err = posix_acl_to_xattr(&init_user_ns, acl,
 					 tmp_buf, val_size1);
@@ -216,12 +216,12 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
 		ceph_pagelist_append(pagelist, tmp_buf, val_size1);
 	}
 	if (default_acl) {
-		size_t len = strlen(POSIX_ACL_XATTR_DEFAULT);
+		size_t len = strlen(XATTR_NAME_POSIX_ACL_DEFAULT);
 		err = ceph_pagelist_reserve(pagelist, len + val_size2 + 8);
 		if (err)
 			goto out_err;
 		err = ceph_pagelist_encode_string(pagelist,
-						  POSIX_ACL_XATTR_DEFAULT, len);
+						  XATTR_NAME_POSIX_ACL_DEFAULT, len);
 		err = posix_acl_to_xattr(&init_user_ns, default_acl,
 					 tmp_buf, val_size2);
 		if (err < 0)
diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
index b7d218a16..19adeb0ef 100644
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1108,7 +1108,7 @@ retry_locked:
 		return 0;
 
 	/* past end of file? */
-	i_size = inode->i_size;   /* caller holds i_mutex */
+	i_size = i_size_read(inode);
 
 	if (page_off >= i_size ||
 	    (pos_in_page == 0 && (pos+len) >= i_size &&
@@ -1149,7 +1149,6 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
 		page = grab_cache_page_write_begin(mapping, index, 0);
 		if (!page)
 			return -ENOMEM;
-		*pagep = page;
 
 		dout("write_begin file %p inode %p page %p %d~%d\n", file,
 		     inode, page, (int)pos, (int)len);
@@ -1184,8 +1183,7 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
 		zero_user_segment(page, from+copied, len);
 
 	/* did file size increase? */
-	/* (no need for i_size_read(); we caller holds i_mutex */
-	if (pos+copied > inode->i_size)
+	if (pos+copied > i_size_read(inode))
 		check_cap = ceph_inode_set_size(inode, pos+copied);
 
 	if (!PageUptodate(page))
@@ -1378,11 +1376,13 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 
 	ret = VM_FAULT_NOPAGE;
 	if ((off > size) ||
-	    (page->mapping != inode->i_mapping))
+	    (page->mapping != inode->i_mapping)) {
+		unlock_page(page);
 		goto out;
+	}
 
 	ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
-	if (ret == 0) {
+	if (ret >= 0) {
 		/* success.  we'll keep the page locked. */
 		set_page_dirty(page);
 		ret = VM_FAULT_LOCKED;
@@ -1393,8 +1393,6 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 			ret = VM_FAULT_SIGBUS;
 	}
 out:
-	if (ret != VM_FAULT_LOCKED)
-		unlock_page(page);
 	if (ret == VM_FAULT_LOCKED ||
 	    ci->i_inline_version != CEPH_INLINE_NONE) {
 		int dirty;
@@ -1758,6 +1756,10 @@ int ceph_pool_perm_check(struct ceph_inode_info *ci, int need)
 	u32 pool;
 	int ret, flags;
 
+	/* does not support pool namespace yet */
+	if (ci->i_pool_ns_len)
+		return -EIO;
+
 	if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode),
 				NOPOOLPERM))
 		return 0;
diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c
index a4766ded1..a351480db 100644
--- a/fs/ceph/cache.c
+++ b/fs/ceph/cache.c
@@ -106,7 +106,7 @@ static uint16_t ceph_fscache_inode_get_aux(const void *cookie_netfs_data,
 
 	memset(&aux, 0, sizeof(aux));
 	aux.mtime = inode->i_mtime;
-	aux.size = inode->i_size;
+	aux.size = i_size_read(inode);
 
 	memcpy(buffer, &aux, sizeof(aux));
 
@@ -117,9 +117,7 @@ static void ceph_fscache_inode_get_attr(const void *cookie_netfs_data,
 					uint64_t *size)
 {
 	const struct ceph_inode_info* ci = cookie_netfs_data;
-	const struct inode* inode = &ci->vfs_inode;
-
-	*size = inode->i_size;
+	*size = i_size_read(&ci->vfs_inode);
 }
 
 static enum fscache_checkaux ceph_fscache_inode_check_aux(
@@ -134,7 +132,7 @@ static enum fscache_checkaux ceph_fscache_inode_check_aux(
 
 	memset(&aux, 0, sizeof(aux));
 	aux.mtime = inode->i_mtime;
-	aux.size = inode->i_size;
+	aux.size = i_size_read(inode);
 
 	if (memcmp(data, &aux, sizeof(aux)) != 0)
 		return FSCACHE_CHECKAUX_OBSOLETE;
@@ -197,7 +195,7 @@ void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc,
 		return;
 
 	/* Avoid multiple racing open requests */
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	if (ci->fscache)
 		goto done;
@@ -207,7 +205,7 @@ void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc,
 					     ci, true);
 	fscache_check_consistency(ci->fscache);
 done:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 
 }
 
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index c69e1253b..6fe0ad26a 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2030,7 +2030,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 	if (datasync)
 		goto out;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	dirty = try_flush_caps(inode, &flush_tid);
 	dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
@@ -2046,7 +2046,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 		ret = wait_event_interruptible(ci->i_cap_wq,
 					caps_are_flushed(inode, flush_tid));
 	}
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 out:
 	dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret);
 	return ret;
@@ -2753,7 +2753,8 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
 			     void *inline_data, int inline_len,
 			     struct ceph_buffer *xattr_buf,
 			     struct ceph_mds_session *session,
-			     struct ceph_cap *cap, int issued)
+			     struct ceph_cap *cap, int issued,
+			     u32 pool_ns_len)
 	__releases(ci->i_ceph_lock)
 	__releases(mdsc->snap_rwsem)
 {
@@ -2873,6 +2874,8 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
 	if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) {
 		/* file layout may have changed */
 		ci->i_layout = grant->layout;
+		ci->i_pool_ns_len = pool_ns_len;
+
 		/* size/truncate_seq? */
 		queue_trunc = ceph_fill_file_size(inode, issued,
 					le32_to_cpu(grant->truncate_seq),
@@ -3411,6 +3414,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 	u32  inline_len = 0;
 	void *snaptrace;
 	size_t snaptrace_len;
+	u32 pool_ns_len = 0;
 	void *p, *end;
 
 	dout("handle_caps from mds%d\n", mds);
@@ -3463,6 +3467,21 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 		p += inline_len;
 	}
 
+	if (le16_to_cpu(msg->hdr.version) >= 8) {
+		u64 flush_tid;
+		u32 caller_uid, caller_gid;
+		u32 osd_epoch_barrier;
+		/* version >= 5 */
+		ceph_decode_32_safe(&p, end, osd_epoch_barrier, bad);
+		/* version >= 6 */
+		ceph_decode_64_safe(&p, end, flush_tid, bad);
+		/* version >= 7 */
+		ceph_decode_32_safe(&p, end, caller_uid, bad);
+		ceph_decode_32_safe(&p, end, caller_gid, bad);
+		/* version >= 8 */
+		ceph_decode_32_safe(&p, end, pool_ns_len, bad);
+	}
+
 	/* lookup ino */
 	inode = ceph_find_inode(sb, vino);
 	ci = ceph_inode(inode);
@@ -3518,7 +3537,8 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 				  &cap, &issued);
 		handle_cap_grant(mdsc, inode, h,
 				 inline_version, inline_data, inline_len,
-				 msg->middle, session, cap, issued);
+				 msg->middle, session, cap, issued,
+				 pool_ns_len);
 		if (realm)
 			ceph_put_snap_realm(mdsc, realm);
 		goto done_unlocked;
@@ -3542,7 +3562,8 @@ void ceph_handle_caps(struct ceph_mds_session *session,
 		issued |= __ceph_caps_dirty(ci);
 		handle_cap_grant(mdsc, inode, h,
 				 inline_version, inline_data, inline_len,
-				 msg->middle, session, cap, issued);
+				 msg->middle, session, cap, issued,
+				 pool_ns_len);
 		goto done_unlocked;
 
 	case CEPH_CAP_OP_FLUSH_ACK:
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 9314b4ea2..fd11fb231 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -507,7 +507,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
 	loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset);
 	loff_t retval;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 	retval = -EINVAL;
 	switch (whence) {
 	case SEEK_CUR:
@@ -542,7 +542,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
 		}
 	}
 out:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	return retval;
 }
 
diff --git a/fs/ceph/export.c b/fs/ceph/export.c
index fe02ae7f0..3b3172357 100644
--- a/fs/ceph/export.c
+++ b/fs/ceph/export.c
@@ -215,7 +215,7 @@ static int ceph_get_name(struct dentry *parent, char *name,
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
-	mutex_lock(&d_inode(parent)->i_mutex);
+	inode_lock(d_inode(parent));
 
 	req->r_inode = d_inode(child);
 	ihold(d_inode(child));
@@ -224,7 +224,7 @@ static int ceph_get_name(struct dentry *parent, char *name,
 	req->r_num_caps = 2;
 	err = ceph_mdsc_do_request(mdsc, NULL, req);
 
-	mutex_unlock(&d_inode(parent)->i_mutex);
+	inode_unlock(d_inode(parent));
 
 	if (!err) {
 		struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 3c68e6aee..eb9028e8c 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -397,8 +397,9 @@ int ceph_release(struct inode *inode, struct file *file)
 }
 
 enum {
-	CHECK_EOF = 1,
-	READ_INLINE = 2,
+	HAVE_RETRIED = 1,	/* the read was already retried once */
+	CHECK_EOF =    2,	/* read may have bounced off EOF */
+	READ_INLINE =  3,	/* inode has inline data */
 };
 
 /*
@@ -411,17 +412,15 @@ enum {
 static int striped_read(struct inode *inode,
 			u64 off, u64 len,
 			struct page **pages, int num_pages,
-			int *checkeof, bool o_direct,
-			unsigned long buf_align)
+			int *checkeof)
 {
 	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
 	struct ceph_inode_info *ci = ceph_inode(inode);
 	u64 pos, this_len, left;
-	int io_align, page_align;
-	int pages_left;
-	int read;
+	loff_t i_size;
+	int page_align, pages_left;
+	int read, ret;
 	struct page **page_pos;
-	int ret;
 	bool hit_stripe, was_short;
 
 	/*
@@ -432,13 +431,9 @@ static int striped_read(struct inode *inode,
 	page_pos = pages;
 	pages_left = num_pages;
 	read = 0;
-	io_align = off & ~PAGE_MASK;
 
 more:
-	if (o_direct)
-		page_align = (pos - io_align + buf_align) & ~PAGE_MASK;
-	else
-		page_align = pos & ~PAGE_MASK;
+	page_align = pos & ~PAGE_MASK;
 	this_len = left;
 	ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode),
 				  &ci->i_layout, pos, &this_len,
@@ -452,13 +447,12 @@ more:
 	dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, left, read,
 	     ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : "");
 
+	i_size = i_size_read(inode);
 	if (ret >= 0) {
 		int didpages;
-		if (was_short && (pos + ret < inode->i_size)) {
-			int zlen = min(this_len - ret,
-				       inode->i_size - pos - ret);
-			int zoff = (o_direct ? buf_align : io_align) +
-				    read + ret;
+		if (was_short && (pos + ret < i_size)) {
+			int zlen = min(this_len - ret, i_size - pos - ret);
+			int zoff = (off & ~PAGE_MASK) + read + ret;
 			dout(" zero gap %llu to %llu\n",
 				pos + ret, pos + ret + zlen);
 			ceph_zero_page_vector_range(zoff, zlen, pages);
@@ -473,14 +467,14 @@ more:
 		pages_left -= didpages;
 
 		/* hit stripe and need continue*/
-		if (left && hit_stripe && pos < inode->i_size)
+		if (left && hit_stripe && pos < i_size)
 			goto more;
 	}
 
 	if (read > 0) {
 		ret = read;
 		/* did we bounce off eof? */
-		if (pos + left > inode->i_size)
+		if (pos + left > i_size)
 			*checkeof = CHECK_EOF;
 	}
 
@@ -521,54 +515,28 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
 	if (ret < 0)
 		return ret;
 
-	if (iocb->ki_flags & IOCB_DIRECT) {
-		while (iov_iter_count(i)) {
-			size_t start;
-			ssize_t n;
-
-			n = dio_get_pagev_size(i);
-			pages = dio_get_pages_alloc(i, n, &start, &num_pages);
-			if (IS_ERR(pages))
-				return PTR_ERR(pages);
-
-			ret = striped_read(inode, off, n,
-					   pages, num_pages, checkeof,
-					   1, start);
-
-			ceph_put_page_vector(pages, num_pages, true);
-
-			if (ret <= 0)
-				break;
-			off += ret;
-			iov_iter_advance(i, ret);
-			if (ret < n)
+	num_pages = calc_pages_for(off, len);
+	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
+	if (IS_ERR(pages))
+		return PTR_ERR(pages);
+	ret = striped_read(inode, off, len, pages,
+				num_pages, checkeof);
+	if (ret > 0) {
+		int l, k = 0;
+		size_t left = ret;
+
+		while (left) {
+			size_t page_off = off & ~PAGE_MASK;
+			size_t copy = min_t(size_t, left,
+					    PAGE_SIZE - page_off);
+			l = copy_page_to_iter(pages[k++], page_off, copy, i);
+			off += l;
+			left -= l;
+			if (l < copy)
 				break;
 		}
-	} else {
-		num_pages = calc_pages_for(off, len);
-		pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
-		if (IS_ERR(pages))
-			return PTR_ERR(pages);
-		ret = striped_read(inode, off, len, pages,
-					num_pages, checkeof, 0, 0);
-		if (ret > 0) {
-			int l, k = 0;
-			size_t left = ret;
-
-			while (left) {
-				size_t page_off = off & ~PAGE_MASK;
-				size_t copy = min_t(size_t,
-						    PAGE_SIZE - page_off, left);
-				l = copy_page_to_iter(pages[k++], page_off,
-						      copy, i);
-				off += l;
-				left -= l;
-				if (l < copy)
-					break;
-			}
-		}
-		ceph_release_page_vector(pages, num_pages);
 	}
+	ceph_release_page_vector(pages, num_pages);
 
 	if (off > iocb->ki_pos) {
 		ret = off - iocb->ki_pos;
@@ -579,6 +547,193 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
 	return ret;
 }
 
+struct ceph_aio_request {	/* per-kiocb state shared by its OSD requests */
+	struct kiocb *iocb;	/* completed via ki_complete() once all requests finish */
+	size_t total_len;	/* byte count reported to the kiocb when no error occurred */
+	int write;	/* nonzero for a write, zero for a read */
+	int error;	/* first negative OSD result, 0 if none */
+	struct list_head osd_reqs;	/* built but unstarted requests (r_unsafe_item) */
+	unsigned num_reqs;	/* number of OSD requests created for this kiocb */
+	atomic_t pending_reqs;	/* requests not yet completed */
+	struct timespec mtime;	/* passed to ceph_osdc_build_request() on retry */
+	struct ceph_cap_flush *prealloc_cf;	/* freed in ceph_aio_complete() */
+};
+
+struct ceph_aio_work {	/* work item run by ceph_aio_retry_work() */
+	struct work_struct work;	/* queued on the client's wb_wq */
+	struct ceph_osd_request *req;	/* request that returned -EOLDSNAPC */
+};
+
+static void ceph_aio_retry_work(struct work_struct *work);
+
+static void ceph_aio_complete(struct inode *inode,
+			      struct ceph_aio_request *aio_req)
+{	/* Drop one pending request; the last caller completes the kiocb. */
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	int ret;
+
+	if (!atomic_dec_and_test(&aio_req->pending_reqs))
+		return;	/* other OSD requests are still outstanding */
+
+	ret = aio_req->error;	/* a recorded error wins ... */
+	if (!ret)
+		ret = aio_req->total_len;	/* ... otherwise report the byte count */
+
+	dout("ceph_aio_complete %p rc %d\n", inode, ret);
+
+	if (ret >= 0 && aio_req->write) {
+		int dirty;
+
+		loff_t endoff = aio_req->iocb->ki_pos + aio_req->total_len;
+		if (endoff > i_size_read(inode)) {	/* write ended past current i_size */
+			if (ceph_inode_set_size(inode, endoff))
+				ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+		}
+
+		spin_lock(&ci->i_ceph_lock);
+		ci->i_inline_version = CEPH_INLINE_NONE;
+		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
+					       &aio_req->prealloc_cf);
+		spin_unlock(&ci->i_ceph_lock);
+		if (dirty)
+			__mark_inode_dirty(inode, dirty);
+
+	}
+
+	ceph_put_cap_refs(ci, (aio_req->write ? CEPH_CAP_FILE_WR :
+						CEPH_CAP_FILE_RD));
+
+	aio_req->iocb->ki_complete(aio_req->iocb, ret, 0);	/* deliver the result */
+
+	ceph_free_cap_flush(aio_req->prealloc_cf);
+	kfree(aio_req);	/* aio_req must not be used after this point */
+}
+
+static void ceph_aio_complete_req(struct ceph_osd_request *req,
+				  struct ceph_msg *msg)
+{	/* r_callback for one OSD request belonging to a ceph_aio_request */
+	int rc = req->r_result;
+	struct inode *inode = req->r_inode;
+	struct ceph_aio_request *aio_req = req->r_priv;
+	struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
+	int num_pages = calc_pages_for((u64)osd_data->alignment,
+				       osd_data->length);
+
+	dout("ceph_aio_complete_req %p rc %d bytes %llu\n",
+	     inode, rc, osd_data->length);
+
+	if (rc == -EOLDSNAPC) {	/* hand the request to ceph_aio_retry_work() */
+		struct ceph_aio_work *aio_work;
+		BUG_ON(!aio_req->write);
+
+		aio_work = kmalloc(sizeof(*aio_work), GFP_NOFS);
+		if (aio_work) {
+			INIT_WORK(&aio_work->work, ceph_aio_retry_work);
+			aio_work->req = req;
+			queue_work(ceph_inode_to_client(inode)->wb_wq,
+				   &aio_work->work);
+			return;	/* req is now referenced by the work item; not put here */
+		}
+		rc = -ENOMEM;	/* no work item: fail this request instead */
+	} else if (!aio_req->write) {
+		if (rc == -ENOENT)
+			rc = 0;	/* handled as a read that returned no bytes */
+		if (rc >= 0 && osd_data->length > rc) {	/* short read */
+			int zoff = osd_data->alignment + rc;
+			int zlen = osd_data->length - rc;
+			/*
+			 * If read is satisfied by single OSD request,
+			 * it can pass EOF. Otherwise read is within
+			 * i_size.
+			 */
+			if (aio_req->num_reqs == 1) {
+				loff_t i_size = i_size_read(inode);
+				loff_t endoff = aio_req->iocb->ki_pos + rc;
+				if (endoff < i_size)
+					zlen = min_t(size_t, zlen,
+						     i_size - endoff);
+				aio_req->total_len = rc + zlen;
+			}
+
+			if (zlen > 0)
+				ceph_zero_page_vector_range(zoff, zlen,
+							    osd_data->pages);
+		}
+	}
+
+	ceph_put_page_vector(osd_data->pages, num_pages, false);
+	ceph_osdc_put_request(req);
+
+	if (rc < 0)
+		cmpxchg(&aio_req->error, 0, rc);	/* only the first error is kept */
+
+	ceph_aio_complete(inode, aio_req);
+	return;
+}
+
+static void ceph_aio_retry_work(struct work_struct *work)
+{	/* Work handler: rebuild and resubmit a write that got -EOLDSNAPC. */
+	struct ceph_aio_work *aio_work =
+		container_of(work, struct ceph_aio_work, work);
+	struct ceph_osd_request *orig_req = aio_work->req;
+	struct ceph_aio_request *aio_req = orig_req->r_priv;
+	struct inode *inode = orig_req->r_inode;
+	struct ceph_inode_info *ci = ceph_inode(inode);
+	struct ceph_snap_context *snapc;
+	struct ceph_osd_request *req;
+	int ret;
+
+	spin_lock(&ci->i_ceph_lock);	/* pick the snap context under i_ceph_lock */
+	if (__ceph_have_pending_cap_snap(ci)) {
+		struct ceph_cap_snap *capsnap =
+			list_last_entry(&ci->i_cap_snaps,
+					struct ceph_cap_snap,
+					ci_item);
+		snapc = ceph_get_snap_context(capsnap->context);
+	} else {
+		BUG_ON(!ci->i_head_snapc);
+		snapc = ceph_get_snap_context(ci->i_head_snapc);
+	}
+	spin_unlock(&ci->i_ceph_lock);
+
+	req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 2,
+			false, GFP_NOFS);
+	if (!req) {
+		ret = -ENOMEM;
+		req = orig_req;	/* complete the original request with the error */
+		goto out;
+	}
+
+	req->r_flags =	CEPH_OSD_FLAG_ORDERSNAP |
+			CEPH_OSD_FLAG_ONDISK |
+			CEPH_OSD_FLAG_WRITE;
+	req->r_base_oloc = orig_req->r_base_oloc;
+	req->r_base_oid = orig_req->r_base_oid;
+
+	req->r_ops[0] = orig_req->r_ops[0];	/* op 0 is copied from the original */
+	osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
+
+	ceph_osdc_build_request(req, req->r_ops[0].extent.offset,
+				snapc, CEPH_NOSNAP, &aio_req->mtime);
+
+	ceph_osdc_put_request(orig_req);	/* the new request takes its place */
+
+	req->r_callback = ceph_aio_complete_req;
+	req->r_inode = inode;
+	req->r_priv = aio_req;
+
+	ret = ceph_osdc_start_request(req->r_osdc, req, false);
+out:
+	if (ret < 0) {	/* report the failure through the normal callback */
+		BUG_ON(ret == -EOLDSNAPC);
+		req->r_result = ret;
+		ceph_aio_complete_req(req, NULL);
+	}
+
+	ceph_put_snap_context(snapc);
+	kfree(aio_work);
+}
+
 /*
  * Write commit request unsafe callback, called to tell us when a
  * request is unsafe (that is, in flight--has been handed to the
@@ -612,16 +767,10 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe)
 }
 
 
-/*
- * Synchronous write, straight from __user pointer or user pages.
- *
- * If write spans object boundary, just do multiple writes.  (For a
- * correct atomic write, we should e.g. take write locks on all
- * objects, rollback on failure, etc.)
- */
 static ssize_t
-ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
-		       struct ceph_snap_context *snapc)
+ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
+		       struct ceph_snap_context *snapc,
+		       struct ceph_cap_flush **pcf)
 {
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file_inode(file);
@@ -630,44 +779,52 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
 	struct ceph_vino vino;
 	struct ceph_osd_request *req;
 	struct page **pages;
-	int num_pages;
-	int written = 0;
+	struct ceph_aio_request *aio_req = NULL;
+	int num_pages = 0;
 	int flags;
-	int check_caps = 0;
 	int ret;
 	struct timespec mtime = CURRENT_TIME;
-	size_t count = iov_iter_count(from);
+	size_t count = iov_iter_count(iter);
+	loff_t pos = iocb->ki_pos;
+	bool write = iov_iter_rw(iter) == WRITE;
 
-	if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
+	if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP)
 		return -EROFS;
 
-	dout("sync_direct_write on file %p %lld~%u\n", file, pos,
-	     (unsigned)count);
+	dout("sync_direct_read_write (%s) on file %p %lld~%u\n",
+	     (write ? "write" : "read"), file, pos, (unsigned)count);
 
 	ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count);
 	if (ret < 0)
 		return ret;
 
-	ret = invalidate_inode_pages2_range(inode->i_mapping,
-					    pos >> PAGE_CACHE_SHIFT,
-					    (pos + count) >> PAGE_CACHE_SHIFT);
-	if (ret < 0)
-		dout("invalidate_inode_pages2_range returned %d\n", ret);
+	if (write) {
+		ret = invalidate_inode_pages2_range(inode->i_mapping,
+					pos >> PAGE_CACHE_SHIFT,
+					(pos + count) >> PAGE_CACHE_SHIFT);
+		if (ret < 0)
+			dout("invalidate_inode_pages2_range returned %d\n", ret);
 
-	flags = CEPH_OSD_FLAG_ORDERSNAP |
-		CEPH_OSD_FLAG_ONDISK |
-		CEPH_OSD_FLAG_WRITE;
+		flags = CEPH_OSD_FLAG_ORDERSNAP |
+			CEPH_OSD_FLAG_ONDISK |
+			CEPH_OSD_FLAG_WRITE;
+	} else {
+		flags = CEPH_OSD_FLAG_READ;
+	}
 
-	while (iov_iter_count(from) > 0) {
-		u64 len = dio_get_pagev_size(from);
-		size_t start;
-		ssize_t n;
+	while (iov_iter_count(iter) > 0) {
+		u64 size = dio_get_pagev_size(iter);
+		size_t start = 0;
+		ssize_t len;
 
 		vino = ceph_vino(inode);
 		req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
-					    vino, pos, &len, 0,
-					    2,/*include a 'startsync' command*/
-					    CEPH_OSD_OP_WRITE, flags, snapc,
+					    vino, pos, &size, 0,
+					    /*include a 'startsync' command*/
+					    write ? 2 : 1,
+					    write ? CEPH_OSD_OP_WRITE :
+						    CEPH_OSD_OP_READ,
+					    flags, snapc,
 					    ci->i_truncate_seq,
 					    ci->i_truncate_size,
 					    false);
@@ -676,10 +833,8 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
 			break;
 		}
 
-		osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
-
-		n = len;
-		pages = dio_get_pages_alloc(from, len, &start, &num_pages);
+		len = size;
+		pages = dio_get_pages_alloc(iter, len, &start, &num_pages);
 		if (IS_ERR(pages)) {
 			ceph_osdc_put_request(req);
 			ret = PTR_ERR(pages);
@@ -687,47 +842,128 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
 		}
 
 		/*
-		 * throw out any page cache pages in this range. this
-		 * may block.
+		 * To simplify error handling, allow AIO when IO within i_size
+		 * or IO can be satisfied by single OSD request.
 		 */
-		truncate_inode_pages_range(inode->i_mapping, pos,
-				   (pos+n) | (PAGE_CACHE_SIZE-1));
-		osd_req_op_extent_osd_data_pages(req, 0, pages, n, start,
-						false, false);
+		if (pos == iocb->ki_pos && !is_sync_kiocb(iocb) &&
+		    (len == count || pos + count <= i_size_read(inode))) {
+			aio_req = kzalloc(sizeof(*aio_req), GFP_KERNEL);
+			if (aio_req) {
+				aio_req->iocb = iocb;
+				aio_req->write = write;
+				INIT_LIST_HEAD(&aio_req->osd_reqs);
+				if (write) {
+					aio_req->mtime = mtime;
+					swap(aio_req->prealloc_cf, *pcf);
+				}
+			}
+			/* ignore error */
+		}
+
+		if (write) {
+			/*
+			 * throw out any page cache pages in this range. this
+			 * may block.
+			 */
+			truncate_inode_pages_range(inode->i_mapping, pos,
+					(pos+len) | (PAGE_CACHE_SIZE - 1));
+
+			osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0);
+		}
+
+
+		osd_req_op_extent_osd_data_pages(req, 0, pages, len, start,
+						 false, false);
 
-		/* BUG_ON(vino.snap != CEPH_NOSNAP); */
 		ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime);
 
-		ret = ceph_osdc_start_request(&fsc->client->osdc, req, false);
+		if (aio_req) {
+			aio_req->total_len += len;
+			aio_req->num_reqs++;
+			atomic_inc(&aio_req->pending_reqs);
+
+			req->r_callback = ceph_aio_complete_req;
+			req->r_inode = inode;
+			req->r_priv = aio_req;
+			list_add_tail(&req->r_unsafe_item, &aio_req->osd_reqs);
+
+			pos += len;
+			iov_iter_advance(iter, len);
+			continue;
+		}
+
+		ret = ceph_osdc_start_request(req->r_osdc, req, false);
 		if (!ret)
 			ret = ceph_osdc_wait_request(&fsc->client->osdc, req);
 
+		size = i_size_read(inode);
+		if (!write) {
+			if (ret == -ENOENT)
+				ret = 0;
+			if (ret >= 0 && ret < len && pos + ret < size) {
+				int zlen = min_t(size_t, len - ret,
+						 size - pos - ret);
+				ceph_zero_page_vector_range(start + ret, zlen,
+							    pages);
+				ret += zlen;
+			}
+			if (ret >= 0)
+				len = ret;
+		}
+
 		ceph_put_page_vector(pages, num_pages, false);
 
 		ceph_osdc_put_request(req);
-		if (ret)
+		if (ret < 0)
+			break;
+
+		pos += len;
+		iov_iter_advance(iter, len);
+
+		if (!write && pos >= size)
 			break;
-		pos += n;
-		written += n;
-		iov_iter_advance(from, n);
 
-		if (pos > i_size_read(inode)) {
-			check_caps = ceph_inode_set_size(inode, pos);
-			if (check_caps)
+		if (write && pos > size) {
+			if (ceph_inode_set_size(inode, pos))
 				ceph_check_caps(ceph_inode(inode),
 						CHECK_CAPS_AUTHONLY,
 						NULL);
 		}
 	}
 
-	if (ret != -EOLDSNAPC && written > 0) {
+	if (aio_req) {
+		if (aio_req->num_reqs == 0) {
+			kfree(aio_req);
+			return ret;
+		}
+
+		ceph_get_cap_refs(ci, write ? CEPH_CAP_FILE_WR :
+					      CEPH_CAP_FILE_RD);
+
+		while (!list_empty(&aio_req->osd_reqs)) {
+			req = list_first_entry(&aio_req->osd_reqs,
+					       struct ceph_osd_request,
+					       r_unsafe_item);
+			list_del_init(&req->r_unsafe_item);
+			if (ret >= 0)
+				ret = ceph_osdc_start_request(req->r_osdc,
+							      req, false);
+			if (ret < 0) {
+				BUG_ON(ret == -EOLDSNAPC);
+				req->r_result = ret;
+				ceph_aio_complete_req(req, NULL);
+			}
+		}
+		return -EIOCBQUEUED;
+	}
+
+	if (ret != -EOLDSNAPC && pos > iocb->ki_pos) {
+		ret = pos - iocb->ki_pos;
 		iocb->ki_pos = pos;
-		ret = written;
 	}
 	return ret;
 }
 
-
 /*
  * Synchronous write, straight from __user pointer or user pages.
  *
@@ -897,8 +1133,14 @@ again:
 		     ceph_cap_string(got));
 
 		if (ci->i_inline_version == CEPH_INLINE_NONE) {
-			/* hmm, this isn't really async... */
-			ret = ceph_sync_read(iocb, to, &retry_op);
+			if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) {
+				ret = ceph_direct_read_write(iocb, to,
+							     NULL, NULL);
+				if (ret >= 0 && ret < len)
+					retry_op = CHECK_EOF;
+			} else {
+				ret = ceph_sync_read(iocb, to, &retry_op);
+			}
 		} else {
 			retry_op = READ_INLINE;
 		}
@@ -916,7 +1158,7 @@ again:
 		pinned_page = NULL;
 	}
 	ceph_put_cap_refs(ci, got);
-	if (retry_op && ret >= 0) {
+	if (retry_op > HAVE_RETRIED && ret >= 0) {
 		int statret;
 		struct page *page = NULL;
 		loff_t i_size;
@@ -968,12 +1210,11 @@ again:
 		if (retry_op == CHECK_EOF && iocb->ki_pos < i_size &&
 		    ret < len) {
 			dout("sync_read hit hole, ppos %lld < size %lld"
-			     ", reading more\n", iocb->ki_pos,
-			     inode->i_size);
+			     ", reading more\n", iocb->ki_pos, i_size);
 
 			read += ret;
 			len -= ret;
-			retry_op = 0;
+			retry_op = HAVE_RETRIED;
 			goto again;
 		}
 	}
@@ -1014,7 +1255,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	if (!prealloc_cf)
 		return -ENOMEM;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	/* We can write back this queue in page reclaim */
 	current->backing_dev_info = inode_to_bdi(inode);
@@ -1052,7 +1293,7 @@ retry_snap:
 	}
 
 	dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
-	     inode, ceph_vinop(inode), pos, count, inode->i_size);
+	     inode, ceph_vinop(inode), pos, count, i_size_read(inode));
 	if (fi->fmode & CEPH_FILE_MODE_LAZY)
 		want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO;
 	else
@@ -1070,7 +1311,7 @@ retry_snap:
 	    (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC)) {
 		struct ceph_snap_context *snapc;
 		struct iov_iter data;
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 
 		spin_lock(&ci->i_ceph_lock);
 		if (__ceph_have_pending_cap_snap(ci)) {
@@ -1088,8 +1329,8 @@ retry_snap:
 		/* we might need to revert back to that point */
 		data = *from;
 		if (iocb->ki_flags & IOCB_DIRECT)
-			written = ceph_sync_direct_write(iocb, &data, pos,
-							 snapc);
+			written = ceph_direct_read_write(iocb, &data, snapc,
+							 &prealloc_cf);
 		else
 			written = ceph_sync_write(iocb, &data, pos, snapc);
 		if (written == -EOLDSNAPC) {
@@ -1097,14 +1338,14 @@ retry_snap:
 				"got EOLDSNAPC, retrying\n",
 				inode, ceph_vinop(inode),
 				pos, (unsigned)count);
-			mutex_lock(&inode->i_mutex);
+			inode_lock(inode);
 			goto retry_snap;
 		}
 		if (written > 0)
 			iov_iter_advance(from, written);
 		ceph_put_snap_context(snapc);
 	} else {
-		loff_t old_size = inode->i_size;
+		loff_t old_size = i_size_read(inode);
 		/*
 		 * No need to acquire the i_truncate_mutex. Because
 		 * the MDS revokes Fwb caps before sending truncate
@@ -1115,9 +1356,9 @@ retry_snap:
 		written = generic_perform_write(file, from, pos);
 		if (likely(written >= 0))
 			iocb->ki_pos = pos + written;
-		if (inode->i_size > old_size)
+		if (i_size_read(inode) > old_size)
 			ceph_fscache_update_objectsize(inode);
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 	}
 
 	if (written >= 0) {
@@ -1147,7 +1388,7 @@ retry_snap:
 	goto out_unlocked;
 
 out:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 out_unlocked:
 	ceph_free_cap_flush(prealloc_cf);
 	current->backing_dev_info = NULL;
@@ -1160,9 +1401,10 @@ out_unlocked:
 static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
 {
 	struct inode *inode = file->f_mapping->host;
+	loff_t i_size;
 	int ret;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) {
 		ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false);
@@ -1172,9 +1414,10 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
 		}
 	}
 
+	i_size = i_size_read(inode);
 	switch (whence) {
 	case SEEK_END:
-		offset += inode->i_size;
+		offset += i_size;
 		break;
 	case SEEK_CUR:
 		/*
@@ -1190,24 +1433,24 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence)
 		offset += file->f_pos;
 		break;
 	case SEEK_DATA:
-		if (offset >= inode->i_size) {
+		if (offset >= i_size) {
 			ret = -ENXIO;
 			goto out;
 		}
 		break;
 	case SEEK_HOLE:
-		if (offset >= inode->i_size) {
+		if (offset >= i_size) {
 			ret = -ENXIO;
 			goto out;
 		}
-		offset = inode->i_size;
+		offset = i_size;
 		break;
 	}
 
 	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
 
 out:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	return offset;
 }
 
@@ -1363,7 +1606,7 @@ static long ceph_fallocate(struct file *file, int mode,
 	if (!prealloc_cf)
 		return -ENOMEM;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	if (ceph_snap(inode) != CEPH_NOSNAP) {
 		ret = -EROFS;
@@ -1418,7 +1661,7 @@ static long ceph_fallocate(struct file *file, int mode,
 
 	ceph_put_cap_refs(ci, got);
 unlock:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	ceph_free_cap_flush(prealloc_cf);
 	return ret;
 }
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 498dcfa2d..5849b88bb 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -396,6 +396,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
 	ci->i_symlink = NULL;
 
 	memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
+	ci->i_pool_ns_len = 0;
 
 	ci->i_fragtree = RB_ROOT;
 	mutex_init(&ci->i_fragtree_mutex);
@@ -548,7 +549,7 @@ int ceph_fill_file_size(struct inode *inode, int issued,
 	if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 ||
 	    (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) {
 		dout("size %lld -> %llu\n", inode->i_size, size);
-		inode->i_size = size;
+		i_size_write(inode, size);
 		inode->i_blocks = (size + (1<<9) - 1) >> 9;
 		ci->i_reported_size = size;
 		if (truncate_seq != ci->i_truncate_seq) {
@@ -756,6 +757,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
 		if (ci->i_layout.fl_pg_pool != info->layout.fl_pg_pool)
 			ci->i_ceph_flags &= ~CEPH_I_POOL_PERM;
 		ci->i_layout = info->layout;
+		ci->i_pool_ns_len = iinfo->pool_ns_len;
 
 		queue_trunc = ceph_fill_file_size(inode, issued,
 					le32_to_cpu(info->truncate_seq),
@@ -808,7 +810,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
 			spin_unlock(&ci->i_ceph_lock);
 
 			err = -EINVAL;
-			if (WARN_ON(symlen != inode->i_size))
+			if (WARN_ON(symlen != i_size_read(inode)))
 				goto out;
 
 			err = -ENOMEM;
@@ -1549,7 +1551,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size)
 
 	spin_lock(&ci->i_ceph_lock);
 	dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
-	inode->i_size = size;
+	i_size_write(inode, size);
 	inode->i_blocks = (size + (1 << 9) - 1) >> 9;
 
 	/* tell the MDS if we are approaching max_size */
@@ -1756,7 +1758,7 @@ retry:
  */
 static const struct inode_operations ceph_symlink_iops = {
 	.readlink = generic_readlink,
-	.follow_link = simple_follow_link,
+	.get_link = simple_get_link,
 	.setattr = ceph_setattr,
 	.getattr = ceph_getattr,
 	.setxattr = ceph_setxattr,
@@ -1911,7 +1913,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
 		     inode->i_size, attr->ia_size);
 		if ((issued & CEPH_CAP_FILE_EXCL) &&
 		    attr->ia_size > inode->i_size) {
-			inode->i_size = attr->ia_size;
+			i_size_write(inode, attr->ia_size);
 			inode->i_blocks =
 				(attr->ia_size + (1 << 9) - 1) >> 9;
 			inode->i_ctime = attr->ia_ctime;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index e7b130a63..911d64d86 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -100,6 +100,14 @@ static int parse_reply_info_in(void **p, void *end,
 	} else
 		info->inline_version = CEPH_INLINE_NONE;
 
+	if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) {
+		ceph_decode_32_safe(p, end, info->pool_ns_len, bad);
+		ceph_decode_need(p, end, info->pool_ns_len, bad);
+		*p += info->pool_ns_len;
+	} else {
+		info->pool_ns_len = 0;
+	}
+
 	return 0;
 bad:
 	return err;
@@ -2298,6 +2306,14 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
 		ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir),
 				  CEPH_CAP_PIN);
 
+	/* deny access to directories with pool_ns layouts */
+	if (req->r_inode && S_ISDIR(req->r_inode->i_mode) &&
+	    ceph_inode(req->r_inode)->i_pool_ns_len)
+		return -EIO;
+	if (req->r_locked_dir &&
+	    ceph_inode(req->r_locked_dir)->i_pool_ns_len)
+		return -EIO;
+
 	/* issue */
 	mutex_lock(&mdsc->mutex);
 	__register_request(mdsc, req, dir);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index ccf11ef0c..37712ccff 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -44,6 +44,7 @@ struct ceph_mds_reply_info_in {
 	u64 inline_version;
 	u32 inline_len;
 	char *inline_data;
+	u32 pool_ns_len;
 };
 
 /*
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index f446afada..ca4d5e845 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -639,8 +639,8 @@ static int __init init_caches(void)
 	ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
 				      sizeof(struct ceph_inode_info),
 				      __alignof__(struct ceph_inode_info),
-				      (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
-				      ceph_inode_init_once);
+				      SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
+				      SLAB_ACCOUNT, ceph_inode_init_once);
 	if (ceph_inode_cachep == NULL)
 		return -ENOMEM;
 
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 75b7d125c..9c458eb52 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -287,6 +287,7 @@ struct ceph_inode_info {
 
 	struct ceph_dir_layout i_dir_layout;
 	struct ceph_file_layout i_layout;
+	size_t i_pool_ns_len;
 	char *i_symlink;
 
 	/* for dirs */
diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c
index 7dc886c9a..e956cba94 100644
--- a/fs/cifs/cifs_dfs_ref.c
+++ b/fs/cifs/cifs_dfs_ref.c
@@ -175,7 +175,7 @@ char *cifs_compose_mount_options(const char *sb_mountdata,
 	 * string to the length of the original string to allow for worst case.
 	 */
 	md_len = strlen(sb_mountdata) + INET6_ADDRSTRLEN;
-	mountdata = kzalloc(md_len + 1, GFP_KERNEL);
+	mountdata = kzalloc(md_len + sizeof("ip=") + 1, GFP_KERNEL);
 	if (mountdata == NULL) {
 		rc = -ENOMEM;
 		goto compose_mount_options_err;
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index cbc0f4bca..2eea40353 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -507,6 +507,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
 
 	seq_printf(s, ",rsize=%u", cifs_sb->rsize);
 	seq_printf(s, ",wsize=%u", cifs_sb->wsize);
+	seq_printf(s, ",echo_interval=%lu",
+			tcon->ses->server->echo_interval / HZ);
 	/* convert actimeo and display it in seconds */
 	seq_printf(s, ",actimeo=%lu", cifs_sb->actimeo / HZ);
 
@@ -640,9 +642,9 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb)
 		while (*s && *s != sep)
 			s++;
 
-		mutex_lock(&dir->i_mutex);
+		inode_lock(dir);
 		child = lookup_one_len(p, dentry, s - p);
-		mutex_unlock(&dir->i_mutex);
+		inode_unlock(dir);
 		dput(dentry);
 		dentry = child;
 	} while (!IS_ERR(dentry));
@@ -752,6 +754,9 @@ cifs_loose_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 	ssize_t rc;
 	struct inode *inode = file_inode(iocb->ki_filp);
 
+	if (iocb->ki_filp->f_flags & O_DIRECT)
+		return cifs_user_readv(iocb, iter);
+
 	rc = cifs_revalidate_mapping(inode);
 	if (rc)
 		return rc;
@@ -766,6 +771,18 @@ static ssize_t cifs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	ssize_t written;
 	int rc;
 
+	if (iocb->ki_filp->f_flags & O_DIRECT) {
+		written = cifs_user_writev(iocb, from);
+		if (written > 0 && CIFS_CACHE_READ(cinode)) {
+			cifs_zap_mapping(inode);
+			cifs_dbg(FYI,
+				 "Set no oplock for inode=%p after a write operation\n",
+				 inode);
+			cinode->oplock = 0;
+		}
+		return written;
+	}
+
 	written = cifs_get_writer(cinode);
 	if (written)
 		return written;
@@ -900,8 +917,7 @@ const struct inode_operations cifs_file_inode_ops = {
 
 const struct inode_operations cifs_symlink_inode_ops = {
 	.readlink = generic_readlink,
-	.follow_link = cifs_follow_link,
-	.put_link = kfree_put_link,
+	.get_link = cifs_get_link,
 	.permission = cifs_permission,
 	/* BB add the following two eventually */
 	/* revalidate: cifs_revalidate,
@@ -914,6 +930,83 @@ const struct inode_operations cifs_symlink_inode_ops = {
 #endif
 };
 
+/*
+ * cifs_clone_file_range - clone_file_range handler for cifs files
+ * @src_file:	file the data is cloned from
+ * @off:	byte offset in @src_file where the range starts
+ * @dst_file:	file the data is cloned into
+ * @destoff:	byte offset in @dst_file that receives the range
+ * @len:	length of the range in bytes; 0 selects everything from @off
+ *		up to the current size of the source inode
+ *
+ * With both inodes locked, truncates the target's page cache over the
+ * destination range and hands the request to the server's
+ * duplicate_extents operation.
+ *
+ * Returns the result of duplicate_extents, -EBADF when either file has no
+ * cifsFileInfo attached, or -EOPNOTSUPP when the server ops do not
+ * implement duplicate_extents.
+ */
+static int cifs_clone_file_range(struct file *src_file, loff_t off,
+		struct file *dst_file, loff_t destoff, u64 len)
+{
+	struct inode *src_inode = file_inode(src_file);
+	struct inode *target_inode = file_inode(dst_file);
+	struct cifsFileInfo *smb_file_src = src_file->private_data;
+	struct cifsFileInfo *smb_file_target = dst_file->private_data;
+	struct cifs_tcon *target_tcon;
+	unsigned int xid;
+	int rc;
+
+	cifs_dbg(FYI, "clone range\n");
+
+	xid = get_xid();
+
+	if (!src_file->private_data || !dst_file->private_data) {
+		rc = -EBADF;
+		cifs_dbg(VFS, "missing cifsFileInfo on copy range src file\n");
+		goto out;
+	}
+
+	/*
+	 * Only follow the tlink once private_data is known to be non-NULL;
+	 * doing it in the declarations would dereference a NULL pointer
+	 * before the check above had a chance to reject the file.
+	 */
+	target_tcon = tlink_tcon(smb_file_target->tlink);
+
+	/*
+	 * Note: cifs case is easier than btrfs since server responsible for
+	 * checks for proper open modes and file type and if it wants
+	 * server could even support copy of range where source = target
+	 */
+	lock_two_nondirectories(target_inode, src_inode);
+
+	if (len == 0)
+		len = src_inode->i_size - off;
+
+	cifs_dbg(FYI, "about to flush pages\n");
+	/* should we flush first and last page first */
+	truncate_inode_pages_range(&target_inode->i_data, destoff,
+				   PAGE_CACHE_ALIGN(destoff + len)-1);
+
+	if (target_tcon->ses->server->ops->duplicate_extents)
+		rc = target_tcon->ses->server->ops->duplicate_extents(xid,
+			smb_file_src, smb_file_target, off, len, destoff);
+	else
+		rc = -EOPNOTSUPP;
+
+	/* force revalidate of size and timestamps of target file now
+	   that target is updated on the server */
+	CIFS_I(target_inode)->time = 0;
+	/* although unlocking in the reverse order from locking is not
+	   strictly necessary here it is a little cleaner to be consistent */
+	unlock_two_nondirectories(src_inode, target_inode);
+out:
+	free_xid(xid);
+	return rc;
+}
+
 const struct file_operations cifs_file_ops = {
 	.read_iter = cifs_loose_read_iter,
 	.write_iter = cifs_file_write_iter,
@@ -926,6 +995,7 @@ const struct file_operations cifs_file_ops = {
 	.splice_read = generic_file_splice_read,
 	.llseek = cifs_llseek,
 	.unlocked_ioctl	= cifs_ioctl,
+	.clone_file_range = cifs_clone_file_range,
 	.setlease = cifs_setlease,
 	.fallocate = cifs_fallocate,
 };
@@ -942,6 +1012,7 @@ const struct file_operations cifs_file_strict_ops = {
 	.splice_read = generic_file_splice_read,
 	.llseek = cifs_llseek,
 	.unlocked_ioctl	= cifs_ioctl,
+	.clone_file_range = cifs_clone_file_range,
 	.setlease = cifs_setlease,
 	.fallocate = cifs_fallocate,
 };
@@ -958,6 +1029,7 @@ const struct file_operations cifs_file_direct_ops = {
 	.mmap = cifs_file_mmap,
 	.splice_read = generic_file_splice_read,
 	.unlocked_ioctl  = cifs_ioctl,
+	.clone_file_range = cifs_clone_file_range,
 	.llseek = cifs_llseek,
 	.setlease = cifs_setlease,
 	.fallocate = cifs_fallocate,
@@ -974,6 +1046,7 @@ const struct file_operations cifs_file_nobrl_ops = {
 	.splice_read = generic_file_splice_read,
 	.llseek = cifs_llseek,
 	.unlocked_ioctl	= cifs_ioctl,
+	.clone_file_range = cifs_clone_file_range,
 	.setlease = cifs_setlease,
 	.fallocate = cifs_fallocate,
 };
@@ -989,6 +1062,7 @@ const struct file_operations cifs_file_strict_nobrl_ops = {
 	.splice_read = generic_file_splice_read,
 	.llseek = cifs_llseek,
 	.unlocked_ioctl	= cifs_ioctl,
+	.clone_file_range = cifs_clone_file_range,
 	.setlease = cifs_setlease,
 	.fallocate = cifs_fallocate,
 };
@@ -1004,6 +1078,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = {
 	.mmap = cifs_file_mmap,
 	.splice_read = generic_file_splice_read,
 	.unlocked_ioctl  = cifs_ioctl,
+	.clone_file_range = cifs_clone_file_range,
 	.llseek = cifs_llseek,
 	.setlease = cifs_setlease,
 	.fallocate = cifs_fallocate,
@@ -1014,6 +1089,7 @@ const struct file_operations cifs_dir_ops = {
 	.release = cifs_closedir,
 	.read    = generic_read_dir,
 	.unlocked_ioctl  = cifs_ioctl,
+	.clone_file_range = cifs_clone_file_range,
 	.llseek = generic_file_llseek,
 };
 
@@ -1032,7 +1108,7 @@ cifs_init_inodecache(void)
 	cifs_inode_cachep = kmem_cache_create("cifs_inode_cache",
 					      sizeof(struct cifsInodeInfo),
 					      0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					      cifs_init_once);
 	if (cifs_inode_cachep == NULL)
 		return -ENOMEM;
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 44b3d4280..83aac8ba5 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -116,9 +116,8 @@ extern struct vfsmount *cifs_dfs_d_automount(struct path *path);
 #endif
 
 /* Functions related to symlinks */
-extern const char *cifs_follow_link(struct dentry *direntry, void **cookie);
-extern int cifs_readlink(struct dentry *direntry, char __user *buffer,
-			 int buflen);
+extern const char *cifs_get_link(struct dentry *, struct inode *,
+			struct delayed_call *);
 extern int cifs_symlink(struct inode *inode, struct dentry *direntry,
 			const char *symname);
 extern int	cifs_removexattr(struct dentry *, const char *);
@@ -127,7 +126,6 @@ extern int	cifs_setxattr(struct dentry *, const char *, const void *,
 extern ssize_t	cifs_getxattr(struct dentry *, const char *, void *, size_t);
 extern ssize_t	cifs_listxattr(struct dentry *, char *, size_t);
 extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
-
 #ifdef CONFIG_CIFS_NFSD_EXPORT
 extern const struct export_operations cifs_export_ops;
 #endif /* CONFIG_CIFS_NFSD_EXPORT */
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 2b510c537..a25b2513f 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -70,8 +70,10 @@
 #define SERVER_NAME_LENGTH 40
 #define SERVER_NAME_LEN_WITH_NULL     (SERVER_NAME_LENGTH + 1)
 
-/* SMB echo "timeout" -- FIXME: tunable? */
-#define SMB_ECHO_INTERVAL (60 * HZ)
+/* echo interval in seconds */
+#define SMB_ECHO_INTERVAL_MIN 1
+#define SMB_ECHO_INTERVAL_MAX 600
+#define SMB_ECHO_INTERVAL_DEFAULT 60
 
 #include "cifspdu.h"
 
@@ -225,7 +227,7 @@ struct smb_version_operations {
 	void (*print_stats)(struct seq_file *m, struct cifs_tcon *);
 	void (*dump_share_caps)(struct seq_file *, struct cifs_tcon *);
 	/* verify the message */
-	int (*check_message)(char *, unsigned int);
+	int (*check_message)(char *, unsigned int, struct TCP_Server_Info *);
 	bool (*is_oplock_break)(char *, struct TCP_Server_Info *);
 	void (*downgrade_oplock)(struct TCP_Server_Info *,
 					struct cifsInodeInfo *, bool);
@@ -507,6 +509,7 @@ struct smb_vol {
 	struct sockaddr_storage dstaddr; /* destination address */
 	struct sockaddr_storage srcaddr; /* allow binding to a local IP */
 	struct nls_table *local_nls;
+	unsigned int echo_interval; /* echo interval in secs */
 };
 
 #define CIFS_MOUNT_MASK (CIFS_MOUNT_NO_PERM | CIFS_MOUNT_SET_UID | \
@@ -627,7 +630,9 @@ struct TCP_Server_Info {
 #ifdef CONFIG_CIFS_SMB2
 	unsigned int	max_read;
 	unsigned int	max_write;
+	__u8		preauth_hash[512];
 #endif /* CONFIG_CIFS_SMB2 */
+	unsigned long echo_interval;
 };
 
 static inline unsigned int
@@ -809,7 +814,10 @@ struct cifs_ses {
 	bool need_reconnect:1; /* connection reset, uid now invalid */
 #ifdef CONFIG_CIFS_SMB2
 	__u16 session_flags;
-	char smb3signingkey[SMB3_SIGN_KEY_SIZE]; /* for signing smb3 packets */
+	__u8 smb3signingkey[SMB3_SIGN_KEY_SIZE];
+	__u8 smb3encryptionkey[SMB3_SIGN_KEY_SIZE];
+	__u8 smb3decryptionkey[SMB3_SIGN_KEY_SIZE];
+	__u8 preauth_hash[512];
 #endif /* CONFIG_CIFS_SMB2 */
 };
 
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index c63fd1dde..eed7ff50f 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -102,7 +102,7 @@ extern int SendReceiveBlockingLock(const unsigned int xid,
 			struct smb_hdr *out_buf,
 			int *bytes_returned);
 extern int cifs_reconnect(struct TCP_Server_Info *server);
-extern int checkSMB(char *buf, unsigned int length);
+extern int checkSMB(char *buf, unsigned int len, struct TCP_Server_Info *srvr);
 extern bool is_valid_oplock_break(char *, struct TCP_Server_Info *);
 extern bool backup_cred(struct cifs_sb_info *);
 extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof);
@@ -439,7 +439,8 @@ extern int setup_ntlm_response(struct cifs_ses *, const struct nls_table *);
 extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *);
 extern void cifs_crypto_shash_release(struct TCP_Server_Info *);
 extern int calc_seckey(struct cifs_ses *);
-extern int generate_smb3signingkey(struct cifs_ses *);
+extern int generate_smb30signingkey(struct cifs_ses *);
+extern int generate_smb311signingkey(struct cifs_ses *);
 
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 extern int calc_lanman_hash(const char *password, const char *cryptkey,
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 3c194ff0d..a763cd3d9 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -95,6 +95,7 @@ enum {
 	Opt_cruid, Opt_gid, Opt_file_mode,
 	Opt_dirmode, Opt_port,
 	Opt_rsize, Opt_wsize, Opt_actimeo,
+	Opt_echo_interval,
 
 	/* Mount options which take string value */
 	Opt_user, Opt_pass, Opt_ip,
@@ -188,6 +189,7 @@ static const match_table_t cifs_mount_option_tokens = {
 	{ Opt_rsize, "rsize=%s" },
 	{ Opt_wsize, "wsize=%s" },
 	{ Opt_actimeo, "actimeo=%s" },
+	{ Opt_echo_interval, "echo_interval=%s" },
 
 	{ Opt_blank_user, "user=" },
 	{ Opt_blank_user, "username=" },
@@ -418,6 +420,7 @@ cifs_echo_request(struct work_struct *work)
 	int rc;
 	struct TCP_Server_Info *server = container_of(work,
 					struct TCP_Server_Info, echo.work);
+	unsigned long echo_interval = server->echo_interval;
 
 	/*
 	 * We cannot send an echo if it is disabled or until the
@@ -427,7 +430,7 @@ cifs_echo_request(struct work_struct *work)
 	 */
 	if (!server->ops->need_neg || server->ops->need_neg(server) ||
 	    (server->ops->can_echo && !server->ops->can_echo(server)) ||
-	    time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ))
+	    time_before(jiffies, server->lstrp + echo_interval - HZ))
 		goto requeue_echo;
 
 	rc = server->ops->echo ? server->ops->echo(server) : -ENOSYS;
@@ -436,7 +439,7 @@ cifs_echo_request(struct work_struct *work)
 			 server->hostname);
 
 requeue_echo:
-	queue_delayed_work(cifsiod_wq, &server->echo, SMB_ECHO_INTERVAL);
+	queue_delayed_work(cifsiod_wq, &server->echo, echo_interval);
 }
 
 static bool
@@ -487,9 +490,9 @@ server_unresponsive(struct TCP_Server_Info *server)
 	 *     a response in >60s.
 	 */
 	if (server->tcpStatus == CifsGood &&
-	    time_after(jiffies, server->lstrp + 2 * SMB_ECHO_INTERVAL)) {
-		cifs_dbg(VFS, "Server %s has not responded in %d seconds. Reconnecting...\n",
-			 server->hostname, (2 * SMB_ECHO_INTERVAL) / HZ);
+	    time_after(jiffies, server->lstrp + 2 * server->echo_interval)) {
+		cifs_dbg(VFS, "Server %s has not responded in %lu seconds. Reconnecting...\n",
+			 server->hostname, (2 * server->echo_interval) / HZ);
 		cifs_reconnect(server);
 		wake_up(&server->response_q);
 		return true;
@@ -828,7 +831,7 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid)
 	 * 48 bytes is enough to display the header and a little bit
 	 * into the payload for debugging purposes.
 	 */
-	length = server->ops->check_message(buf, server->total_read);
+	length = server->ops->check_message(buf, server->total_read, server);
 	if (length != 0)
 		cifs_dump_mem("Bad SMB: ", buf,
 			min_t(unsigned int, server->total_read, 48));
@@ -1624,6 +1627,14 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 				goto cifs_parse_mount_err;
 			}
 			break;
+		case Opt_echo_interval:
+			if (get_option_ul(args, &option)) {
+				cifs_dbg(VFS, "%s: Invalid echo interval value\n",
+					 __func__);
+				goto cifs_parse_mount_err;
+			}
+			vol->echo_interval = option;
+			break;
 
 		/* String Arguments */
 
@@ -2089,6 +2100,21 @@ static int match_server(struct TCP_Server_Info *server, struct smb_vol *vol)
 	if (!match_security(server, vol))
 		return 0;
 
+	/*
+	 * server->echo_interval is stored in jiffies and was normalised by
+	 * cifs_get_tcp_session(): a requested value outside
+	 * [SMB_ECHO_INTERVAL_MIN, SMB_ECHO_INTERVAL_MAX] seconds became the
+	 * default. vol->echo_interval is the raw mount option in seconds, so
+	 * apply the same normalisation and unit before comparing.
+	 */
+	if (vol->echo_interval >= SMB_ECHO_INTERVAL_MIN &&
+	    vol->echo_interval <= SMB_ECHO_INTERVAL_MAX) {
+		if (server->echo_interval != vol->echo_interval * HZ)
+			return 0;
+	} else if (server->echo_interval != SMB_ECHO_INTERVAL_DEFAULT * HZ) {
+		return 0;
+	}
+
 	return 1;
 }
 
@@ -2208,6 +2222,12 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 	tcp_ses->tcpStatus = CifsNew;
 	++tcp_ses->srv_count;
 
+	if (volume_info->echo_interval >= SMB_ECHO_INTERVAL_MIN &&
+		volume_info->echo_interval <= SMB_ECHO_INTERVAL_MAX)
+		tcp_ses->echo_interval = volume_info->echo_interval * HZ;
+	else
+		tcp_ses->echo_interval = SMB_ECHO_INTERVAL_DEFAULT * HZ;
+
 	rc = ip_connect(tcp_ses);
 	if (rc < 0) {
 		cifs_dbg(VFS, "Error connecting to socket. Aborting operation.\n");
@@ -2237,7 +2257,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info)
 	cifs_fscache_get_client_cookie(tcp_ses);
 
 	/* queue echo request delayed work */
-	queue_delayed_work(cifsiod_wq, &tcp_ses->echo, SMB_ECHO_INTERVAL);
+	queue_delayed_work(cifsiod_wq, &tcp_ses->echo, tcp_ses->echo_interval);
 
 	return tcp_ses;
 
@@ -2979,8 +2999,7 @@ ip_rfc1001_connect(struct TCP_Server_Info *server)
 	if (ses_init_buf) {
 		ses_init_buf->trailer.session_req.called_len = 32;
 
-		if (server->server_RFC1001_name &&
-		    server->server_RFC1001_name[0] != 0)
+		if (server->server_RFC1001_name[0] != 0)
 			rfc1002mangle(ses_init_buf->trailer.
 				      session_req.called_name,
 				      server->server_RFC1001_name,
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 0068e8221..ff882aeac 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2267,7 +2267,7 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end,
 	rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
 	if (rc)
 		return rc;
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	xid = get_xid();
 
@@ -2292,7 +2292,7 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end,
 	}
 
 	free_xid(xid);
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	return rc;
 }
 
@@ -2309,7 +2309,7 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 	rc = filemap_write_and_wait_range(inode->i_mapping, start, end);
 	if (rc)
 		return rc;
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	xid = get_xid();
 
@@ -2326,7 +2326,7 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 	}
 
 	free_xid(xid);
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	return rc;
 }
 
@@ -2672,7 +2672,7 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from)
 	 * with a brlock that prevents writing.
 	 */
 	down_read(&cinode->lock_sem);
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	rc = generic_write_checks(iocb, from);
 	if (rc <= 0)
@@ -2685,7 +2685,7 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from)
 	else
 		rc = -EACCES;
 out:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 
 	if (rc > 0) {
 		ssize_t err = generic_write_sync(file, iocb->ki_pos - rc, rc);
@@ -3391,13 +3391,13 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
 	 * should have access to this page, we're safe to simply set
 	 * PG_locked without checking it first.
 	 */
-	__set_page_locked(page);
+	__SetPageLocked(page);
 	rc = add_to_page_cache_locked(page, mapping,
 				      page->index, gfp);
 
 	/* give up if we can't stick it in the cache */
 	if (rc) {
-		__clear_page_locked(page);
+		__ClearPageLocked(page);
 		return rc;
 	}
 
@@ -3418,9 +3418,9 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list,
 		if (*bytes + PAGE_CACHE_SIZE > rsize)
 			break;
 
-		__set_page_locked(page);
+		__SetPageLocked(page);
 		if (add_to_page_cache_locked(page, mapping, page->index, gfp)) {
-			__clear_page_locked(page);
+			__ClearPageLocked(page);
 			break;
 		}
 		list_move_tail(&page->lru, tmplist);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index a329f5ba3..aeb26dbfa 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -814,8 +814,21 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
 			}
 		} else
 			fattr.cf_uniqueid = iunique(sb, ROOT_I);
-	} else
-		fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid;
+	} else {
+		if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) &&
+		    validinum == false && server->ops->get_srv_inum) {
+			/*
+			 * Pass a NULL tcon to ensure we don't make a round
+			 * trip to the server. This only works for SMB2+.
+			 */
+			tmprc = server->ops->get_srv_inum(xid,
+				NULL, cifs_sb, full_path,
+				&fattr.cf_uniqueid, data);
+			if (tmprc)
+				fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid;
+		} else
+			fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid;
+	}
 
 	/* query for SFU type info if supported and needed */
 	if (fattr.cf_cifsattrs & ATTR_SYSTEM &&
@@ -856,6 +869,13 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
 	} else {
 		/* we already have inode, update it */
 
+		/* if uniqueid is different, return error */
+		if (unlikely(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM &&
+		    CIFS_I(*inode)->uniqueid != fattr.cf_uniqueid)) {
+			rc = -ESTALE;
+			goto cgii_exit;
+		}
+
 		/* if filetype is different, return error */
 		if (unlikely(((*inode)->i_mode & S_IFMT) !=
 		    (fattr.cf_mode & S_IFMT))) {
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index 35cf990f8..7a3b84e30 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -34,73 +34,36 @@
 #include "cifs_ioctl.h"
 #include <linux/btrfs.h>
 
-static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file,
-			unsigned long srcfd, u64 off, u64 len, u64 destoff,
-			bool dup_extents)
+static int cifs_file_clone_range(unsigned int xid, struct file *src_file,
+			  struct file *dst_file)
 {
-	int rc;
-	struct cifsFileInfo *smb_file_target = dst_file->private_data;
+	struct inode *src_inode = file_inode(src_file);
 	struct inode *target_inode = file_inode(dst_file);
-	struct cifs_tcon *target_tcon;
-	struct fd src_file;
 	struct cifsFileInfo *smb_file_src;
-	struct inode *src_inode;
+	struct cifsFileInfo *smb_file_target;
 	struct cifs_tcon *src_tcon;
+	struct cifs_tcon *target_tcon;
+	int rc;
 
 	cifs_dbg(FYI, "ioctl clone range\n");
-	/* the destination must be opened for writing */
-	if (!(dst_file->f_mode & FMODE_WRITE)) {
-		cifs_dbg(FYI, "file target not open for write\n");
-		return -EINVAL;
-	}
 
-	/* check if target volume is readonly and take reference */
-	rc = mnt_want_write_file(dst_file);
-	if (rc) {
-		cifs_dbg(FYI, "mnt_want_write failed with rc %d\n", rc);
-		return rc;
-	}
-
-	src_file = fdget(srcfd);
-	if (!src_file.file) {
-		rc = -EBADF;
-		goto out_drop_write;
-	}
-
-	if (src_file.file->f_op->unlocked_ioctl != cifs_ioctl) {
-		rc = -EBADF;
-		cifs_dbg(VFS, "src file seems to be from a different filesystem type\n");
-		goto out_fput;
-	}
-
-	if ((!src_file.file->private_data) || (!dst_file->private_data)) {
+	if (!src_file->private_data || !dst_file->private_data) {
 		rc = -EBADF;
 		cifs_dbg(VFS, "missing cifsFileInfo on copy range src file\n");
-		goto out_fput;
+		goto out;
 	}
 
 	rc = -EXDEV;
 	smb_file_target = dst_file->private_data;
-	smb_file_src = src_file.file->private_data;
+	smb_file_src = src_file->private_data;
 	src_tcon = tlink_tcon(smb_file_src->tlink);
 	target_tcon = tlink_tcon(smb_file_target->tlink);
 
-	/* check source and target on same server (or volume if dup_extents) */
-	if (dup_extents && (src_tcon != target_tcon)) {
-		cifs_dbg(VFS, "source and target of copy not on same share\n");
-		goto out_fput;
-	}
-
-	if (!dup_extents && (src_tcon->ses != target_tcon->ses)) {
+	if (src_tcon->ses != target_tcon->ses) {
 		cifs_dbg(VFS, "source and target of copy not on same server\n");
-		goto out_fput;
+		goto out;
 	}
 
-	src_inode = file_inode(src_file.file);
-	rc = -EINVAL;
-	if (S_ISDIR(src_inode->i_mode))
-		goto out_fput;
-
 	/*
 	 * Note: cifs case is easier than btrfs since server responsible for
 	 * checks for proper open modes and file type and if it wants
@@ -108,34 +71,66 @@ static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file,
 	 */
 	lock_two_nondirectories(target_inode, src_inode);
 
-	/* determine range to clone */
-	rc = -EINVAL;
-	if (off + len > src_inode->i_size || off + len < off)
-		goto out_unlock;
-	if (len == 0)
-		len = src_inode->i_size - off;
-
 	cifs_dbg(FYI, "about to flush pages\n");
 	/* should we flush first and last page first */
-	truncate_inode_pages_range(&target_inode->i_data, destoff,
-				   PAGE_CACHE_ALIGN(destoff + len)-1);
+	truncate_inode_pages(&target_inode->i_data, 0);
 
-	if (dup_extents && target_tcon->ses->server->ops->duplicate_extents)
-		rc = target_tcon->ses->server->ops->duplicate_extents(xid,
-			smb_file_src, smb_file_target, off, len, destoff);
-	else if (!dup_extents && target_tcon->ses->server->ops->clone_range)
+	if (target_tcon->ses->server->ops->clone_range)
 		rc = target_tcon->ses->server->ops->clone_range(xid,
-			smb_file_src, smb_file_target, off, len, destoff);
+			smb_file_src, smb_file_target, 0, src_inode->i_size, 0);
 	else
 		rc = -EOPNOTSUPP;
 
 	/* force revalidate of size and timestamps of target file now
 	   that target is updated on the server */
 	CIFS_I(target_inode)->time = 0;
-out_unlock:
 	/* although unlocking in the reverse order from locking is not
 	   strictly necessary here it is a little cleaner to be consistent */
 	unlock_two_nondirectories(src_inode, target_inode);
+out:
+	return rc;
+}
+
+static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file,
+			unsigned long srcfd)
+{
+	int rc;
+	struct fd src_file;
+	struct inode *src_inode;
+
+	cifs_dbg(FYI, "ioctl clone range\n");
+	/* the destination must be opened for writing */
+	if (!(dst_file->f_mode & FMODE_WRITE)) {
+		cifs_dbg(FYI, "file target not open for write\n");
+		return -EINVAL;
+	}
+
+	/* check if target volume is readonly and take reference */
+	rc = mnt_want_write_file(dst_file);
+	if (rc) {
+		cifs_dbg(FYI, "mnt_want_write failed with rc %d\n", rc);
+		return rc;
+	}
+
+	src_file = fdget(srcfd);
+	if (!src_file.file) {
+		rc = -EBADF;
+		goto out_drop_write;
+	}
+
+	if (src_file.file->f_op->unlocked_ioctl != cifs_ioctl) {
+		rc = -EBADF;
+		cifs_dbg(VFS, "src file seems to be from a different filesystem type\n");
+		goto out_fput;
+	}
+
+	src_inode = file_inode(src_file.file);
+	rc = -EINVAL;
+	if (S_ISDIR(src_inode->i_mode))
+		goto out_fput;
+
+	rc = cifs_file_clone_range(xid, src_file.file, dst_file);
+
 out_fput:
 	fdput(src_file);
 out_drop_write:
@@ -256,10 +251,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg)
 			}
 			break;
 		case CIFS_IOC_COPYCHUNK_FILE:
-			rc = cifs_ioctl_clone(xid, filep, arg, 0, 0, 0, false);
-			break;
-		case BTRFS_IOC_CLONE:
-			rc = cifs_ioctl_clone(xid, filep, arg, 0, 0, 0, true);
+			rc = cifs_ioctl_clone(xid, filep, arg);
 			break;
 		case CIFS_IOC_SET_INTEGRITY:
 			if (pSMBFile == NULL)
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index e3548f73b..062c23755 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -627,9 +627,9 @@ cifs_hl_exit:
 }
 
 const char *
-cifs_follow_link(struct dentry *direntry, void **cookie)
+cifs_get_link(struct dentry *direntry, struct inode *inode,
+	      struct delayed_call *done)
 {
-	struct inode *inode = d_inode(direntry);
 	int rc = -ENOMEM;
 	unsigned int xid;
 	char *full_path = NULL;
@@ -639,6 +639,9 @@ cifs_follow_link(struct dentry *direntry, void **cookie)
 	struct cifs_tcon *tcon;
 	struct TCP_Server_Info *server;
 
+	if (!direntry)
+		return ERR_PTR(-ECHILD);
+
 	xid = get_xid();
 
 	tlink = cifs_sb_tlink(cifs_sb);
@@ -678,7 +681,8 @@ cifs_follow_link(struct dentry *direntry, void **cookie)
 		kfree(target_path);
 		return ERR_PTR(rc);
 	}
-	return *cookie = target_path;
+	set_delayed_call(done, kfree_link, target_path);
+	return target_path;
 }
 
 int
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 8442b8b8e..813fe13c2 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -310,7 +310,7 @@ check_smb_hdr(struct smb_hdr *smb)
 }
 
 int
-checkSMB(char *buf, unsigned int total_read)
+checkSMB(char *buf, unsigned int total_read, struct TCP_Server_Info *server)
 {
 	struct smb_hdr *smb = (struct smb_hdr *)buf;
 	__u32 rfclen = be32_to_cpu(smb->smb_buf_length);
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index 1c5907019..389fb9f8c 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -38,7 +38,7 @@ check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid)
 	 * Make sure that this really is an SMB, that it is a response,
 	 * and that the message ids match.
 	 */
-	if ((*(__le32 *)hdr->ProtocolId == SMB2_PROTO_NUMBER) &&
+	if ((hdr->ProtocolId == SMB2_PROTO_NUMBER) &&
 	    (mid == wire_mid)) {
 		if (hdr->Flags & SMB2_FLAGS_SERVER_TO_REDIR)
 			return 0;
@@ -50,9 +50,9 @@ check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid)
 				cifs_dbg(VFS, "Received Request not response\n");
 		}
 	} else { /* bad signature or mid */
-		if (*(__le32 *)hdr->ProtocolId != SMB2_PROTO_NUMBER)
+		if (hdr->ProtocolId != SMB2_PROTO_NUMBER)
 			cifs_dbg(VFS, "Bad protocol string signature header %x\n",
-				 *(unsigned int *) hdr->ProtocolId);
+				 le32_to_cpu(hdr->ProtocolId));
 		if (mid != wire_mid)
 			cifs_dbg(VFS, "Mids do not match: %llu and %llu\n",
 				 mid, wire_mid);
@@ -93,11 +93,11 @@ static const __le16 smb2_rsp_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = {
 };
 
 int
-smb2_check_message(char *buf, unsigned int length)
+smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr)
 {
 	struct smb2_hdr *hdr = (struct smb2_hdr *)buf;
 	struct smb2_pdu *pdu = (struct smb2_pdu *)hdr;
-	__u64 mid = le64_to_cpu(hdr->MessageId);
+	__u64 mid;
 	__u32 len = get_rfc1002_length(buf);
 	__u32 clc_len;  /* calculated length */
 	int command;
@@ -111,6 +111,30 @@ smb2_check_message(char *buf, unsigned int length)
 	 * ie Validate the wct via smb2_struct_sizes table above
 	 */
 
+	if (hdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) {
+		struct smb2_transform_hdr *thdr =
+			(struct smb2_transform_hdr *)buf;
+		struct cifs_ses *ses = NULL;
+		struct list_head *tmp;
+
+		/* decrypt frame now that it is completely read in */
+		spin_lock(&cifs_tcp_ses_lock);
+		list_for_each(tmp, &srvr->smb_ses_list) {
+			ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
+			if (ses->Suid == thdr->SessionId)
+				break;
+
+			ses = NULL;
+		}
+		spin_unlock(&cifs_tcp_ses_lock);
+		if (ses == NULL) {
+			cifs_dbg(VFS, "no decryption - session id not found\n");
+			return 1;
+		}
+	}
+
+
+	mid = le64_to_cpu(hdr->MessageId);
 	if (length < sizeof(struct smb2_pdu)) {
 		if ((length >= sizeof(struct smb2_hdr)) && (hdr->Status != 0)) {
 			pdu->StructureSize2 = 0;
@@ -322,7 +346,7 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr)
 
 	/* return pointer to beginning of data area, ie offset from SMB start */
 	if ((*off != 0) && (*len != 0))
-		return (char *)(&hdr->ProtocolId[0]) + *off;
+		return (char *)(&hdr->ProtocolId) + *off;
 	else
 		return NULL;
 }
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 53ccdde6f..3525ed756 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -182,6 +182,11 @@ smb2_find_mid(struct TCP_Server_Info *server, char *buf)
 	struct smb2_hdr *hdr = (struct smb2_hdr *)buf;
 	__u64 wire_mid = le64_to_cpu(hdr->MessageId);
 
+	if (hdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) {
+		cifs_dbg(VFS, "encrypted frame parsing not supported yet\n");
+		return NULL;
+	}
+
 	spin_lock(&GlobalMid_Lock);
 	list_for_each_entry(mid, &server->pending_mid_q, qhead) {
 		if ((mid->mid == wire_mid) &&
@@ -1692,7 +1697,7 @@ struct smb_version_operations smb30_operations = {
 	.get_lease_key = smb2_get_lease_key,
 	.set_lease_key = smb2_set_lease_key,
 	.new_lease_key = smb2_new_lease_key,
-	.generate_signingkey = generate_smb3signingkey,
+	.generate_signingkey = generate_smb30signingkey,
 	.calc_signature = smb3_calc_signature,
 	.set_integrity  = smb3_set_integrity,
 	.is_read_op = smb21_is_read_op,
@@ -1779,7 +1784,7 @@ struct smb_version_operations smb311_operations = {
 	.get_lease_key = smb2_get_lease_key,
 	.set_lease_key = smb2_set_lease_key,
 	.new_lease_key = smb2_new_lease_key,
-	.generate_signingkey = generate_smb3signingkey,
+	.generate_signingkey = generate_smb311signingkey,
 	.calc_signature = smb3_calc_signature,
 	.set_integrity  = smb3_set_integrity,
 	.is_read_op = smb21_is_read_op,
@@ -1838,7 +1843,7 @@ struct smb_version_values smb21_values = {
 struct smb_version_values smb30_values = {
 	.version_string = SMB30_VERSION_STRING,
 	.protocol_id = SMB30_PROT_ID,
-	.req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES,
+	.req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION,
 	.large_lock_type = 0,
 	.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
 	.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
@@ -1858,7 +1863,7 @@ struct smb_version_values smb30_values = {
 struct smb_version_values smb302_values = {
 	.version_string = SMB302_VERSION_STRING,
 	.protocol_id = SMB302_PROT_ID,
-	.req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES,
+	.req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION,
 	.large_lock_type = 0,
 	.exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK,
 	.shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK,
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 373b5cd1c..42e1f440e 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -97,10 +97,7 @@ smb2_hdr_assemble(struct smb2_hdr *hdr, __le16 smb2_cmd /* command */ ,
 	hdr->smb2_buf_length = cpu_to_be32(parmsize + sizeof(struct smb2_hdr)
 			- 4 /*  RFC 1001 length field itself not counted */);
 
-	hdr->ProtocolId[0] = 0xFE;
-	hdr->ProtocolId[1] = 'S';
-	hdr->ProtocolId[2] = 'M';
-	hdr->ProtocolId[3] = 'B';
+	hdr->ProtocolId = SMB2_PROTO_NUMBER;
 	hdr->StructureSize = cpu_to_le16(64);
 	hdr->Command = smb2_cmd;
 	hdr->CreditRequest = cpu_to_le16(2); /* BB make this dynamic */
@@ -1577,7 +1574,8 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
 		goto ioctl_exit;
 	}
 
-	memcpy(*out_data, rsp->hdr.ProtocolId + le32_to_cpu(rsp->OutputOffset),
+	memcpy(*out_data,
+	       (char *)&rsp->hdr.ProtocolId + le32_to_cpu(rsp->OutputOffset),
 	       *plen);
 ioctl_exit:
 	free_rsp_buf(resp_buftype, rsp);
@@ -2097,7 +2095,7 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms,
 	}
 
 	if (*buf) {
-		memcpy(*buf, (char *)rsp->hdr.ProtocolId + rsp->DataOffset,
+		memcpy(*buf, (char *)&rsp->hdr.ProtocolId + rsp->DataOffset,
 		       *nbytes);
 		free_rsp_buf(resp_buftype, iov[0].iov_base);
 	} else if (resp_buftype != CIFS_NO_BUFFER) {
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 4af52780e..ff88d9feb 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -86,6 +86,7 @@
 #define MAX_SMB2_HDR_SIZE 0x78 /* 4 len + 64 hdr + (2*24 wct) + 2 bct + 2 pad */
 
 #define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe)
+#define SMB2_TRANSFORM_PROTO_NUM cpu_to_le32(0x424d53fd)
 
 /*
  * SMB2 Header Definition
@@ -102,7 +103,7 @@ struct smb2_hdr {
 	__be32 smb2_buf_length;	/* big endian on wire */
 				/* length is only two or three bytes - with
 				 one or two byte type preceding it that MBZ */
-	__u8   ProtocolId[4];	/* 0xFE 'S' 'M' 'B' */
+	__le32 ProtocolId;	/* 0xFE 'S' 'M' 'B' */
 	__le16 StructureSize;	/* 64 */
 	__le16 CreditCharge;	/* MBZ */
 	__le32 Status;		/* Error from server */
@@ -128,11 +129,10 @@ struct smb2_transform_hdr {
 				 one or two byte type preceding it that MBZ */
 	__u8   ProtocolId[4];	/* 0xFD 'S' 'M' 'B' */
 	__u8   Signature[16];
-	__u8   Nonce[11];
-	__u8   Reserved[5];
+	__u8   Nonce[16];
 	__le32 OriginalMessageSize;
 	__u16  Reserved1;
-	__le16 EncryptionAlgorithm;
+	__le16 Flags; /* EncryptionAlgorithm */
 	__u64  SessionId;
 } __packed;
 
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 79dc650c1..4f07dc936 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -34,7 +34,8 @@ struct smb_rqst;
  *****************************************************************
  */
 extern int map_smb2_to_linux_error(char *buf, bool log_err);
-extern int smb2_check_message(char *buf, unsigned int length);
+extern int smb2_check_message(char *buf, unsigned int length,
+			      struct TCP_Server_Info *server);
 extern unsigned int smb2_calc_size(void *buf);
 extern char *smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr);
 extern __le16 *cifs_convert_path_to_utf16(const char *from,
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index d4c5b6f10..8732a43b1 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -222,8 +222,8 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
 	return rc;
 }
 
-int
-generate_smb3signingkey(struct cifs_ses *ses)
+static int generate_key(struct cifs_ses *ses, struct kvec label,
+			struct kvec context, __u8 *key, unsigned int key_size)
 {
 	unsigned char zero = 0x0;
 	__u8 i[4] = {0, 0, 0, 1};
@@ -233,7 +233,7 @@ generate_smb3signingkey(struct cifs_ses *ses)
 	unsigned char *hashptr = prfhash;
 
 	memset(prfhash, 0x0, SMB2_HMACSHA256_SIZE);
-	memset(ses->smb3signingkey, 0x0, SMB3_SIGNKEY_SIZE);
+	memset(key, 0x0, key_size);
 
 	rc = smb3_crypto_shash_allocate(ses->server);
 	if (rc) {
@@ -262,7 +262,7 @@ generate_smb3signingkey(struct cifs_ses *ses)
 	}
 
 	rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash,
-				"SMB2AESCMAC", 12);
+				label.iov_base, label.iov_len);
 	if (rc) {
 		cifs_dbg(VFS, "%s: Could not update with label\n", __func__);
 		goto smb3signkey_ret;
@@ -276,7 +276,7 @@ generate_smb3signingkey(struct cifs_ses *ses)
 	}
 
 	rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash,
-				"SmbSign", 8);
+				context.iov_base, context.iov_len);
 	if (rc) {
 		cifs_dbg(VFS, "%s: Could not update with context\n", __func__);
 		goto smb3signkey_ret;
@@ -296,12 +296,102 @@ generate_smb3signingkey(struct cifs_ses *ses)
 		goto smb3signkey_ret;
 	}
 
-	memcpy(ses->smb3signingkey, hashptr, SMB3_SIGNKEY_SIZE);
+	memcpy(key, hashptr, key_size);
 
 smb3signkey_ret:
 	return rc;
 }
 
+struct derivation {
+	struct kvec label;
+	struct kvec context;
+};
+
+struct derivation_triplet {
+	struct derivation signing;
+	struct derivation encryption;
+	struct derivation decryption;
+};
+
+static int
+generate_smb3signingkey(struct cifs_ses *ses,
+			const struct derivation_triplet *ptriplet)
+{
+	int rc;
+
+	rc = generate_key(ses, ptriplet->signing.label,
+			  ptriplet->signing.context, ses->smb3signingkey,
+			  SMB3_SIGN_KEY_SIZE);
+	if (rc)
+		return rc;
+
+	rc = generate_key(ses, ptriplet->encryption.label,
+			  ptriplet->encryption.context, ses->smb3encryptionkey,
+			  SMB3_SIGN_KEY_SIZE);
+	if (rc)
+		return rc;
+
+	return generate_key(ses, ptriplet->decryption.label,
+			    ptriplet->decryption.context,
+			    ses->smb3decryptionkey, SMB3_SIGN_KEY_SIZE);
+}
+
+int
+generate_smb30signingkey(struct cifs_ses *ses)
+
+{
+	struct derivation_triplet triplet;
+	struct derivation *d;
+
+	d = &triplet.signing;
+	d->label.iov_base = "SMB2AESCMAC";
+	d->label.iov_len = 12;
+	d->context.iov_base = "SmbSign";
+	d->context.iov_len = 8;
+
+	d = &triplet.encryption;
+	d->label.iov_base = "SMB2AESCCM";
+	d->label.iov_len = 11;
+	d->context.iov_base = "ServerIn ";
+	d->context.iov_len = 10;
+
+	d = &triplet.decryption;
+	d->label.iov_base = "SMB2AESCCM";
+	d->label.iov_len = 11;
+	d->context.iov_base = "ServerOut";
+	d->context.iov_len = 10;
+
+	return generate_smb3signingkey(ses, &triplet);
+}
+
+int
+generate_smb311signingkey(struct cifs_ses *ses)
+
+{
+	struct derivation_triplet triplet;
+	struct derivation *d;
+
+	d = &triplet.signing;
+	d->label.iov_base = "SMB2AESCMAC";
+	d->label.iov_len = 12;
+	d->context.iov_base = "SmbSign";
+	d->context.iov_len = 8;
+
+	d = &triplet.encryption;
+	d->label.iov_base = "SMB2AESCCM";
+	d->label.iov_len = 11;
+	d->context.iov_base = "ServerIn ";
+	d->context.iov_len = 10;
+
+	d = &triplet.decryption;
+	d->label.iov_base = "SMB2AESCCM";
+	d->label.iov_len = 11;
+	d->context.iov_base = "ServerOut";
+	d->context.iov_len = 10;
+
+	return generate_smb3signingkey(ses, &triplet);
+}
+
 int
 smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
 {
diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c
index ff9e1f8b1..f5dc2f0df 100644
--- a/fs/cifs/xattr.c
+++ b/fs/cifs/xattr.c
@@ -190,8 +190,8 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 #endif /* CONFIG_CIFS_ACL */
 	} else {
 		int temp;
-		temp = strncmp(ea_name, POSIX_ACL_XATTR_ACCESS,
-			strlen(POSIX_ACL_XATTR_ACCESS));
+		temp = strncmp(ea_name, XATTR_NAME_POSIX_ACL_ACCESS,
+			strlen(XATTR_NAME_POSIX_ACL_ACCESS));
 		if (temp == 0) {
 #ifdef CONFIG_CIFS_POSIX
 			if (sb->s_flags & MS_POSIXACL)
@@ -203,8 +203,8 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name,
 #else
 			cifs_dbg(FYI, "set POSIX ACL not supported\n");
 #endif
-		} else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT,
-				   strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) {
+		} else if (strncmp(ea_name, XATTR_NAME_POSIX_ACL_DEFAULT,
+				   strlen(XATTR_NAME_POSIX_ACL_DEFAULT)) == 0) {
 #ifdef CONFIG_CIFS_POSIX
 			if (sb->s_flags & MS_POSIXACL)
 				rc = CIFSSMBSetPosixACL(xid, pTcon, full_path,
@@ -292,8 +292,8 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 			rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon,
 				full_path, ea_name, ea_value, buf_size,
 				cifs_sb->local_nls, cifs_remap(cifs_sb));
-	} else if (strncmp(ea_name, POSIX_ACL_XATTR_ACCESS,
-			  strlen(POSIX_ACL_XATTR_ACCESS)) == 0) {
+	} else if (strncmp(ea_name, XATTR_NAME_POSIX_ACL_ACCESS,
+			  strlen(XATTR_NAME_POSIX_ACL_ACCESS)) == 0) {
 #ifdef CONFIG_CIFS_POSIX
 		if (sb->s_flags & MS_POSIXACL)
 			rc = CIFSSMBGetPosixACL(xid, pTcon, full_path,
@@ -303,8 +303,8 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name,
 #else
 		cifs_dbg(FYI, "Query POSIX ACL not supported yet\n");
 #endif /* CONFIG_CIFS_POSIX */
-	} else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT,
-			  strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) {
+	} else if (strncmp(ea_name, XATTR_NAME_POSIX_ACL_DEFAULT,
+			  strlen(XATTR_NAME_POSIX_ACL_DEFAULT)) == 0) {
 #ifdef CONFIG_CIFS_POSIX
 		if (sb->s_flags & MS_POSIXACL)
 			rc = CIFSSMBGetPosixACL(xid, pTcon, full_path,
diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c
index 7740b1c87..1bfb7ba4e 100644
--- a/fs/coda/cnode.c
+++ b/fs/coda/cnode.c
@@ -8,6 +8,7 @@
 
 #include <linux/coda.h>
 #include <linux/coda_psdev.h>
+#include <linux/pagemap.h>
 #include "coda_linux.h"
 
 static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2)
@@ -17,8 +18,7 @@ static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2)
 
 static const struct inode_operations coda_symlink_inode_operations = {
 	.readlink	= generic_readlink,
-	.follow_link	= page_follow_link_light,
-	.put_link	= page_put_link,
+	.get_link	= page_get_link,
 	.setattr	= coda_setattr,
 };
 
@@ -35,6 +35,7 @@ static void coda_fill_inode(struct inode *inode, struct coda_vattr *attr)
                 inode->i_fop = &coda_dir_operations;
         } else if (S_ISLNK(inode->i_mode)) {
 		inode->i_op = &coda_symlink_inode_operations;
+		inode_nohighmem(inode);
 		inode->i_data.a_ops = &coda_symlink_aops;
 		inode->i_mapping = &inode->i_data;
 	} else
diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h
index f829fe963..5104d84c4 100644
--- a/fs/coda/coda_linux.h
+++ b/fs/coda/coda_linux.h
@@ -72,8 +72,7 @@ void coda_sysctl_clean(void);
 } while (0)
 
 
-#define CODA_FREE(ptr,size) \
-    do { if (size < PAGE_SIZE) kfree((ptr)); else vfree((ptr)); } while (0)
+#define CODA_FREE(ptr, size) kvfree((ptr))
 
 /* inode to cnode access functions */
 
diff --git a/fs/coda/dir.c b/fs/coda/dir.c
index fda9f4311..42e731b8c 100644
--- a/fs/coda/dir.c
+++ b/fs/coda/dir.c
@@ -427,13 +427,13 @@ static int coda_readdir(struct file *coda_file, struct dir_context *ctx)
 	if (host_file->f_op->iterate) {
 		struct inode *host_inode = file_inode(host_file);
 
-		mutex_lock(&host_inode->i_mutex);
+		inode_lock(host_inode);
 		ret = -ENOENT;
 		if (!IS_DEADDIR(host_inode)) {
 			ret = host_file->f_op->iterate(host_file, ctx);
 			file_accessed(host_file);
 		}
-		mutex_unlock(&host_inode->i_mutex);
+		inode_unlock(host_inode);
 		return ret;
 	}
 	/* Venus: we must read Venus dirents from a file */
diff --git a/fs/coda/file.c b/fs/coda/file.c
index 1da3805f3..f47c74838 100644
--- a/fs/coda/file.c
+++ b/fs/coda/file.c
@@ -71,12 +71,12 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to)
 
 	host_file = cfi->cfi_container;
 	file_start_write(host_file);
-	mutex_lock(&coda_inode->i_mutex);
+	inode_lock(coda_inode);
 	ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos);
 	coda_inode->i_size = file_inode(host_file)->i_size;
 	coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9;
 	coda_inode->i_mtime = coda_inode->i_ctime = CURRENT_TIME_SEC;
-	mutex_unlock(&coda_inode->i_mutex);
+	inode_unlock(coda_inode);
 	file_end_write(host_file);
 	return ret;
 }
@@ -203,7 +203,7 @@ int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync)
 	err = filemap_write_and_wait_range(coda_inode->i_mapping, start, end);
 	if (err)
 		return err;
-	mutex_lock(&coda_inode->i_mutex);
+	inode_lock(coda_inode);
 
 	cfi = CODA_FTOC(coda_file);
 	BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC);
@@ -212,7 +212,7 @@ int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync)
 	err = vfs_fsync(host_file, datasync);
 	if (!err && !datasync)
 		err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode));
-	mutex_unlock(&coda_inode->i_mutex);
+	inode_unlock(coda_inode);
 
 	return err;
 }
diff --git a/fs/coda/inode.c b/fs/coda/inode.c
index cac1390b8..57e81cbba 100644
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -74,9 +74,9 @@ static void init_once(void *foo)
 int __init coda_init_inodecache(void)
 {
 	coda_inode_cachep = kmem_cache_create("coda_inode_cache",
-				sizeof(struct coda_inode_info),
-				0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
-				init_once);
+				sizeof(struct coda_inode_info), 0,
+				SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
+				SLAB_ACCOUNT, init_once);
 	if (coda_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c
index ab94ef63c..03736e20d 100644
--- a/fs/coda/symlink.c
+++ b/fs/coda/symlink.c
@@ -26,7 +26,7 @@ static int coda_symlink_filler(struct file *file, struct page *page)
 	int error;
 	struct coda_inode_info *cii;
 	unsigned int len = PAGE_SIZE;
-	char *p = kmap(page);
+	char *p = page_address(page);
 
 	cii = ITOC(inode);
 
@@ -34,13 +34,11 @@ static int coda_symlink_filler(struct file *file, struct page *page)
 	if (error)
 		goto fail;
 	SetPageUptodate(page);
-	kunmap(page);
 	unlock_page(page);
 	return 0;
 
 fail:
 	SetPageError(page);
-	kunmap(page);
 	unlock_page(page);
 	return error;
 }
diff --git a/fs/compat.c b/fs/compat.c
index 6fd272d45..a71936a3f 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -792,7 +792,7 @@ COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name,
 		       const void __user *, data)
 {
 	char *kernel_type;
-	unsigned long data_page;
+	void *options;
 	char *kernel_dev;
 	int retval;
 
@@ -806,26 +806,25 @@ COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name,
 	if (IS_ERR(kernel_dev))
 		goto out1;
 
-	retval = copy_mount_options(data, &data_page);
-	if (retval < 0)
+	options = copy_mount_options(data);
+	retval = PTR_ERR(options);
+	if (IS_ERR(options))
 		goto out2;
 
-	retval = -EINVAL;
-
-	if (kernel_type && data_page) {
+	if (kernel_type && options) {
 		if (!strcmp(kernel_type, NCPFS_NAME)) {
-			do_ncp_super_data_conv((void *)data_page);
+			do_ncp_super_data_conv(options);
 		} else if (!strcmp(kernel_type, NFS4_NAME)) {
-			if (do_nfs4_super_data_conv((void *) data_page))
+			retval = -EINVAL;
+			if (do_nfs4_super_data_conv(options))
 				goto out3;
 		}
 	}
 
-	retval = do_mount(kernel_dev, dir_name, kernel_type,
-			flags, (void*)data_page);
+	retval = do_mount(kernel_dev, dir_name, kernel_type, flags, options);
 
  out3:
-	free_page(data_page);
+	kfree(options);
  out2:
 	kfree(kernel_dev);
  out1:
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index dcf26537c..6402eaf8a 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -58,6 +58,8 @@
 #include <linux/atalk.h>
 #include <linux/gfp.h>
 
+#include "internal.h"
+
 #include <net/bluetooth/bluetooth.h>
 #include <net/bluetooth/hci_sock.h>
 #include <net/bluetooth/rfcomm.h>
@@ -115,19 +117,38 @@
 #include <asm/fbio.h>
 #endif
 
-static int w_long(unsigned int fd, unsigned int cmd,
-		compat_ulong_t __user *argp)
+#define convert_in_user(srcptr, dstptr)			\
+({							\
+	typeof(*srcptr) val;				\
+							\
+	get_user(val, srcptr) || put_user(val, dstptr);	\
+})
+
+static int do_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 {
-	mm_segment_t old_fs = get_fs();
 	int err;
-	unsigned long val;
 
-	set_fs (KERNEL_DS);
-	err = sys_ioctl(fd, cmd, (unsigned long)&val);
-	set_fs (old_fs);
-	if (!err && put_user(val, argp))
+	err = security_file_ioctl(file, cmd, arg);
+	if (err)
+		return err;
+
+	return vfs_ioctl(file, cmd, arg);
+}
+
+static int w_long(struct file *file,
+		unsigned int cmd, compat_ulong_t __user *argp)
+{
+	int err;
+	unsigned long __user *valp = compat_alloc_user_space(sizeof(*valp));
+
+	if (valp == NULL)
 		return -EFAULT;
-	return err;
+	err = do_ioctl(file, cmd, (unsigned long)valp);
+	if (err)
+		return err;
+	if (convert_in_user(valp, argp))
+		return -EFAULT;
+	return 0;
 }
 
 struct compat_video_event {
@@ -139,23 +160,23 @@ struct compat_video_event {
 	} u;
 };
 
-static int do_video_get_event(unsigned int fd, unsigned int cmd,
-		struct compat_video_event __user *up)
+static int do_video_get_event(struct file *file,
+		unsigned int cmd, struct compat_video_event __user *up)
 {
-	struct video_event kevent;
-	mm_segment_t old_fs = get_fs();
+	struct video_event __user *kevent =
+		compat_alloc_user_space(sizeof(*kevent));
 	int err;
 
-	set_fs(KERNEL_DS);
-	err = sys_ioctl(fd, cmd, (unsigned long) &kevent);
-	set_fs(old_fs);
+	if (kevent == NULL)
+		return -EFAULT;
 
+	err = do_ioctl(file, cmd, (unsigned long)kevent);
 	if (!err) {
-		err  = put_user(kevent.type, &up->type);
-		err |= put_user(kevent.timestamp, &up->timestamp);
-		err |= put_user(kevent.u.size.w, &up->u.size.w);
-		err |= put_user(kevent.u.size.h, &up->u.size.h);
-		err |= put_user(kevent.u.size.aspect_ratio,
+		err  = convert_in_user(&kevent->type, &up->type);
+		err |= convert_in_user(&kevent->timestamp, &up->timestamp);
+		err |= convert_in_user(&kevent->u.size.w, &up->u.size.w);
+		err |= convert_in_user(&kevent->u.size.h, &up->u.size.h);
+		err |= convert_in_user(&kevent->u.size.aspect_ratio,
 				&up->u.size.aspect_ratio);
 		if (err)
 			err = -EFAULT;
@@ -169,8 +190,8 @@ struct compat_video_still_picture {
         int32_t size;
 };
 
-static int do_video_stillpicture(unsigned int fd, unsigned int cmd,
-	struct compat_video_still_picture __user *up)
+static int do_video_stillpicture(struct file *file,
+		unsigned int cmd, struct compat_video_still_picture __user *up)
 {
 	struct video_still_picture __user *up_native;
 	compat_uptr_t fp;
@@ -190,7 +211,7 @@ static int do_video_stillpicture(unsigned int fd, unsigned int cmd,
 	if (err)
 		return -EFAULT;
 
-	err = sys_ioctl(fd, cmd, (unsigned long) up_native);
+	err = do_ioctl(file, cmd, (unsigned long) up_native);
 
 	return err;
 }
@@ -200,8 +221,8 @@ struct compat_video_spu_palette {
 	compat_uptr_t palette;
 };
 
-static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd,
-		struct compat_video_spu_palette __user *up)
+static int do_video_set_spu_palette(struct file *file,
+		unsigned int cmd, struct compat_video_spu_palette __user *up)
 {
 	struct video_spu_palette __user *up_native;
 	compat_uptr_t palp;
@@ -218,7 +239,7 @@ static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd,
 	if (err)
 		return -EFAULT;
 
-	err = sys_ioctl(fd, cmd, (unsigned long) up_native);
+	err = do_ioctl(file, cmd, (unsigned long) up_native);
 
 	return err;
 }
@@ -276,7 +297,7 @@ static int sg_build_iovec(sg_io_hdr_t __user *sgio, void __user *dxferp, u16 iov
 	return 0;
 }
 
-static int sg_ioctl_trans(unsigned int fd, unsigned int cmd,
+static int sg_ioctl_trans(struct file *file, unsigned int cmd,
 			sg_io_hdr32_t __user *sgio32)
 {
 	sg_io_hdr_t __user *sgio;
@@ -289,7 +310,7 @@ static int sg_ioctl_trans(unsigned int fd, unsigned int cmd,
 	if (get_user(interface_id, &sgio32->interface_id))
 		return -EFAULT;
 	if (interface_id != 'S')
-		return sys_ioctl(fd, cmd, (unsigned long)sgio32);
+		return do_ioctl(file, cmd, (unsigned long)sgio32);
 
 	if (get_user(iovec_count, &sgio32->iovec_count))
 		return -EFAULT;
@@ -349,7 +370,7 @@ static int sg_ioctl_trans(unsigned int fd, unsigned int cmd,
 	if (put_user(compat_ptr(data), &sgio->usr_ptr))
 		return -EFAULT;
 
-	err = sys_ioctl(fd, cmd, (unsigned long) sgio);
+	err = do_ioctl(file, cmd, (unsigned long) sgio);
 
 	if (err >= 0) {
 		void __user *datap;
@@ -380,13 +401,13 @@ struct compat_sg_req_info { /* used by SG_GET_REQUEST_TABLE ioctl() */
 	int unused;
 };
 
-static int sg_grt_trans(unsigned int fd, unsigned int cmd, struct
-			compat_sg_req_info __user *o)
+static int sg_grt_trans(struct file *file,
+		unsigned int cmd, struct compat_sg_req_info __user *o)
 {
 	int err, i;
 	sg_req_info_t __user *r;
 	r = compat_alloc_user_space(sizeof(sg_req_info_t)*SG_MAX_QUEUE);
-	err = sys_ioctl(fd,cmd,(unsigned long)r);
+	err = do_ioctl(file, cmd, (unsigned long)r);
 	if (err < 0)
 		return err;
 	for (i = 0; i < SG_MAX_QUEUE; i++) {
@@ -412,8 +433,8 @@ struct sock_fprog32 {
 #define PPPIOCSPASS32	_IOW('t', 71, struct sock_fprog32)
 #define PPPIOCSACTIVE32	_IOW('t', 70, struct sock_fprog32)
 
-static int ppp_sock_fprog_ioctl_trans(unsigned int fd, unsigned int cmd,
-			struct sock_fprog32 __user *u_fprog32)
+static int ppp_sock_fprog_ioctl_trans(struct file *file,
+		unsigned int cmd, struct sock_fprog32 __user *u_fprog32)
 {
 	struct sock_fprog __user *u_fprog64 = compat_alloc_user_space(sizeof(struct sock_fprog));
 	void __user *fptr64;
@@ -435,7 +456,7 @@ static int ppp_sock_fprog_ioctl_trans(unsigned int fd, unsigned int cmd,
 	else
 		cmd = PPPIOCSACTIVE;
 
-	return sys_ioctl(fd, cmd, (unsigned long) u_fprog64);
+	return do_ioctl(file, cmd, (unsigned long) u_fprog64);
 }
 
 struct ppp_option_data32 {
@@ -451,7 +472,7 @@ struct ppp_idle32 {
 };
 #define PPPIOCGIDLE32		_IOR('t', 63, struct ppp_idle32)
 
-static int ppp_gidle(unsigned int fd, unsigned int cmd,
+static int ppp_gidle(struct file *file, unsigned int cmd,
 		struct ppp_idle32 __user *idle32)
 {
 	struct ppp_idle __user *idle;
@@ -460,7 +481,7 @@ static int ppp_gidle(unsigned int fd, unsigned int cmd,
 
 	idle = compat_alloc_user_space(sizeof(*idle));
 
-	err = sys_ioctl(fd, PPPIOCGIDLE, (unsigned long) idle);
+	err = do_ioctl(file, PPPIOCGIDLE, (unsigned long) idle);
 
 	if (!err) {
 		if (get_user(xmit, &idle->xmit_idle) ||
@@ -472,7 +493,7 @@ static int ppp_gidle(unsigned int fd, unsigned int cmd,
 	return err;
 }
 
-static int ppp_scompress(unsigned int fd, unsigned int cmd,
+static int ppp_scompress(struct file *file, unsigned int cmd,
 	struct ppp_option_data32 __user *odata32)
 {
 	struct ppp_option_data __user *odata;
@@ -492,7 +513,7 @@ static int ppp_scompress(unsigned int fd, unsigned int cmd,
 			 sizeof(__u32) + sizeof(int)))
 		return -EFAULT;
 
-	return sys_ioctl(fd, PPPIOCSCOMPRESS, (unsigned long) odata);
+	return do_ioctl(file, PPPIOCSCOMPRESS, (unsigned long) odata);
 }
 
 #ifdef CONFIG_BLOCK
@@ -512,12 +533,13 @@ struct mtpos32 {
 };
 #define MTIOCPOS32	_IOR('m', 3, struct mtpos32)
 
-static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp)
+static int mt_ioctl_trans(struct file *file,
+		unsigned int cmd, void __user *argp)
 {
-	mm_segment_t old_fs = get_fs();
-	struct mtget get;
+	/* NULL initialization to make gcc shut up */
+	struct mtget __user *get = NULL;
 	struct mtget32 __user *umget32;
-	struct mtpos pos;
+	struct mtpos __user *pos = NULL;
 	struct mtpos32 __user *upos32;
 	unsigned long kcmd;
 	void *karg;
@@ -526,32 +548,34 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp)
 	switch(cmd) {
 	case MTIOCPOS32:
 		kcmd = MTIOCPOS;
-		karg = &pos;
+		pos = compat_alloc_user_space(sizeof(*pos));
+		karg = pos;
 		break;
 	default:	/* MTIOCGET32 */
 		kcmd = MTIOCGET;
-		karg = &get;
+		get = compat_alloc_user_space(sizeof(*get));
+		karg = get;
 		break;
 	}
-	set_fs (KERNEL_DS);
-	err = sys_ioctl (fd, kcmd, (unsigned long)karg);
-	set_fs (old_fs);
+	if (karg == NULL)
+		return -EFAULT;
+	err = do_ioctl(file, kcmd, (unsigned long)karg);
 	if (err)
 		return err;
 	switch (cmd) {
 	case MTIOCPOS32:
 		upos32 = argp;
-		err = __put_user(pos.mt_blkno, &upos32->mt_blkno);
+		err = convert_in_user(&pos->mt_blkno, &upos32->mt_blkno);
 		break;
 	case MTIOCGET32:
 		umget32 = argp;
-		err = __put_user(get.mt_type, &umget32->mt_type);
-		err |= __put_user(get.mt_resid, &umget32->mt_resid);
-		err |= __put_user(get.mt_dsreg, &umget32->mt_dsreg);
-		err |= __put_user(get.mt_gstat, &umget32->mt_gstat);
-		err |= __put_user(get.mt_erreg, &umget32->mt_erreg);
-		err |= __put_user(get.mt_fileno, &umget32->mt_fileno);
-		err |= __put_user(get.mt_blkno, &umget32->mt_blkno);
+		err = convert_in_user(&get->mt_type, &umget32->mt_type);
+		err |= convert_in_user(&get->mt_resid, &umget32->mt_resid);
+		err |= convert_in_user(&get->mt_dsreg, &umget32->mt_dsreg);
+		err |= convert_in_user(&get->mt_gstat, &umget32->mt_gstat);
+		err |= convert_in_user(&get->mt_erreg, &umget32->mt_erreg);
+		err |= convert_in_user(&get->mt_fileno, &umget32->mt_fileno);
+		err |= convert_in_user(&get->mt_blkno, &umget32->mt_blkno);
 		break;
 	}
 	return err ? -EFAULT: 0;
@@ -605,42 +629,41 @@ struct serial_struct32 {
         compat_int_t    reserved[1];
 };
 
-static int serial_struct_ioctl(unsigned fd, unsigned cmd,
-			struct serial_struct32 __user *ss32)
+static int serial_struct_ioctl(struct file *file,
+		unsigned cmd, struct serial_struct32 __user *ss32)
 {
         typedef struct serial_struct32 SS32;
         int err;
-        struct serial_struct ss;
-        mm_segment_t oldseg = get_fs();
+	struct serial_struct __user *ss = compat_alloc_user_space(sizeof(*ss));
         __u32 udata;
 	unsigned int base;
+	unsigned char *iomem_base;
 
+	if (ss == NULL)
+		return -EFAULT;
         if (cmd == TIOCSSERIAL) {
-                if (!access_ok(VERIFY_READ, ss32, sizeof(SS32)))
-                        return -EFAULT;
-                if (__copy_from_user(&ss, ss32, offsetof(SS32, iomem_base)))
-			return -EFAULT;
-                if (__get_user(udata, &ss32->iomem_base))
+		if (copy_in_user(ss, ss32, offsetof(SS32, iomem_base)) ||
+		    get_user(udata, &ss32->iomem_base))
 			return -EFAULT;
-                ss.iomem_base = compat_ptr(udata);
-                if (__get_user(ss.iomem_reg_shift, &ss32->iomem_reg_shift) ||
-		    __get_user(ss.port_high, &ss32->port_high))
+		iomem_base = compat_ptr(udata);
+		if (put_user(iomem_base, &ss->iomem_base) ||
+		    convert_in_user(&ss32->iomem_reg_shift,
+		      &ss->iomem_reg_shift) ||
+		    convert_in_user(&ss32->port_high, &ss->port_high) ||
+		    put_user(0UL, &ss->iomap_base))
 			return -EFAULT;
-                ss.iomap_base = 0UL;
         }
-        set_fs(KERNEL_DS);
-                err = sys_ioctl(fd,cmd,(unsigned long)(&ss));
-        set_fs(oldseg);
+	err = do_ioctl(file, cmd, (unsigned long)ss);
         if (cmd == TIOCGSERIAL && err >= 0) {
-                if (!access_ok(VERIFY_WRITE, ss32, sizeof(SS32)))
-                        return -EFAULT;
-                if (__copy_to_user(ss32,&ss,offsetof(SS32,iomem_base)))
+		if (copy_in_user(ss32, ss, offsetof(SS32, iomem_base)) ||
+		    get_user(iomem_base, &ss->iomem_base))
 			return -EFAULT;
-		base = (unsigned long)ss.iomem_base  >> 32 ?
-			0xffffffff : (unsigned)(unsigned long)ss.iomem_base;
-		if (__put_user(base, &ss32->iomem_base) ||
-		    __put_user(ss.iomem_reg_shift, &ss32->iomem_reg_shift) ||
-		    __put_user(ss.port_high, &ss32->port_high))
+		base = (unsigned long)iomem_base  >> 32 ?
+			0xffffffff : (unsigned)(unsigned long)iomem_base;
+		if (put_user(base, &ss32->iomem_base) ||
+		    convert_in_user(&ss->iomem_reg_shift,
+		      &ss32->iomem_reg_shift) ||
+		    convert_in_user(&ss->port_high, &ss32->port_high))
 			return -EFAULT;
         }
         return err;
@@ -674,8 +697,8 @@ struct i2c_rdwr_aligned {
 	struct i2c_msg msgs[0];
 };
 
-static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd,
-			struct i2c_rdwr_ioctl_data32    __user *udata)
+static int do_i2c_rdwr_ioctl(struct file *file,
+	unsigned int cmd, struct i2c_rdwr_ioctl_data32 __user *udata)
 {
 	struct i2c_rdwr_aligned		__user *tdata;
 	struct i2c_msg			__user *tmsgs;
@@ -708,11 +731,11 @@ static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd,
 		    put_user(compat_ptr(datap), &tmsgs[i].buf))
 			return -EFAULT;
 	}
-	return sys_ioctl(fd, cmd, (unsigned long)tdata);
+	return do_ioctl(file, cmd, (unsigned long)tdata);
 }
 
-static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd,
-			struct i2c_smbus_ioctl_data32   __user *udata)
+static int do_i2c_smbus_ioctl(struct file *file,
+		unsigned int cmd, struct i2c_smbus_ioctl_data32   __user *udata)
 {
 	struct i2c_smbus_ioctl_data	__user *tdata;
 	compat_caddr_t			datap;
@@ -734,7 +757,7 @@ static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd,
 	    __put_user(compat_ptr(datap), &tdata->data))
 		return -EFAULT;
 
-	return sys_ioctl(fd, cmd, (unsigned long)tdata);
+	return do_ioctl(file, cmd, (unsigned long)tdata);
 }
 
 #define RTC_IRQP_READ32		_IOR('p', 0x0b, compat_ulong_t)
@@ -742,29 +765,27 @@ static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd,
 #define RTC_EPOCH_READ32	_IOR('p', 0x0d, compat_ulong_t)
 #define RTC_EPOCH_SET32		_IOW('p', 0x0e, compat_ulong_t)
 
-static int rtc_ioctl(unsigned fd, unsigned cmd, void __user *argp)
+static int rtc_ioctl(struct file *file, unsigned cmd, void __user *argp)
 {
-	mm_segment_t oldfs = get_fs();
-	compat_ulong_t val32;
-	unsigned long kval;
+	unsigned long __user *valp = compat_alloc_user_space(sizeof(*valp));
 	int ret;
 
+	if (valp == NULL)
+		return -EFAULT;
 	switch (cmd) {
 	case RTC_IRQP_READ32:
 	case RTC_EPOCH_READ32:
-		set_fs(KERNEL_DS);
-		ret = sys_ioctl(fd, (cmd == RTC_IRQP_READ32) ?
+		ret = do_ioctl(file, (cmd == RTC_IRQP_READ32) ?
 					RTC_IRQP_READ : RTC_EPOCH_READ,
-					(unsigned long)&kval);
-		set_fs(oldfs);
+					(unsigned long)valp);
 		if (ret)
 			return ret;
-		val32 = kval;
-		return put_user(val32, (unsigned int __user *)argp);
+		return convert_in_user(valp, (unsigned int __user *)argp) ?
+			-EFAULT : 0; /* macro yields 0/1, not an errno */
 	case RTC_IRQP_SET32:
-		return sys_ioctl(fd, RTC_IRQP_SET, (unsigned long)argp);
+		return do_ioctl(file, RTC_IRQP_SET, (unsigned long)argp);
 	case RTC_EPOCH_SET32:
-		return sys_ioctl(fd, RTC_EPOCH_SET, (unsigned long)argp);
+		return do_ioctl(file, RTC_EPOCH_SET, (unsigned long)argp);
 	}
 
 	return -ENOIOCTLCMD;
@@ -1240,6 +1261,9 @@ COMPATIBLE_IOCTL(HCIUNBLOCKADDR)
 COMPATIBLE_IOCTL(HCIINQUIRY)
 COMPATIBLE_IOCTL(HCIUARTSETPROTO)
 COMPATIBLE_IOCTL(HCIUARTGETPROTO)
+COMPATIBLE_IOCTL(HCIUARTGETDEVICE)
+COMPATIBLE_IOCTL(HCIUARTSETFLAGS)
+COMPATIBLE_IOCTL(HCIUARTGETFLAGS)
 COMPATIBLE_IOCTL(RFCOMMCREATEDEV)
 COMPATIBLE_IOCTL(RFCOMMRELEASEDEV)
 COMPATIBLE_IOCTL(RFCOMMGETDEVLIST)
@@ -1284,12 +1308,6 @@ COMPATIBLE_IOCTL(PCIIOC_CONTROLLER)
 COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_IO)
 COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_MEM)
 COMPATIBLE_IOCTL(PCIIOC_WRITE_COMBINE)
-/* NBD */
-COMPATIBLE_IOCTL(NBD_DO_IT)
-COMPATIBLE_IOCTL(NBD_CLEAR_SOCK)
-COMPATIBLE_IOCTL(NBD_CLEAR_QUE)
-COMPATIBLE_IOCTL(NBD_PRINT_DEBUG)
-COMPATIBLE_IOCTL(NBD_DISCONNECT)
 /* i2c */
 COMPATIBLE_IOCTL(I2C_SLAVE)
 COMPATIBLE_IOCTL(I2C_SLAVE_FORCE)
@@ -1436,53 +1454,53 @@ IGNORE_IOCTL(FBIOGCURSOR32)
  * a compat_ioctl operation in the place that handleѕ the
  * ioctl for the native case.
  */
-static long do_ioctl_trans(int fd, unsigned int cmd,
+static long do_ioctl_trans(unsigned int cmd,
 		 unsigned long arg, struct file *file)
 {
 	void __user *argp = compat_ptr(arg);
 
 	switch (cmd) {
 	case PPPIOCGIDLE32:
-		return ppp_gidle(fd, cmd, argp);
+		return ppp_gidle(file, cmd, argp);
 	case PPPIOCSCOMPRESS32:
-		return ppp_scompress(fd, cmd, argp);
+		return ppp_scompress(file, cmd, argp);
 	case PPPIOCSPASS32:
 	case PPPIOCSACTIVE32:
-		return ppp_sock_fprog_ioctl_trans(fd, cmd, argp);
+		return ppp_sock_fprog_ioctl_trans(file, cmd, argp);
 #ifdef CONFIG_BLOCK
 	case SG_IO:
-		return sg_ioctl_trans(fd, cmd, argp);
+		return sg_ioctl_trans(file, cmd, argp);
 	case SG_GET_REQUEST_TABLE:
-		return sg_grt_trans(fd, cmd, argp);
+		return sg_grt_trans(file, cmd, argp);
 	case MTIOCGET32:
 	case MTIOCPOS32:
-		return mt_ioctl_trans(fd, cmd, argp);
+		return mt_ioctl_trans(file, cmd, argp);
 #endif
 	/* Serial */
 	case TIOCGSERIAL:
 	case TIOCSSERIAL:
-		return serial_struct_ioctl(fd, cmd, argp);
+		return serial_struct_ioctl(file, cmd, argp);
 	/* i2c */
 	case I2C_FUNCS:
-		return w_long(fd, cmd, argp);
+		return w_long(file, cmd, argp);
 	case I2C_RDWR:
-		return do_i2c_rdwr_ioctl(fd, cmd, argp);
+		return do_i2c_rdwr_ioctl(file, cmd, argp);
 	case I2C_SMBUS:
-		return do_i2c_smbus_ioctl(fd, cmd, argp);
+		return do_i2c_smbus_ioctl(file, cmd, argp);
 	/* Not implemented in the native kernel */
 	case RTC_IRQP_READ32:
 	case RTC_IRQP_SET32:
 	case RTC_EPOCH_READ32:
 	case RTC_EPOCH_SET32:
-		return rtc_ioctl(fd, cmd, argp);
+		return rtc_ioctl(file, cmd, argp);
 
 	/* dvb */
 	case VIDEO_GET_EVENT:
-		return do_video_get_event(fd, cmd, argp);
+		return do_video_get_event(file, cmd, argp);
 	case VIDEO_STILLPICTURE:
-		return do_video_stillpicture(fd, cmd, argp);
+		return do_video_stillpicture(file, cmd, argp);
 	case VIDEO_SET_SPU_PALETTE:
-		return do_video_set_spu_palette(fd, cmd, argp);
+		return do_video_set_spu_palette(file, cmd, argp);
 	}
 
 	/*
@@ -1508,12 +1526,7 @@ static long do_ioctl_trans(int fd, unsigned int cmd,
 	case KDSKBMETA:
 	case KDSKBLED:
 	case KDSETLED:
-	/* NBD */
-	case NBD_SET_SOCK:
-	case NBD_SET_BLKSIZE:
-	case NBD_SET_SIZE:
-	case NBD_SET_SIZE_BLOCKS:
-		return do_vfs_ioctl(file, fd, cmd, arg);
+		return vfs_ioctl(file, cmd, arg);
 	}
 
 	return -ENOIOCTLCMD;
@@ -1580,6 +1593,11 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
 		goto out_fput;
 #endif
 
+	case FICLONE:
+	case FICLONERANGE:
+	case FIDEDUPERANGE:
+		goto do_ioctl;
+
 	case FIBMAP:
 	case FIGETBSZ:
 	case FIONREAD:
@@ -1602,7 +1620,7 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd,
 	if (compat_ioctl_check_table(XFORM(cmd)))
 		goto found_handler;
 
-	error = do_ioctl_trans(fd, cmd, arg, f.file);
+	error = do_ioctl_trans(cmd, arg, f.file);
 	if (error == -ENOIOCTLCMD)
 		error = -ENOTTY;
 
diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h
index b65d1ef53..ccc31fa6f 100644
--- a/fs/configfs/configfs_internal.h
+++ b/fs/configfs/configfs_internal.h
@@ -53,13 +53,14 @@ struct configfs_dirent {
 #define CONFIGFS_ROOT		0x0001
 #define CONFIGFS_DIR		0x0002
 #define CONFIGFS_ITEM_ATTR	0x0004
+#define CONFIGFS_ITEM_BIN_ATTR	0x0008
 #define CONFIGFS_ITEM_LINK	0x0020
 #define CONFIGFS_USET_DIR	0x0040
 #define CONFIGFS_USET_DEFAULT	0x0080
 #define CONFIGFS_USET_DROPPING	0x0100
 #define CONFIGFS_USET_IN_MKDIR	0x0200
 #define CONFIGFS_USET_CREATING	0x0400
-#define CONFIGFS_NOT_PINNED	(CONFIGFS_ITEM_ATTR)
+#define CONFIGFS_NOT_PINNED	(CONFIGFS_ITEM_ATTR | CONFIGFS_ITEM_BIN_ATTR)
 
 extern struct mutex configfs_symlink_mutex;
 extern spinlock_t configfs_dirent_lock;
@@ -72,6 +73,8 @@ extern struct inode * configfs_new_inode(umode_t mode, struct configfs_dirent *,
 extern int configfs_create(struct dentry *, umode_t mode, void (*init)(struct inode *));
 
 extern int configfs_create_file(struct config_item *, const struct configfs_attribute *);
+extern int configfs_create_bin_file(struct config_item *,
+				    const struct configfs_bin_attribute *);
 extern int configfs_make_dirent(struct configfs_dirent *,
 				struct dentry *, void *, umode_t, int);
 extern int configfs_dirent_is_ready(struct configfs_dirent *);
@@ -88,7 +91,7 @@ extern void configfs_release_fs(void);
 extern struct rw_semaphore configfs_rename_sem;
 extern const struct file_operations configfs_dir_operations;
 extern const struct file_operations configfs_file_operations;
-extern const struct file_operations bin_fops;
+extern const struct file_operations configfs_bin_file_operations;
 extern const struct inode_operations configfs_dir_inode_operations;
 extern const struct inode_operations configfs_root_inode_operations;
 extern const struct inode_operations configfs_symlink_inode_operations;
@@ -119,6 +122,13 @@ static inline struct configfs_attribute * to_attr(struct dentry * dentry)
 	return ((struct configfs_attribute *) sd->s_element);
 }
 
+static inline struct configfs_bin_attribute *to_bin_attr(struct dentry *dentry)
+{
+	struct configfs_attribute *attr = to_attr(dentry);
+
+	return container_of(attr, struct configfs_bin_attribute, cb_attr);
+}
+
 static inline struct config_item *configfs_get_config_item(struct dentry *dentry)
 {
 	struct config_item * item = NULL;
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index a7a1b218f..f419519ec 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -255,6 +255,12 @@ static void configfs_init_file(struct inode * inode)
 	inode->i_fop = &configfs_file_operations;
 }
 
+static void configfs_init_bin_file(struct inode *inode)
+{
+	inode->i_size = 0;
+	inode->i_fop = &configfs_bin_file_operations;
+}
+
 static void init_symlink(struct inode * inode)
 {
 	inode->i_op = &configfs_symlink_inode_operations;
@@ -423,7 +429,9 @@ static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * den
 	spin_unlock(&configfs_dirent_lock);
 
 	error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG,
-				configfs_init_file);
+				(sd->s_type & CONFIGFS_ITEM_BIN_ATTR) ?
+					configfs_init_bin_file :
+					configfs_init_file);
 	if (error) {
 		configfs_put(sd);
 		return error;
@@ -583,6 +591,7 @@ static int populate_attrs(struct config_item *item)
 {
 	struct config_item_type *t = item->ci_type;
 	struct configfs_attribute *attr;
+	struct configfs_bin_attribute *bin_attr;
 	int error = 0;
 	int i;
 
@@ -594,6 +603,13 @@ static int populate_attrs(struct config_item *item)
 				break;
 		}
 	}
+	if (t->ct_bin_attrs) {
+		for (i = 0; (bin_attr = t->ct_bin_attrs[i]) != NULL; i++) {
+			error = configfs_create_bin_file(item, bin_attr);
+			if (error)
+				break;
+		}
+	}
 
 	if (error)
 		detach_attrs(item);
@@ -624,13 +640,13 @@ static void detach_groups(struct config_group *group)
 
 		child = sd->s_dentry;
 
-		mutex_lock(&d_inode(child)->i_mutex);
+		inode_lock(d_inode(child));
 
 		configfs_detach_group(sd->s_element);
 		d_inode(child)->i_flags |= S_DEAD;
 		dont_mount(child);
 
-		mutex_unlock(&d_inode(child)->i_mutex);
+		inode_unlock(d_inode(child));
 
 		d_delete(child);
 		dput(child);
@@ -818,11 +834,11 @@ static int configfs_attach_item(struct config_item *parent_item,
 			 * the VFS may already have hit and used them. Thus,
 			 * we must lock them as rmdir() would.
 			 */
-			mutex_lock(&d_inode(dentry)->i_mutex);
+			inode_lock(d_inode(dentry));
 			configfs_remove_dir(item);
 			d_inode(dentry)->i_flags |= S_DEAD;
 			dont_mount(dentry);
-			mutex_unlock(&d_inode(dentry)->i_mutex);
+			inode_unlock(d_inode(dentry));
 			d_delete(dentry);
 		}
 	}
@@ -858,7 +874,7 @@ static int configfs_attach_group(struct config_item *parent_item,
 		 * We must also lock the inode to remove it safely in case of
 		 * error, as rmdir() would.
 		 */
-		mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD);
+		inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD);
 		configfs_adjust_dir_dirent_depth_before_populate(sd);
 		ret = populate_groups(to_config_group(item));
 		if (ret) {
@@ -867,7 +883,7 @@ static int configfs_attach_group(struct config_item *parent_item,
 			dont_mount(dentry);
 		}
 		configfs_adjust_dir_dirent_depth_after_populate(sd);
-		mutex_unlock(&d_inode(dentry)->i_mutex);
+		inode_unlock(d_inode(dentry));
 		if (ret)
 			d_delete(dentry);
 	}
@@ -1054,11 +1070,55 @@ out:
 	return ret;
 }
 
+static int configfs_do_depend_item(struct dentry *subsys_dentry,
+				   struct config_item *target)
+{
+	struct configfs_dirent *p;
+	int ret;
+
+	spin_lock(&configfs_dirent_lock);
+	/* Scan the tree, return 0 if found */
+	ret = configfs_depend_prep(subsys_dentry, target);
+	if (ret)
+		goto out_unlock_dirent_lock;
+
+	/*
+	 * We are sure that the item is not about to be removed by rmdir(), and
+	 * not in the middle of attachment by mkdir().
+	 */
+	p = target->ci_dentry->d_fsdata;
+	p->s_dependent_count += 1;
+
+out_unlock_dirent_lock:
+	spin_unlock(&configfs_dirent_lock);
+
+	return ret;
+}
+
+static inline struct configfs_dirent *
+configfs_find_subsys_dentry(struct configfs_dirent *root_sd,
+			    struct config_item *subsys_item)
+{
+	struct configfs_dirent *p;
+	struct configfs_dirent *ret = NULL;
+
+	list_for_each_entry(p, &root_sd->s_children, s_sibling) {
+		if (p->s_type & CONFIGFS_DIR &&
+		    p->s_element == subsys_item) {
+			ret = p;
+			break;
+		}
+	}
+
+	return ret;
+}
+
+
 int configfs_depend_item(struct configfs_subsystem *subsys,
 			 struct config_item *target)
 {
 	int ret;
-	struct configfs_dirent *p, *root_sd, *subsys_sd = NULL;
+	struct configfs_dirent *subsys_sd;
 	struct config_item *s_item = &subsys->su_group.cg_item;
 	struct dentry *root;
 
@@ -1075,43 +1135,19 @@ int configfs_depend_item(struct configfs_subsystem *subsys,
 	 * subsystem is really registered, and so we need to lock out
 	 * configfs_[un]register_subsystem().
 	 */
-	mutex_lock(&d_inode(root)->i_mutex);
-
-	root_sd = root->d_fsdata;
-
-	list_for_each_entry(p, &root_sd->s_children, s_sibling) {
-		if (p->s_type & CONFIGFS_DIR) {
-			if (p->s_element == s_item) {
-				subsys_sd = p;
-				break;
-			}
-		}
-	}
+	inode_lock(d_inode(root));
 
+	subsys_sd = configfs_find_subsys_dentry(root->d_fsdata, s_item);
 	if (!subsys_sd) {
 		ret = -ENOENT;
 		goto out_unlock_fs;
 	}
 
 	/* Ok, now we can trust subsys/s_item */
+	ret = configfs_do_depend_item(subsys_sd->s_dentry, target);
 
-	spin_lock(&configfs_dirent_lock);
-	/* Scan the tree, return 0 if found */
-	ret = configfs_depend_prep(subsys_sd->s_dentry, target);
-	if (ret)
-		goto out_unlock_dirent_lock;
-
-	/*
-	 * We are sure that the item is not about to be removed by rmdir(), and
-	 * not in the middle of attachment by mkdir().
-	 */
-	p = target->ci_dentry->d_fsdata;
-	p->s_dependent_count += 1;
-
-out_unlock_dirent_lock:
-	spin_unlock(&configfs_dirent_lock);
 out_unlock_fs:
-	mutex_unlock(&d_inode(root)->i_mutex);
+	inode_unlock(d_inode(root));
 
 	/*
 	 * If we succeeded, the fs is pinned via other methods.  If not,
@@ -1128,8 +1164,7 @@ EXPORT_SYMBOL(configfs_depend_item);
  * configfs_depend_item() because we know that that the client driver is
  * pinned, thus the subsystem is pinned, and therefore configfs is pinned.
  */
-void configfs_undepend_item(struct configfs_subsystem *subsys,
-			    struct config_item *target)
+void configfs_undepend_item(struct config_item *target)
 {
 	struct configfs_dirent *sd;
 
@@ -1152,6 +1187,79 @@ void configfs_undepend_item(struct configfs_subsystem *subsys,
 }
 EXPORT_SYMBOL(configfs_undepend_item);
 
+/*
+ * caller_subsys is the caller's subsystem, not the target's. It is used to
+ * determine whether we must lock the root and check the subsystem. When we
+ * are in the same subsystem as our target there is no need for locking: we
+ * know that the subsystem is valid and cannot be unregistered during this
+ * function, as we are called from a callback of one of its children and the
+ * VFS holds a lock on some inode. Otherwise we have to lock our root to
+ * ensure that the target's subsystem is not unregistered meanwhile.
+ */
+int configfs_depend_item_unlocked(struct configfs_subsystem *caller_subsys,
+				  struct config_item *target)
+{
+	struct configfs_subsystem *target_subsys;
+	struct config_group *root, *parent;
+	struct configfs_dirent *subsys_sd;
+	int ret = -ENOENT;
+
+	/* Disallow this function for configfs root */
+	if (configfs_is_root(target))
+		return -EINVAL;
+
+	parent = target->ci_group;
+	/*
+	 * This may happen when someone is trying to depend root
+	 * directory of some subsystem
+	 */
+	if (configfs_is_root(&parent->cg_item)) {
+		target_subsys = to_configfs_subsystem(to_config_group(target));
+		root = parent;
+	} else {
+		target_subsys = parent->cg_subsys;
+		/* Find a configfs root as we may need it for locking */
+		for (root = parent; !configfs_is_root(&root->cg_item);
+		     root = root->cg_item.ci_group)
+			;
+	}
+
+	if (target_subsys != caller_subsys) {
+		/*
+		 * We are in other configfs subsystem, so we have to do
+		 * additional locking to prevent other subsystem from being
+		 * unregistered
+		 */
+		inode_lock(d_inode(root->cg_item.ci_dentry));
+
+		/*
+		 * As we are trying to depend item from other subsystem
+		 * we have to check if this subsystem is still registered
+		 */
+		subsys_sd = configfs_find_subsys_dentry(
+				root->cg_item.ci_dentry->d_fsdata,
+				&target_subsys->su_group.cg_item);
+		if (!subsys_sd)
+			goto out_root_unlock;
+	} else {
+		subsys_sd = target_subsys->su_group.cg_item.ci_dentry->d_fsdata;
+	}
+
+	/* Now we can execute core of depend item */
+	ret = configfs_do_depend_item(subsys_sd->s_dentry, target);
+
+	if (target_subsys != caller_subsys)
+out_root_unlock:
+		/*
+		 * We were called from subsystem other than our target so we
+		 * took some locks so now it's time to release them
+		 */
+		inode_unlock(d_inode(root->cg_item.ci_dentry));
+
+	return ret;
+}
+EXPORT_SYMBOL(configfs_depend_item_unlocked);
+
 static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	int ret = 0;
@@ -1453,7 +1561,7 @@ int configfs_rename_dir(struct config_item * item, const char *new_name)
 	down_write(&configfs_rename_sem);
 	parent = item->parent->dentry;
 
-	mutex_lock(&d_inode(parent)->i_mutex);
+	inode_lock(d_inode(parent));
 
 	new_dentry = lookup_one_len(new_name, parent, strlen(new_name));
 	if (!IS_ERR(new_dentry)) {
@@ -1469,7 +1577,7 @@ int configfs_rename_dir(struct config_item * item, const char *new_name)
 			error = -EEXIST;
 		dput(new_dentry);
 	}
-	mutex_unlock(&d_inode(parent)->i_mutex);
+	inode_unlock(d_inode(parent));
 	up_write(&configfs_rename_sem);
 
 	return error;
@@ -1482,7 +1590,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file)
 	struct configfs_dirent * parent_sd = dentry->d_fsdata;
 	int err;
 
-	mutex_lock(&d_inode(dentry)->i_mutex);
+	inode_lock(d_inode(dentry));
 	/*
 	 * Fake invisibility if dir belongs to a group/default groups hierarchy
 	 * being attached
@@ -1495,7 +1603,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file)
 		else
 			err = 0;
 	}
-	mutex_unlock(&d_inode(dentry)->i_mutex);
+	inode_unlock(d_inode(dentry));
 
 	return err;
 }
@@ -1505,11 +1613,11 @@ static int configfs_dir_close(struct inode *inode, struct file *file)
 	struct dentry * dentry = file->f_path.dentry;
 	struct configfs_dirent * cursor = file->private_data;
 
-	mutex_lock(&d_inode(dentry)->i_mutex);
+	inode_lock(d_inode(dentry));
 	spin_lock(&configfs_dirent_lock);
 	list_del_init(&cursor->s_sibling);
 	spin_unlock(&configfs_dirent_lock);
-	mutex_unlock(&d_inode(dentry)->i_mutex);
+	inode_unlock(d_inode(dentry));
 
 	release_configfs_dirent(cursor);
 
@@ -1590,7 +1698,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence)
 {
 	struct dentry * dentry = file->f_path.dentry;
 
-	mutex_lock(&d_inode(dentry)->i_mutex);
+	inode_lock(d_inode(dentry));
 	switch (whence) {
 		case 1:
 			offset += file->f_pos;
@@ -1598,7 +1706,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence)
 			if (offset >= 0)
 				break;
 		default:
-			mutex_unlock(&d_inode(dentry)->i_mutex);
+			inode_unlock(d_inode(dentry));
 			return -EINVAL;
 	}
 	if (offset != file->f_pos) {
@@ -1624,7 +1732,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence)
 			spin_unlock(&configfs_dirent_lock);
 		}
 	}
-	mutex_unlock(&d_inode(dentry)->i_mutex);
+	inode_unlock(d_inode(dentry));
 	return offset;
 }
 
@@ -1659,14 +1767,14 @@ int configfs_register_group(struct config_group *parent_group,
 
 	parent = parent_group->cg_item.ci_dentry;
 
-	mutex_lock_nested(&d_inode(parent)->i_mutex, I_MUTEX_PARENT);
+	inode_lock_nested(d_inode(parent), I_MUTEX_PARENT);
 	ret = create_default_group(parent_group, group);
 	if (!ret) {
 		spin_lock(&configfs_dirent_lock);
 		configfs_dir_set_ready(group->cg_item.ci_dentry->d_fsdata);
 		spin_unlock(&configfs_dirent_lock);
 	}
-	mutex_unlock(&d_inode(parent)->i_mutex);
+	inode_unlock(d_inode(parent));
 	return ret;
 }
 EXPORT_SYMBOL(configfs_register_group);
@@ -1683,7 +1791,7 @@ void configfs_unregister_group(struct config_group *group)
 	struct dentry *dentry = group->cg_item.ci_dentry;
 	struct dentry *parent = group->cg_item.ci_parent->ci_dentry;
 
-	mutex_lock_nested(&d_inode(parent)->i_mutex, I_MUTEX_PARENT);
+	inode_lock_nested(d_inode(parent), I_MUTEX_PARENT);
 	spin_lock(&configfs_dirent_lock);
 	configfs_detach_prep(dentry, NULL);
 	spin_unlock(&configfs_dirent_lock);
@@ -1692,7 +1800,7 @@ void configfs_unregister_group(struct config_group *group)
 	d_inode(dentry)->i_flags |= S_DEAD;
 	dont_mount(dentry);
 	d_delete(dentry);
-	mutex_unlock(&d_inode(parent)->i_mutex);
+	inode_unlock(d_inode(parent));
 
 	dput(dentry);
 
@@ -1764,7 +1872,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
 	sd = root->d_fsdata;
 	link_group(to_config_group(sd->s_element), group);
 
-	mutex_lock_nested(&d_inode(root)->i_mutex, I_MUTEX_PARENT);
+	inode_lock_nested(d_inode(root), I_MUTEX_PARENT);
 
 	err = -ENOMEM;
 	dentry = d_alloc_name(root, group->cg_item.ci_name);
@@ -1784,7 +1892,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys)
 		}
 	}
 
-	mutex_unlock(&d_inode(root)->i_mutex);
+	inode_unlock(d_inode(root));
 
 	if (err) {
 		unlink_group(group);
@@ -1805,9 +1913,9 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
 		return;
 	}
 
-	mutex_lock_nested(&d_inode(root)->i_mutex,
+	inode_lock_nested(d_inode(root),
 			  I_MUTEX_PARENT);
-	mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD);
+	inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD);
 	mutex_lock(&configfs_symlink_mutex);
 	spin_lock(&configfs_dirent_lock);
 	if (configfs_detach_prep(dentry, NULL)) {
@@ -1818,11 +1926,11 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys)
 	configfs_detach_group(&group->cg_item);
 	d_inode(dentry)->i_flags |= S_DEAD;
 	dont_mount(dentry);
-	mutex_unlock(&d_inode(dentry)->i_mutex);
+	inode_unlock(d_inode(dentry));
 
 	d_delete(dentry);
 
-	mutex_unlock(&d_inode(root)->i_mutex);
+	inode_unlock(d_inode(root));
 
 	dput(dentry);
 
diff --git a/fs/configfs/file.c b/fs/configfs/file.c
index d39099ea7..33b7ee34e 100644
--- a/fs/configfs/file.c
+++ b/fs/configfs/file.c
@@ -28,6 +28,7 @@
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/mutex.h>
+#include <linux/vmalloc.h>
 #include <asm/uaccess.h>
 
 #include <linux/configfs.h>
@@ -48,6 +49,10 @@ struct configfs_buffer {
 	struct configfs_item_operations	* ops;
 	struct mutex		mutex;
 	int			needs_read_fill;
+	bool			read_in_progress;
+	bool			write_in_progress;
+	char			*bin_buffer;
+	int			bin_buffer_size;
 };
 
 
@@ -123,6 +128,87 @@ out:
 	return retval;
 }
 
+/**
+ *	configfs_read_bin_file - read a binary attribute.
+ *	@file:	file pointer.
+ *	@buf:	buffer to fill.
+ *	@count:	number of bytes to read.
+ *	@ppos:	starting offset in file.
+ *
+ *	Userspace wants to read a binary attribute file. The attribute
+ *	descriptor is in the file's ->d_fsdata. The target item is in the
+ *	directory's ->d_fsdata.
+ *
+ *	We check whether we need to refill the buffer. If so we will
+ *	call the attributes' attr->read() twice. The first time we
+ *	will pass a NULL as a buffer pointer, which the attributes' method
+ *	will use to return the size of the buffer required. If no error
+ *	occurs we will allocate a zeroed buffer using vzalloc and call
+ *	attr->read() again passing that buffer as an argument.
+ *	Then we just copy to user-space using simple_read_from_buffer.
+ */
+static ssize_t
+configfs_read_bin_file(struct file *file, char __user *buf,
+		       size_t count, loff_t *ppos)
+{
+	struct configfs_buffer *buffer = file->private_data;
+	struct dentry *dentry = file->f_path.dentry;
+	struct config_item *item = to_item(dentry->d_parent);
+	struct configfs_bin_attribute *bin_attr = to_bin_attr(dentry);
+	ssize_t retval = 0;
+	ssize_t len;
+
+	mutex_lock(&buffer->mutex);
+
+	/* we don't support switching read/write modes */
+	if (buffer->write_in_progress) {
+		retval = -ETXTBSY;
+		goto out;
+	}
+	buffer->read_in_progress = 1;
+
+	if (buffer->needs_read_fill) {
+		/* perform first read with buf == NULL to get extent */
+		len = bin_attr->read(item, NULL, 0);
+		if (len <= 0) {
+			retval = len;
+			goto out;
+		}
+
+		/* do not exceed the maximum value */
+		if (bin_attr->cb_max_size && len > bin_attr->cb_max_size) {
+			retval = -EFBIG;
+			goto out;
+		}
+
+		/* zeroed: the whole buffer is copied to userspace below */
+		buffer->bin_buffer = vzalloc(len);
+		if (buffer->bin_buffer == NULL) {
+			retval = -ENOMEM;
+			goto out;
+		}
+		buffer->bin_buffer_size = len;
+
+		/* perform second read to fill buffer */
+		len = bin_attr->read(item, buffer->bin_buffer, len);
+		if (len < 0) {
+			retval = len;
+			vfree(buffer->bin_buffer);
+			buffer->bin_buffer_size = 0;
+			buffer->bin_buffer = NULL;
+			goto out;
+		}
+
+		buffer->needs_read_fill = 0;
+	}
+
+	retval = simple_read_from_buffer(buf, count, ppos, buffer->bin_buffer,
+					buffer->bin_buffer_size);
+out:
+	mutex_unlock(&buffer->mutex);
+	return retval;
+}
+
 
 /**
  *	fill_write_buffer - copy buffer from userspace.
@@ -209,10 +295,80 @@ configfs_write_file(struct file *file, const char __user *buf, size_t count, lof
 	return len;
 }
 
-static int check_perm(struct inode * inode, struct file * file)
+/**
+ *	configfs_write_bin_file - write a binary attribute.
+ *	@file:	file pointer
+ *	@buf:	data to write
+ *	@count:	number of bytes
+ *	@ppos:	starting offset
+ *
+ *	Writing to a binary attribute file is similar to a normal write.
+ *	We buffer the consecutive writes (binary attribute files do not
+ *	support lseek) in a continuously growing buffer, but we don't
+ *	commit until the close of the file.
+ *
+ *	Returns the number of bytes buffered, or a negative errno.
+ */
+static ssize_t
+configfs_write_bin_file(struct file *file, const char __user *buf,
+			size_t count, loff_t *ppos)
+{
+	struct configfs_buffer *buffer = file->private_data;
+	struct dentry *dentry = file->f_path.dentry;
+	struct configfs_bin_attribute *bin_attr = to_bin_attr(dentry);
+	void *tbuf = NULL;
+	ssize_t len;
+
+	mutex_lock(&buffer->mutex);
+
+	/* we don't support switching read/write modes */
+	if (buffer->read_in_progress) {
+		len = -ETXTBSY;
+		goto out;
+	}
+	buffer->write_in_progress = 1;
+
+	/* buffer grows? */
+	if (*ppos + count > buffer->bin_buffer_size) {
+		if (bin_attr->cb_max_size &&
+			*ppos + count > bin_attr->cb_max_size) {
+			len = -EFBIG;
+			goto out;
+		}
+
+		tbuf = vmalloc(*ppos + count);
+		if (tbuf == NULL) {
+			len = -ENOMEM;
+			goto out;
+		}
+
+		/* copy old contents */
+		if (buffer->bin_buffer) {
+			memcpy(tbuf, buffer->bin_buffer,
+				buffer->bin_buffer_size);
+			vfree(buffer->bin_buffer);
+		}
+
+		/* clear the new area */
+		memset(tbuf + buffer->bin_buffer_size, 0,
+			*ppos + count - buffer->bin_buffer_size);
+		buffer->bin_buffer = tbuf;
+		buffer->bin_buffer_size = *ppos + count;
+	}
+
+	/* simple_write_to_buffer() advances *ppos itself */
+	len = simple_write_to_buffer(buffer->bin_buffer,
+			buffer->bin_buffer_size, ppos, buf, count);
+out:
+	mutex_unlock(&buffer->mutex);
+	return len;
+}
+
+static int check_perm(struct inode * inode, struct file * file, int type)
 {
 	struct config_item *item = configfs_get_config_item(file->f_path.dentry->d_parent);
 	struct configfs_attribute * attr = to_attr(file->f_path.dentry);
+	struct configfs_bin_attribute *bin_attr = NULL;
 	struct configfs_buffer * buffer;
 	struct configfs_item_operations * ops = NULL;
 	int error = 0;
@@ -220,6 +376,9 @@ static int check_perm(struct inode * inode, struct file * file)
 	if (!item || !attr)
 		goto Einval;
 
+	if (type & CONFIGFS_ITEM_BIN_ATTR)
+		bin_attr = to_bin_attr(file->f_path.dentry);
+
 	/* Grab the module reference for this attribute if we have one */
 	if (!try_module_get(attr->ca_owner)) {
 		error = -ENODEV;
@@ -236,9 +395,14 @@ static int check_perm(struct inode * inode, struct file * file)
 	 * and we must have a store method.
 	 */
 	if (file->f_mode & FMODE_WRITE) {
-		if (!(inode->i_mode & S_IWUGO) || !attr->store)
+		if (!(inode->i_mode & S_IWUGO))
+			goto Eaccess;
+
+		if ((type & CONFIGFS_ITEM_ATTR) && !attr->store)
 			goto Eaccess;
 
+		if ((type & CONFIGFS_ITEM_BIN_ATTR) && !bin_attr->write)
+			goto Eaccess;
 	}
 
 	/* File needs read support.
@@ -246,7 +410,13 @@ static int check_perm(struct inode * inode, struct file * file)
 	 * must be a show method for it.
 	 */
 	if (file->f_mode & FMODE_READ) {
-		if (!(inode->i_mode & S_IRUGO) || !attr->show)
+		if (!(inode->i_mode & S_IRUGO))
+			goto Eaccess;
+
+		if ((type & CONFIGFS_ITEM_ATTR) && !attr->show)
+			goto Eaccess;
+
+		if ((type & CONFIGFS_ITEM_BIN_ATTR) && !bin_attr->read)
 			goto Eaccess;
 	}
 
@@ -260,6 +430,8 @@ static int check_perm(struct inode * inode, struct file * file)
 	}
 	mutex_init(&buffer->mutex);
 	buffer->needs_read_fill = 1;
+	buffer->read_in_progress = 0;
+	buffer->write_in_progress = 0;
 	buffer->ops = ops;
 	file->private_data = buffer;
 	goto Done;
@@ -277,12 +449,7 @@ static int check_perm(struct inode * inode, struct file * file)
 	return error;
 }
 
-static int configfs_open_file(struct inode * inode, struct file * filp)
-{
-	return check_perm(inode,filp);
-}
-
-static int configfs_release(struct inode * inode, struct file * filp)
+static int configfs_release(struct inode *inode, struct file *filp)
 {
 	struct config_item * item = to_item(filp->f_path.dentry->d_parent);
 	struct configfs_attribute * attr = to_attr(filp->f_path.dentry);
@@ -303,6 +470,47 @@ static int configfs_release(struct inode * inode, struct file * filp)
 	return 0;
 }
 
+static int configfs_open_file(struct inode *inode, struct file *filp)
+{
+	return check_perm(inode, filp, CONFIGFS_ITEM_ATTR);
+}
+
+static int configfs_open_bin_file(struct inode *inode, struct file *filp)
+{
+	return check_perm(inode, filp, CONFIGFS_ITEM_BIN_ATTR);
+}
+
+static int configfs_release_bin_file(struct inode *inode, struct file *filp)
+{
+	struct configfs_buffer *buffer = filp->private_data;
+	struct dentry *dentry = filp->f_path.dentry;
+	struct config_item *item = to_item(dentry->d_parent);
+	struct configfs_bin_attribute *bin_attr = to_bin_attr(dentry);
+	ssize_t len = 0;
+	int ret;
+
+	buffer->read_in_progress = 0;
+
+	if (buffer->write_in_progress) {
+		buffer->write_in_progress = 0;
+
+		len = bin_attr->write(item, buffer->bin_buffer,
+				buffer->bin_buffer_size);
+
+		/* vfree on NULL is safe */
+		vfree(buffer->bin_buffer);
+		buffer->bin_buffer = NULL;
+		buffer->bin_buffer_size = 0;
+		buffer->needs_read_fill = 1;
+	}
+
+	ret = configfs_release(inode, filp);
+	if (len < 0)
+		return len;
+	return ret;
+}
+
+
 const struct file_operations configfs_file_operations = {
 	.read		= configfs_read_file,
 	.write		= configfs_write_file,
@@ -311,6 +519,14 @@ const struct file_operations configfs_file_operations = {
 	.release	= configfs_release,
 };
 
+const struct file_operations configfs_bin_file_operations = {
+	.read		= configfs_read_bin_file,
+	.write		= configfs_write_bin_file,
+	.llseek		= NULL,		/* bin file is not seekable */
+	.open		= configfs_open_bin_file,
+	.release	= configfs_release_bin_file,
+};
+
 /**
  *	configfs_create_file - create an attribute file for an item.
  *	@item:	item we're creating for.
@@ -324,11 +540,32 @@ int configfs_create_file(struct config_item * item, const struct configfs_attrib
 	umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG;
 	int error = 0;
 
-	mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_NORMAL);
+	inode_lock_nested(d_inode(dir), I_MUTEX_NORMAL);
 	error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode,
 				     CONFIGFS_ITEM_ATTR);
-	mutex_unlock(&d_inode(dir)->i_mutex);
+	inode_unlock(d_inode(dir));
 
 	return error;
 }
 
+/**
+ *	configfs_create_bin_file - create a binary attribute file for an item.
+ *	@item:	item we're creating for.
+ *	@bin_attr:	binary attribute descriptor.
+ */
+
+int configfs_create_bin_file(struct config_item *item,
+		const struct configfs_bin_attribute *bin_attr)
+{
+	struct dentry *dir = item->ci_dentry;
+	struct configfs_dirent *parent_sd = dir->d_fsdata;
+	umode_t mode = (bin_attr->cb_attr.ca_mode & S_IALLUGO) | S_IFREG;
+	int error = 0;
+
+	inode_lock_nested(d_inode(dir), I_MUTEX_NORMAL);
+	error = configfs_make_dirent(parent_sd, NULL, (void *) bin_attr, mode,
+				     CONFIGFS_ITEM_BIN_ATTR);
+	inode_unlock(d_inode(dir));
+
+	return error;
+}
diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c
index eae87575e..cee087d8f 100644
--- a/fs/configfs/inode.c
+++ b/fs/configfs/inode.c
@@ -218,7 +218,7 @@ const unsigned char * configfs_get_name(struct configfs_dirent *sd)
 	if (sd->s_type & (CONFIGFS_DIR | CONFIGFS_ITEM_LINK))
 		return sd->s_dentry->d_name.name;
 
-	if (sd->s_type & CONFIGFS_ITEM_ATTR) {
+	if (sd->s_type & (CONFIGFS_ITEM_ATTR | CONFIGFS_ITEM_BIN_ATTR)) {
 		attr = sd->s_element;
 		return attr->ca_name;
 	}
@@ -255,7 +255,7 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name)
 		/* no inode means this hasn't been made visible yet */
 		return;
 
-	mutex_lock(&d_inode(dir)->i_mutex);
+	inode_lock(d_inode(dir));
 	list_for_each_entry(sd, &parent_sd->s_children, s_sibling) {
 		if (!sd->s_element)
 			continue;
@@ -268,5 +268,5 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name)
 			break;
 		}
 	}
-	mutex_unlock(&d_inode(dir)->i_mutex);
+	inode_unlock(d_inode(dir));
 }
diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c
index ec5c8325b..db6d69289 100644
--- a/fs/configfs/symlink.c
+++ b/fs/configfs/symlink.c
@@ -279,27 +279,33 @@ static int configfs_getlink(struct dentry *dentry, char * path)
 
 }
 
-static const char *configfs_follow_link(struct dentry *dentry, void **cookie)
+static const char *configfs_get_link(struct dentry *dentry,
+				     struct inode *inode,
+				     struct delayed_call *done)
 {
-	unsigned long page = get_zeroed_page(GFP_KERNEL);
+	char *body;
 	int error;
 
-	if (!page)
+	if (!dentry)
+		return ERR_PTR(-ECHILD);
+
+	body = kzalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!body)
 		return ERR_PTR(-ENOMEM);
 
-	error = configfs_getlink(dentry, (char *)page);
+	error = configfs_getlink(dentry, body);
 	if (!error) {
-		return *cookie = (void *)page;
+		set_delayed_call(done, kfree_link, body);
+		return body;
 	}
 
-	free_page(page);
+	kfree(body);
 	return ERR_PTR(error);
 }
 
 const struct inode_operations configfs_symlink_inode_operations = {
-	.follow_link = configfs_follow_link,
+	.get_link = configfs_get_link,
 	.readlink = generic_readlink,
-	.put_link = free_page_put_link,
 	.setattr = configfs_setattr,
 };
 
diff --git a/fs/coredump.c b/fs/coredump.c
index 1777331ee..9ea87e9fd 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -32,6 +32,7 @@
 #include <linux/pipe_fs_i.h>
 #include <linux/oom.h>
 #include <linux/compat.h>
+#include <linux/timekeeping.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -117,6 +118,26 @@ int cn_esc_printf(struct core_name *cn, const char *fmt, ...)
 	ret = cn_vprintf(cn, fmt, arg);
 	va_end(arg);
 
+	if (ret == 0) {
+		/*
+		 * Ensure that this coredump name component can't cause the
+		 * resulting corefile path to consist of a ".." or ".".
+		 */
+		if ((cn->used - cur == 1 && cn->corename[cur] == '.') ||
+				(cn->used - cur == 2 && cn->corename[cur] == '.'
+				&& cn->corename[cur+1] == '.'))
+			cn->corename[cur] = '!';
+
+		/*
+		 * Empty names are fishy and could be used to create a "//" in a
+		 * corefile name, causing the coredump to happen one directory
+		 * level too high. Enforce that all components of the core
+		 * pattern are at least one character long.
+		 */
+		if (cn->used == cur)
+			ret = cn_printf(cn, "!");
+	}
+
 	for (; cur < cn->used; ++cur) {
 		if (cn->corename[cur] == '/')
 			cn->corename[cur] = '!';
@@ -232,9 +253,10 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm)
 				break;
 			/* UNIX time of coredump */
 			case 't': {
-				struct timeval tv;
-				do_gettimeofday(&tv);
-				err = cn_printf(cn, "%lu", tv.tv_sec);
+				time64_t time;
+
+				time = ktime_get_real_seconds();
+				err = cn_printf(cn, "%lld", time);
 				break;
 			}
 			/* hostname */
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index 355c522f3..b862bc219 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -100,6 +100,7 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
 		break;
 	case S_IFLNK:
 		inode->i_op = &page_symlink_inode_operations;
+		inode_nohighmem(inode);
 		inode->i_data.a_ops = &cramfs_aops;
 		break;
 	default:
diff --git a/fs/dax.c b/fs/dax.c
index 43671b682..bbb2ad783 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -24,57 +24,91 @@
 #include <linux/memcontrol.h>
 #include <linux/mm.h>
 #include <linux/mutex.h>
+#include <linux/pagevec.h>
 #include <linux/pmem.h>
 #include <linux/sched.h>
 #include <linux/uio.h>
 #include <linux/vmstat.h>
+#include <linux/pfn_t.h>
+#include <linux/sizes.h>
+
+static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax)
+{
+	struct request_queue *q = bdev->bd_queue;
+	long rc = -EIO;
+
+	dax->addr = (void __pmem *) ERR_PTR(-EIO);
+	if (blk_queue_enter(q, true) != 0)
+		return rc;
+
+	rc = bdev_direct_access(bdev, dax);
+	if (rc < 0) {
+		dax->addr = (void __pmem *) ERR_PTR(rc);
+		blk_queue_exit(q);
+		return rc;
+	}
+	return rc;
+}
+
+static void dax_unmap_atomic(struct block_device *bdev,
+		const struct blk_dax_ctl *dax)
+{
+	if (IS_ERR(dax->addr))
+		return;
+	blk_queue_exit(bdev->bd_queue);
+}
+
+struct page *read_dax_sector(struct block_device *bdev, sector_t n)
+{
+	struct page *page = alloc_pages(GFP_KERNEL, 0);
+	struct blk_dax_ctl dax = {
+		.size = PAGE_SIZE,
+		.sector = n & ~((((int) PAGE_SIZE) / 512) - 1),
+	};
+
+	if (!page)
+		return ERR_PTR(-ENOMEM);
+	/* On failure dax.addr holds the ERR_PTR; free our page, don't leak it */
+	if (dax_map_atomic(bdev, &dax) < 0) {
+		__free_page(page);
+		return ERR_PTR(PTR_ERR(dax.addr));
+	}
+	memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE);
+	dax_unmap_atomic(bdev, &dax);
+	return page;
+}
 
 /*
- * dax_clear_blocks() is called from within transaction context from XFS,
+ * dax_clear_sectors() is called from within transaction context from XFS,
  * and hence this means the stack from this point must follow GFP_NOFS
  * semantics for all operations.
  */
-int dax_clear_blocks(struct inode *inode, sector_t block, long size)
+int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size)
 {
-	struct block_device *bdev = inode->i_sb->s_bdev;
-	sector_t sector = block << (inode->i_blkbits - 9);
+	struct blk_dax_ctl dax = {
+		.sector = _sector,
+		.size = _size,
+	};
 
 	might_sleep();
 	do {
-		void __pmem *addr;
-		unsigned long pfn;
-		long count;
+		long count, sz;
 
-		count = bdev_direct_access(bdev, sector, &addr, &pfn, size);
+		count = dax_map_atomic(bdev, &dax);
 		if (count < 0)
 			return count;
-		BUG_ON(size < count);
-		while (count > 0) {
-			unsigned pgsz = PAGE_SIZE - offset_in_page(addr);
-			if (pgsz > count)
-				pgsz = count;
-			clear_pmem(addr, pgsz);
-			addr += pgsz;
-			size -= pgsz;
-			count -= pgsz;
-			BUG_ON(pgsz & 511);
-			sector += pgsz / 512;
-			cond_resched();
-		}
-	} while (size);
+		sz = min_t(long, count, SZ_128K);
+		clear_pmem(dax.addr, sz);
+		dax.size -= sz;
+		dax.sector += sz / 512;
+		dax_unmap_atomic(bdev, &dax);
+		cond_resched();
+	} while (dax.size);
 
 	wmb_pmem();
 	return 0;
 }
-EXPORT_SYMBOL_GPL(dax_clear_blocks);
-
-static long dax_get_addr(struct buffer_head *bh, void __pmem **addr,
-		unsigned blkbits)
-{
-	unsigned long pfn;
-	sector_t sector = bh->b_blocknr << (blkbits - 9);
-	return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size);
-}
+EXPORT_SYMBOL_GPL(dax_clear_sectors);
 
 /* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */
 static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first,
@@ -105,19 +139,29 @@ static bool buffer_size_valid(struct buffer_head *bh)
 	return bh->b_state != 0;
 }
 
+
+static sector_t to_sector(const struct buffer_head *bh,
+		const struct inode *inode)
+{
+	sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
+
+	return sector;
+}
+
 static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
 		      loff_t start, loff_t end, get_block_t get_block,
 		      struct buffer_head *bh)
 {
-	ssize_t retval = 0;
-	loff_t pos = start;
-	loff_t max = start;
-	loff_t bh_max = start;
-	void __pmem *addr;
-	bool hole = false;
-	bool need_wmb = false;
-
-	if (iov_iter_rw(iter) != WRITE)
+	loff_t pos = start, max = start, bh_max = start;
+	bool hole = false, need_wmb = false;
+	struct block_device *bdev = NULL;
+	int rw = iov_iter_rw(iter), rc = 0; /* returned if the loop never runs */
+	long map_len = 0;
+	struct blk_dax_ctl dax = {
+		.addr = (void __pmem *) ERR_PTR(-EIO),
+	};
+
+	if (rw == READ)
 		end = min(end, i_size_read(inode));
 
 	while (pos < end) {
@@ -132,13 +176,13 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
 			if (pos == bh_max) {
 				bh->b_size = PAGE_ALIGN(end - pos);
 				bh->b_state = 0;
-				retval = get_block(inode, block, bh,
-						   iov_iter_rw(iter) == WRITE);
-				if (retval)
+				rc = get_block(inode, block, bh, rw == WRITE);
+				if (rc)
 					break;
 				if (!buffer_size_valid(bh))
 					bh->b_size = 1 << blkbits;
 				bh_max = pos - first + bh->b_size;
+				bdev = bh->b_bdev;
 			} else {
 				unsigned done = bh->b_size -
 						(bh_max - (pos - first));
@@ -146,47 +190,53 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter,
 				bh->b_size -= done;
 			}
 
-			hole = iov_iter_rw(iter) != WRITE && !buffer_written(bh);
+			hole = rw == READ && !buffer_written(bh);
 			if (hole) {
-				addr = NULL;
 				size = bh->b_size - first;
 			} else {
-				retval = dax_get_addr(bh, &addr, blkbits);
-				if (retval < 0)
+				dax_unmap_atomic(bdev, &dax);
+				dax.sector = to_sector(bh, inode);
+				dax.size = bh->b_size;
+				map_len = dax_map_atomic(bdev, &dax);
+				if (map_len < 0) {
+					rc = map_len;
 					break;
+				}
 				if (buffer_unwritten(bh) || buffer_new(bh)) {
-					dax_new_buf(addr, retval, first, pos,
-									end);
+					dax_new_buf(dax.addr, map_len, first,
+							pos, end);
 					need_wmb = true;
 				}
-				addr += first;
-				size = retval - first;
+				dax.addr += first;
+				size = map_len - first;
 			}
 			max = min(pos + size, end);
 		}
 
 		if (iov_iter_rw(iter) == WRITE) {
-			len = copy_from_iter_pmem(addr, max - pos, iter);
+			len = copy_from_iter_pmem(dax.addr, max - pos, iter);
 			need_wmb = true;
 		} else if (!hole)
-			len = copy_to_iter((void __force *)addr, max - pos,
+			len = copy_to_iter((void __force *) dax.addr, max - pos,
 					iter);
 		else
 			len = iov_iter_zero(max - pos, iter);
 
 		if (!len) {
-			retval = -EFAULT;
+			rc = -EFAULT;
 			break;
 		}
 
 		pos += len;
-		addr += len;
+		if (!IS_ERR(dax.addr))
+			dax.addr += len;
 	}
 
 	if (need_wmb)
 		wmb_pmem();
+	dax_unmap_atomic(bdev, &dax);
 
-	return (pos == start) ? retval : pos - start;
+	return (pos == start) ? rc : pos - start;
 }
 
 /**
@@ -215,13 +265,14 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
 	loff_t end = pos + iov_iter_count(iter);
 
 	memset(&bh, 0, sizeof(bh));
+	bh.b_bdev = inode->i_sb->s_bdev;
 
 	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) {
 		struct address_space *mapping = inode->i_mapping;
-		mutex_lock(&inode->i_mutex);
+		inode_lock(inode);
 		retval = filemap_write_and_wait_range(mapping, pos, end - 1);
 		if (retval) {
-			mutex_unlock(&inode->i_mutex);
+			inode_unlock(inode);
 			goto out;
 		}
 	}
@@ -233,7 +284,7 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode,
 	retval = dax_io(inode, iter, pos, end, get_block, &bh);
 
 	if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ)
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 
 	if ((retval > 0) && end_io)
 		end_io(iocb, pos, retval, bh.b_private);
@@ -275,28 +326,231 @@ static int dax_load_hole(struct address_space *mapping, struct page *page,
 	return VM_FAULT_LOCKED;
 }
 
-static int copy_user_bh(struct page *to, struct buffer_head *bh,
-			unsigned blkbits, unsigned long vaddr)
+static int copy_user_bh(struct page *to, struct inode *inode,
+		struct buffer_head *bh, unsigned long vaddr)
 {
-	void __pmem *vfrom;
+	struct blk_dax_ctl dax = {
+		.sector = to_sector(bh, inode),
+		.size = bh->b_size,
+	};
+	struct block_device *bdev = bh->b_bdev;
 	void *vto;
 
-	if (dax_get_addr(bh, &vfrom, blkbits) < 0)
-		return -EIO;
+	if (dax_map_atomic(bdev, &dax) < 0)
+		return PTR_ERR(dax.addr);
 	vto = kmap_atomic(to);
-	copy_user_page(vto, (void __force *)vfrom, vaddr, to);
+	copy_user_page(vto, (void __force *)dax.addr, vaddr, to);
 	kunmap_atomic(vto);
+	dax_unmap_atomic(bdev, &dax);
 	return 0;
 }
 
+#define NO_SECTOR (-1)	/* "no sector known": only look up / dirty an entry */
+#define DAX_PMD_INDEX(page_index) ((page_index) & (PMD_MASK >> PAGE_CACHE_SHIFT))
+
+static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
+		sector_t sector, bool pmd_entry, bool dirty)
+{
+	struct radix_tree_root *page_tree = &mapping->page_tree;
+	pgoff_t pmd_index = DAX_PMD_INDEX(index);
+	int type, error = 0;
+	void *entry;
+
+	WARN_ON_ONCE(pmd_entry && !dirty);
+	if (dirty)
+		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+
+	spin_lock_irq(&mapping->tree_lock);
+
+	entry = radix_tree_lookup(page_tree, pmd_index);
+	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) {
+		index = pmd_index;
+		goto dirty;
+	}
+
+	entry = radix_tree_lookup(page_tree, index);
+	if (entry) {
+		type = RADIX_DAX_TYPE(entry);
+		if (WARN_ON_ONCE(type != RADIX_DAX_PTE &&
+					type != RADIX_DAX_PMD)) {
+			error = -EIO;
+			goto unlock;
+		}
+
+		if (!pmd_entry || type == RADIX_DAX_PMD)
+			goto dirty;
+
+		/*
+		 * We only insert dirty PMD entries into the radix tree.  This
+		 * means we don't need to worry about removing a dirty PTE
+		 * entry and inserting a clean PMD entry, thus reducing the
+		 * range we would flush with a follow-up fsync/msync call.
+		 */
+		radix_tree_delete(&mapping->page_tree, index);
+		mapping->nrexceptional--;
+	}
+
+	if (sector == NO_SECTOR) {
+		/*
+		 * This can happen during correct operation if our pfn_mkwrite
+		 * fault raced against a hole punch operation.  If this
+		 * happens the pte that was hole punched will have been
+		 * unmapped and the radix tree entry will have been removed by
+		 * the time we are called, but the call will still happen.  We
+		 * will return all the way up to wp_pfn_shared(), where the
+		 * pte_same() check will fail, eventually causing page fault
+		 * to be retried by the CPU.
+		 */
+		goto unlock;
+	}
+
+	error = radix_tree_insert(page_tree, index,
+			RADIX_DAX_ENTRY(sector, pmd_entry));
+	if (error)
+		goto unlock;
+
+	mapping->nrexceptional++;
+ dirty:
+	if (dirty)
+		radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
+ unlock:
+	spin_unlock_irq(&mapping->tree_lock);
+	return error;
+}
+
+static int dax_writeback_one(struct block_device *bdev,
+		struct address_space *mapping, pgoff_t index, void *entry)
+{
+	struct radix_tree_root *page_tree = &mapping->page_tree;
+	int type = RADIX_DAX_TYPE(entry);
+	struct radix_tree_node *node;
+	struct blk_dax_ctl dax;
+	void **slot;
+	int ret = 0;
+
+	spin_lock_irq(&mapping->tree_lock);
+	/*
+	 * Regular page slots are stabilized by the page lock even
+	 * without the tree itself locked.  These unlocked entries
+	 * need verification under the tree lock.
+	 */
+	if (!__radix_tree_lookup(page_tree, index, &node, &slot))
+		goto unlock;
+	if (*slot != entry)
+		goto unlock;
+
+	/* another fsync thread may have already written back this entry */
+	if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))
+		goto unlock;
+
+	if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) {
+		ret = -EIO;
+		goto unlock;
+	}
+
+	dax.sector = RADIX_DAX_SECTOR(entry);
+	dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE);
+	spin_unlock_irq(&mapping->tree_lock);
+
+	/*
+	 * We cannot hold tree_lock while calling dax_map_atomic() because it
+	 * eventually calls cond_resched().
+	 */
+	ret = dax_map_atomic(bdev, &dax);
+	if (ret < 0)
+		return ret;
+
+	if (WARN_ON_ONCE(ret < dax.size)) {
+		ret = -EIO;
+		goto unmap;
+	}
+
+	wb_cache_pmem(dax.addr, dax.size);
+
+	spin_lock_irq(&mapping->tree_lock);
+	radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE);
+	spin_unlock_irq(&mapping->tree_lock);
+ unmap:
+	dax_unmap_atomic(bdev, &dax);
+	return ret;
+
+ unlock:
+	spin_unlock_irq(&mapping->tree_lock);
+	return ret;
+}
+
+/*
+ * Flush the mapping to the persistent domain within the byte range of [start,
+ * end]. This is required by data integrity operations to ensure file data is
+ * on persistent storage prior to completion of the operation.
+ */
+int dax_writeback_mapping_range(struct address_space *mapping,
+		struct block_device *bdev, struct writeback_control *wbc)
+{
+	struct inode *inode = mapping->host;
+	pgoff_t start_index, end_index, pmd_index;
+	pgoff_t indices[PAGEVEC_SIZE];
+	struct pagevec pvec;
+	bool done = false;
+	int i, ret = 0;
+	void *entry;
+
+	if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT))
+		return -EIO;
+
+	if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL)
+		return 0;
+
+	start_index = wbc->range_start >> PAGE_CACHE_SHIFT;
+	end_index = wbc->range_end >> PAGE_CACHE_SHIFT;
+	pmd_index = DAX_PMD_INDEX(start_index);
+
+	rcu_read_lock();
+	entry = radix_tree_lookup(&mapping->page_tree, pmd_index);
+	rcu_read_unlock();
+
+	/* see if the start of our range is covered by a PMD entry */
+	if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD)
+		start_index = pmd_index;
+
+	tag_pages_for_writeback(mapping, start_index, end_index);
+
+	pagevec_init(&pvec, 0);
+	while (!done) {
+		pvec.nr = find_get_entries_tag(mapping, start_index,
+				PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE,
+				pvec.pages, indices);
+
+		if (pvec.nr == 0)
+			break;
+
+		for (i = 0; i < pvec.nr; i++) {
+			if (indices[i] > end_index) {
+				done = true;
+				break;
+			}
+
+			ret = dax_writeback_one(bdev, mapping, indices[i],
+					pvec.pages[i]);
+			if (ret < 0)
+				return ret;
+		}
+	}
+	wmb_pmem();
+	return 0;
+}
+EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
+
 static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
 			struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	struct address_space *mapping = inode->i_mapping;
-	sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
 	unsigned long vaddr = (unsigned long)vmf->virtual_address;
-	void __pmem *addr;
-	unsigned long pfn;
+	struct address_space *mapping = inode->i_mapping;
+	struct block_device *bdev = bh->b_bdev;
+	struct blk_dax_ctl dax = {
+		.sector = to_sector(bh, inode),
+		.size = bh->b_size,
+	};
 	pgoff_t size;
 	int error;
 
@@ -315,20 +569,23 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
 		goto out;
 	}
 
-	error = bdev_direct_access(bh->b_bdev, sector, &addr, &pfn, bh->b_size);
-	if (error < 0)
-		goto out;
-	if (error < PAGE_SIZE) {
-		error = -EIO;
+	if (dax_map_atomic(bdev, &dax) < 0) {
+		error = PTR_ERR(dax.addr);
 		goto out;
 	}
 
 	if (buffer_unwritten(bh) || buffer_new(bh)) {
-		clear_pmem(addr, PAGE_SIZE);
+		clear_pmem(dax.addr, PAGE_SIZE);
 		wmb_pmem();
 	}
+	dax_unmap_atomic(bdev, &dax);
 
-	error = vm_insert_mixed(vma, vaddr, pfn);
+	error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false,
+			vmf->flags & FAULT_FLAG_WRITE);
+	if (error)
+		goto out;
+
+	error = vm_insert_mixed(vma, vaddr, dax.pfn);
 
  out:
 	i_mmap_unlock_read(mapping);
@@ -373,6 +630,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 
 	memset(&bh, 0, sizeof(bh));
 	block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
+	bh.b_bdev = inode->i_sb->s_bdev;
 	bh.b_size = PAGE_SIZE;
 
  repeat:
@@ -422,7 +680,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 	if (vmf->cow_page) {
 		struct page *new_page = vmf->cow_page;
 		if (buffer_written(&bh))
-			error = copy_user_bh(new_page, &bh, blkbits, vaddr);
+			error = copy_user_bh(new_page, inode, &bh, vaddr);
 		else
 			clear_user_highpage(new_page, vaddr);
 		if (error)
@@ -452,6 +710,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
 		delete_from_page_cache(page);
 		unlock_page(page);
 		page_cache_release(page);
+		page = NULL;
 	}
 
 	/*
@@ -523,6 +782,24 @@ EXPORT_SYMBOL_GPL(dax_fault);
  */
 #define PG_PMD_COLOUR	((PMD_SIZE >> PAGE_SHIFT) - 1)
 
+static void __dax_dbg(struct buffer_head *bh, unsigned long address,
+		const char *reason, const char *fn)
+{
+	if (bh) {
+		char bname[BDEVNAME_SIZE];
+		bdevname(bh->b_bdev, bname);
+		pr_debug("%s: %s addr: %lx dev %s state %lx start %llu "
+			"length %zu fallback: %s\n", fn, current->comm,
+			address, bname, bh->b_state,
+			(unsigned long long)bh->b_blocknr, bh->b_size, reason);
+	} else {	/* no buffer_head yet: report only the faulting address */
+		pr_debug("%s: %s addr: %lx fallback: %s\n", fn,
+			current->comm, address, reason);
+	}
+}
+
+#define dax_pmd_dbg(bh, address, reason)	__dax_dbg(bh, address, reason, "dax_pmd")
+
 int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 		pmd_t *pmd, unsigned int flags, get_block_t get_block,
 		dax_iodone_t complete_unwritten)
@@ -534,61 +811,83 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 	unsigned blkbits = inode->i_blkbits;
 	unsigned long pmd_addr = address & PMD_MASK;
 	bool write = flags & FAULT_FLAG_WRITE;
-	long length;
-	void __pmem *kaddr;
+	struct block_device *bdev;
 	pgoff_t size, pgoff;
-	sector_t block, sector;
-	unsigned long pfn;
-	int result = 0;
+	sector_t block;
+	int error, result = 0;
+	bool alloc = false;
 
-	/* dax pmd mappings are broken wrt gup and fork */
+	/* dax pmd mappings require pfn_t_devmap() */
 	if (!IS_ENABLED(CONFIG_FS_DAX_PMD))
 		return VM_FAULT_FALLBACK;
 
 	/* Fall back to PTEs if we're going to COW */
-	if (write && !(vma->vm_flags & VM_SHARED))
+	if (write && !(vma->vm_flags & VM_SHARED)) {
+		split_huge_pmd(vma, pmd, address);
+		dax_pmd_dbg(NULL, address, "cow write");
 		return VM_FAULT_FALLBACK;
+	}
 	/* If the PMD would extend outside the VMA */
-	if (pmd_addr < vma->vm_start)
+	if (pmd_addr < vma->vm_start) {
+		dax_pmd_dbg(NULL, address, "vma start unaligned");
 		return VM_FAULT_FALLBACK;
-	if ((pmd_addr + PMD_SIZE) > vma->vm_end)
+	}
+	if ((pmd_addr + PMD_SIZE) > vma->vm_end) {
+		dax_pmd_dbg(NULL, address, "vma end unaligned");
 		return VM_FAULT_FALLBACK;
+	}
 
 	pgoff = linear_page_index(vma, pmd_addr);
 	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	if (pgoff >= size)
 		return VM_FAULT_SIGBUS;
 	/* If the PMD would cover blocks out of the file */
-	if ((pgoff | PG_PMD_COLOUR) >= size)
+	if ((pgoff | PG_PMD_COLOUR) >= size) {
+		dax_pmd_dbg(NULL, address,
+				"offset + huge page size > file size");
 		return VM_FAULT_FALLBACK;
+	}
 
 	memset(&bh, 0, sizeof(bh));
+	bh.b_bdev = inode->i_sb->s_bdev;
 	block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
 
 	bh.b_size = PMD_SIZE;
-	length = get_block(inode, block, &bh, write);
-	if (length)
+
+	if (get_block(inode, block, &bh, 0) != 0)
 		return VM_FAULT_SIGBUS;
-	i_mmap_lock_read(mapping);
+
+	if (!buffer_mapped(&bh) && write) {
+		if (get_block(inode, block, &bh, 1) != 0)
+			return VM_FAULT_SIGBUS;
+		alloc = true;
+	}
+
+	bdev = bh.b_bdev;
 
 	/*
 	 * If the filesystem isn't willing to tell us the length of a hole,
 	 * just fall back to PTEs.  Calling get_block 512 times in a loop
 	 * would be silly.
 	 */
-	if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE)
-		goto fallback;
+	if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) {
+		dax_pmd_dbg(&bh, address, "allocated block too small");
+		return VM_FAULT_FALLBACK;
+	}
 
 	/*
 	 * If we allocated new storage, make sure no process has any
 	 * zero pages covering this hole
 	 */
-	if (buffer_new(&bh)) {
-		i_mmap_unlock_read(mapping);
-		unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0);
-		i_mmap_lock_read(mapping);
+	if (alloc) {
+		loff_t lstart = pgoff << PAGE_SHIFT;
+		loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */
+
+		truncate_pagecache_range(inode, lstart, lend);
 	}
 
+	i_mmap_lock_read(mapping);
+
 	/*
 	 * If a truncate happened while we were allocating blocks, we may
 	 * leave blocks allocated to the file that are beyond EOF.  We can't
@@ -600,57 +899,108 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address,
 		result = VM_FAULT_SIGBUS;
 		goto out;
 	}
-	if ((pgoff | PG_PMD_COLOUR) >= size)
+	if ((pgoff | PG_PMD_COLOUR) >= size) {
+		dax_pmd_dbg(&bh, address,
+				"offset + huge page size > file size");
 		goto fallback;
+	}
 
 	if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) {
 		spinlock_t *ptl;
 		pmd_t entry;
 		struct page *zero_page = get_huge_zero_page();
 
-		if (unlikely(!zero_page))
+		if (unlikely(!zero_page)) {
+			dax_pmd_dbg(&bh, address, "no zero page");
 			goto fallback;
+		}
 
 		ptl = pmd_lock(vma->vm_mm, pmd);
 		if (!pmd_none(*pmd)) {
 			spin_unlock(ptl);
+			dax_pmd_dbg(&bh, address, "pmd already present");
 			goto fallback;
 		}
 
+		dev_dbg(part_to_dev(bdev->bd_part),
+				"%s: %s addr: %lx pfn: <zero> sect: %llx\n",
+				__func__, current->comm, address,
+				(unsigned long long) to_sector(&bh, inode));
+
 		entry = mk_pmd(zero_page, vma->vm_page_prot);
 		entry = pmd_mkhuge(entry);
 		set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry);
 		result = VM_FAULT_NOPAGE;
 		spin_unlock(ptl);
 	} else {
-		sector = bh.b_blocknr << (blkbits - 9);
-		length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn,
-						bh.b_size);
+		struct blk_dax_ctl dax = {
+			.sector = to_sector(&bh, inode),
+			.size = PMD_SIZE,
+		};
+		long length = dax_map_atomic(bdev, &dax);
+
 		if (length < 0) {
 			result = VM_FAULT_SIGBUS;
 			goto out;
 		}
-		if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR))
+		if (length < PMD_SIZE) {
+			dax_pmd_dbg(&bh, address, "dax-length too small");
+			dax_unmap_atomic(bdev, &dax);
 			goto fallback;
+		}
+		if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) {
+			dax_pmd_dbg(&bh, address, "pfn unaligned");
+			dax_unmap_atomic(bdev, &dax);
+			goto fallback;
+		}
 
-		/*
-		 * TODO: teach vmf_insert_pfn_pmd() to support
-		 * 'pte_special' for pmds
-		 */
-		if (pfn_valid(pfn))
+		if (!pfn_t_devmap(dax.pfn)) {
+			dax_unmap_atomic(bdev, &dax);
+			dax_pmd_dbg(&bh, address, "pfn not in memmap");
 			goto fallback;
+		}
 
 		if (buffer_unwritten(&bh) || buffer_new(&bh)) {
-			int i;
-			for (i = 0; i < PTRS_PER_PMD; i++)
-				clear_pmem(kaddr + i * PAGE_SIZE, PAGE_SIZE);
+			clear_pmem(dax.addr, PMD_SIZE);
 			wmb_pmem();
 			count_vm_event(PGMAJFAULT);
 			mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
 			result |= VM_FAULT_MAJOR;
 		}
+		dax_unmap_atomic(bdev, &dax);
 
-		result |= vmf_insert_pfn_pmd(vma, address, pmd, pfn, write);
+		/*
+		 * For PTE faults we insert a radix tree entry for reads, and
+		 * leave it clean.  Then on the first write we dirty the radix
+		 * tree entry via the dax_pfn_mkwrite() path.  This sequence
+		 * allows the dax_pfn_mkwrite() call to be simpler and avoid a
+		 * call into get_block() to translate the pgoff to a sector in
+		 * order to be able to create a new radix tree entry.
+		 *
+		 * The PMD path doesn't have an equivalent to
+		 * dax_pfn_mkwrite(), though, so for a read followed by a
+		 * write we traverse all the way through __dax_pmd_fault()
+		 * twice.  This means we can just skip inserting a radix tree
+		 * entry completely on the initial read and just wait until
+		 * the write to insert a dirty entry.
+		 */
+		if (write) {
+			error = dax_radix_entry(mapping, pgoff, dax.sector,
+					true, true);
+			if (error) {
+				dax_pmd_dbg(&bh, address,
+						"PMD radix insertion failed");
+				goto fallback;
+			}
+		}
+
+		dev_dbg(part_to_dev(bdev->bd_part),
+				"%s: %s addr: %lx pfn: %lx sect: %llx\n",
+				__func__, current->comm, address,
+				pfn_t_to_pfn(dax.pfn),
+				(unsigned long long) dax.sector);
+		result |= vmf_insert_pfn_pmd(vma, address, pmd,
+				dax.pfn, write);
 	}
 
  out:
@@ -702,15 +1052,27 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault);
  * dax_pfn_mkwrite - handle first write to DAX page
  * @vma: The virtual memory area where the fault occurred
  * @vmf: The description of the fault
- *
  */
 int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+	struct file *file = vma->vm_file;
+	int error;
+
+	/*
+	 * We pass NO_SECTOR to dax_radix_entry() because we expect that a
+	 * RADIX_DAX_PTE entry already exists in the radix tree from a
+	 * previous call to __dax_fault().  We just want to look up that PTE
+	 * entry using vmf->pgoff and make sure the dirty tag is set.  This
+	 * saves us from having to make a call to get_block() here to look
+	 * up the sector.
+	 */
+	error = dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false,
+			true);
 
-	sb_start_pagefault(sb);
-	file_update_time(vma->vm_file);
-	sb_end_pagefault(sb);
+	if (error == -ENOMEM)
+		return VM_FAULT_OOM;
+	if (error)
+		return VM_FAULT_SIGBUS;
 	return VM_FAULT_NOPAGE;
 }
 EXPORT_SYMBOL_GPL(dax_pfn_mkwrite);
@@ -747,17 +1109,23 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length,
 	BUG_ON((offset + length) > PAGE_CACHE_SIZE);
 
 	memset(&bh, 0, sizeof(bh));
+	bh.b_bdev = inode->i_sb->s_bdev;
 	bh.b_size = PAGE_CACHE_SIZE;
 	err = get_block(inode, index, &bh, 0);
 	if (err < 0)
 		return err;
 	if (buffer_written(&bh)) {
-		void __pmem *addr;
-		err = dax_get_addr(&bh, &addr, inode->i_blkbits);
-		if (err < 0)
-			return err;
-		clear_pmem(addr + offset, length);
+		struct block_device *bdev = bh.b_bdev;
+		struct blk_dax_ctl dax = {
+			.sector = to_sector(&bh, inode),
+			.size = PAGE_CACHE_SIZE,
+		};
+
+		if (dax_map_atomic(bdev, &dax) < 0)
+			return PTR_ERR(dax.addr);
+		clear_pmem(dax.addr + offset, length);
 		wmb_pmem();
+		dax_unmap_atomic(bdev, &dax);
 	}
 
 	return 0;
diff --git a/fs/dcache.c b/fs/dcache.c
index 927ed93af..2398f9f94 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -1156,7 +1156,7 @@ enum d_walk_ret {
  *
  * The @enter() and @finish() callbacks are called with d_lock held.
  */
-void d_walk(struct dentry *parent, void *data,
+static void d_walk(struct dentry *parent, void *data,
 		   enum d_walk_ret (*enter)(void *, struct dentry *),
 		   void (*finish)(void *))
 {
@@ -1261,7 +1261,6 @@ rename_retry:
 	seq = 1;
 	goto again;
 }
-EXPORT_SYMBOL_GPL(d_walk);
 
 /*
  * Search for at least 1 mount point in the dentry's subdirs.
@@ -1561,7 +1560,8 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
 	dentry->d_iname[DNAME_INLINE_LEN-1] = 0;
 	if (name->len > DNAME_INLINE_LEN-1) {
 		size_t size = offsetof(struct external_name, name[1]);
-		struct external_name *p = kmalloc(size + name->len, GFP_KERNEL);
+		struct external_name *p = kmalloc(size + name->len,
+						  GFP_KERNEL_ACCOUNT);
 		if (!p) {
 			kmem_cache_free(dentry_cache, dentry); 
 			return NULL;
@@ -1724,7 +1724,7 @@ static unsigned d_flags_for_inode(struct inode *inode)
 	}
 
 	if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) {
-		if (unlikely(inode->i_op->follow_link)) {
+		if (unlikely(inode->i_op->get_link)) {
 			add_flags = DCACHE_SYMLINK_TYPE;
 			goto type_determined;
 		}
@@ -2452,7 +2452,7 @@ EXPORT_SYMBOL(d_rehash);
  */
 void dentry_update_name_case(struct dentry *dentry, struct qstr *name)
 {
-	BUG_ON(!mutex_is_locked(&dentry->d_parent->d_inode->i_mutex));
+	BUG_ON(!inode_is_locked(dentry->d_parent->d_inode));
 	BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */
 
 	spin_lock(&dentry->d_lock);
@@ -2728,7 +2728,7 @@ static int __d_unalias(struct inode *inode,
 	if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex))
 		goto out_err;
 	m1 = &dentry->d_sb->s_vfs_rename_mutex;
-	if (!mutex_trylock(&alias->d_parent->d_inode->i_mutex))
+	if (!inode_trylock(alias->d_parent->d_inode))
 		goto out_err;
 	m2 = &alias->d_parent->d_inode->i_mutex;
 out_unalias:
@@ -3294,18 +3294,18 @@ out:
  * @new_dentry: new dentry
  * @old_dentry: old dentry
  *
- * Returns 1 if new_dentry is a subdirectory of the parent (at any depth).
- * Returns 0 otherwise.
+ * Returns true if new_dentry is a subdirectory of the parent (at any depth).
+ * Returns false otherwise.
  * Caller must ensure that "new_dentry" is pinned before calling is_subdir()
  */
   
-int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
+bool is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
 {
-	int result;
+	bool result;
 	unsigned seq;
 
 	if (new_dentry == old_dentry)
-		return 1;
+		return true;
 
 	do {
 		/* for restarting inner loop in case of seq retry */
@@ -3316,9 +3316,9 @@ int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry)
 		 */
 		rcu_read_lock();
 		if (d_ancestor(old_dentry, new_dentry))
-			result = 1;
+			result = true;
 		else
-			result = 0;
+			result = false;
 		rcu_read_unlock();
 	} while (read_seqretry(&rename_lock, seq));
 
@@ -3406,7 +3406,7 @@ static void __init dcache_init(void)
 	 * of the dcache. 
 	 */
 	dentry_cache = KMEM_CACHE(dentry,
-		SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD);
+		SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT);
 
 	/* Hash may have been set up in dcache_init_early */
 	if (!hashdist)
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index b7fcc0de0..bece948b3 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -265,7 +265,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
 	if (!parent)
 		parent = debugfs_mount->mnt_root;
 
-	mutex_lock(&d_inode(parent)->i_mutex);
+	inode_lock(d_inode(parent));
 	dentry = lookup_one_len(name, parent, strlen(name));
 	if (!IS_ERR(dentry) && d_really_is_positive(dentry)) {
 		dput(dentry);
@@ -273,7 +273,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
 	}
 
 	if (IS_ERR(dentry)) {
-		mutex_unlock(&d_inode(parent)->i_mutex);
+		inode_unlock(d_inode(parent));
 		simple_release_fs(&debugfs_mount, &debugfs_mount_count);
 	}
 
@@ -282,7 +282,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
 
 static struct dentry *failed_creating(struct dentry *dentry)
 {
-	mutex_unlock(&d_inode(dentry->d_parent)->i_mutex);
+	inode_unlock(d_inode(dentry->d_parent));
 	dput(dentry);
 	simple_release_fs(&debugfs_mount, &debugfs_mount_count);
 	return NULL;
@@ -290,7 +290,7 @@ static struct dentry *failed_creating(struct dentry *dentry)
 
 static struct dentry *end_creating(struct dentry *dentry)
 {
-	mutex_unlock(&d_inode(dentry->d_parent)->i_mutex);
+	inode_unlock(d_inode(dentry->d_parent));
 	return dentry;
 }
 
@@ -560,9 +560,9 @@ void debugfs_remove(struct dentry *dentry)
 	if (!parent || d_really_is_negative(parent))
 		return;
 
-	mutex_lock(&d_inode(parent)->i_mutex);
+	inode_lock(d_inode(parent));
 	ret = __debugfs_remove(dentry, parent);
-	mutex_unlock(&d_inode(parent)->i_mutex);
+	inode_unlock(d_inode(parent));
 	if (!ret)
 		simple_release_fs(&debugfs_mount, &debugfs_mount_count);
 }
@@ -594,7 +594,7 @@ void debugfs_remove_recursive(struct dentry *dentry)
 
 	parent = dentry;
  down:
-	mutex_lock(&d_inode(parent)->i_mutex);
+	inode_lock(d_inode(parent));
  loop:
 	/*
 	 * The parent->d_subdirs is protected by the d_lock. Outside that
@@ -609,7 +609,7 @@ void debugfs_remove_recursive(struct dentry *dentry)
 		/* perhaps simple_empty(child) makes more sense */
 		if (!list_empty(&child->d_subdirs)) {
 			spin_unlock(&parent->d_lock);
-			mutex_unlock(&d_inode(parent)->i_mutex);
+			inode_unlock(d_inode(parent));
 			parent = child;
 			goto down;
 		}
@@ -630,10 +630,10 @@ void debugfs_remove_recursive(struct dentry *dentry)
 	}
 	spin_unlock(&parent->d_lock);
 
-	mutex_unlock(&d_inode(parent)->i_mutex);
+	inode_unlock(d_inode(parent));
 	child = parent;
 	parent = parent->d_parent;
-	mutex_lock(&d_inode(parent)->i_mutex);
+	inode_lock(d_inode(parent));
 
 	if (child != dentry)
 		/* go up */
@@ -641,7 +641,7 @@ void debugfs_remove_recursive(struct dentry *dentry)
 
 	if (!__debugfs_remove(child, parent))
 		simple_release_fs(&debugfs_mount, &debugfs_mount_count);
-	mutex_unlock(&d_inode(parent)->i_mutex);
+	inode_unlock(d_inode(parent));
 }
 EXPORT_SYMBOL_GPL(debugfs_remove_recursive);
 
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 706de324f..655f21f99 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -255,7 +255,7 @@ static int mknod_ptmx(struct super_block *sb)
 	if (!uid_valid(root_uid) || !gid_valid(root_gid))
 		return -EINVAL;
 
-	mutex_lock(&d_inode(root)->i_mutex);
+	inode_lock(d_inode(root));
 
 	/* If we have already created ptmx node, return */
 	if (fsi->ptmx_dentry) {
@@ -292,7 +292,7 @@ static int mknod_ptmx(struct super_block *sb)
 	fsi->ptmx_dentry = dentry;
 	rc = 0;
 out:
-	mutex_unlock(&d_inode(root)->i_mutex);
+	inode_unlock(d_inode(root));
 	return rc;
 }
 
@@ -635,7 +635,7 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index,
 
 	sprintf(s, "%d", index);
 
-	mutex_lock(&d_inode(root)->i_mutex);
+	inode_lock(d_inode(root));
 
 	dentry = d_alloc_name(root, s);
 	if (dentry) {
@@ -646,7 +646,7 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index,
 		inode = ERR_PTR(-ENOMEM);
 	}
 
-	mutex_unlock(&d_inode(root)->i_mutex);
+	inode_unlock(d_inode(root));
 
 	return inode;
 }
@@ -691,7 +691,7 @@ void devpts_pty_kill(struct inode *inode)
 
 	BUG_ON(inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR));
 
-	mutex_lock(&d_inode(root)->i_mutex);
+	inode_lock(d_inode(root));
 
 	dentry = d_find_alias(inode);
 
@@ -700,7 +700,7 @@ void devpts_pty_kill(struct inode *inode)
 	dput(dentry);	/* d_alloc_name() in devpts_pty_new() */
 	dput(dentry);		/* d_find_alias above */
 
-	mutex_unlock(&d_inode(root)->i_mutex);
+	inode_unlock(d_inode(root));
 }
 
 static int __init init_devpts_fs(void)
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 01171d8a6..d6a9012d4 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1157,12 +1157,12 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 					iocb->ki_filp->f_mapping;
 
 			/* will be released by direct_io_worker */
-			mutex_lock(&inode->i_mutex);
+			inode_lock(inode);
 
 			retval = filemap_write_and_wait_range(mapping, offset,
 							      end - 1);
 			if (retval) {
-				mutex_unlock(&inode->i_mutex);
+				inode_unlock(inode);
 				kmem_cache_free(dio_cache, dio);
 				goto out;
 			}
@@ -1173,7 +1173,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 	dio->i_size = i_size_read(inode);
 	if (iov_iter_rw(iter) == READ && offset >= dio->i_size) {
 		if (dio->flags & DIO_LOCKING)
-			mutex_unlock(&inode->i_mutex);
+			inode_unlock(inode);
 		kmem_cache_free(dio_cache, dio);
 		retval = 0;
 		goto out;
@@ -1295,7 +1295,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 	 * of protecting us from looking up uninitialized blocks.
 	 */
 	if (iov_iter_rw(iter) == READ && (dio->flags & DIO_LOCKING))
-		mutex_unlock(&dio->inode->i_mutex);
+		inode_unlock(dio->inode);
 
 	/*
 	 * The only time we want to leave bios in flight is when a successful
diff --git a/fs/dlm/user.c b/fs/dlm/user.c
index 173b3873a..58c2f4a21 100644
--- a/fs/dlm/user.c
+++ b/fs/dlm/user.c
@@ -515,14 +515,9 @@ static ssize_t device_write(struct file *file, const char __user *buf,
 	if (count > sizeof(struct dlm_write_request) + DLM_RESNAME_MAXLEN)
 		return -EINVAL;
 
-	kbuf = kzalloc(count + 1, GFP_NOFS);
-	if (!kbuf)
-		return -ENOMEM;
-
-	if (copy_from_user(kbuf, buf, count)) {
-		error = -EFAULT;
-		goto out_free;
-	}
+	kbuf = memdup_user_nul(buf, count);
+	if (IS_ERR(kbuf))
+		return PTR_ERR(kbuf);
 
 	if (check_version(kbuf)) {
 		error = -EBADE;
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index 4f591f190..d72d52b90 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -8,7 +8,6 @@
 #include <linux/writeback.h>
 #include <linux/sysctl.h>
 #include <linux/gfp.h>
-#include <linux/export.h>
 #include "internal.h"
 
 /* A global variable is a bit ugly, but it keeps the code simple */
@@ -40,12 +39,6 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
 	iput(toput_inode);
 }
 
-/* For TuxOnIce */
-void drop_pagecache(void)
-{
-	iterate_supers(drop_pagecache_sb, NULL);
-}
-
 int drop_caches_sysctl_handler(struct ctl_table *table, int write,
 	void __user *buffer, size_t *length, loff_t *ppos)
 {
diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c
index e2e47ba5d..4e685ac10 100644
--- a/fs/ecryptfs/inode.c
+++ b/fs/ecryptfs/inode.c
@@ -41,13 +41,13 @@ static struct dentry *lock_parent(struct dentry *dentry)
 	struct dentry *dir;
 
 	dir = dget_parent(dentry);
-	mutex_lock_nested(&(d_inode(dir)->i_mutex), I_MUTEX_PARENT);
+	inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
 	return dir;
 }
 
 static void unlock_dir(struct dentry *dir)
 {
-	mutex_unlock(&d_inode(dir)->i_mutex);
+	inode_unlock(d_inode(dir));
 	dput(dir);
 }
 
@@ -282,9 +282,7 @@ ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry,
 	if (rc) {
 		ecryptfs_do_unlink(directory_inode, ecryptfs_dentry,
 				   ecryptfs_inode);
-		make_bad_inode(ecryptfs_inode);
-		unlock_new_inode(ecryptfs_inode);
-		iput(ecryptfs_inode);
+		iget_failed(ecryptfs_inode);
 		goto out;
 	}
 	unlock_new_inode(ecryptfs_inode);
@@ -399,11 +397,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
 	int rc = 0;
 
 	lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent);
-	mutex_lock(&d_inode(lower_dir_dentry)->i_mutex);
+	inode_lock(d_inode(lower_dir_dentry));
 	lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name,
 				      lower_dir_dentry,
 				      ecryptfs_dentry->d_name.len);
-	mutex_unlock(&d_inode(lower_dir_dentry)->i_mutex);
+	inode_unlock(d_inode(lower_dir_dentry));
 	if (IS_ERR(lower_dentry)) {
 		rc = PTR_ERR(lower_dentry);
 		ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
@@ -428,11 +426,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode,
 		       "filename; rc = [%d]\n", __func__, rc);
 		goto out;
 	}
-	mutex_lock(&d_inode(lower_dir_dentry)->i_mutex);
+	inode_lock(d_inode(lower_dir_dentry));
 	lower_dentry = lookup_one_len(encrypted_and_encoded_name,
 				      lower_dir_dentry,
 				      encrypted_and_encoded_name_size);
-	mutex_unlock(&d_inode(lower_dir_dentry)->i_mutex);
+	inode_unlock(d_inode(lower_dir_dentry));
 	if (IS_ERR(lower_dentry)) {
 		rc = PTR_ERR(lower_dentry);
 		ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned "
@@ -674,16 +672,24 @@ out:
 	return rc ? ERR_PTR(rc) : buf;
 }
 
-static const char *ecryptfs_follow_link(struct dentry *dentry, void **cookie)
+static const char *ecryptfs_get_link(struct dentry *dentry,
+				     struct inode *inode,
+				     struct delayed_call *done)
 {
 	size_t len;
-	char *buf = ecryptfs_readlink_lower(dentry, &len);
+	char *buf;
+
+	if (!dentry)
+		return ERR_PTR(-ECHILD);
+
+	buf = ecryptfs_readlink_lower(dentry, &len);
 	if (IS_ERR(buf))
 		return buf;
 	fsstack_copy_attr_atime(d_inode(dentry),
 				d_inode(ecryptfs_dentry_to_lower(dentry)));
 	buf[len] = '\0';
-	return *cookie = buf;
+	set_delayed_call(done, kfree_link, buf);
+	return buf;
 }
 
 /**
@@ -863,9 +869,9 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length)
 	if (!rc && lower_ia.ia_valid & ATTR_SIZE) {
 		struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry);
 
-		mutex_lock(&d_inode(lower_dentry)->i_mutex);
+		inode_lock(d_inode(lower_dentry));
 		rc = notify_change(lower_dentry, &lower_ia, NULL);
-		mutex_unlock(&d_inode(lower_dentry)->i_mutex);
+		inode_unlock(d_inode(lower_dentry));
 	}
 	return rc;
 }
@@ -964,9 +970,9 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia)
 	if (lower_ia.ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID))
 		lower_ia.ia_valid &= ~ATTR_MODE;
 
-	mutex_lock(&d_inode(lower_dentry)->i_mutex);
+	inode_lock(d_inode(lower_dentry));
 	rc = notify_change(lower_dentry, &lower_ia, NULL);
-	mutex_unlock(&d_inode(lower_dentry)->i_mutex);
+	inode_unlock(d_inode(lower_dentry));
 out:
 	fsstack_copy_attr_all(inode, lower_inode);
 	return rc;
@@ -1042,10 +1048,10 @@ ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name,
 		rc = -EOPNOTSUPP;
 		goto out;
 	}
-	mutex_lock(&d_inode(lower_dentry)->i_mutex);
+	inode_lock(d_inode(lower_dentry));
 	rc = d_inode(lower_dentry)->i_op->getxattr(lower_dentry, name, value,
 						   size);
-	mutex_unlock(&d_inode(lower_dentry)->i_mutex);
+	inode_unlock(d_inode(lower_dentry));
 out:
 	return rc;
 }
@@ -1069,9 +1075,9 @@ ecryptfs_listxattr(struct dentry *dentry, char *list, size_t size)
 		rc = -EOPNOTSUPP;
 		goto out;
 	}
-	mutex_lock(&d_inode(lower_dentry)->i_mutex);
+	inode_lock(d_inode(lower_dentry));
 	rc = d_inode(lower_dentry)->i_op->listxattr(lower_dentry, list, size);
-	mutex_unlock(&d_inode(lower_dentry)->i_mutex);
+	inode_unlock(d_inode(lower_dentry));
 out:
 	return rc;
 }
@@ -1086,17 +1092,16 @@ static int ecryptfs_removexattr(struct dentry *dentry, const char *name)
 		rc = -EOPNOTSUPP;
 		goto out;
 	}
-	mutex_lock(&d_inode(lower_dentry)->i_mutex);
+	inode_lock(d_inode(lower_dentry));
 	rc = d_inode(lower_dentry)->i_op->removexattr(lower_dentry, name);
-	mutex_unlock(&d_inode(lower_dentry)->i_mutex);
+	inode_unlock(d_inode(lower_dentry));
 out:
 	return rc;
 }
 
 const struct inode_operations ecryptfs_symlink_iops = {
 	.readlink = generic_readlink,
-	.follow_link = ecryptfs_follow_link,
-	.put_link = kfree_put_link,
+	.get_link = ecryptfs_get_link,
 	.permission = ecryptfs_permission,
 	.setattr = ecryptfs_setattr,
 	.getattr = ecryptfs_getattr_link,
diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c
index 4f4d0474b..e25b6b06b 100644
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -663,6 +663,7 @@ static struct ecryptfs_cache_info {
 	struct kmem_cache **cache;
 	const char *name;
 	size_t size;
+	unsigned long flags;
 	void (*ctor)(void *obj);
 } ecryptfs_cache_infos[] = {
 	{
@@ -684,6 +685,7 @@ static struct ecryptfs_cache_info {
 		.cache = &ecryptfs_inode_info_cache,
 		.name = "ecryptfs_inode_cache",
 		.size = sizeof(struct ecryptfs_inode_info),
+		.flags = SLAB_ACCOUNT,
 		.ctor = inode_info_init_once,
 	},
 	{
@@ -755,8 +757,8 @@ static int ecryptfs_init_kmem_caches(void)
 		struct ecryptfs_cache_info *info;
 
 		info = &ecryptfs_cache_infos[i];
-		*(info->cache) = kmem_cache_create(info->name, info->size,
-				0, SLAB_HWCACHE_ALIGN, info->ctor);
+		*(info->cache) = kmem_cache_create(info->name, info->size, 0,
+				SLAB_HWCACHE_ALIGN | info->flags, info->ctor);
 		if (!*(info->cache)) {
 			ecryptfs_free_kmem_caches();
 			ecryptfs_printk(KERN_WARNING, "%s: "
diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c
index caba848ac..c6ced4cbf 100644
--- a/fs/ecryptfs/mmap.c
+++ b/fs/ecryptfs/mmap.c
@@ -436,7 +436,7 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode)
 		rc = -ENOMEM;
 		goto out;
 	}
-	mutex_lock(&lower_inode->i_mutex);
+	inode_lock(lower_inode);
 	size = lower_inode->i_op->getxattr(lower_dentry, ECRYPTFS_XATTR_NAME,
 					   xattr_virt, PAGE_CACHE_SIZE);
 	if (size < 0)
@@ -444,7 +444,7 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode)
 	put_unaligned_be64(i_size_read(ecryptfs_inode), xattr_virt);
 	rc = lower_inode->i_op->setxattr(lower_dentry, ECRYPTFS_XATTR_NAME,
 					 xattr_virt, size, 0);
-	mutex_unlock(&lower_inode->i_mutex);
+	inode_unlock(lower_inode);
 	if (rc)
 		printk(KERN_ERR "Error whilst attempting to write inode size "
 		       "to lower file xattr; rc = [%d]\n", rc);
diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c
index 66842e55c..d48e0d261 100644
--- a/fs/efivarfs/file.c
+++ b/fs/efivarfs/file.c
@@ -51,9 +51,9 @@ static ssize_t efivarfs_file_write(struct file *file,
 		d_delete(file->f_path.dentry);
 		dput(file->f_path.dentry);
 	} else {
-		mutex_lock(&inode->i_mutex);
+		inode_lock(inode);
 		i_size_write(inode, datasize + sizeof(attributes));
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 	}
 
 	bytes = count;
@@ -148,9 +148,9 @@ efivarfs_ioc_setxflags(struct file *file, void __user *arg)
 	if (error)
 		return error;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 	inode_set_flags(inode, i_flags, S_IMMUTABLE);
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 
 	mnt_drop_write_file(file);
 
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index abb244b06..dd029d13e 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -164,10 +164,10 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor,
 	efivar_entry_size(entry, &size);
 	efivar_entry_add(entry, &efivarfs_list);
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 	inode->i_private = entry;
 	i_size_write(inode, size + sizeof(entry->var.Attributes));
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	d_add(dentry, inode);
 
 	return 0;
diff --git a/fs/efs/inode.c b/fs/efs/inode.c
index 079d20306..cdf087238 100644
--- a/fs/efs/inode.c
+++ b/fs/efs/inode.c
@@ -151,6 +151,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino)
 			break;
 		case S_IFLNK:
 			inode->i_op = &page_symlink_inode_operations;
+			inode_nohighmem(inode);
 			inode->i_data.a_ops = &efs_symlink_aops;
 			break;
 		case S_IFCHR:
diff --git a/fs/efs/super.c b/fs/efs/super.c
index c8411a30f..cb68dac4f 100644
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -94,9 +94,9 @@ static void init_once(void *foo)
 static int __init init_inodecache(void)
 {
 	efs_inode_cachep = kmem_cache_create("efs_inode_cache",
-				sizeof(struct efs_inode_info),
-				0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
-				init_once);
+				sizeof(struct efs_inode_info), 0,
+				SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
+				SLAB_ACCOUNT, init_once);
 	if (efs_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/efs/symlink.c b/fs/efs/symlink.c
index 75117d0da..4870cc82d 100644
--- a/fs/efs/symlink.c
+++ b/fs/efs/symlink.c
@@ -13,7 +13,7 @@
 
 static int efs_symlink_readpage(struct file *file, struct page *page)
 {
-	char *link = kmap(page);
+	char *link = page_address(page);
 	struct buffer_head * bh;
 	struct inode * inode = page->mapping->host;
 	efs_block_t size = inode->i_size;
@@ -39,12 +39,10 @@ static int efs_symlink_readpage(struct file *file, struct page *page)
 	}
 	link[size] = '\0';
 	SetPageUptodate(page);
-	kunmap(page);
 	unlock_page(page);
 	return 0;
 fail:
 	SetPageError(page);
-	kunmap(page);
 	unlock_page(page);
 	return err;
 }
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 8d0c0df01..ed70cf9fd 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -45,10 +45,10 @@ struct eventfd_ctx {
  *
  * This function is supposed to be called by the kernel in paths that do not
  * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX
- * value, and we signal this as overflow condition by returining a POLLERR
+ * value, and we signal this as overflow condition by returning a POLLERR
  * to poll(2).
  *
- * Returns the amount by which the counter was incrememnted.  This will be less
+ * Returns the amount by which the counter was incremented.  This will be less
  * than @n if the counter has overflowed.
  */
 __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n)
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 1e009cad8..cde60741c 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -92,7 +92,12 @@
  */
 
 /* Epoll private bits inside the event mask */
-#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET)
+#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)
+
+#define EPOLLINOUT_BITS (POLLIN | POLLOUT)
+
+#define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | POLLERR | POLLHUP | \
+				EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE)
 
 /* Maximum number of nesting allowed inside epoll sets */
 #define EP_MAX_NESTS 4
@@ -1002,6 +1007,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
 	unsigned long flags;
 	struct epitem *epi = ep_item_from_wait(wait);
 	struct eventpoll *ep = epi->ep;
+	int ewake = 0;
 
 	if ((unsigned long)key & POLLFREE) {
 		ep_pwq_from_wait(wait)->whead = NULL;
@@ -1066,8 +1072,25 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
 	 * Wake up ( if active ) both the eventpoll wait list and the ->poll()
 	 * wait list.
 	 */
-	if (waitqueue_active(&ep->wq))
+	if (waitqueue_active(&ep->wq)) {
+		if ((epi->event.events & EPOLLEXCLUSIVE) &&
+					!((unsigned long)key & POLLFREE)) {
+			switch ((unsigned long)key & EPOLLINOUT_BITS) {
+			case POLLIN:
+				if (epi->event.events & POLLIN)
+					ewake = 1;
+				break;
+			case POLLOUT:
+				if (epi->event.events & POLLOUT)
+					ewake = 1;
+				break;
+			case 0:
+				ewake = 1;
+				break;
+			}
+		}
 		wake_up_locked(&ep->wq);
+	}
 	if (waitqueue_active(&ep->poll_wait))
 		pwake++;
 
@@ -1078,6 +1101,9 @@ out_unlock:
 	if (pwake)
 		ep_poll_safewake(&ep->poll_wait);
 
+	if (epi->event.events & EPOLLEXCLUSIVE)
+		return ewake;
+
 	return 1;
 }
 
@@ -1095,7 +1121,10 @@ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead,
 		init_waitqueue_func_entry(&pwq->wait, ep_poll_callback);
 		pwq->whead = whead;
 		pwq->base = epi;
-		add_wait_queue(whead, &pwq->wait);
+		if (epi->event.events & EPOLLEXCLUSIVE)
+			add_wait_queue_exclusive(whead, &pwq->wait);
+		else
+			add_wait_queue(whead, &pwq->wait);
 		list_add_tail(&pwq->llink, &epi->pwqlist);
 		epi->nwait++;
 	} else {
@@ -1862,6 +1891,19 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 		goto error_tgt_fput;
 
 	/*
+	 * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only,
+	 * so EPOLLEXCLUSIVE is not allowed for an EPOLL_CTL_MOD operation.
+	 * Also, we do not currently support nested exclusive wakeups.
+	 */
+	if (epds.events & EPOLLEXCLUSIVE) {
+		if (op == EPOLL_CTL_MOD)
+			goto error_tgt_fput;
+		if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
+				(epds.events & ~EPOLLEXCLUSIVE_OK_BITS)))
+			goto error_tgt_fput;
+	}
+
+	/*
 	 * At this point it is safe to assume that the "private_data" contains
 	 * our own data structure.
 	 */
@@ -1932,8 +1974,10 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 		break;
 	case EPOLL_CTL_MOD:
 		if (epi) {
-			epds.events |= POLLERR | POLLHUP;
-			error = ep_modify(ep, epi, &epds);
+			if (!(epi->event.events & EPOLLEXCLUSIVE)) {
+				epds.events |= POLLERR | POLLHUP;
+				error = ep_modify(ep, epi, &epds);
+			}
 		} else
 			error = -ENOENT;
 		break;
diff --git a/fs/exec.c b/fs/exec.c
index 203f822aa..2af88108e 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -106,7 +106,6 @@ bool path_noexec(const struct path *path)
 	return (path->mnt->mnt_flags & MNT_NOEXEC) ||
 	       (path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC);
 }
-EXPORT_SYMBOL_GPL(path_noexec);
 
 #ifdef CONFIG_USELIB
 /*
@@ -123,7 +122,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library)
 	int error = PTR_ERR(tmp);
 	static const struct open_flags uselib_flags = {
 		.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
-		.acc_mode = MAY_READ | MAY_EXEC | MAY_OPEN,
+		.acc_mode = MAY_READ | MAY_EXEC,
 		.intent = LOOKUP_OPEN,
 		.lookup_flags = LOOKUP_FOLLOW,
 	};
@@ -767,7 +766,7 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags)
 	int err;
 	struct open_flags open_exec_flags = {
 		.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
-		.acc_mode = MAY_EXEC | MAY_OPEN,
+		.acc_mode = MAY_EXEC,
 		.intent = LOOKUP_OPEN,
 		.lookup_flags = LOOKUP_FOLLOW,
 	};
@@ -1314,13 +1313,13 @@ static void bprm_fill_uid(struct linux_binprm *bprm)
 		return;
 
 	/* Be careful if suid/sgid is set */
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	/* reload atomically mode/uid/gid now that lock held */
 	mode = inode->i_mode;
 	uid = inode->i_uid;
 	gid = inode->i_gid;
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 
 	/* We ignore suid/sgid if there are no mappings for them in the ns */
 	if (!kuid_has_mapping(bprm->cred->user_ns, uid) ||
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index 906de66e8..28645f064 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -52,9 +52,9 @@ static int exofs_file_fsync(struct file *filp, loff_t start, loff_t end,
 	if (ret)
 		return ret;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 	ret = sync_inode_metadata(filp->f_mapping->host, 1);
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	return ret;
 }
 
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 60f03b789..9eaf595ae 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -1224,6 +1224,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
 			inode->i_link = (char *)oi->i_data;
 		} else {
 			inode->i_op = &page_symlink_inode_operations;
+			inode_nohighmem(inode);
 			inode->i_mapping->a_ops = &exofs_aops;
 		}
 	} else {
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
index 994e078da..c20d77df2 100644
--- a/fs/exofs/namei.c
+++ b/fs/exofs/namei.c
@@ -111,6 +111,7 @@ static int exofs_symlink(struct inode *dir, struct dentry *dentry,
 	if (l > sizeof(oi->i_data)) {
 		/* slow symlink */
 		inode->i_op = &page_symlink_inode_operations;
+		inode_nohighmem(inode);
 		inode->i_mapping->a_ops = &exofs_aops;
 		memset(oi->i_data, 0, sizeof(oi->i_data));
 
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index b795c567b..6658a5053 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -194,8 +194,8 @@ static int init_inodecache(void)
 {
 	exofs_inode_cachep = kmem_cache_create("exofs_inode_cache",
 				sizeof(struct exofs_i_info), 0,
-				SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
-				exofs_init_once);
+				SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
+				SLAB_ACCOUNT, exofs_init_once);
 	if (exofs_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 714cd37a6..c46f1a190 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -124,10 +124,10 @@ static struct dentry *reconnect_one(struct vfsmount *mnt,
 	int err;
 
 	parent = ERR_PTR(-EACCES);
-	mutex_lock(&dentry->d_inode->i_mutex);
+	inode_lock(dentry->d_inode);
 	if (mnt->mnt_sb->s_export_op->get_parent)
 		parent = mnt->mnt_sb->s_export_op->get_parent(dentry);
-	mutex_unlock(&dentry->d_inode->i_mutex);
+	inode_unlock(dentry->d_inode);
 
 	if (IS_ERR(parent)) {
 		dprintk("%s: get_parent of %ld failed, err %d\n",
@@ -143,9 +143,9 @@ static struct dentry *reconnect_one(struct vfsmount *mnt,
 	if (err)
 		goto out_err;
 	dprintk("%s: found name: %s\n", __func__, nbuf);
-	mutex_lock(&parent->d_inode->i_mutex);
+	inode_lock(parent->d_inode);
 	tmp = lookup_one_len(nbuf, parent, strlen(nbuf));
-	mutex_unlock(&parent->d_inode->i_mutex);
+	inode_unlock(parent->d_inode);
 	if (IS_ERR(tmp)) {
 		dprintk("%s: lookup failed: %d\n", __func__, PTR_ERR(tmp));
 		goto out_err;
@@ -503,10 +503,10 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
 		 */
 		err = exportfs_get_name(mnt, target_dir, nbuf, result);
 		if (!err) {
-			mutex_lock(&target_dir->d_inode->i_mutex);
+			inode_lock(target_dir->d_inode);
 			nresult = lookup_one_len(nbuf, target_dir,
 						 strlen(nbuf));
-			mutex_unlock(&target_dir->d_inode->i_mutex);
+			inode_unlock(target_dir->d_inode);
 			if (!IS_ERR(nresult)) {
 				if (nresult->d_inode) {
 					dput(result);
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 11a42c5a0..c1400b109 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -80,30 +80,13 @@ static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
 	return ret;
 }
 
-static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
-	struct inode *inode = file_inode(vma->vm_file);
-	struct ext2_inode_info *ei = EXT2_I(inode);
-	int ret;
-
-	sb_start_pagefault(inode->i_sb);
-	file_update_time(vma->vm_file);
-	down_read(&ei->dax_sem);
-
-	ret = __dax_mkwrite(vma, vmf, ext2_get_block, NULL);
-
-	up_read(&ei->dax_sem);
-	sb_end_pagefault(inode->i_sb);
-	return ret;
-}
-
 static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma,
 		struct vm_fault *vmf)
 {
 	struct inode *inode = file_inode(vma->vm_file);
 	struct ext2_inode_info *ei = EXT2_I(inode);
-	int ret = VM_FAULT_NOPAGE;
 	loff_t size;
+	int ret;
 
 	sb_start_pagefault(inode->i_sb);
 	file_update_time(vma->vm_file);
@@ -113,6 +96,8 @@ static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma,
 	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	if (vmf->pgoff >= size)
 		ret = VM_FAULT_SIGBUS;
+	else
+		ret = dax_pfn_mkwrite(vma, vmf);
 
 	up_read(&ei->dax_sem);
 	sb_end_pagefault(inode->i_sb);
@@ -122,7 +107,7 @@ static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma,
 static const struct vm_operations_struct ext2_dax_vm_ops = {
 	.fault		= ext2_dax_fault,
 	.pmd_fault	= ext2_dax_pmd_fault,
-	.page_mkwrite	= ext2_dax_mkwrite,
+	.page_mkwrite	= ext2_dax_fault,
 	.pfn_mkwrite	= ext2_dax_pfn_mkwrite,
 };
 
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 0aa9bf6e6..6bd58e6ff 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -737,8 +737,10 @@ static int ext2_get_blocks(struct inode *inode,
 		 * so that it's not found by another thread before it's
 		 * initialised
 		 */
-		err = dax_clear_blocks(inode, le32_to_cpu(chain[depth-1].key),
-						1 << inode->i_blkbits);
+		err = dax_clear_sectors(inode->i_sb->s_bdev,
+				le32_to_cpu(chain[depth-1].key) <<
+				(inode->i_blkbits - 9),
+				1 << inode->i_blkbits);
 		if (err) {
 			mutex_unlock(&ei->truncate_mutex);
 			goto cleanup;
@@ -874,6 +876,14 @@ ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
 static int
 ext2_writepages(struct address_space *mapping, struct writeback_control *wbc)
 {
+#ifdef CONFIG_FS_DAX
+	if (dax_mapping(mapping)) {
+		return dax_writeback_mapping_range(mapping,
+						   mapping->host->i_sb->s_bdev,
+						   wbc);
+	}
+#endif
+
 	return mpage_writepages(mapping, wbc, ext2_get_block);
 }
 
@@ -1296,7 +1306,7 @@ void ext2_set_inode_flags(struct inode *inode)
 		inode->i_flags |= S_NOATIME;
 	if (flags & EXT2_DIRSYNC_FL)
 		inode->i_flags |= S_DIRSYNC;
-	if (test_opt(inode->i_sb, DAX))
+	if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode))
 		inode->i_flags |= S_DAX;
 }
 
@@ -1420,6 +1430,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino)
 				sizeof(ei->i_data) - 1);
 		} else {
 			inode->i_op = &ext2_symlink_inode_operations;
+			inode_nohighmem(inode);
 			if (test_opt(inode->i_sb, NOBH))
 				inode->i_mapping->a_ops = &ext2_nobh_aops;
 			else
diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c
index 5d46c0986..b386af2e4 100644
--- a/fs/ext2/ioctl.c
+++ b/fs/ext2/ioctl.c
@@ -51,10 +51,10 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 
 		flags = ext2_mask_flags(inode->i_mode, flags);
 
-		mutex_lock(&inode->i_mutex);
+		inode_lock(inode);
 		/* Is it quota file? Do not allow user to mess with it */
 		if (IS_NOQUOTA(inode)) {
-			mutex_unlock(&inode->i_mutex);
+			inode_unlock(inode);
 			ret = -EPERM;
 			goto setflags_out;
 		}
@@ -68,7 +68,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		 */
 		if ((flags ^ oldflags) & (EXT2_APPEND_FL | EXT2_IMMUTABLE_FL)) {
 			if (!capable(CAP_LINUX_IMMUTABLE)) {
-				mutex_unlock(&inode->i_mutex);
+				inode_unlock(inode);
 				ret = -EPERM;
 				goto setflags_out;
 			}
@@ -80,7 +80,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 
 		ext2_set_inode_flags(inode);
 		inode->i_ctime = CURRENT_TIME_SEC;
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 
 		mark_inode_dirty(inode);
 setflags_out:
@@ -102,10 +102,10 @@ setflags_out:
 			goto setversion_out;
 		}
 
-		mutex_lock(&inode->i_mutex);
+		inode_lock(inode);
 		inode->i_ctime = CURRENT_TIME_SEC;
 		inode->i_generation = generation;
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 
 		mark_inode_dirty(inode);
 setversion_out:
diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c
index 3267a80db..7a2be8f7f 100644
--- a/fs/ext2/namei.c
+++ b/fs/ext2/namei.c
@@ -183,6 +183,7 @@ static int ext2_symlink (struct inode * dir, struct dentry * dentry,
 	if (l > sizeof (EXT2_I(inode)->i_data)) {
 		/* slow symlink */
 		inode->i_op = &ext2_symlink_inode_operations;
+		inode_nohighmem(inode);
 		if (test_opt(inode->i_sb, NOBH))
 			inode->i_mapping->a_ops = &ext2_nobh_aops;
 		else
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 748d35afc..2a188413a 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -203,7 +203,7 @@ static int __init init_inodecache(void)
 	ext2_inode_cachep = kmem_cache_create("ext2_inode_cache",
 					     sizeof(struct ext2_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (ext2_inode_cachep == NULL)
 		return -ENOMEM;
diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c
index ae17179f3..3495d8ae4 100644
--- a/fs/ext2/symlink.c
+++ b/fs/ext2/symlink.c
@@ -22,8 +22,7 @@
 
 const struct inode_operations ext2_symlink_inode_operations = {
 	.readlink	= generic_readlink,
-	.follow_link	= page_follow_link_light,
-	.put_link	= page_put_link,
+	.get_link	= page_get_link,
 	.setattr	= ext2_setattr,
 #ifdef CONFIG_EXT2_FS_XATTR
 	.setxattr	= generic_setxattr,
@@ -35,7 +34,7 @@ const struct inode_operations ext2_symlink_inode_operations = {
  
 const struct inode_operations ext2_fast_symlink_inode_operations = {
 	.readlink	= generic_readlink,
-	.follow_link	= simple_follow_link,
+	.get_link	= simple_get_link,
 	.setattr	= ext2_setattr,
 #ifdef CONFIG_EXT2_FS_XATTR
 	.setxattr	= generic_setxattr,
diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c
index fa70848af..f57a7aba3 100644
--- a/fs/ext2/xattr.c
+++ b/fs/ext2/xattr.c
@@ -77,10 +77,8 @@
 		printk("\n"); \
 	} while (0)
 # define ea_bdebug(bh, f...) do { \
-		char b[BDEVNAME_SIZE]; \
-		printk(KERN_DEBUG "block %s:%lu: ", \
-			bdevname(bh->b_bdev, b), \
-			(unsigned long) bh->b_blocknr); \
+		printk(KERN_DEBUG "block %pg:%lu: ", \
+			bh->b_bdev, (unsigned long) bh->b_blocknr); \
 		printk(f); \
 		printk("\n"); \
 	} while (0)
@@ -292,16 +290,21 @@ bad_block:	ext2_error(inode->i_sb, "ext2_xattr_list",
 		const struct xattr_handler *handler =
 			ext2_xattr_handler(entry->e_name_index);
 
-		if (handler) {
-			size_t size = handler->list(handler, dentry, buffer,
-						    rest, entry->e_name,
-						    entry->e_name_len);
+		if (handler && (!handler->list || handler->list(dentry))) {
+			const char *prefix = handler->prefix ?: handler->name;
+			size_t prefix_len = strlen(prefix);
+			size_t size = prefix_len + entry->e_name_len + 1;
+
 			if (buffer) {
 				if (size > rest) {
 					error = -ERANGE;
 					goto cleanup;
 				}
-				buffer += size;
+				memcpy(buffer, prefix, prefix_len);
+				buffer += prefix_len;
+				memcpy(buffer, entry->e_name, entry->e_name_len);
+				buffer += entry->e_name_len;
+				*buffer++ = 0;
 			}
 			rest -= size;
 		}
diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c
index dfb087503..ba97f243b 100644
--- a/fs/ext2/xattr_security.c
+++ b/fs/ext2/xattr_security.c
@@ -7,29 +7,11 @@
 #include <linux/security.h>
 #include "xattr.h"
 
-static size_t
-ext2_xattr_security_list(const struct xattr_handler *handler,
-			 struct dentry *dentry, char *list, size_t list_size,
-			 const char *name, size_t name_len)
-{
-	const int prefix_len = XATTR_SECURITY_PREFIX_LEN;
-	const size_t total_len = prefix_len + name_len + 1;
-
-	if (list && total_len <= list_size) {
-		memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
-		memcpy(list+prefix_len, name, name_len);
-		list[prefix_len + name_len] = '\0';
-	}
-	return total_len;
-}
-
 static int
 ext2_xattr_security_get(const struct xattr_handler *handler,
 			struct dentry *dentry, const char *name,
 			void *buffer, size_t size)
 {
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
 	return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_SECURITY, name,
 			      buffer, size);
 }
@@ -39,8 +21,6 @@ ext2_xattr_security_set(const struct xattr_handler *handler,
 			struct dentry *dentry, const char *name,
 			const void *value, size_t size, int flags)
 {
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
 	return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_SECURITY, name,
 			      value, size, flags);
 }
@@ -71,7 +51,6 @@ ext2_init_security(struct inode *inode, struct inode *dir,
 
 const struct xattr_handler ext2_xattr_security_handler = {
 	.prefix	= XATTR_SECURITY_PREFIX,
-	.list	= ext2_xattr_security_list,
 	.get	= ext2_xattr_security_get,
 	.set	= ext2_xattr_security_set,
 };
diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c
index 3150dd3a7..2c94d1930 100644
--- a/fs/ext2/xattr_trusted.c
+++ b/fs/ext2/xattr_trusted.c
@@ -8,23 +8,10 @@
 #include "ext2.h"
 #include "xattr.h"
 
-static size_t
-ext2_xattr_trusted_list(const struct xattr_handler *handler,
-			struct dentry *dentry, char *list, size_t list_size,
-			const char *name, size_t name_len)
+static bool
+ext2_xattr_trusted_list(struct dentry *dentry)
 {
-	const int prefix_len = XATTR_TRUSTED_PREFIX_LEN;
-	const size_t total_len = prefix_len + name_len + 1;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return 0;
-
-	if (list && total_len <= list_size) {
-		memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
-		memcpy(list+prefix_len, name, name_len);
-		list[prefix_len + name_len] = '\0';
-	}
-	return total_len;
+	return capable(CAP_SYS_ADMIN);
 }
 
 static int
@@ -32,8 +19,6 @@ ext2_xattr_trusted_get(const struct xattr_handler *handler,
 		       struct dentry *dentry, const char *name,
 		       void *buffer, size_t size)
 {
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
 	return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_TRUSTED, name,
 			      buffer, size);
 }
@@ -43,8 +28,6 @@ ext2_xattr_trusted_set(const struct xattr_handler *handler,
 		       struct dentry *dentry, const char *name,
 		       const void *value, size_t size, int flags)
 {
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
 	return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_TRUSTED, name,
 			      value, size, flags);
 }
diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c
index 339a49bbb..72a2a96d6 100644
--- a/fs/ext2/xattr_user.c
+++ b/fs/ext2/xattr_user.c
@@ -10,23 +10,10 @@
 #include "ext2.h"
 #include "xattr.h"
 
-static size_t
-ext2_xattr_user_list(const struct xattr_handler *handler,
-		     struct dentry *dentry, char *list, size_t list_size,
-		     const char *name, size_t name_len)
+static bool
+ext2_xattr_user_list(struct dentry *dentry)
 {
-	const size_t prefix_len = XATTR_USER_PREFIX_LEN;
-	const size_t total_len = prefix_len + name_len + 1;
-
-	if (!test_opt(dentry->d_sb, XATTR_USER))
-		return 0;
-
-	if (list && total_len <= list_size) {
-		memcpy(list, XATTR_USER_PREFIX, prefix_len);
-		memcpy(list+prefix_len, name, name_len);
-		list[prefix_len + name_len] = '\0';
-	}
-	return total_len;
+	return test_opt(dentry->d_sb, XATTR_USER);
 }
 
 static int
@@ -34,8 +21,6 @@ ext2_xattr_user_get(const struct xattr_handler *handler,
 		    struct dentry *dentry, const char *name,
 		    void *buffer, size_t size)
 {
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
 	if (!test_opt(dentry->d_sb, XATTR_USER))
 		return -EOPNOTSUPP;
 	return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_USER,
@@ -47,8 +32,6 @@ ext2_xattr_user_set(const struct xattr_handler *handler,
 		    struct dentry *dentry, const char *name,
 		    const void *value, size_t size, int flags)
 {
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
 	if (!test_opt(dentry->d_sb, XATTR_USER))
 		return -EOPNOTSUPP;
 
diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c
index 1a0835073..38f756248 100644
--- a/fs/ext4/crypto.c
+++ b/fs/ext4/crypto.c
@@ -384,14 +384,12 @@ int ext4_decrypt(struct page *page)
 				EXT4_DECRYPT, page->index, page, page);
 }
 
-int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex)
+int ext4_encrypted_zeroout(struct inode *inode, ext4_lblk_t lblk,
+			   ext4_fsblk_t pblk, ext4_lblk_t len)
 {
 	struct ext4_crypto_ctx	*ctx;
 	struct page		*ciphertext_page = NULL;
 	struct bio		*bio;
-	ext4_lblk_t		lblk = le32_to_cpu(ex->ee_block);
-	ext4_fsblk_t		pblk = ext4_ext_pblock(ex);
-	unsigned int		len = ext4_ext_get_actual_len(ex);
 	int			ret, err = 0;
 
 #if 0
@@ -469,3 +467,59 @@ uint32_t ext4_validate_encryption_key_size(uint32_t mode, uint32_t size)
 		return size;
 	return 0;
 }
+
+/*
+ * Validate dentries for encrypted directories so that we don't keep using
+ * stale data after a key has been added or removed.  Returns 1 when the
+ * dentry is still usable, 0 to fail validation (also if dir is unencrypted).
+ */
+static int ext4_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+	struct inode *dir = d_inode(dentry->d_parent);
+	struct ext4_crypt_info *ci = EXT4_I(dir)->i_crypt_info;
+	int dir_has_key, cached_with_key;
+
+	if (!ext4_encrypted_inode(dir))
+		return 0;
+
+	if (ci && ci->ci_keyring_key &&
+	    (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) |
+					  (1 << KEY_FLAG_REVOKED) |
+					  (1 << KEY_FLAG_DEAD))))
+		ci = NULL;
+
+	/* this should eventually be a flag in d_flags */
+	cached_with_key = dentry->d_fsdata != NULL;
+	dir_has_key = (ci != NULL);
+
+	/*
+	 * If the dentry was cached without the key, and it is a
+	 * negative dentry, it might be a valid name.  We can't check
+	 * if the key has since been made available due to locking
+	 * reasons, so we fail the validation so ext4_lookup() can do
+	 * this check.
+	 *
+	 * We also fail the validation if the dentry was created with
+	 * the key present, but we no longer have the key, or vice versa.
+	 */
+	if ((!cached_with_key && d_is_negative(dentry)) ||
+	    (!cached_with_key && dir_has_key) ||
+	    (cached_with_key && !dir_has_key)) {
+#if 0				/* Revalidation debug */
+		char buf[80];
+		char *cp = simple_dname(dentry, buf, sizeof(buf));
+
+		if (IS_ERR(cp))
+			cp = (char *) "???";
+		pr_err("revalidate: %s %p %d %d %d\n", cp, dentry->d_fsdata,
+		       cached_with_key, d_is_negative(dentry),
+		       dir_has_key);
+#endif
+		return 0;
+	}
+	return 1;
+}
+
+const struct dentry_operations ext4_encrypted_d_ops = {
+	.d_revalidate = ext4_d_revalidate,
+};
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 1d1bca74f..33f5e2a50 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -111,6 +111,12 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
 	int dir_has_error = 0;
 	struct ext4_str fname_crypto_str = {.name = NULL, .len = 0};
 
+	if (ext4_encrypted_inode(inode)) {
+		err = ext4_get_encryption_info(inode);
+		if (err && err != -ENOKEY)
+			return err;
+	}
+
 	if (is_dx_dir(inode)) {
 		err = ext4_dx_readdir(file, ctx);
 		if (err != ERR_BAD_DX_DIR) {
@@ -157,8 +163,11 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx)
 					index, 1);
 			file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT;
 			bh = ext4_bread(NULL, inode, map.m_lblk, 0);
-			if (IS_ERR(bh))
-				return PTR_ERR(bh);
+			if (IS_ERR(bh)) {
+				err = PTR_ERR(bh);
+				bh = NULL;
+				goto errout;
+			}
 		}
 
 		if (!bh) {
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index cc7ca4e87..157b458a6 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -378,14 +378,22 @@ struct flex_groups {
 #define EXT4_PROJINHERIT_FL		0x20000000 /* Create with parents projid */
 #define EXT4_RESERVED_FL		0x80000000 /* reserved for ext4 lib */
 
-#define EXT4_FL_USER_VISIBLE		0x004BDFFF /* User visible flags */
-#define EXT4_FL_USER_MODIFIABLE		0x004380FF /* User modifiable flags */
+#define EXT4_FL_USER_VISIBLE		0x304BDFFF /* User visible flags */
+#define EXT4_FL_USER_MODIFIABLE		0x204380FF /* User modifiable flags */
+
+#define EXT4_FL_XFLAG_VISIBLE		(EXT4_SYNC_FL | \
+					 EXT4_IMMUTABLE_FL | \
+					 EXT4_APPEND_FL | \
+					 EXT4_NODUMP_FL | \
+					 EXT4_NOATIME_FL | \
+					 EXT4_PROJINHERIT_FL)
 
 /* Flags that should be inherited by new inodes from their parent. */
 #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
 			   EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
 			   EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
-			   EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL)
+			   EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\
+			   EXT4_PROJINHERIT_FL)
 
 /* Flags that are appropriate for regular files (all but dir-specific ones). */
 #define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL))
@@ -555,10 +563,12 @@ enum {
 #define EXT4_GET_BLOCKS_NO_NORMALIZE		0x0040
 	/* Request will not result in inode size update (user for fallocate) */
 #define EXT4_GET_BLOCKS_KEEP_SIZE		0x0080
-	/* Do not take i_data_sem locking in ext4_map_blocks */
-#define EXT4_GET_BLOCKS_NO_LOCK			0x0100
 	/* Convert written extents to unwritten */
-#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN	0x0200
+#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN	0x0100
+	/* Write zeros to newly created written extents */
+#define EXT4_GET_BLOCKS_ZERO			0x0200
+#define EXT4_GET_BLOCKS_CREATE_ZERO		(EXT4_GET_BLOCKS_CREATE |\
+					EXT4_GET_BLOCKS_ZERO)
 
 /*
  * The bit position of these flags must not overlap with any of the
@@ -616,6 +626,46 @@ enum {
 #define EXT4_IOC_GET_ENCRYPTION_PWSALT	_IOW('f', 20, __u8[16])
 #define EXT4_IOC_GET_ENCRYPTION_POLICY	_IOW('f', 21, struct ext4_encryption_policy)
 
+#ifndef FS_IOC_FSGETXATTR
+/* Until the uapi changes get merged for project quota... */
+
+#define FS_IOC_FSGETXATTR		_IOR('X', 31, struct fsxattr)
+#define FS_IOC_FSSETXATTR		_IOW('X', 32, struct fsxattr)
+
+/*
+ * Structure for FS_IOC_FSGETXATTR and FS_IOC_FSSETXATTR.
+ */
+struct fsxattr {
+	__u32		fsx_xflags;	/* xflags field value (get/set) */
+	__u32		fsx_extsize;	/* extsize field value (get/set)*/
+	__u32		fsx_nextents;	/* nextents field value (get)	*/
+	__u32		fsx_projid;	/* project identifier (get/set) */
+	unsigned char	fsx_pad[12];
+};
+
+/*
+ * Flags for the fsx_xflags field
+ */
+#define FS_XFLAG_REALTIME	0x00000001	/* data in realtime volume */
+#define FS_XFLAG_PREALLOC	0x00000002	/* preallocated file extents */
+#define FS_XFLAG_IMMUTABLE	0x00000008	/* file cannot be modified */
+#define FS_XFLAG_APPEND		0x00000010	/* all writes append */
+#define FS_XFLAG_SYNC		0x00000020	/* all writes synchronous */
+#define FS_XFLAG_NOATIME	0x00000040	/* do not update access time */
+#define FS_XFLAG_NODUMP		0x00000080	/* do not include in backups */
+#define FS_XFLAG_RTINHERIT	0x00000100	/* create with rt bit set */
+#define FS_XFLAG_PROJINHERIT	0x00000200	/* create with parents projid */
+#define FS_XFLAG_NOSYMLINKS	0x00000400	/* disallow symlink creation */
+#define FS_XFLAG_EXTSIZE	0x00000800	/* extent size allocator hint */
+#define FS_XFLAG_EXTSZINHERIT	0x00001000	/* inherit inode extent size */
+#define FS_XFLAG_NODEFRAG	0x00002000  	/* do not defragment */
+#define FS_XFLAG_FILESTREAM	0x00004000	/* use filestream allocator */
+#define FS_XFLAG_HASATTR	0x80000000	/* no DIFLAG for this */
+#endif /* !defined(FS_IOC_FSGETXATTR) */
+
+#define EXT4_IOC_FSGETXATTR		FS_IOC_FSGETXATTR
+#define EXT4_IOC_FSSETXATTR		FS_IOC_FSSETXATTR
+
 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /*
  * ioctl commands in 32 bit emulation
@@ -910,6 +960,15 @@ struct ext4_inode_info {
 	 * by other means, so we have i_data_sem.
 	 */
 	struct rw_semaphore i_data_sem;
+	/*
+	 * i_mmap_sem is for serializing page faults with truncate / punch hole
+	 * operations. We have to make sure that new page cannot be faulted in
+	 * a section of the inode that is being punched. We cannot easily use
+	 * i_data_sem for this since we need protection for the whole punch
+	 * operation and i_data_sem ranks below transaction start so we have
+	 * to occasionally drop it.
+	 */
+	struct rw_semaphore i_mmap_sem;
 	struct inode vfs_inode;
 	struct jbd2_inode *jinode;
 
@@ -993,6 +1052,7 @@ struct ext4_inode_info {
 	/* Encryption params */
 	struct ext4_crypt_info *i_crypt_info;
 #endif
+	kprojid_t i_projid;
 };
 
 /*
@@ -1248,7 +1308,7 @@ struct ext4_super_block {
 #endif
 
 /* Number of quota types we support */
-#define EXT4_MAXQUOTAS 2
+#define EXT4_MAXQUOTAS 3
 
 /*
  * fourth extended-fs super-block data in memory
@@ -1754,7 +1814,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(encrypt,		ENCRYPT)
 					 EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\
 					 EXT4_FEATURE_RO_COMPAT_BIGALLOC |\
 					 EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\
-					 EXT4_FEATURE_RO_COMPAT_QUOTA)
+					 EXT4_FEATURE_RO_COMPAT_QUOTA |\
+					 EXT4_FEATURE_RO_COMPAT_PROJECT)
 
 #define EXTN_FEATURE_FUNCS(ver) \
 static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \
@@ -1796,6 +1857,11 @@ static inline bool ext4_has_incompat_features(struct super_block *sb)
 #define	EXT4_DEF_RESUID		0
 #define	EXT4_DEF_RESGID		0
 
+/*
+ * Default project ID
+ */
+#define	EXT4_DEF_PROJID		0
+
 #define EXT4_DEF_INODE_READAHEAD_BLKS	32
 
 /*
@@ -2234,7 +2300,9 @@ void ext4_restore_control_page(struct page *data_page);
 struct page *ext4_encrypt(struct inode *inode,
 			  struct page *plaintext_page);
 int ext4_decrypt(struct page *page);
-int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex);
+int ext4_encrypted_zeroout(struct inode *inode, ext4_lblk_t lblk,
+			   ext4_fsblk_t pblk, ext4_lblk_t len);
+extern const struct dentry_operations ext4_encrypted_d_ops;
 
 #ifdef CONFIG_EXT4_FS_ENCRYPTION
 int ext4_init_crypto(void);
@@ -2440,8 +2508,8 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
 struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
 int ext4_get_block_write(struct inode *inode, sector_t iblock,
 			 struct buffer_head *bh_result, int create);
-int ext4_get_block_dax(struct inode *inode, sector_t iblock,
-			 struct buffer_head *bh_result, int create);
+int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
+			    struct buffer_head *bh_result, int create);
 int ext4_get_block(struct inode *inode, sector_t iblock,
 				struct buffer_head *bh_result, int create);
 int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
@@ -2484,9 +2552,13 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
 extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
 			     loff_t lstart, loff_t lend);
 extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
+extern int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf);
 extern qsize_t *ext4_get_reserved_space(struct inode *inode);
+extern int ext4_get_projid(struct inode *inode, kprojid_t *projid);
 extern void ext4_da_update_reserve_space(struct inode *inode,
 					int used, int quota_claim);
+extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk,
+			      ext4_fsblk_t pblk, ext4_lblk_t len);
 
 /* indirect.c */
 extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
@@ -2825,7 +2897,7 @@ do {								\
 static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize)
 {
 	WARN_ON_ONCE(S_ISREG(inode->i_mode) &&
-		     !mutex_is_locked(&inode->i_mutex));
+		     !inode_is_locked(inode));
 	down_write(&EXT4_I(inode)->i_data_sem);
 	if (newsize > EXT4_I(inode)->i_disksize)
 		EXT4_I(inode)->i_disksize = newsize;
@@ -2848,6 +2920,9 @@ static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize)
 	return changed;
 }
 
+int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
+				      loff_t len);
+
 struct ext4_group_info {
 	unsigned long   bb_state;
 	struct rb_root  bb_free_root;
@@ -2986,8 +3061,7 @@ extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos,
 					 struct page *page);
 extern int ext4_try_add_inline_entry(handle_t *handle,
 				     struct ext4_filename *fname,
-				     struct dentry *dentry,
-				     struct inode *inode);
+				     struct inode *dir, struct inode *inode);
 extern int ext4_try_create_inline_dir(handle_t *handle,
 				      struct inode *parent,
 				      struct inode *inode);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 551353b1b..3753ceb0b 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3119,19 +3119,11 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex)
 {
 	ext4_fsblk_t ee_pblock;
 	unsigned int ee_len;
-	int ret;
 
 	ee_len    = ext4_ext_get_actual_len(ex);
 	ee_pblock = ext4_ext_pblock(ex);
-
-	if (ext4_encrypted_inode(inode))
-		return ext4_encrypted_zeroout(inode, ex);
-
-	ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS);
-	if (ret > 0)
-		ret = 0;
-
-	return ret;
+	return ext4_issue_zeroout(inode, le32_to_cpu(ex->ee_block), ee_pblock,
+				  ee_len);
 }
 
 /*
@@ -3936,7 +3928,7 @@ static int
 convert_initialized_extent(handle_t *handle, struct inode *inode,
 			   struct ext4_map_blocks *map,
 			   struct ext4_ext_path **ppath, int flags,
-			   unsigned int allocated, ext4_fsblk_t newblock)
+			   unsigned int allocated)
 {
 	struct ext4_ext_path *path = *ppath;
 	struct ext4_extent *ex;
@@ -4052,6 +4044,14 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode,
 	}
 	/* IO end_io complete, convert the filled extent to written */
 	if (flags & EXT4_GET_BLOCKS_CONVERT) {
+		if (flags & EXT4_GET_BLOCKS_ZERO) {
+			if (allocated > map->m_len)
+				allocated = map->m_len;
+			err = ext4_issue_zeroout(inode, map->m_lblk, newblock,
+						 allocated);
+			if (err < 0)
+				goto out2;
+		}
 		ret = ext4_convert_unwritten_extents_endio(handle, inode, map,
 							   ppath);
 		if (ret >= 0) {
@@ -4347,7 +4347,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 			    (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) {
 				allocated = convert_initialized_extent(
 						handle, inode, map, &path,
-						flags, allocated, newblock);
+						flags, allocated);
 				goto out2;
 			} else if (!ext4_ext_is_unwritten(ex))
 				goto out;
@@ -4685,10 +4685,6 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
 	if (len <= EXT_UNWRITTEN_MAX_LEN)
 		flags |= EXT4_GET_BLOCKS_NO_NORMALIZE;
 
-	/* Wait all existing dio workers, newcomers will block on i_mutex */
-	ext4_inode_block_unlocked_dio(inode);
-	inode_dio_wait(inode);
-
 	/*
 	 * credits to insert 1 extent into extent tree
 	 */
@@ -4752,8 +4748,6 @@ retry:
 		goto retry;
 	}
 
-	ext4_inode_resume_unlocked_dio(inode);
-
 	return ret > 0 ? ret2 : ret;
 }
 
@@ -4770,7 +4764,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
 	int partial_begin, partial_end;
 	loff_t start, end;
 	ext4_lblk_t lblk;
-	struct address_space *mapping = inode->i_mapping;
 	unsigned int blkbits = inode->i_blkbits;
 
 	trace_ext4_zero_range(inode, offset, len, mode);
@@ -4786,17 +4779,6 @@ static long ext4_zero_range(struct file *file, loff_t offset,
 	}
 
 	/*
-	 * Write out all dirty pages to avoid race conditions
-	 * Then release them.
-	 */
-	if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
-		ret = filemap_write_and_wait_range(mapping, offset,
-						   offset + len - 1);
-		if (ret)
-			return ret;
-	}
-
-	/*
 	 * Round up offset. This is not fallocate, we neet to zero out
 	 * blocks, so convert interior block aligned part of the range to
 	 * unwritten and possibly manually zero out unaligned parts of the
@@ -4817,7 +4799,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
 	else
 		max_blocks -= lblk;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	/*
 	 * Indirect files do not support unwritten extnets
@@ -4839,6 +4821,10 @@ static long ext4_zero_range(struct file *file, loff_t offset,
 	if (mode & FALLOC_FL_KEEP_SIZE)
 		flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
 
+	/* Wait all existing dio workers, newcomers will block on i_mutex */
+	ext4_inode_block_unlocked_dio(inode);
+	inode_dio_wait(inode);
+
 	/* Preallocate the range including the unaligned edges */
 	if (partial_begin || partial_end) {
 		ret = ext4_alloc_file_blocks(file,
@@ -4847,7 +4833,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
 				 round_down(offset, 1 << blkbits)) >> blkbits,
 				new_size, flags, mode);
 		if (ret)
-			goto out_mutex;
+			goto out_dio;
 
 	}
 
@@ -4856,16 +4842,23 @@ static long ext4_zero_range(struct file *file, loff_t offset,
 		flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN |
 			  EXT4_EX_NOCACHE);
 
-		/* Now release the pages and zero block aligned part of pages*/
+		/*
+		 * Prevent page faults from reinstantiating pages we have
+		 * released from page cache.
+		 */
+		down_write(&EXT4_I(inode)->i_mmap_sem);
+		ret = ext4_update_disksize_before_punch(inode, offset, len);
+		if (ret) {
+			up_write(&EXT4_I(inode)->i_mmap_sem);
+			goto out_dio;
+		}
+		/* Now release the pages and zero block aligned part of pages */
 		truncate_pagecache_range(inode, start, end - 1);
 		inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
 
-		/* Wait all existing dio workers, newcomers will block on i_mutex */
-		ext4_inode_block_unlocked_dio(inode);
-		inode_dio_wait(inode);
-
 		ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
 					     flags, mode);
+		up_write(&EXT4_I(inode)->i_mmap_sem);
 		if (ret)
 			goto out_dio;
 	}
@@ -4909,7 +4902,7 @@ static long ext4_zero_range(struct file *file, loff_t offset,
 out_dio:
 	ext4_inode_resume_unlocked_dio(inode);
 out_mutex:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	return ret;
 }
 
@@ -4980,7 +4973,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	if (mode & FALLOC_FL_KEEP_SIZE)
 		flags |= EXT4_GET_BLOCKS_KEEP_SIZE;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	/*
 	 * We only support preallocation for extent-based files only
@@ -4998,8 +4991,13 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 			goto out;
 	}
 
+	/* Wait all existing dio workers, newcomers will block on i_mutex */
+	ext4_inode_block_unlocked_dio(inode);
+	inode_dio_wait(inode);
+
 	ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size,
 				     flags, mode);
+	ext4_inode_resume_unlocked_dio(inode);
 	if (ret)
 		goto out;
 
@@ -5008,7 +5006,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 						EXT4_I(inode)->i_sync_tid);
 	}
 out:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	trace_ext4_fallocate_exit(inode, offset, max_blocks, ret);
 	return ret;
 }
@@ -5494,21 +5492,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
 			return ret;
 	}
 
-	/*
-	 * Need to round down offset to be aligned with page size boundary
-	 * for page size > block size.
-	 */
-	ioffset = round_down(offset, PAGE_SIZE);
-
-	/* Write out all dirty pages */
-	ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
-					   LLONG_MAX);
-	if (ret)
-		return ret;
-
-	/* Take mutex lock */
-	mutex_lock(&inode->i_mutex);
-
+	inode_lock(inode);
 	/*
 	 * There is no need to overlap collapse range with EOF, in which case
 	 * it is effectively a truncate operation
@@ -5524,17 +5508,43 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
 		goto out_mutex;
 	}
 
-	truncate_pagecache(inode, ioffset);
-
 	/* Wait for existing dio to complete */
 	ext4_inode_block_unlocked_dio(inode);
 	inode_dio_wait(inode);
 
+	/*
+	 * Prevent page faults from reinstantiating pages we have released from
+	 * page cache.
+	 */
+	down_write(&EXT4_I(inode)->i_mmap_sem);
+	/*
+	 * Need to round down offset to be aligned with page size boundary
+	 * for page size > block size.
+	 */
+	ioffset = round_down(offset, PAGE_SIZE);
+	/*
+	 * Write tail of the last page before removed range since it will get
+	 * removed from the page cache below.
+	 */
+	ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset);
+	if (ret)
+		goto out_mmap;
+	/*
+	 * Write data that will be shifted to preserve them when discarding
+	 * page cache below. We are also protected from pages becoming dirty
+	 * by i_mmap_sem.
+	 */
+	ret = filemap_write_and_wait_range(inode->i_mapping, offset + len,
+					   LLONG_MAX);
+	if (ret)
+		goto out_mmap;
+	truncate_pagecache(inode, ioffset);
+
 	credits = ext4_writepage_trans_blocks(inode);
 	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
-		goto out_dio;
+		goto out_mmap;
 	}
 
 	down_write(&EXT4_I(inode)->i_data_sem);
@@ -5573,10 +5583,11 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
 
 out_stop:
 	ext4_journal_stop(handle);
-out_dio:
+out_mmap:
+	up_write(&EXT4_I(inode)->i_mmap_sem);
 	ext4_inode_resume_unlocked_dio(inode);
 out_mutex:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	return ret;
 }
 
@@ -5627,21 +5638,7 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
 			return ret;
 	}
 
-	/*
-	 * Need to round down to align start offset to page size boundary
-	 * for page size > block size.
-	 */
-	ioffset = round_down(offset, PAGE_SIZE);
-
-	/* Write out all dirty pages */
-	ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
-			LLONG_MAX);
-	if (ret)
-		return ret;
-
-	/* Take mutex lock */
-	mutex_lock(&inode->i_mutex);
-
+	inode_lock(inode);
 	/* Currently just for extent based files */
 	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
 		ret = -EOPNOTSUPP;
@@ -5660,17 +5657,32 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
 		goto out_mutex;
 	}
 
-	truncate_pagecache(inode, ioffset);
-
 	/* Wait for existing dio to complete */
 	ext4_inode_block_unlocked_dio(inode);
 	inode_dio_wait(inode);
 
+	/*
+	 * Prevent page faults from reinstantiating pages we have released from
+	 * page cache.
+	 */
+	down_write(&EXT4_I(inode)->i_mmap_sem);
+	/*
+	 * Need to round down to align start offset to page size boundary
+	 * for page size > block size.
+	 */
+	ioffset = round_down(offset, PAGE_SIZE);
+	/* Write out all dirty pages */
+	ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
+			LLONG_MAX);
+	if (ret)
+		goto out_mmap;
+	truncate_pagecache(inode, ioffset);
+
 	credits = ext4_writepage_trans_blocks(inode);
 	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits);
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
-		goto out_dio;
+		goto out_mmap;
 	}
 
 	/* Expand file to avoid data loss if there is error while shifting */
@@ -5741,10 +5753,11 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
 
 out_stop:
 	ext4_journal_stop(handle);
-out_dio:
+out_mmap:
+	up_write(&EXT4_I(inode)->i_mmap_sem);
 	ext4_inode_resume_unlocked_dio(inode);
 out_mutex:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	return ret;
 }
 
@@ -5779,8 +5792,8 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
 
 	BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem));
 	BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem));
-	BUG_ON(!mutex_is_locked(&inode1->i_mutex));
-	BUG_ON(!mutex_is_locked(&inode2->i_mutex));
+	BUG_ON(!inode_is_locked(inode1));
+	BUG_ON(!inode_is_locked(inode2));
 
 	*erp = ext4_es_remove_extent(inode1, lblk1, count);
 	if (unlikely(*erp))
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 113837e7b..4cd318f31 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -113,7 +113,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		ext4_unwritten_wait(inode);
 	}
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 	ret = generic_write_checks(iocb, from);
 	if (ret <= 0)
 		goto out;
@@ -169,7 +169,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	}
 
 	ret = __generic_file_write_iter(iocb, from);
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 
 	if (ret > 0) {
 		ssize_t err;
@@ -186,50 +186,42 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	return ret;
 
 out:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	if (aio_mutex)
 		mutex_unlock(aio_mutex);
 	return ret;
 }
 
 #ifdef CONFIG_FS_DAX
-static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate)
-{
-	struct inode *inode = bh->b_assoc_map->host;
-	/* XXX: breaks on 32-bit > 16TB. Is that even supported? */
-	loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits;
-	int err;
-	if (!uptodate)
-		return;
-	WARN_ON(!buffer_unwritten(bh));
-	err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size);
-}
-
 static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	int result;
 	handle_t *handle = NULL;
-	struct super_block *sb = file_inode(vma->vm_file)->i_sb;
+	struct inode *inode = file_inode(vma->vm_file);
+	struct super_block *sb = inode->i_sb;
 	bool write = vmf->flags & FAULT_FLAG_WRITE;
 
 	if (write) {
 		sb_start_pagefault(sb);
 		file_update_time(vma->vm_file);
+		down_read(&EXT4_I(inode)->i_mmap_sem);
 		handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
 						EXT4_DATA_TRANS_BLOCKS(sb));
-	}
+	} else
+		down_read(&EXT4_I(inode)->i_mmap_sem);
 
 	if (IS_ERR(handle))
 		result = VM_FAULT_SIGBUS;
 	else
-		result = __dax_fault(vma, vmf, ext4_get_block_dax,
-						ext4_end_io_unwritten);
+		result = __dax_fault(vma, vmf, ext4_dax_mmap_get_block, NULL);
 
 	if (write) {
 		if (!IS_ERR(handle))
 			ext4_journal_stop(handle);
+		up_read(&EXT4_I(inode)->i_mmap_sem);
 		sb_end_pagefault(sb);
-	}
+	} else
+		up_read(&EXT4_I(inode)->i_mmap_sem);
 
 	return result;
 }
@@ -246,44 +238,73 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
 	if (write) {
 		sb_start_pagefault(sb);
 		file_update_time(vma->vm_file);
+		down_read(&EXT4_I(inode)->i_mmap_sem);
 		handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
 				ext4_chunk_trans_blocks(inode,
 							PMD_SIZE / PAGE_SIZE));
-	}
+	} else
+		down_read(&EXT4_I(inode)->i_mmap_sem);
 
 	if (IS_ERR(handle))
 		result = VM_FAULT_SIGBUS;
 	else
 		result = __dax_pmd_fault(vma, addr, pmd, flags,
-				ext4_get_block_dax, ext4_end_io_unwritten);
+				ext4_dax_mmap_get_block, NULL);
 
 	if (write) {
 		if (!IS_ERR(handle))
 			ext4_journal_stop(handle);
+		up_read(&EXT4_I(inode)->i_mmap_sem);
 		sb_end_pagefault(sb);
-	}
+	} else
+		up_read(&EXT4_I(inode)->i_mmap_sem);
 
 	return result;
 }
 
-static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+/*
+ * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault()
+ * handler we check for races against truncate. Note that since we cycle
+ * through i_mmap_sem, we are sure that also any hole punching that began
+ * before we were called is finished by now and so if it included part of the
+ * file we are working on, our pte will get unmapped and the check for
+ * pte_same() in wp_pfn_shared() fails. Thus fault gets retried and things
+ * work out as desired.
+ */
+static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma,
+				struct vm_fault *vmf)
 {
-	return dax_mkwrite(vma, vmf, ext4_get_block_dax,
-				ext4_end_io_unwritten);
+	struct inode *inode = file_inode(vma->vm_file);
+	struct super_block *sb = inode->i_sb;
+	loff_t size;
+	int ret;
+
+	sb_start_pagefault(sb);
+	file_update_time(vma->vm_file);
+	down_read(&EXT4_I(inode)->i_mmap_sem);
+	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	if (vmf->pgoff >= size)
+		ret = VM_FAULT_SIGBUS;
+	else
+		ret = dax_pfn_mkwrite(vma, vmf);
+	up_read(&EXT4_I(inode)->i_mmap_sem);
+	sb_end_pagefault(sb);
+
+	return ret;
 }
 
 static const struct vm_operations_struct ext4_dax_vm_ops = {
 	.fault		= ext4_dax_fault,
 	.pmd_fault	= ext4_dax_pmd_fault,
-	.page_mkwrite	= ext4_dax_mkwrite,
-	.pfn_mkwrite	= dax_pfn_mkwrite,
+	.page_mkwrite	= ext4_dax_fault,
+	.pfn_mkwrite	= ext4_dax_pfn_mkwrite,
 };
 #else
 #define ext4_dax_vm_ops	ext4_file_vm_ops
 #endif
 
 static const struct vm_operations_struct ext4_file_vm_ops = {
-	.fault		= filemap_fault,
+	.fault		= ext4_filemap_fault,
 	.map_pages	= filemap_map_pages,
 	.page_mkwrite   = ext4_page_mkwrite,
 };
@@ -314,6 +335,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
 	struct super_block *sb = inode->i_sb;
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
 	struct vfsmount *mnt = filp->f_path.mnt;
+	struct inode *dir = filp->f_path.dentry->d_parent->d_inode;
 	struct path path;
 	char buf[64], *cp;
 	int ret;
@@ -357,6 +379,14 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
 		if (ext4_encryption_info(inode) == NULL)
 			return -ENOKEY;
 	}
+	if (ext4_encrypted_inode(dir) &&
+	    !ext4_is_child_context_consistent_with_parent(dir, inode)) {
+		ext4_warning(inode->i_sb,
+			     "Inconsistent encryption contexts: %lu/%lu\n",
+			     (unsigned long) dir->i_ino,
+			     (unsigned long) inode->i_ino);
+		return -EPERM;
+	}
 	/*
 	 * Set up the jbd2_inode if we are opening the inode for
 	 * writing and the journal is present
@@ -527,11 +557,11 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
 	int blkbits;
 	int ret = 0;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	isize = i_size_read(inode);
 	if (offset >= isize) {
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 		return -ENXIO;
 	}
 
@@ -579,7 +609,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
 		dataoff = (loff_t)last << blkbits;
 	} while (last <= end);
 
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 
 	if (dataoff > isize)
 		return -ENXIO;
@@ -600,11 +630,11 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
 	int blkbits;
 	int ret = 0;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	isize = i_size_read(inode);
 	if (offset >= isize) {
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 		return -ENXIO;
 	}
 
@@ -655,7 +685,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
 		break;
 	} while (last <= end);
 
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 
 	if (holeoff > isize)
 		holeoff = isize;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 53f2b98a6..acc0ad56b 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -801,6 +801,13 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir,
 		inode->i_gid = dir->i_gid;
 	} else
 		inode_init_owner(inode, dir, mode);
+
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT) &&
+	    ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT))
+		ei->i_projid = EXT4_I(dir)->i_projid;
+	else
+		ei->i_projid = make_kprojid(&init_user_ns, EXT4_DEF_PROJID);
+
 	err = dquot_initialize(inode);
 	if (err)
 		goto out;
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index d884989cc..dfe3b9baf 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -995,12 +995,11 @@ void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh,
  */
 static int ext4_add_dirent_to_inline(handle_t *handle,
 				     struct ext4_filename *fname,
-				     struct dentry *dentry,
+				     struct inode *dir,
 				     struct inode *inode,
 				     struct ext4_iloc *iloc,
 				     void *inline_start, int inline_size)
 {
-	struct inode	*dir = d_inode(dentry->d_parent);
 	int		err;
 	struct ext4_dir_entry_2 *de;
 
@@ -1245,12 +1244,11 @@ out:
  * the new created block.
  */
 int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname,
-			      struct dentry *dentry, struct inode *inode)
+			      struct inode *dir, struct inode *inode)
 {
 	int ret, inline_size;
 	void *inline_start;
 	struct ext4_iloc iloc;
-	struct inode *dir = d_inode(dentry->d_parent);
 
 	ret = ext4_get_inode_loc(dir, &iloc);
 	if (ret)
@@ -1264,7 +1262,7 @@ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname,
 						 EXT4_INLINE_DOTDOT_SIZE;
 	inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE;
 
-	ret = ext4_add_dirent_to_inline(handle, fname, dentry, inode, &iloc,
+	ret = ext4_add_dirent_to_inline(handle, fname, dir, inode, &iloc,
 					inline_start, inline_size);
 	if (ret != -ENOSPC)
 		goto out;
@@ -1285,7 +1283,7 @@ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname,
 	if (inline_size) {
 		inline_start = ext4_get_inline_xattr_pos(dir, &iloc);
 
-		ret = ext4_add_dirent_to_inline(handle, fname, dentry,
+		ret = ext4_add_dirent_to_inline(handle, fname, dir,
 						inode, &iloc, inline_start,
 						inline_size);
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 06bda0361..aee960b1a 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -383,6 +383,21 @@ static int __check_block_validity(struct inode *inode, const char *func,
 	return 0;
 }
 
+/* Zero @len blocks at @pblk; @lblk is only used on the encrypted path. */
+int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk,
+		       ext4_lblk_t len)
+{
+	int err;
+
+	/* Encrypted inodes are handed to the encryption-aware helper. */
+	if (ext4_encrypted_inode(inode))
+		return ext4_encrypted_zeroout(inode, lblk, pblk, len);
+
+	/* Positive results are folded into 0; negative ones pass through. */
+	err = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS);
+	return err > 0 ? 0 : err;
+}
+
 #define check_block_validity(inode, map)	\
 	__check_block_validity((inode), __func__, __LINE__, (map))
 
@@ -403,8 +418,7 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
 	 * out taking i_data_sem.  So at the time the unwritten extent
 	 * could be converted.
 	 */
-	if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
-		down_read(&EXT4_I(inode)->i_data_sem);
+	down_read(&EXT4_I(inode)->i_data_sem);
 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
 		retval = ext4_ext_map_blocks(handle, inode, map, flags &
 					     EXT4_GET_BLOCKS_KEEP_SIZE);
@@ -412,8 +426,7 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
 		retval = ext4_ind_map_blocks(handle, inode, map, flags &
 					     EXT4_GET_BLOCKS_KEEP_SIZE);
 	}
-	if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
-		up_read((&EXT4_I(inode)->i_data_sem));
+	up_read((&EXT4_I(inode)->i_data_sem));
 
 	/*
 	 * We don't check m_len because extent will be collpased in status
@@ -509,8 +522,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 	 * Try to see if we can get the block without requesting a new
 	 * file system block.
 	 */
-	if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
-		down_read(&EXT4_I(inode)->i_data_sem);
+	down_read(&EXT4_I(inode)->i_data_sem);
 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
 		retval = ext4_ext_map_blocks(handle, inode, map, flags &
 					     EXT4_GET_BLOCKS_KEEP_SIZE);
@@ -541,8 +553,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 		if (ret < 0)
 			retval = ret;
 	}
-	if (!(flags & EXT4_GET_BLOCKS_NO_LOCK))
-		up_read((&EXT4_I(inode)->i_data_sem));
+	up_read((&EXT4_I(inode)->i_data_sem));
 
 found:
 	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
@@ -626,13 +637,29 @@ found:
 		}
 
 		/*
+		 * We have to zeroout blocks before inserting them into extent
+		 * status tree. Otherwise someone could look them up there and
+		 * use them before they are really zeroed.
+		 */
+		if (flags & EXT4_GET_BLOCKS_ZERO &&
+		    map->m_flags & EXT4_MAP_MAPPED &&
+		    map->m_flags & EXT4_MAP_NEW) {
+			ret = ext4_issue_zeroout(inode, map->m_lblk,
+						 map->m_pblk, map->m_len);
+			if (ret) {
+				retval = ret;
+				goto out_sem;
+			}
+		}
+
+		/*
 		 * If the extent has been zeroed out, we don't need to update
 		 * extent status tree.
 		 */
 		if ((flags & EXT4_GET_BLOCKS_PRE_IO) &&
 		    ext4_es_lookup_extent(inode, map->m_lblk, &es)) {
 			if (ext4_es_is_written(&es))
-				goto has_zeroout;
+				goto out_sem;
 		}
 		status = map->m_flags & EXT4_MAP_UNWRITTEN ?
 				EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
@@ -643,11 +670,13 @@ found:
 			status |= EXTENT_STATUS_DELAYED;
 		ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
 					    map->m_pblk, status);
-		if (ret < 0)
+		if (ret < 0) {
 			retval = ret;
+			goto out_sem;
+		}
 	}
 
-has_zeroout:
+out_sem:
 	up_write((&EXT4_I(inode)->i_data_sem));
 	if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) {
 		ret = check_block_validity(inode, map);
@@ -702,7 +731,7 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
 	map.m_lblk = iblock;
 	map.m_len = bh->b_size >> inode->i_blkbits;
 
-	if (flags && !(flags & EXT4_GET_BLOCKS_NO_LOCK) && !handle) {
+	if (flags && !handle) {
 		/* Direct IO write... */
 		if (map.m_len > DIO_MAX_BLOCKS)
 			map.m_len = DIO_MAX_BLOCKS;
@@ -722,16 +751,6 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock,
 
 		map_bh(bh, inode->i_sb, map.m_pblk);
 		ext4_update_bh_state(bh, map.m_flags);
-		if (IS_DAX(inode) && buffer_unwritten(bh)) {
-			/*
-			 * dgc: I suspect unwritten conversion on ext4+DAX is
-			 * fundamentally broken here when there are concurrent
-			 * read/write in progress on this inode.
-			 */
-			WARN_ON_ONCE(io_end);
-			bh->b_assoc_map = inode->i_mapping;
-			bh->b_private = (void *)(unsigned long)iblock;
-		}
 		if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN)
 			set_buffer_defer_completion(bh);
 		bh->b_size = inode->i_sb->s_blocksize * map.m_len;
@@ -907,9 +926,6 @@ int do_journal_get_write_access(handle_t *handle,
 	return ret;
 }
 
-static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
-		   struct buffer_head *bh_result, int create);
-
 #ifdef CONFIG_EXT4_FS_ENCRYPTION
 static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len,
 				  get_block_t *get_block)
@@ -2462,6 +2478,10 @@ static int ext4_writepages(struct address_space *mapping,
 
 	trace_ext4_writepages(inode, wbc);
 
+	if (dax_mapping(mapping))
+		return dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev,
+						   wbc);
+
 	/*
 	 * No pages to write? This is mainly a kludge to avoid starting
 	 * a transaction for special inodes like journal inode on last iput()
@@ -3082,25 +3102,96 @@ int ext4_get_block_write(struct inode *inode, sector_t iblock,
 			       EXT4_GET_BLOCKS_IO_CREATE_EXT);
 }
 
-static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock,
+static int ext4_get_block_overwrite(struct inode *inode, sector_t iblock,
 		   struct buffer_head *bh_result, int create)
 {
-	ext4_debug("ext4_get_block_write_nolock: inode %lu, create flag %d\n",
+	int ret;
+
+	ext4_debug("ext4_get_block_overwrite: inode %lu, create flag %d\n",
 		   inode->i_ino, create);
-	return _ext4_get_block(inode, iblock, bh_result,
-			       EXT4_GET_BLOCKS_NO_LOCK);
+	ret = _ext4_get_block(inode, iblock, bh_result, 0);
+	/*
+	 * Blocks should have been preallocated! ext4_file_write_iter() checks
+	 * that.
+	 */
+	WARN_ON_ONCE(!buffer_mapped(bh_result));
+
+	return ret;
 }
 
-int ext4_get_block_dax(struct inode *inode, sector_t iblock,
-		   struct buffer_head *bh_result, int create)
+#ifdef CONFIG_FS_DAX
+int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock,
+			    struct buffer_head *bh_result, int create)
 {
-	int flags = EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_UNWRIT_EXT;
-	if (create)
-		flags |= EXT4_GET_BLOCKS_CREATE;
-	ext4_debug("ext4_get_block_dax: inode %lu, create flag %d\n",
+	int ret, err;
+	int credits;
+	struct ext4_map_blocks map;
+	handle_t *handle = NULL;
+	int flags = 0;
+
+	ext4_debug("ext4_dax_mmap_get_block: inode %lu, create flag %d\n",
 		   inode->i_ino, create);
-	return _ext4_get_block(inode, iblock, bh_result, flags);
+	map.m_lblk = iblock;
+	map.m_len = bh_result->b_size >> inode->i_blkbits;
+	credits = ext4_chunk_trans_blocks(inode, map.m_len);
+	if (create) {
+		flags |= EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_CREATE_ZERO;
+		handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			return ret;
+		}
+	}
+
+	ret = ext4_map_blocks(handle, inode, &map, flags);
+	if (create) {
+		err = ext4_journal_stop(handle);
+		if (ret >= 0 && err < 0)
+			ret = err;
+	}
+	if (ret <= 0)
+		goto out;
+	if (map.m_flags & EXT4_MAP_UNWRITTEN) {
+		int err2;
+
+		/*
+		 * We are protected by i_mmap_sem so we know block cannot go
+		 * away from under us even though we dropped i_data_sem.
+		 * Convert extent to written and write zeros there.
+		 *
+		 * Note: We may get here even when create == 0.
+		 */
+		handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
+		if (IS_ERR(handle)) {
+			ret = PTR_ERR(handle);
+			goto out;
+		}
+
+		err = ext4_map_blocks(handle, inode, &map,
+		      EXT4_GET_BLOCKS_CONVERT | EXT4_GET_BLOCKS_CREATE_ZERO);
+		if (err < 0)
+			ret = err;
+		err2 = ext4_journal_stop(handle);
+		if (err2 < 0 && ret > 0)
+			ret = err2;
+	}
+out:
+	WARN_ON_ONCE(ret == 0 && create);
+	if (ret > 0) {
+		map_bh(bh_result, inode->i_sb, map.m_pblk);
+		bh_result->b_state = (bh_result->b_state & ~EXT4_MAP_FLAGS) |
+					map.m_flags;
+		/*
+		 * At least for now we have to clear BH_New so that DAX code
+		 * doesn't attempt to zero blocks again in a racy way.
+		 */
+		bh_result->b_state &= ~(1 << BH_New);
+		bh_result->b_size = map.m_len << inode->i_blkbits;
+		ret = 0;
+	}
+	return ret;
 }
+#endif
 
 static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
 			    ssize_t size, void *private)
@@ -3171,10 +3262,8 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	/* If we do a overwrite dio, i_mutex locking can be released */
 	overwrite = *((int *)iocb->private);
 
-	if (overwrite) {
-		down_read(&EXT4_I(inode)->i_data_sem);
-		mutex_unlock(&inode->i_mutex);
-	}
+	if (overwrite)
+		inode_unlock(inode);
 
 	/*
 	 * We could direct write to holes and fallocate.
@@ -3196,29 +3285,29 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	 * case, we allocate an io_end structure to hook to the iocb.
 	 */
 	iocb->private = NULL;
-	ext4_inode_aio_set(inode, NULL);
-	if (!is_sync_kiocb(iocb)) {
-		io_end = ext4_init_io_end(inode, GFP_NOFS);
-		if (!io_end) {
-			ret = -ENOMEM;
-			goto retake_lock;
-		}
-		/*
-		 * Grab reference for DIO. Will be dropped in ext4_end_io_dio()
-		 */
-		iocb->private = ext4_get_io_end(io_end);
-		/*
-		 * we save the io structure for current async direct
-		 * IO, so that later ext4_map_blocks() could flag the
-		 * io structure whether there is a unwritten extents
-		 * needs to be converted when IO is completed.
-		 */
-		ext4_inode_aio_set(inode, io_end);
-	}
-
 	if (overwrite) {
-		get_block_func = ext4_get_block_write_nolock;
+		get_block_func = ext4_get_block_overwrite;
 	} else {
+		ext4_inode_aio_set(inode, NULL);
+		if (!is_sync_kiocb(iocb)) {
+			io_end = ext4_init_io_end(inode, GFP_NOFS);
+			if (!io_end) {
+				ret = -ENOMEM;
+				goto retake_lock;
+			}
+			/*
+			 * Grab reference for DIO. Will be dropped in
+			 * ext4_end_io_dio()
+			 */
+			iocb->private = ext4_get_io_end(io_end);
+			/*
+			 * we save the io structure for current async direct
+			 * IO, so that later ext4_map_blocks() could flag the
+			 * io structure whether there is a unwritten extents
+			 * needs to be converted when IO is completed.
+			 */
+			ext4_inode_aio_set(inode, io_end);
+		}
 		get_block_func = ext4_get_block_write;
 		dio_flags = DIO_LOCKING;
 	}
@@ -3273,10 +3362,8 @@ retake_lock:
 	if (iov_iter_rw(iter) == WRITE)
 		inode_dio_end(inode);
 	/* take i_mutex locking again if we do a ovewrite dio */
-	if (overwrite) {
-		up_read(&EXT4_I(inode)->i_data_sem);
-		mutex_lock(&inode->i_mutex);
-	}
+	if (overwrite)
+		inode_lock(inode);
 
 	return ret;
 }
@@ -3587,6 +3674,35 @@ int ext4_can_truncate(struct inode *inode)
 }
 
 /*
+ * We have to make sure i_disksize gets properly updated before we truncate
+ * page cache due to hole punching or zero range. Otherwise i_disksize update
+ * can get lost as it may have been postponed to submission of writeback but
+ * that will never happen after we truncate page cache.
+ */
+int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
+				      loff_t len)
+{
+	handle_t *handle;
+	loff_t size = i_size_read(inode);
+	int ret;
+
+	WARN_ON(!inode_is_locked(inode));
+	/* Only needed when the range covers i_size and i_disksize lags it. */
+	if (offset > size || offset + len < size)
+		return 0;
+	if (EXT4_I(inode)->i_disksize >= size)
+		return 0;
+
+	handle = ext4_journal_start(inode, EXT4_HT_MISC, 1);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+	ext4_update_i_disksize(inode, size);
+	ret = ext4_mark_inode_dirty(handle, inode);	/* propagated below */
+	ext4_journal_stop(handle);
+	return ret;
+}
+
+/*
  * ext4_punch_hole: punches a hole in a file by releaseing the blocks
  * associated with the given offset and length
  *
@@ -3623,7 +3739,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 			return ret;
 	}
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	/* No need to punch hole beyond i_size */
 	if (offset >= inode->i_size)
@@ -3651,17 +3767,26 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 
 	}
 
+	/* Wait all existing dio workers, newcomers will block on i_mutex */
+	ext4_inode_block_unlocked_dio(inode);
+	inode_dio_wait(inode);
+
+	/*
+	 * Prevent page faults from reinstantiating pages we have released from
+	 * page cache.
+	 */
+	down_write(&EXT4_I(inode)->i_mmap_sem);
 	first_block_offset = round_up(offset, sb->s_blocksize);
 	last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
 
 	/* Now release the pages and zero block aligned part of pages*/
-	if (last_block_offset > first_block_offset)
+	if (last_block_offset > first_block_offset) {
+		ret = ext4_update_disksize_before_punch(inode, offset, length);
+		if (ret)
+			goto out_dio;
 		truncate_pagecache_range(inode, first_block_offset,
 					 last_block_offset);
-
-	/* Wait all existing dio workers, newcomers will block on i_mutex */
-	ext4_inode_block_unlocked_dio(inode);
-	inode_dio_wait(inode);
+	}
 
 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
 		credits = ext4_writepage_trans_blocks(inode);
@@ -3708,19 +3833,15 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
 	if (IS_SYNC(inode))
 		ext4_handle_sync(handle);
 
-	/* Now release the pages again to reduce race window */
-	if (last_block_offset > first_block_offset)
-		truncate_pagecache_range(inode, first_block_offset,
-					 last_block_offset);
-
 	inode->i_mtime = inode->i_ctime = ext4_current_time(inode);
 	ext4_mark_inode_dirty(handle, inode);
 out_stop:
 	ext4_journal_stop(handle);
 out_dio:
+	up_write(&EXT4_I(inode)->i_mmap_sem);
 	ext4_inode_resume_unlocked_dio(inode);
 out_mutex:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	return ret;
 }
 
@@ -3790,7 +3911,7 @@ void ext4_truncate(struct inode *inode)
 	 * have i_mutex locked because it's not necessary.
 	 */
 	if (!(inode->i_state & (I_NEW|I_FREEING)))
-		WARN_ON(!mutex_is_locked(&inode->i_mutex));
+		WARN_ON(!inode_is_locked(inode));
 	trace_ext4_truncate_enter(inode);
 
 	if (!ext4_can_truncate(inode))
@@ -4038,7 +4159,7 @@ void ext4_set_inode_flags(struct inode *inode)
 		new_fl |= S_NOATIME;
 	if (flags & EXT4_DIRSYNC_FL)
 		new_fl |= S_DIRSYNC;
-	if (test_opt(inode->i_sb, DAX))
+	if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode))
 		new_fl |= S_DAX;
 	inode_set_flags(inode, new_fl,
 			S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX);
@@ -4104,6 +4225,14 @@ static inline void ext4_iget_extra_inode(struct inode *inode,
 		EXT4_I(inode)->i_inline_off = 0;
 }
 
+int ext4_get_projid(struct inode *inode, kprojid_t *projid)
+{
+	if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, EXT4_FEATURE_RO_COMPAT_PROJECT))
+		return -EOPNOTSUPP;	/* feature off, *projid untouched */
+	*projid = EXT4_I(inode)->i_projid;	/* from ext4_inode_info */
+	return 0;
+}
+
 struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 {
 	struct ext4_iloc iloc;
@@ -4115,6 +4244,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	int block;
 	uid_t i_uid;
 	gid_t i_gid;
+	projid_t i_projid;
 
 	inode = iget_locked(sb, ino);
 	if (!inode)
@@ -4164,12 +4294,20 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	inode->i_mode = le16_to_cpu(raw_inode->i_mode);
 	i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
 	i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT) &&
+	    EXT4_INODE_SIZE(sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+	    EXT4_FITS_IN_INODE(raw_inode, ei, i_projid))
+		i_projid = (projid_t)le32_to_cpu(raw_inode->i_projid);
+	else
+		i_projid = EXT4_DEF_PROJID;
+
 	if (!(test_opt(inode->i_sb, NO_UID32))) {
 		i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
 		i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
 	}
 	i_uid_write(inode, i_uid);
 	i_gid_write(inode, i_gid);
+	ei->i_projid = make_kprojid(&init_user_ns, i_projid);
 	set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
 
 	ext4_clear_state_flags(ei);	/* Only relevant on 32-bit archs */
@@ -4311,6 +4449,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 			inode->i_op = &ext4_symlink_inode_operations;
 			ext4_set_aops(inode);
 		}
+		inode_nohighmem(inode);
 	} else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
 	      S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
 		inode->i_op = &ext4_special_inode_operations;
@@ -4467,6 +4606,7 @@ static int ext4_do_update_inode(handle_t *handle,
 	int need_datasync = 0, set_large_file = 0;
 	uid_t i_uid;
 	gid_t i_gid;
+	projid_t i_projid;
 
 	spin_lock(&ei->i_raw_lock);
 
@@ -4479,6 +4619,7 @@ static int ext4_do_update_inode(handle_t *handle,
 	raw_inode->i_mode = cpu_to_le16(inode->i_mode);
 	i_uid = i_uid_read(inode);
 	i_gid = i_gid_read(inode);
+	i_projid = from_kprojid(&init_user_ns, ei->i_projid);
 	if (!(test_opt(inode->i_sb, NO_UID32))) {
 		raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid));
 		raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid));
@@ -4556,6 +4697,15 @@ static int ext4_do_update_inode(handle_t *handle,
 				cpu_to_le16(ei->i_extra_isize);
 		}
 	}
+
+	BUG_ON(!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+			EXT4_FEATURE_RO_COMPAT_PROJECT) &&
+	       i_projid != EXT4_DEF_PROJID);
+
+	if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE &&
+	    EXT4_FITS_IN_INODE(raw_inode, ei, i_projid))
+		raw_inode->i_projid = cpu_to_le32(i_projid);
+
 	ext4_inode_csum_set(inode, raw_inode, ei);
 	spin_unlock(&ei->i_raw_lock);
 	if (inode->i_sb->s_flags & MS_LAZYTIME)
@@ -4851,6 +5001,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 			} else
 				ext4_wait_for_tail_page_commit(inode);
 		}
+		down_write(&EXT4_I(inode)->i_mmap_sem);
 		/*
 		 * Truncate pagecache after we've waited for commit
 		 * in data=journal mode to make pages freeable.
@@ -4858,6 +5009,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 		truncate_pagecache(inode, inode->i_size);
 		if (shrink)
 			ext4_truncate(inode);
+		up_write(&EXT4_I(inode)->i_mmap_sem);
 	}
 
 	if (!rc) {
@@ -5306,6 +5458,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 
 	sb_start_pagefault(inode->i_sb);
 	file_update_time(vma->vm_file);
+
+	down_read(&EXT4_I(inode)->i_mmap_sem);
 	/* Delalloc case is easy... */
 	if (test_opt(inode->i_sb, DELALLOC) &&
 	    !ext4_should_journal_data(inode) &&
@@ -5375,6 +5529,19 @@ retry_alloc:
 out_ret:
 	ret = block_page_mkwrite_return(ret);
 out:
+	up_read(&EXT4_I(inode)->i_mmap_sem);
 	sb_end_pagefault(inode->i_sb);
 	return ret;
 }
+
+int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct ext4_inode_info *ei = EXT4_I(file_inode(vma->vm_file));
+	int ret;
+
+	/* Run the generic fault handler with i_mmap_sem held for reading. */
+	down_read(&ei->i_mmap_sem);
+	ret = filemap_fault(vma, vmf);
+	up_read(&ei->i_mmap_sem);
+	return ret;
+}
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 5e872fd40..eae5917c5 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -14,6 +14,7 @@
 #include <linux/mount.h>
 #include <linux/file.h>
 #include <linux/random.h>
+#include <linux/quotaops.h>
 #include <asm/uaccess.h>
 #include "ext4_jbd2.h"
 #include "ext4.h"
@@ -202,6 +203,238 @@ static int uuid_is_zero(__u8 u[16])
 	return 1;
 }
 
+static int ext4_ioctl_setflags(struct inode *inode,
+			       unsigned int flags)
+{	/* Apply the EXT4_*_FL word @flags to @inode; returns 0 or -errno. */
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	handle_t *handle = NULL;
+	int err = -EPERM, migrate = 0;	/* -EPERM covers the denied paths */
+	struct ext4_iloc iloc;
+	unsigned int oldflags, mask, i;
+	unsigned int jflag;
+
+	/* Is it quota file? Do not allow user to mess with it */
+	if (IS_NOQUOTA(inode))
+		goto flags_out;
+
+	oldflags = ei->i_flags;
+
+	/* The JOURNAL_DATA flag is modifiable only by root */
+	jflag = flags & EXT4_JOURNAL_DATA_FL;
+
+	/*
+	 * The IMMUTABLE and APPEND_ONLY flags can only be changed by
+	 * the relevant capability.
+	 *
+	 * This test looks nicer. Thanks to Pauline Middelink
+	 */
+	if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) {
+		if (!capable(CAP_LINUX_IMMUTABLE))
+			goto flags_out;
+	}
+
+	/*
+	 * The JOURNAL_DATA flag can only be changed by
+	 * the relevant capability.
+	 */
+	if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
+		if (!capable(CAP_SYS_RESOURCE))
+			goto flags_out;
+	}
+	if ((flags ^ oldflags) & EXT4_EXTENTS_FL)
+		migrate = 1;	/* format conversion is done last, below */
+
+	if (flags & EXT4_EOFBLOCKS_FL) {
+		/* we don't support adding EOFBLOCKS flag */
+		if (!(oldflags & EXT4_EOFBLOCKS_FL)) {
+			err = -EOPNOTSUPP;
+			goto flags_out;
+		}
+	} else if (oldflags & EXT4_EOFBLOCKS_FL)
+		ext4_truncate(inode);	/* EOFBLOCKS is being cleared */
+
+	handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
+	if (IS_ERR(handle)) {
+		err = PTR_ERR(handle);
+		goto flags_out;
+	}
+	if (IS_SYNC(inode))
+		ext4_handle_sync(handle);
+	err = ext4_reserve_inode_write(handle, inode, &iloc);
+	if (err)
+		goto flags_err;
+
+	for (i = 0, mask = 1; i < 32; i++, mask <<= 1) {
+		if (!(mask & EXT4_FL_USER_MODIFIABLE))
+			continue;	/* not user-modifiable: untouched */
+		if (mask & flags)
+			ext4_set_inode_flag(inode, i);
+		else
+			ext4_clear_inode_flag(inode, i);
+	}
+
+	ext4_set_inode_flags(inode);
+	inode->i_ctime = ext4_current_time(inode);
+
+	err = ext4_mark_iloc_dirty(handle, inode, &iloc);
+flags_err:
+	ext4_journal_stop(handle);	/* also reached on success */
+	if (err)
+		goto flags_out;
+
+	if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL))
+		err = ext4_change_inode_journal_flag(inode, jflag);
+	if (err)
+		goto flags_out;
+	if (migrate) {
+		if (flags & EXT4_EXTENTS_FL)
+			err = ext4_ext_migrate(inode);
+		else
+			err = ext4_ind_migrate(inode);
+	}
+
+flags_out:
+	return err;
+}
+
+#ifdef CONFIG_QUOTA
+/* Change the project ID of the inode behind @filp to @projid. */
+static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
+{
+	struct inode *inode = file_inode(filp);
+	struct super_block *sb = inode->i_sb;
+	struct ext4_inode_info *ei = EXT4_I(inode);
+	int err, rc;
+	handle_t *handle;
+	kprojid_t kprojid;
+	struct ext4_iloc iloc;
+	struct ext4_inode *raw_inode;
+
+	/* Without the project feature only the default ID is acceptable. */
+	if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT))
+		return projid != EXT4_DEF_PROJID ? -EOPNOTSUPP : 0;
+
+	if (EXT4_INODE_SIZE(sb) <= EXT4_GOOD_OLD_INODE_SIZE)
+		return -EOPNOTSUPP;
+
+	kprojid = make_kprojid(&init_user_ns, (projid_t)projid);
+
+	if (projid_eq(kprojid, EXT4_I(inode)->i_projid))
+		return 0;
+
+	err = mnt_want_write_file(filp);
+	if (err)
+		return err;
+
+	err = -EPERM;
+	inode_lock(inode);
+	/* Is it quota file? Do not allow user to mess with it */
+	if (IS_NOQUOTA(inode))
+		goto out_unlock;
+
+	err = ext4_get_inode_loc(inode, &iloc);
+	if (err)
+		goto out_unlock;
+
+	raw_inode = ext4_raw_inode(&iloc);
+	if (!EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) {
+		err = -EOVERFLOW;
+		brelse(iloc.bh);
+		goto out_unlock;
+	}
+	brelse(iloc.bh);
+
+	/* Do not touch project quota if dquot setup for the inode failed. */
+	err = dquot_initialize(inode);
+	if (err)
+		goto out_unlock;
+
+	handle = ext4_journal_start(inode, EXT4_HT_QUOTA,
+		EXT4_QUOTA_INIT_BLOCKS(sb) +
+		EXT4_QUOTA_DEL_BLOCKS(sb) + 3);
+	if (IS_ERR(handle)) {
+		err = PTR_ERR(handle);
+		goto out_unlock;
+	}
+
+	err = ext4_reserve_inode_write(handle, inode, &iloc);
+	if (err)
+		goto out_stop;
+
+	if (sb_has_quota_limits_enabled(sb, PRJQUOTA)) {
+		struct dquot *transfer_to[MAXQUOTAS] = { };
+
+		transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid));
+		if (transfer_to[PRJQUOTA]) {
+			err = __dquot_transfer(inode, transfer_to);
+			dqput(transfer_to[PRJQUOTA]);
+			if (err)
+				goto out_dirty;
+		}
+	}
+	EXT4_I(inode)->i_projid = kprojid;
+	inode->i_ctime = ext4_current_time(inode);
+out_dirty:
+	rc = ext4_mark_iloc_dirty(handle, inode, &iloc);
+	if (!err)
+		err = rc;
+out_stop:
+	ext4_journal_stop(handle);
+out_unlock:
+	inode_unlock(inode);
+	mnt_drop_write_file(filp);
+	return err;
+}
+#else
+/* Stub used when CONFIG_QUOTA is not set. */
+static int ext4_ioctl_setproject(struct file *filp, __u32 projid)
+{
+	/* Accept the default project ID as a no-op; reject anything else. */
+	return projid == EXT4_DEF_PROJID ? 0 : -EOPNOTSUPP;
+}
+#endif
+
+/*
+ * Translate ext4 inode flags (EXT4_*_FL) into their FS_XFLAG_*
+ * equivalents.  Only the six flags tested below are carried over;
+ * every other bit of @iflags is ignored.
+ */
+static inline __u32 ext4_iflags_to_xflags(unsigned long iflags)
+{
+	__u32 out = 0;
+
+	/* Each source flag maps to exactly one xflag. */
+	out |= (iflags & EXT4_SYNC_FL) ? FS_XFLAG_SYNC : 0;
+	out |= (iflags & EXT4_IMMUTABLE_FL) ? FS_XFLAG_IMMUTABLE : 0;
+	out |= (iflags & EXT4_APPEND_FL) ? FS_XFLAG_APPEND : 0;
+	out |= (iflags & EXT4_NODUMP_FL) ? FS_XFLAG_NODUMP : 0;
+	out |= (iflags & EXT4_NOATIME_FL) ? FS_XFLAG_NOATIME : 0;
+	out |= (iflags & EXT4_PROJINHERIT_FL) ? FS_XFLAG_PROJINHERIT : 0;
+
+	return out;
+}
+
+/*
+ * Translate FS_XFLAG_* bits into ext4 inode flags (EXT4_*_FL).
+ * This is the inverse of ext4_iflags_to_xflags().
+ * Only the six xflags tested below are carried over; every other
+ * bit of @xflags is ignored.
+ */
+static inline unsigned long ext4_xflags_to_iflags(__u32 xflags)
+{
+	unsigned long out = 0;
+
+	/* Each xflag maps to exactly one ext4 inode flag. */
+	out |= (xflags & FS_XFLAG_SYNC) ? EXT4_SYNC_FL : 0;
+	out |= (xflags & FS_XFLAG_IMMUTABLE) ? EXT4_IMMUTABLE_FL : 0;
+	out |= (xflags & FS_XFLAG_APPEND) ? EXT4_APPEND_FL : 0;
+	out |= (xflags & FS_XFLAG_NODUMP) ? EXT4_NODUMP_FL : 0;
+	out |= (xflags & FS_XFLAG_NOATIME) ? EXT4_NOATIME_FL : 0;
+	out |= (xflags & FS_XFLAG_PROJINHERIT) ? EXT4_PROJINHERIT_FL : 0;
+
+	return out;
+}
+
 long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	struct inode *inode = file_inode(filp);
@@ -217,11 +450,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		flags = ei->i_flags & EXT4_FL_USER_VISIBLE;
 		return put_user(flags, (int __user *) arg);
 	case EXT4_IOC_SETFLAGS: {
-		handle_t *handle = NULL;
-		int err, migrate = 0;
-		struct ext4_iloc iloc;
-		unsigned int oldflags, mask, i;
-		unsigned int jflag;
+		int err;
 
 		if (!inode_owner_or_capable(inode))
 			return -EACCES;
@@ -235,90 +464,9 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 
 		flags = ext4_mask_flags(inode->i_mode, flags);
 
-		err = -EPERM;
-		mutex_lock(&inode->i_mutex);
-		/* Is it quota file? Do not allow user to mess with it */
-		if (IS_NOQUOTA(inode))
-			goto flags_out;
-
-		oldflags = ei->i_flags;
-
-		/* The JOURNAL_DATA flag is modifiable only by root */
-		jflag = flags & EXT4_JOURNAL_DATA_FL;
-
-		/*
-		 * The IMMUTABLE and APPEND_ONLY flags can only be changed by
-		 * the relevant capability.
-		 *
-		 * This test looks nicer. Thanks to Pauline Middelink
-		 */
-		if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) {
-			if (!capable(CAP_LINUX_IMMUTABLE))
-				goto flags_out;
-		}
-
-		/*
-		 * The JOURNAL_DATA flag can only be changed by
-		 * the relevant capability.
-		 */
-		if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
-			if (!capable(CAP_SYS_RESOURCE))
-				goto flags_out;
-		}
-		if ((flags ^ oldflags) & EXT4_EXTENTS_FL)
-			migrate = 1;
-
-		if (flags & EXT4_EOFBLOCKS_FL) {
-			/* we don't support adding EOFBLOCKS flag */
-			if (!(oldflags & EXT4_EOFBLOCKS_FL)) {
-				err = -EOPNOTSUPP;
-				goto flags_out;
-			}
-		} else if (oldflags & EXT4_EOFBLOCKS_FL)
-			ext4_truncate(inode);
-
-		handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
-		if (IS_ERR(handle)) {
-			err = PTR_ERR(handle);
-			goto flags_out;
-		}
-		if (IS_SYNC(inode))
-			ext4_handle_sync(handle);
-		err = ext4_reserve_inode_write(handle, inode, &iloc);
-		if (err)
-			goto flags_err;
-
-		for (i = 0, mask = 1; i < 32; i++, mask <<= 1) {
-			if (!(mask & EXT4_FL_USER_MODIFIABLE))
-				continue;
-			if (mask & flags)
-				ext4_set_inode_flag(inode, i);
-			else
-				ext4_clear_inode_flag(inode, i);
-		}
-
-		ext4_set_inode_flags(inode);
-		inode->i_ctime = ext4_current_time(inode);
-
-		err = ext4_mark_iloc_dirty(handle, inode, &iloc);
-flags_err:
-		ext4_journal_stop(handle);
-		if (err)
-			goto flags_out;
-
-		if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL))
-			err = ext4_change_inode_journal_flag(inode, jflag);
-		if (err)
-			goto flags_out;
-		if (migrate) {
-			if (flags & EXT4_EXTENTS_FL)
-				err = ext4_ext_migrate(inode);
-			else
-				err = ext4_ind_migrate(inode);
-		}
-
-flags_out:
-		mutex_unlock(&inode->i_mutex);
+		inode_lock(inode);
+		err = ext4_ioctl_setflags(inode, flags);
+		inode_unlock(inode);
 		mnt_drop_write_file(filp);
 		return err;
 	}
@@ -349,7 +497,7 @@ flags_out:
 			goto setversion_out;
 		}
 
-		mutex_lock(&inode->i_mutex);
+		inode_lock(inode);
 		handle = ext4_journal_start(inode, EXT4_HT_INODE, 1);
 		if (IS_ERR(handle)) {
 			err = PTR_ERR(handle);
@@ -364,7 +512,7 @@ flags_out:
 		ext4_journal_stop(handle);
 
 unlock_out:
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 setversion_out:
 		mnt_drop_write_file(filp);
 		return err;
@@ -435,6 +583,11 @@ group_extend_out:
 				 "Online defrag not supported with bigalloc");
 			err = -EOPNOTSUPP;
 			goto mext_out;
+		} else if (IS_DAX(inode)) {
+			ext4_msg(sb, KERN_ERR,
+				 "Online defrag not supported with DAX");
+			err = -EOPNOTSUPP;
+			goto mext_out;
 		}
 
 		err = mnt_want_write_file(filp);
@@ -510,9 +663,9 @@ group_add_out:
 		 * ext4_ext_swap_inode_data before we switch the
 		 * inode format to prevent read.
 		 */
-		mutex_lock(&(inode->i_mutex));
+		inode_lock((inode));
 		err = ext4_ext_migrate(inode);
-		mutex_unlock(&(inode->i_mutex));
+		inode_unlock((inode));
 		mnt_drop_write_file(filp);
 		return err;
 	}
@@ -689,6 +842,60 @@ encryption_policy_out:
 		return -EOPNOTSUPP;
 #endif
 	}
+	case EXT4_IOC_FSGETXATTR:
+	{
+		struct fsxattr fa;
+
+		memset(&fa, 0, sizeof(struct fsxattr));
+		ext4_get_inode_flags(ei);
+		fa.fsx_xflags = ext4_iflags_to_xflags(ei->i_flags & EXT4_FL_USER_VISIBLE);
+
+		if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb,
+				EXT4_FEATURE_RO_COMPAT_PROJECT)) {
+			fa.fsx_projid = (__u32)from_kprojid(&init_user_ns,
+				EXT4_I(inode)->i_projid);
+		}
+
+		if (copy_to_user((struct fsxattr __user *)arg,
+				 &fa, sizeof(fa)))
+			return -EFAULT;
+		return 0;
+	}
+	case EXT4_IOC_FSSETXATTR:
+	{
+		struct fsxattr fa;
+		int err;
+
+		if (copy_from_user(&fa, (struct fsxattr __user *)arg,
+				   sizeof(fa)))
+			return -EFAULT;
+
+		/* Make sure caller has proper permission */
+		if (!inode_owner_or_capable(inode))
+			return -EACCES;
+
+		err = mnt_want_write_file(filp);
+		if (err)
+			return err;
+
+		flags = ext4_xflags_to_iflags(fa.fsx_xflags);
+		flags = ext4_mask_flags(inode->i_mode, flags);
+
+		inode_lock(inode);
+		flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) |
+			 (flags & EXT4_FL_XFLAG_VISIBLE);
+		err = ext4_ioctl_setflags(inode, flags);
+		inode_unlock(inode);
+		mnt_drop_write_file(filp);
+		if (err)
+			return err;
+
+		err = ext4_ioctl_setproject(filp, fa.fsx_projid);
+		if (err)
+			return err;
+
+		return 0;
+	}
 	default:
 		return -ENOTTY;
 	}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 61eaf74dc..4424b7bf8 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -2285,7 +2285,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v)
 	if (group == 0)
 		seq_puts(seq, "#group: free  frags first ["
 			      " 2^0   2^1   2^2   2^3   2^4   2^5   2^6  "
-			      " 2^7   2^8   2^9   2^10  2^11  2^12  2^13  ]");
+			      " 2^7   2^8   2^9   2^10  2^11  2^12  2^13  ]\n");
 
 	i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) +
 		sizeof(struct ext4_group_info);
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index e032a0423..4098acc70 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -390,6 +390,7 @@ data_copy:
 		*err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0);
 		if (*err < 0)
 			break;
+		bh = bh->b_this_page;
 	}
 	if (!*err)
 		*err = block_commit_write(pagep[0], from, from + replaced_size);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index a969ab39f..48e4b8907 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -273,7 +273,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
 		struct ext4_filename *fname,
 		struct ext4_dir_entry_2 **res_dir);
 static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
-			     struct dentry *dentry, struct inode *inode);
+			     struct inode *dir, struct inode *inode);
 
 /* checksumming functions */
 void initialize_dirent_tail(struct ext4_dir_entry_tail *t,
@@ -1558,6 +1558,24 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
 	struct ext4_dir_entry_2 *de;
 	struct buffer_head *bh;
 
+       if (ext4_encrypted_inode(dir)) {
+               int res = ext4_get_encryption_info(dir);
+
+		/*
+		 * This should be a properly defined flag for
+		 * dentry->d_flags when we uplift this to the VFS.
+		 * d_fsdata is set to (void *) 1 if the dentry is
+		 * created while the directory was encrypted and we
+		 * have access to the key.
+		 */
+	       dentry->d_fsdata = NULL;
+	       if (ext4_encryption_info(dir))
+		       dentry->d_fsdata = (void *) 1;
+	       d_set_d_op(dentry, &ext4_encrypted_d_ops);
+	       if (res && res != -ENOKEY)
+		       return ERR_PTR(res);
+       }
+
 	if (dentry->d_name.len > EXT4_NAME_LEN)
 		return ERR_PTR(-ENAMETOOLONG);
 
@@ -1585,11 +1603,15 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
 			return ERR_PTR(-EFSCORRUPTED);
 		}
 		if (!IS_ERR(inode) && ext4_encrypted_inode(dir) &&
-		    (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
-		     S_ISLNK(inode->i_mode)) &&
+		    (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) &&
 		    !ext4_is_child_context_consistent_with_parent(dir,
 								  inode)) {
+			int nokey = ext4_encrypted_inode(inode) &&
+				!ext4_encryption_info(inode);
+
 			iput(inode);
+			if (nokey)
+				return ERR_PTR(-ENOKEY);
 			ext4_warning(inode->i_sb,
 				     "Inconsistent encryption contexts: %lu/%lu\n",
 				     (unsigned long) dir->i_ino,
@@ -1928,10 +1950,9 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
  * directory, and adds the dentry to the indexed directory.
  */
 static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
-			    struct dentry *dentry,
+			    struct inode *dir,
 			    struct inode *inode, struct buffer_head *bh)
 {
-	struct inode	*dir = d_inode(dentry->d_parent);
 	struct buffer_head *bh2;
 	struct dx_root	*root;
 	struct dx_frame	frames[2], *frame;
@@ -2086,8 +2107,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
 		return retval;
 
 	if (ext4_has_inline_data(dir)) {
-		retval = ext4_try_add_inline_entry(handle, &fname,
-						   dentry, inode);
+		retval = ext4_try_add_inline_entry(handle, &fname, dir, inode);
 		if (retval < 0)
 			goto out;
 		if (retval == 1) {
@@ -2097,7 +2117,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
 	}
 
 	if (is_dx(dir)) {
-		retval = ext4_dx_add_entry(handle, &fname, dentry, inode);
+		retval = ext4_dx_add_entry(handle, &fname, dir, inode);
 		if (!retval || (retval != ERR_BAD_DX_DIR))
 			goto out;
 		ext4_clear_inode_flag(dir, EXT4_INODE_INDEX);
@@ -2119,7 +2139,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
 
 		if (blocks == 1 && !dx_fallback &&
 		    ext4_has_feature_dir_index(sb)) {
-			retval = make_indexed_dir(handle, &fname, dentry,
+			retval = make_indexed_dir(handle, &fname, dir,
 						  inode, bh);
 			bh = NULL; /* make_indexed_dir releases bh */
 			goto out;
@@ -2154,12 +2174,11 @@ out:
  * Returns 0 for success, or a negative error value
  */
 static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
-			     struct dentry *dentry, struct inode *inode)
+			     struct inode *dir, struct inode *inode)
 {
 	struct dx_frame frames[2], *frame;
 	struct dx_entry *entries, *at;
 	struct buffer_head *bh;
-	struct inode *dir = d_inode(dentry->d_parent);
 	struct super_block *sb = dir->i_sb;
 	struct ext4_dir_entry_2 *de;
 	int err;
@@ -2756,7 +2775,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
 		return 0;
 
 	WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
-		     !mutex_is_locked(&inode->i_mutex));
+		     !inode_is_locked(inode));
 	/*
 	 * Exit early if inode already is on orphan list. This is a big speedup
 	 * since we don't have to contend on the global s_orphan_lock.
@@ -2838,7 +2857,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
 		return 0;
 
 	WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) &&
-		     !mutex_is_locked(&inode->i_mutex));
+		     !inode_is_locked(inode));
 	/* Do this quick check before taking global s_orphan_lock. */
 	if (list_empty(&ei->i_orphan))
 		return 0;
@@ -3132,6 +3151,7 @@ static int ext4_symlink(struct inode *dir,
 	if ((disk_link.len > EXT4_N_BLOCKS * 4)) {
 		if (!encryption_required)
 			inode->i_op = &ext4_symlink_inode_operations;
+		inode_nohighmem(inode);
 		ext4_set_aops(inode);
 		/*
 		 * We cannot call page_symlink() with transaction started
@@ -3211,6 +3231,12 @@ static int ext4_link(struct dentry *old_dentry,
 	if (ext4_encrypted_inode(dir) &&
 	    !ext4_is_child_context_consistent_with_parent(dir, inode))
 		return -EPERM;
+
+       if ((ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) &&
+	   (!projid_eq(EXT4_I(dir)->i_projid,
+		       EXT4_I(old_dentry->d_inode)->i_projid)))
+		return -EXDEV;
+
 	err = dquot_initialize(dir);
 	if (err)
 		return err;
@@ -3491,6 +3517,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 	int credits;
 	u8 old_file_type;
 
+	if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT)) &&
+	    (!projid_eq(EXT4_I(new_dir)->i_projid,
+			EXT4_I(old_dentry->d_inode)->i_projid)))
+		return -EXDEV;
+
 	retval = dquot_initialize(old.dir);
 	if (retval)
 		return retval;
@@ -3700,6 +3731,14 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
 							   new.inode)))
 		return -EPERM;
 
+	if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT) &&
+	     !projid_eq(EXT4_I(new_dir)->i_projid,
+			EXT4_I(old_dentry->d_inode)->i_projid)) ||
+	    (ext4_test_inode_flag(old_dir, EXT4_INODE_PROJINHERIT) &&
+	     !projid_eq(EXT4_I(old_dir)->i_projid,
+			EXT4_I(new_dentry->d_inode)->i_projid)))
+		return -EXDEV;
+
 	retval = dquot_initialize(old.dir);
 	if (retval)
 		return retval;
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 17fbe3882..090b34986 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -52,9 +52,8 @@ void ext4_exit_pageio(void)
  */
 static void buffer_io_error(struct buffer_head *bh)
 {
-	char b[BDEVNAME_SIZE];
-	printk_ratelimited(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n",
-			bdevname(bh->b_bdev, b),
+	printk_ratelimited(KERN_ERR "Buffer I/O error on device %pg, logical block %llu\n",
+		       bh->b_bdev,
 			(unsigned long long)bh->b_blocknr);
 }
 
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index c9ab67da6..3ed01ec01 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -80,6 +80,36 @@ static void ext4_destroy_lazyinit_thread(void);
 static void ext4_unregister_li_request(struct super_block *sb);
 static void ext4_clear_request_list(void);
 
+/*
+ * Lock ordering
+ *
+ * Note the difference between i_mmap_sem (EXT4_I(inode)->i_mmap_sem) and
+ * i_mmap_rwsem (inode->i_mmap_rwsem)!
+ *
+ * page fault path:
+ * mmap_sem -> sb_start_pagefault -> i_mmap_sem (r) -> transaction start ->
+ *   page lock -> i_data_sem (rw)
+ *
+ * buffered write path:
+ * sb_start_write -> i_mutex -> mmap_sem
+ * sb_start_write -> i_mutex -> transaction start -> page lock ->
+ *   i_data_sem (rw)
+ *
+ * truncate:
+ * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (w) -> i_mmap_sem (w) ->
+ *   i_mmap_rwsem (w) -> page lock
+ * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (w) -> i_mmap_sem (w) ->
+ *   transaction start -> i_data_sem (rw)
+ *
+ * direct IO:
+ * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (r) -> mmap_sem
+ * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (r) ->
+ *   transaction start -> i_data_sem (rw)
+ *
+ * writepages:
+ * transaction start -> page lock(s) -> i_data_sem (rw)
+ */
+
 #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2)
 static struct file_system_type ext2_fs_type = {
 	.owner		= THIS_MODULE,
@@ -958,6 +988,7 @@ static void init_once(void *foo)
 	INIT_LIST_HEAD(&ei->i_orphan);
 	init_rwsem(&ei->xattr_sem);
 	init_rwsem(&ei->i_data_sem);
+	init_rwsem(&ei->i_mmap_sem);
 	inode_init_once(&ei->vfs_inode);
 }
 
@@ -966,7 +997,7 @@ static int __init init_inodecache(void)
 	ext4_inode_cachep = kmem_cache_create("ext4_inode_cache",
 					     sizeof(struct ext4_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (ext4_inode_cachep == NULL)
 		return -ENOMEM;
@@ -1066,8 +1097,8 @@ static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
 }
 
 #ifdef CONFIG_QUOTA
-#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group")
-#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA))
+static char *quotatypes[] = INITQFNAMES;
+#define QTYPE2NAME(t) (quotatypes[t])
 
 static int ext4_write_dquot(struct dquot *dquot);
 static int ext4_acquire_dquot(struct dquot *dquot);
@@ -1100,6 +1131,7 @@ static const struct dquot_operations ext4_quota_operations = {
 	.write_info	= ext4_write_info,
 	.alloc_dquot	= dquot_alloc,
 	.destroy_dquot	= dquot_destroy,
+	.get_projid	= ext4_get_projid,
 };
 
 static const struct quotactl_ops ext4_qctl_operations = {
@@ -2254,10 +2286,10 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 					__func__, inode->i_ino, inode->i_size);
 			jbd_debug(2, "truncating inode %lu to %lld bytes\n",
 				  inode->i_ino, inode->i_size);
-			mutex_lock(&inode->i_mutex);
+			inode_lock(inode);
 			truncate_inode_pages(inode->i_mapping, inode->i_size);
 			ext4_truncate(inode);
-			mutex_unlock(&inode->i_mutex);
+			inode_unlock(inode);
 			nr_truncates++;
 		} else {
 			if (test_opt(sb, DEBUG))
@@ -2526,6 +2558,12 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly)
 			 "without CONFIG_QUOTA");
 		return 0;
 	}
+	if (ext4_has_feature_project(sb) && !readonly) {
+		ext4_msg(sb, KERN_ERR,
+			 "Filesystem with project quota feature cannot be mounted RDWR "
+			 "without CONFIG_QUOTA");
+		return 0;
+	}
 #endif  /* CONFIG_QUOTA */
 	return 1;
 }
@@ -3654,7 +3692,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 		sb->s_qcop = &dquot_quotactl_sysfile_ops;
 	else
 		sb->s_qcop = &ext4_qctl_operations;
-	sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP;
+	sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ;
 #endif
 	memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid));
 
@@ -4790,6 +4828,48 @@ restore_opts:
 	return err;
 }
 
+#ifdef CONFIG_QUOTA
+static int ext4_statfs_project(struct super_block *sb,
+			       kprojid_t projid, struct kstatfs *buf)
+{
+	struct kqid qid;
+	struct dquot *dquot;
+	u64 limit;
+	u64 curblock;
+
+	qid = make_kqid_projid(projid);
+	dquot = dqget(sb, qid);
+	if (IS_ERR(dquot))
+		return PTR_ERR(dquot);
+	spin_lock(&dq_data_lock);
+
+	limit = (dquot->dq_dqb.dqb_bsoftlimit ?
+		 dquot->dq_dqb.dqb_bsoftlimit :
+		 dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits;
+	if (limit && buf->f_blocks > limit) {
+		curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits;
+		buf->f_blocks = limit;
+		buf->f_bfree = buf->f_bavail =
+			(buf->f_blocks > curblock) ?
+			 (buf->f_blocks - curblock) : 0;
+	}
+
+	limit = dquot->dq_dqb.dqb_isoftlimit ?
+		dquot->dq_dqb.dqb_isoftlimit :
+		dquot->dq_dqb.dqb_ihardlimit;
+	if (limit && buf->f_files > limit) {
+		buf->f_files = limit;
+		buf->f_ffree =
+			(buf->f_files > dquot->dq_dqb.dqb_curinodes) ?
+			 (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0;
+	}
+
+	spin_unlock(&dq_data_lock);
+	dqput(dquot);
+	return 0;
+}
+#endif
+
 static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
 	struct super_block *sb = dentry->d_sb;
@@ -4822,6 +4902,11 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
 	buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
 
+#ifdef CONFIG_QUOTA
+	if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) &&
+	    sb_has_quota_limits_enabled(sb, PRJQUOTA))
+		ext4_statfs_project(sb, EXT4_I(dentry->d_inode)->i_projid, buf);
+#endif
 	return 0;
 }
 
@@ -4986,7 +5071,8 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id,
 	struct inode *qf_inode;
 	unsigned long qf_inums[EXT4_MAXQUOTAS] = {
 		le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
-		le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum)
+		le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum),
+		le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum)
 	};
 
 	BUG_ON(!ext4_has_feature_quota(sb));
@@ -5014,7 +5100,8 @@ static int ext4_enable_quotas(struct super_block *sb)
 	int type, err = 0;
 	unsigned long qf_inums[EXT4_MAXQUOTAS] = {
 		le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum),
-		le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum)
+		le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum),
+		le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum)
 	};
 
 	sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE;
diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c
index e8e7af62a..6f7ee30a8 100644
--- a/fs/ext4/symlink.c
+++ b/fs/ext4/symlink.c
@@ -23,17 +23,21 @@
 #include "xattr.h"
 
 #ifdef CONFIG_EXT4_FS_ENCRYPTION
-static const char *ext4_encrypted_follow_link(struct dentry *dentry, void **cookie)
+static const char *ext4_encrypted_get_link(struct dentry *dentry,
+					   struct inode *inode,
+					   struct delayed_call *done)
 {
 	struct page *cpage = NULL;
 	char *caddr, *paddr = NULL;
 	struct ext4_str cstr, pstr;
-	struct inode *inode = d_inode(dentry);
 	struct ext4_encrypted_symlink_data *sd;
 	loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1);
 	int res;
 	u32 plen, max_size = inode->i_sb->s_blocksize;
 
+	if (!dentry)
+		return ERR_PTR(-ECHILD);
+
 	res = ext4_get_encryption_info(inode);
 	if (res)
 		return ERR_PTR(res);
@@ -45,7 +49,7 @@ static const char *ext4_encrypted_follow_link(struct dentry *dentry, void **cook
 		cpage = read_mapping_page(inode->i_mapping, 0, NULL);
 		if (IS_ERR(cpage))
 			return ERR_CAST(cpage);
-		caddr = kmap(cpage);
+		caddr = page_address(cpage);
 		caddr[size] = 0;
 	}
 
@@ -75,24 +79,20 @@ static const char *ext4_encrypted_follow_link(struct dentry *dentry, void **cook
 	/* Null-terminate the name */
 	if (res <= plen)
 		paddr[res] = '\0';
-	if (cpage) {
-		kunmap(cpage);
+	if (cpage)
 		page_cache_release(cpage);
-	}
-	return *cookie = paddr;
+	set_delayed_call(done, kfree_link, paddr);
+	return paddr;
 errout:
-	if (cpage) {
-		kunmap(cpage);
+	if (cpage)
 		page_cache_release(cpage);
-	}
 	kfree(paddr);
 	return ERR_PTR(res);
 }
 
 const struct inode_operations ext4_encrypted_symlink_inode_operations = {
 	.readlink	= generic_readlink,
-	.follow_link    = ext4_encrypted_follow_link,
-	.put_link       = kfree_put_link,
+	.get_link	= ext4_encrypted_get_link,
 	.setattr	= ext4_setattr,
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
@@ -103,8 +103,7 @@ const struct inode_operations ext4_encrypted_symlink_inode_operations = {
 
 const struct inode_operations ext4_symlink_inode_operations = {
 	.readlink	= generic_readlink,
-	.follow_link	= page_follow_link_light,
-	.put_link	= page_put_link,
+	.get_link	= page_get_link,
 	.setattr	= ext4_setattr,
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
@@ -114,7 +113,7 @@ const struct inode_operations ext4_symlink_inode_operations = {
 
 const struct inode_operations ext4_fast_symlink_inode_operations = {
 	.readlink	= generic_readlink,
-	.follow_link    = simple_follow_link,
+	.get_link	= simple_get_link,
 	.setattr	= ext4_setattr,
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h
index 011ba6670..c70d06a38 100644
--- a/fs/ext4/truncate.h
+++ b/fs/ext4/truncate.h
@@ -10,8 +10,10 @@
  */
 static inline void ext4_truncate_failed_write(struct inode *inode)
 {
+	down_write(&EXT4_I(inode)->i_mmap_sem);
 	truncate_inode_pages(inode->i_mapping, inode->i_size);
 	ext4_truncate(inode);
+	up_write(&EXT4_I(inode)->i_mmap_sem);
 }
 
 /*
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 6b6b3e751..a95151e87 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -68,10 +68,8 @@
 		printk("\n"); \
 	} while (0)
 # define ea_bdebug(bh, f...) do { \
-		char b[BDEVNAME_SIZE]; \
-		printk(KERN_DEBUG "block %s:%lu: ", \
-			bdevname(bh->b_bdev, b), \
-			(unsigned long) bh->b_blocknr); \
+		printk(KERN_DEBUG "block %pg:%lu: ",		   \
+		       bh->b_bdev, (unsigned long) bh->b_blocknr); \
 		printk(f); \
 		printk("\n"); \
 	} while (0)
@@ -404,19 +402,24 @@ ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry,
 		const struct xattr_handler *handler =
 			ext4_xattr_handler(entry->e_name_index);
 
-		if (handler) {
-			size_t size = handler->list(handler, dentry, buffer,
-						    rest, entry->e_name,
-						    entry->e_name_len);
+		if (handler && (!handler->list || handler->list(dentry))) {
+			const char *prefix = handler->prefix ?: handler->name;
+			size_t prefix_len = strlen(prefix);
+			size_t size = prefix_len + entry->e_name_len + 1;
+
 			if (buffer) {
 				if (size > rest)
 					return -ERANGE;
-				buffer += size;
+				memcpy(buffer, prefix, prefix_len);
+				buffer += prefix_len;
+				memcpy(buffer, entry->e_name, entry->e_name_len);
+				buffer += entry->e_name_len;
+				*buffer++ = 0;
 			}
 			rest -= size;
 		}
 	}
-	return buffer_size - rest;
+	return buffer_size - rest;  /* total size */
 }
 
 static int
diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c
index 36f4c1a84..3e81bdca0 100644
--- a/fs/ext4/xattr_security.c
+++ b/fs/ext4/xattr_security.c
@@ -11,30 +11,11 @@
 #include "ext4.h"
 #include "xattr.h"
 
-static size_t
-ext4_xattr_security_list(const struct xattr_handler *handler,
-			 struct dentry *dentry, char *list, size_t list_size,
-			 const char *name, size_t name_len)
-{
-	const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1;
-	const size_t total_len = prefix_len + name_len + 1;
-
-
-	if (list && total_len <= list_size) {
-		memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
-		memcpy(list+prefix_len, name, name_len);
-		list[prefix_len + name_len] = '\0';
-	}
-	return total_len;
-}
-
 static int
 ext4_xattr_security_get(const struct xattr_handler *handler,
 			struct dentry *dentry, const char *name,
 			void *buffer, size_t size)
 {
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
 	return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_SECURITY,
 			      name, buffer, size);
 }
@@ -44,8 +25,6 @@ ext4_xattr_security_set(const struct xattr_handler *handler,
 			struct dentry *dentry, const char *name,
 			const void *value, size_t size, int flags)
 {
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
 	return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_SECURITY,
 			      name, value, size, flags);
 }
@@ -79,7 +58,6 @@ ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir,
 
 const struct xattr_handler ext4_xattr_security_handler = {
 	.prefix	= XATTR_SECURITY_PREFIX,
-	.list	= ext4_xattr_security_list,
 	.get	= ext4_xattr_security_get,
 	.set	= ext4_xattr_security_set,
 };
diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c
index 488089053..2a3c6f9b8 100644
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -12,23 +12,10 @@
 #include "ext4.h"
 #include "xattr.h"
 
-static size_t
-ext4_xattr_trusted_list(const struct xattr_handler *handler,
-			struct dentry *dentry, char *list, size_t list_size,
-			const char *name, size_t name_len)
+static bool
+ext4_xattr_trusted_list(struct dentry *dentry)
 {
-	const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
-	const size_t total_len = prefix_len + name_len + 1;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return 0;
-
-	if (list && total_len <= list_size) {
-		memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
-		memcpy(list+prefix_len, name, name_len);
-		list[prefix_len + name_len] = '\0';
-	}
-	return total_len;
+	return capable(CAP_SYS_ADMIN);
 }
 
 static int
@@ -36,8 +23,6 @@ ext4_xattr_trusted_get(const struct xattr_handler *handler,
 		       struct dentry *dentry, const char *name, void *buffer,
 		       size_t size)
 {
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
 	return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_TRUSTED,
 			      name, buffer, size);
 }
@@ -47,8 +32,6 @@ ext4_xattr_trusted_set(const struct xattr_handler *handler,
 		       struct dentry *dentry, const char *name,
 		       const void *value, size_t size, int flags)
 {
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
 	return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_TRUSTED,
 			      name, value, size, flags);
 }
diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c
index d2dec3364..d152f431e 100644
--- a/fs/ext4/xattr_user.c
+++ b/fs/ext4/xattr_user.c
@@ -11,23 +11,10 @@
 #include "ext4.h"
 #include "xattr.h"
 
-static size_t
-ext4_xattr_user_list(const struct xattr_handler *handler,
-		     struct dentry *dentry, char *list, size_t list_size,
-		     const char *name, size_t name_len)
+static bool
+ext4_xattr_user_list(struct dentry *dentry)
 {
-	const size_t prefix_len = XATTR_USER_PREFIX_LEN;
-	const size_t total_len = prefix_len + name_len + 1;
-
-	if (!test_opt(dentry->d_sb, XATTR_USER))
-		return 0;
-
-	if (list && total_len <= list_size) {
-		memcpy(list, XATTR_USER_PREFIX, prefix_len);
-		memcpy(list+prefix_len, name, name_len);
-		list[prefix_len + name_len] = '\0';
-	}
-	return total_len;
+	return test_opt(dentry->d_sb, XATTR_USER);
 }
 
 static int
@@ -35,8 +22,6 @@ ext4_xattr_user_get(const struct xattr_handler *handler,
 		    struct dentry *dentry, const char *name,
 		    void *buffer, size_t size)
 {
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
 	if (!test_opt(dentry->d_sb, XATTR_USER))
 		return -EOPNOTSUPP;
 	return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_USER,
@@ -48,8 +33,6 @@ ext4_xattr_user_set(const struct xattr_handler *handler,
 		    struct dentry *dentry, const char *name,
 		    const void *value, size_t size, int flags)
 {
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
 	if (!test_opt(dentry->d_sb, XATTR_USER))
 		return -EOPNOTSUPP;
 	return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_USER,
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index f661d8047..3842af954 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -237,7 +237,7 @@ static int f2fs_write_meta_page(struct page *page,
 	dec_page_count(sbi, F2FS_DIRTY_META);
 	unlock_page(page);
 
-	if (wbc->for_reclaim)
+	if (wbc->for_reclaim || unlikely(f2fs_cp_error(sbi)))
 		f2fs_submit_merged_bio(sbi, META, WRITE);
 	return 0;
 
@@ -410,13 +410,13 @@ static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
 	spin_unlock(&im->ino_lock);
 }
 
-void add_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type)
+void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
 {
 	/* add new dirty ino entry into list */
 	__add_ino_entry(sbi, ino, type);
 }
 
-void remove_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type)
+void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
 {
 	/* remove dirty ino entry from list */
 	__remove_ino_entry(sbi, ino, type);
@@ -434,7 +434,7 @@ bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode)
 	return e ? true : false;
 }
 
-void release_dirty_inode(struct f2fs_sb_info *sbi)
+void release_ino_entry(struct f2fs_sb_info *sbi)
 {
 	struct ino_entry *e, *tmp;
 	int i;
@@ -722,47 +722,48 @@ fail_no_cp:
 	return -EINVAL;
 }
 
-static int __add_dirty_inode(struct inode *inode, struct inode_entry *new)
+static void __add_dirty_inode(struct inode *inode, enum inode_type type)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct f2fs_inode_info *fi = F2FS_I(inode);
+	int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE;
 
-	if (is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR))
-		return -EEXIST;
+	if (is_inode_flag_set(fi, flag))
+		return;
 
-	set_inode_flag(F2FS_I(inode), FI_DIRTY_DIR);
-	F2FS_I(inode)->dirty_dir = new;
-	list_add_tail(&new->list, &sbi->dir_inode_list);
-	stat_inc_dirty_dir(sbi);
-	return 0;
+	set_inode_flag(fi, flag);
+	list_add_tail(&fi->dirty_list, &sbi->inode_list[type]);
+	stat_inc_dirty_inode(sbi, type);
+}
+
+static void __remove_dirty_inode(struct inode *inode, enum inode_type type)
+{
+	struct f2fs_inode_info *fi = F2FS_I(inode);
+	int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE;
+
+	if (get_dirty_pages(inode) ||
+			!is_inode_flag_set(F2FS_I(inode), flag))
+		return;
+
+	list_del_init(&fi->dirty_list);
+	clear_inode_flag(fi, flag);
+	stat_dec_dirty_inode(F2FS_I_SB(inode), type);
 }
 
 void update_dirty_page(struct inode *inode, struct page *page)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-	struct inode_entry *new;
-	int ret = 0;
+	enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE;
 
 	if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) &&
 			!S_ISLNK(inode->i_mode))
 		return;
 
-	if (!S_ISDIR(inode->i_mode)) {
-		inode_inc_dirty_pages(inode);
-		goto out;
-	}
-
-	new = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
-	new->inode = inode;
-	INIT_LIST_HEAD(&new->list);
-
-	spin_lock(&sbi->dir_inode_lock);
-	ret = __add_dirty_inode(inode, new);
+	spin_lock(&sbi->inode_lock[type]);
+	__add_dirty_inode(inode, type);
 	inode_inc_dirty_pages(inode);
-	spin_unlock(&sbi->dir_inode_lock);
+	spin_unlock(&sbi->inode_lock[type]);
 
-	if (ret)
-		kmem_cache_free(inode_entry_slab, new);
-out:
 	SetPagePrivate(page);
 	f2fs_trace_pid(page);
 }
@@ -770,70 +771,60 @@ out:
 void add_dirty_dir_inode(struct inode *inode)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-	struct inode_entry *new =
-			f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS);
-	int ret = 0;
-
-	new->inode = inode;
-	INIT_LIST_HEAD(&new->list);
 
-	spin_lock(&sbi->dir_inode_lock);
-	ret = __add_dirty_inode(inode, new);
-	spin_unlock(&sbi->dir_inode_lock);
-
-	if (ret)
-		kmem_cache_free(inode_entry_slab, new);
+	spin_lock(&sbi->inode_lock[DIR_INODE]);
+	__add_dirty_inode(inode, DIR_INODE);
+	spin_unlock(&sbi->inode_lock[DIR_INODE]);
 }
 
-void remove_dirty_dir_inode(struct inode *inode)
+void remove_dirty_inode(struct inode *inode)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-	struct inode_entry *entry;
-
-	if (!S_ISDIR(inode->i_mode))
-		return;
+	struct f2fs_inode_info *fi = F2FS_I(inode);
+	enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE;
 
-	spin_lock(&sbi->dir_inode_lock);
-	if (get_dirty_pages(inode) ||
-			!is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) {
-		spin_unlock(&sbi->dir_inode_lock);
+	if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) &&
+			!S_ISLNK(inode->i_mode))
 		return;
-	}
 
-	entry = F2FS_I(inode)->dirty_dir;
-	list_del(&entry->list);
-	F2FS_I(inode)->dirty_dir = NULL;
-	clear_inode_flag(F2FS_I(inode), FI_DIRTY_DIR);
-	stat_dec_dirty_dir(sbi);
-	spin_unlock(&sbi->dir_inode_lock);
-	kmem_cache_free(inode_entry_slab, entry);
+	spin_lock(&sbi->inode_lock[type]);
+	__remove_dirty_inode(inode, type);
+	spin_unlock(&sbi->inode_lock[type]);
 
 	/* Only from the recovery routine */
-	if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) {
-		clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT);
+	if (is_inode_flag_set(fi, FI_DELAY_IPUT)) {
+		clear_inode_flag(fi, FI_DELAY_IPUT);
 		iput(inode);
 	}
 }
 
-void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi)
+int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type)
 {
 	struct list_head *head;
-	struct inode_entry *entry;
 	struct inode *inode;
+	struct f2fs_inode_info *fi;
+	bool is_dir = (type == DIR_INODE);
+
+	trace_f2fs_sync_dirty_inodes_enter(sbi->sb, is_dir,
+				get_pages(sbi, is_dir ?
+				F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA));
 retry:
 	if (unlikely(f2fs_cp_error(sbi)))
-		return;
+		return -EIO;
 
-	spin_lock(&sbi->dir_inode_lock);
+	spin_lock(&sbi->inode_lock[type]);
 
-	head = &sbi->dir_inode_list;
+	head = &sbi->inode_list[type];
 	if (list_empty(head)) {
-		spin_unlock(&sbi->dir_inode_lock);
-		return;
+		spin_unlock(&sbi->inode_lock[type]);
+		trace_f2fs_sync_dirty_inodes_exit(sbi->sb, is_dir,
+				get_pages(sbi, is_dir ?
+				F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA));
+		return 0;
 	}
-	entry = list_entry(head->next, struct inode_entry, list);
-	inode = igrab(entry->inode);
-	spin_unlock(&sbi->dir_inode_lock);
+	fi = list_entry(head->next, struct f2fs_inode_info, dirty_list);
+	inode = igrab(&fi->vfs_inode);
+	spin_unlock(&sbi->inode_lock[type]);
 	if (inode) {
 		filemap_fdatawrite(inode->i_mapping);
 		iput(inode);
@@ -868,11 +859,9 @@ retry_flush_dents:
 	/* write all the dirty dentry pages */
 	if (get_pages(sbi, F2FS_DIRTY_DENTS)) {
 		f2fs_unlock_all(sbi);
-		sync_dirty_dir_inodes(sbi);
-		if (unlikely(f2fs_cp_error(sbi))) {
-			err = -EIO;
+		err = sync_dirty_inodes(sbi, DIR_INODE);
+		if (err)
 			goto out;
-		}
 		goto retry_flush_dents;
 	}
 
@@ -885,10 +874,9 @@ retry_flush_nodes:
 
 	if (get_pages(sbi, F2FS_DIRTY_NODES)) {
 		up_write(&sbi->node_write);
-		sync_node_pages(sbi, 0, &wbc);
-		if (unlikely(f2fs_cp_error(sbi))) {
+		err = sync_node_pages(sbi, 0, &wbc);
+		if (err) {
 			f2fs_unlock_all(sbi);
-			err = -EIO;
 			goto out;
 		}
 		goto retry_flush_nodes;
@@ -919,7 +907,7 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi)
 	finish_wait(&sbi->cp_wait, &wait);
 }
 
-static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
+static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 {
 	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
 	struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
@@ -945,7 +933,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	while (get_pages(sbi, F2FS_DIRTY_META)) {
 		sync_meta_pages(sbi, META, LONG_MAX);
 		if (unlikely(f2fs_cp_error(sbi)))
-			return;
+			return -EIO;
 	}
 
 	next_free_nid(sbi, &last_nid);
@@ -1030,7 +1018,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	/* need to wait for end_io results */
 	wait_on_all_pages_writeback(sbi);
 	if (unlikely(f2fs_cp_error(sbi)))
-		return;
+		return -EIO;
 
 	/* write out checkpoint buffer at block 0 */
 	update_meta_page(sbi, ckpt, start_blk++);
@@ -1058,7 +1046,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	wait_on_all_pages_writeback(sbi);
 
 	if (unlikely(f2fs_cp_error(sbi)))
-		return;
+		return -EIO;
 
 	filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX);
 	filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX);
@@ -1081,22 +1069,25 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 		invalidate_mapping_pages(META_MAPPING(sbi), discard_blk,
 								discard_blk);
 
-	release_dirty_inode(sbi);
+	release_ino_entry(sbi);
 
 	if (unlikely(f2fs_cp_error(sbi)))
-		return;
+		return -EIO;
 
 	clear_prefree_segments(sbi, cpc);
 	clear_sbi_flag(sbi, SBI_IS_DIRTY);
+
+	return 0;
 }
 
 /*
  * We guarantee that this checkpoint procedure will not fail.
  */
-void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
+int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 {
 	struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
 	unsigned long long ckpt_ver;
+	int err = 0;
 
 	mutex_lock(&sbi->cp_mutex);
 
@@ -1104,14 +1095,19 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 		(cpc->reason == CP_FASTBOOT || cpc->reason == CP_SYNC ||
 		(cpc->reason == CP_DISCARD && !sbi->discard_blks)))
 		goto out;
-	if (unlikely(f2fs_cp_error(sbi)))
+	if (unlikely(f2fs_cp_error(sbi))) {
+		err = -EIO;
 		goto out;
-	if (f2fs_readonly(sbi->sb))
+	}
+	if (f2fs_readonly(sbi->sb)) {
+		err = -EROFS;
 		goto out;
+	}
 
 	trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops");
 
-	if (block_operations(sbi))
+	err = block_operations(sbi);
+	if (err)
 		goto out;
 
 	trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops");
@@ -1133,7 +1129,7 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 	flush_sit_entries(sbi, cpc);
 
 	/* unlock all the fs_lock[] in do_checkpoint() */
-	do_checkpoint(sbi, cpc);
+	err = do_checkpoint(sbi, cpc);
 
 	unblock_operations(sbi);
 	stat_inc_cp_count(sbi->stat_info);
@@ -1143,10 +1139,11 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
 			"checkpoint: version = %llx", ckpt_ver);
 
 	/* do checkpoint periodically */
-	sbi->cp_expires = round_jiffies_up(jiffies + HZ * sbi->cp_interval);
+	f2fs_update_time(sbi, CP_TIME);
+	trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint");
 out:
 	mutex_unlock(&sbi->cp_mutex);
-	trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint");
+	return err;
 }
 
 void init_ino_entry_info(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 972eab7ac..5c06db17e 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -225,7 +225,8 @@ void set_data_blkaddr(struct dnode_of_data *dn)
 	/* Get physical address of data block */
 	addr_array = blkaddr_in_node(rn);
 	addr_array[ofs_in_node] = cpu_to_le32(dn->data_blkaddr);
-	set_page_dirty(node_page);
+	if (set_page_dirty(node_page))
+		dn->node_changed = true;
 }
 
 int reserve_new_block(struct dnode_of_data *dn)
@@ -412,7 +413,7 @@ struct page *get_new_data_page(struct inode *inode,
 	struct page *page;
 	struct dnode_of_data dn;
 	int err;
-repeat:
+
 	page = f2fs_grab_cache_page(mapping, index, true);
 	if (!page) {
 		/*
@@ -441,12 +442,11 @@ repeat:
 	} else {
 		f2fs_put_page(page, 1);
 
-		page = get_read_data_page(inode, index, READ_SYNC, true);
+		/* if ipage exists, blkaddr should be NEW_ADDR */
+		f2fs_bug_on(F2FS_I_SB(inode), ipage);
+		page = get_lock_data_page(inode, index, true);
 		if (IS_ERR(page))
-			goto repeat;
-
-		/* wait for read completion */
-		lock_page(page);
+			return page;
 	}
 got_it:
 	if (new_i_size && i_size_read(inode) <
@@ -494,14 +494,10 @@ alloc:
 	if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT))
 		i_size_write(dn->inode,
 				((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT));
-
-	/* direct IO doesn't use extent cache to maximize the performance */
-	f2fs_drop_largest_extent(dn->inode, fofs);
-
 	return 0;
 }
 
-static void __allocate_data_blocks(struct inode *inode, loff_t offset,
+static int __allocate_data_blocks(struct inode *inode, loff_t offset,
 							size_t count)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
@@ -510,14 +506,15 @@ static void __allocate_data_blocks(struct inode *inode, loff_t offset,
 	u64 len = F2FS_BYTES_TO_BLK(count);
 	bool allocated;
 	u64 end_offset;
+	int err = 0;
 
 	while (len) {
-		f2fs_balance_fs(sbi);
 		f2fs_lock_op(sbi);
 
 		/* When reading holes, we need its node page */
 		set_new_dnode(&dn, inode, NULL, NULL, 0);
-		if (get_dnode_of_data(&dn, start, ALLOC_NODE))
+		err = get_dnode_of_data(&dn, start, ALLOC_NODE);
+		if (err)
 			goto out;
 
 		allocated = false;
@@ -526,12 +523,15 @@ static void __allocate_data_blocks(struct inode *inode, loff_t offset,
 		while (dn.ofs_in_node < end_offset && len) {
 			block_t blkaddr;
 
-			if (unlikely(f2fs_cp_error(sbi)))
+			if (unlikely(f2fs_cp_error(sbi))) {
+				err = -EIO;
 				goto sync_out;
+			}
 
 			blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
 			if (blkaddr == NULL_ADDR || blkaddr == NEW_ADDR) {
-				if (__allocate_data_block(&dn))
+				err = __allocate_data_block(&dn);
+				if (err)
 					goto sync_out;
 				allocated = true;
 			}
@@ -545,8 +545,10 @@ static void __allocate_data_blocks(struct inode *inode, loff_t offset,
 
 		f2fs_put_dnode(&dn);
 		f2fs_unlock_op(sbi);
+
+		f2fs_balance_fs(sbi, dn.node_changed);
 	}
-	return;
+	return err;
 
 sync_out:
 	if (allocated)
@@ -554,7 +556,8 @@ sync_out:
 	f2fs_put_dnode(&dn);
 out:
 	f2fs_unlock_op(sbi);
-	return;
+	f2fs_balance_fs(sbi, dn.node_changed);
+	return err;
 }
 
 /*
@@ -566,7 +569,7 @@ out:
  *     b. do not use extent cache for better performance
  *     c. give the block addresses to blockdev
  */
-static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
+int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
 						int create, int flag)
 {
 	unsigned int maxblocks = map->m_len;
@@ -577,6 +580,7 @@ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
 	int err = 0, ofs = 1;
 	struct extent_info ei;
 	bool allocated = false;
+	block_t blkaddr;
 
 	map->m_len = 0;
 	map->m_flags = 0;
@@ -592,7 +596,7 @@ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
 	}
 
 	if (create)
-		f2fs_lock_op(F2FS_I_SB(inode));
+		f2fs_lock_op(sbi);
 
 	/* When reading holes, we need its node page */
 	set_new_dnode(&dn, inode, NULL, NULL, 0);
@@ -640,12 +644,21 @@ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
 	pgofs++;
 
 get_next:
+	if (map->m_len >= maxblocks)
+		goto sync_out;
+
 	if (dn.ofs_in_node >= end_offset) {
 		if (allocated)
 			sync_inode_page(&dn);
 		allocated = false;
 		f2fs_put_dnode(&dn);
 
+		if (create) {
+			f2fs_unlock_op(sbi);
+			f2fs_balance_fs(sbi, dn.node_changed);
+			f2fs_lock_op(sbi);
+		}
+
 		set_new_dnode(&dn, inode, NULL, NULL, 0);
 		err = get_dnode_of_data(&dn, pgofs, mode);
 		if (err) {
@@ -657,52 +670,53 @@ get_next:
 		end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode));
 	}
 
-	if (maxblocks > map->m_len) {
-		block_t blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
+	blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node);
 
-		if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) {
-			if (create) {
-				if (unlikely(f2fs_cp_error(sbi))) {
-					err = -EIO;
-					goto sync_out;
-				}
-				err = __allocate_data_block(&dn);
-				if (err)
-					goto sync_out;
-				allocated = true;
-				map->m_flags |= F2FS_MAP_NEW;
-				blkaddr = dn.data_blkaddr;
-			} else {
-				/*
-				 * we only merge preallocated unwritten blocks
-				 * for fiemap.
-				 */
-				if (flag != F2FS_GET_BLOCK_FIEMAP ||
-						blkaddr != NEW_ADDR)
-					goto sync_out;
+	if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) {
+		if (create) {
+			if (unlikely(f2fs_cp_error(sbi))) {
+				err = -EIO;
+				goto sync_out;
 			}
+			err = __allocate_data_block(&dn);
+			if (err)
+				goto sync_out;
+			allocated = true;
+			map->m_flags |= F2FS_MAP_NEW;
+			blkaddr = dn.data_blkaddr;
+		} else {
+			/*
+			 * we only merge preallocated unwritten blocks
+			 * for fiemap.
+			 */
+			if (flag != F2FS_GET_BLOCK_FIEMAP ||
+					blkaddr != NEW_ADDR)
+				goto sync_out;
 		}
+	}
 
-		/* Give more consecutive addresses for the readahead */
-		if ((map->m_pblk != NEW_ADDR &&
-				blkaddr == (map->m_pblk + ofs)) ||
-				(map->m_pblk == NEW_ADDR &&
-				blkaddr == NEW_ADDR)) {
-			ofs++;
-			dn.ofs_in_node++;
-			pgofs++;
-			map->m_len++;
-			goto get_next;
-		}
+	/* Give more consecutive addresses for the readahead */
+	if ((map->m_pblk != NEW_ADDR &&
+			blkaddr == (map->m_pblk + ofs)) ||
+			(map->m_pblk == NEW_ADDR &&
+			blkaddr == NEW_ADDR)) {
+		ofs++;
+		dn.ofs_in_node++;
+		pgofs++;
+		map->m_len++;
+		goto get_next;
 	}
+
 sync_out:
 	if (allocated)
 		sync_inode_page(&dn);
 put_out:
 	f2fs_put_dnode(&dn);
 unlock_out:
-	if (create)
-		f2fs_unlock_op(F2FS_I_SB(inode));
+	if (create) {
+		f2fs_unlock_op(sbi);
+		f2fs_balance_fs(sbi, dn.node_changed);
+	}
 out:
 	trace_f2fs_map_blocks(inode, map, err);
 	return err;
@@ -742,6 +756,10 @@ static int get_data_block_dio(struct inode *inode, sector_t iblock,
 static int get_data_block_bmap(struct inode *inode, sector_t iblock,
 			struct buffer_head *bh_result, int create)
 {
+	/* Block number less than F2FS MAX BLOCKS */
+	if (unlikely(iblock >= F2FS_I_SB(inode)->max_file_blocks))
+		return -EFBIG;
+
 	return __get_data_block(inode, iblock, bh_result, create,
 						F2FS_GET_BLOCK_BMAP);
 }
@@ -761,10 +779,9 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 {
 	struct buffer_head map_bh;
 	sector_t start_blk, last_blk;
-	loff_t isize = i_size_read(inode);
+	loff_t isize;
 	u64 logical = 0, phys = 0, size = 0;
 	u32 flags = 0;
-	bool past_eof = false, whole_file = false;
 	int ret = 0;
 
 	ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
@@ -777,18 +794,21 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 			return ret;
 	}
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
-	if (len >= isize) {
-		whole_file = true;
-		len = isize;
-	}
+	isize = i_size_read(inode);
+	if (start >= isize)
+		goto out;
+
+	if (start + len > isize)
+		len = isize - start;
 
 	if (logical_to_blk(inode, len) == 0)
 		len = blk_to_logical(inode, 1);
 
 	start_blk = logical_to_blk(inode, start);
 	last_blk = logical_to_blk(inode, start + len - 1);
+
 next:
 	memset(&map_bh, 0, sizeof(struct buffer_head));
 	map_bh.b_size = len;
@@ -800,59 +820,37 @@ next:
 
 	/* HOLE */
 	if (!buffer_mapped(&map_bh)) {
-		start_blk++;
-
-		if (!past_eof && blk_to_logical(inode, start_blk) >= isize)
-			past_eof = 1;
-
-		if (past_eof && size) {
-			flags |= FIEMAP_EXTENT_LAST;
-			ret = fiemap_fill_next_extent(fieinfo, logical,
-					phys, size, flags);
-		} else if (size) {
-			ret = fiemap_fill_next_extent(fieinfo, logical,
-					phys, size, flags);
-			size = 0;
-		}
+		/* Go through holes until past the EOF */
+		if (blk_to_logical(inode, start_blk++) < isize)
+			goto prep_next;
+		/* Finding a hole beyond isize means no more extents.
+		 * Note that this relies on the premise that filesystems
+		 * don't punch holes beyond isize and keep size unchanged.
+		 */
+		flags |= FIEMAP_EXTENT_LAST;
+	}
 
-		/* if we have holes up to/past EOF then we're done */
-		if (start_blk > last_blk || past_eof || ret)
-			goto out;
-	} else {
-		if (start_blk > last_blk && !whole_file) {
-			ret = fiemap_fill_next_extent(fieinfo, logical,
-					phys, size, flags);
-			goto out;
-		}
+	if (size) {
+		if (f2fs_encrypted_inode(inode))
+			flags |= FIEMAP_EXTENT_DATA_ENCRYPTED;
 
-		/*
-		 * if size != 0 then we know we already have an extent
-		 * to add, so add it.
-		 */
-		if (size) {
-			ret = fiemap_fill_next_extent(fieinfo, logical,
-					phys, size, flags);
-			if (ret)
-				goto out;
-		}
+		ret = fiemap_fill_next_extent(fieinfo, logical,
+				phys, size, flags);
+	}
 
-		logical = blk_to_logical(inode, start_blk);
-		phys = blk_to_logical(inode, map_bh.b_blocknr);
-		size = map_bh.b_size;
-		flags = 0;
-		if (buffer_unwritten(&map_bh))
-			flags = FIEMAP_EXTENT_UNWRITTEN;
+	if (start_blk > last_blk || ret)
+		goto out;
 
-		start_blk += logical_to_blk(inode, size);
+	logical = blk_to_logical(inode, start_blk);
+	phys = blk_to_logical(inode, map_bh.b_blocknr);
+	size = map_bh.b_size;
+	flags = 0;
+	if (buffer_unwritten(&map_bh))
+		flags = FIEMAP_EXTENT_UNWRITTEN;
 
-		/*
-		 * If we are past the EOF, then we need to make sure as
-		 * soon as we find a hole that the last extent we found
-		 * is marked with FIEMAP_EXTENT_LAST
-		 */
-		if (!past_eof && logical + size >= isize)
-			past_eof = true;
-	}
+	start_blk += logical_to_blk(inode, size);
+
+prep_next:
 	cond_resched();
 	if (fatal_signal_pending(current))
 		ret = -EINTR;
@@ -862,7 +860,7 @@ out:
 	if (ret == 1)
 		ret = 0;
 
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	return ret;
 }
 
@@ -1083,6 +1081,7 @@ int do_write_data_page(struct f2fs_io_info *fio)
 	 */
 	if (unlikely(fio->blk_addr != NEW_ADDR &&
 			!is_cold_data(page) &&
+			!IS_ATOMIC_WRITTEN_PAGE(page) &&
 			need_inplace_update(inode))) {
 		rewrite_data_page(fio);
 		set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE);
@@ -1179,10 +1178,11 @@ out:
 	if (err)
 		ClearPageUptodate(page);
 	unlock_page(page);
-	if (need_balance_fs)
-		f2fs_balance_fs(sbi);
-	if (wbc->for_reclaim)
+	f2fs_balance_fs(sbi, need_balance_fs);
+	if (wbc->for_reclaim || unlikely(f2fs_cp_error(sbi))) {
 		f2fs_submit_merged_bio(sbi, DATA, WRITE);
+		remove_dirty_inode(inode);
+	}
 	return 0;
 
 redirty_out:
@@ -1354,6 +1354,10 @@ static int f2fs_write_data_pages(struct address_space *mapping,
 			available_free_memory(sbi, DIRTY_DENTS))
 		goto skip_write;
 
+	/* skip writing during file defragment */
+	if (is_inode_flag_set(F2FS_I(inode), FI_DO_DEFRAG))
+		goto skip_write;
+
 	/* during POR, we don't need to trigger writepage at all. */
 	if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
 		goto skip_write;
@@ -1369,7 +1373,7 @@ static int f2fs_write_data_pages(struct address_space *mapping,
 	if (locked)
 		mutex_unlock(&sbi->writepages);
 
-	remove_dirty_dir_inode(inode);
+	remove_dirty_inode(inode);
 
 	wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff);
 	return ret;
@@ -1382,13 +1386,85 @@ skip_write:
 static void f2fs_write_failed(struct address_space *mapping, loff_t to)
 {
 	struct inode *inode = mapping->host;
+	loff_t i_size = i_size_read(inode);
 
-	if (to > inode->i_size) {
-		truncate_pagecache(inode, inode->i_size);
-		truncate_blocks(inode, inode->i_size, true);
+	if (to > i_size) {
+		truncate_pagecache(inode, i_size);
+		truncate_blocks(inode, i_size, true);
 	}
 }
 
+static int prepare_write_begin(struct f2fs_sb_info *sbi,
+			struct page *page, loff_t pos, unsigned len,
+			block_t *blk_addr, bool *node_changed)
+{
+	struct inode *inode = page->mapping->host;
+	pgoff_t index = page->index;
+	struct dnode_of_data dn;
+	struct page *ipage;
+	bool locked = false;
+	struct extent_info ei;
+	int err = 0;
+
+	if (f2fs_has_inline_data(inode) ||
+			(pos & PAGE_CACHE_MASK) >= i_size_read(inode)) {
+		f2fs_lock_op(sbi);
+		locked = true;
+	}
+restart:
+	/* check inline_data */
+	ipage = get_node_page(sbi, inode->i_ino);
+	if (IS_ERR(ipage)) {
+		err = PTR_ERR(ipage);
+		goto unlock_out;
+	}
+
+	set_new_dnode(&dn, inode, ipage, ipage, 0);
+
+	if (f2fs_has_inline_data(inode)) {
+		if (pos + len <= MAX_INLINE_DATA) {
+			read_inline_data(page, ipage);
+			set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
+			sync_inode_page(&dn);
+		} else {
+			err = f2fs_convert_inline_page(&dn, page);
+			if (err)
+				goto out;
+			if (dn.data_blkaddr == NULL_ADDR)
+				err = f2fs_get_block(&dn, index);
+		}
+	} else if (locked) {
+		err = f2fs_get_block(&dn, index);
+	} else {
+		if (f2fs_lookup_extent_cache(inode, index, &ei)) {
+			dn.data_blkaddr = ei.blk + index - ei.fofs;
+		} else {
+			bool restart = false;
+
+			/* hole case */
+			err = get_dnode_of_data(&dn, index, LOOKUP_NODE);
+			if (err || (!err && dn.data_blkaddr == NULL_ADDR))
+				restart = true;
+			if (restart) {
+				f2fs_put_dnode(&dn);
+				f2fs_lock_op(sbi);
+				locked = true;
+				goto restart;
+			}
+		}
+	}
+
+	/* convert_inline_page can make node_changed */
+	*blk_addr = dn.data_blkaddr;
+	*node_changed = dn.node_changed;
+out:
+	f2fs_put_dnode(&dn);
+unlock_out:
+	if (locked)
+		f2fs_unlock_op(sbi);
+	return err;
+}
+
 static int f2fs_write_begin(struct file *file, struct address_space *mapping,
 		loff_t pos, unsigned len, unsigned flags,
 		struct page **pagep, void **fsdata)
@@ -1396,15 +1472,13 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
 	struct inode *inode = mapping->host;
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct page *page = NULL;
-	struct page *ipage;
 	pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT;
-	struct dnode_of_data dn;
+	bool need_balance = false;
+	block_t blkaddr = NULL_ADDR;
 	int err = 0;
 
 	trace_f2fs_write_begin(inode, pos, len, flags);
 
-	f2fs_balance_fs(sbi);
-
 	/*
 	 * We should check this at this moment to avoid deadlock on inode page
 	 * and #0 page. The locking rule for inline_data conversion should be:
@@ -1424,41 +1498,27 @@ repeat:
 
 	*pagep = page;
 
-	f2fs_lock_op(sbi);
-
-	/* check inline_data */
-	ipage = get_node_page(sbi, inode->i_ino);
-	if (IS_ERR(ipage)) {
-		err = PTR_ERR(ipage);
-		goto unlock_fail;
-	}
-
-	set_new_dnode(&dn, inode, ipage, ipage, 0);
+	err = prepare_write_begin(sbi, page, pos, len,
+					&blkaddr, &need_balance);
+	if (err)
+		goto fail;
 
-	if (f2fs_has_inline_data(inode)) {
-		if (pos + len <= MAX_INLINE_DATA) {
-			read_inline_data(page, ipage);
-			set_inode_flag(F2FS_I(inode), FI_DATA_EXIST);
-			sync_inode_page(&dn);
-			goto put_next;
+	if (need_balance && has_not_enough_free_secs(sbi, 0)) {
+		unlock_page(page);
+		f2fs_balance_fs(sbi, true);
+		lock_page(page);
+		if (page->mapping != mapping) {
+			/* The page got truncated from under us */
+			f2fs_put_page(page, 1);
+			goto repeat;
 		}
-		err = f2fs_convert_inline_page(&dn, page);
-		if (err)
-			goto put_fail;
 	}
 
-	err = f2fs_get_block(&dn, index);
-	if (err)
-		goto put_fail;
-put_next:
-	f2fs_put_dnode(&dn);
-	f2fs_unlock_op(sbi);
-
 	f2fs_wait_on_page_writeback(page, DATA);
 
 	/* wait for GCed encrypted page writeback */
 	if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
-		f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr);
+		f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr);
 
 	if (len == PAGE_CACHE_SIZE)
 		goto out_update;
@@ -1474,14 +1534,14 @@ put_next:
 		goto out_update;
 	}
 
-	if (dn.data_blkaddr == NEW_ADDR) {
+	if (blkaddr == NEW_ADDR) {
 		zero_user_segment(page, 0, PAGE_CACHE_SIZE);
 	} else {
 		struct f2fs_io_info fio = {
 			.sbi = sbi,
 			.type = DATA,
 			.rw = READ_SYNC,
-			.blk_addr = dn.data_blkaddr,
+			.blk_addr = blkaddr,
 			.page = page,
 			.encrypted_page = NULL,
 		};
@@ -1512,10 +1572,6 @@ out_clear:
 	clear_cold_data(page);
 	return 0;
 
-put_fail:
-	f2fs_put_dnode(&dn);
-unlock_fail:
-	f2fs_unlock_op(sbi);
 fail:
 	f2fs_put_page(page, 1);
 	f2fs_write_failed(mapping, pos + len);
@@ -1540,6 +1596,7 @@ static int f2fs_write_end(struct file *file,
 	}
 
 	f2fs_put_page(page, 1);
+	f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
 	return copied;
 }
 
@@ -1567,11 +1624,9 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	int err;
 
 	/* we don't need to use inline_data strictly */
-	if (f2fs_has_inline_data(inode)) {
-		err = f2fs_convert_inline_inode(inode);
-		if (err)
-			return err;
-	}
+	err = f2fs_convert_inline_inode(inode);
+	if (err)
+		return err;
 
 	if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode))
 		return 0;
@@ -1583,11 +1638,9 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
 
 	if (iov_iter_rw(iter) == WRITE) {
-		__allocate_data_blocks(inode, offset, count);
-		if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) {
-			err = -EIO;
+		err = __allocate_data_blocks(inode, offset, count);
+		if (err)
 			goto out;
-		}
 	}
 
 	err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block_dio);
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 478e5d541..4fb6ef88a 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -38,12 +38,15 @@ static void update_general_status(struct f2fs_sb_info *sbi)
 	si->hit_rbtree = atomic64_read(&sbi->read_hit_rbtree);
 	si->hit_total = si->hit_largest + si->hit_cached + si->hit_rbtree;
 	si->total_ext = atomic64_read(&sbi->total_hit_ext);
-	si->ext_tree = sbi->total_ext_tree;
+	si->ext_tree = atomic_read(&sbi->total_ext_tree);
+	si->zombie_tree = atomic_read(&sbi->total_zombie_tree);
 	si->ext_node = atomic_read(&sbi->total_ext_node);
 	si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES);
 	si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS);
-	si->ndirty_dirs = sbi->n_dirty_dirs;
 	si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META);
+	si->ndirty_data = get_pages(sbi, F2FS_DIRTY_DATA);
+	si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE];
+	si->ndirty_files = sbi->ndirty_inode[FILE_INODE];
 	si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES);
 	si->wb_pages = get_pages(sbi, F2FS_WRITEBACK);
 	si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg;
@@ -105,7 +108,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi)
 
 	bimodal = 0;
 	total_vblocks = 0;
-	blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg);
+	blks_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg;
 	hblks_per_sec = blks_per_sec / 2;
 	for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) {
 		vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec);
@@ -189,10 +192,10 @@ get_cache:
 	si->cache_mem += NM_I(sbi)->dirty_nat_cnt *
 					sizeof(struct nat_entry_set);
 	si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages);
-	si->cache_mem += sbi->n_dirty_dirs * sizeof(struct inode_entry);
 	for (i = 0; i <= UPDATE_INO; i++)
 		si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry);
-	si->cache_mem += sbi->total_ext_tree * sizeof(struct extent_tree);
+	si->cache_mem += atomic_read(&sbi->total_ext_tree) *
+						sizeof(struct extent_tree);
 	si->cache_mem += atomic_read(&sbi->total_ext_node) *
 						sizeof(struct extent_node);
 
@@ -211,12 +214,10 @@ static int stat_show(struct seq_file *s, void *v)
 
 	mutex_lock(&f2fs_stat_mutex);
 	list_for_each_entry(si, &f2fs_stat_list, stat_list) {
-		char devname[BDEVNAME_SIZE];
-
 		update_general_status(si->sbi);
 
-		seq_printf(s, "\n=====[ partition info(%s). #%d ]=====\n",
-			bdevname(si->sbi->sb->s_bdev, devname), i++);
+		seq_printf(s, "\n=====[ partition info(%pg). #%d ]=====\n",
+			si->sbi->sb->s_bdev, i++);
 		seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ",
 			   si->sit_area_segs, si->nat_area_segs);
 		seq_printf(s, "[SSA: %d] [MAIN: %d",
@@ -269,7 +270,8 @@ static int stat_show(struct seq_file *s, void *v)
 			   si->dirty_count);
 		seq_printf(s, "  - Prefree: %d\n  - Free: %d (%d)\n\n",
 			   si->prefree_count, si->free_segs, si->free_secs);
-		seq_printf(s, "CP calls: %d\n", si->cp_count);
+		seq_printf(s, "CP calls: %d (BG: %d)\n",
+				si->cp_count, si->bg_cp_count);
 		seq_printf(s, "GC calls: %d (BG: %d)\n",
 			   si->call_count, si->bg_gc);
 		seq_printf(s, "  - data segments : %d (%d)\n",
@@ -290,8 +292,8 @@ static int stat_show(struct seq_file *s, void *v)
 				!si->total_ext ? 0 :
 				div64_u64(si->hit_total * 100, si->total_ext),
 				si->hit_total, si->total_ext);
-		seq_printf(s, "  - Inner Struct Count: tree: %d, node: %d\n",
-				si->ext_tree, si->ext_node);
+		seq_printf(s, "  - Inner Struct Count: tree: %d(%d), node: %d\n",
+				si->ext_tree, si->zombie_tree, si->ext_node);
 		seq_puts(s, "\nBalancing F2FS Async:\n");
 		seq_printf(s, "  - inmem: %4d, wb: %4d\n",
 			   si->inmem_pages, si->wb_pages);
@@ -299,6 +301,8 @@ static int stat_show(struct seq_file *s, void *v)
 			   si->ndirty_node, si->node_pages);
 		seq_printf(s, "  - dents: %4d in dirs:%4d\n",
 			   si->ndirty_dent, si->ndirty_dirs);
+		seq_printf(s, "  - datas: %4d in files:%4d\n",
+			   si->ndirty_data, si->ndirty_files);
 		seq_printf(s, "  - meta: %4d in %4d\n",
 			   si->ndirty_meta, si->meta_pages);
 		seq_printf(s, "  - NATs: %9d/%9d\n  - SITs: %9d/%9d\n",
@@ -406,20 +410,23 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi)
 	kfree(si);
 }
 
-void __init f2fs_create_root_stats(void)
+int __init f2fs_create_root_stats(void)
 {
 	struct dentry *file;
 
 	f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL);
 	if (!f2fs_debugfs_root)
-		return;
+		return -ENOMEM;
 
 	file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root,
 			NULL, &stat_fops);
 	if (!file) {
 		debugfs_remove(f2fs_debugfs_root);
 		f2fs_debugfs_root = NULL;
+		return -ENOMEM;
 	}
+
+	return 0;
 }
 
 void f2fs_destroy_root_stats(void)
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 7c1678ba8..faa7495e2 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -172,8 +172,6 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir,
 
 	namehash = f2fs_dentry_hash(&name);
 
-	f2fs_bug_on(F2FS_I_SB(dir), level > MAX_DIR_HASH_DEPTH);
-
 	nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level);
 	nblock = bucket_blocks(level);
 
@@ -238,6 +236,14 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir,
 		goto out;
 
 	max_depth = F2FS_I(dir)->i_current_depth;
+	if (unlikely(max_depth > MAX_DIR_HASH_DEPTH)) {
+		f2fs_msg(F2FS_I_SB(dir)->sb, KERN_WARNING,
+				"Corrupted max_depth of %lu: %u",
+				dir->i_ino, max_depth);
+		max_depth = MAX_DIR_HASH_DEPTH;
+		F2FS_I(dir)->i_current_depth = max_depth;
+		mark_inode_dirty(dir);
+	}
 
 	for (level = 0; level < max_depth; level++) {
 		de = find_in_level(dir, level, &fname, res_page);
@@ -444,7 +450,7 @@ error:
 	/* once the failed inode becomes a bad inode, i_mode is S_IFREG */
 	truncate_inode_pages(&inode->i_data, 0);
 	truncate_blocks(inode, 0, false);
-	remove_dirty_dir_inode(inode);
+	remove_dirty_inode(inode);
 	remove_inode_page(inode);
 	return ERR_PTR(err);
 }
@@ -630,6 +636,7 @@ fail:
 	f2fs_put_page(dentry_page, 1);
 out:
 	f2fs_fname_free_filename(&fname);
+	f2fs_update_time(F2FS_I_SB(dir), REQ_TIME);
 	return err;
 }
 
@@ -651,6 +658,7 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir)
 	clear_inode_flag(F2FS_I(inode), FI_NEW_INODE);
 fail:
 	up_write(&F2FS_I(inode)->i_sem);
+	f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
 	return err;
 }
 
@@ -695,6 +703,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
 	int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len));
 	int i;
 
+	f2fs_update_time(F2FS_I_SB(dir), REQ_TIME);
+
 	if (f2fs_has_inline_dentry(dir))
 		return f2fs_delete_inline_entry(dentry, page, dir, inode);
 
@@ -855,25 +865,27 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
 
 	for (; n < npages; n++) {
 		dentry_page = get_lock_data_page(inode, n, false);
-		if (IS_ERR(dentry_page))
-			continue;
+		if (IS_ERR(dentry_page)) {
+			err = PTR_ERR(dentry_page);
+			if (err == -ENOENT)
+				continue;
+			else
+				goto out;
+		}
 
 		dentry_blk = kmap(dentry_page);
 
 		make_dentry_ptr(inode, &d, (void *)dentry_blk, 1);
 
-		if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr))
-			goto stop;
+		if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr)) {
+			kunmap(dentry_page);
+			f2fs_put_page(dentry_page, 1);
+			break;
+		}
 
 		ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK;
 		kunmap(dentry_page);
 		f2fs_put_page(dentry_page, 1);
-		dentry_page = NULL;
-	}
-stop:
-	if (dentry_page && !IS_ERR(dentry_page)) {
-		kunmap(dentry_page);
-		f2fs_put_page(dentry_page, 1);
 	}
 out:
 	f2fs_fname_crypto_free_buffer(&fstr);
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index 7ddba812e..ccd5c636d 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -36,7 +36,7 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi,
 
 	rb_link_node(&en->rb_node, parent, p);
 	rb_insert_color(&en->rb_node, &et->root);
-	et->count++;
+	atomic_inc(&et->node_cnt);
 	atomic_inc(&sbi->total_ext_node);
 	return en;
 }
@@ -45,7 +45,7 @@ static void __detach_extent_node(struct f2fs_sb_info *sbi,
 				struct extent_tree *et, struct extent_node *en)
 {
 	rb_erase(&en->rb_node, &et->root);
-	et->count--;
+	atomic_dec(&et->node_cnt);
 	atomic_dec(&sbi->total_ext_node);
 
 	if (et->cached_en == en)
@@ -68,11 +68,13 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode)
 		et->root = RB_ROOT;
 		et->cached_en = NULL;
 		rwlock_init(&et->lock);
-		atomic_set(&et->refcount, 0);
-		et->count = 0;
-		sbi->total_ext_tree++;
+		INIT_LIST_HEAD(&et->list);
+		atomic_set(&et->node_cnt, 0);
+		atomic_inc(&sbi->total_ext_tree);
+	} else {
+		atomic_dec(&sbi->total_zombie_tree);
+		list_del_init(&et->list);
 	}
-	atomic_inc(&et->refcount);
 	up_write(&sbi->extent_tree_lock);
 
 	/* never died until evict_inode */
@@ -131,7 +133,7 @@ static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi,
 {
 	struct rb_node *node, *next;
 	struct extent_node *en;
-	unsigned int count = et->count;
+	unsigned int count = atomic_read(&et->node_cnt);
 
 	node = rb_first(&et->root);
 	while (node) {
@@ -152,7 +154,7 @@ static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi,
 		node = next;
 	}
 
-	return count - et->count;
+	return count - atomic_read(&et->node_cnt);
 }
 
 static void __drop_largest_extent(struct inode *inode,
@@ -164,34 +166,33 @@ static void __drop_largest_extent(struct inode *inode,
 		largest->len = 0;
 }
 
-void f2fs_drop_largest_extent(struct inode *inode, pgoff_t fofs)
-{
-	if (!f2fs_may_extent_tree(inode))
-		return;
-
-	__drop_largest_extent(inode, fofs, 1);
-}
-
-void f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext)
+/* return true, if inode page is changed */
+bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct extent_tree *et;
 	struct extent_node *en;
 	struct extent_info ei;
 
-	if (!f2fs_may_extent_tree(inode))
-		return;
+	if (!f2fs_may_extent_tree(inode)) {
+		/* drop largest extent */
+		if (i_ext && i_ext->len) {
+			i_ext->len = 0;
+			return true;
+		}
+		return false;
+	}
 
 	et = __grab_extent_tree(inode);
 
-	if (!i_ext || le32_to_cpu(i_ext->len) < F2FS_MIN_EXTENT_LEN)
-		return;
+	if (!i_ext || !i_ext->len)
+		return false;
 
 	set_extent_info(&ei, le32_to_cpu(i_ext->fofs),
 		le32_to_cpu(i_ext->blk), le32_to_cpu(i_ext->len));
 
 	write_lock(&et->lock);
-	if (et->count)
+	if (atomic_read(&et->node_cnt))
 		goto out;
 
 	en = __init_extent_tree(sbi, et, &ei);
@@ -202,6 +203,7 @@ void f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext)
 	}
 out:
 	write_unlock(&et->lock);
+	return false;
 }
 
 static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs,
@@ -549,45 +551,44 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode,
 unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
 {
 	struct extent_tree *treevec[EXT_TREE_VEC_SIZE];
+	struct extent_tree *et, *next;
 	struct extent_node *en, *tmp;
 	unsigned long ino = F2FS_ROOT_INO(sbi);
-	struct radix_tree_root *root = &sbi->extent_tree_root;
 	unsigned int found;
 	unsigned int node_cnt = 0, tree_cnt = 0;
 	int remained;
+	bool do_free = false;
 
 	if (!test_opt(sbi, EXTENT_CACHE))
 		return 0;
 
+	if (!atomic_read(&sbi->total_zombie_tree))
+		goto free_node;
+
 	if (!down_write_trylock(&sbi->extent_tree_lock))
 		goto out;
 
 	/* 1. remove unreferenced extent tree */
-	while ((found = radix_tree_gang_lookup(root,
-				(void **)treevec, ino, EXT_TREE_VEC_SIZE))) {
-		unsigned i;
-
-		ino = treevec[found - 1]->ino + 1;
-		for (i = 0; i < found; i++) {
-			struct extent_tree *et = treevec[i];
-
-			if (!atomic_read(&et->refcount)) {
-				write_lock(&et->lock);
-				node_cnt += __free_extent_tree(sbi, et, true);
-				write_unlock(&et->lock);
+	list_for_each_entry_safe(et, next, &sbi->zombie_list, list) {
+		if (atomic_read(&et->node_cnt)) {
+			write_lock(&et->lock);
+			node_cnt += __free_extent_tree(sbi, et, true);
+			write_unlock(&et->lock);
+		}
 
-				radix_tree_delete(root, et->ino);
-				kmem_cache_free(extent_tree_slab, et);
-				sbi->total_ext_tree--;
-				tree_cnt++;
+		list_del_init(&et->list);
+		radix_tree_delete(&sbi->extent_tree_root, et->ino);
+		kmem_cache_free(extent_tree_slab, et);
+		atomic_dec(&sbi->total_ext_tree);
+		atomic_dec(&sbi->total_zombie_tree);
+		tree_cnt++;
 
-				if (node_cnt + tree_cnt >= nr_shrink)
-					goto unlock_out;
-			}
-		}
+		if (node_cnt + tree_cnt >= nr_shrink)
+			goto unlock_out;
 	}
 	up_write(&sbi->extent_tree_lock);
 
+free_node:
 	/* 2. remove LRU extent entries */
 	if (!down_write_trylock(&sbi->extent_tree_lock))
 		goto out;
@@ -599,15 +600,19 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
 		if (!remained--)
 			break;
 		list_del_init(&en->list);
+		do_free = true;
 	}
 	spin_unlock(&sbi->extent_lock);
 
+	if (do_free == false)
+		goto unlock_out;
+
 	/*
 	 * reset ino for searching victims from beginning of global extent tree.
 	 */
 	ino = F2FS_ROOT_INO(sbi);
 
-	while ((found = radix_tree_gang_lookup(root,
+	while ((found = radix_tree_gang_lookup(&sbi->extent_tree_root,
 				(void **)treevec, ino, EXT_TREE_VEC_SIZE))) {
 		unsigned i;
 
@@ -615,9 +620,13 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink)
 		for (i = 0; i < found; i++) {
 			struct extent_tree *et = treevec[i];
 
-			write_lock(&et->lock);
-			node_cnt += __free_extent_tree(sbi, et, false);
-			write_unlock(&et->lock);
+			if (!atomic_read(&et->node_cnt))
+				continue;
+
+			if (write_trylock(&et->lock)) {
+				node_cnt += __free_extent_tree(sbi, et, false);
+				write_unlock(&et->lock);
+			}
 
 			if (node_cnt + tree_cnt >= nr_shrink)
 				goto unlock_out;
@@ -637,7 +646,7 @@ unsigned int f2fs_destroy_extent_node(struct inode *inode)
 	struct extent_tree *et = F2FS_I(inode)->extent_tree;
 	unsigned int node_cnt = 0;
 
-	if (!et)
+	if (!et || !atomic_read(&et->node_cnt))
 		return 0;
 
 	write_lock(&et->lock);
@@ -656,8 +665,12 @@ void f2fs_destroy_extent_tree(struct inode *inode)
 	if (!et)
 		return;
 
-	if (inode->i_nlink && !is_bad_inode(inode) && et->count) {
-		atomic_dec(&et->refcount);
+	if (inode->i_nlink && !is_bad_inode(inode) &&
+					atomic_read(&et->node_cnt)) {
+		down_write(&sbi->extent_tree_lock);
+		list_add_tail(&et->list, &sbi->zombie_list);
+		atomic_inc(&sbi->total_zombie_tree);
+		up_write(&sbi->extent_tree_lock);
 		return;
 	}
 
@@ -666,11 +679,10 @@ void f2fs_destroy_extent_tree(struct inode *inode)
 
 	/* delete extent tree entry in radix tree */
 	down_write(&sbi->extent_tree_lock);
-	atomic_dec(&et->refcount);
-	f2fs_bug_on(sbi, atomic_read(&et->refcount) || et->count);
+	f2fs_bug_on(sbi, atomic_read(&et->node_cnt));
 	radix_tree_delete(&sbi->extent_tree_root, inode->i_ino);
 	kmem_cache_free(extent_tree_slab, et);
-	sbi->total_ext_tree--;
+	atomic_dec(&sbi->total_ext_tree);
 	up_write(&sbi->extent_tree_lock);
 
 	F2FS_I(inode)->extent_tree = NULL;
@@ -722,7 +734,9 @@ void init_extent_cache_info(struct f2fs_sb_info *sbi)
 	init_rwsem(&sbi->extent_tree_lock);
 	INIT_LIST_HEAD(&sbi->extent_list);
 	spin_lock_init(&sbi->extent_lock);
-	sbi->total_ext_tree = 0;
+	atomic_set(&sbi->total_ext_tree, 0);
+	INIT_LIST_HEAD(&sbi->zombie_list);
+	atomic_set(&sbi->total_zombie_tree, 0);
 	atomic_set(&sbi->total_ext_node, 0);
 }
 
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 9db5500d6..ff79054c6 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -21,6 +21,7 @@
 #include <linux/sched.h>
 #include <linux/vmalloc.h>
 #include <linux/bio.h>
+#include <linux/blkdev.h>
 
 #ifdef CONFIG_F2FS_CHECK_FS
 #define f2fs_bug_on(sbi, condition)	BUG_ON(condition)
@@ -54,6 +55,7 @@
 #define F2FS_MOUNT_FASTBOOT		0x00001000
 #define F2FS_MOUNT_EXTENT_CACHE		0x00002000
 #define F2FS_MOUNT_FORCE_FG_GC		0x00004000
+#define F2FS_MOUNT_DATA_FLUSH		0x00008000
 
 #define clear_opt(sbi, option)	(sbi->mount_opt.opt &= ~F2FS_MOUNT_##option)
 #define set_opt(sbi, option)	(sbi->mount_opt.opt |= F2FS_MOUNT_##option)
@@ -125,6 +127,7 @@ enum {
 #define BATCHED_TRIM_BLOCKS(sbi)	\
 		(BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg)
 #define DEF_CP_INTERVAL			60	/* 60 secs */
+#define DEF_IDLE_INTERVAL		120	/* 2 mins */
 
 struct cp_control {
 	int reason;
@@ -158,13 +161,7 @@ struct ino_entry {
 	nid_t ino;		/* inode number */
 };
 
-/*
- * for the list of directory inodes or gc inodes.
- * NOTE: there are two slab users for this structure, if we add/modify/delete
- * fields in structure for one of slab users, it may affect fields or size of
- * other one, in this condition, it's better to split both of slab and related
- * data structure.
- */
+/* for the list of inodes to be GCed */
 struct inode_entry {
 	struct list_head list;	/* list head */
 	struct inode *inode;	/* vfs inode pointer */
@@ -234,6 +231,7 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
 #define F2FS_IOC_ABORT_VOLATILE_WRITE	_IO(F2FS_IOCTL_MAGIC, 5)
 #define F2FS_IOC_GARBAGE_COLLECT	_IO(F2FS_IOCTL_MAGIC, 6)
 #define F2FS_IOC_WRITE_CHECKPOINT	_IO(F2FS_IOCTL_MAGIC, 7)
+#define F2FS_IOC_DEFRAGMENT		_IO(F2FS_IOCTL_MAGIC, 8)
 
 #define F2FS_IOC_SET_ENCRYPTION_POLICY					\
 		_IOR('f', 19, struct f2fs_encryption_policy)
@@ -256,10 +254,16 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size,
 /*
  * ioctl commands in 32 bit emulation
  */
-#define F2FS_IOC32_GETFLAGS             FS_IOC32_GETFLAGS
-#define F2FS_IOC32_SETFLAGS             FS_IOC32_SETFLAGS
+#define F2FS_IOC32_GETFLAGS		FS_IOC32_GETFLAGS
+#define F2FS_IOC32_SETFLAGS		FS_IOC32_SETFLAGS
+#define F2FS_IOC32_GETVERSION		FS_IOC32_GETVERSION
 #endif
 
+struct f2fs_defragment {
+	u64 start;
+	u64 len;
+};
+
 /*
  * For INODE and NODE manager
  */
@@ -357,9 +361,9 @@ struct extent_tree {
 	struct rb_root root;		/* root of extent info rb-tree */
 	struct extent_node *cached_en;	/* recently accessed extent node */
 	struct extent_info largest;	/* largested extent info */
+	struct list_head list;		/* to be used by sbi->zombie_list */
 	rwlock_t lock;			/* protect extent info rb-tree */
-	atomic_t refcount;		/* reference count of rb-tree */
-	unsigned int count;		/* # of extent node in rb-tree*/
+	atomic_t node_cnt;		/* # of extent node in rb-tree*/
 };
 
 /*
@@ -434,8 +438,8 @@ struct f2fs_inode_info {
 	unsigned int clevel;		/* maximum level of given file name */
 	nid_t i_xattr_nid;		/* node id that contains xattrs */
 	unsigned long long xattr_ver;	/* cp version of xattr modification */
-	struct inode_entry *dirty_dir;	/* the pointer of dirty dir */
 
+	struct list_head dirty_list;	/* linked in global dirty list */
 	struct list_head inmem_pages;	/* inmemory pages managed by f2fs */
 	struct mutex inmem_lock;	/* lock for inmemory pages */
 
@@ -544,6 +548,7 @@ struct dnode_of_data {
 	nid_t nid;			/* node id of the direct node block */
 	unsigned int ofs_in_node;	/* data offset in the node page */
 	bool inode_page_locked;		/* inode page is locked or not */
+	bool node_changed;		/* is node block changed */
 	block_t	data_blkaddr;		/* block address of the node block */
 };
 
@@ -647,6 +652,7 @@ struct f2fs_sm_info {
 enum count_type {
 	F2FS_WRITEBACK,
 	F2FS_DIRTY_DENTS,
+	F2FS_DIRTY_DATA,
 	F2FS_DIRTY_NODES,
 	F2FS_DIRTY_META,
 	F2FS_INMEM_PAGES,
@@ -695,6 +701,12 @@ struct f2fs_bio_info {
 	struct rw_semaphore io_rwsem;	/* blocking op for bio */
 };
 
+enum inode_type {
+	DIR_INODE,			/* for dirty dir inode */
+	FILE_INODE,			/* for dirty regular/symlink inode */
+	NR_INODE_TYPE,
+};
+
 /* for inner inode cache management */
 struct inode_management {
 	struct radix_tree_root ino_root;	/* ino entry array */
@@ -711,11 +723,17 @@ enum {
 	SBI_POR_DOING,				/* recovery is doing or not */
 };
 
+enum {
+	CP_TIME,
+	REQ_TIME,
+	MAX_TIME,
+};
+
 struct f2fs_sb_info {
 	struct super_block *sb;			/* pointer to VFS super block */
 	struct proc_dir_entry *s_proc;		/* proc entry */
-	struct buffer_head *raw_super_buf;	/* buffer head of raw sb */
 	struct f2fs_super_block *raw_super;	/* raw super block pointer */
+	int valid_super_block;			/* valid super block no */
 	int s_flag;				/* flags for sbi */
 
 	/* for node-related operations */
@@ -737,23 +755,26 @@ struct f2fs_sb_info {
 	struct rw_semaphore node_write;		/* locking node writes */
 	struct mutex writepages;		/* mutex for writepages() */
 	wait_queue_head_t cp_wait;
-	long cp_expires, cp_interval;		/* next expected periodic cp */
+	unsigned long last_time[MAX_TIME];	/* to store time in jiffies */
+	long interval_time[MAX_TIME];		/* to store thresholds */
 
 	struct inode_management im[MAX_INO_ENTRY];      /* manage inode cache */
 
 	/* for orphan inode, use 0'th array */
 	unsigned int max_orphans;		/* max orphan inodes */
 
-	/* for directory inode management */
-	struct list_head dir_inode_list;	/* dir inode list */
-	spinlock_t dir_inode_lock;		/* for dir inode list lock */
+	/* for inode management */
+	struct list_head inode_list[NR_INODE_TYPE];	/* dirty inode list */
+	spinlock_t inode_lock[NR_INODE_TYPE];	/* for dirty inode list lock */
 
 	/* for extent tree cache */
 	struct radix_tree_root extent_tree_root;/* cache extent cache entries */
 	struct rw_semaphore extent_tree_lock;	/* locking extent radix tree */
 	struct list_head extent_list;		/* lru list for shrinker */
 	spinlock_t extent_lock;			/* locking extent lru list */
-	int total_ext_tree;			/* extent tree count */
+	atomic_t total_ext_tree;		/* extent tree count */
+	struct list_head zombie_list;		/* extent zombie tree list */
+	atomic_t total_zombie_tree;		/* extent zombie tree count */
 	atomic_t total_ext_node;		/* extent info count */
 
 	/* basic filesystem units */
@@ -771,6 +792,7 @@ struct f2fs_sb_info {
 	unsigned int total_node_count;		/* total node block count */
 	unsigned int total_valid_node_count;	/* valid node block count */
 	unsigned int total_valid_inode_count;	/* valid inode count */
+	loff_t max_file_blocks;			/* max block index of file */
 	int active_logs;			/* # of active logs */
 	int dir_level;				/* directory level */
 
@@ -809,7 +831,7 @@ struct f2fs_sb_info {
 	atomic_t inline_inode;			/* # of inline_data inodes */
 	atomic_t inline_dir;			/* # of inline_dentry inodes */
 	int bg_gc;				/* background gc calls */
-	unsigned int n_dirty_dirs;		/* # of dir inodes */
+	unsigned int ndirty_inode[NR_INODE_TYPE];	/* # of dirty inodes */
 #endif
 	unsigned int last_victim[2];		/* last victim segment # */
 	spinlock_t stat_lock;			/* lock for stat operations */
@@ -824,6 +846,31 @@ struct f2fs_sb_info {
 	unsigned int shrinker_run_no;
 };
 
+static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type)
+{
+	sbi->last_time[type] = jiffies;
+}
+
+static inline bool f2fs_time_over(struct f2fs_sb_info *sbi, int type)
+{
+	struct timespec ts = {sbi->interval_time[type], 0};
+	unsigned long interval = timespec_to_jiffies(&ts);
+
+	return time_after(jiffies, sbi->last_time[type] + interval);
+}
+
+static inline bool is_idle(struct f2fs_sb_info *sbi)
+{
+	struct block_device *bdev = sbi->sb->s_bdev;
+	struct request_queue *q = bdev_get_queue(bdev);
+	struct request_list *rl = &q->root_rl;
+
+	if (rl->count[BLK_RW_SYNC] || rl->count[BLK_RW_ASYNC])
+		return 0;
+
+	return f2fs_time_over(sbi, REQ_TIME);
+}
+
 /*
  * Inline functions
  */
@@ -1059,8 +1106,8 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type)
 static inline void inode_inc_dirty_pages(struct inode *inode)
 {
 	atomic_inc(&F2FS_I(inode)->dirty_pages);
-	if (S_ISDIR(inode->i_mode))
-		inc_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS);
+	inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ?
+				F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA);
 }
 
 static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type)
@@ -1075,9 +1122,8 @@ static inline void inode_dec_dirty_pages(struct inode *inode)
 		return;
 
 	atomic_dec(&F2FS_I(inode)->dirty_pages);
-
-	if (S_ISDIR(inode->i_mode))
-		dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS);
+	dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ?
+				F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA);
 }
 
 static inline int get_pages(struct f2fs_sb_info *sbi, int count_type)
@@ -1092,8 +1138,7 @@ static inline int get_dirty_pages(struct inode *inode)
 
 static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type)
 {
-	unsigned int pages_per_sec = sbi->segs_per_sec *
-					(1 << sbi->log_blocks_per_seg);
+	unsigned int pages_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg;
 	return ((get_pages(sbi, block_type) + pages_per_sec - 1)
 			>> sbi->log_blocks_per_seg) / sbi->segs_per_sec;
 }
@@ -1416,6 +1461,8 @@ enum {
 	FI_DROP_CACHE,		/* drop dirty page cache */
 	FI_DATA_EXIST,		/* indicate data exists */
 	FI_INLINE_DOTS,		/* indicate inline dot dentries */
+	FI_DO_DEFRAG,		/* indicate defragment is running */
+	FI_DIRTY_FILE,		/* indicate regular/symlink has dirty pages */
 };
 
 static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag)
@@ -1602,13 +1649,11 @@ static inline bool is_dot_dotdot(const struct qstr *str)
 
 static inline bool f2fs_may_extent_tree(struct inode *inode)
 {
-	mode_t mode = inode->i_mode;
-
 	if (!test_opt(F2FS_I_SB(inode), EXTENT_CACHE) ||
 			is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT))
 		return false;
 
-	return S_ISREG(mode);
+	return S_ISREG(inode->i_mode);
 }
 
 static inline void *f2fs_kvmalloc(size_t size, gfp_t flags)
@@ -1661,8 +1706,8 @@ long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long);
 void f2fs_set_inode_flags(struct inode *);
 struct inode *f2fs_iget(struct super_block *, unsigned long);
 int try_to_free_nats(struct f2fs_sb_info *, int);
-void update_inode(struct inode *, struct page *);
-void update_inode_page(struct inode *);
+int update_inode(struct inode *, struct page *);
+int update_inode_page(struct inode *);
 int f2fs_write_inode(struct inode *, struct writeback_control *);
 void f2fs_evict_inode(struct inode *);
 void handle_failed_inode(struct inode *);
@@ -1767,7 +1812,7 @@ void destroy_node_manager_caches(void);
  */
 void register_inmem_page(struct inode *, struct page *);
 int commit_inmem_pages(struct inode *, bool);
-void f2fs_balance_fs(struct f2fs_sb_info *);
+void f2fs_balance_fs(struct f2fs_sb_info *, bool);
 void f2fs_balance_fs_bg(struct f2fs_sb_info *);
 int f2fs_issue_flush(struct f2fs_sb_info *);
 int create_flush_cmd_control(struct f2fs_sb_info *);
@@ -1813,9 +1858,9 @@ bool is_valid_blkaddr(struct f2fs_sb_info *, block_t, int);
 int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int, bool);
 void ra_meta_pages_cond(struct f2fs_sb_info *, pgoff_t);
 long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long);
-void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
-void remove_dirty_inode(struct f2fs_sb_info *, nid_t, int type);
-void release_dirty_inode(struct f2fs_sb_info *);
+void add_ino_entry(struct f2fs_sb_info *, nid_t, int type);
+void remove_ino_entry(struct f2fs_sb_info *, nid_t, int type);
+void release_ino_entry(struct f2fs_sb_info *);
 bool exist_written_data(struct f2fs_sb_info *, nid_t, int);
 int acquire_orphan_inode(struct f2fs_sb_info *);
 void release_orphan_inode(struct f2fs_sb_info *);
@@ -1825,9 +1870,9 @@ int recover_orphan_inodes(struct f2fs_sb_info *);
 int get_valid_checkpoint(struct f2fs_sb_info *);
 void update_dirty_page(struct inode *, struct page *);
 void add_dirty_dir_inode(struct inode *);
-void remove_dirty_dir_inode(struct inode *);
-void sync_dirty_dir_inodes(struct f2fs_sb_info *);
-void write_checkpoint(struct f2fs_sb_info *, struct cp_control *);
+void remove_dirty_inode(struct inode *);
+int sync_dirty_inodes(struct f2fs_sb_info *, enum inode_type);
+int write_checkpoint(struct f2fs_sb_info *, struct cp_control *);
 void init_ino_entry_info(struct f2fs_sb_info *);
 int __init create_checkpoint_caches(void);
 void destroy_checkpoint_caches(void);
@@ -1847,6 +1892,7 @@ struct page *find_data_page(struct inode *, pgoff_t);
 struct page *get_lock_data_page(struct inode *, pgoff_t, bool);
 struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool);
 int do_write_data_page(struct f2fs_io_info *);
+int f2fs_map_blocks(struct inode *, struct f2fs_map_blocks *, int, int);
 int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64);
 void f2fs_invalidate_page(struct page *, unsigned int, unsigned int);
 int f2fs_release_page(struct page *, gfp_t);
@@ -1877,8 +1923,9 @@ struct f2fs_stat_info {
 	int main_area_segs, main_area_sections, main_area_zones;
 	unsigned long long hit_largest, hit_cached, hit_rbtree;
 	unsigned long long hit_total, total_ext;
-	int ext_tree, ext_node;
-	int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta;
+	int ext_tree, zombie_tree, ext_node;
+	int ndirty_node, ndirty_meta;
+	int ndirty_dent, ndirty_dirs, ndirty_data, ndirty_files;
 	int nats, dirty_nats, sits, dirty_sits, fnids;
 	int total_count, utilization;
 	int bg_gc, inmem_pages, wb_pages;
@@ -1888,7 +1935,7 @@ struct f2fs_stat_info {
 	int util_free, util_valid, util_invalid;
 	int rsvd_segs, overp_segs;
 	int dirty_count, node_pages, meta_pages;
-	int prefree_count, call_count, cp_count;
+	int prefree_count, call_count, cp_count, bg_cp_count;
 	int tot_segs, node_segs, data_segs, free_segs, free_secs;
 	int bg_node_segs, bg_data_segs;
 	int tot_blks, data_blks, node_blks;
@@ -1909,10 +1956,11 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
 }
 
 #define stat_inc_cp_count(si)		((si)->cp_count++)
+#define stat_inc_bg_cp_count(si)	((si)->bg_cp_count++)
 #define stat_inc_call_count(si)		((si)->call_count++)
 #define stat_inc_bggc_count(sbi)	((sbi)->bg_gc++)
-#define stat_inc_dirty_dir(sbi)		((sbi)->n_dirty_dirs++)
-#define stat_dec_dirty_dir(sbi)		((sbi)->n_dirty_dirs--)
+#define stat_inc_dirty_inode(sbi, type)	((sbi)->ndirty_inode[type]++)
+#define stat_dec_dirty_inode(sbi, type)	((sbi)->ndirty_inode[type]--)
 #define stat_inc_total_hit(sbi)		(atomic64_inc(&(sbi)->total_hit_ext))
 #define stat_inc_rbtree_node_hit(sbi)	(atomic64_inc(&(sbi)->read_hit_rbtree))
 #define stat_inc_largest_node_hit(sbi)	(atomic64_inc(&(sbi)->read_hit_largest))
@@ -1987,14 +2035,15 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi)
 
 int f2fs_build_stats(struct f2fs_sb_info *);
 void f2fs_destroy_stats(struct f2fs_sb_info *);
-void __init f2fs_create_root_stats(void);
+int __init f2fs_create_root_stats(void);
 void f2fs_destroy_root_stats(void);
 #else
 #define stat_inc_cp_count(si)
+#define stat_inc_bg_cp_count(si)
 #define stat_inc_call_count(si)
 #define stat_inc_bggc_count(si)
-#define stat_inc_dirty_dir(sbi)
-#define stat_dec_dirty_dir(sbi)
+#define stat_inc_dirty_inode(sbi, type)
+#define stat_dec_dirty_inode(sbi, type)
 #define stat_inc_total_hit(sb)
 #define stat_inc_rbtree_node_hit(sb)
 #define stat_inc_largest_node_hit(sbi)
@@ -2015,7 +2064,7 @@ void f2fs_destroy_root_stats(void);
 
 static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; }
 static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { }
-static inline void __init f2fs_create_root_stats(void) { }
+static inline int __init f2fs_create_root_stats(void) { return 0; }
 static inline void f2fs_destroy_root_stats(void) { }
 #endif
 
@@ -2069,8 +2118,7 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *);
  * extent_cache.c
  */
 unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *, int);
-void f2fs_drop_largest_extent(struct inode *, pgoff_t);
-void f2fs_init_extent_tree(struct inode *, struct f2fs_extent *);
+bool f2fs_init_extent_tree(struct inode *, struct f2fs_extent *);
 unsigned int f2fs_destroy_extent_node(struct inode *);
 void f2fs_destroy_extent_tree(struct inode *);
 bool f2fs_lookup_extent_cache(struct inode *, pgoff_t, struct extent_info *);
@@ -2121,7 +2169,7 @@ static inline int f2fs_sb_has_crypto(struct super_block *sb)
 static inline bool f2fs_may_encrypt(struct inode *inode)
 {
 #ifdef CONFIG_F2FS_FS_ENCRYPTION
-	mode_t mode = inode->i_mode;
+	umode_t mode = inode->i_mode;
 
 	return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode));
 #else
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index a197215ad..ea272be62 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -40,8 +40,6 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
 	struct dnode_of_data dn;
 	int err;
 
-	f2fs_balance_fs(sbi);
-
 	sb_start_pagefault(inode->i_sb);
 
 	f2fs_bug_on(sbi, f2fs_has_inline_data(inode));
@@ -57,6 +55,8 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma,
 	f2fs_put_dnode(&dn);
 	f2fs_unlock_op(sbi);
 
+	f2fs_balance_fs(sbi, dn.node_changed);
+
 	file_update_time(vma->vm_file);
 	lock_page(page);
 	if (unlikely(page->mapping != inode->i_mapping ||
@@ -96,6 +96,7 @@ mapped:
 	clear_cold_data(page);
 out:
 	sb_end_pagefault(inode->i_sb);
+	f2fs_update_time(sbi, REQ_TIME);
 	return block_page_mkwrite_return(err);
 }
 
@@ -201,7 +202,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 	trace_f2fs_sync_file_enter(inode);
 
 	/* if fdatasync is triggered, let's do in-place-update */
-	if (get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks)
+	if (datasync || get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks)
 		set_inode_flag(fi, FI_NEED_IPU);
 	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
 	clear_inode_flag(fi, FI_NEED_IPU);
@@ -233,9 +234,6 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 		goto out;
 	}
 go_write:
-	/* guarantee free sections for fsync */
-	f2fs_balance_fs(sbi);
-
 	/*
 	 * Both of fdatasync() and fsync() are able to be recovered from
 	 * sudden-power-off.
@@ -261,8 +259,10 @@ sync_nodes:
 	sync_node_pages(sbi, ino, &wbc);
 
 	/* if cp_error was enabled, we should avoid infinite loop */
-	if (unlikely(f2fs_cp_error(sbi)))
+	if (unlikely(f2fs_cp_error(sbi))) {
+		ret = -EIO;
 		goto out;
+	}
 
 	if (need_inode_block_update(sbi, ino)) {
 		mark_inode_dirty_sync(inode);
@@ -275,12 +275,13 @@ sync_nodes:
 		goto out;
 
 	/* once recovery info is written, don't need to tack this */
-	remove_dirty_inode(sbi, ino, APPEND_INO);
+	remove_ino_entry(sbi, ino, APPEND_INO);
 	clear_inode_flag(fi, FI_APPEND_WRITE);
 flush_out:
-	remove_dirty_inode(sbi, ino, UPDATE_INO);
+	remove_ino_entry(sbi, ino, UPDATE_INO);
 	clear_inode_flag(fi, FI_UPDATE_WRITE);
 	ret = f2fs_issue_flush(sbi);
+	f2fs_update_time(sbi, REQ_TIME);
 out:
 	trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
 	f2fs_trace_ios(NULL, 1);
@@ -332,7 +333,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
 	loff_t isize;
 	int err = 0;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	isize = i_size_read(inode);
 	if (offset >= isize)
@@ -387,10 +388,10 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
 found:
 	if (whence == SEEK_HOLE && data_ofs > isize)
 		data_ofs = isize;
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	return vfs_setpos(file, data_ofs, maxbytes);
 fail:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	return -ENXIO;
 }
 
@@ -418,19 +419,18 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence)
 static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
 	struct inode *inode = file_inode(file);
+	int err;
 
 	if (f2fs_encrypted_inode(inode)) {
-		int err = f2fs_get_encryption_info(inode);
+		err = f2fs_get_encryption_info(inode);
 		if (err)
 			return 0;
 	}
 
 	/* we don't need to use inline_data strictly */
-	if (f2fs_has_inline_data(inode)) {
-		int err = f2fs_convert_inline_inode(inode);
-		if (err)
-			return err;
-	}
+	err = f2fs_convert_inline_inode(inode);
+	if (err)
+		return err;
 
 	file_accessed(file);
 	vma->vm_ops = &f2fs_file_vm_ops;
@@ -483,11 +483,11 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count)
 						F2FS_I(dn->inode)) + ofs;
 		f2fs_update_extent_cache_range(dn, fofs, 0, len);
 		dec_valid_block_count(sbi, dn->inode, nr_free);
-		set_page_dirty(dn->node_page);
 		sync_inode_page(dn);
 	}
 	dn->ofs_in_node = ofs;
 
+	f2fs_update_time(sbi, REQ_TIME);
 	trace_f2fs_truncate_data_blocks_range(dn->inode, dn->nid,
 					 dn->ofs_in_node, nr_free);
 	return nr_free;
@@ -604,7 +604,7 @@ int f2fs_truncate(struct inode *inode, bool lock)
 	trace_f2fs_truncate(inode);
 
 	/* we should check inline_data size */
-	if (f2fs_has_inline_data(inode) && !f2fs_may_inline_data(inode)) {
+	if (!f2fs_may_inline_data(inode)) {
 		err = f2fs_convert_inline_inode(inode);
 		if (err)
 			return err;
@@ -679,13 +679,20 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
 			err = f2fs_truncate(inode, true);
 			if (err)
 				return err;
-			f2fs_balance_fs(F2FS_I_SB(inode));
+			f2fs_balance_fs(F2FS_I_SB(inode), true);
 		} else {
 			/*
 			 * do not trim all blocks after i_size if target size is
 			 * larger than i_size.
 			 */
 			truncate_setsize(inode, attr->ia_size);
+
+			/* should convert inline inode here */
+			if (!f2fs_may_inline_data(inode)) {
+				err = f2fs_convert_inline_inode(inode);
+				if (err)
+					return err;
+			}
 			inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		}
 	}
@@ -727,7 +734,7 @@ static int fill_zero(struct inode *inode, pgoff_t index,
 	if (!len)
 		return 0;
 
-	f2fs_balance_fs(sbi);
+	f2fs_balance_fs(sbi, true);
 
 	f2fs_lock_op(sbi);
 	page = get_new_data_page(inode, NULL, index, false);
@@ -778,13 +785,11 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
 {
 	pgoff_t pg_start, pg_end;
 	loff_t off_start, off_end;
-	int ret = 0;
+	int ret;
 
-	if (f2fs_has_inline_data(inode)) {
-		ret = f2fs_convert_inline_inode(inode);
-		if (ret)
-			return ret;
-	}
+	ret = f2fs_convert_inline_inode(inode);
+	if (ret)
+		return ret;
 
 	pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
 	pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
@@ -815,7 +820,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len)
 			loff_t blk_start, blk_end;
 			struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 
-			f2fs_balance_fs(sbi);
+			f2fs_balance_fs(sbi, true);
 
 			blk_start = (loff_t)pg_start << PAGE_CACHE_SHIFT;
 			blk_end = (loff_t)pg_end << PAGE_CACHE_SHIFT;
@@ -918,7 +923,7 @@ static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end)
 	int ret = 0;
 
 	for (; end < nrpages; start++, end++) {
-		f2fs_balance_fs(sbi);
+		f2fs_balance_fs(sbi, true);
 		f2fs_lock_op(sbi);
 		ret = __exchange_data_block(inode, end, start, true);
 		f2fs_unlock_op(sbi);
@@ -941,13 +946,9 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
 	if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1))
 		return -EINVAL;
 
-	f2fs_balance_fs(F2FS_I_SB(inode));
-
-	if (f2fs_has_inline_data(inode)) {
-		ret = f2fs_convert_inline_inode(inode);
-		if (ret)
-			return ret;
-	}
+	ret = f2fs_convert_inline_inode(inode);
+	if (ret)
+		return ret;
 
 	pg_start = offset >> PAGE_CACHE_SHIFT;
 	pg_end = (offset + len) >> PAGE_CACHE_SHIFT;
@@ -991,13 +992,9 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len,
 	if (ret)
 		return ret;
 
-	f2fs_balance_fs(sbi);
-
-	if (f2fs_has_inline_data(inode)) {
-		ret = f2fs_convert_inline_inode(inode);
-		if (ret)
-			return ret;
-	}
+	ret = f2fs_convert_inline_inode(inode);
+	if (ret)
+		return ret;
 
 	ret = filemap_write_and_wait_range(mapping, offset, offset + len - 1);
 	if (ret)
@@ -1104,13 +1101,11 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
 	if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1))
 		return -EINVAL;
 
-	f2fs_balance_fs(sbi);
+	ret = f2fs_convert_inline_inode(inode);
+	if (ret)
+		return ret;
 
-	if (f2fs_has_inline_data(inode)) {
-		ret = f2fs_convert_inline_inode(inode);
-		if (ret)
-			return ret;
-	}
+	f2fs_balance_fs(sbi, true);
 
 	ret = truncate_blocks(inode, i_size_read(inode), true);
 	if (ret)
@@ -1154,17 +1149,15 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
 	loff_t off_start, off_end;
 	int ret = 0;
 
-	f2fs_balance_fs(sbi);
-
 	ret = inode_newsize_ok(inode, (len + offset));
 	if (ret)
 		return ret;
 
-	if (f2fs_has_inline_data(inode)) {
-		ret = f2fs_convert_inline_inode(inode);
-		if (ret)
-			return ret;
-	}
+	ret = f2fs_convert_inline_inode(inode);
+	if (ret)
+		return ret;
+
+	f2fs_balance_fs(sbi, true);
 
 	pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT;
 	pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT;
@@ -1226,7 +1219,7 @@ static long f2fs_fallocate(struct file *file, int mode,
 			FALLOC_FL_INSERT_RANGE))
 		return -EOPNOTSUPP;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	if (mode & FALLOC_FL_PUNCH_HOLE) {
 		if (offset >= inode->i_size)
@@ -1246,10 +1239,11 @@ static long f2fs_fallocate(struct file *file, int mode,
 	if (!ret) {
 		inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 		mark_inode_dirty(inode);
+		f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
 	}
 
 out:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 
 	trace_f2fs_fallocate(inode, mode, offset, len, ret);
 	return ret;
@@ -1313,13 +1307,13 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
 
 	flags = f2fs_mask_flags(inode->i_mode, flags);
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	oldflags = fi->i_flags;
 
 	if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) {
 		if (!capable(CAP_LINUX_IMMUTABLE)) {
-			mutex_unlock(&inode->i_mutex);
+			inode_unlock(inode);
 			ret = -EPERM;
 			goto out;
 		}
@@ -1328,7 +1322,7 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg)
 	flags = flags & FS_FL_USER_MODIFIABLE;
 	flags |= oldflags & ~FS_FL_USER_MODIFIABLE;
 	fi->i_flags = flags;
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 
 	f2fs_set_inode_flags(inode);
 	inode->i_ctime = CURRENT_TIME;
@@ -1353,8 +1347,6 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
 	if (!inode_owner_or_capable(inode))
 		return -EACCES;
 
-	f2fs_balance_fs(F2FS_I_SB(inode));
-
 	if (f2fs_is_atomic_file(inode))
 		return 0;
 
@@ -1363,6 +1355,8 @@ static int f2fs_ioc_start_atomic_write(struct file *filp)
 		return ret;
 
 	set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
+	f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
+
 	return 0;
 }
 
@@ -1384,8 +1378,10 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp)
 	if (f2fs_is_atomic_file(inode)) {
 		clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
 		ret = commit_inmem_pages(inode, false);
-		if (ret)
+		if (ret) {
+			set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
 			goto err_out;
+		}
 	}
 
 	ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0);
@@ -1410,6 +1406,7 @@ static int f2fs_ioc_start_volatile_write(struct file *filp)
 		return ret;
 
 	set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
+	f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
 	return 0;
 }
 
@@ -1441,13 +1438,17 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp)
 	if (ret)
 		return ret;
 
-	f2fs_balance_fs(F2FS_I_SB(inode));
-
-	clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
-	clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
-	commit_inmem_pages(inode, true);
+	if (f2fs_is_atomic_file(inode)) {
+		clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE);
+		commit_inmem_pages(inode, true);
+	}
+	if (f2fs_is_volatile_file(inode)) {
+		clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE);
+		ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0);
+	}
 
 	mnt_drop_write_file(filp);
+	f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
 	return ret;
 }
 
@@ -1487,6 +1488,7 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg)
 	default:
 		return -EINVAL;
 	}
+	f2fs_update_time(sbi, REQ_TIME);
 	return 0;
 }
 
@@ -1517,6 +1519,7 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg)
 	if (copy_to_user((struct fstrim_range __user *)arg, &range,
 				sizeof(range)))
 		return -EFAULT;
+	f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
 	return 0;
 }
 
@@ -1540,6 +1543,7 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg)
 				sizeof(policy)))
 		return -EFAULT;
 
+	f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
 	return f2fs_process_policy(&policy, inode);
 #else
 	return -EOPNOTSUPP;
@@ -1586,13 +1590,13 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg)
 	generate_random_uuid(sbi->raw_super->encrypt_pw_salt);
 
 	err = f2fs_commit_super(sbi, false);
-
-	mnt_drop_write_file(filp);
 	if (err) {
 		/* undo new data */
 		memset(sbi->raw_super->encrypt_pw_salt, 0, 16);
+		mnt_drop_write_file(filp);
 		return err;
 	}
+	mnt_drop_write_file(filp);
 got_it:
 	if (copy_to_user((__u8 __user *)arg, sbi->raw_super->encrypt_pw_salt,
 									16))
@@ -1629,7 +1633,6 @@ static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg)
 {
 	struct inode *inode = file_inode(filp);
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-	struct cp_control cpc;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
@@ -1637,13 +1640,196 @@ static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg)
 	if (f2fs_readonly(sbi->sb))
 		return -EROFS;
 
-	cpc.reason = __get_cp_reason(sbi);
+	return f2fs_sync_fs(sbi->sb, 1);
+}
 
-	mutex_lock(&sbi->gc_mutex);
-	write_checkpoint(sbi, &cpc);
-	mutex_unlock(&sbi->gc_mutex);
+static int f2fs_defragment_range(struct f2fs_sb_info *sbi,
+					struct file *filp,
+					struct f2fs_defragment *range)
+{
+	struct inode *inode = file_inode(filp);
+	struct f2fs_map_blocks map;
+	struct extent_info ei;
+	pgoff_t pg_start, pg_end;
+	unsigned int blk_per_seg = sbi->blocks_per_seg;
+	unsigned int total = 0, sec_num;
+	unsigned int pages_per_sec = sbi->segs_per_sec * blk_per_seg;
+	block_t blk_end = 0;
+	bool fragmented = false;
+	int err;
 
-	return 0;
+	/* if in-place-update policy is enabled, don't waste time here */
+	if (need_inplace_update(inode))
+		return -EINVAL;
+
+	pg_start = range->start >> PAGE_CACHE_SHIFT;
+	pg_end = (range->start + range->len) >> PAGE_CACHE_SHIFT;
+
+	f2fs_balance_fs(sbi, true);
+
+	inode_lock(inode);
+
+	/* writeback all dirty pages in the range */
+	err = filemap_write_and_wait_range(inode->i_mapping, range->start,
+						range->start + range->len - 1);
+	if (err)
+		goto out;
+
+	/*
+	 * lookup mapping info in extent cache, skip defragmenting if physical
+	 * block addresses are continuous.
+	 */
+	if (f2fs_lookup_extent_cache(inode, pg_start, &ei)) {
+		if (ei.fofs + ei.len >= pg_end)
+			goto out;
+	}
+
+	map.m_lblk = pg_start;
+
+	/*
+	 * lookup mapping info in dnode page cache, skip defragmenting if all
+	 * physical block addresses are continuous even if there are hole(s)
+	 * in logical blocks.
+	 */
+	while (map.m_lblk < pg_end) {
+		map.m_len = pg_end - map.m_lblk;
+		err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ);
+		if (err)
+			goto out;
+
+		if (!(map.m_flags & F2FS_MAP_FLAGS)) {
+			map.m_lblk++;
+			continue;
+		}
+
+		if (blk_end && blk_end != map.m_pblk) {
+			fragmented = true;
+			break;
+		}
+		blk_end = map.m_pblk + map.m_len;
+
+		map.m_lblk += map.m_len;
+	}
+
+	if (!fragmented)
+		goto out;
+
+	map.m_lblk = pg_start;
+	map.m_len = pg_end - pg_start;
+
+	sec_num = (map.m_len + pages_per_sec - 1) / pages_per_sec;
+
+	/*
+	 * make sure there are enough free section for LFS allocation, this can
+	 * avoid defragment running in SSR mode when free section are allocated
+	 * intensively
+	 */
+	if (has_not_enough_free_secs(sbi, sec_num)) {
+		err = -EAGAIN;
+		goto out;
+	}
+
+	while (map.m_lblk < pg_end) {
+		pgoff_t idx;
+		int cnt = 0;
+
+do_map:
+		map.m_len = pg_end - map.m_lblk;
+		err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ);
+		if (err)
+			goto clear_out;
+
+		if (!(map.m_flags & F2FS_MAP_FLAGS)) {
+			map.m_lblk++;
+			continue;
+		}
+
+		set_inode_flag(F2FS_I(inode), FI_DO_DEFRAG);
+
+		idx = map.m_lblk;
+		while (idx < map.m_lblk + map.m_len && cnt < blk_per_seg) {
+			struct page *page;
+
+			page = get_lock_data_page(inode, idx, true);
+			if (IS_ERR(page)) {
+				err = PTR_ERR(page);
+				goto clear_out;
+			}
+
+			set_page_dirty(page);
+			f2fs_put_page(page, 1);
+
+			idx++;
+			cnt++;
+			total++;
+		}
+
+		map.m_lblk = idx;
+
+		if (idx < pg_end && cnt < blk_per_seg)
+			goto do_map;
+
+		clear_inode_flag(F2FS_I(inode), FI_DO_DEFRAG);
+
+		err = filemap_fdatawrite(inode->i_mapping);
+		if (err)
+			goto out;
+	}
+clear_out:
+	clear_inode_flag(F2FS_I(inode), FI_DO_DEFRAG);
+out:
+	inode_unlock(inode);
+	if (!err)
+		range->len = (u64)total << PAGE_CACHE_SHIFT;
+	return err;
+}
+
+static int f2fs_ioc_defragment(struct file *filp, unsigned long arg)
+{
+	struct inode *inode = file_inode(filp);
+	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+	struct f2fs_defragment range;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (!S_ISREG(inode->i_mode))
+		return -EINVAL;
+
+	err = mnt_want_write_file(filp);
+	if (err)
+		return err;
+
+	if (f2fs_readonly(sbi->sb)) {
+		err = -EROFS;
+		goto out;
+	}
+
+	if (copy_from_user(&range, (struct f2fs_defragment __user *)arg,
+							sizeof(range))) {
+		err = -EFAULT;
+		goto out;
+	}
+
+	/* verify alignment of offset & size */
+	if (range.start & (F2FS_BLKSIZE - 1) ||
+		range.len & (F2FS_BLKSIZE - 1)) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	err = f2fs_defragment_range(sbi, filp, &range);
+	f2fs_update_time(sbi, REQ_TIME);
+	if (err < 0)
+		goto out;
+
+	if (copy_to_user((struct f2fs_defragment __user *)arg, &range,
+							sizeof(range)))
+		err = -EFAULT;
+out:
+	mnt_drop_write_file(filp);
+	return err;
 }
 
 long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
@@ -1679,6 +1865,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		return f2fs_ioc_gc(filp, arg);
 	case F2FS_IOC_WRITE_CHECKPOINT:
 		return f2fs_ioc_write_checkpoint(filp, arg);
+	case F2FS_IOC_DEFRAGMENT:
+		return f2fs_ioc_defragment(filp, arg);
 	default:
 		return -ENOTTY;
 	}
@@ -1706,6 +1894,22 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case F2FS_IOC32_SETFLAGS:
 		cmd = F2FS_IOC_SETFLAGS;
 		break;
+	case F2FS_IOC32_GETVERSION:
+		cmd = F2FS_IOC_GETVERSION;
+		break;
+	case F2FS_IOC_START_ATOMIC_WRITE:
+	case F2FS_IOC_COMMIT_ATOMIC_WRITE:
+	case F2FS_IOC_START_VOLATILE_WRITE:
+	case F2FS_IOC_RELEASE_VOLATILE_WRITE:
+	case F2FS_IOC_ABORT_VOLATILE_WRITE:
+	case F2FS_IOC_SHUTDOWN:
+	case F2FS_IOC_SET_ENCRYPTION_POLICY:
+	case F2FS_IOC_GET_ENCRYPTION_PWSALT:
+	case F2FS_IOC_GET_ENCRYPTION_POLICY:
+	case F2FS_IOC_GARBAGE_COLLECT:
+	case F2FS_IOC_WRITE_CHECKPOINT:
+	case F2FS_IOC_DEFRAGMENT:
+		break;
 	default:
 		return -ENOIOCTLCMD;
 	}
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index fedbf67a0..f610c2a9b 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -16,7 +16,6 @@
 #include <linux/kthread.h>
 #include <linux/delay.h>
 #include <linux/freezer.h>
-#include <linux/blkdev.h>
 
 #include "f2fs.h"
 #include "node.h"
@@ -173,9 +172,9 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi,
 {
 	/* SSR allocates in a segment unit */
 	if (p->alloc_mode == SSR)
-		return 1 << sbi->log_blocks_per_seg;
+		return sbi->blocks_per_seg;
 	if (p->gc_mode == GC_GREEDY)
-		return (1 << sbi->log_blocks_per_seg) * p->ofs_unit;
+		return sbi->blocks_per_seg * p->ofs_unit;
 	else if (p->gc_mode == GC_CB)
 		return UINT_MAX;
 	else /* No other gc_mode */
@@ -832,8 +831,10 @@ gc_more:
 
 	if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE)))
 		goto stop;
-	if (unlikely(f2fs_cp_error(sbi)))
+	if (unlikely(f2fs_cp_error(sbi))) {
+		ret = -EIO;
 		goto stop;
+	}
 
 	if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed)) {
 		gc_type = FG_GC;
diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h
index b4a65be9f..a993967dc 100644
--- a/fs/f2fs/gc.h
+++ b/fs/f2fs/gc.h
@@ -100,11 +100,3 @@ static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi)
 		return true;
 	return false;
 }
-
-static inline int is_idle(struct f2fs_sb_info *sbi)
-{
-	struct block_device *bdev = sbi->sb->s_bdev;
-	struct request_queue *q = bdev_get_queue(bdev);
-	struct request_list *rl = &q->root_rl;
-	return !(rl->count[BLK_RW_SYNC]) && !(rl->count[BLK_RW_ASYNC]);
-}
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index bda712646..c3f0b7d4c 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -16,9 +16,6 @@
 
 bool f2fs_may_inline_data(struct inode *inode)
 {
-	if (!test_opt(F2FS_I_SB(inode), INLINE_DATA))
-		return false;
-
 	if (f2fs_is_atomic_file(inode))
 		return false;
 
@@ -177,6 +174,9 @@ int f2fs_convert_inline_inode(struct inode *inode)
 	struct page *ipage, *page;
 	int err = 0;
 
+	if (!f2fs_has_inline_data(inode))
+		return 0;
+
 	page = grab_cache_page(inode->i_mapping, 0);
 	if (!page)
 		return -ENOMEM;
@@ -199,6 +199,9 @@ out:
 	f2fs_unlock_op(sbi);
 
 	f2fs_put_page(page, 1);
+
+	f2fs_balance_fs(sbi, dn.node_changed);
+
 	return err;
 }
 
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 97e20deca..2adeff26b 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -138,7 +138,8 @@ static int do_read_inode(struct inode *inode)
 	fi->i_pino = le32_to_cpu(ri->i_pino);
 	fi->i_dir_level = ri->i_dir_level;
 
-	f2fs_init_extent_tree(inode, &ri->i_ext);
+	if (f2fs_init_extent_tree(inode, &ri->i_ext))
+		set_page_dirty(node_page);
 
 	get_inline_info(fi, ri);
 
@@ -202,6 +203,7 @@ make_now:
 			inode->i_op = &f2fs_encrypted_symlink_inode_operations;
 		else
 			inode->i_op = &f2fs_symlink_inode_operations;
+		inode_nohighmem(inode);
 		inode->i_mapping->a_ops = &f2fs_dblock_aops;
 	} else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
 			S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
@@ -221,7 +223,7 @@ bad_inode:
 	return ERR_PTR(ret);
 }
 
-void update_inode(struct inode *inode, struct page *node_page)
+int update_inode(struct inode *inode, struct page *node_page)
 {
 	struct f2fs_inode *ri;
 
@@ -259,15 +261,16 @@ void update_inode(struct inode *inode, struct page *node_page)
 
 	__set_inode_rdev(inode, ri);
 	set_cold_node(inode, node_page);
-	set_page_dirty(node_page);
-
 	clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE);
+
+	return set_page_dirty(node_page);
 }
 
-void update_inode_page(struct inode *inode)
+int update_inode_page(struct inode *inode)
 {
 	struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
 	struct page *node_page;
+	int ret = 0;
 retry:
 	node_page = get_node_page(sbi, inode->i_ino);
 	if (IS_ERR(node_page)) {
@@ -278,10 +281,11 @@ retry:
 		} else if (err != -ENOENT) {
 			f2fs_stop_checkpoint(sbi);
 		}
-		return;
+		return 0;
 	}
-	update_inode(inode, node_page);
+	ret = update_inode(inode, node_page);
 	f2fs_put_page(node_page, 1);
+	return ret;
 }
 
 int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
@@ -299,9 +303,8 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc)
 	 * We need to balance fs here to prevent from producing dirty node pages
 	 * during the urgent cleaning time when runing out of free sections.
 	 */
-	update_inode_page(inode);
-
-	f2fs_balance_fs(sbi);
+	if (update_inode_page(inode))
+		f2fs_balance_fs(sbi, true);
 	return 0;
 }
 
@@ -327,7 +330,7 @@ void f2fs_evict_inode(struct inode *inode)
 		goto out_clear;
 
 	f2fs_bug_on(sbi, get_dirty_pages(inode));
-	remove_dirty_dir_inode(inode);
+	remove_dirty_inode(inode);
 
 	f2fs_destroy_extent_tree(inode);
 
@@ -357,9 +360,9 @@ no_delete:
 	if (xnid)
 		invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid);
 	if (is_inode_flag_set(fi, FI_APPEND_WRITE))
-		add_dirty_inode(sbi, inode->i_ino, APPEND_INO);
+		add_ino_entry(sbi, inode->i_ino, APPEND_INO);
 	if (is_inode_flag_set(fi, FI_UPDATE_WRITE))
-		add_dirty_inode(sbi, inode->i_ino, UPDATE_INO);
+		add_ino_entry(sbi, inode->i_ino, UPDATE_INO);
 	if (is_inode_flag_set(fi, FI_FREE_NID)) {
 		if (err && err != -ENOENT)
 			alloc_nid_done(sbi, inode->i_ino);
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 2c32110f9..6f944e5eb 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -60,7 +60,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
 	if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode))
 		f2fs_set_encrypted_inode(inode);
 
-	if (f2fs_may_inline_data(inode))
+	if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode))
 		set_inode_flag(F2FS_I(inode), FI_INLINE_DATA);
 	if (f2fs_may_inline_dentry(inode))
 		set_inode_flag(F2FS_I(inode), FI_INLINE_DENTRY);
@@ -128,8 +128,6 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 	nid_t ino = 0;
 	int err;
 
-	f2fs_balance_fs(sbi);
-
 	inode = f2fs_new_inode(dir, mode);
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
@@ -142,6 +140,8 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 	inode->i_mapping->a_ops = &f2fs_dblock_aops;
 	ino = inode->i_ino;
 
+	f2fs_balance_fs(sbi, true);
+
 	f2fs_lock_op(sbi);
 	err = f2fs_add_link(dentry, inode);
 	if (err)
@@ -172,7 +172,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
 		!f2fs_is_child_context_consistent_with_parent(dir, inode))
 		return -EPERM;
 
-	f2fs_balance_fs(sbi);
+	f2fs_balance_fs(sbi, true);
 
 	inode->i_ctime = CURRENT_TIME;
 	ihold(inode);
@@ -214,6 +214,15 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino)
 	struct page *page;
 	int err = 0;
 
+	if (f2fs_readonly(sbi->sb)) {
+		f2fs_msg(sbi->sb, KERN_INFO,
+			"skip recovering inline_dots inode (ino:%lu, pino:%u) "
+			"in readonly mountpoint", dir->i_ino, pino);
+		return 0;
+	}
+
+	f2fs_balance_fs(sbi, true);
+
 	f2fs_lock_op(sbi);
 
 	de = f2fs_find_entry(dir, &dot, &page);
@@ -288,12 +297,13 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
 	int err = -ENOENT;
 
 	trace_f2fs_unlink_enter(dir, dentry);
-	f2fs_balance_fs(sbi);
 
 	de = f2fs_find_entry(dir, &dentry->d_name, &page);
 	if (!de)
 		goto fail;
 
+	f2fs_balance_fs(sbi, true);
+
 	f2fs_lock_op(sbi);
 	err = acquire_orphan_inode(sbi);
 	if (err) {
@@ -315,12 +325,15 @@ fail:
 	return err;
 }
 
-static const char *f2fs_follow_link(struct dentry *dentry, void **cookie)
+static const char *f2fs_get_link(struct dentry *dentry,
+				 struct inode *inode,
+				 struct delayed_call *done)
 {
-	const char *link = page_follow_link_light(dentry, cookie);
+	const char *link = page_get_link(dentry, inode, done);
 	if (!IS_ERR(link) && !*link) {
 		/* this is broken symlink case */
-		page_put_link(NULL, *cookie);
+		do_delayed_call(done);
+		clear_delayed_call(done);
 		link = ERR_PTR(-ENOENT);
 	}
 	return link;
@@ -341,8 +354,6 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
 	if (len > dir->i_sb->s_blocksize)
 		return -ENAMETOOLONG;
 
-	f2fs_balance_fs(sbi);
-
 	inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO);
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
@@ -351,8 +362,11 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
 		inode->i_op = &f2fs_encrypted_symlink_inode_operations;
 	else
 		inode->i_op = &f2fs_symlink_inode_operations;
+	inode_nohighmem(inode);
 	inode->i_mapping->a_ops = &f2fs_dblock_aops;
 
+	f2fs_balance_fs(sbi, true);
+
 	f2fs_lock_op(sbi);
 	err = f2fs_add_link(dentry, inode);
 	if (err)
@@ -433,8 +447,6 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	struct inode *inode;
 	int err;
 
-	f2fs_balance_fs(sbi);
-
 	inode = f2fs_new_inode(dir, S_IFDIR | mode);
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
@@ -444,6 +456,8 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	inode->i_mapping->a_ops = &f2fs_dblock_aops;
 	mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO);
 
+	f2fs_balance_fs(sbi, true);
+
 	set_inode_flag(F2FS_I(inode), FI_INC_LINK);
 	f2fs_lock_op(sbi);
 	err = f2fs_add_link(dentry, inode);
@@ -481,8 +495,6 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
 	struct inode *inode;
 	int err = 0;
 
-	f2fs_balance_fs(sbi);
-
 	inode = f2fs_new_inode(dir, mode);
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
@@ -490,6 +502,8 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
 	init_special_inode(inode, inode->i_mode, rdev);
 	inode->i_op = &f2fs_special_inode_operations;
 
+	f2fs_balance_fs(sbi, true);
+
 	f2fs_lock_op(sbi);
 	err = f2fs_add_link(dentry, inode);
 	if (err)
@@ -516,9 +530,6 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry,
 	struct inode *inode;
 	int err;
 
-	if (!whiteout)
-		f2fs_balance_fs(sbi);
-
 	inode = f2fs_new_inode(dir, mode);
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
@@ -532,6 +543,8 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry,
 		inode->i_mapping->a_ops = &f2fs_dblock_aops;
 	}
 
+	f2fs_balance_fs(sbi, true);
+
 	f2fs_lock_op(sbi);
 	err = acquire_orphan_inode(sbi);
 	if (err)
@@ -604,8 +617,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		goto out;
 	}
 
-	f2fs_balance_fs(sbi);
-
 	old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page);
 	if (!old_entry)
 		goto out;
@@ -635,6 +646,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		if (!new_entry)
 			goto out_whiteout;
 
+		f2fs_balance_fs(sbi, true);
+
 		f2fs_lock_op(sbi);
 
 		err = acquire_orphan_inode(sbi);
@@ -666,6 +679,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
 		update_inode_page(old_inode);
 		update_inode_page(new_inode);
 	} else {
+		f2fs_balance_fs(sbi, true);
+
 		f2fs_lock_op(sbi);
 
 		err = f2fs_add_link(new_dentry, old_inode);
@@ -763,8 +778,6 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
 								new_inode)))
 		return -EPERM;
 
-	f2fs_balance_fs(sbi);
-
 	old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page);
 	if (!old_entry)
 		goto out;
@@ -807,6 +820,8 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
 			goto out_new_dir;
 	}
 
+	f2fs_balance_fs(sbi, true);
+
 	f2fs_lock_op(sbi);
 
 	err = update_dent_inode(old_inode, new_inode, &new_dentry->d_name);
@@ -923,18 +938,22 @@ static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry,
 }
 
 #ifdef CONFIG_F2FS_FS_ENCRYPTION
-static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cookie)
+static const char *f2fs_encrypted_get_link(struct dentry *dentry,
+					   struct inode *inode,
+					   struct delayed_call *done)
 {
 	struct page *cpage = NULL;
 	char *caddr, *paddr = NULL;
-	struct f2fs_str cstr;
+	struct f2fs_str cstr = FSTR_INIT(NULL, 0);
 	struct f2fs_str pstr = FSTR_INIT(NULL, 0);
-	struct inode *inode = d_inode(dentry);
 	struct f2fs_encrypted_symlink_data *sd;
 	loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1);
 	u32 max_size = inode->i_sb->s_blocksize;
 	int res;
 
+	if (!dentry)
+		return ERR_PTR(-ECHILD);
+
 	res = f2fs_get_encryption_info(inode);
 	if (res)
 		return ERR_PTR(res);
@@ -942,12 +961,18 @@ static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cook
 	cpage = read_mapping_page(inode->i_mapping, 0, NULL);
 	if (IS_ERR(cpage))
 		return ERR_CAST(cpage);
-	caddr = kmap(cpage);
+	caddr = page_address(cpage);
 	caddr[size] = 0;
 
 	/* Symlink is encrypted */
 	sd = (struct f2fs_encrypted_symlink_data *)caddr;
 	cstr.len = le16_to_cpu(sd->len);
+
+	/* this is broken symlink case */
+	if (unlikely(cstr.len == 0)) {
+		res = -ENOENT;
+		goto errout;
+	}
 	cstr.name = kmalloc(cstr.len, GFP_NOFS);
 	if (!cstr.name) {
 		res = -ENOMEM;
@@ -956,7 +981,7 @@ static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cook
 	memcpy(cstr.name, sd->encrypted_path, cstr.len);
 
 	/* this is broken symlink case */
-	if (cstr.name[0] == 0 && cstr.len == 0) {
+	if (unlikely(cstr.name[0] == 0)) {
 		res = -ENOENT;
 		goto errout;
 	}
@@ -982,27 +1007,27 @@ static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cook
 	/* Null-terminate the name */
 	paddr[res] = '\0';
 
-	kunmap(cpage);
 	page_cache_release(cpage);
-	return *cookie = paddr;
+	set_delayed_call(done, kfree_link, paddr);
+	return paddr;
 errout:
 	kfree(cstr.name);
 	f2fs_fname_crypto_free_buffer(&pstr);
-	kunmap(cpage);
 	page_cache_release(cpage);
 	return ERR_PTR(res);
 }
 
 const struct inode_operations f2fs_encrypted_symlink_inode_operations = {
 	.readlink       = generic_readlink,
-	.follow_link    = f2fs_encrypted_follow_link,
-	.put_link       = kfree_put_link,
+	.get_link       = f2fs_encrypted_get_link,
 	.getattr	= f2fs_getattr,
 	.setattr	= f2fs_setattr,
+#ifdef CONFIG_F2FS_FS_XATTR
 	.setxattr	= generic_setxattr,
 	.getxattr	= generic_getxattr,
 	.listxattr	= f2fs_listxattr,
 	.removexattr	= generic_removexattr,
+#endif
 };
 #endif
 
@@ -1031,8 +1056,7 @@ const struct inode_operations f2fs_dir_inode_operations = {
 
 const struct inode_operations f2fs_symlink_inode_operations = {
 	.readlink       = generic_readlink,
-	.follow_link    = f2fs_follow_link,
-	.put_link       = page_put_link,
+	.get_link       = f2fs_get_link,
 	.getattr	= f2fs_getattr,
 	.setattr	= f2fs_setattr,
 #ifdef CONFIG_F2FS_FS_XATTR
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 7bcbc6e9c..342597a58 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -65,13 +65,14 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
 				sizeof(struct ino_entry)) >> PAGE_CACHE_SHIFT;
 		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
 	} else if (type == EXTENT_CACHE) {
-		mem_size = (sbi->total_ext_tree * sizeof(struct extent_tree) +
+		mem_size = (atomic_read(&sbi->total_ext_tree) *
+				sizeof(struct extent_tree) +
 				atomic_read(&sbi->total_ext_node) *
 				sizeof(struct extent_node)) >> PAGE_CACHE_SHIFT;
 		res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
 	} else {
-		if (sbi->sb->s_bdi->wb.dirty_exceeded)
-			return false;
+		if (!sbi->sb->s_bdi->wb.dirty_exceeded)
+			return true;
 	}
 	return res;
 }
@@ -261,13 +262,11 @@ static void cache_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid,
 {
 	struct nat_entry *e;
 
-	down_write(&nm_i->nat_tree_lock);
 	e = __lookup_nat_cache(nm_i, nid);
 	if (!e) {
 		e = grab_nat_entry(nm_i, nid);
 		node_info_from_raw_nat(&e->ni, ne);
 	}
-	up_write(&nm_i->nat_tree_lock);
 }
 
 static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
@@ -379,6 +378,8 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
 
 	memset(&ne, 0, sizeof(struct f2fs_nat_entry));
 
+	down_write(&nm_i->nat_tree_lock);
+
 	/* Check current segment summary */
 	mutex_lock(&curseg->curseg_mutex);
 	i = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 0);
@@ -399,6 +400,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
 cache:
 	/* cache nat entry */
 	cache_nat_entry(NM_I(sbi), nid, &ne);
+	up_write(&nm_i->nat_tree_lock);
 }
 
 /*
@@ -676,7 +678,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
 			ret = truncate_dnode(&rdn);
 			if (ret < 0)
 				goto out_err;
-			set_nid(page, i, 0, false);
+			if (set_nid(page, i, 0, false))
+				dn->node_changed = true;
 		}
 	} else {
 		child_nofs = nofs + ofs * (NIDS_PER_BLOCK + 1) + 1;
@@ -689,7 +692,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs,
 			rdn.nid = child_nid;
 			ret = truncate_nodes(&rdn, child_nofs, 0, depth - 1);
 			if (ret == (NIDS_PER_BLOCK + 1)) {
-				set_nid(page, i, 0, false);
+				if (set_nid(page, i, 0, false))
+					dn->node_changed = true;
 				child_nofs += ret;
 			} else if (ret < 0 && ret != -ENOENT) {
 				goto out_err;
@@ -750,7 +754,8 @@ static int truncate_partial_nodes(struct dnode_of_data *dn,
 		err = truncate_dnode(dn);
 		if (err < 0)
 			goto fail;
-		set_nid(pages[idx], i, 0, false);
+		if (set_nid(pages[idx], i, 0, false))
+			dn->node_changed = true;
 	}
 
 	if (offset[idx + 1] == 0) {
@@ -975,7 +980,8 @@ struct page *new_node_page(struct dnode_of_data *dn,
 	fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true);
 	set_cold_node(dn->inode, page);
 	SetPageUptodate(page);
-	set_page_dirty(page);
+	if (set_page_dirty(page))
+		dn->node_changed = true;
 
 	if (f2fs_has_xattr_block(ofs))
 		F2FS_I(dn->inode)->i_xattr_nid = dn->nid;
@@ -1035,6 +1041,10 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
 	struct page *apage;
 	int err;
 
+	if (!nid)
+		return;
+	f2fs_bug_on(sbi, check_nid_range(sbi, nid));
+
 	apage = find_get_page(NODE_MAPPING(sbi), nid);
 	if (apage && PageUptodate(apage)) {
 		f2fs_put_page(apage, 0);
@@ -1050,51 +1060,38 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid)
 	f2fs_put_page(apage, err ? 1 : 0);
 }
 
-struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
+/*
+ * readahead MAX_RA_NODE number of node pages.
+ */
+void ra_node_pages(struct page *parent, int start)
 {
-	struct page *page;
-	int err;
-repeat:
-	page = grab_cache_page(NODE_MAPPING(sbi), nid);
-	if (!page)
-		return ERR_PTR(-ENOMEM);
+	struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
+	struct blk_plug plug;
+	int i, end;
+	nid_t nid;
 
-	err = read_node_page(page, READ_SYNC);
-	if (err < 0) {
-		f2fs_put_page(page, 1);
-		return ERR_PTR(err);
-	} else if (err != LOCKED_PAGE) {
-		lock_page(page);
-	}
+	blk_start_plug(&plug);
 
-	if (unlikely(!PageUptodate(page) || nid != nid_of_node(page))) {
-		ClearPageUptodate(page);
-		f2fs_put_page(page, 1);
-		return ERR_PTR(-EIO);
-	}
-	if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
-		f2fs_put_page(page, 1);
-		goto repeat;
+	/* Then, try readahead for siblings of the desired node */
+	end = start + MAX_RA_NODE;
+	end = min(end, NIDS_PER_BLOCK);
+	for (i = start; i < end; i++) {
+		nid = get_nid(parent, i, false);
+		ra_node_page(sbi, nid);
 	}
-	return page;
+
+	blk_finish_plug(&plug);
 }
 
-/*
- * Return a locked page for the desired node page.
- * And, readahead MAX_RA_NODE number of node pages.
- */
-struct page *get_node_page_ra(struct page *parent, int start)
+struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid,
+					struct page *parent, int start)
 {
-	struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
-	struct blk_plug plug;
 	struct page *page;
-	int err, i, end;
-	nid_t nid;
+	int err;
 
-	/* First, try getting the desired direct node. */
-	nid = get_nid(parent, start, false);
 	if (!nid)
 		return ERR_PTR(-ENOENT);
+	f2fs_bug_on(sbi, check_nid_range(sbi, nid));
 repeat:
 	page = grab_cache_page(NODE_MAPPING(sbi), nid);
 	if (!page)
@@ -1108,46 +1105,53 @@ repeat:
 		goto page_hit;
 	}
 
-	blk_start_plug(&plug);
-
-	/* Then, try readahead for siblings of the desired node */
-	end = start + MAX_RA_NODE;
-	end = min(end, NIDS_PER_BLOCK);
-	for (i = start + 1; i < end; i++) {
-		nid = get_nid(parent, i, false);
-		if (!nid)
-			continue;
-		ra_node_page(sbi, nid);
-	}
-
-	blk_finish_plug(&plug);
+	if (parent)
+		ra_node_pages(parent, start + 1);
 
 	lock_page(page);
+
+	if (unlikely(!PageUptodate(page))) {
+		f2fs_put_page(page, 1);
+		return ERR_PTR(-EIO);
+	}
 	if (unlikely(page->mapping != NODE_MAPPING(sbi))) {
 		f2fs_put_page(page, 1);
 		goto repeat;
 	}
 page_hit:
-	if (unlikely(!PageUptodate(page))) {
-		f2fs_put_page(page, 1);
-		return ERR_PTR(-EIO);
-	}
+	f2fs_bug_on(sbi, nid != nid_of_node(page));
 	return page;
 }
 
+struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid)
+{
+	return __get_node_page(sbi, nid, NULL, 0);
+}
+
+struct page *get_node_page_ra(struct page *parent, int start)
+{
+	struct f2fs_sb_info *sbi = F2FS_P_SB(parent);
+	nid_t nid = get_nid(parent, start, false);
+
+	return __get_node_page(sbi, nid, parent, start);
+}
+
 void sync_inode_page(struct dnode_of_data *dn)
 {
+	int ret = 0;
+
 	if (IS_INODE(dn->node_page) || dn->inode_page == dn->node_page) {
-		update_inode(dn->inode, dn->node_page);
+		ret = update_inode(dn->inode, dn->node_page);
 	} else if (dn->inode_page) {
 		if (!dn->inode_page_locked)
 			lock_page(dn->inode_page);
-		update_inode(dn->inode, dn->inode_page);
+		ret = update_inode(dn->inode, dn->inode_page);
 		if (!dn->inode_page_locked)
 			unlock_page(dn->inode_page);
 	} else {
-		update_inode_page(dn->inode);
+		ret = update_inode_page(dn->inode);
 	}
+	dn->node_changed = ret ? true: false;
 }
 
 int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino,
@@ -1175,6 +1179,11 @@ next_step:
 		for (i = 0; i < nr_pages; i++) {
 			struct page *page = pvec.pages[i];
 
+			if (unlikely(f2fs_cp_error(sbi))) {
+				pagevec_release(&pvec);
+				return -EIO;
+			}
+
 			/*
 			 * flushing sequence with step:
 			 * 0. indirect nodes
@@ -1349,7 +1358,7 @@ static int f2fs_write_node_page(struct page *page,
 	up_read(&sbi->node_write);
 	unlock_page(page);
 
-	if (wbc->for_reclaim)
+	if (wbc->for_reclaim || unlikely(f2fs_cp_error(sbi)))
 		f2fs_submit_merged_bio(sbi, NODE, WRITE);
 
 	return 0;
@@ -1440,13 +1449,10 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
 
 	if (build) {
 		/* do not add allocated nids */
-		down_read(&nm_i->nat_tree_lock);
 		ne = __lookup_nat_cache(nm_i, nid);
-		if (ne &&
-			(!get_nat_flag(ne, IS_CHECKPOINTED) ||
+		if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) ||
 				nat_get_blkaddr(ne) != NULL_ADDR))
 			allocated = true;
-		up_read(&nm_i->nat_tree_lock);
 		if (allocated)
 			return 0;
 	}
@@ -1532,6 +1538,8 @@ static void build_free_nids(struct f2fs_sb_info *sbi)
 	ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES,
 							META_NAT, true);
 
+	down_read(&nm_i->nat_tree_lock);
+
 	while (1) {
 		struct page *page = get_current_nat_page(sbi, nid);
 
@@ -1560,6 +1568,7 @@ static void build_free_nids(struct f2fs_sb_info *sbi)
 			remove_free_nid(nm_i, nid);
 	}
 	mutex_unlock(&curseg->curseg_mutex);
+	up_read(&nm_i->nat_tree_lock);
 
 	ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid),
 					nm_i->ra_nid_pages, META_NAT, false);
@@ -1582,8 +1591,6 @@ retry:
 
 	/* We should not use stale free nids created by build_free_nids */
 	if (nm_i->fcnt && !on_build_free_nids(nm_i)) {
-		struct node_info ni;
-
 		f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list));
 		list_for_each_entry(i, &nm_i->free_nid_list, list)
 			if (i->state == NID_NEW)
@@ -1594,13 +1601,6 @@ retry:
 		i->state = NID_ALLOC;
 		nm_i->fcnt--;
 		spin_unlock(&nm_i->free_nid_list_lock);
-
-		/* check nid is allocated already */
-		get_node_info(sbi, *nid, &ni);
-		if (ni.blk_addr != NULL_ADDR) {
-			alloc_nid_done(sbi, *nid);
-			goto retry;
-		}
 		return true;
 	}
 	spin_unlock(&nm_i->free_nid_list_lock);
@@ -1842,14 +1842,12 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
 
 		raw_ne = nat_in_journal(sum, i);
 
-		down_write(&nm_i->nat_tree_lock);
 		ne = __lookup_nat_cache(nm_i, nid);
 		if (!ne) {
 			ne = grab_nat_entry(nm_i, nid);
 			node_info_from_raw_nat(&ne->ni, &raw_ne);
 		}
 		__set_nat_cache_dirty(nm_i, ne);
-		up_write(&nm_i->nat_tree_lock);
 	}
 	update_nats_in_cursum(sum, -i);
 	mutex_unlock(&curseg->curseg_mutex);
@@ -1883,7 +1881,6 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
 	struct f2fs_nat_block *nat_blk;
 	struct nat_entry *ne, *cur;
 	struct page *page = NULL;
-	struct f2fs_nm_info *nm_i = NM_I(sbi);
 
 	/*
 	 * there are two steps to flush nat entries:
@@ -1920,12 +1917,8 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
 			raw_ne = &nat_blk->entries[nid - start_nid];
 		}
 		raw_nat_from_node_info(raw_ne, &ne->ni);
-
-		down_write(&NM_I(sbi)->nat_tree_lock);
 		nat_reset_flag(ne);
 		__clear_nat_cache_dirty(NM_I(sbi), ne);
-		up_write(&NM_I(sbi)->nat_tree_lock);
-
 		if (nat_get_blkaddr(ne) == NULL_ADDR)
 			add_free_nid(sbi, nid, false);
 	}
@@ -1937,9 +1930,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi,
 
 	f2fs_bug_on(sbi, set->entry_cnt);
 
-	down_write(&nm_i->nat_tree_lock);
 	radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set);
-	up_write(&nm_i->nat_tree_lock);
 	kmem_cache_free(nat_entry_set_slab, set);
 }
 
@@ -1959,6 +1950,9 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
 
 	if (!nm_i->dirty_nat_cnt)
 		return;
+
+	down_write(&nm_i->nat_tree_lock);
+
 	/*
 	 * if there are no enough space in journal to store dirty nat
 	 * entries, remove all entries from journal and merge them
@@ -1967,7 +1961,6 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
 	if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL))
 		remove_nats_in_journal(sbi);
 
-	down_write(&nm_i->nat_tree_lock);
 	while ((found = __gang_lookup_nat_set(nm_i,
 					set_idx, SETVEC_SIZE, setvec))) {
 		unsigned idx;
@@ -1976,12 +1969,13 @@ void flush_nat_entries(struct f2fs_sb_info *sbi)
 			__adjust_nat_entry_set(setvec[idx], &sets,
 							MAX_NAT_JENTRIES(sum));
 	}
-	up_write(&nm_i->nat_tree_lock);
 
 	/* flush dirty nats in nat entry set */
 	list_for_each_entry_safe(set, tmp, &sets, set_list)
 		__flush_nat_entry_set(sbi, set);
 
+	up_write(&nm_i->nat_tree_lock);
+
 	f2fs_bug_on(sbi, nm_i->dirty_nat_cnt);
 }
 
diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h
index e4fffd2d9..d4d1f636f 100644
--- a/fs/f2fs/node.h
+++ b/fs/f2fs/node.h
@@ -183,7 +183,7 @@ static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start)
 
 	block_addr = (pgoff_t)(nm_i->nat_blkaddr +
 		(seg_off << sbi->log_blocks_per_seg << 1) +
-		(block_off & ((1 << sbi->log_blocks_per_seg) - 1)));
+		(block_off & (sbi->blocks_per_seg - 1)));
 
 	if (f2fs_test_bit(block_off, nm_i->nat_bitmap))
 		block_addr += sbi->blocks_per_seg;
@@ -317,7 +317,7 @@ static inline bool IS_DNODE(struct page *node_page)
 	return true;
 }
 
-static inline void set_nid(struct page *p, int off, nid_t nid, bool i)
+static inline int set_nid(struct page *p, int off, nid_t nid, bool i)
 {
 	struct f2fs_node *rn = F2FS_NODE(p);
 
@@ -327,7 +327,7 @@ static inline void set_nid(struct page *p, int off, nid_t nid, bool i)
 		rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid);
 	else
 		rn->in.nid[off] = cpu_to_le32(nid);
-	set_page_dirty(p);
+	return set_page_dirty(p);
 }
 
 static inline nid_t get_nid(struct page *p, int off, bool i)
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index cbf74f47c..589b20b86 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -168,6 +168,32 @@ static void recover_inode(struct inode *inode, struct page *page)
 			ino_of_node(page), name);
 }
 
+static bool is_same_inode(struct inode *inode, struct page *ipage)
+{
+	struct f2fs_inode *ri = F2FS_INODE(ipage);
+	struct timespec disk;
+
+	if (!IS_INODE(ipage))
+		return true;
+
+	disk.tv_sec = le64_to_cpu(ri->i_ctime);
+	disk.tv_nsec = le32_to_cpu(ri->i_ctime_nsec);
+	if (timespec_compare(&inode->i_ctime, &disk) > 0)
+		return false;
+
+	disk.tv_sec = le64_to_cpu(ri->i_atime);
+	disk.tv_nsec = le32_to_cpu(ri->i_atime_nsec);
+	if (timespec_compare(&inode->i_atime, &disk) > 0)
+		return false;
+
+	disk.tv_sec = le64_to_cpu(ri->i_mtime);
+	disk.tv_nsec = le32_to_cpu(ri->i_mtime_nsec);
+	if (timespec_compare(&inode->i_mtime, &disk) > 0)
+		return false;
+
+	return true;
+}
+
 static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
 {
 	unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
@@ -197,7 +223,10 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head)
 			goto next;
 
 		entry = get_fsync_inode(head, ino_of_node(page));
-		if (!entry) {
+		if (entry) {
+			if (!is_same_inode(entry->inode, page))
+				goto next;
+		} else {
 			if (IS_INODE(page) && is_dent_dnode(page)) {
 				err = recover_inode_page(sbi, page);
 				if (err)
@@ -459,8 +488,7 @@ out:
 	return err;
 }
 
-static int recover_data(struct f2fs_sb_info *sbi,
-				struct list_head *head, int type)
+static int recover_data(struct f2fs_sb_info *sbi, struct list_head *head)
 {
 	unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi));
 	struct curseg_info *curseg;
@@ -469,7 +497,7 @@ static int recover_data(struct f2fs_sb_info *sbi,
 	block_t blkaddr;
 
 	/* get node pages in the current segment */
-	curseg = CURSEG_I(sbi, type);
+	curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
 	blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
 
 	while (1) {
@@ -556,7 +584,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi)
 	need_writecp = true;
 
 	/* step #2: recover data */
-	err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE);
+	err = recover_data(sbi, &inode_list);
 	if (!err)
 		f2fs_bug_on(sbi, !list_empty(&inode_list));
 out:
@@ -595,7 +623,7 @@ out:
 			.reason = CP_RECOVERY,
 		};
 		mutex_unlock(&sbi->cp_mutex);
-		write_checkpoint(sbi, &cpc);
+		err = write_checkpoint(sbi, &cpc);
 	} else {
 		mutex_unlock(&sbi->cp_mutex);
 	}
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index f77b32584..5904a411c 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -86,6 +86,7 @@ static inline unsigned long __reverse_ffs(unsigned long word)
 /*
  * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because
  * f2fs_set_bit makes MSB and LSB reversed in a byte.
+ * @size must be an integral multiple of BITS_PER_LONG.
  * Example:
  *                             MSB <--> LSB
  *   f2fs_set_bit(0, bitmap) => 1000 0000
@@ -95,94 +96,73 @@ static unsigned long __find_rev_next_bit(const unsigned long *addr,
 			unsigned long size, unsigned long offset)
 {
 	const unsigned long *p = addr + BIT_WORD(offset);
-	unsigned long result = offset & ~(BITS_PER_LONG - 1);
+	unsigned long result = size;
 	unsigned long tmp;
 
 	if (offset >= size)
 		return size;
 
-	size -= result;
+	size -= (offset & ~(BITS_PER_LONG - 1));
 	offset %= BITS_PER_LONG;
-	if (!offset)
-		goto aligned;
-
-	tmp = __reverse_ulong((unsigned char *)p);
-	tmp &= ~0UL >> offset;
-
-	if (size < BITS_PER_LONG)
-		goto found_first;
-	if (tmp)
-		goto found_middle;
-
-	size -= BITS_PER_LONG;
-	result += BITS_PER_LONG;
-	p++;
-aligned:
-	while (size & ~(BITS_PER_LONG-1)) {
+
+	while (1) {
+		if (*p == 0)
+			goto pass;
+
 		tmp = __reverse_ulong((unsigned char *)p);
+
+		tmp &= ~0UL >> offset;
+		if (size < BITS_PER_LONG)
+			tmp &= (~0UL << (BITS_PER_LONG - size));
 		if (tmp)
-			goto found_middle;
-		result += BITS_PER_LONG;
+			goto found;
+pass:
+		if (size <= BITS_PER_LONG)
+			break;
 		size -= BITS_PER_LONG;
+		offset = 0;
 		p++;
 	}
-	if (!size)
-		return result;
-
-	tmp = __reverse_ulong((unsigned char *)p);
-found_first:
-	tmp &= (~0UL << (BITS_PER_LONG - size));
-	if (!tmp)		/* Are any bits set? */
-		return result + size;   /* Nope. */
-found_middle:
-	return result + __reverse_ffs(tmp);
+	return result;
+found:
+	return result - size + __reverse_ffs(tmp);
 }
 
 static unsigned long __find_rev_next_zero_bit(const unsigned long *addr,
 			unsigned long size, unsigned long offset)
 {
 	const unsigned long *p = addr + BIT_WORD(offset);
-	unsigned long result = offset & ~(BITS_PER_LONG - 1);
+	unsigned long result = size;
 	unsigned long tmp;
 
 	if (offset >= size)
 		return size;
 
-	size -= result;
+	size -= (offset & ~(BITS_PER_LONG - 1));
 	offset %= BITS_PER_LONG;
-	if (!offset)
-		goto aligned;
-
-	tmp = __reverse_ulong((unsigned char *)p);
-	tmp |= ~((~0UL << offset) >> offset);
-
-	if (size < BITS_PER_LONG)
-		goto found_first;
-	if (tmp != ~0UL)
-		goto found_middle;
-
-	size -= BITS_PER_LONG;
-	result += BITS_PER_LONG;
-	p++;
-aligned:
-	while (size & ~(BITS_PER_LONG - 1)) {
+
+	while (1) {
+		if (*p == ~0UL)
+			goto pass;
+
 		tmp = __reverse_ulong((unsigned char *)p);
+
+		if (offset)
+			tmp |= ~0UL << (BITS_PER_LONG - offset);
+		if (size < BITS_PER_LONG)
+			tmp |= ~0UL >> size;
 		if (tmp != ~0UL)
-			goto found_middle;
-		result += BITS_PER_LONG;
+			goto found;
+pass:
+		if (size <= BITS_PER_LONG)
+			break;
 		size -= BITS_PER_LONG;
+		offset = 0;
 		p++;
 	}
-	if (!size)
-		return result;
-
-	tmp = __reverse_ulong((unsigned char *)p);
-found_first:
-	tmp |= ~(~0UL << (BITS_PER_LONG - size));
-	if (tmp == ~0UL)	/* Are any bits zero? */
-		return result + size;   /* Nope. */
-found_middle:
-	return result + __reverse_ffz(tmp);
+	return result;
+found:
+	return result - size + __reverse_ffz(tmp);
 }
 
 void register_inmem_page(struct inode *inode, struct page *page)
@@ -233,7 +213,7 @@ int commit_inmem_pages(struct inode *inode, bool abort)
 	 * inode becomes free by iget_locked in f2fs_iget.
 	 */
 	if (!abort) {
-		f2fs_balance_fs(sbi);
+		f2fs_balance_fs(sbi, true);
 		f2fs_lock_op(sbi);
 	}
 
@@ -257,6 +237,7 @@ int commit_inmem_pages(struct inode *inode, bool abort)
 				submit_bio = true;
 			}
 		} else {
+			ClearPageUptodate(cur->page);
 			trace_f2fs_commit_inmem_page(cur->page, INMEM_DROP);
 		}
 		set_page_private(cur->page, 0);
@@ -281,8 +262,10 @@ int commit_inmem_pages(struct inode *inode, bool abort)
  * This function balances dirty node and dentry pages.
  * In addition, it controls garbage collection.
  */
-void f2fs_balance_fs(struct f2fs_sb_info *sbi)
+void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
 {
+	if (!need)
+		return;
 	/*
 	 * We should do GC or end up with checkpoint, if there are so many dirty
 	 * dir/node pages without enough free segments.
@@ -310,8 +293,12 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
 	if (!available_free_memory(sbi, NAT_ENTRIES) ||
 			excess_prefree_segs(sbi) ||
 			!available_free_memory(sbi, INO_ENTRIES) ||
-			jiffies > sbi->cp_expires)
+			(is_idle(sbi) && f2fs_time_over(sbi, CP_TIME))) {
+		if (test_opt(sbi, DATA_FLUSH))
+			sync_dirty_inodes(sbi, FILE_INODE);
 		f2fs_sync_fs(sbi->sb, true);
+		stat_inc_bg_cp_count(sbi->stat_info);
+	}
 }
 
 static int issue_flush_thread(void *data)
@@ -1134,6 +1121,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
 	__u64 end = start + F2FS_BYTES_TO_BLK(range->len) - 1;
 	unsigned int start_segno, end_segno;
 	struct cp_control cpc;
+	int err = 0;
 
 	if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize)
 		return -EINVAL;
@@ -1164,12 +1152,12 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
 				sbi->segs_per_sec) - 1, end_segno);
 
 		mutex_lock(&sbi->gc_mutex);
-		write_checkpoint(sbi, &cpc);
+		err = write_checkpoint(sbi, &cpc);
 		mutex_unlock(&sbi->gc_mutex);
 	}
 out:
 	range->len = F2FS_BLK_TO_BYTES(cpc.trimmed);
-	return 0;
+	return err;
 }
 
 static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type)
@@ -1749,13 +1737,13 @@ int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type,
 			if (le32_to_cpu(nid_in_journal(sum, i)) == val)
 				return i;
 		}
-		if (alloc && nats_in_cursum(sum) < NAT_JOURNAL_ENTRIES)
+		if (alloc && __has_cursum_space(sum, 1, NAT_JOURNAL))
 			return update_nats_in_cursum(sum, 1);
 	} else if (type == SIT_JOURNAL) {
 		for (i = 0; i < sits_in_cursum(sum); i++)
 			if (le32_to_cpu(segno_in_journal(sum, i)) == val)
 				return i;
-		if (alloc && sits_in_cursum(sum) < SIT_JOURNAL_ENTRIES)
+		if (alloc && __has_cursum_space(sum, 1, SIT_JOURNAL))
 			return update_sits_in_cursum(sum, 1);
 	}
 	return -1;
diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c
index da0d8e0b5..93606f281 100644
--- a/fs/f2fs/shrinker.c
+++ b/fs/f2fs/shrinker.c
@@ -32,7 +32,8 @@ static unsigned long __count_free_nids(struct f2fs_sb_info *sbi)
 
 static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi)
 {
-	return sbi->total_ext_tree + atomic_read(&sbi->total_ext_node);
+	return atomic_read(&sbi->total_zombie_tree) +
+				atomic_read(&sbi->total_ext_node);
 }
 
 unsigned long f2fs_shrink_count(struct shrinker *shrink,
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index 3a65e0132..6134832ba 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -67,6 +67,7 @@ enum {
 	Opt_extent_cache,
 	Opt_noextent_cache,
 	Opt_noinline_data,
+	Opt_data_flush,
 	Opt_err,
 };
 
@@ -91,6 +92,7 @@ static match_table_t f2fs_tokens = {
 	{Opt_extent_cache, "extent_cache"},
 	{Opt_noextent_cache, "noextent_cache"},
 	{Opt_noinline_data, "noinline_data"},
+	{Opt_data_flush, "data_flush"},
 	{Opt_err, NULL},
 };
 
@@ -216,7 +218,8 @@ F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh);
 F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages);
 F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search);
 F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level);
-F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, cp_interval);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]);
+F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]);
 
 #define ATTR_LIST(name) (&f2fs_attr_##name.attr)
 static struct attribute *f2fs_attrs[] = {
@@ -235,6 +238,7 @@ static struct attribute *f2fs_attrs[] = {
 	ATTR_LIST(ram_thresh),
 	ATTR_LIST(ra_nid_pages),
 	ATTR_LIST(cp_interval),
+	ATTR_LIST(idle_interval),
 	NULL,
 };
 
@@ -406,6 +410,9 @@ static int parse_options(struct super_block *sb, char *options)
 		case Opt_noinline_data:
 			clear_opt(sbi, INLINE_DATA);
 			break;
+		case Opt_data_flush:
+			set_opt(sbi, DATA_FLUSH);
+			break;
 		default:
 			f2fs_msg(sb, KERN_ERR,
 				"Unrecognized mount option \"%s\" or missing value",
@@ -432,6 +439,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
 	fi->i_current_depth = 1;
 	fi->i_advise = 0;
 	init_rwsem(&fi->i_sem);
+	INIT_LIST_HEAD(&fi->dirty_list);
 	INIT_LIST_HEAD(&fi->inmem_pages);
 	mutex_init(&fi->inmem_lock);
 
@@ -548,7 +556,7 @@ static void f2fs_put_super(struct super_block *sb)
 	 * normally superblock is clean, so we need to release this.
 	 * In addition, EIO will skip do checkpoint, we need this as well.
 	 */
-	release_dirty_inode(sbi);
+	release_ino_entry(sbi);
 	release_discard_addrs(sbi);
 
 	f2fs_leave_shrinker(sbi);
@@ -566,13 +574,14 @@ static void f2fs_put_super(struct super_block *sb)
 	wait_for_completion(&sbi->s_kobj_unregister);
 
 	sb->s_fs_info = NULL;
-	brelse(sbi->raw_super_buf);
+	kfree(sbi->raw_super);
 	kfree(sbi);
 }
 
 int f2fs_sync_fs(struct super_block *sb, int sync)
 {
 	struct f2fs_sb_info *sbi = F2FS_SB(sb);
+	int err = 0;
 
 	trace_f2fs_sync_fs(sb, sync);
 
@@ -582,14 +591,12 @@ int f2fs_sync_fs(struct super_block *sb, int sync)
 		cpc.reason = __get_cp_reason(sbi);
 
 		mutex_lock(&sbi->gc_mutex);
-		write_checkpoint(sbi, &cpc);
+		err = write_checkpoint(sbi, &cpc);
 		mutex_unlock(&sbi->gc_mutex);
-	} else {
-		f2fs_balance_fs(sbi);
 	}
 	f2fs_trace_ios(NULL, 1);
 
-	return 0;
+	return err;
 }
 
 static int f2fs_freeze(struct super_block *sb)
@@ -686,6 +693,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
 		seq_puts(seq, ",extent_cache");
 	else
 		seq_puts(seq, ",noextent_cache");
+	if (test_opt(sbi, DATA_FLUSH))
+		seq_puts(seq, ",data_flush");
 	seq_printf(seq, ",active_logs=%u", sbi->active_logs);
 
 	return 0;
@@ -898,7 +907,7 @@ static const struct export_operations f2fs_export_ops = {
 	.get_parent = f2fs_get_parent,
 };
 
-static loff_t max_file_size(unsigned bits)
+static loff_t max_file_blocks(void)
 {
 	loff_t result = (DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS);
 	loff_t leaf_count = ADDRS_PER_BLOCK;
@@ -914,10 +923,82 @@ static loff_t max_file_size(unsigned bits)
 	leaf_count *= NIDS_PER_BLOCK;
 	result += leaf_count;
 
-	result <<= bits;
 	return result;
 }
 
+static inline bool sanity_check_area_boundary(struct super_block *sb,
+					struct f2fs_super_block *raw_super)
+{
+	u32 segment0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr);
+	u32 cp_blkaddr = le32_to_cpu(raw_super->cp_blkaddr);
+	u32 sit_blkaddr = le32_to_cpu(raw_super->sit_blkaddr);
+	u32 nat_blkaddr = le32_to_cpu(raw_super->nat_blkaddr);
+	u32 ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr);
+	u32 main_blkaddr = le32_to_cpu(raw_super->main_blkaddr);
+	u32 segment_count_ckpt = le32_to_cpu(raw_super->segment_count_ckpt);
+	u32 segment_count_sit = le32_to_cpu(raw_super->segment_count_sit);
+	u32 segment_count_nat = le32_to_cpu(raw_super->segment_count_nat);
+	u32 segment_count_ssa = le32_to_cpu(raw_super->segment_count_ssa);
+	u32 segment_count_main = le32_to_cpu(raw_super->segment_count_main);
+	u32 segment_count = le32_to_cpu(raw_super->segment_count);
+	u32 log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg);
+
+	if (segment0_blkaddr != cp_blkaddr) {
+		f2fs_msg(sb, KERN_INFO,
+			"Mismatch start address, segment0(%u) cp_blkaddr(%u)",
+			segment0_blkaddr, cp_blkaddr);
+		return true;
+	}
+
+	if (cp_blkaddr + (segment_count_ckpt << log_blocks_per_seg) !=
+							sit_blkaddr) {
+		f2fs_msg(sb, KERN_INFO,
+			"Wrong CP boundary, start(%u) end(%u) blocks(%u)",
+			cp_blkaddr, sit_blkaddr,
+			segment_count_ckpt << log_blocks_per_seg);
+		return true;
+	}
+
+	if (sit_blkaddr + (segment_count_sit << log_blocks_per_seg) !=
+							nat_blkaddr) {
+		f2fs_msg(sb, KERN_INFO,
+			"Wrong SIT boundary, start(%u) end(%u) blocks(%u)",
+			sit_blkaddr, nat_blkaddr,
+			segment_count_sit << log_blocks_per_seg);
+		return true;
+	}
+
+	if (nat_blkaddr + (segment_count_nat << log_blocks_per_seg) !=
+							ssa_blkaddr) {
+		f2fs_msg(sb, KERN_INFO,
+			"Wrong NAT boundary, start(%u) end(%u) blocks(%u)",
+			nat_blkaddr, ssa_blkaddr,
+			segment_count_nat << log_blocks_per_seg);
+		return true;
+	}
+
+	if (ssa_blkaddr + (segment_count_ssa << log_blocks_per_seg) !=
+							main_blkaddr) {
+		f2fs_msg(sb, KERN_INFO,
+			"Wrong SSA boundary, start(%u) end(%u) blocks(%u)",
+			ssa_blkaddr, main_blkaddr,
+			segment_count_ssa << log_blocks_per_seg);
+		return true;
+	}
+
+	if (main_blkaddr + (segment_count_main << log_blocks_per_seg) !=
+		segment0_blkaddr + (segment_count << log_blocks_per_seg)) {
+		f2fs_msg(sb, KERN_INFO,
+			"Wrong MAIN_AREA boundary, start(%u) end(%u) blocks(%u)",
+			main_blkaddr,
+			segment0_blkaddr + (segment_count << log_blocks_per_seg),
+			segment_count_main << log_blocks_per_seg);
+		return true;
+	}
+
+	return false;
+}
+
 static int sanity_check_raw_super(struct super_block *sb,
 			struct f2fs_super_block *raw_super)
 {
@@ -947,6 +1028,14 @@ static int sanity_check_raw_super(struct super_block *sb,
 		return 1;
 	}
 
+	/* check log blocks per segment */
+	if (le32_to_cpu(raw_super->log_blocks_per_seg) != 9) {
+		f2fs_msg(sb, KERN_INFO,
+			"Invalid log blocks per segment (%u)\n",
+			le32_to_cpu(raw_super->log_blocks_per_seg));
+		return 1;
+	}
+
 	/* Currently, support 512/1024/2048/4096 bytes sector size */
 	if (le32_to_cpu(raw_super->log_sectorsize) >
 				F2FS_MAX_LOG_SECTOR_SIZE ||
@@ -965,6 +1054,23 @@ static int sanity_check_raw_super(struct super_block *sb,
 			le32_to_cpu(raw_super->log_sectorsize));
 		return 1;
 	}
+
+	/* check reserved ino info */
+	if (le32_to_cpu(raw_super->node_ino) != 1 ||
+		le32_to_cpu(raw_super->meta_ino) != 2 ||
+		le32_to_cpu(raw_super->root_ino) != 3) {
+		f2fs_msg(sb, KERN_INFO,
+			"Invalid Fs Meta Ino: node(%u) meta(%u) root(%u)",
+			le32_to_cpu(raw_super->node_ino),
+			le32_to_cpu(raw_super->meta_ino),
+			le32_to_cpu(raw_super->root_ino));
+		return 1;
+	}
+
+	/* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */
+	if (sanity_check_area_boundary(sb, raw_super))
+		return 1;
+
 	return 0;
 }
 
@@ -1018,7 +1124,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
 		atomic_set(&sbi->nr_pages[i], 0);
 
 	sbi->dir_level = DEF_DIR_LEVEL;
-	sbi->cp_interval = DEF_CP_INTERVAL;
+	sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL;
+	sbi->interval_time[REQ_TIME] = DEF_IDLE_INTERVAL;
 	clear_sbi_flag(sbi, SBI_NEED_FSCK);
 
 	INIT_LIST_HEAD(&sbi->s_list);
@@ -1032,111 +1139,114 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
  */
 static int read_raw_super_block(struct super_block *sb,
 			struct f2fs_super_block **raw_super,
-			struct buffer_head **raw_super_buf,
-			int *recovery)
+			int *valid_super_block, int *recovery)
 {
 	int block = 0;
-	struct buffer_head *buffer;
-	struct f2fs_super_block *super;
+	struct buffer_head *bh;
+	struct f2fs_super_block *super, *buf;
 	int err = 0;
 
+	super = kzalloc(sizeof(struct f2fs_super_block), GFP_KERNEL);
+	if (!super)
+		return -ENOMEM;
 retry:
-	buffer = sb_bread(sb, block);
-	if (!buffer) {
+	bh = sb_bread(sb, block);
+	if (!bh) {
 		*recovery = 1;
 		f2fs_msg(sb, KERN_ERR, "Unable to read %dth superblock",
 				block + 1);
-		if (block == 0) {
-			block++;
-			goto retry;
-		} else {
-			err = -EIO;
-			goto out;
-		}
+		err = -EIO;
+		goto next;
 	}
 
-	super = (struct f2fs_super_block *)
-		((char *)(buffer)->b_data + F2FS_SUPER_OFFSET);
+	buf = (struct f2fs_super_block *)(bh->b_data + F2FS_SUPER_OFFSET);
 
 	/* sanity checking of raw super */
-	if (sanity_check_raw_super(sb, super)) {
-		brelse(buffer);
+	if (sanity_check_raw_super(sb, buf)) {
+		brelse(bh);
 		*recovery = 1;
 		f2fs_msg(sb, KERN_ERR,
 			"Can't find valid F2FS filesystem in %dth superblock",
 								block + 1);
-		if (block == 0) {
-			block++;
-			goto retry;
-		} else {
-			err = -EINVAL;
-			goto out;
-		}
+		err = -EINVAL;
+		goto next;
 	}
 
 	if (!*raw_super) {
-		*raw_super_buf = buffer;
+		memcpy(super, buf, sizeof(*super));
+		*valid_super_block = block;
 		*raw_super = super;
-	} else {
-		/* already have a valid superblock */
-		brelse(buffer);
 	}
+	brelse(bh);
 
+next:
 	/* check the validity of the second superblock */
 	if (block == 0) {
 		block++;
 		goto retry;
 	}
 
-out:
 	/* No valid superblock */
-	if (!*raw_super)
+	if (!*raw_super) {
+		kfree(super);
 		return err;
+	}
 
 	return 0;
 }
 
+static int __f2fs_commit_super(struct f2fs_sb_info *sbi, int block)
+{
+	struct f2fs_super_block *super = F2FS_RAW_SUPER(sbi);
+	struct buffer_head *bh;
+	int err;
+
+	bh = sb_getblk(sbi->sb, block);
+	if (!bh)
+		return -EIO;
+
+	lock_buffer(bh);
+	memcpy(bh->b_data + F2FS_SUPER_OFFSET, super, sizeof(*super));
+	set_buffer_uptodate(bh);
+	set_buffer_dirty(bh);
+	unlock_buffer(bh);
+
+	/* this is a rare case, so we can always use FUA */
+	err = __sync_dirty_buffer(bh, WRITE_FLUSH_FUA);
+	brelse(bh);
+
+	return err;
+}
+
 int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover)
 {
-	struct buffer_head *sbh = sbi->raw_super_buf;
-	sector_t block = sbh->b_blocknr;
 	int err;
 
 	/* write back-up superblock first */
-	sbh->b_blocknr = block ? 0 : 1;
-	mark_buffer_dirty(sbh);
-	err = sync_dirty_buffer(sbh);
-
-	sbh->b_blocknr = block;
+	err = __f2fs_commit_super(sbi, sbi->valid_super_block ? 0 : 1);
 
 	/* if we are in recovery path, skip writing valid superblock */
 	if (recover || err)
-		goto out;
+		return err;
 
 	/* write current valid superblock */
-	mark_buffer_dirty(sbh);
-	err = sync_dirty_buffer(sbh);
-out:
-	clear_buffer_write_io_error(sbh);
-	set_buffer_uptodate(sbh);
-	return err;
+	return __f2fs_commit_super(sbi, sbi->valid_super_block);
 }
 
 static int f2fs_fill_super(struct super_block *sb, void *data, int silent)
 {
 	struct f2fs_sb_info *sbi;
 	struct f2fs_super_block *raw_super;
-	struct buffer_head *raw_super_buf;
 	struct inode *root;
 	long err;
 	bool retry = true, need_fsck = false;
 	char *options = NULL;
-	int recovery, i;
+	int recovery, i, valid_super_block;
 
 try_onemore:
 	err = -EINVAL;
 	raw_super = NULL;
-	raw_super_buf = NULL;
+	valid_super_block = -1;
 	recovery = 0;
 
 	/* allocate memory for f2fs-specific super block info */
@@ -1150,7 +1260,8 @@ try_onemore:
 		goto free_sbi;
 	}
 
-	err = read_raw_super_block(sb, &raw_super, &raw_super_buf, &recovery);
+	err = read_raw_super_block(sb, &raw_super, &valid_super_block,
+								&recovery);
 	if (err)
 		goto free_sbi;
 
@@ -1167,7 +1278,9 @@ try_onemore:
 	if (err)
 		goto free_options;
 
-	sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize));
+	sbi->max_file_blocks = max_file_blocks();
+	sb->s_maxbytes = sbi->max_file_blocks <<
+				le32_to_cpu(raw_super->log_blocksize);
 	sb->s_max_links = F2FS_LINK_MAX;
 	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
 
@@ -1183,7 +1296,7 @@ try_onemore:
 	/* init f2fs-specific super block info */
 	sbi->sb = sb;
 	sbi->raw_super = raw_super;
-	sbi->raw_super_buf = raw_super_buf;
+	sbi->valid_super_block = valid_super_block;
 	mutex_init(&sbi->gc_mutex);
 	mutex_init(&sbi->writepages);
 	mutex_init(&sbi->cp_mutex);
@@ -1236,8 +1349,10 @@ try_onemore:
 				le64_to_cpu(sbi->ckpt->valid_block_count);
 	sbi->last_valid_block_count = sbi->total_valid_block_count;
 	sbi->alloc_valid_block_count = 0;
-	INIT_LIST_HEAD(&sbi->dir_inode_list);
-	spin_lock_init(&sbi->dir_inode_lock);
+	for (i = 0; i < NR_INODE_TYPE; i++) {
+		INIT_LIST_HEAD(&sbi->inode_list[i]);
+		spin_lock_init(&sbi->inode_lock[i]);
+	}
 
 	init_extent_cache_info(sbi);
 
@@ -1355,12 +1470,14 @@ try_onemore:
 		f2fs_commit_super(sbi, true);
 	}
 
-	sbi->cp_expires = round_jiffies_up(jiffies);
-
+	f2fs_update_time(sbi, CP_TIME);
+	f2fs_update_time(sbi, REQ_TIME);
 	return 0;
 
 free_kobj:
 	kobject_del(&sbi->s_kobj);
+	kobject_put(&sbi->s_kobj);
+	wait_for_completion(&sbi->s_kobj_unregister);
 free_proc:
 	if (sbi->s_proc) {
 		remove_proc_entry("segment_info", sbi->s_proc);
@@ -1387,7 +1504,7 @@ free_meta_inode:
 free_options:
 	kfree(options);
 free_sb_buf:
-	brelse(raw_super_buf);
+	kfree(raw_super);
 free_sbi:
 	kfree(sbi);
 
@@ -1424,8 +1541,9 @@ MODULE_ALIAS_FS("f2fs");
 
 static int __init init_inodecache(void)
 {
-	f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache",
-			sizeof(struct f2fs_inode_info));
+	f2fs_inode_cachep = kmem_cache_create("f2fs_inode_cache",
+			sizeof(struct f2fs_inode_info), 0,
+			SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, NULL);
 	if (!f2fs_inode_cachep)
 		return -ENOMEM;
 	return 0;
@@ -1478,10 +1596,14 @@ static int __init init_f2fs_fs(void)
 	err = register_filesystem(&f2fs_fs_type);
 	if (err)
 		goto free_shrinker;
-	f2fs_create_root_stats();
+	err = f2fs_create_root_stats();
+	if (err)
+		goto free_filesystem;
 	f2fs_proc_root = proc_mkdir("fs/f2fs", NULL);
 	return 0;
 
+free_filesystem:
+	unregister_filesystem(&f2fs_fs_type);
 free_shrinker:
 	unregister_shrinker(&f2fs_shrinker_info);
 free_crypto:
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 862368a32..10f1e784f 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -25,38 +25,6 @@
 #include "f2fs.h"
 #include "xattr.h"
 
-static size_t f2fs_xattr_generic_list(const struct xattr_handler *handler,
-		struct dentry *dentry, char *list, size_t list_size,
-		const char *name, size_t len)
-{
-	struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
-	int total_len, prefix_len;
-
-	switch (handler->flags) {
-	case F2FS_XATTR_INDEX_USER:
-		if (!test_opt(sbi, XATTR_USER))
-			return -EOPNOTSUPP;
-		break;
-	case F2FS_XATTR_INDEX_TRUSTED:
-		if (!capable(CAP_SYS_ADMIN))
-			return -EPERM;
-		break;
-	case F2FS_XATTR_INDEX_SECURITY:
-		break;
-	default:
-		return -EINVAL;
-	}
-
-	prefix_len = strlen(handler->prefix);
-	total_len = prefix_len + len + 1;
-	if (list && total_len <= list_size) {
-		memcpy(list, handler->prefix, prefix_len);
-		memcpy(list + prefix_len, name, len);
-		list[prefix_len + len] = '\0';
-	}
-	return total_len;
-}
-
 static int f2fs_xattr_generic_get(const struct xattr_handler *handler,
 		struct dentry *dentry, const char *name, void *buffer,
 		size_t size)
@@ -77,8 +45,6 @@ static int f2fs_xattr_generic_get(const struct xattr_handler *handler,
 	default:
 		return -EINVAL;
 	}
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
 	return f2fs_getxattr(d_inode(dentry), handler->flags, name,
 			     buffer, size, NULL);
 }
@@ -103,24 +69,20 @@ static int f2fs_xattr_generic_set(const struct xattr_handler *handler,
 	default:
 		return -EINVAL;
 	}
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
-
 	return f2fs_setxattr(d_inode(dentry), handler->flags, name,
 					value, size, NULL, flags);
 }
 
-static size_t f2fs_xattr_advise_list(const struct xattr_handler *handler,
-		struct dentry *dentry, char *list, size_t list_size,
-		const char *name, size_t len)
+static bool f2fs_xattr_user_list(struct dentry *dentry)
 {
-	const char *xname = F2FS_SYSTEM_ADVISE_PREFIX;
-	size_t size;
+	struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb);
 
-	size = strlen(xname) + 1;
-	if (list && size <= list_size)
-		memcpy(list, xname, size);
-	return size;
+	return test_opt(sbi, XATTR_USER);
+}
+
+static bool f2fs_xattr_trusted_list(struct dentry *dentry)
+{
+	return capable(CAP_SYS_ADMIN);
 }
 
 static int f2fs_xattr_advise_get(const struct xattr_handler *handler,
@@ -129,9 +91,6 @@ static int f2fs_xattr_advise_get(const struct xattr_handler *handler,
 {
 	struct inode *inode = d_inode(dentry);
 
-	if (strcmp(name, "") != 0)
-		return -EINVAL;
-
 	if (buffer)
 		*((char *)buffer) = F2FS_I(inode)->i_advise;
 	return sizeof(char);
@@ -143,8 +102,6 @@ static int f2fs_xattr_advise_set(const struct xattr_handler *handler,
 {
 	struct inode *inode = d_inode(dentry);
 
-	if (strcmp(name, "") != 0)
-		return -EINVAL;
 	if (!inode_owner_or_capable(inode))
 		return -EPERM;
 	if (value == NULL)
@@ -183,7 +140,7 @@ int f2fs_init_security(struct inode *inode, struct inode *dir,
 const struct xattr_handler f2fs_xattr_user_handler = {
 	.prefix	= XATTR_USER_PREFIX,
 	.flags	= F2FS_XATTR_INDEX_USER,
-	.list	= f2fs_xattr_generic_list,
+	.list	= f2fs_xattr_user_list,
 	.get	= f2fs_xattr_generic_get,
 	.set	= f2fs_xattr_generic_set,
 };
@@ -191,15 +148,14 @@ const struct xattr_handler f2fs_xattr_user_handler = {
 const struct xattr_handler f2fs_xattr_trusted_handler = {
 	.prefix	= XATTR_TRUSTED_PREFIX,
 	.flags	= F2FS_XATTR_INDEX_TRUSTED,
-	.list	= f2fs_xattr_generic_list,
+	.list	= f2fs_xattr_trusted_list,
 	.get	= f2fs_xattr_generic_get,
 	.set	= f2fs_xattr_generic_set,
 };
 
 const struct xattr_handler f2fs_xattr_advise_handler = {
-	.prefix = F2FS_SYSTEM_ADVISE_PREFIX,
+	.name	= F2FS_SYSTEM_ADVISE_NAME,
 	.flags	= F2FS_XATTR_INDEX_ADVISE,
-	.list   = f2fs_xattr_advise_list,
 	.get    = f2fs_xattr_advise_get,
 	.set    = f2fs_xattr_advise_set,
 };
@@ -207,7 +163,6 @@ const struct xattr_handler f2fs_xattr_advise_handler = {
 const struct xattr_handler f2fs_xattr_security_handler = {
 	.prefix	= XATTR_SECURITY_PREFIX,
 	.flags	= F2FS_XATTR_INDEX_SECURITY,
-	.list	= f2fs_xattr_generic_list,
 	.get	= f2fs_xattr_generic_get,
 	.set	= f2fs_xattr_generic_set,
 };
@@ -455,20 +410,27 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
 	list_for_each_xattr(entry, base_addr) {
 		const struct xattr_handler *handler =
 			f2fs_xattr_handler(entry->e_name_index);
+		const char *prefix;
+		size_t prefix_len;
 		size_t size;
 
-		if (!handler)
+		if (!handler || (handler->list && !handler->list(dentry)))
 			continue;
 
-		size = handler->list(handler, dentry, buffer, rest,
-				     entry->e_name, entry->e_name_len);
-		if (buffer && size > rest) {
-			error = -ERANGE;
-			goto cleanup;
+		prefix = handler->prefix ?: handler->name;
+		prefix_len = strlen(prefix);
+		size = prefix_len + entry->e_name_len + 1;
+		if (buffer) {
+			if (size > rest) {
+				error = -ERANGE;
+				goto cleanup;
+			}
+			memcpy(buffer, prefix, prefix_len);
+			buffer += prefix_len;
+			memcpy(buffer, entry->e_name, entry->e_name_len);
+			buffer += entry->e_name_len;
+			*buffer++ = 0;
 		}
-
-		if (buffer)
-			buffer += size;
 		rest -= size;
 	}
 	error = buffer_size - rest;
@@ -609,7 +571,7 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name,
 	if (ipage)
 		return __f2fs_setxattr(inode, index, name, value,
 						size, ipage, flags);
-	f2fs_balance_fs(sbi);
+	f2fs_balance_fs(sbi, true);
 
 	f2fs_lock_op(sbi);
 	/* protect xattr_ver */
@@ -618,5 +580,6 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name,
 	up_write(&F2FS_I(inode)->i_sem);
 	f2fs_unlock_op(sbi);
 
+	f2fs_update_time(sbi, REQ_TIME);
 	return err;
 }
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index 71a7100d5..79dccc825 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -27,7 +27,7 @@
 #define F2FS_XATTR_REFCOUNT_MAX         1024
 
 /* Name indexes */
-#define F2FS_SYSTEM_ADVISE_PREFIX		"system.advise"
+#define F2FS_SYSTEM_ADVISE_NAME			"system.advise"
 #define F2FS_XATTR_INDEX_USER			1
 #define F2FS_XATTR_INDEX_POSIX_ACL_ACCESS	2
 #define F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT	3
diff --git a/fs/fat/cache.c b/fs/fat/cache.c
index 93fc62232..5d3849215 100644
--- a/fs/fat/cache.c
+++ b/fs/fat/cache.c
@@ -301,15 +301,59 @@ static int fat_bmap_cluster(struct inode *inode, int cluster)
 	return dclus;
 }
 
-int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
-	     unsigned long *mapped_blocks, int create)
+int fat_get_mapped_cluster(struct inode *inode, sector_t sector,
+			   sector_t last_block,
+			   unsigned long *mapped_blocks, sector_t *bmap)
 {
 	struct super_block *sb = inode->i_sb;
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	int cluster, offset;
+
+	cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits);
+	offset  = sector & (sbi->sec_per_clus - 1);
+	cluster = fat_bmap_cluster(inode, cluster);
+	if (cluster < 0)
+		return cluster;
+	else if (cluster) {
+		*bmap = fat_clus_to_blknr(sbi, cluster) + offset;
+		*mapped_blocks = sbi->sec_per_clus - offset;
+		if (*mapped_blocks > last_block - sector)
+			*mapped_blocks = last_block - sector;
+	}
+
+	return 0;
+}
+
+static int is_exceed_eof(struct inode *inode, sector_t sector,
+			 sector_t *last_block, int create)
+{
+	struct super_block *sb = inode->i_sb;
 	const unsigned long blocksize = sb->s_blocksize;
 	const unsigned char blocksize_bits = sb->s_blocksize_bits;
+
+	*last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits;
+	if (sector >= *last_block) {
+		if (!create)
+			return 1;
+
+		/*
+		 * ->mmu_private can be accessed only on the allocation path.
+		 * (caller must hold ->i_mutex)
+		 */
+		*last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1))
+			>> blocksize_bits;
+		if (sector >= *last_block)
+			return 1;
+	}
+
+	return 0;
+}
+
+int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
+	     unsigned long *mapped_blocks, int create, bool from_bmap)
+{
+	struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
 	sector_t last_block;
-	int cluster, offset;
 
 	*phys = 0;
 	*mapped_blocks = 0;
@@ -321,31 +365,16 @@ int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
 		return 0;
 	}
 
-	last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits;
-	if (sector >= last_block) {
-		if (!create)
+	if (!from_bmap) {
+		if (is_exceed_eof(inode, sector, &last_block, create))
 			return 0;
-
-		/*
-		 * ->mmu_private can access on only allocation path.
-		 * (caller must hold ->i_mutex)
-		 */
-		last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1))
-			>> blocksize_bits;
+	} else {
+		last_block = inode->i_blocks >>
+				(inode->i_sb->s_blocksize_bits - 9);
 		if (sector >= last_block)
 			return 0;
 	}
 
-	cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits);
-	offset  = sector & (sbi->sec_per_clus - 1);
-	cluster = fat_bmap_cluster(inode, cluster);
-	if (cluster < 0)
-		return cluster;
-	else if (cluster) {
-		*phys = fat_clus_to_blknr(sbi, cluster) + offset;
-		*mapped_blocks = sbi->sec_per_clus - offset;
-		if (*mapped_blocks > last_block - sector)
-			*mapped_blocks = last_block - sector;
-	}
-	return 0;
+	return fat_get_mapped_cluster(inode, sector, last_block, mapped_blocks,
+				      phys);
 }
diff --git a/fs/fat/dir.c b/fs/fat/dir.c
index 8b2127ffb..d0b95c950 100644
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -91,7 +91,7 @@ next:
 
 	*bh = NULL;
 	iblock = *pos >> sb->s_blocksize_bits;
-	err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0);
+	err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0, false);
 	if (err || !phys)
 		return -1;	/* beyond EOF or error */
 
@@ -769,7 +769,7 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *file,
 
 	buf.dirent = dirent;
 	buf.result = 0;
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 	buf.ctx.pos = file->f_pos;
 	ret = -ENOENT;
 	if (!IS_DEADDIR(inode)) {
@@ -777,7 +777,7 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *file,
 				    short_only, both ? &buf : NULL);
 		file->f_pos = buf.ctx.pos;
 	}
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	if (ret >= 0)
 		ret = buf.result;
 	return ret;
diff --git a/fs/fat/fat.h b/fs/fat/fat.h
index be5e15323..e6b764a17 100644
--- a/fs/fat/fat.h
+++ b/fs/fat/fat.h
@@ -87,7 +87,7 @@ struct msdos_sb_info {
 	unsigned int vol_id;		/*volume ID*/
 
 	int fatent_shift;
-	struct fatent_operations *fatent_ops;
+	const struct fatent_operations *fatent_ops;
 	struct inode *fat_inode;
 	struct inode *fsinfo_inode;
 
@@ -285,8 +285,11 @@ static inline void fatwchar_to16(__u8 *dst, const wchar_t *src, size_t len)
 extern void fat_cache_inval_inode(struct inode *inode);
 extern int fat_get_cluster(struct inode *inode, int cluster,
 			   int *fclus, int *dclus);
+extern int fat_get_mapped_cluster(struct inode *inode, sector_t sector,
+				  sector_t last_block,
+				  unsigned long *mapped_blocks, sector_t *bmap);
 extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys,
-		    unsigned long *mapped_blocks, int create);
+		    unsigned long *mapped_blocks, int create, bool from_bmap);
 
 /* fat/dir.c */
 extern const struct file_operations fat_dir_operations;
@@ -384,6 +387,7 @@ static inline unsigned long fat_dir_hash(int logstart)
 {
 	return hash_32(logstart, FAT_HASH_BITS);
 }
+extern int fat_add_cluster(struct inode *inode);
 
 /* fat/misc.c */
 extern __printf(3, 4) __cold
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index 822655713..1d9a8c4e9 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -99,7 +99,7 @@ err:
 static int fat_ent_bread(struct super_block *sb, struct fat_entry *fatent,
 			 int offset, sector_t blocknr)
 {
-	struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
+	const struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
 
 	WARN_ON(blocknr < MSDOS_SB(sb)->fat_start);
 	fatent->fat_inode = MSDOS_SB(sb)->fat_inode;
@@ -246,7 +246,7 @@ static int fat32_ent_next(struct fat_entry *fatent)
 	return 0;
 }
 
-static struct fatent_operations fat12_ops = {
+static const struct fatent_operations fat12_ops = {
 	.ent_blocknr	= fat12_ent_blocknr,
 	.ent_set_ptr	= fat12_ent_set_ptr,
 	.ent_bread	= fat12_ent_bread,
@@ -255,7 +255,7 @@ static struct fatent_operations fat12_ops = {
 	.ent_next	= fat12_ent_next,
 };
 
-static struct fatent_operations fat16_ops = {
+static const struct fatent_operations fat16_ops = {
 	.ent_blocknr	= fat_ent_blocknr,
 	.ent_set_ptr	= fat16_ent_set_ptr,
 	.ent_bread	= fat_ent_bread,
@@ -264,7 +264,7 @@ static struct fatent_operations fat16_ops = {
 	.ent_next	= fat16_ent_next,
 };
 
-static struct fatent_operations fat32_ops = {
+static const struct fatent_operations fat32_ops = {
 	.ent_blocknr	= fat_ent_blocknr,
 	.ent_set_ptr	= fat32_ent_set_ptr,
 	.ent_bread	= fat_ent_bread,
@@ -320,7 +320,7 @@ static inline int fat_ent_update_ptr(struct super_block *sb,
 				     int offset, sector_t blocknr)
 {
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
-	struct fatent_operations *ops = sbi->fatent_ops;
+	const struct fatent_operations *ops = sbi->fatent_ops;
 	struct buffer_head **bhs = fatent->bhs;
 
 	/* Is this fatent's blocks including this entry? */
@@ -349,7 +349,7 @@ int fat_ent_read(struct inode *inode, struct fat_entry *fatent, int entry)
 {
 	struct super_block *sb = inode->i_sb;
 	struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb);
-	struct fatent_operations *ops = sbi->fatent_ops;
+	const struct fatent_operations *ops = sbi->fatent_ops;
 	int err, offset;
 	sector_t blocknr;
 
@@ -407,7 +407,7 @@ int fat_ent_write(struct inode *inode, struct fat_entry *fatent,
 		  int new, int wait)
 {
 	struct super_block *sb = inode->i_sb;
-	struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
+	const struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
 	int err;
 
 	ops->ent_put(fatent, new);
@@ -432,7 +432,7 @@ static inline int fat_ent_next(struct msdos_sb_info *sbi,
 static inline int fat_ent_read_block(struct super_block *sb,
 				     struct fat_entry *fatent)
 {
-	struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
+	const struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
 	sector_t blocknr;
 	int offset;
 
@@ -463,7 +463,7 @@ int fat_alloc_clusters(struct inode *inode, int *cluster, int nr_cluster)
 {
 	struct super_block *sb = inode->i_sb;
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
-	struct fatent_operations *ops = sbi->fatent_ops;
+	const struct fatent_operations *ops = sbi->fatent_ops;
 	struct fat_entry fatent, prev_ent;
 	struct buffer_head *bhs[MAX_BUF_PER_PAGE];
 	int i, count, err, nr_bhs, idx_clus;
@@ -551,7 +551,7 @@ int fat_free_clusters(struct inode *inode, int cluster)
 {
 	struct super_block *sb = inode->i_sb;
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
-	struct fatent_operations *ops = sbi->fatent_ops;
+	const struct fatent_operations *ops = sbi->fatent_ops;
 	struct fat_entry fatent;
 	struct buffer_head *bhs[MAX_BUF_PER_PAGE];
 	int i, err, nr_bhs;
@@ -636,7 +636,7 @@ EXPORT_SYMBOL_GPL(fat_free_clusters);
 static void fat_ent_reada(struct super_block *sb, struct fat_entry *fatent,
 			  unsigned long reada_blocks)
 {
-	struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
+	const struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops;
 	sector_t blocknr;
 	int i, offset;
 
@@ -649,7 +649,7 @@ static void fat_ent_reada(struct super_block *sb, struct fat_entry *fatent,
 int fat_count_free_clusters(struct super_block *sb)
 {
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
-	struct fatent_operations *ops = sbi->fatent_ops;
+	const struct fatent_operations *ops = sbi->fatent_ops;
 	struct fat_entry fatent;
 	unsigned long reada_blocks, reada_mask, cur_block;
 	int err = 0, free;
diff --git a/fs/fat/file.c b/fs/fat/file.c
index a08f10399..f70185668 100644
--- a/fs/fat/file.c
+++ b/fs/fat/file.c
@@ -14,15 +14,19 @@
 #include <linux/backing-dev.h>
 #include <linux/fsnotify.h>
 #include <linux/security.h>
+#include <linux/falloc.h>
 #include "fat.h"
 
+static long fat_fallocate(struct file *file, int mode,
+			  loff_t offset, loff_t len);
+
 static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr)
 {
 	u32 attr;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 	attr = fat_make_attrs(inode);
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 
 	return put_user(attr, user_attr);
 }
@@ -43,7 +47,7 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
 	err = mnt_want_write_file(file);
 	if (err)
 		goto out;
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	/*
 	 * ATTR_VOLUME and ATTR_DIR cannot be changed; this also
@@ -105,7 +109,7 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr)
 	fat_save_attrs(inode, attr);
 	mark_inode_dirty(inode);
 out_unlock_inode:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	mnt_drop_write_file(file);
 out:
 	return err;
@@ -177,6 +181,7 @@ const struct file_operations fat_file_operations = {
 #endif
 	.fsync		= fat_file_fsync,
 	.splice_read	= generic_file_splice_read,
+	.fallocate	= fat_fallocate,
 };
 
 static int fat_cont_expand(struct inode *inode, loff_t size)
@@ -215,6 +220,62 @@ out:
 	return err;
 }
 
+/*
+ * Preallocate space for a file. This implements fat's fallocate file
+ * operation, which gets called from sys_fallocate system call. User
+ * space requests len bytes at offset. If FALLOC_FL_KEEP_SIZE is set
+ * we just allocate clusters without zeroing them out. Otherwise we
+ * allocate and zero out clusters via an expanding truncate.
+ */
+static long fat_fallocate(struct file *file, int mode,
+			  loff_t offset, loff_t len)
+{
+	int nr_cluster; /* Number of clusters to be allocated */
+	loff_t mm_bytes; /* Number of bytes to be allocated for file */
+	loff_t ondisksize; /* block aligned on-disk size in bytes */
+	struct inode *inode = file->f_mapping->host;
+	struct super_block *sb = inode->i_sb;
+	struct msdos_sb_info *sbi = MSDOS_SB(sb);
+	int err = 0;
+
+	/* No support for hole punch or other fallocate flags. */
+	if (mode & ~FALLOC_FL_KEEP_SIZE)
+		return -EOPNOTSUPP;
+
+	/* No support for dir */
+	if (!S_ISREG(inode->i_mode))
+		return -EOPNOTSUPP;
+
+	inode_lock(inode);
+	if (mode & FALLOC_FL_KEEP_SIZE) {
+		ondisksize = inode->i_blocks << 9;
+		if ((offset + len) <= ondisksize)
+			goto error;
+
+		/* First compute the number of clusters to be allocated */
+		mm_bytes = offset + len - ondisksize;
+		nr_cluster = (mm_bytes + (sbi->cluster_size - 1)) >>
+			sbi->cluster_bits;
+
+		/* Start the allocation. We are not zeroing out the clusters */
+		while (nr_cluster-- > 0) {
+			err = fat_add_cluster(inode);
+			if (err)
+				goto error;
+		}
+	} else {
+		if ((offset + len) <= i_size_read(inode))
+			goto error;
+
+		/* This is just an expanding truncate */
+		err = fat_cont_expand(inode, (offset + len));
+	}
+
+error:
+	inode_unlock(inode);
+	return err;
+}
+
 /* Free all clusters after the skip'th cluster. */
 static int fat_free(struct inode *inode, int skip)
 {
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 509411dd3..a55990521 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -93,7 +93,7 @@ static struct fat_floppy_defaults {
 },
 };
 
-static int fat_add_cluster(struct inode *inode)
+int fat_add_cluster(struct inode *inode)
 {
 	int err, cluster;
 
@@ -115,10 +115,10 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
 	struct super_block *sb = inode->i_sb;
 	struct msdos_sb_info *sbi = MSDOS_SB(sb);
 	unsigned long mapped_blocks;
-	sector_t phys;
+	sector_t phys, last_block;
 	int err, offset;
 
-	err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create);
+	err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create, false);
 	if (err)
 		return err;
 	if (phys) {
@@ -135,8 +135,14 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
 		return -EIO;
 	}
 
+	last_block = inode->i_blocks >> (sb->s_blocksize_bits - 9);
 	offset = (unsigned long)iblock & (sbi->sec_per_clus - 1);
-	if (!offset) {
+	/*
+	 * allocate a cluster according to the following.
+	 * 1) no more available blocks
+	 * 2) not part of fallocate region
+	 */
+	if (!offset && !(iblock < last_block)) {
 		/* TODO: multiple cluster allocation would be desirable. */
 		err = fat_add_cluster(inode);
 		if (err)
@@ -148,7 +154,7 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock,
 	*max_blocks = min(mapped_blocks, *max_blocks);
 	MSDOS_I(inode)->mmu_private += *max_blocks << sb->s_blocksize_bits;
 
-	err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create);
+	err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create, false);
 	if (err)
 		return err;
 
@@ -273,13 +279,38 @@ static ssize_t fat_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
 	return ret;
 }
 
+static int fat_get_block_bmap(struct inode *inode, sector_t iblock,
+		struct buffer_head *bh_result, int create)
+{
+	struct super_block *sb = inode->i_sb;
+	unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
+	int err;
+	sector_t bmap;
+	unsigned long mapped_blocks;
+
+	BUG_ON(create != 0);
+
+	err = fat_bmap(inode, iblock, &bmap, &mapped_blocks, create, true);
+	if (err)
+		return err;
+
+	if (bmap) {
+		map_bh(bh_result, sb, bmap);
+		max_blocks = min(mapped_blocks, max_blocks);
+	}
+
+	bh_result->b_size = max_blocks << sb->s_blocksize_bits;
+
+	return 0;
+}
+
 static sector_t _fat_bmap(struct address_space *mapping, sector_t block)
 {
 	sector_t blocknr;
 
 	/* fat_get_cluster() assumes the requested blocknr isn't truncated. */
 	down_read(&MSDOS_I(mapping->host)->truncate_lock);
-	blocknr = generic_block_bmap(mapping, block, fat_get_block);
+	blocknr = generic_block_bmap(mapping, block, fat_get_block_bmap);
 	up_read(&MSDOS_I(mapping->host)->truncate_lock);
 
 	return blocknr;
@@ -449,6 +480,24 @@ static int fat_calc_dir_size(struct inode *inode)
 	return 0;
 }
 
+static int fat_validate_dir(struct inode *dir)
+{
+	struct super_block *sb = dir->i_sb;
+
+	if (dir->i_nlink < 2) {
+		/* Directory should have "."/".." entries at least. */
+		fat_fs_error(sb, "corrupted directory (invalid entries)");
+		return -EIO;
+	}
+	if (MSDOS_I(dir)->i_start == 0 ||
+	    MSDOS_I(dir)->i_start == MSDOS_SB(sb)->root_cluster) {
+		/* Directory should point to a valid cluster. */
+		fat_fs_error(sb, "corrupted directory (invalid i_start)");
+		return -EIO;
+	}
+	return 0;
+}
+
 /* doesn't deal with root inode */
 int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
 {
@@ -475,6 +524,10 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de)
 		MSDOS_I(inode)->mmu_private = inode->i_size;
 
 		set_nlink(inode, fat_subdirs(inode));
+
+		error = fat_validate_dir(inode);
+		if (error < 0)
+			return error;
 	} else { /* not a directory */
 		inode->i_generation |= 1;
 		inode->i_mode = fat_make_mode(sbi, de->attr,
@@ -553,13 +606,43 @@ out:
 
 EXPORT_SYMBOL_GPL(fat_build_inode);
 
+static int __fat_write_inode(struct inode *inode, int wait);
+
+static void fat_free_eofblocks(struct inode *inode)
+{
+	/* Release unwritten fallocated blocks on inode eviction. */
+	if ((inode->i_blocks << 9) >
+			round_up(MSDOS_I(inode)->mmu_private,
+				MSDOS_SB(inode->i_sb)->cluster_size)) {
+		int err;
+
+		fat_truncate_blocks(inode, MSDOS_I(inode)->mmu_private);
+		/* Fallocate results in updating the i_start/i_logstart
+		 * for the zero byte file. So, make it return to
+		 * original state during evict and commit it to avoid
+		 * any corruption on the next access to the cluster
+		 * chain for the file.
+		 */
+		err = __fat_write_inode(inode, inode_needs_sync(inode));
+		if (err) {
+			fat_msg(inode->i_sb, KERN_WARNING, "Failed to "
+					"update on disk inode for unused "
+					"fallocated blocks, inode could be "
+					"corrupted. Please run fsck");
+		}
+
+	}
+}
+
 static void fat_evict_inode(struct inode *inode)
 {
 	truncate_inode_pages_final(&inode->i_data);
 	if (!inode->i_nlink) {
 		inode->i_size = 0;
 		fat_truncate_blocks(inode, 0);
-	}
+	} else
+		fat_free_eofblocks(inode);
+
 	invalidate_inode_buffers(inode);
 	clear_inode(inode);
 	fat_cache_inval_inode(inode);
@@ -677,7 +760,7 @@ static int __init fat_init_inodecache(void)
 	fat_inode_cachep = kmem_cache_create("fat_inode_cache",
 					     sizeof(struct msdos_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (fat_inode_cachep == NULL)
 		return -ENOMEM;
@@ -1146,7 +1229,12 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat,
 		case Opt_time_offset:
 			if (match_int(&args[0], &option))
 				return -EINVAL;
-			if (option < -12 * 60 || option > 12 * 60)
+			/*
+			 * GMT+-12 zones may have DST corrections so at least
+			 * 13 hours difference is needed. Make the limit 24
+			 * just in case someone invents something unusual.
+			 */
+			if (option < -24 * 60 || option > 24 * 60)
 				return -EINVAL;
 			opts->tz_set = 1;
 			opts->time_offset = option;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 8abb9f814..350a2c8cf 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -29,7 +29,7 @@
 
 #define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)
 
-int setfl(int fd, struct file * filp, unsigned long arg)
+static int setfl(int fd, struct file * filp, unsigned long arg)
 {
 	struct inode * inode = file_inode(filp);
 	int error = 0;
@@ -51,7 +51,8 @@ int setfl(int fd, struct file * filp, unsigned long arg)
 	       if (arg & O_NDELAY)
 		   arg |= O_NONBLOCK;
 
-	if (arg & O_DIRECT) {
+	/* Pipe packetized mode is controlled by O_DIRECT flag */
+	if (!S_ISFIFO(filp->f_inode->i_mode) && (arg & O_DIRECT)) {
 		if (!filp->f_mapping || !filp->f_mapping->a_ops ||
 			!filp->f_mapping->a_ops->direct_IO)
 				return -EINVAL;
@@ -59,8 +60,6 @@ int setfl(int fd, struct file * filp, unsigned long arg)
 
 	if (filp->f_op->check_flags)
 		error = filp->f_op->check_flags(arg);
-	if (!error && filp->f_op->setfl)
-		error = filp->f_op->setfl(filp, arg);
 	if (error)
 		return error;
 
@@ -81,7 +80,6 @@ int setfl(int fd, struct file * filp, unsigned long arg)
  out:
 	return error;
 }
-EXPORT_SYMBOL_GPL(setfl);
 
 static void f_modown(struct file *filp, struct pid *pid, enum pid_type type,
                      int force)
diff --git a/fs/file.c b/fs/file.c
index 39f8f1592..1fbc5c055 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -25,9 +25,9 @@
 
 int sysctl_nr_open __read_mostly = 1024*1024;
 int sysctl_nr_open_min = BITS_PER_LONG;
-/* our max() is unusable in constant expressions ;-/ */
-#define __const_max(x, y) ((x) < (y) ? (x) : (y))
-int sysctl_nr_open_max = __const_max(INT_MAX, ~(size_t)0/sizeof(void *)) &
+/* our min() is unusable in constant expressions ;-/ */
+#define __const_min(x, y) ((x) < (y) ? (x) : (y))
+int sysctl_nr_open_max = __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) &
 			 -BITS_PER_LONG;
 
 static void *alloc_fdmem(size_t size)
@@ -37,11 +37,12 @@ static void *alloc_fdmem(size_t size)
 	 * vmalloc() if the allocation size will be considered "large" by the VM.
 	 */
 	if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
-		void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY);
+		void *data = kmalloc(size, GFP_KERNEL_ACCOUNT |
+				     __GFP_NOWARN | __GFP_NORETRY);
 		if (data != NULL)
 			return data;
 	}
-	return vmalloc(size);
+	return __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM, PAGE_KERNEL);
 }
 
 static void __free_fdtable(struct fdtable *fdt)
@@ -126,7 +127,7 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
 	if (unlikely(nr > sysctl_nr_open))
 		nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;
 
-	fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL);
+	fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT);
 	if (!fdt)
 		goto out;
 	fdt->max_fds = nr;
diff --git a/fs/file_table.c b/fs/file_table.c
index ae9f2676d..ad17e05eb 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -147,7 +147,6 @@ over:
 	}
 	return ERR_PTR(-ENFILE);
 }
-EXPORT_SYMBOL_GPL(get_empty_filp);
 
 /**
  * alloc_file - allocate and initialize a 'struct file'
@@ -259,7 +258,6 @@ void flush_delayed_fput(void)
 {
 	delayed_fput(NULL);
 }
-EXPORT_SYMBOL_GPL(flush_delayed_fput);
 
 static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput);
 
@@ -302,7 +300,6 @@ void __fput_sync(struct file *file)
 }
 
 EXPORT_SYMBOL(fput);
-EXPORT_SYMBOL_GPL(__fput_sync);
 
 void put_filp(struct file *file)
 {
@@ -311,7 +308,6 @@ void put_filp(struct file *file)
 		file_free(file);
 	}
 }
-EXPORT_SYMBOL_GPL(put_filp);
 
 void __init files_init(void)
 { 
diff --git a/fs/filesystems.c b/fs/filesystems.c
index 5797d45a7..c5618db11 100644
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -46,9 +46,9 @@ void put_filesystem(struct file_system_type *fs)
 static struct file_system_type **find_filesystem(const char *name, unsigned len)
 {
 	struct file_system_type **p;
-	for (p=&file_systems; *p; p=&(*p)->next)
-		if (strlen((*p)->name) == len &&
-		    strncmp((*p)->name, name, len) == 0)
+	for (p = &file_systems; *p; p = &(*p)->next)
+		if (strncmp((*p)->name, name, len) == 0 &&
+		    !(*p)->name[len])
 			break;
 	return p;
 }
diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c
index ef73ed674..3e2ccade6 100644
--- a/fs/freevxfs/vxfs_inode.c
+++ b/fs/freevxfs/vxfs_inode.c
@@ -326,6 +326,7 @@ vxfs_iget(struct super_block *sbp, ino_t ino)
 	} else if (S_ISLNK(ip->i_mode)) {
 		if (!VXFS_ISIMMED(vip)) {
 			ip->i_op = &page_symlink_inode_operations;
+			inode_nohighmem(ip);
 			ip->i_mapping->a_ops = &vxfs_aops;
 		} else {
 			ip->i_op = &simple_symlink_inode_operations;
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 7a8ea1351..5c46ed9f3 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -685,9 +685,7 @@ void wbc_account_io(struct writeback_control *wbc, struct page *page,
 	if (!wbc->wb)
 		return;
 
-	rcu_read_lock();
 	id = mem_cgroup_css_from_page(page)->id;
-	rcu_read_unlock();
 
 	if (id == wbc->wb_id) {
 		wbc->wb_bytes += bytes;
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 5e2e08712..4b855b65d 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -944,7 +944,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
 	if (!parent)
 		return -ENOENT;
 
-	mutex_lock(&parent->i_mutex);
+	inode_lock(parent);
 	if (!S_ISDIR(parent->i_mode))
 		goto unlock;
 
@@ -962,7 +962,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
 	fuse_invalidate_entry(entry);
 
 	if (child_nodeid != 0 && d_really_is_positive(entry)) {
-		mutex_lock(&d_inode(entry)->i_mutex);
+		inode_lock(d_inode(entry));
 		if (get_node_id(d_inode(entry)) != child_nodeid) {
 			err = -ENOENT;
 			goto badentry;
@@ -983,7 +983,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
 		clear_nlink(d_inode(entry));
 		err = 0;
  badentry:
-		mutex_unlock(&d_inode(entry)->i_mutex);
+		inode_unlock(d_inode(entry));
 		if (!err)
 			d_delete(entry);
 	} else {
@@ -992,7 +992,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
 	dput(entry);
 
  unlock:
-	mutex_unlock(&parent->i_mutex);
+	inode_unlock(parent);
 	iput(parent);
 	return err;
 }
@@ -1365,15 +1365,19 @@ static int fuse_readdir(struct file *file, struct dir_context *ctx)
 	return err;
 }
 
-static const char *fuse_follow_link(struct dentry *dentry, void **cookie)
+static const char *fuse_get_link(struct dentry *dentry,
+				 struct inode *inode,
+				 struct delayed_call *done)
 {
-	struct inode *inode = d_inode(dentry);
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	FUSE_ARGS(args);
 	char *link;
 	ssize_t ret;
 
-	link = (char *) __get_free_page(GFP_KERNEL);
+	if (!dentry)
+		return ERR_PTR(-ECHILD);
+
+	link = kmalloc(PAGE_SIZE, GFP_KERNEL);
 	if (!link)
 		return ERR_PTR(-ENOMEM);
 
@@ -1385,11 +1389,11 @@ static const char *fuse_follow_link(struct dentry *dentry, void **cookie)
 	args.out.args[0].value = link;
 	ret = fuse_simple_request(fc, &args);
 	if (ret < 0) {
-		free_page((unsigned long) link);
+		kfree(link);
 		link = ERR_PTR(ret);
 	} else {
 		link[ret] = '\0';
-		*cookie = link;
+		set_delayed_call(done, kfree_link, link);
 	}
 	fuse_invalidate_atime(inode);
 	return link;
@@ -1500,7 +1504,7 @@ void fuse_set_nowrite(struct inode *inode)
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_inode *fi = get_fuse_inode(inode);
 
-	BUG_ON(!mutex_is_locked(&inode->i_mutex));
+	BUG_ON(!inode_is_locked(inode));
 
 	spin_lock(&fc->lock);
 	BUG_ON(fi->writectr < 0);
@@ -1909,8 +1913,7 @@ static const struct inode_operations fuse_common_inode_operations = {
 
 static const struct inode_operations fuse_symlink_inode_operations = {
 	.setattr	= fuse_setattr,
-	.follow_link	= fuse_follow_link,
-	.put_link	= free_page_put_link,
+	.get_link	= fuse_get_link,
 	.readlink	= generic_readlink,
 	.getattr	= fuse_getattr,
 	.setxattr	= fuse_setxattr,
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 570ca4053..b03d253ec 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -207,7 +207,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
 		return err;
 
 	if (lock_inode)
-		mutex_lock(&inode->i_mutex);
+		inode_lock(inode);
 
 	err = fuse_do_open(fc, get_node_id(inode), file, isdir);
 
@@ -215,7 +215,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
 		fuse_finish_open(inode, file);
 
 	if (lock_inode)
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 
 	return err;
 }
@@ -413,9 +413,9 @@ static int fuse_flush(struct file *file, fl_owner_t id)
 	if (err)
 		return err;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 	fuse_sync_writes(inode);
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 
 	req = fuse_get_req_nofail_nopages(fc, file);
 	memset(&inarg, 0, sizeof(inarg));
@@ -450,7 +450,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
 	if (is_bad_inode(inode))
 		return -EIO;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	/*
 	 * Start writeback against all dirty pages of the inode, then
@@ -486,7 +486,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
 		err = 0;
 	}
 out:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	return err;
 }
 
@@ -1160,7 +1160,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		return generic_file_write_iter(iocb, from);
 	}
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	/* We can write back this queue in page reclaim */
 	current->backing_dev_info = inode_to_bdi(inode);
@@ -1210,7 +1210,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	}
 out:
 	current->backing_dev_info = NULL;
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 
 	return written ? written : err;
 }
@@ -1322,10 +1322,10 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
 
 	if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) {
 		if (!write)
-			mutex_lock(&inode->i_mutex);
+			inode_lock(inode);
 		fuse_sync_writes(inode);
 		if (!write)
-			mutex_unlock(&inode->i_mutex);
+			inode_unlock(inode);
 	}
 
 	while (count) {
@@ -1413,14 +1413,14 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
 		return -EIO;
 
 	/* Don't allow parallel writes to the same file */
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 	res = generic_write_checks(iocb, from);
 	if (res > 0)
 		res = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE);
 	fuse_invalidate_attr(inode);
 	if (res > 0)
 		fuse_write_update_size(inode, iocb->ki_pos);
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 
 	return res;
 }
@@ -2231,20 +2231,77 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
 	return err ? 0 : outarg.block;
 }
 
+static loff_t fuse_lseek(struct file *file, loff_t offset, int whence)
+{
+	struct inode *inode = file->f_mapping->host;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_file *ff = file->private_data;
+	FUSE_ARGS(args);
+	struct fuse_lseek_in inarg = {
+		.fh = ff->fh,
+		.offset = offset,
+		.whence = whence
+	};
+	struct fuse_lseek_out outarg;
+	int err;
+
+	if (fc->no_lseek)
+		goto fallback;
+
+	args.in.h.opcode = FUSE_LSEEK;
+	args.in.h.nodeid = ff->nodeid;
+	args.in.numargs = 1;
+	args.in.args[0].size = sizeof(inarg);
+	args.in.args[0].value = &inarg;
+	args.out.numargs = 1;
+	args.out.args[0].size = sizeof(outarg);
+	args.out.args[0].value = &outarg;
+	err = fuse_simple_request(fc, &args);
+	if (err) {
+		if (err == -ENOSYS) {
+			fc->no_lseek = 1;
+			goto fallback;
+		}
+		return err;
+	}
+
+	return vfs_setpos(file, outarg.offset, inode->i_sb->s_maxbytes);
+
+fallback:
+	err = fuse_update_attributes(inode, NULL, file, NULL);
+	if (!err)
+		return generic_file_llseek(file, offset, whence);
+	else
+		return err;
+}
+
 static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence)
 {
 	loff_t retval;
 	struct inode *inode = file_inode(file);
 
-	/* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */
-	if (whence == SEEK_CUR || whence == SEEK_SET)
-		return generic_file_llseek(file, offset, whence);
-
-	mutex_lock(&inode->i_mutex);
-	retval = fuse_update_attributes(inode, NULL, file, NULL);
-	if (!retval)
+	switch (whence) {
+	case SEEK_SET:
+	case SEEK_CUR:
+		 /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */
 		retval = generic_file_llseek(file, offset, whence);
-	mutex_unlock(&inode->i_mutex);
+		break;
+	case SEEK_END:
+		inode_lock(inode);
+		retval = fuse_update_attributes(inode, NULL, file, NULL);
+		if (!retval)
+			retval = generic_file_llseek(file, offset, whence);
+		inode_unlock(inode);
+		break;
+	case SEEK_HOLE:
+	case SEEK_DATA:
+		inode_lock(inode);
+		retval = fuse_lseek(file, offset, whence);
+		inode_unlock(inode);
+		break;
+	default:
+		retval = -EINVAL;
+	}
 
 	return retval;
 }
@@ -2887,7 +2944,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 		return -EOPNOTSUPP;
 
 	if (lock_inode) {
-		mutex_lock(&inode->i_mutex);
+		inode_lock(inode);
 		if (mode & FALLOC_FL_PUNCH_HOLE) {
 			loff_t endbyte = offset + length - 1;
 			err = filemap_write_and_wait_range(inode->i_mapping,
@@ -2933,7 +2990,7 @@ out:
 		clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
 
 	if (lock_inode)
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 
 	return err;
 }
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index 405113101..ce394b5fe 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -605,6 +605,9 @@ struct fuse_conn {
 	/** Does the filesystem support asynchronous direct-IO submission? */
 	unsigned async_dio:1;
 
+	/** Is lseek not implemented by fs? */
+	unsigned no_lseek:1;
+
 	/** The number of requests waiting for completion */
 	atomic_t num_waiting;
 
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 2913db2a5..4d69d5c0b 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1255,8 +1255,8 @@ static int __init fuse_fs_init(void)
 	int err;
 
 	fuse_inode_cachep = kmem_cache_create("fuse_inode",
-					      sizeof(struct fuse_inode),
-					      0, SLAB_HWCACHE_ALIGN,
+					      sizeof(struct fuse_inode), 0,
+					      SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT,
 					      fuse_inode_init_once);
 	err = -ENOMEM;
 	if (!fuse_inode_cachep)
diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c
index 1be3b061c..791932617 100644
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -31,9 +31,9 @@ static const char *gfs2_acl_name(int type)
 {
 	switch (type) {
 	case ACL_TYPE_ACCESS:
-		return GFS2_POSIX_ACL_ACCESS;
+		return XATTR_POSIX_ACL_ACCESS;
 	case ACL_TYPE_DEFAULT:
-		return GFS2_POSIX_ACL_DEFAULT;
+		return XATTR_POSIX_ACL_DEFAULT;
 	}
 	return NULL;
 }
diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h
index 2d65ec4cd..3af4f407a 100644
--- a/fs/gfs2/acl.h
+++ b/fs/gfs2/acl.h
@@ -12,8 +12,6 @@
 
 #include "incore.h"
 
-#define GFS2_POSIX_ACL_ACCESS		"posix_acl_access"
-#define GFS2_POSIX_ACL_DEFAULT		"posix_acl_default"
 #define GFS2_ACL_MAX_ENTRIES(sdp) ((300 << (sdp)->sd_sb.sb_bsize_shift) >> 12)
 
 extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type);
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index 1caee0534..93f07465e 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -914,7 +914,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping,
 failed:
 	gfs2_trans_end(sdp);
 	gfs2_inplace_release(ip);
-	if (ip->i_res->rs_qa_qd_num)
+	if (ip->i_qadata && ip->i_qadata->qa_qd_num)
 		gfs2_quota_unlock(ip);
 	if (inode == sdp->sd_rindex) {
 		gfs2_glock_dq(&m_ip->i_gh);
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 61296ecbd..0860f0b5b 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -787,8 +787,8 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh,
 	if (error)
 		goto out_rlist;
 
-	if (gfs2_rs_active(ip->i_res)) /* needs to be done with the rgrp glock held */
-		gfs2_rs_deltree(ip->i_res);
+	if (gfs2_rs_active(&ip->i_res)) /* needs to be done with the rgrp glock held */
+		gfs2_rs_deltree(&ip->i_res);
 
 	error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE +
 				 RES_INDIRECT + RES_STATFS + RES_QUOTA,
@@ -1291,13 +1291,9 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize)
 	if (ret)
 		return ret;
 
-	ret = get_write_access(inode);
-	if (ret)
-		return ret;
-
 	inode_dio_wait(inode);
 
-	ret = gfs2_rs_alloc(ip);
+	ret = gfs2_rsqa_alloc(ip);
 	if (ret)
 		goto out;
 
@@ -1307,10 +1303,9 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize)
 		goto out;
 	}
 
-	gfs2_rs_deltree(ip->i_res);
 	ret = do_shrink(inode, oldsize, newsize);
 out:
-	put_write_access(inode);
+	gfs2_rsqa_delete(ip, NULL);
 	return ret;
 }
 
diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c
index ad8a5b757..6a9259230 100644
--- a/fs/gfs2/dir.c
+++ b/fs/gfs2/dir.c
@@ -82,6 +82,8 @@
 
 #define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1)
 #define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1))
+#define GFS2_HASH_INDEX_MASK 0xffffc000
+#define GFS2_USE_HASH_FLAG 0x2000
 
 struct qstr gfs2_qdot __read_mostly;
 struct qstr gfs2_qdotdot __read_mostly;
@@ -108,7 +110,7 @@ static int gfs2_dir_get_existing_buffer(struct gfs2_inode *ip, u64 block,
 	struct buffer_head *bh;
 	int error;
 
-	error = gfs2_meta_read(ip->i_gl, block, DIO_WAIT, &bh);
+	error = gfs2_meta_read(ip->i_gl, block, DIO_WAIT, 0, &bh);
 	if (error)
 		return error;
 	if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_JD)) {
@@ -305,7 +307,7 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, __be64 *buf,
 			BUG_ON(extlen < 1);
 			bh = gfs2_meta_ra(ip->i_gl, dblock, extlen);
 		} else {
-			error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, &bh);
+			error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, 0, &bh);
 			if (error)
 				goto fail;
 		}
@@ -443,6 +445,27 @@ static int gfs2_dirent_last(const struct gfs2_dirent *dent,
 	return 0;
 }
 
+/* Look for the dirent that contains the offset specified in data. Once we
+ * find that dirent, there must be space available there for the new dirent */
+static int gfs2_dirent_find_offset(const struct gfs2_dirent *dent,
+				  const struct qstr *name,
+				  void *ptr)
+{
+	unsigned required = GFS2_DIRENT_SIZE(name->len);
+	unsigned actual = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
+	unsigned totlen = be16_to_cpu(dent->de_rec_len);
+
+	if (ptr < (void *)dent || ptr >= (void *)dent + totlen)
+		return 0;
+	if (gfs2_dirent_sentinel(dent))
+		actual = 0;
+	if (ptr < (void *)dent + actual)
+		return -1;
+	if ((void *)dent + totlen >= ptr + required)
+		return 1;
+	return -1;
+}
+
 static int gfs2_dirent_find_space(const struct gfs2_dirent *dent,
 				  const struct qstr *name,
 				  void *opaque)
@@ -682,6 +705,27 @@ static void dirent_del(struct gfs2_inode *dip, struct buffer_head *bh,
 	prev->de_rec_len = cpu_to_be16(prev_rec_len);
 }
 
+
+static struct gfs2_dirent *do_init_dirent(struct inode *inode,
+					  struct gfs2_dirent *dent,
+					  const struct qstr *name,
+					  struct buffer_head *bh,
+					  unsigned offset)
+{
+	struct gfs2_inode *ip = GFS2_I(inode);
+	struct gfs2_dirent *ndent;
+	unsigned totlen;
+
+	totlen = be16_to_cpu(dent->de_rec_len);
+	BUG_ON(offset + name->len > totlen);
+	gfs2_trans_add_meta(ip->i_gl, bh);
+	ndent = (struct gfs2_dirent *)((char *)dent + offset);
+	dent->de_rec_len = cpu_to_be16(offset);
+	gfs2_qstr2dirent(name, totlen - offset, ndent);
+	return ndent;
+}
+
+
 /*
  * Takes a dent from which to grab space as an argument. Returns the
  * newly created dent.
@@ -691,31 +735,25 @@ static struct gfs2_dirent *gfs2_init_dirent(struct inode *inode,
 					    const struct qstr *name,
 					    struct buffer_head *bh)
 {
-	struct gfs2_inode *ip = GFS2_I(inode);
-	struct gfs2_dirent *ndent;
-	unsigned offset = 0, totlen;
+	unsigned offset = 0;
 
 	if (!gfs2_dirent_sentinel(dent))
 		offset = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len));
-	totlen = be16_to_cpu(dent->de_rec_len);
-	BUG_ON(offset + name->len > totlen);
-	gfs2_trans_add_meta(ip->i_gl, bh);
-	ndent = (struct gfs2_dirent *)((char *)dent + offset);
-	dent->de_rec_len = cpu_to_be16(offset);
-	gfs2_qstr2dirent(name, totlen - offset, ndent);
-	return ndent;
+	return do_init_dirent(inode, dent, name, bh, offset);
 }
 
-static struct gfs2_dirent *gfs2_dirent_alloc(struct inode *inode,
-					     struct buffer_head *bh,
-					     const struct qstr *name)
+static struct gfs2_dirent *gfs2_dirent_split_alloc(struct inode *inode,
+						   struct buffer_head *bh,
+						   const struct qstr *name,
+						   void *ptr)
 {
 	struct gfs2_dirent *dent;
 	dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
-				gfs2_dirent_find_space, name, NULL);
+				gfs2_dirent_find_offset, name, ptr);
 	if (!dent || IS_ERR(dent))
 		return dent;
-	return gfs2_init_dirent(inode, dent, name, bh);
+	return do_init_dirent(inode, dent, name, bh,
+			      (unsigned)(ptr - (void *)dent));
 }
 
 static int get_leaf(struct gfs2_inode *dip, u64 leaf_no,
@@ -723,7 +761,7 @@ static int get_leaf(struct gfs2_inode *dip, u64 leaf_no,
 {
 	int error;
 
-	error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, bhp);
+	error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, 0, bhp);
 	if (!error && gfs2_metatype_check(GFS2_SB(&dip->i_inode), *bhp, GFS2_METATYPE_LF)) {
 		/* pr_info("block num=%llu\n", leaf_no); */
 		error = -EIO;
@@ -1051,10 +1089,11 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name)
 		if (!gfs2_dirent_sentinel(dent) &&
 		    be32_to_cpu(dent->de_hash) < divider) {
 			struct qstr str;
+			void *ptr = ((char *)dent - obh->b_data) + nbh->b_data;
 			str.name = (char*)(dent+1);
 			str.len = be16_to_cpu(dent->de_name_len);
 			str.hash = be32_to_cpu(dent->de_hash);
-			new = gfs2_dirent_alloc(inode, nbh, &str);
+			new = gfs2_dirent_split_alloc(inode, nbh, &str, ptr);
 			if (IS_ERR(new)) {
 				error = PTR_ERR(new);
 				break;
@@ -1186,10 +1225,10 @@ static int compare_dents(const void *a, const void *b)
 	int ret = 0;
 
 	dent_a = *(const struct gfs2_dirent **)a;
-	hash_a = be32_to_cpu(dent_a->de_hash);
+	hash_a = dent_a->de_cookie;
 
 	dent_b = *(const struct gfs2_dirent **)b;
-	hash_b = be32_to_cpu(dent_b->de_hash);
+	hash_b = dent_b->de_cookie;
 
 	if (hash_a > hash_b)
 		ret = 1;
@@ -1227,19 +1266,20 @@ static int compare_dents(const void *a, const void *b)
  */
 
 static int do_filldir_main(struct gfs2_inode *dip, struct dir_context *ctx,
-			   const struct gfs2_dirent **darr, u32 entries,
-			   int *copied)
+			   struct gfs2_dirent **darr, u32 entries,
+			   u32 sort_start, int *copied)
 {
 	const struct gfs2_dirent *dent, *dent_next;
 	u64 off, off_next;
 	unsigned int x, y;
 	int run = 0;
 
-	sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL);
+	if (sort_start < entries)
+		sort(&darr[sort_start], entries - sort_start,
+		     sizeof(struct gfs2_dirent *), compare_dents, NULL);
 
 	dent_next = darr[0];
-	off_next = be32_to_cpu(dent_next->de_hash);
-	off_next = gfs2_disk_hash2offset(off_next);
+	off_next = dent_next->de_cookie;
 
 	for (x = 0, y = 1; x < entries; x++, y++) {
 		dent = dent_next;
@@ -1247,8 +1287,7 @@ static int do_filldir_main(struct gfs2_inode *dip, struct dir_context *ctx,
 
 		if (y < entries) {
 			dent_next = darr[y];
-			off_next = be32_to_cpu(dent_next->de_hash);
-			off_next = gfs2_disk_hash2offset(off_next);
+			off_next = dent_next->de_cookie;
 
 			if (off < ctx->pos)
 				continue;
@@ -1295,6 +1334,40 @@ static void *gfs2_alloc_sort_buffer(unsigned size)
 	return ptr;
 }
 
+
+static int gfs2_set_cookies(struct gfs2_sbd *sdp, struct buffer_head *bh,
+			    unsigned leaf_nr, struct gfs2_dirent **darr,
+			    unsigned entries)
+{
+	int sort_id = -1;
+	int i;
+	
+	for (i = 0; i < entries; i++) {
+		unsigned offset;
+
+		darr[i]->de_cookie = be32_to_cpu(darr[i]->de_hash);
+		darr[i]->de_cookie = gfs2_disk_hash2offset(darr[i]->de_cookie);
+
+		if (!sdp->sd_args.ar_loccookie)
+			continue;
+		offset = (char *)(darr[i]) -
+			 (bh->b_data + gfs2_dirent_offset(bh->b_data));
+		offset /= GFS2_MIN_DIRENT_SIZE;
+		offset += leaf_nr * sdp->sd_max_dents_per_leaf;
+		if (offset >= GFS2_USE_HASH_FLAG ||
+		    leaf_nr >= GFS2_USE_HASH_FLAG) {
+			darr[i]->de_cookie |= GFS2_USE_HASH_FLAG;
+			if (sort_id < 0)
+				sort_id = i;
+			continue;
+		}
+		darr[i]->de_cookie &= GFS2_HASH_INDEX_MASK;
+		darr[i]->de_cookie |= offset;
+	}
+	return sort_id;
+}	
+
+
 static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
 			      int *copied, unsigned *depth,
 			      u64 leaf_no)
@@ -1304,12 +1377,11 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
 	struct buffer_head *bh;
 	struct gfs2_leaf *lf;
 	unsigned entries = 0, entries2 = 0;
-	unsigned leaves = 0;
-	const struct gfs2_dirent **darr, *dent;
+	unsigned leaves = 0, leaf = 0, offset, sort_offset;
+	struct gfs2_dirent **darr, *dent;
 	struct dirent_gather g;
 	struct buffer_head **larr;
-	int leaf = 0;
-	int error, i;
+	int error, i, need_sort = 0, sort_id;
 	u64 lfn = leaf_no;
 
 	do {
@@ -1325,6 +1397,11 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
 		brelse(bh);
 	} while(lfn);
 
+	if (*depth < GFS2_DIR_MAX_DEPTH || !sdp->sd_args.ar_loccookie) {
+		need_sort = 1;
+		sort_offset = 0;
+	}
+
 	if (!entries)
 		return 0;
 
@@ -1338,8 +1415,8 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
 	larr = gfs2_alloc_sort_buffer((leaves + entries + 99) * sizeof(void *));
 	if (!larr)
 		goto out;
-	darr = (const struct gfs2_dirent **)(larr + leaves);
-	g.pdent = darr;
+	darr = (struct gfs2_dirent **)(larr + leaves);
+	g.pdent = (const struct gfs2_dirent **)darr;
 	g.offset = 0;
 	lfn = leaf_no;
 
@@ -1350,6 +1427,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
 		lf = (struct gfs2_leaf *)bh->b_data;
 		lfn = be64_to_cpu(lf->lf_next);
 		if (lf->lf_entries) {
+			offset = g.offset;
 			entries2 += be16_to_cpu(lf->lf_entries);
 			dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size,
 						gfs2_dirent_gather, NULL, &g);
@@ -1367,17 +1445,26 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx,
 				goto out_free;
 			}
 			error = 0;
+			sort_id = gfs2_set_cookies(sdp, bh, leaf, &darr[offset],
+						   be16_to_cpu(lf->lf_entries));
+			if (!need_sort && sort_id >= 0) {
+				need_sort = 1;
+				sort_offset = offset + sort_id;
+			}
 			larr[leaf++] = bh;
 		} else {
+			larr[leaf++] = NULL;
 			brelse(bh);
 		}
 	} while(lfn);
 
 	BUG_ON(entries2 != entries);
-	error = do_filldir_main(ip, ctx, darr, entries, copied);
+	error = do_filldir_main(ip, ctx, darr, entries, need_sort ?
+				sort_offset : entries, copied);
 out_free:
 	for(i = 0; i < leaf; i++)
-		brelse(larr[i]);
+		if (larr[i])
+			brelse(larr[i]);
 	kvfree(larr);
 out:
 	return error;
@@ -1483,7 +1570,7 @@ int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
 	struct gfs2_inode *dip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	struct dirent_gather g;
-	const struct gfs2_dirent **darr, *dent;
+	struct gfs2_dirent **darr, *dent;
 	struct buffer_head *dibh;
 	int copied = 0;
 	int error;
@@ -1507,7 +1594,7 @@ int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
 	/* 96 is max number of dirents which can be stuffed into an inode */
 	darr = kmalloc(96 * sizeof(struct gfs2_dirent *), GFP_NOFS);
 	if (darr) {
-		g.pdent = darr;
+		g.pdent = (const struct gfs2_dirent **)darr;
 		g.offset = 0;
 		dent = gfs2_dirent_scan(inode, dibh->b_data, dibh->b_size,
 					gfs2_dirent_gather, NULL, &g);
@@ -1524,8 +1611,9 @@ int gfs2_dir_read(struct inode *inode, struct dir_context *ctx,
 			error = -EIO;
 			goto out;
 		}
+		gfs2_set_cookies(sdp, dibh, 0, darr, dip->i_entries);
 		error = do_filldir_main(dip, ctx, darr,
-					dip->i_entries, &copied);
+					dip->i_entries, 0, &copied);
 out:
 		kfree(darr);
 	}
@@ -1560,15 +1648,22 @@ struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name,
 
 	dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh);
 	if (dent) {
+		struct inode *inode;
+		u16 rahead;
+
 		if (IS_ERR(dent))
 			return ERR_CAST(dent);
 		dtype = be16_to_cpu(dent->de_type);
+		rahead = be16_to_cpu(dent->de_rahead);
 		addr = be64_to_cpu(dent->de_inum.no_addr);
 		formal_ino = be64_to_cpu(dent->de_inum.no_formal_ino);
 		brelse(bh);
 		if (fail_on_exist)
 			return ERR_PTR(-EEXIST);
-		return gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino, 0);
+		inode = gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino, 0);
+		if (!IS_ERR(inode))
+			GFS2_I(inode)->i_rahead = rahead;
+		return inode;
 	}
 	return ERR_PTR(-ENOENT);
 }
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 5e425469f..c9384f932 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -298,9 +298,9 @@ static int gfs2_set_flags(struct file *filp, u32 __user *ptr)
 		gfsflags &= ~GFS2_DIF_TOPDIR;
 		if (gfsflags & GFS2_DIF_INHERIT_JDATA)
 			gfsflags ^= (GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA);
-		return do_gfs2_set_flags(filp, gfsflags, ~0);
+		return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_SYSTEM);
 	}
-	return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_JDATA);
+	return do_gfs2_set_flags(filp, gfsflags, ~(GFS2_DIF_SYSTEM | GFS2_DIF_JDATA));
 }
 
 static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
@@ -336,8 +336,8 @@ static void gfs2_size_hint(struct file *filep, loff_t offset, size_t size)
 	size_t blks = (size + sdp->sd_sb.sb_bsize - 1) >> sdp->sd_sb.sb_bsize_shift;
 	int hint = min_t(size_t, INT_MAX, blks);
 
-	if (hint > atomic_read(&ip->i_res->rs_sizehint))
-		atomic_set(&ip->i_res->rs_sizehint, hint);
+	if (hint > atomic_read(&ip->i_res.rs_sizehint))
+		atomic_set(&ip->i_res.rs_sizehint, hint);
 }
 
 /**
@@ -397,14 +397,10 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	/* Update file times before taking page lock */
 	file_update_time(vma->vm_file);
 
-	ret = get_write_access(inode);
+	ret = gfs2_rsqa_alloc(ip);
 	if (ret)
 		goto out;
 
-	ret = gfs2_rs_alloc(ip);
-	if (ret)
-		goto out_write_access;
-
 	gfs2_size_hint(vma->vm_file, pos, PAGE_CACHE_SIZE);
 
 	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
@@ -486,8 +482,6 @@ out_uninit:
 		set_page_dirty(page);
 		wait_for_stable_page(page);
 	}
-out_write_access:
-	put_write_access(inode);
 out:
 	sb_end_pagefault(inode->i_sb);
 	return block_page_mkwrite_return(ret);
@@ -623,7 +617,7 @@ static int gfs2_release(struct inode *inode, struct file *file)
 	if (!(file->f_mode & FMODE_WRITE))
 		return 0;
 
-	gfs2_rs_delete(ip, &inode->i_writecount);
+	gfs2_rsqa_delete(ip, &inode->i_writecount);
 	return 0;
 }
 
@@ -703,7 +697,7 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	struct gfs2_inode *ip = GFS2_I(file_inode(file));
 	int ret;
 
-	ret = gfs2_rs_alloc(ip);
+	ret = gfs2_rsqa_alloc(ip);
 	if (ret)
 		return ret;
 
@@ -920,7 +914,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t le
 	if ((mode & ~FALLOC_FL_KEEP_SIZE) || gfs2_is_jdata(ip))
 		return -EOPNOTSUPP;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
 	ret = gfs2_glock_nq(&gh);
@@ -938,20 +932,21 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t le
 	if (ret)
 		goto out_unlock;
 
-	ret = gfs2_rs_alloc(ip);
+	ret = gfs2_rsqa_alloc(ip);
 	if (ret)
 		goto out_putw;
 
 	ret = __gfs2_fallocate(file, mode, offset, len);
 	if (ret)
-		gfs2_rs_deltree(ip->i_res);
+		gfs2_rs_deltree(&ip->i_res);
+
 out_putw:
 	put_write_access(inode);
 out_unlock:
 	gfs2_glock_dq(&gh);
 out_uninit:
 	gfs2_holder_uninit(&gh);
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	return ret;
 }
 
@@ -962,7 +957,7 @@ static ssize_t gfs2_file_splice_write(struct pipe_inode_info *pipe,
 	int error;
 	struct gfs2_inode *ip = GFS2_I(out->f_mapping->host);
 
-	error = gfs2_rs_alloc(ip);
+	error = gfs2_rsqa_alloc(ip);
 	if (error)
 		return (ssize_t)error;
 
@@ -1018,7 +1013,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
 	struct gfs2_inode *ip = GFS2_I(file_inode(file));
 	struct gfs2_glock *gl;
 	unsigned int state;
-	int flags;
+	u16 flags;
 	int error = 0;
 	int sleeptime;
 
@@ -1032,7 +1027,10 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
 		if (fl_gh->gh_state == state)
 			goto out;
 		locks_lock_file_wait(file,
-				     &(struct file_lock){.fl_type = F_UNLCK});
+				     &(struct file_lock) {
+					     .fl_type = F_UNLCK,
+					     .fl_flags = FL_FLOCK
+				     });
 		gfs2_glock_dq(fl_gh);
 		gfs2_holder_reinit(state, flags, fl_gh);
 	} else {
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 32e74710b..a4ff7b56f 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -446,7 +446,7 @@ __acquires(&gl->gl_lockref.lock)
 {
 	const struct gfs2_glock_operations *glops = gl->gl_ops;
 	struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
-	unsigned int lck_flags = gh ? gh->gh_flags : 0;
+	unsigned int lck_flags = (unsigned int)(gh ? gh->gh_flags : 0);
 	int ret;
 
 	lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP |
@@ -750,7 +750,7 @@ again:
  *
  */
 
-void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
+void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, u16 flags,
 		      struct gfs2_holder *gh)
 {
 	INIT_LIST_HEAD(&gh->gh_list);
@@ -774,7 +774,7 @@ void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags,
  *
  */
 
-void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *gh)
+void gfs2_holder_reinit(unsigned int state, u16 flags, struct gfs2_holder *gh)
 {
 	gh->gh_state = state;
 	gh->gh_flags = flags;
@@ -1080,7 +1080,7 @@ void gfs2_glock_dq_uninit(struct gfs2_holder *gh)
 
 int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number,
 		      const struct gfs2_glock_operations *glops,
-		      unsigned int state, int flags, struct gfs2_holder *gh)
+		      unsigned int state, u16 flags, struct gfs2_holder *gh)
 {
 	struct gfs2_glock *gl;
 	int error;
@@ -1417,14 +1417,14 @@ static struct shrinker glock_shrinker = {
 static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp)
 {
 	struct gfs2_glock *gl;
-	struct rhash_head *pos, *next;
+	struct rhash_head *pos;
 	const struct bucket_table *tbl;
 	int i;
 
 	rcu_read_lock();
 	tbl = rht_dereference_rcu(gl_hash_table.tbl, &gl_hash_table);
 	for (i = 0; i < tbl->size; i++) {
-		rht_for_each_entry_safe(gl, pos, next, tbl, i, gl_node) {
+		rht_for_each_entry_rcu(gl, pos, tbl, i, gl_node) {
 			if ((gl->gl_name.ln_sbd == sdp) &&
 			    lockref_get_not_dead(&gl->gl_lockref))
 				examiner(gl);
@@ -1506,7 +1506,9 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp)
 	flush_workqueue(glock_workqueue);
 	glock_hash_walk(clear_glock, sdp);
 	flush_workqueue(glock_workqueue);
-	wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0);
+	wait_event_timeout(sdp->sd_glock_wait,
+			   atomic_read(&sdp->sd_glock_disposal) == 0,
+			   HZ * 600);
 	glock_hash_walk(dump_glock_func, sdp);
 }
 
@@ -1539,7 +1541,7 @@ static const char *state2str(unsigned state)
 	return "??";
 }
 
-static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags)
+static const char *hflags2str(char *buf, u16 flags, unsigned long iflags)
 {
 	char *p = buf;
 	if (flags & LM_FLAG_TRY)
diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h
index f7cdaa8b4..46ab67fc1 100644
--- a/fs/gfs2/glock.h
+++ b/fs/gfs2/glock.h
@@ -79,15 +79,15 @@ enum {
  * requested had acquired and released the lock.
  */
 
-#define LM_FLAG_TRY		0x00000001
-#define LM_FLAG_TRY_1CB		0x00000002
-#define LM_FLAG_NOEXP		0x00000004
-#define LM_FLAG_ANY		0x00000008
-#define LM_FLAG_PRIORITY	0x00000010
-#define GL_ASYNC		0x00000040
-#define GL_EXACT		0x00000080
-#define GL_SKIP			0x00000100
-#define GL_NOCACHE		0x00000400
+#define LM_FLAG_TRY		0x0001
+#define LM_FLAG_TRY_1CB		0x0002
+#define LM_FLAG_NOEXP		0x0004
+#define LM_FLAG_ANY		0x0008
+#define LM_FLAG_PRIORITY	0x0010
+#define GL_ASYNC		0x0040
+#define GL_EXACT		0x0080
+#define GL_SKIP			0x0100
+#define GL_NOCACHE		0x0400
   
 /*
  * lm_async_cb return flags
@@ -183,8 +183,8 @@ extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number,
 			  int create, struct gfs2_glock **glp);
 extern void gfs2_glock_put(struct gfs2_glock *gl);
 extern void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state,
-			     unsigned flags, struct gfs2_holder *gh);
-extern void gfs2_holder_reinit(unsigned int state, unsigned flags,
+			     u16 flags, struct gfs2_holder *gh);
+extern void gfs2_holder_reinit(unsigned int state, u16 flags,
 			       struct gfs2_holder *gh);
 extern void gfs2_holder_uninit(struct gfs2_holder *gh);
 extern int gfs2_glock_nq(struct gfs2_holder *gh);
@@ -195,7 +195,7 @@ extern void gfs2_glock_dq_wait(struct gfs2_holder *gh);
 extern void gfs2_glock_dq_uninit(struct gfs2_holder *gh);
 extern int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number,
 			     const struct gfs2_glock_operations *glops,
-			     unsigned int state, int flags,
+			     unsigned int state, u16 flags,
 			     struct gfs2_holder *gh);
 extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs);
 extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs);
@@ -215,7 +215,7 @@ void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...);
  */
 
 static inline int gfs2_glock_nq_init(struct gfs2_glock *gl,
-				     unsigned int state, int flags,
+				     unsigned int state, u16 flags,
 				     struct gfs2_holder *gh)
 {
 	int error;
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index f348cfb6b..437fd73e3 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -13,6 +13,7 @@
 #include <linux/gfs2_ondisk.h>
 #include <linux/bio.h>
 #include <linux/posix_acl.h>
+#include <linux/security.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -262,6 +263,7 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
 		if (ip) {
 			set_bit(GIF_INVALID, &ip->i_flags);
 			forget_all_cached_acls(&ip->i_inode);
+			security_inode_invalidate_secctx(&ip->i_inode);
 			gfs2_dir_hash_inval(ip);
 		}
 	}
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index de7b4f97a..845fb09cc 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -259,8 +259,8 @@ struct gfs2_holder {
 
 	struct gfs2_glock *gh_gl;
 	struct pid *gh_owner_pid;
-	unsigned int gh_state;
-	unsigned gh_flags;
+	u16 gh_flags;
+	u16 gh_state;
 
 	int gh_error;
 	unsigned long gh_iflags; /* HIF_... */
@@ -270,6 +270,13 @@ struct gfs2_holder {
 /* Number of quota types we support */
 #define GFS2_MAXQUOTAS 2
 
+struct gfs2_qadata { /* quota allocation data */
+	/* Quota stuff */
+	struct gfs2_quota_data *qa_qd[2 * GFS2_MAXQUOTAS];
+	struct gfs2_holder qa_qd_ghs[2 * GFS2_MAXQUOTAS];
+	unsigned int qa_qd_num;
+};
+
 /* Resource group multi-block reservation, in order of appearance:
 
    Step 1. Function prepares to write, allocates a mb, sets the size hint.
@@ -288,11 +295,6 @@ struct gfs2_blkreserv {
 	struct gfs2_rbm rs_rbm;       /* Start of reservation */
 	u32 rs_free;                  /* how many blocks are still free */
 	u64 rs_inum;                  /* Inode number for reservation */
-
-	/* ancillary quota stuff */
-	struct gfs2_quota_data *rs_qa_qd[2 * GFS2_MAXQUOTAS];
-	struct gfs2_holder rs_qa_qd_ghs[2 * GFS2_MAXQUOTAS];
-	unsigned int rs_qa_qd_num;
 };
 
 /*
@@ -391,7 +393,8 @@ struct gfs2_inode {
 	struct gfs2_glock *i_gl; /* Move into i_gh? */
 	struct gfs2_holder i_iopen_gh;
 	struct gfs2_holder i_gh; /* for prepare/commit_write only */
-	struct gfs2_blkreserv *i_res; /* rgrp multi-block reservation */
+	struct gfs2_qadata *i_qadata; /* quota allocation data */
+	struct gfs2_blkreserv i_res; /* rgrp multi-block reservation */
 	struct gfs2_rgrpd *i_rgd;
 	u64 i_goal;	/* goal block for allocations */
 	struct rw_semaphore i_rw_mutex;
@@ -402,6 +405,7 @@ struct gfs2_inode {
 	u32 i_diskflags;
 	u8 i_height;
 	u8 i_depth;
+	u16 i_rahead;
 };
 
 /*
@@ -558,6 +562,8 @@ struct gfs2_args {
 	unsigned int ar_errors:2;               /* errors=withdraw | panic */
 	unsigned int ar_nobarrier:1;            /* do not send barriers */
 	unsigned int ar_rgrplvb:1;		/* use lvbs for rgrp info */
+	unsigned int ar_loccookie:1;		/* use location based readdir
+						   cookies */
 	int ar_commit;				/* Commit interval */
 	int ar_statfs_quantum;			/* The fast statfs interval */
 	int ar_quota_quantum;			/* The quota interval */
@@ -685,6 +691,7 @@ struct gfs2_sbd {
 	u64 sd_heightsize[GFS2_MAX_META_HEIGHT + 1];
 	u32 sd_max_jheight; /* Max height of journaled file's meta tree */
 	u64 sd_jheightsize[GFS2_MAX_META_HEIGHT + 1];
+	u32 sd_max_dents_per_leaf; /* Max number of dirents in a leaf block */
 
 	struct gfs2_args sd_args;	/* Mount arguments */
 	struct gfs2_tune sd_tune;	/* Filesystem tuning structure */
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 063fdfcf8..352f95876 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -191,13 +191,13 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type,
 fail_refresh:
 	ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
 	ip->i_iopen_gh.gh_gl->gl_object = NULL;
-	gfs2_glock_dq_uninit(&ip->i_iopen_gh);
+	gfs2_glock_dq_wait(&ip->i_iopen_gh);
+	gfs2_holder_uninit(&ip->i_iopen_gh);
 fail_iopen:
 	if (io_gl)
 		gfs2_glock_put(io_gl);
 fail_put:
 	ip->i_gl->gl_object = NULL;
-	gfs2_glock_put(ip->i_gl);
 fail:
 	iget_failed(inode);
 	return ERR_PTR(error);
@@ -593,7 +593,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 	struct gfs2_inode *dip = GFS2_I(dir), *ip;
 	struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode);
 	struct gfs2_glock *io_gl;
-	int error, free_vfs_inode = 0;
+	int error, free_vfs_inode = 1;
 	u32 aflags = 0;
 	unsigned blocks = 1;
 	struct gfs2_diradd da = { .bh = NULL, .save_loc = 1, };
@@ -601,7 +601,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 	if (!name->len || name->len > GFS2_FNAMESIZE)
 		return -ENAMETOOLONG;
 
-	error = gfs2_rs_alloc(dip);
+	error = gfs2_rsqa_alloc(dip);
 	if (error)
 		return error;
 
@@ -650,10 +650,10 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 
 	error = posix_acl_create(dir, &mode, &default_acl, &acl);
 	if (error)
-		goto fail_free_vfs_inode;
+		goto fail_gunlock;
 
 	ip = GFS2_I(inode);
-	error = gfs2_rs_alloc(ip);
+	error = gfs2_rsqa_alloc(ip);
 	if (error)
 		goto fail_free_acls;
 
@@ -685,6 +685,11 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 		ip->i_entries = 2;
 		break;
 	}
+
+	/* Force SYSTEM flag on all files and subdirs of a SYSTEM directory */
+	if (dip->i_diskflags & GFS2_DIF_SYSTEM)
+		ip->i_diskflags |= GFS2_DIF_SYSTEM;
+
 	gfs2_set_inode_flags(inode);
 
 	if ((GFS2_I(d_inode(sdp->sd_root_dir)) == dip) ||
@@ -733,6 +738,9 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 	gfs2_set_iop(inode);
 	insert_inode_hash(inode);
 
+	free_vfs_inode = 0; /* After this point, the inode is no longer
+			       considered free. Any failures need to undo
+			       the gfs2 structures. */
 	if (default_acl) {
 		error = gfs2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
 		posix_acl_release(default_acl);
@@ -766,24 +774,19 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 	return error;
 
 fail_gunlock3:
-	gfs2_glock_dq_uninit(ghs + 1);
-	if (ip->i_gl)
-		gfs2_glock_put(ip->i_gl);
-	goto fail_gunlock;
-
+	gfs2_glock_dq_uninit(&ip->i_iopen_gh);
+	gfs2_glock_put(io_gl);
 fail_gunlock2:
 	gfs2_glock_dq_uninit(ghs + 1);
 fail_free_inode:
 	if (ip->i_gl)
 		gfs2_glock_put(ip->i_gl);
-	gfs2_rs_delete(ip, NULL);
+	gfs2_rsqa_delete(ip, NULL);
 fail_free_acls:
 	if (default_acl)
 		posix_acl_release(default_acl);
 	if (acl)
 		posix_acl_release(acl);
-fail_free_vfs_inode:
-	free_vfs_inode = 1;
 fail_gunlock:
 	gfs2_dir_no_add(&da);
 	gfs2_glock_dq_uninit(ghs);
@@ -898,7 +901,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir,
 	if (S_ISDIR(inode->i_mode))
 		return -EPERM;
 
-	error = gfs2_rs_alloc(dip);
+	error = gfs2_rsqa_alloc(dip);
 	if (error)
 		return error;
 
@@ -1371,7 +1374,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry,
 	if (error)
 		return error;
 
-	error = gfs2_rs_alloc(ndip);
+	error = gfs2_rsqa_alloc(ndip);
 	if (error)
 		return error;
 
@@ -1712,24 +1715,30 @@ static int gfs2_rename2(struct inode *odir, struct dentry *odentry,
 }
 
 /**
- * gfs2_follow_link - Follow a symbolic link
+ * gfs2_get_link - Follow a symbolic link
  * @dentry: The dentry of the link
- * @nd: Data that we pass to vfs_follow_link()
+ * @inode: The inode of the link
+ * @done: destructor for return value
  *
  * This can handle symlinks of any size.
  *
  * Returns: 0 on success or error code
  */
 
-static const char *gfs2_follow_link(struct dentry *dentry, void **cookie)
+static const char *gfs2_get_link(struct dentry *dentry,
+				 struct inode *inode,
+				 struct delayed_call *done)
 {
-	struct gfs2_inode *ip = GFS2_I(d_inode(dentry));
+	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_holder i_gh;
 	struct buffer_head *dibh;
 	unsigned int size;
 	char *buf;
 	int error;
 
+	if (!dentry)
+		return ERR_PTR(-ECHILD);
+
 	gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh);
 	error = gfs2_glock_nq(&i_gh);
 	if (error) {
@@ -1759,7 +1768,7 @@ static const char *gfs2_follow_link(struct dentry *dentry, void **cookie)
 out:
 	gfs2_glock_dq_uninit(&i_gh);
 	if (!IS_ERR(buf))
-		*cookie = buf;
+		set_delayed_call(done, kfree_link, buf);
 	return buf;
 }
 
@@ -1854,11 +1863,7 @@ static int setattr_chown(struct inode *inode, struct iattr *attr)
 	if (!(attr->ia_valid & ATTR_GID) || gid_eq(ogid, ngid))
 		ogid = ngid = NO_GID_QUOTA_CHANGE;
 
-	error = get_write_access(inode);
-	if (error)
-		return error;
-
-	error = gfs2_rs_alloc(ip);
+	error = gfs2_rsqa_alloc(ip);
 	if (error)
 		goto out;
 
@@ -1898,7 +1903,6 @@ out_end_trans:
 out_gunlock_q:
 	gfs2_quota_unlock(ip);
 out:
-	put_write_access(inode);
 	return error;
 }
 
@@ -1920,7 +1924,7 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr)
 	struct gfs2_holder i_gh;
 	int error;
 
-	error = gfs2_rs_alloc(ip);
+	error = gfs2_rsqa_alloc(ip);
 	if (error)
 		return error;
 
@@ -2002,7 +2006,7 @@ static int gfs2_setxattr(struct dentry *dentry, const char *name,
 	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
 	ret = gfs2_glock_nq(&gh);
 	if (ret == 0) {
-		ret = gfs2_rs_alloc(ip);
+		ret = gfs2_rsqa_alloc(ip);
 		if (ret == 0)
 			ret = generic_setxattr(dentry, name, data, size, flags);
 		gfs2_glock_dq(&gh);
@@ -2043,7 +2047,7 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name)
 	gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh);
 	ret = gfs2_glock_nq(&gh);
 	if (ret == 0) {
-		ret = gfs2_rs_alloc(ip);
+		ret = gfs2_rsqa_alloc(ip);
 		if (ret == 0)
 			ret = generic_removexattr(dentry, name);
 		gfs2_glock_dq(&gh);
@@ -2063,7 +2067,7 @@ static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	if (ret)
 		return ret;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh);
 	if (ret)
@@ -2090,7 +2094,7 @@ static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 
 	gfs2_glock_dq_uninit(&gh);
 out:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	return ret;
 }
 
@@ -2132,8 +2136,7 @@ const struct inode_operations gfs2_dir_iops = {
 
 const struct inode_operations gfs2_symlink_iops = {
 	.readlink = generic_readlink,
-	.follow_link = gfs2_follow_link,
-	.put_link = kfree_put_link,
+	.get_link = gfs2_get_link,
 	.permission = gfs2_permission,
 	.setattr = gfs2_setattr,
 	.getattr = gfs2_getattr,
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 536e7a625..0ff028c15 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -716,6 +716,9 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl,
 	}
 	trace_gfs2_log_flush(sdp, 1);
 
+	if (type == SHUTDOWN_FLUSH)
+		clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
+
 	sdp->sd_log_flush_head = sdp->sd_log_head;
 	sdp->sd_log_flush_wrapped = 0;
 	tr = sdp->sd_log_tr;
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index fb2b42cf4..f99f8e94d 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -41,7 +41,9 @@ static void gfs2_init_inode_once(void *foo)
 	inode_init_once(&ip->i_inode);
 	init_rwsem(&ip->i_rw_mutex);
 	INIT_LIST_HEAD(&ip->i_trunc_list);
-	ip->i_res = NULL;
+	ip->i_qadata = NULL;
+	memset(&ip->i_res, 0, sizeof(ip->i_res));
+	RB_CLEAR_NODE(&ip->i_res.rs_node);
 	ip->i_hash_cache = NULL;
 }
 
@@ -112,7 +114,8 @@ static int __init init_gfs2_fs(void)
 	gfs2_inode_cachep = kmem_cache_create("gfs2_inode",
 					      sizeof(struct gfs2_inode),
 					      0,  SLAB_RECLAIM_ACCOUNT|
-					          SLAB_MEM_SPREAD,
+						  SLAB_MEM_SPREAD|
+						  SLAB_ACCOUNT,
 					      gfs2_init_inode_once);
 	if (!gfs2_inode_cachep)
 		goto fail;
@@ -135,10 +138,10 @@ static int __init init_gfs2_fs(void)
 	if (!gfs2_quotad_cachep)
 		goto fail;
 
-	gfs2_rsrv_cachep = kmem_cache_create("gfs2_mblk",
-					     sizeof(struct gfs2_blkreserv),
+	gfs2_qadata_cachep = kmem_cache_create("gfs2_qadata",
+					       sizeof(struct gfs2_qadata),
 					       0, 0, NULL);
-	if (!gfs2_rsrv_cachep)
+	if (!gfs2_qadata_cachep)
 		goto fail;
 
 	register_shrinker(&gfs2_qd_shrinker);
@@ -193,8 +196,8 @@ fail_lru:
 	unregister_shrinker(&gfs2_qd_shrinker);
 	gfs2_glock_exit();
 
-	if (gfs2_rsrv_cachep)
-		kmem_cache_destroy(gfs2_rsrv_cachep);
+	if (gfs2_qadata_cachep)
+		kmem_cache_destroy(gfs2_qadata_cachep);
 
 	if (gfs2_quotad_cachep)
 		kmem_cache_destroy(gfs2_quotad_cachep);
@@ -238,7 +241,7 @@ static void __exit exit_gfs2_fs(void)
 	rcu_barrier();
 
 	mempool_destroy(gfs2_page_pool);
-	kmem_cache_destroy(gfs2_rsrv_cachep);
+	kmem_cache_destroy(gfs2_qadata_cachep);
 	kmem_cache_destroy(gfs2_quotad_cachep);
 	kmem_cache_destroy(gfs2_rgrpd_cachep);
 	kmem_cache_destroy(gfs2_bufdata_cachep);
diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c
index 0e1d4be58..e137d96f1 100644
--- a/fs/gfs2/meta_io.c
+++ b/fs/gfs2/meta_io.c
@@ -187,6 +187,52 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
 	return bh;
 }
 
+static void gfs2_meta_read_endio(struct bio *bio)
+{
+	struct bio_vec *bvec;
+	int i;
+	/* Installed as bi_end_io by gfs2_submit_bhs(); completes each bh. */
+	bio_for_each_segment_all(bvec, bio, i) {
+		struct page *page = bvec->bv_page;
+		struct buffer_head *bh = page_buffers(page);
+		unsigned int len = bvec->bv_len;
+		/* Follow b_this_page to the bh at the segment's offset. */
+		while (bh_offset(bh) < bvec->bv_offset)
+			bh = bh->b_this_page;
+		do {
+			struct buffer_head *next = bh->b_this_page;
+			len -= bh->b_size;	/* bytes of segment left */
+			bh->b_end_io(bh, !bio->bi_error);
+			bh = next;	/* fetched before b_end_io ran */
+		} while (bh && len);
+	}
+	bio_put(bio);
+}
+
+/*
+ * Submit num buffer heads as a single bio I/O request.  Only bhs[0] sets the
+ * start sector, so the bhs must be consecutive.  (See submit_bh_wbc.)
+ */
+static void gfs2_submit_bhs(int rw, struct buffer_head *bhs[], int num)
+{
+	struct buffer_head *bh = bhs[0];
+	struct bio *bio;
+	int i;
+	/* NOTE(review): bhs[0] was already read above when num == 0. */
+	if (!num)
+		return;
+	/* NOTE(review): bio_alloc()/bio_add_page() results go unchecked. */
+	bio = bio_alloc(GFP_NOIO, num);
+	bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
+	bio->bi_bdev = bh->b_bdev;
+	for (i = 0; i < num; i++) {
+		bh = bhs[i];
+		bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
+	}
+	bio->bi_end_io = gfs2_meta_read_endio;
+	submit_bio(rw, bio);
+}
+
 /**
  * gfs2_meta_read - Read a block from disk
  * @gl: The glock covering the block
@@ -198,10 +244,11 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno)
  */
 
 int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
-		   struct buffer_head **bhp)
+		   int rahead, struct buffer_head **bhp)
 {
 	struct gfs2_sbd *sdp = gl->gl_name.ln_sbd;
-	struct buffer_head *bh;
+	struct buffer_head *bh, *bhs[2];
+	int num = 0;
 
 	if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) {
 		*bhp = NULL;
@@ -213,14 +260,31 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
 	lock_buffer(bh);
 	if (buffer_uptodate(bh)) {
 		unlock_buffer(bh);
-		return 0;
+		flags &= ~DIO_WAIT;
+	} else {
+		bh->b_end_io = end_buffer_read_sync;
+		get_bh(bh);
+		bhs[num++] = bh;
 	}
-	bh->b_end_io = end_buffer_read_sync;
-	get_bh(bh);
-	submit_bh(READ_SYNC | REQ_META | REQ_PRIO, bh);
+
+	if (rahead) {
+		bh = gfs2_getbuf(gl, blkno + 1, CREATE);
+
+		lock_buffer(bh);
+		if (buffer_uptodate(bh)) {
+			unlock_buffer(bh);
+			brelse(bh);
+		} else {
+			bh->b_end_io = end_buffer_read_sync;
+			bhs[num++] = bh;
+		}
+	}
+
+	gfs2_submit_bhs(READ_SYNC | REQ_META | REQ_PRIO, bhs, num);
 	if (!(flags & DIO_WAIT))
 		return 0;
 
+	bh = *bhp;
 	wait_on_buffer(bh);
 	if (unlikely(!buffer_uptodate(bh))) {
 		struct gfs2_trans *tr = current->journal_info;
@@ -341,8 +405,12 @@ int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num,
 	struct buffer_head *bh;
 	int ret = 0;
 	u32 mtype = height ? GFS2_METATYPE_IN : GFS2_METATYPE_DI;
+	int rahead = 0;
+
+	if (num == ip->i_no_addr)
+		rahead = ip->i_rahead;
 
-	ret = gfs2_meta_read(gl, num, DIO_WAIT, &bh);
+	ret = gfs2_meta_read(gl, num, DIO_WAIT, rahead, &bh);
 	if (ret == 0 && gfs2_metatype_check(sdp, bh, mtype)) {
 		brelse(bh);
 		ret = -EIO;
diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h
index 8ca161567..c5086c8af 100644
--- a/fs/gfs2/meta_io.h
+++ b/fs/gfs2/meta_io.h
@@ -53,7 +53,7 @@ static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping)
 
 extern struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno);
 extern int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags,
-			  struct buffer_head **bhp);
+			  int rahead, struct buffer_head **bhp);
 extern int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh);
 extern struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno,
 				       int create);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index baab99b69..dbed9e243 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -352,6 +352,9 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent)
 	sdp->sd_jheightsize[x] = ~0;
 	gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT);
 
+	sdp->sd_max_dents_per_leaf = (sdp->sd_sb.sb_bsize -
+				      sizeof(struct gfs2_leaf)) /
+				     GFS2_MIN_DIRENT_SIZE;
 	return 0;
 }
 
@@ -910,8 +913,7 @@ fail_qc_i:
 fail_ut_i:
 	iput(sdp->sd_sc_inode);
 fail:
-	if (pn)
-		iput(pn);
+	iput(pn);
 	return error;
 }
 
@@ -1315,9 +1317,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags,
 		if ((flags ^ s->s_flags) & MS_RDONLY)
 			goto error_super;
 	} else {
-		char b[BDEVNAME_SIZE];
-
-		strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
+		snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
 		sb_set_blocksize(s, block_size(bdev));
 		error = fill_super(s, &args, flags & MS_SILENT ? 1 : 0);
 		if (error)
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 3a3122653..a39891344 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -388,7 +388,7 @@ static int bh_get(struct gfs2_quota_data *qd)
 	error = gfs2_block_map(&ip->i_inode, block, &bh_map, 0);
 	if (error)
 		goto fail;
-	error = gfs2_meta_read(ip->i_gl, bh_map.b_blocknr, DIO_WAIT, &bh);
+	error = gfs2_meta_read(ip->i_gl, bh_map.b_blocknr, DIO_WAIT, 0, &bh);
 	if (error)
 		goto fail;
 	error = -EIO;
@@ -527,37 +527,70 @@ static void qdsb_put(struct gfs2_quota_data *qd)
 	qd_put(qd);
 }
 
+/**
+ * gfs2_qa_alloc - allocate ip->i_qadata if it is not already set
+ * @ip: the inode whose quota allocations data structure is needed
+ * Returns: 0 (also when quotas are off, allocating nothing) or -ENOMEM
+ */
+int gfs2_qa_alloc(struct gfs2_inode *ip)
+{
+	int error = 0;
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+
+	if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
+		return 0;
+	/* Test and allocate with i_rw_mutex held for write. */
+	down_write(&ip->i_rw_mutex);
+	if (ip->i_qadata == NULL) {
+		ip->i_qadata = kmem_cache_zalloc(gfs2_qadata_cachep, GFP_NOFS);
+		if (!ip->i_qadata)
+			error = -ENOMEM;
+	}
+	up_write(&ip->i_rw_mutex);
+	return error;
+}
+
+void gfs2_qa_delete(struct gfs2_inode *ip, atomic_t *wcount)
+{
+	down_write(&ip->i_rw_mutex);	/* same lock as gfs2_qa_alloc() */
+	if (ip->i_qadata && ((wcount == NULL) || (atomic_read(wcount) <= 1))) {
+		kmem_cache_free(gfs2_qadata_cachep, ip->i_qadata);
+		ip->i_qadata = NULL;	/* gfs2_qa_alloc() tests for NULL */
+	}				/* else left in place */
+	up_write(&ip->i_rw_mutex);
+}
+
 int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_quota_data **qd;
 	int error;
 
-	if (ip->i_res == NULL) {
-		error = gfs2_rs_alloc(ip);
+	if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
+		return 0;
+
+	if (ip->i_qadata == NULL) {
+		error = gfs2_rsqa_alloc(ip);
 		if (error)
 			return error;
 	}
 
-	qd = ip->i_res->rs_qa_qd;
+	qd = ip->i_qadata->qa_qd;
 
-	if (gfs2_assert_warn(sdp, !ip->i_res->rs_qa_qd_num) ||
+	if (gfs2_assert_warn(sdp, !ip->i_qadata->qa_qd_num) ||
 	    gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags)))
 		return -EIO;
 
-	if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
-		return 0;
-
 	error = qdsb_get(sdp, make_kqid_uid(ip->i_inode.i_uid), qd);
 	if (error)
 		goto out;
-	ip->i_res->rs_qa_qd_num++;
+	ip->i_qadata->qa_qd_num++;
 	qd++;
 
 	error = qdsb_get(sdp, make_kqid_gid(ip->i_inode.i_gid), qd);
 	if (error)
 		goto out;
-	ip->i_res->rs_qa_qd_num++;
+	ip->i_qadata->qa_qd_num++;
 	qd++;
 
 	if (!uid_eq(uid, NO_UID_QUOTA_CHANGE) &&
@@ -565,7 +598,7 @@ int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
 		error = qdsb_get(sdp, make_kqid_uid(uid), qd);
 		if (error)
 			goto out;
-		ip->i_res->rs_qa_qd_num++;
+		ip->i_qadata->qa_qd_num++;
 		qd++;
 	}
 
@@ -574,7 +607,7 @@ int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
 		error = qdsb_get(sdp, make_kqid_gid(gid), qd);
 		if (error)
 			goto out;
-		ip->i_res->rs_qa_qd_num++;
+		ip->i_qadata->qa_qd_num++;
 		qd++;
 	}
 
@@ -587,17 +620,17 @@ out:
 void gfs2_quota_unhold(struct gfs2_inode *ip)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	unsigned int x;
+	u32 x;
 
-	if (ip->i_res == NULL)
+	if (ip->i_qadata == NULL)
 		return;
 	gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags));
 
-	for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) {
-		qdsb_put(ip->i_res->rs_qa_qd[x]);
-		ip->i_res->rs_qa_qd[x] = NULL;
+	for (x = 0; x < ip->i_qadata->qa_qd_num; x++) {
+		qdsb_put(ip->i_qadata->qa_qd[x]);
+		ip->i_qadata->qa_qd[x] = NULL;
 	}
-	ip->i_res->rs_qa_qd_num = 0;
+	ip->i_qadata->qa_qd_num = 0;
 }
 
 static int sort_qd(const void *a, const void *b)
@@ -843,7 +876,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
 	unsigned int nalloc = 0, blocks;
 	int error;
 
-	error = gfs2_rs_alloc(ip);
+	error = gfs2_rsqa_alloc(ip);
 	if (error)
 		return error;
 
@@ -855,7 +888,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda)
 		return -ENOMEM;
 
 	sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL);
-	mutex_lock(&ip->i_inode.i_mutex);
+	inode_lock(&ip->i_inode);
 	for (qx = 0; qx < num_qd; qx++) {
 		error = gfs2_glock_nq_init(qda[qx]->qd_gl, LM_ST_EXCLUSIVE,
 					   GL_NOCACHE, &ghs[qx]);
@@ -920,7 +953,7 @@ out_alloc:
 out:
 	while (qx--)
 		gfs2_glock_dq_uninit(&ghs[qx]);
-	mutex_unlock(&ip->i_inode.i_mutex);
+	inode_unlock(&ip->i_inode);
 	kfree(ghs);
 	gfs2_log_flush(ip->i_gl->gl_name.ln_sbd, ip->i_gl, NORMAL_FLUSH);
 	return error;
@@ -1003,23 +1036,23 @@ int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_quota_data *qd;
-	unsigned int x;
+	u32 x;
 	int error = 0;
 
-	error = gfs2_quota_hold(ip, uid, gid);
-	if (error)
-		return error;
-
 	if (capable(CAP_SYS_RESOURCE) ||
 	    sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
 		return 0;
 
-	sort(ip->i_res->rs_qa_qd, ip->i_res->rs_qa_qd_num,
+	error = gfs2_quota_hold(ip, uid, gid);
+	if (error)
+		return error;
+
+	sort(ip->i_qadata->qa_qd, ip->i_qadata->qa_qd_num,
 	     sizeof(struct gfs2_quota_data *), sort_qd, NULL);
 
-	for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) {
-		qd = ip->i_res->rs_qa_qd[x];
-		error = do_glock(qd, NO_FORCE, &ip->i_res->rs_qa_qd_ghs[x]);
+	for (x = 0; x < ip->i_qadata->qa_qd_num; x++) {
+		qd = ip->i_qadata->qa_qd[x];
+		error = do_glock(qd, NO_FORCE, &ip->i_qadata->qa_qd_ghs[x]);
 		if (error)
 			break;
 	}
@@ -1028,7 +1061,7 @@ int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
 		set_bit(GIF_QD_LOCKED, &ip->i_flags);
 	else {
 		while (x--)
-			gfs2_glock_dq_uninit(&ip->i_res->rs_qa_qd_ghs[x]);
+			gfs2_glock_dq_uninit(&ip->i_qadata->qa_qd_ghs[x]);
 		gfs2_quota_unhold(ip);
 	}
 
@@ -1076,20 +1109,20 @@ void gfs2_quota_unlock(struct gfs2_inode *ip)
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_quota_data *qda[4];
 	unsigned int count = 0;
-	unsigned int x;
+	u32 x;
 	int found;
 
 	if (!test_and_clear_bit(GIF_QD_LOCKED, &ip->i_flags))
 		goto out;
 
-	for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) {
+	for (x = 0; x < ip->i_qadata->qa_qd_num; x++) {
 		struct gfs2_quota_data *qd;
 		int sync;
 
-		qd = ip->i_res->rs_qa_qd[x];
+		qd = ip->i_qadata->qa_qd[x];
 		sync = need_sync(qd);
 
-		gfs2_glock_dq_uninit(&ip->i_res->rs_qa_qd_ghs[x]);
+		gfs2_glock_dq_uninit(&ip->i_qadata->qa_qd_ghs[x]);
 		if (!sync)
 			continue;
 
@@ -1158,7 +1191,7 @@ int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid,
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_quota_data *qd;
 	s64 value, warn, limit;
-	unsigned int x;
+	u32 x;
 	int error = 0;
 
 	ap->allowed = UINT_MAX; /* Assume we are permitted a whole lot */
@@ -1168,8 +1201,8 @@ int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid,
         if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON)
                 return 0;
 
-	for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) {
-		qd = ip->i_res->rs_qa_qd[x];
+	for (x = 0; x < ip->i_qadata->qa_qd_num; x++) {
+		qd = ip->i_qadata->qa_qd[x];
 
 		if (!(qid_eq(qd->qd_id, make_kqid_uid(uid)) ||
 		      qid_eq(qd->qd_id, make_kqid_gid(gid))))
@@ -1216,15 +1249,17 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change,
 		       kuid_t uid, kgid_t gid)
 {
 	struct gfs2_quota_data *qd;
-	unsigned int x;
+	u32 x;
+	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 
-	if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change))
+	if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON ||
+	    gfs2_assert_warn(sdp, change))
 		return;
 	if (ip->i_diskflags & GFS2_DIF_SYSTEM)
 		return;
 
-	for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) {
-		qd = ip->i_res->rs_qa_qd[x];
+	for (x = 0; x < ip->i_qadata->qa_qd_num; x++) {
+		qd = ip->i_qadata->qa_qd[x];
 
 		if (qid_eq(qd->qd_id, make_kqid_uid(uid)) ||
 		    qid_eq(qd->qd_id, make_kqid_gid(gid))) {
@@ -1635,11 +1670,11 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid,
 	if (error)
 		return error;
 
-	error = gfs2_rs_alloc(ip);
+	error = gfs2_rsqa_alloc(ip);
 	if (error)
 		goto out_put;
 
-	mutex_lock(&ip->i_inode.i_mutex);
+	inode_lock(&ip->i_inode);
 	error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE, 0, &q_gh);
 	if (error)
 		goto out_unlockput;
@@ -1704,7 +1739,7 @@ out_i:
 out_q:
 	gfs2_glock_dq_uninit(&q_gh);
 out_unlockput:
-	mutex_unlock(&ip->i_inode.i_mutex);
+	inode_unlock(&ip->i_inode);
 out_put:
 	qd_put(qd);
 	return error;
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index ad04b3aca..5e47c935a 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -18,6 +18,8 @@ struct gfs2_sbd;
 #define NO_UID_QUOTA_CHANGE INVALID_UID
 #define NO_GID_QUOTA_CHANGE INVALID_GID
 
+extern int gfs2_qa_alloc(struct gfs2_inode *ip);
+extern void gfs2_qa_delete(struct gfs2_inode *ip, atomic_t *wcount);
 extern int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid);
 extern void gfs2_quota_unhold(struct gfs2_inode *ip);
 
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index c134c0462..07c0265aa 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -596,27 +596,13 @@ void gfs2_free_clones(struct gfs2_rgrpd *rgd)
 }
 
 /**
- * gfs2_rs_alloc - make sure we have a reservation assigned to the inode
+ * gfs2_rsqa_alloc - make sure the inode has its quota allocations data
+ *                 structure, if necessary (i_res is embedded in the inode)
  * @ip: the inode for this reservation
  */
-int gfs2_rs_alloc(struct gfs2_inode *ip)
+int gfs2_rsqa_alloc(struct gfs2_inode *ip)
 {
-	int error = 0;
-
-	down_write(&ip->i_rw_mutex);
-	if (ip->i_res)
-		goto out;
-
-	ip->i_res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS);
-	if (!ip->i_res) {
-		error = -ENOMEM;
-		goto out;
-	}
-
-	RB_CLEAR_NODE(&ip->i_res->rs_node);
-out:
-	up_write(&ip->i_rw_mutex);
-	return error;
+	return gfs2_qa_alloc(ip);	/* 0 or -ENOMEM */
 }
 
 static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs)
@@ -678,21 +664,20 @@ void gfs2_rs_deltree(struct gfs2_blkreserv *rs)
 }
 
 /**
- * gfs2_rs_delete - delete a multi-block reservation
+ * gfs2_rsqa_delete - delete a multi-block reservation and quota allocation
  * @ip: The inode for this reservation
  * @wcount: The inode's write count, or NULL
  *
  */
-void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount)
+void gfs2_rsqa_delete(struct gfs2_inode *ip, atomic_t *wcount)
 {
 	down_write(&ip->i_rw_mutex);
-	if (ip->i_res && ((wcount == NULL) || (atomic_read(wcount) <= 1))) {
-		gfs2_rs_deltree(ip->i_res);
-		BUG_ON(ip->i_res->rs_free);
-		kmem_cache_free(gfs2_rsrv_cachep, ip->i_res);
-		ip->i_res = NULL;
+	if ((wcount == NULL) || (atomic_read(wcount) <= 1)) {
+		gfs2_rs_deltree(&ip->i_res);	/* embedded: nothing to free */
+		BUG_ON(ip->i_res.rs_free);
 	}
 	up_write(&ip->i_rw_mutex);
+	gfs2_qa_delete(ip, wcount);	/* takes i_rw_mutex itself */
 }
 
 /**
@@ -1158,7 +1143,7 @@ static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd)
 
 	for (x = 0; x < length; x++) {
 		bi = rgd->rd_bits + x;
-		error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, &bi->bi_bh);
+		error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, 0, &bi->bi_bh);
 		if (error)
 			goto fail;
 	}
@@ -1456,7 +1441,7 @@ static void rs_insert(struct gfs2_inode *ip)
 {
 	struct rb_node **newn, *parent = NULL;
 	int rc;
-	struct gfs2_blkreserv *rs = ip->i_res;
+	struct gfs2_blkreserv *rs = &ip->i_res;
 	struct gfs2_rgrpd *rgd = rs->rs_rbm.rgd;
 	u64 fsblock = gfs2_rbm_to_block(&rs->rs_rbm);
 
@@ -1503,7 +1488,7 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip,
 {
 	struct gfs2_rbm rbm = { .rgd = rgd, };
 	u64 goal;
-	struct gfs2_blkreserv *rs = ip->i_res;
+	struct gfs2_blkreserv *rs = &ip->i_res;
 	u32 extlen;
 	u32 free_blocks = rgd->rd_free_clone - rgd->rd_reserved;
 	int ret;
@@ -1574,7 +1559,7 @@ static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block,
 	}
 
 	if (n) {
-		while ((rs_cmp(block, length, rs) == 0) && (ip->i_res != rs)) {
+		while ((rs_cmp(block, length, rs) == 0) && (&ip->i_res != rs)) {
 			block = gfs2_rbm_to_block(&rs->rs_rbm) + rs->rs_free;
 			n = n->rb_right;
 			if (n == NULL)
@@ -1804,7 +1789,7 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip
 			continue;
 		*last_unlinked = block;
 
-		error = gfs2_glock_get(sdp, block, &gfs2_inode_glops, CREATE, &gl);
+		error = gfs2_glock_get(sdp, block, &gfs2_iopen_glops, CREATE, &gl);
 		if (error)
 			continue;
 
@@ -1984,7 +1969,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
 	struct gfs2_rgrpd *begin = NULL;
-	struct gfs2_blkreserv *rs = ip->i_res;
+	struct gfs2_blkreserv *rs = &ip->i_res;
 	int error = 0, rg_locked, flags = 0;
 	u64 last_unlinked = NO_BLOCK;
 	int loops = 0;
@@ -2113,7 +2098,7 @@ next_rgrp:
 
 void gfs2_inplace_release(struct gfs2_inode *ip)
 {
-	struct gfs2_blkreserv *rs = ip->i_res;
+	struct gfs2_blkreserv *rs = &ip->i_res;
 
 	if (rs->rs_rgd_gh.gh_gl)
 		gfs2_glock_dq_uninit(&rs->rs_rgd_gh);
@@ -2267,7 +2252,7 @@ static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd)
 static void gfs2_adjust_reservation(struct gfs2_inode *ip,
 				    const struct gfs2_rbm *rbm, unsigned len)
 {
-	struct gfs2_blkreserv *rs = ip->i_res;
+	struct gfs2_blkreserv *rs = &ip->i_res;
 	struct gfs2_rgrpd *rgd = rbm->rgd;
 	unsigned rlen;
 	u64 block;
@@ -2310,8 +2295,8 @@ static void gfs2_set_alloc_start(struct gfs2_rbm *rbm,
 {
 	u64 goal;
 
-	if (gfs2_rs_active(ip->i_res)) {
-		*rbm = ip->i_res->rs_rbm;
+	if (gfs2_rs_active(&ip->i_res)) {
+		*rbm = ip->i_res.rs_rbm;
 		return;
 	}
 
@@ -2365,7 +2350,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks,
 	gfs2_alloc_extent(&rbm, dinode, nblocks);
 	block = gfs2_rbm_to_block(&rbm);
 	rbm.rgd->rd_last_alloc = block - rbm.rgd->rd_data0;
-	if (gfs2_rs_active(ip->i_res))
+	if (gfs2_rs_active(&ip->i_res))
 		gfs2_adjust_reservation(ip, &rbm, *nblocks);
 	ndata = *nblocks;
 	if (dinode)
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index c0ab33fa3..66b51cf66 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -49,9 +49,9 @@ extern void gfs2_inplace_release(struct gfs2_inode *ip);
 extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n,
 			     bool dinode, u64 *generation);
 
-extern int gfs2_rs_alloc(struct gfs2_inode *ip);
+extern int gfs2_rsqa_alloc(struct gfs2_inode *ip);
 extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs);
-extern void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount);
+extern void gfs2_rsqa_delete(struct gfs2_inode *ip, atomic_t *wcount);
 extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta);
 extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen);
 extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip);
@@ -78,7 +78,7 @@ extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset,
 extern int gfs2_fitrim(struct file *filp, void __user *argp);
 
 /* This is how to tell if a reservation is in the rgrp tree: */
-static inline bool gfs2_rs_active(struct gfs2_blkreserv *rs)
+static inline bool gfs2_rs_active(const struct gfs2_blkreserv *rs)
 {
 	return rs && !RB_EMPTY_NODE(&rs->rs_node);
 }
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 894fb01a9..8f960a51a 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -83,6 +83,8 @@ enum {
 	Opt_nobarrier,
 	Opt_rgrplvb,
 	Opt_norgrplvb,
+	Opt_loccookie,
+	Opt_noloccookie,
 	Opt_error,
 };
 
@@ -122,6 +124,8 @@ static const match_table_t tokens = {
 	{Opt_nobarrier, "nobarrier"},
 	{Opt_rgrplvb, "rgrplvb"},
 	{Opt_norgrplvb, "norgrplvb"},
+	{Opt_loccookie, "loccookie"},
+	{Opt_noloccookie, "noloccookie"},
 	{Opt_error, NULL}
 };
 
@@ -278,6 +282,12 @@ int gfs2_mount_args(struct gfs2_args *args, char *options)
 		case Opt_norgrplvb:
 			args->ar_rgrplvb = 0;
 			break;
+		case Opt_loccookie:
+			args->ar_loccookie = 1;
+			break;
+		case Opt_noloccookie:
+			args->ar_loccookie = 0;
+			break;
 		case Opt_error:
 		default:
 			pr_warn("invalid mount option: %s\n", o);
@@ -556,6 +566,7 @@ void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
 	struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local;
 
 	gfs2_trans_add_meta(l_ip->i_gl, l_bh);
+	gfs2_trans_add_meta(m_ip->i_gl, m_bh);
 
 	spin_lock(&sdp->sd_statfs_spin);
 	m_sc->sc_total += l_sc->sc_total;
@@ -564,10 +575,8 @@ void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh,
 	memset(l_sc, 0, sizeof(struct gfs2_statfs_change));
 	memset(l_bh->b_data + sizeof(struct gfs2_dinode),
 	       0, sizeof(struct gfs2_statfs_change));
-	spin_unlock(&sdp->sd_statfs_spin);
-
-	gfs2_trans_add_meta(m_ip->i_gl, m_bh);
 	gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode));
+	spin_unlock(&sdp->sd_statfs_spin);
 }
 
 int gfs2_statfs_sync(struct super_block *sb, int type)
@@ -842,10 +851,6 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
 	gfs2_quota_sync(sdp->sd_vfs, 0);
 	gfs2_statfs_sync(sdp->sd_vfs, 0);
 
-	down_write(&sdp->sd_log_flush_lock);
-	clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
-	up_write(&sdp->sd_log_flush_lock);
-
 	gfs2_log_flush(sdp, NULL, SHUTDOWN_FLUSH);
 	wait_event(sdp->sd_reserving_log_wait, atomic_read(&sdp->sd_reserving_log) == 0);
 	gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks);
@@ -1419,6 +1424,8 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
 		seq_puts(s, ",demote_interface_used");
 	if (args->ar_rgrplvb)
 		seq_puts(s, ",rgrplvb");
+	if (args->ar_loccookie)
+		seq_puts(s, ",loccookie");
 	return 0;
 }
 
@@ -1512,6 +1519,7 @@ static void gfs2_evict_inode(struct inode *inode)
 	struct gfs2_sbd *sdp = sb->s_fs_info;
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_holder gh;
+	struct address_space *metamapping;
 	int error;
 
 	if (test_bit(GIF_FREE_VFS_INODE, &ip->i_flags)) {
@@ -1526,7 +1534,8 @@ static void gfs2_evict_inode(struct inode *inode)
 	error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh);
 	if (unlikely(error)) {
 		ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
-		gfs2_glock_dq_uninit(&ip->i_iopen_gh);
+		gfs2_glock_dq_wait(&ip->i_iopen_gh);
+		gfs2_holder_uninit(&ip->i_iopen_gh);
 		goto out;
 	}
 
@@ -1575,8 +1584,8 @@ static void gfs2_evict_inode(struct inode *inode)
 
 out_truncate:
 	gfs2_log_flush(sdp, ip->i_gl, NORMAL_FLUSH);
+	metamapping = gfs2_glock2aspace(ip->i_gl);
 	if (test_bit(GLF_DIRTY, &ip->i_gl->gl_flags)) {
-		struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl);
 		filemap_fdatawrite(metamapping);
 		filemap_fdatawait(metamapping);
 	}
@@ -1589,16 +1598,17 @@ out_truncate:
 		goto out_unlock;
 	/* Needs to be done before glock release & also in a transaction */
 	truncate_inode_pages(&inode->i_data, 0);
+	truncate_inode_pages(metamapping, 0);
 	gfs2_trans_end(sdp);
 
 out_unlock:
 	/* Error path for case 1 */
-	if (gfs2_rs_active(ip->i_res))
-		gfs2_rs_deltree(ip->i_res);
+	if (gfs2_rs_active(&ip->i_res))
+		gfs2_rs_deltree(&ip->i_res);
 
 	if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) {
 		ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
-		gfs2_glock_dq(&ip->i_iopen_gh);
+		gfs2_glock_dq_wait(&ip->i_iopen_gh);
 	}
 	gfs2_holder_uninit(&ip->i_iopen_gh);
 	gfs2_glock_dq_uninit(&gh);
@@ -1607,7 +1617,7 @@ out_unlock:
 out:
 	/* Case 3 starts here */
 	truncate_inode_pages_final(&inode->i_data);
-	gfs2_rs_delete(ip, NULL);
+	gfs2_rsqa_delete(ip, NULL);
 	gfs2_ordered_del_inode(ip);
 	clear_inode(inode);
 	gfs2_dir_hash_inval(ip);
@@ -1619,7 +1629,8 @@ out:
 	if (ip->i_iopen_gh.gh_gl) {
 		ip->i_iopen_gh.gh_gl->gl_object = NULL;
 		ip->i_iopen_gh.gh_flags |= GL_NOCACHE;
-		gfs2_glock_dq_uninit(&ip->i_iopen_gh);
+		gfs2_glock_dq_wait(&ip->i_iopen_gh);
+		gfs2_holder_uninit(&ip->i_iopen_gh);
 	}
 }
 
@@ -1632,7 +1643,9 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb)
 		ip->i_flags = 0;
 		ip->i_gl = NULL;
 		ip->i_rgd = NULL;
-		ip->i_res = NULL;
+		memset(&ip->i_res, 0, sizeof(ip->i_res));
+		RB_CLEAR_NODE(&ip->i_res.rs_node);
+		ip->i_rahead = 0;
 	}
 	return &ip->i_inode;
 }
diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c
index 86d2035ac..cf6458357 100644
--- a/fs/gfs2/util.c
+++ b/fs/gfs2/util.c
@@ -27,7 +27,7 @@ struct kmem_cache *gfs2_inode_cachep __read_mostly;
 struct kmem_cache *gfs2_bufdata_cachep __read_mostly;
 struct kmem_cache *gfs2_rgrpd_cachep __read_mostly;
 struct kmem_cache *gfs2_quotad_cachep __read_mostly;
-struct kmem_cache *gfs2_rsrv_cachep __read_mostly;
+struct kmem_cache *gfs2_qadata_cachep __read_mostly;
 mempool_t *gfs2_page_pool __read_mostly;
 
 void gfs2_assert_i(struct gfs2_sbd *sdp)
diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h
index cbdcbdf39..c81295f40 100644
--- a/fs/gfs2/util.h
+++ b/fs/gfs2/util.h
@@ -149,7 +149,7 @@ extern struct kmem_cache *gfs2_inode_cachep;
 extern struct kmem_cache *gfs2_bufdata_cachep;
 extern struct kmem_cache *gfs2_rgrpd_cachep;
 extern struct kmem_cache *gfs2_quotad_cachep;
-extern struct kmem_cache *gfs2_rsrv_cachep;
+extern struct kmem_cache *gfs2_qadata_cachep;
 extern mempool_t *gfs2_page_pool;
 
 static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt,
diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c
index 53ce76a37..e8dfb4740 100644
--- a/fs/gfs2/xattr.c
+++ b/fs/gfs2/xattr.c
@@ -119,7 +119,7 @@ static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
 	__be64 *eablk, *end;
 	int error;
 
-	error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &bh);
+	error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, 0, &bh);
 	if (error)
 		return error;
 
@@ -143,7 +143,7 @@ static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data)
 			break;
 		bn = be64_to_cpu(*eablk);
 
-		error = gfs2_meta_read(ip->i_gl, bn, DIO_WAIT, &eabh);
+		error = gfs2_meta_read(ip->i_gl, bn, DIO_WAIT, 0, &eabh);
 		if (error)
 			break;
 		error = ea_foreach_i(ip, eabh, ea_call, data);
@@ -477,7 +477,7 @@ static int gfs2_iter_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea,
 		return -ENOMEM;
 
 	for (x = 0; x < nptrs; x++) {
-		error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0,
+		error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0, 0,
 				       bh + x);
 		if (error) {
 			while (x--)
@@ -979,7 +979,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er,
 	if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) {
 		__be64 *end;
 
-		error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT,
+		error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, 0,
 				       &indbh);
 		if (error)
 			return error;
@@ -1237,56 +1237,6 @@ static int gfs2_xattr_set(const struct xattr_handler *handler,
 				size, flags, handler->flags);
 }
 
-
-static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip,
-				  struct gfs2_ea_header *ea, char *data)
-{
-	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
-	unsigned int amount = GFS2_EA_DATA_LEN(ea);
-	unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize);
-	int ret;
-
-	ret = gfs2_trans_begin(sdp, nptrs + RES_DINODE, 0);
-	if (ret)
-		return ret;
-
-	ret = gfs2_iter_unstuffed(ip, ea, data, NULL);
-	gfs2_trans_end(sdp);
-
-	return ret;
-}
-
-int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data)
-{
-	struct inode *inode = &ip->i_inode;
-	struct gfs2_sbd *sdp = GFS2_SB(inode);
-	struct gfs2_ea_location el;
-	int error;
-
-	error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el);
-	if (error)
-		return error;
-
-	if (GFS2_EA_IS_STUFFED(el.el_ea)) {
-		error = gfs2_trans_begin(sdp, RES_DINODE + RES_EATTR, 0);
-		if (error == 0) {
-			gfs2_trans_add_meta(ip->i_gl, el.el_bh);
-			memcpy(GFS2_EA2DATA(el.el_ea), data,
-			       GFS2_EA_DATA_LEN(el.el_ea));
-		}
-	} else {
-		error = ea_acl_chmod_unstuffed(ip, el.el_ea, data);
-	}
-
-	brelse(el.el_bh);
-	if (error)
-		return error;
-
-	error = gfs2_setattr_simple(inode, attr);
-	gfs2_trans_end(sdp);
-	return error;
-}
-
 static int ea_dealloc_indirect(struct gfs2_inode *ip)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
@@ -1306,7 +1256,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip)
 
 	memset(&rlist, 0, sizeof(struct gfs2_rgrp_list));
 
-	error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &indbh);
+	error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, 0, &indbh);
 	if (error)
 		return error;
 
diff --git a/fs/gfs2/xattr.h b/fs/gfs2/xattr.h
index d392f8358..2d887c88e 100644
--- a/fs/gfs2/xattr.h
+++ b/fs/gfs2/xattr.h
@@ -62,6 +62,5 @@ extern int gfs2_ea_dealloc(struct gfs2_inode *ip);
 /* Exported to acl.c */
 
 extern int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **data);
-extern int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data);
 
 #endif /* __EATTR_DOT_H__ */
diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c
index db458ee3a..1eb5d415d 100644
--- a/fs/hfs/catalog.c
+++ b/fs/hfs/catalog.c
@@ -214,7 +214,7 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, struct qstr *str)
 {
 	struct super_block *sb;
 	struct hfs_find_data fd;
-	struct list_head *pos;
+	struct hfs_readdir_data *rd;
 	int res, type;
 
 	hfs_dbg(CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid);
@@ -240,9 +240,7 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, struct qstr *str)
 		}
 	}
 
-	list_for_each(pos, &HFS_I(dir)->open_dir_list) {
-		struct hfs_readdir_data *rd =
-			list_entry(pos, struct hfs_readdir_data, list);
+	list_for_each_entry(rd, &HFS_I(dir)->open_dir_list, list) {
 		if (fd.tree->keycmp(fd.search_key, (void *)&rd->key) < 0)
 			rd->file->f_pos--;
 	}
diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c
index 70788e038..e9f2b855f 100644
--- a/fs/hfs/dir.c
+++ b/fs/hfs/dir.c
@@ -173,9 +173,9 @@ static int hfs_dir_release(struct inode *inode, struct file *file)
 {
 	struct hfs_readdir_data *rd = file->private_data;
 	if (rd) {
-		mutex_lock(&inode->i_mutex);
+		inode_lock(inode);
 		list_del(&rd->list);
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 		kfree(rd);
 	}
 	return 0;
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index b99ebddb1..6686bf39a 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -570,13 +570,13 @@ static int hfs_file_release(struct inode *inode, struct file *file)
 	if (HFS_IS_RSRC(inode))
 		inode = HFS_I(inode)->rsrc_inode;
 	if (atomic_dec_and_test(&HFS_I(inode)->opencnt)) {
-		mutex_lock(&inode->i_mutex);
+		inode_lock(inode);
 		hfs_file_truncate(inode);
 		//if (inode->i_flags & S_DEAD) {
 		//	hfs_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL);
 		//	hfs_delete_inode(inode);
 		//}
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 	}
 	return 0;
 }
@@ -656,7 +656,7 @@ static int hfs_file_fsync(struct file *filp, loff_t start, loff_t end,
 	ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
 	if (ret)
 		return ret;
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	/* sync the inode to buffers */
 	ret = write_inode_now(inode, 0);
@@ -668,7 +668,7 @@ static int hfs_file_fsync(struct file *filp, loff_t start, loff_t end,
 	err = sync_blockdev(sb->s_bdev);
 	if (!ret)
 		ret = err;
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	return ret;
 }
 
diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c
index aa3f0d6d0..a3ec3ae7d 100644
--- a/fs/hfs/mdb.c
+++ b/fs/hfs/mdb.c
@@ -166,7 +166,7 @@ int hfs_mdb_get(struct super_block *sb)
 		pr_warn("continuing without an alternate MDB\n");
 	}
 
-	HFS_SB(sb)->bitmap = (__be32 *)__get_free_pages(GFP_KERNEL, PAGE_SIZE < 8192 ? 1 : 0);
+	HFS_SB(sb)->bitmap = kmalloc(8192, GFP_KERNEL);
 	if (!HFS_SB(sb)->bitmap)
 		goto out;
 
@@ -360,7 +360,7 @@ void hfs_mdb_put(struct super_block *sb)
 	unload_nls(HFS_SB(sb)->nls_io);
 	unload_nls(HFS_SB(sb)->nls_disk);
 
-	free_pages((unsigned long)HFS_SB(sb)->bitmap, PAGE_SIZE < 8192 ? 1 : 0);
+	kfree(HFS_SB(sb)->bitmap);
 	kfree(HFS_SB(sb));
 	sb->s_fs_info = NULL;
 }
diff --git a/fs/hfs/super.c b/fs/hfs/super.c
index 4574fdd3d..1ca95c232 100644
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -483,8 +483,8 @@ static int __init init_hfs_fs(void)
 	int err;
 
 	hfs_inode_cachep = kmem_cache_create("hfs_inode_cache",
-		sizeof(struct hfs_inode_info), 0, SLAB_HWCACHE_ALIGN,
-		hfs_init_once);
+		sizeof(struct hfs_inode_info), 0,
+		SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, hfs_init_once);
 	if (!hfs_inode_cachep)
 		return -ENOMEM;
 	err = register_filesystem(&hfs_fs_type);
diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c
index d0f39dcbb..a4e867e08 100644
--- a/fs/hfsplus/dir.c
+++ b/fs/hfsplus/dir.c
@@ -284,9 +284,9 @@ static int hfsplus_dir_release(struct inode *inode, struct file *file)
 {
 	struct hfsplus_readdir_data *rd = file->private_data;
 	if (rd) {
-		mutex_lock(&inode->i_mutex);
+		inode_lock(inode);
 		list_del(&rd->list);
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 		kfree(rd);
 	}
 	return 0;
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 6dd107d74..1a6394cdb 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -229,14 +229,14 @@ static int hfsplus_file_release(struct inode *inode, struct file *file)
 	if (HFSPLUS_IS_RSRC(inode))
 		inode = HFSPLUS_I(inode)->rsrc_inode;
 	if (atomic_dec_and_test(&HFSPLUS_I(inode)->opencnt)) {
-		mutex_lock(&inode->i_mutex);
+		inode_lock(inode);
 		hfsplus_file_truncate(inode);
 		if (inode->i_flags & S_DEAD) {
 			hfsplus_delete_cat(inode->i_ino,
 					   HFSPLUS_SB(sb)->hidden_dir, NULL);
 			hfsplus_delete_inode(inode);
 		}
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 	}
 	return 0;
 }
@@ -286,7 +286,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
 	error = filemap_write_and_wait_range(inode->i_mapping, start, end);
 	if (error)
 		return error;
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	/*
 	 * Sync inode metadata into the catalog and extent trees.
@@ -327,7 +327,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end,
 	if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags))
 		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
 
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 
 	return error;
 }
@@ -403,6 +403,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, umode_t mode)
 	} else if (S_ISLNK(inode->i_mode)) {
 		sbi->file_count++;
 		inode->i_op = &page_symlink_inode_operations;
+		inode_nohighmem(inode);
 		inode->i_mapping->a_ops = &hfsplus_aops;
 		hip->clump_blocks = 1;
 	} else
@@ -526,6 +527,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd)
 			inode->i_mapping->a_ops = &hfsplus_aops;
 		} else if (S_ISLNK(inode->i_mode)) {
 			inode->i_op = &page_symlink_inode_operations;
+			inode_nohighmem(inode);
 			inode->i_mapping->a_ops = &hfsplus_aops;
 		} else {
 			init_special_inode(inode, inode->i_mode,
diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c
index 0624ce4e0..32a49e292 100644
--- a/fs/hfsplus/ioctl.c
+++ b/fs/hfsplus/ioctl.c
@@ -93,7 +93,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
 		goto out_drop_write;
 	}
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	if ((flags & (FS_IMMUTABLE_FL|FS_APPEND_FL)) ||
 	    inode->i_flags & (S_IMMUTABLE|S_APPEND)) {
@@ -126,7 +126,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags)
 	mark_inode_dirty(inode);
 
 out_unlock_inode:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 out_drop_write:
 	mnt_drop_write_file(file);
 out:
diff --git a/fs/hfsplus/posix_acl.c b/fs/hfsplus/posix_acl.c
index df0c9af68..afb33eda6 100644
--- a/fs/hfsplus/posix_acl.c
+++ b/fs/hfsplus/posix_acl.c
@@ -21,10 +21,10 @@ struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type)
 
 	switch (type) {
 	case ACL_TYPE_ACCESS:
-		xattr_name = POSIX_ACL_XATTR_ACCESS;
+		xattr_name = XATTR_NAME_POSIX_ACL_ACCESS;
 		break;
 	case ACL_TYPE_DEFAULT:
-		xattr_name = POSIX_ACL_XATTR_DEFAULT;
+		xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT;
 		break;
 	default:
 		return ERR_PTR(-EINVAL);
@@ -66,7 +66,7 @@ int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl,
 
 	switch (type) {
 	case ACL_TYPE_ACCESS:
-		xattr_name = POSIX_ACL_XATTR_ACCESS;
+		xattr_name = XATTR_NAME_POSIX_ACL_ACCESS;
 		if (acl) {
 			err = posix_acl_equiv_mode(acl, &inode->i_mode);
 			if (err < 0)
@@ -76,7 +76,7 @@ int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl,
 		break;
 
 	case ACL_TYPE_DEFAULT:
-		xattr_name = POSIX_ACL_XATTR_DEFAULT;
+		xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT;
 		if (!S_ISDIR(inode->i_mode))
 			return acl ? -EACCES : 0;
 		break;
diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 7302d96ae..5d54490a1 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -663,7 +663,7 @@ static int __init init_hfsplus_fs(void)
 	int err;
 
 	hfsplus_inode_cachep = kmem_cache_create("hfsplus_icache",
-		HFSPLUS_INODE_SIZE, 0, SLAB_HWCACHE_ALIGN,
+		HFSPLUS_INODE_SIZE, 0, SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT,
 		hfsplus_init_once);
 	if (!hfsplus_inode_cachep)
 		return -ENOMEM;
diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c
index e41a010cd..ab01530b4 100644
--- a/fs/hfsplus/xattr.c
+++ b/fs/hfsplus/xattr.c
@@ -431,9 +431,6 @@ int hfsplus_setxattr(struct dentry *dentry, const char *name,
 	char *xattr_name;
 	int res;
 
-	if (!strcmp(name, ""))
-		return -EINVAL;
-
 	xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1,
 		GFP_KERNEL);
 	if (!xattr_name)
@@ -589,9 +586,6 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name,
 	int res;
 	char *xattr_name;
 
-	if (!strcmp(name, ""))
-		return -EINVAL;
-
 	xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1,
 			     GFP_KERNEL);
 	if (!xattr_name)
@@ -853,9 +847,6 @@ static int hfsplus_osx_getxattr(const struct xattr_handler *handler,
 				struct dentry *dentry, const char *name,
 				void *buffer, size_t size)
 {
-	if (!strcmp(name, ""))
-		return -EINVAL;
-
 	/*
 	 * Don't allow retrieving properly prefixed attributes
 	 * by prepending them with "osx."
@@ -876,9 +867,6 @@ static int hfsplus_osx_setxattr(const struct xattr_handler *handler,
 				struct dentry *dentry, const char *name,
 				const void *buffer, size_t size, int flags)
 {
-	if (!strcmp(name, ""))
-		return -EINVAL;
-
 	/*
 	 * Don't allow setting properly prefixed attributes
 	 * by prepending them with "osx."
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 5a7b3229b..d1abbee28 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -223,7 +223,7 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb)
 {
 	struct hostfs_inode_info *hi;
 
-	hi = kmalloc(sizeof(*hi), GFP_KERNEL);
+	hi = kmalloc(sizeof(*hi), GFP_KERNEL_ACCOUNT);
 	if (hi == NULL)
 		return NULL;
 	hi->fd = -1;
@@ -378,9 +378,9 @@ static int hostfs_fsync(struct file *file, loff_t start, loff_t end,
 	if (ret)
 		return ret;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 	ret = fsync_file(HOSTFS_I(inode)->fd, datasync);
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 
 	return ret;
 }
@@ -890,9 +890,14 @@ static const struct inode_operations hostfs_dir_iops = {
 	.setattr	= hostfs_setattr,
 };
 
-static const char *hostfs_follow_link(struct dentry *dentry, void **cookie)
+static const char *hostfs_get_link(struct dentry *dentry,
+				   struct inode *inode,
+				   struct delayed_call *done)
 {
-	char *link = __getname();
+	char *link;
+	if (!dentry)
+		return ERR_PTR(-ECHILD);
+	link = kmalloc(PATH_MAX, GFP_KERNEL);
 	if (link) {
 		char *path = dentry_name(dentry);
 		int err = -ENOMEM;
@@ -903,25 +908,20 @@ static const char *hostfs_follow_link(struct dentry *dentry, void **cookie)
 			__putname(path);
 		}
 		if (err < 0) {
-			__putname(link);
+			kfree(link);
 			return ERR_PTR(err);
 		}
 	} else {
 		return ERR_PTR(-ENOMEM);
 	}
 
-	return *cookie = link;
-}
-
-static void hostfs_put_link(struct inode *unused, void *cookie)
-{
-	__putname(cookie);
+	set_delayed_call(done, kfree_link, link);
+	return link;
 }
 
 static const struct inode_operations hostfs_link_iops = {
 	.readlink	= generic_readlink,
-	.follow_link	= hostfs_follow_link,
-	.put_link	= hostfs_put_link,
+	.get_link	= hostfs_get_link,
 };
 
 static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c
index dc540bfce..e57a53c13 100644
--- a/fs/hpfs/dir.c
+++ b/fs/hpfs/dir.c
@@ -33,7 +33,7 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence)
 	if (whence == SEEK_DATA || whence == SEEK_HOLE)
 		return -EINVAL;
 
-	mutex_lock(&i->i_mutex);
+	inode_lock(i);
 	hpfs_lock(s);
 
 	/*pr_info("dir lseek\n");*/
@@ -48,12 +48,12 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence)
 ok:
 	filp->f_pos = new_off;
 	hpfs_unlock(s);
-	mutex_unlock(&i->i_mutex);
+	inode_unlock(i);
 	return new_off;
 fail:
 	/*pr_warn("illegal lseek: %016llx\n", new_off);*/
 	hpfs_unlock(s);
-	mutex_unlock(&i->i_mutex);
+	inode_unlock(i);
 	return -ESPIPE;
 }
 
diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c
index 933c73780..1f3c6d762 100644
--- a/fs/hpfs/inode.c
+++ b/fs/hpfs/inode.c
@@ -77,6 +77,7 @@ void hpfs_read_inode(struct inode *i)
 			kfree(ea);
 			i->i_mode = S_IFLNK | 0777;
 			i->i_op = &page_symlink_inode_operations;
+			inode_nohighmem(i);
 			i->i_data.a_ops = &hpfs_symlink_aops;
 			set_nlink(i, 1);
 			i->i_size = ea_size;
diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c
index a69bbc1e8..a13692918 100644
--- a/fs/hpfs/map.c
+++ b/fs/hpfs/map.c
@@ -133,7 +133,7 @@ __le32 *hpfs_load_bitmap_directory(struct super_block *s, secno bmp)
 void hpfs_load_hotfix_map(struct super_block *s, struct hpfs_spare_block *spareblock)
 {
 	struct quad_buffer_head qbh;
-	u32 *directory;
+	__le32 *directory;
 	u32 n_hotfixes, n_used_hotfixes;
 	unsigned i;
 
diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c
index bffb908ac..bb8d67e27 100644
--- a/fs/hpfs/namei.c
+++ b/fs/hpfs/namei.c
@@ -332,6 +332,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy
 	result->i_blocks = 1;
 	set_nlink(result, 1);
 	result->i_size = strlen(symlink);
+	inode_nohighmem(result);
 	result->i_op = &page_symlink_inode_operations;
 	result->i_data.a_ops = &hpfs_symlink_aops;
 
@@ -475,7 +476,7 @@ out:
 
 static int hpfs_symlink_readpage(struct file *file, struct page *page)
 {
-	char *link = kmap(page);
+	char *link = page_address(page);
 	struct inode *i = page->mapping->host;
 	struct fnode *fnode;
 	struct buffer_head *bh;
@@ -491,14 +492,12 @@ static int hpfs_symlink_readpage(struct file *file, struct page *page)
 		goto fail;
 	hpfs_unlock(i->i_sb);
 	SetPageUptodate(page);
-	kunmap(page);
 	unlock_page(page);
 	return 0;
 
 fail:
 	hpfs_unlock(i->i_sb);
 	SetPageError(page);
-	kunmap(page);
 	unlock_page(page);
 	return err;
 }
diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c
index a56159189..458cf4630 100644
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -261,7 +261,7 @@ static int init_inodecache(void)
 	hpfs_inode_cachep = kmem_cache_create("hpfs_inode_cache",
 					     sizeof(struct hpfs_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (hpfs_inode_cachep == NULL)
 		return -ENOMEM;
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 595ebdb41..e1f465a38 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -4,11 +4,11 @@
  * Nadia Yvette Chambers, 2002
  *
  * Copyright (C) 2002 Linus Torvalds.
+ * License: GPL
  */
 
 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
 
-#include <linux/module.h>
 #include <linux/thread_info.h>
 #include <asm/current.h>
 #include <linux/sched.h>		/* remove ASAP */
@@ -141,7 +141,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 
 	vma_len = (loff_t)(vma->vm_end - vma->vm_start);
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 	file_accessed(file);
 
 	ret = -ENOMEM;
@@ -157,7 +157,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
 	if (vma->vm_flags & VM_WRITE && inode->i_size < len)
 		inode->i_size = len;
 out:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 
 	return ret;
 }
@@ -324,11 +324,48 @@ static void remove_huge_page(struct page *page)
 	delete_from_page_cache(page);
 }
 
+static void
+hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
+{
+	struct vm_area_struct *vma;
+
+	/*
+	 * Unmap file offsets start..end (in pages) from each vma the tree
+	 * walk yields; end == 0 means no upper bound (walk to ULONG_MAX).
+	 */
+	vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
+		unsigned long v_offset;
+		unsigned long v_end;
+
+		/*
+		 * Can the expression below overflow on 32-bit arches?
+		 * No, because the interval tree returns us only those vmas
+		 * which overlap the range being unmapped, which begins at
+		 * start, and no vma on a 32-bit arch can span beyond 4GB.
+		 */
+		if (vma->vm_pgoff < start)
+			v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
+		else
+			v_offset = 0;
+
+		if (!end)
+			v_end = vma->vm_end;
+		else {
+			v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT)
+							+ vma->vm_start;
+			if (v_end > vma->vm_end)
+				v_end = vma->vm_end;
+		}
+
+		unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end,
+									NULL);
+	}
+}
 
 /*
  * remove_inode_hugepages handles two distinct cases: truncation and hole
  * punch.  There are subtle differences in operation for each case.
-
+ *
  * truncation is indicated by end of range being LLONG_MAX
  *	In this case, we first scan the range and release found pages.
  *	After releasing pages, hugetlb_unreserve_pages cleans up region/reserv
@@ -379,6 +416,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 
 		for (i = 0; i < pagevec_count(&pvec); ++i) {
 			struct page *page = pvec.pages[i];
+			bool rsv_on_error;
 			u32 hash;
 
 			/*
@@ -395,37 +433,43 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart,
 							mapping, next, 0);
 			mutex_lock(&hugetlb_fault_mutex_table[hash]);
 
-			lock_page(page);
-			if (likely(!page_mapped(page))) {
-				bool rsv_on_error = !PagePrivate(page);
-				/*
-				 * We must free the huge page and remove
-				 * from page cache (remove_huge_page) BEFORE
-				 * removing the region/reserve map
-				 * (hugetlb_unreserve_pages).  In rare out
-				 * of memory conditions, removal of the
-				 * region/reserve map could fail.  Before
-				 * free'ing the page, note PagePrivate which
-				 * is used in case of error.
-				 */
-				remove_huge_page(page);
-				freed++;
-				if (!truncate_op) {
-					if (unlikely(hugetlb_unreserve_pages(
-							inode, next,
-							next + 1, 1)))
-						hugetlb_fix_reserve_counts(
-							inode, rsv_on_error);
-				}
-			} else {
-				/*
-				 * If page is mapped, it was faulted in after
-				 * being unmapped.  It indicates a race between
-				 * hole punch and page fault.  Do nothing in
-				 * this case.  Getting here in a truncate
-				 * operation is a bug.
-				 */
+			/*
+			 * If page is mapped, it was faulted in after being
+			 * unmapped in caller.  Unmap (again) now after taking
+			 * the fault mutex.  The mutex will prevent faults
+			 * until we finish removing the page.
+			 *
+			 * This race can only happen in the hole punch case.
+			 * Getting here in a truncate operation is a bug.
+			 */
+			if (unlikely(page_mapped(page))) {
 				BUG_ON(truncate_op);
+
+				i_mmap_lock_write(mapping);
+				hugetlb_vmdelete_list(&mapping->i_mmap,
+					next * pages_per_huge_page(h),
+					(next + 1) * pages_per_huge_page(h));
+				i_mmap_unlock_write(mapping);
+			}
+
+			lock_page(page);
+			/*
+			 * We must free the huge page and remove from page
+			 * cache (remove_huge_page) BEFORE removing the
+			 * region/reserve map (hugetlb_unreserve_pages).  In
+			 * rare out of memory conditions, removal of the
+			 * region/reserve map could fail.  Before free'ing
+			 * the page, note PagePrivate which is used in case
+			 * of error.
+			 */
+			rsv_on_error = !PagePrivate(page);
+			remove_huge_page(page);
+			freed++;
+			if (!truncate_op) {
+				if (unlikely(hugetlb_unreserve_pages(inode,
+							next, next + 1, 1)))
+					hugetlb_fix_reserve_counts(inode,
+								rsv_on_error);
 			}
 
 			unlock_page(page);
@@ -452,44 +496,6 @@ static void hugetlbfs_evict_inode(struct inode *inode)
 	clear_inode(inode);
 }
 
-static inline void
-hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end)
-{
-	struct vm_area_struct *vma;
-
-	/*
-	 * end == 0 indicates that the entire range after
-	 * start should be unmapped.
-	 */
-	vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
-		unsigned long v_offset;
-		unsigned long v_end;
-
-		/*
-		 * Can the expression below overflow on 32-bit arches?
-		 * No, because the interval tree returns us only those vmas
-		 * which overlap the truncated area starting at pgoff,
-		 * and no vma on a 32-bit arch can span beyond the 4GB.
-		 */
-		if (vma->vm_pgoff < start)
-			v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT;
-		else
-			v_offset = 0;
-
-		if (!end)
-			v_end = vma->vm_end;
-		else {
-			v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT)
-							+ vma->vm_start;
-			if (v_end > vma->vm_end)
-				v_end = vma->vm_end;
-		}
-
-		unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end,
-									NULL);
-	}
-}
-
 static int hugetlb_vmtruncate(struct inode *inode, loff_t offset)
 {
 	pgoff_t pgoff;
@@ -524,7 +530,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 	if (hole_end > hole_start) {
 		struct address_space *mapping = inode->i_mapping;
 
-		mutex_lock(&inode->i_mutex);
+		inode_lock(inode);
 		i_mmap_lock_write(mapping);
 		if (!RB_EMPTY_ROOT(&mapping->i_mmap))
 			hugetlb_vmdelete_list(&mapping->i_mmap,
@@ -532,7 +538,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
 						hole_end  >> PAGE_SHIFT);
 		i_mmap_unlock_write(mapping);
 		remove_inode_hugepages(inode, hole_start, hole_end);
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 	}
 
 	return 0;
@@ -566,7 +572,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
 	start = offset >> hpage_shift;
 	end = (offset + len + hpage_size - 1) >> hpage_shift;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	/* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */
 	error = inode_newsize_ok(inode, offset + len);
@@ -653,7 +659,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset,
 		i_size_write(inode, offset + len);
 	inode->i_ctime = CURRENT_TIME;
 out:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	return error;
 }
 
@@ -711,7 +717,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb,
 /*
  * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never
  * be taken from reclaim -- unlike regular filesystems. This needs an
- * annotation because huge_pmd_share() does an allocation under
+ * annotation because huge_pmd_share() does an allocation under hugetlb's
  * i_mmap_rwsem.
  */
 static struct lock_class_key hugetlbfs_i_mmap_rwsem_key;
@@ -741,7 +747,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
 		/*
 		 * The policy is initialized here even if we are creating a
 		 * private inode because initialization simply creates an
-		 * an empty rb tree and calls spin_lock_init(), later when we
+		 * empty rb tree and calls rwlock_init(), later when we
 		 * call mpol_free_shared_policy() it will just return because
 		 * the rb tree will still be empty.
 		 */
@@ -763,6 +769,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb,
 			break;
 		case S_IFLNK:
 			inode->i_op = &page_symlink_inode_operations;
+			inode_nohighmem(inode);
 			break;
 		}
 		lockdep_annotate_inode_mutex_key(inode);
@@ -1204,7 +1211,6 @@ static struct file_system_type hugetlbfs_fs_type = {
 	.mount		= hugetlbfs_mount,
 	.kill_sb	= kill_litter_super,
 };
-MODULE_ALIAS_FS("hugetlbfs");
 
 static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE];
 
@@ -1324,7 +1330,7 @@ static int __init init_hugetlbfs_fs(void)
 	error = -ENOMEM;
 	hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
 					sizeof(struct hugetlbfs_inode_info),
-					0, 0, init_once);
+					0, SLAB_ACCOUNT, init_once);
 	if (hugetlbfs_inode_cachep == NULL)
 		goto out2;
 
@@ -1358,26 +1364,4 @@ static int __init init_hugetlbfs_fs(void)
  out2:
 	return error;
 }
-
-static void __exit exit_hugetlbfs_fs(void)
-{
-	struct hstate *h;
-	int i;
-
-
-	/*
-	 * Make sure all delayed rcu free inodes are flushed before we
-	 * destroy cache.
-	 */
-	rcu_barrier();
-	kmem_cache_destroy(hugetlbfs_inode_cachep);
-	i = 0;
-	for_each_hstate(h)
-		kern_unmount(hugetlbfs_vfsmount[i++]);
-	unregister_filesystem(&hugetlbfs_fs_type);
-}
-
-module_init(init_hugetlbfs_fs)
-module_exit(exit_hugetlbfs_fs)
-
-MODULE_LICENSE("GPL");
+fs_initcall(init_hugetlbfs_fs)
diff --git a/fs/inode.c b/fs/inode.c
index 1be5f9003..69b8b526c 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -154,6 +154,12 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
 	inode->i_rdev = 0;
 	inode->dirtied_when = 0;
 
+#ifdef CONFIG_CGROUP_WRITEBACK
+	inode->i_wb_frn_winner = 0;
+	inode->i_wb_frn_avg_time = 0;
+	inode->i_wb_frn_history = 0;
+#endif
+
 	if (security_inode_alloc(inode))
 		goto out;
 	spin_lock_init(&inode->i_lock);
@@ -225,7 +231,7 @@ void __destroy_inode(struct inode *inode)
 	inode_detach_wb(inode);
 	security_inode_free(inode);
 	fsnotify_inode_delete(inode);
-	locks_free_lock_context(inode->i_flctx);
+	locks_free_lock_context(inode);
 	if (!inode->i_nlink) {
 		WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0);
 		atomic_long_dec(&inode->i_sb->s_remove_count);
@@ -495,7 +501,7 @@ void clear_inode(struct inode *inode)
 	 */
 	spin_lock_irq(&inode->i_data.tree_lock);
 	BUG_ON(inode->i_data.nrpages);
-	BUG_ON(inode->i_data.nrshadows);
+	BUG_ON(inode->i_data.nrexceptional);
 	spin_unlock_irq(&inode->i_data.tree_lock);
 	BUG_ON(!list_empty(&inode->i_data.private_list));
 	BUG_ON(!(inode->i_state & I_FREEING));
@@ -966,9 +972,9 @@ void lock_two_nondirectories(struct inode *inode1, struct inode *inode2)
 		swap(inode1, inode2);
 
 	if (inode1 && !S_ISDIR(inode1->i_mode))
-		mutex_lock(&inode1->i_mutex);
+		inode_lock(inode1);
 	if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
-		mutex_lock_nested(&inode2->i_mutex, I_MUTEX_NONDIR2);
+		inode_lock_nested(inode2, I_MUTEX_NONDIR2);
 }
 EXPORT_SYMBOL(lock_two_nondirectories);
 
@@ -980,9 +986,9 @@ EXPORT_SYMBOL(lock_two_nondirectories);
 void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2)
 {
 	if (inode1 && !S_ISDIR(inode1->i_mode))
-		mutex_unlock(&inode1->i_mutex);
+		inode_unlock(inode1);
 	if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1)
-		mutex_unlock(&inode2->i_mutex);
+		inode_unlock(inode2);
 }
 EXPORT_SYMBOL(unlock_two_nondirectories);
 
@@ -1883,7 +1889,7 @@ void __init inode_init(void)
 					 sizeof(struct inode),
 					 0,
 					 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
-					 SLAB_MEM_SPREAD),
+					 SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					 init_once);
 
 	/* Hash may have been set up in inode_init_early */
@@ -2028,3 +2034,9 @@ void inode_set_flags(struct inode *inode, unsigned int flags,
 				  new_flags) != old_flags));
 }
 EXPORT_SYMBOL(inode_set_flags);
+
+void inode_nohighmem(struct inode *inode)
+{
+	mapping_set_gfp_mask(inode->i_mapping, GFP_USER);
+}
+EXPORT_SYMBOL(inode_nohighmem);
diff --git a/fs/internal.h b/fs/internal.h
index 71859c4d0..b71deeece 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -55,7 +55,7 @@ extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
 /*
  * namespace.c
  */
-extern int copy_mount_options(const void __user *, unsigned long *);
+extern void *copy_mount_options(const void __user *);
 extern char *copy_mount_string(const void __user *);
 
 extern struct vfsmount *lookup_mnt(struct path *);
@@ -151,3 +151,10 @@ extern void mnt_pin_kill(struct mount *m);
  * fs/nsfs.c
  */
 extern struct dentry_operations ns_dentry_operations;
+
+/*
+ * fs/ioctl.c
+ */
+extern int do_vfs_ioctl(struct file *file, unsigned int fd, unsigned int cmd,
+		    unsigned long arg);
+extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 5d01d2638..116a333e9 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -15,6 +15,7 @@
 #include <linux/writeback.h>
 #include <linux/buffer_head.h>
 #include <linux/falloc.h>
+#include "internal.h"
 
 #include <asm/ioctls.h>
 
@@ -32,8 +33,7 @@
  *
  * Returns 0 on success, -errno on error.
  */
-static long vfs_ioctl(struct file *filp, unsigned int cmd,
-		      unsigned long arg)
+long vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	int error = -ENOTTY;
 
@@ -215,6 +215,29 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg)
 	return error;
 }
 
+static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd,
+			     u64 off, u64 olen, u64 destoff)
+{
+	struct fd src_file = fdget(srcfd);
+	int ret;
+
+	if (!src_file.file)
+		return -EBADF;
+	ret = vfs_clone_file_range(src_file.file, off, dst_file, destoff, olen);
+	fdput(src_file);
+	return ret;
+}
+
+static long ioctl_file_clone_range(struct file *file, void __user *argp)
+{
+	struct file_clone_range args;
+
+	if (copy_from_user(&args, argp, sizeof(args)))
+		return -EFAULT;
+	return ioctl_file_clone(file, args.src_fd, args.src_offset,
+				args.src_length, args.dest_offset);
+}
+
 #ifdef CONFIG_BLOCK
 
 static inline sector_t logical_to_blk(struct inode *inode, loff_t offset)
@@ -411,9 +434,9 @@ int generic_block_fiemap(struct inode *inode,
 			 u64 len, get_block_t *get_block)
 {
 	int ret;
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 	ret = __generic_block_fiemap(inode, fieinfo, start, len, get_block);
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	return ret;
 }
 EXPORT_SYMBOL(generic_block_fiemap);
@@ -545,6 +568,41 @@ static int ioctl_fsthaw(struct file *filp)
 	return thaw_super(sb);
 }
 
+static long ioctl_file_dedupe_range(struct file *file, void __user *arg)
+{
+	struct file_dedupe_range __user *argp = arg;
+	struct file_dedupe_range *same = NULL;
+	int ret;
+	unsigned long size;
+	u16 count;
+
+	if (get_user(count, &argp->dest_count)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	size = offsetof(struct file_dedupe_range __user, info[count]);
+
+	same = memdup_user(argp, size);
+	if (IS_ERR(same)) {
+		ret = PTR_ERR(same);
+		same = NULL;
+		goto out;
+	}
+
+	ret = vfs_dedupe_file_range(file, same);
+	if (ret)
+		goto out;
+
+	ret = copy_to_user(argp, same, size);
+	if (ret)
+		ret = -EFAULT;
+
+out:
+	kfree(same);
+	return ret;
+}
+
 /*
  * When you add any new common ioctls to the switches above and below
  * please update compat_sys_ioctl() too.
@@ -600,6 +658,15 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
 	case FIGETBSZ:
 		return put_user(inode->i_sb->s_blocksize, argp);
 
+	case FICLONE:
+		return ioctl_file_clone(filp, arg, 0, 0, 0);
+
+	case FICLONERANGE:
+		return ioctl_file_clone_range(filp, argp);
+
+	case FIDEDUPERANGE:
+		return ioctl_file_dedupe_range(filp, argp);
+
 	default:
 		if (S_ISREG(inode->i_mode))
 			error = file_ioctl(filp, cmd, arg);
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index d67a16f2a..bcd2d41b3 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -94,7 +94,7 @@ static int __init init_inodecache(void)
 	isofs_inode_cachep = kmem_cache_create("isofs_inode_cache",
 					sizeof(struct iso_inode_info),
 					0, (SLAB_RECLAIM_ACCOUNT|
-					SLAB_MEM_SPREAD),
+					SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					init_once);
 	if (isofs_inode_cachep == NULL)
 		return -ENOMEM;
@@ -1417,6 +1417,7 @@ static int isofs_read_inode(struct inode *inode, int relocated)
 		inode->i_fop = &isofs_dir_operations;
 	} else if (S_ISLNK(inode->i_mode)) {
 		inode->i_op = &page_symlink_inode_operations;
+		inode_nohighmem(inode);
 		inode->i_data.a_ops = &isofs_symlink_aops;
 	} else
 		/* XXX - parse_rock_ridge_inode() had already set i_rdev. */
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index 735d7522a..5384ceb35 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -687,7 +687,7 @@ static int rock_ridge_symlink_readpage(struct file *file, struct page *page)
 	struct inode *inode = page->mapping->host;
 	struct iso_inode_info *ei = ISOFS_I(inode);
 	struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb);
-	char *link = kmap(page);
+	char *link = page_address(page);
 	unsigned long bufsize = ISOFS_BUFFER_SIZE(inode);
 	struct buffer_head *bh;
 	char *rpnt = link;
@@ -774,7 +774,6 @@ repeat:
 	brelse(bh);
 	*rpnt = '\0';
 	SetPageUptodate(page);
-	kunmap(page);
 	unlock_page(page);
 	return 0;
 
@@ -791,7 +790,6 @@ fail:
 	brelse(bh);
 error:
 	SetPageError(page);
-	kunmap(page);
 	unlock_page(page);
 	return -EIO;
 }
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index ca181e81c..081dff087 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -764,13 +764,11 @@ void jbd2_journal_unlock_updates (journal_t *journal)
 
 static void warn_dirty_buffer(struct buffer_head *bh)
 {
-	char b[BDEVNAME_SIZE];
-
 	printk(KERN_WARNING
-	       "JBD2: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). "
+	       "JBD2: Spotted dirty metadata buffer (dev = %pg, blocknr = %llu). "
 	       "There's a risk of filesystem corruption in case of system "
 	       "crash.\n",
-	       bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr);
+	       bh->b_bdev, (unsigned long long)bh->b_blocknr);
 }
 
 /* Call t_frozen trigger and copy buffer data into jh->b_frozen_data. */
diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c
index c1f04947d..b288c8ae1 100644
--- a/fs/jffs2/build.c
+++ b/fs/jffs2/build.c
@@ -17,6 +17,7 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/mtd/mtd.h>
+#include <linux/mm.h> /* kvfree() */
 #include "nodelist.h"
 
 static void jffs2_build_remove_unlinked_inode(struct jffs2_sb_info *,
@@ -422,12 +423,7 @@ int jffs2_do_mount_fs(struct jffs2_sb_info *c)
 	return 0;
 
  out_free:
-#ifndef __ECOS
-	if (jffs2_blocks_use_vmalloc(c))
-		vfree(c->blocks);
-	else
-#endif
-		kfree(c->blocks);
+	kvfree(c->blocks);
 
 	return ret;
 }
diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c
index d211b8e18..30c4c9ebb 100644
--- a/fs/jffs2/dir.c
+++ b/fs/jffs2/dir.c
@@ -843,9 +843,14 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry,
 
 		pr_notice("%s(): Link succeeded, unlink failed (err %d). You now have a hard link\n",
 			  __func__, ret);
-		/* Might as well let the VFS know */
-		d_instantiate(new_dentry, d_inode(old_dentry));
-		ihold(d_inode(old_dentry));
+		/*
+		 * We can't keep the target in dcache after that.
+		 * For one thing, we can't afford dentry aliases for directories.
+		 * For another, if there was a victim, we _can't_ set new inode
+		 * for that sucker and we have to trigger mount eviction - the
+		 * caller won't do it on its own since we are returning an error.
+		 */
+		d_invalidate(new_dentry);
 		new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now);
 		return ret;
 	}
diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c
index 3361979d7..cad86bac3 100644
--- a/fs/jffs2/file.c
+++ b/fs/jffs2/file.c
@@ -39,10 +39,10 @@ int jffs2_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
 	if (ret)
 		return ret;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 	/* Trigger GC to flush any pending writes for this inode */
 	jffs2_flush_wbuf_gc(c, inode->i_ino);
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 
 	return 0;
 }
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 2caf16820..bead25ae8 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -596,10 +596,7 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent)
 out_root:
 	jffs2_free_ino_caches(c);
 	jffs2_free_raw_node_refs(c);
-	if (jffs2_blocks_use_vmalloc(c))
-		vfree(c->blocks);
-	else
-		kfree(c->blocks);
+	kvfree(c->blocks);
  out_inohash:
 	jffs2_clear_xattr_subsystem(c);
 	kfree(c->inocache_list);
diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c
index bf12fe5f8..7a28facd7 100644
--- a/fs/jffs2/security.c
+++ b/fs/jffs2/security.c
@@ -52,9 +52,6 @@ static int jffs2_security_getxattr(const struct xattr_handler *handler,
 				   struct dentry *dentry, const char *name,
 				   void *buffer, size_t size)
 {
-	if (!strcmp(name, ""))
-		return -EINVAL;
-
 	return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_SECURITY,
 				 name, buffer, size);
 }
@@ -63,31 +60,12 @@ static int jffs2_security_setxattr(const struct xattr_handler *handler,
 				   struct dentry *dentry, const char *name,
 				   const void *buffer, size_t size, int flags)
 {
-	if (!strcmp(name, ""))
-		return -EINVAL;
-
 	return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_SECURITY,
 				 name, buffer, size, flags);
 }
 
-static size_t jffs2_security_listxattr(const struct xattr_handler *handler,
-				       struct dentry *dentry, char *list,
-				       size_t list_size, const char *name,
-				       size_t name_len)
-{
-	size_t retlen = XATTR_SECURITY_PREFIX_LEN + name_len + 1;
-
-	if (list && retlen <= list_size) {
-		strcpy(list, XATTR_SECURITY_PREFIX);
-		strcpy(list + XATTR_SECURITY_PREFIX_LEN, name);
-	}
-
-	return retlen;
-}
-
 const struct xattr_handler jffs2_security_xattr_handler = {
 	.prefix = XATTR_SECURITY_PREFIX,
-	.list = jffs2_security_listxattr,
 	.set = jffs2_security_setxattr,
 	.get = jffs2_security_getxattr
 };
diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c
index d86c5e317..0a9a114bb 100644
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -331,10 +331,7 @@ static void jffs2_put_super (struct super_block *sb)
 
 	jffs2_free_ino_caches(c);
 	jffs2_free_raw_node_refs(c);
-	if (jffs2_blocks_use_vmalloc(c))
-		vfree(c->blocks);
-	else
-		kfree(c->blocks);
+	kvfree(c->blocks);
 	jffs2_flash_cleanup(c);
 	kfree(c->inocache_list);
 	jffs2_clear_xattr_subsystem(c);
@@ -387,7 +384,7 @@ static int __init init_jffs2_fs(void)
 	jffs2_inode_cachep = kmem_cache_create("jffs2_i",
 					     sizeof(struct jffs2_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     jffs2_i_init_once);
 	if (!jffs2_inode_cachep) {
 		pr_err("error: Failed to initialise inode cache\n");
diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c
index 8ce2f2401..2cabd649d 100644
--- a/fs/jffs2/symlink.c
+++ b/fs/jffs2/symlink.c
@@ -14,7 +14,7 @@
 const struct inode_operations jffs2_symlink_inode_operations =
 {
 	.readlink =	generic_readlink,
-	.follow_link =	simple_follow_link,
+	.get_link =	simple_get_link,
 	.setattr =	jffs2_setattr,
 	.setxattr =	jffs2_setxattr,
 	.getxattr =	jffs2_getxattr,
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index f3a4857ff..5a3da3f52 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -1153,7 +1153,7 @@ static struct jffs2_sb_info *work_to_sb(struct work_struct *work)
 {
 	struct delayed_work *dwork;
 
-	dwork = container_of(work, struct delayed_work, work);
+	dwork = to_delayed_work(work);
 	return container_of(dwork, struct jffs2_sb_info, wbuf_dwork);
 }
 
diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c
index 4c2c03663..da3e18503 100644
--- a/fs/jffs2/xattr.c
+++ b/fs/jffs2/xattr.c
@@ -967,7 +967,8 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
 	struct jffs2_xattr_ref *ref, **pref;
 	struct jffs2_xattr_datum *xd;
 	const struct xattr_handler *xhandle;
-	ssize_t len, rc;
+	const char *prefix;
+	ssize_t prefix_len, len, rc;
 	int retry = 0;
 
 	rc = check_xattr_ref_inode(c, ic);
@@ -998,18 +999,23 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size)
 			}
 		}
 		xhandle = xprefix_to_handler(xd->xprefix);
-		if (!xhandle)
+		if (!xhandle || (xhandle->list && !xhandle->list(dentry)))
 			continue;
+		prefix = xhandle->prefix ?: xhandle->name;
+		prefix_len = strlen(prefix);
+		rc = prefix_len + xd->name_len + 1;
+
 		if (buffer) {
-			rc = xhandle->list(xhandle, dentry, buffer + len,
-					   size - len, xd->xname,
-					   xd->name_len);
-		} else {
-			rc = xhandle->list(xhandle, dentry, NULL, 0,
-					   xd->xname, xd->name_len);
+			if (rc > size - len) {
+				rc = -ERANGE;
+				goto out;
+			}
+			memcpy(buffer, prefix, prefix_len);
+			buffer += prefix_len;
+			memcpy(buffer, xd->xname, xd->name_len);
+			buffer += xd->name_len;
+			*buffer++ = 0;
 		}
-		if (rc < 0)
-			goto out;
 		len += rc;
 	}
 	rc = len;
diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c
index a562da0d6..b2555ef07 100644
--- a/fs/jffs2/xattr_trusted.c
+++ b/fs/jffs2/xattr_trusted.c
@@ -20,8 +20,6 @@ static int jffs2_trusted_getxattr(const struct xattr_handler *handler,
 				  struct dentry *dentry, const char *name,
 				  void *buffer, size_t size)
 {
-	if (!strcmp(name, ""))
-		return -EINVAL;
 	return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_TRUSTED,
 				 name, buffer, size);
 }
@@ -30,28 +28,13 @@ static int jffs2_trusted_setxattr(const struct xattr_handler *handler,
 				  struct dentry *dentry, const char *name,
 				  const void *buffer, size_t size, int flags)
 {
-	if (!strcmp(name, ""))
-		return -EINVAL;
 	return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_TRUSTED,
 				 name, buffer, size, flags);
 }
 
-static size_t jffs2_trusted_listxattr(const struct xattr_handler *handler,
-				      struct dentry *dentry, char *list,
-				      size_t list_size, const char *name,
-				      size_t name_len)
+static bool jffs2_trusted_listxattr(struct dentry *dentry)
 {
-	size_t retlen = XATTR_TRUSTED_PREFIX_LEN + name_len + 1;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return 0;
-
-	if (list && retlen<=list_size) {
-		strcpy(list, XATTR_TRUSTED_PREFIX);
-		strcpy(list + XATTR_TRUSTED_PREFIX_LEN, name);
-	}
-
-	return retlen;
+	return capable(CAP_SYS_ADMIN);
 }
 
 const struct xattr_handler jffs2_trusted_xattr_handler = {
diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c
index cbc0472e5..539bd630b 100644
--- a/fs/jffs2/xattr_user.c
+++ b/fs/jffs2/xattr_user.c
@@ -20,8 +20,6 @@ static int jffs2_user_getxattr(const struct xattr_handler *handler,
 			       struct dentry *dentry, const char *name,
 			       void *buffer, size_t size)
 {
-	if (!strcmp(name, ""))
-		return -EINVAL;
 	return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_USER,
 				 name, buffer, size);
 }
@@ -30,30 +28,12 @@ static int jffs2_user_setxattr(const struct xattr_handler *handler,
 			       struct dentry *dentry, const char *name,
 			       const void *buffer, size_t size, int flags)
 {
-	if (!strcmp(name, ""))
-		return -EINVAL;
 	return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_USER,
 				 name, buffer, size, flags);
 }
 
-static size_t jffs2_user_listxattr(const struct xattr_handler *handler,
-				   struct dentry *dentry, char *list,
-				   size_t list_size, const char *name,
-				   size_t name_len)
-{
-	size_t retlen = XATTR_USER_PREFIX_LEN + name_len + 1;
-
-	if (list && retlen <= list_size) {
-		strcpy(list, XATTR_USER_PREFIX);
-		strcpy(list + XATTR_USER_PREFIX_LEN, name);
-	}
-
-	return retlen;
-}
-
 const struct xattr_handler jffs2_user_xattr_handler = {
 	.prefix = XATTR_USER_PREFIX,
-	.list = jffs2_user_listxattr,
 	.set = jffs2_user_setxattr,
 	.get = jffs2_user_getxattr
 };
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index 0c8ca830b..49456853e 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -40,10 +40,10 @@ struct posix_acl *jfs_get_acl(struct inode *inode, int type)
 
 	switch(type) {
 		case ACL_TYPE_ACCESS:
-			ea_name = POSIX_ACL_XATTR_ACCESS;
+			ea_name = XATTR_NAME_POSIX_ACL_ACCESS;
 			break;
 		case ACL_TYPE_DEFAULT:
-			ea_name = POSIX_ACL_XATTR_DEFAULT;
+			ea_name = XATTR_NAME_POSIX_ACL_DEFAULT;
 			break;
 		default:
 			return ERR_PTR(-EINVAL);
@@ -82,7 +82,7 @@ static int __jfs_set_acl(tid_t tid, struct inode *inode, int type,
 
 	switch (type) {
 	case ACL_TYPE_ACCESS:
-		ea_name = POSIX_ACL_XATTR_ACCESS;
+		ea_name = XATTR_NAME_POSIX_ACL_ACCESS;
 		if (acl) {
 			rc = posix_acl_equiv_mode(acl, &inode->i_mode);
 			if (rc < 0)
@@ -94,7 +94,7 @@ static int __jfs_set_acl(tid_t tid, struct inode *inode, int type,
 		}
 		break;
 	case ACL_TYPE_DEFAULT:
-		ea_name = POSIX_ACL_XATTR_DEFAULT;
+		ea_name = XATTR_NAME_POSIX_ACL_DEFAULT;
 		break;
 	default:
 		return -EINVAL;
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index 0e026a7bd..4ce7735dd 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -38,17 +38,17 @@ int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 	if (rc)
 		return rc;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 	if (!(inode->i_state & I_DIRTY_ALL) ||
 	    (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) {
 		/* Make sure committed changes hit the disk */
 		jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1);
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 		return rc;
 	}
 
 	rc |= jfs_commit_inode(inode, 1);
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 
 	return rc ? -EIO : 0;
 }
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 41aa3ca6a..9d9bae63a 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -60,6 +60,7 @@ struct inode *jfs_iget(struct super_block *sb, unsigned long ino)
 	} else if (S_ISLNK(inode->i_mode)) {
 		if (inode->i_size >= IDATASIZE) {
 			inode->i_op = &page_symlink_inode_operations;
+			inode_nohighmem(inode);
 			inode->i_mapping->a_ops = &jfs_aops;
 		} else {
 			inode->i_op = &jfs_fast_symlink_inode_operations;
diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c
index 8db8b7d61..8653cac7e 100644
--- a/fs/jfs/ioctl.c
+++ b/fs/jfs/ioctl.c
@@ -96,7 +96,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		}
 
 		/* Lock against other parallel changes of flags */
-		mutex_lock(&inode->i_mutex);
+		inode_lock(inode);
 
 		jfs_get_inode_flags(jfs_inode);
 		oldflags = jfs_inode->mode2;
@@ -109,7 +109,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 			((flags ^ oldflags) &
 			(JFS_APPEND_FL | JFS_IMMUTABLE_FL))) {
 			if (!capable(CAP_LINUX_IMMUTABLE)) {
-				mutex_unlock(&inode->i_mutex);
+				inode_unlock(inode);
 				err = -EPERM;
 				goto setflags_out;
 			}
@@ -120,7 +120,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		jfs_inode->mode2 = flags;
 
 		jfs_set_inode_flags(inode);
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 		inode->i_ctime = CURRENT_TIME_SEC;
 		mark_inode_dirty(inode);
 setflags_out:
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index a69bdf2a1..a270cb7ff 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -1835,17 +1835,16 @@ static int lbmLogInit(struct jfs_log * log)
 	for (i = 0; i < LOGPAGES;) {
 		char *buffer;
 		uint offset;
-		struct page *page;
+		struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO);
 
-		buffer = (char *) get_zeroed_page(GFP_KERNEL);
-		if (buffer == NULL)
+		if (!page)
 			goto error;
-		page = virt_to_page(buffer);
+		buffer = page_address(page);
 		for (offset = 0; offset < PAGE_SIZE; offset += LOGPSIZE) {
 			lbuf = kmalloc(sizeof(struct lbuf), GFP_KERNEL);
 			if (lbuf == NULL) {
 				if (offset == 0)
-					free_page((unsigned long) buffer);
+					__free_page(page);
 				goto error;
 			}
 			if (offset) /* we already have one reference */
diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c
index 9d7551f5c..701f89370 100644
--- a/fs/jfs/namei.c
+++ b/fs/jfs/namei.c
@@ -983,6 +983,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry,
 		jfs_info("jfs_symlink: allocate extent ip:0x%p", ip);
 
 		ip->i_op = &jfs_symlink_inode_operations;
+		inode_nohighmem(ip);
 		ip->i_mapping->a_ops = &jfs_aops;
 
 		/*
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 8f9176caf..4f5d85ba8 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -792,7 +792,7 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type,
 	struct buffer_head tmp_bh;
 	struct buffer_head *bh;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 	while (towrite > 0) {
 		tocopy = sb->s_blocksize - offset < towrite ?
 				sb->s_blocksize - offset : towrite;
@@ -824,7 +824,7 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type,
 	}
 out:
 	if (len == towrite) {
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 		return err;
 	}
 	if (inode->i_size < off+len-towrite)
@@ -832,7 +832,7 @@ out:
 	inode->i_version++;
 	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
 	mark_inode_dirty(inode);
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	return len - towrite;
 }
 
@@ -898,7 +898,7 @@ static int __init init_jfs_fs(void)
 
 	jfs_inode_cachep =
 	    kmem_cache_create("jfs_ip", sizeof(struct jfs_inode_info), 0,
-			    SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
+			    SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT,
 			    init_once);
 	if (jfs_inode_cachep == NULL)
 		return -ENOMEM;
diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c
index 5929e2363..f8db4fde0 100644
--- a/fs/jfs/symlink.c
+++ b/fs/jfs/symlink.c
@@ -23,7 +23,7 @@
 
 const struct inode_operations jfs_fast_symlink_inode_operations = {
 	.readlink	= generic_readlink,
-	.follow_link	= simple_follow_link,
+	.get_link	= simple_get_link,
 	.setattr	= jfs_setattr,
 	.setxattr	= jfs_setxattr,
 	.getxattr	= jfs_getxattr,
@@ -33,8 +33,7 @@ const struct inode_operations jfs_fast_symlink_inode_operations = {
 
 const struct inode_operations jfs_symlink_inode_operations = {
 	.readlink	= generic_readlink,
-	.follow_link	= page_follow_link_light,
-	.put_link	= page_put_link,
+	.get_link	= page_get_link,
 	.setattr	= jfs_setattr,
 	.setxattr	= jfs_setxattr,
 	.getxattr	= jfs_getxattr,
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 91e004518..996b7742c 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -541,14 +541,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
 	if (!kn)
 		goto err_out1;
 
-	/*
-	 * If the ino of the sysfs entry created for a kmem cache gets
-	 * allocated from an ida layer, which is accounted to the memcg that
-	 * owns the cache, the memcg will get pinned forever. So do not account
-	 * ino ida allocations.
-	 */
-	ret = ida_simple_get(&root->ino_ida, 1, 0,
-			     GFP_KERNEL | __GFP_NOACCOUNT);
+	ret = ida_simple_get(&root->ino_ida, 1, 0, GFP_KERNEL);
 	if (ret < 0)
 		goto err_out2;
 	kn->ino = ret;
@@ -694,6 +687,29 @@ static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent,
 	return NULL;
 }
 
+static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent,
+					  const unsigned char *path,
+					  const void *ns)
+{
+	static char path_buf[PATH_MAX];	/* protected by kernfs_mutex */
+	size_t len = strlcpy(path_buf, path, PATH_MAX);
+	char *p = path_buf;
+	char *name;
+
+	lockdep_assert_held(&kernfs_mutex);
+
+	if (len >= PATH_MAX)
+		return NULL;
+
+	while ((name = strsep(&p, "/")) && parent) {
+		if (*name == '\0')
+			continue;
+		parent = kernfs_find_ns(parent, name, ns);
+	}
+
+	return parent;
+}
+
 /**
  * kernfs_find_and_get_ns - find and get kernfs_node with the given name
  * @parent: kernfs_node to search under
@@ -719,6 +735,29 @@ struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent,
 EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns);
 
 /**
+ * kernfs_walk_and_get_ns - find and get kernfs_node with the given path
+ * @parent: kernfs_node to search under
+ * @path: path to look for
+ * @ns: the namespace tag to use
+ *
+ * Look for kernfs_node with path @path under @parent and get a reference
+ * if found.  This function may sleep and returns pointer to the found
+ * kernfs_node on success, %NULL on failure.
+ */
+struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent,
+					   const char *path, const void *ns)
+{
+	struct kernfs_node *kn;
+
+	mutex_lock(&kernfs_mutex);
+	kn = kernfs_walk_ns(parent, path, ns);
+	kernfs_get(kn);
+	mutex_unlock(&kernfs_mutex);
+
+	return kn;
+}
+
+/**
  * kernfs_create_root - create a new kernfs hierarchy
  * @scops: optional syscall operations for the hierarchy
  * @flags: KERNFS_ROOT_* flags
@@ -1472,9 +1511,9 @@ static loff_t kernfs_dir_fop_llseek(struct file *file, loff_t offset,
 	struct inode *inode = file_inode(file);
 	loff_t ret;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 	ret = generic_file_llseek(file, offset, whence);
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 
 	return ret;
 }
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index 756dd56aa..16405ae88 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -205,7 +205,7 @@ int kernfs_iop_removexattr(struct dentry *dentry, const char *name)
 	if (!attrs)
 		return -ENOMEM;
 
-	return simple_xattr_remove(&attrs->xattrs, name);
+	return simple_xattr_set(&attrs->xattrs, name, NULL, 0, XATTR_REPLACE);
 }
 
 ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf,
@@ -230,7 +230,7 @@ ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size)
 	if (!attrs)
 		return -ENOMEM;
 
-	return simple_xattr_list(&attrs->xattrs, buf, size);
+	return simple_xattr_list(d_inode(dentry), &attrs->xattrs, buf, size);
 }
 
 static inline void set_default_inode_attr(struct inode *inode, umode_t mode)
diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c
index db272528a..117b8b341 100644
--- a/fs/kernfs/symlink.c
+++ b/fs/kernfs/symlink.c
@@ -112,18 +112,25 @@ static int kernfs_getlink(struct dentry *dentry, char *path)
 	return error;
 }
 
-static const char *kernfs_iop_follow_link(struct dentry *dentry, void **cookie)
+static const char *kernfs_iop_get_link(struct dentry *dentry,
+				       struct inode *inode,
+				       struct delayed_call *done)
 {
-	int error = -ENOMEM;
-	unsigned long page = get_zeroed_page(GFP_KERNEL);
-	if (!page)
+	char *body;
+	int error;
+
+	if (!dentry)
+		return ERR_PTR(-ECHILD);
+	body = kzalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!body)
 		return ERR_PTR(-ENOMEM);
-	error = kernfs_getlink(dentry, (char *)page);
+	error = kernfs_getlink(dentry, body);
 	if (unlikely(error < 0)) {
-		free_page((unsigned long)page);
+		kfree(body);
 		return ERR_PTR(error);
 	}
-	return *cookie = (char *)page;
+	set_delayed_call(done, kfree_link, body);
+	return body;
 }
 
 const struct inode_operations kernfs_symlink_iops = {
@@ -132,8 +139,7 @@ const struct inode_operations kernfs_symlink_iops = {
 	.getxattr	= kernfs_iop_getxattr,
 	.listxattr	= kernfs_iop_listxattr,
 	.readlink	= generic_readlink,
-	.follow_link	= kernfs_iop_follow_link,
-	.put_link	= free_page_put_link,
+	.get_link	= kernfs_iop_get_link,
 	.setattr	= kernfs_iop_setattr,
 	.getattr	= kernfs_iop_getattr,
 	.permission	= kernfs_iop_permission,
diff --git a/fs/libfs.c b/fs/libfs.c
index c7cbfb092..0ca80b2af 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -89,7 +89,7 @@ EXPORT_SYMBOL(dcache_dir_close);
 loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
 {
 	struct dentry *dentry = file->f_path.dentry;
-	mutex_lock(&d_inode(dentry)->i_mutex);
+	inode_lock(d_inode(dentry));
 	switch (whence) {
 		case 1:
 			offset += file->f_pos;
@@ -97,7 +97,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
 			if (offset >= 0)
 				break;
 		default:
-			mutex_unlock(&d_inode(dentry)->i_mutex);
+			inode_unlock(d_inode(dentry));
 			return -EINVAL;
 	}
 	if (offset != file->f_pos) {
@@ -124,7 +124,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence)
 			spin_unlock(&dentry->d_lock);
 		}
 	}
-	mutex_unlock(&d_inode(dentry)->i_mutex);
+	inode_unlock(d_inode(dentry));
 	return offset;
 }
 EXPORT_SYMBOL(dcache_dir_lseek);
@@ -941,7 +941,7 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end,
 	if (err)
 		return err;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 	ret = sync_mapping_buffers(inode->i_mapping);
 	if (!(inode->i_state & I_DIRTY_ALL))
 		goto out;
@@ -953,7 +953,7 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end,
 		ret = err;
 
 out:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	return ret;
 }
 EXPORT_SYMBOL(__generic_file_fsync);
@@ -1019,17 +1019,12 @@ int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 }
 EXPORT_SYMBOL(noop_fsync);
 
-void kfree_put_link(struct inode *unused, void *cookie)
+/* Because kfree isn't assignment-compatible with void(void*) ;-/ */
+void kfree_link(void *p)
 {
-	kfree(cookie);
+	kfree(p);
 }
-EXPORT_SYMBOL(kfree_put_link);
-
-void free_page_put_link(struct inode *unused, void *cookie)
-{
-	free_page((unsigned long) cookie);
-}
-EXPORT_SYMBOL(free_page_put_link);
+EXPORT_SYMBOL(kfree_link);
 
 /*
  * nop .set_page_dirty method so that people can use .page_mkwrite on
@@ -1092,14 +1087,15 @@ simple_nosetlease(struct file *filp, long arg, struct file_lock **flp,
 }
 EXPORT_SYMBOL(simple_nosetlease);
 
-const char *simple_follow_link(struct dentry *dentry, void **cookie)
+const char *simple_get_link(struct dentry *dentry, struct inode *inode,
+			    struct delayed_call *done)
 {
-	return d_inode(dentry)->i_link;
+	return inode->i_link;
 }
-EXPORT_SYMBOL(simple_follow_link);
+EXPORT_SYMBOL(simple_get_link);
 
 const struct inode_operations simple_symlink_inode_operations = {
-	.follow_link = simple_follow_link,
+	.get_link = simple_get_link,
 	.readlink = generic_readlink
 };
 EXPORT_SYMBOL(simple_symlink_inode_operations);
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 5f31ebd96..154a107cd 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -25,13 +25,17 @@
 #include <linux/mutex.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
+#include <linux/inetdevice.h>
 
 #include <linux/sunrpc/types.h>
 #include <linux/sunrpc/stats.h>
 #include <linux/sunrpc/clnt.h>
 #include <linux/sunrpc/svc.h>
 #include <linux/sunrpc/svcsock.h>
+#include <linux/sunrpc/svc_xprt.h>
 #include <net/ip.h>
+#include <net/addrconf.h>
+#include <net/ipv6.h>
 #include <linux/lockd/lockd.h>
 #include <linux/nfs.h>
 
@@ -44,7 +48,7 @@
 
 static struct svc_program	nlmsvc_program;
 
-struct nlmsvc_binding *		nlmsvc_ops;
+const struct nlmsvc_binding	*nlmsvc_ops;
 EXPORT_SYMBOL_GPL(nlmsvc_ops);
 
 static DEFINE_MUTEX(nlmsvc_mutex);
@@ -90,8 +94,7 @@ static unsigned long get_lockd_grace_period(void)
 
 static void grace_ender(struct work_struct *grace)
 {
-	struct delayed_work *dwork = container_of(grace, struct delayed_work,
-						  work);
+	struct delayed_work *dwork = to_delayed_work(grace);
 	struct lockd_net *ln = container_of(dwork, struct lockd_net,
 					    grace_period_end);
 
@@ -279,6 +282,68 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net)
 	}
 }
 
+static int lockd_inetaddr_event(struct notifier_block *this,
+	unsigned long event, void *ptr)
+{
+	struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
+	struct sockaddr_in sin;
+
+	if (event != NETDEV_DOWN)
+		goto out;
+
+	if (nlmsvc_rqst) {
+		dprintk("lockd_inetaddr_event: removed %pI4\n",
+			&ifa->ifa_local);
+		sin.sin_family = AF_INET;
+		sin.sin_addr.s_addr = ifa->ifa_local;
+		svc_age_temp_xprts_now(nlmsvc_rqst->rq_server,
+			(struct sockaddr *)&sin);
+	}
+
+out:
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block lockd_inetaddr_notifier = {
+	.notifier_call = lockd_inetaddr_event,
+};
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int lockd_inet6addr_event(struct notifier_block *this,
+	unsigned long event, void *ptr)
+{
+	struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr;
+	struct sockaddr_in6 sin6;
+
+	if (event != NETDEV_DOWN)
+		goto out;
+
+	if (nlmsvc_rqst) {
+		dprintk("lockd_inet6addr_event: removed %pI6\n", &ifa->addr);
+		sin6.sin6_family = AF_INET6;
+		sin6.sin6_addr = ifa->addr;
+		svc_age_temp_xprts_now(nlmsvc_rqst->rq_server,
+			(struct sockaddr *)&sin6);
+	}
+
+out:
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block lockd_inet6addr_notifier = {
+	.notifier_call = lockd_inet6addr_event,
+};
+#endif
+
+static void lockd_svc_exit_thread(void)
+{
+	unregister_inetaddr_notifier(&lockd_inetaddr_notifier);
+#if IS_ENABLED(CONFIG_IPV6)
+	unregister_inet6addr_notifier(&lockd_inet6addr_notifier);
+#endif
+	svc_exit_thread(nlmsvc_rqst);
+}
+
 static int lockd_start_svc(struct svc_serv *serv)
 {
 	int error;
@@ -315,7 +380,7 @@ static int lockd_start_svc(struct svc_serv *serv)
 	return 0;
 
 out_task:
-	svc_exit_thread(nlmsvc_rqst);
+	lockd_svc_exit_thread();
 	nlmsvc_task = NULL;
 out_rqst:
 	nlmsvc_rqst = NULL;
@@ -360,6 +425,10 @@ static struct svc_serv *lockd_create_svc(void)
 		printk(KERN_WARNING "lockd_up: create service failed\n");
 		return ERR_PTR(-ENOMEM);
 	}
+	register_inetaddr_notifier(&lockd_inetaddr_notifier);
+#if IS_ENABLED(CONFIG_IPV6)
+	register_inet6addr_notifier(&lockd_inet6addr_notifier);
+#endif
 	dprintk("lockd_up: service created\n");
 	return serv;
 }
@@ -428,7 +497,7 @@ lockd_down(struct net *net)
 	}
 	kthread_stop(nlmsvc_task);
 	dprintk("lockd_down: service stopped\n");
-	svc_exit_thread(nlmsvc_rqst);
+	lockd_svc_exit_thread();
 	dprintk("lockd_down: service destroyed\n");
 	nlmsvc_task = NULL;
 	nlmsvc_rqst = NULL;
diff --git a/fs/locks.c b/fs/locks.c
index 6333263b7..7c5f91be9 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -119,7 +119,6 @@
 #include <linux/fdtable.h>
 #include <linux/fs.h>
 #include <linux/init.h>
-#include <linux/module.h>
 #include <linux/security.h>
 #include <linux/slab.h>
 #include <linux/syscalls.h>
@@ -230,16 +229,44 @@ locks_get_lock_context(struct inode *inode, int type)
 		ctx = smp_load_acquire(&inode->i_flctx);
 	}
 out:
+	trace_locks_get_lock_context(inode, type, ctx);
 	return ctx;
 }
 
+/* Print one pr_warn line, prefixed with @list_type, per file_lock on @list. */
+static void locks_dump_ctx_list(struct list_head *list, char *list_type)
+{
+	struct file_lock *fl;
+
+	list_for_each_entry(fl, list, fl_list)
+		pr_warn("%s: fl_owner=%p fl_flags=0x%x fl_type=0x%x fl_pid=%u\n",
+			list_type, fl->fl_owner, fl->fl_flags, fl->fl_type, fl->fl_pid);
+}
+
+/* Warn about and dump any locks still queued on @inode's lock context. */
+static void locks_check_ctx_lists(struct inode *inode)
+{
+	struct file_lock_context *ctx = inode->i_flctx;
+
+	if (likely(list_empty(&ctx->flc_flock) &&
+		   list_empty(&ctx->flc_posix) &&
+		   list_empty(&ctx->flc_lease)))
+		return;
+	pr_warn("Leaked locks on dev=0x%x:0x%x ino=0x%lx:\n",
+		MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev),
+		inode->i_ino);
+	locks_dump_ctx_list(&ctx->flc_flock, "FLOCK");
+	locks_dump_ctx_list(&ctx->flc_posix, "POSIX");
+	locks_dump_ctx_list(&ctx->flc_lease, "LEASE");
+}
+
 void
-locks_free_lock_context(struct file_lock_context *ctx)
+locks_free_lock_context(struct inode *inode)
 {
-	if (ctx) {
-		WARN_ON_ONCE(!list_empty(&ctx->flc_flock));
-		WARN_ON_ONCE(!list_empty(&ctx->flc_posix));
-		WARN_ON_ONCE(!list_empty(&ctx->flc_lease));
+	struct file_lock_context *ctx = inode->i_flctx;
+
+	if (unlikely(ctx)) {
+		locks_check_ctx_lists(inode);
 		kmem_cache_free(flctx_cache, ctx);
 	}
 }
@@ -934,7 +961,8 @@ out:
 	return error;
 }
 
-static int __posix_lock_file(struct inode *inode, struct file_lock *request, struct file_lock *conflock)
+static int posix_lock_inode(struct inode *inode, struct file_lock *request,
+			    struct file_lock *conflock)
 {
 	struct file_lock *fl, *tmp;
 	struct file_lock *new_fl = NULL;
@@ -1142,6 +1170,8 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 	if (new_fl2)
 		locks_free_lock(new_fl2);
 	locks_dispose_list(&dispose);
+	trace_posix_lock_inode(inode, request, error);
+
 	return error;
 }
 
@@ -1162,7 +1192,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 int posix_lock_file(struct file *filp, struct file_lock *fl,
 			struct file_lock *conflock)
 {
-	return __posix_lock_file(file_inode(filp), fl, conflock);
+	return posix_lock_inode(file_inode(filp), fl, conflock);
 }
 EXPORT_SYMBOL(posix_lock_file);
 
@@ -1178,7 +1208,7 @@ static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
 	int error;
 	might_sleep ();
 	for (;;) {
-		error = __posix_lock_file(inode, fl, NULL);
+		error = posix_lock_inode(inode, fl, NULL);
 		if (error != FILE_LOCK_DEFERRED)
 			break;
 		error = wait_event_interruptible(fl->fl_wait, !fl->fl_next);
@@ -1191,6 +1221,7 @@ static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl)
 	return error;
 }
 
+#ifdef CONFIG_MANDATORY_FILE_LOCKING
 /**
  * locks_mandatory_locked - Check for an active lock
  * @file: the file to check
@@ -1227,20 +1258,16 @@ int locks_mandatory_locked(struct file *file)
 
 /**
  * locks_mandatory_area - Check for a conflicting lock
- * @read_write: %FLOCK_VERIFY_WRITE for exclusive access, %FLOCK_VERIFY_READ
- *		for shared
- * @inode:      the file to check
+ * @inode:	the file to check
  * @filp:       how the file was opened (if it was)
- * @offset:     start of area to check
- * @count:      length of area to check
+ * @start:	first byte in the file to check
+ * @end:	last byte in the file to check
+ * @type:	%F_WRLCK for a write lock, else %F_RDLCK
  *
  * Searches the inode's list of locks to find any POSIX locks which conflict.
- * This function is called from rw_verify_area() and
- * locks_verify_truncate().
  */
-int locks_mandatory_area(int read_write, struct inode *inode,
-			 struct file *filp, loff_t offset,
-			 size_t count)
+int locks_mandatory_area(struct inode *inode, struct file *filp, loff_t start,
+			 loff_t end, unsigned char type)
 {
 	struct file_lock fl;
 	int error;
@@ -1252,15 +1279,15 @@ int locks_mandatory_area(int read_write, struct inode *inode,
 	fl.fl_flags = FL_POSIX | FL_ACCESS;
 	if (filp && !(filp->f_flags & O_NONBLOCK))
 		sleep = true;
-	fl.fl_type = (read_write == FLOCK_VERIFY_WRITE) ? F_WRLCK : F_RDLCK;
-	fl.fl_start = offset;
-	fl.fl_end = offset + count - 1;
+	fl.fl_type = type;
+	fl.fl_start = start;
+	fl.fl_end = end;
 
 	for (;;) {
 		if (filp) {
 			fl.fl_owner = filp;
 			fl.fl_flags &= ~FL_SLEEP;
-			error = __posix_lock_file(inode, &fl, NULL);
+			error = posix_lock_inode(inode, &fl, NULL);
 			if (!error)
 				break;
 		}
@@ -1268,7 +1295,7 @@ int locks_mandatory_area(int read_write, struct inode *inode,
 		if (sleep)
 			fl.fl_flags |= FL_SLEEP;
 		fl.fl_owner = current->files;
-		error = __posix_lock_file(inode, &fl, NULL);
+		error = posix_lock_inode(inode, &fl, NULL);
 		if (error != FILE_LOCK_DEFERRED)
 			break;
 		error = wait_event_interruptible(fl.fl_wait, !fl.fl_next);
@@ -1289,6 +1316,7 @@ int locks_mandatory_area(int read_write, struct inode *inode,
 }
 
 EXPORT_SYMBOL(locks_mandatory_area);
+#endif /* CONFIG_MANDATORY_FILE_LOCKING */
 
 static void lease_clear_pending(struct file_lock *fl, int arg)
 {
@@ -1503,12 +1531,10 @@ void lease_get_mtime(struct inode *inode, struct timespec *time)
 	ctx = smp_load_acquire(&inode->i_flctx);
 	if (ctx && !list_empty_careful(&ctx->flc_lease)) {
 		spin_lock(&ctx->flc_lock);
-		if (!list_empty(&ctx->flc_lease)) {
-			fl = list_first_entry(&ctx->flc_lease,
-						struct file_lock, fl_list);
-			if (fl->fl_type == F_WRLCK)
-				has_lease = true;
-		}
+		fl = list_first_entry_or_null(&ctx->flc_lease,
+					      struct file_lock, fl_list);
+		if (fl && (fl->fl_type == F_WRLCK))
+			has_lease = true;
 		spin_unlock(&ctx->flc_lock);
 	}
 
@@ -1624,12 +1650,12 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr
 	 * bother, maybe that's a sign this just isn't a good file to
 	 * hand out a delegation on.
 	 */
-	if (is_deleg && !mutex_trylock(&inode->i_mutex))
+	if (is_deleg && !inode_trylock(inode))
 		return -EAGAIN;
 
 	if (is_deleg && arg == F_WRLCK) {
 		/* Write delegations are not currently supported: */
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 		WARN_ON_ONCE(1);
 		return -EINVAL;
 	}
@@ -1706,7 +1732,7 @@ out:
 	spin_unlock(&ctx->flc_lock);
 	locks_dispose_list(&dispose);
 	if (is_deleg)
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 	if (!error && !my_fl)
 		*flp = NULL;
 	return error;
@@ -2165,6 +2191,8 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
 	if (file_lock == NULL)
 		return -ENOLCK;
 
+	inode = file_inode(filp);
+
 	/*
 	 * This might block, so we do it before checking the inode.
 	 */
@@ -2172,8 +2200,6 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
 	if (copy_from_user(&flock, l, sizeof(flock)))
 		goto out;
 
-	inode = file_inode(filp);
-
 	/* Don't allow mandatory locks on files that may be memory mapped
 	 * and shared.
 	 */
@@ -2220,10 +2246,12 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
 	error = do_lock_file_wait(filp, cmd, file_lock);
 
 	/*
-	 * Attempt to detect a close/fcntl race and recover by
-	 * releasing the lock that was just acquired.
+	 * Attempt to detect a close/fcntl race and recover by releasing the
+	 * lock that was just acquired. There is no need to do that when we're
+	 * unlocking though, or for OFD locks.
 	 */
-	if (!error && file_lock->fl_type != F_UNLCK) {
+	if (!error && file_lock->fl_type != F_UNLCK &&
+	    !(file_lock->fl_flags & FL_OFDLCK)) {
 		/*
 		 * We need that spin_lock here - it prevents reordering between
 		 * update of i_flctx->flc_posix and check for it done in
@@ -2240,6 +2268,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
 		}
 	}
 out:
+	trace_fcntl_setlk(inode, file_lock, error);
 	locks_free_lock(file_lock);
 	return error;
 }
@@ -2362,10 +2391,12 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
 	error = do_lock_file_wait(filp, cmd, file_lock);
 
 	/*
-	 * Attempt to detect a close/fcntl race and recover by
-	 * releasing the lock that was just acquired.
+	 * Attempt to detect a close/fcntl race and recover by releasing the
+	 * lock that was just acquired. There is no need to do that when we're
+	 * unlocking though, or for OFD locks.
 	 */
-	if (!error && file_lock->fl_type != F_UNLCK) {
+	if (!error && file_lock->fl_type != F_UNLCK &&
+	    !(file_lock->fl_flags & FL_OFDLCK)) {
 		/*
 		 * We need that spin_lock here - it prevents reordering between
 		 * update of i_flctx->flc_posix and check for it done in
@@ -2394,6 +2425,7 @@ out:
  */
 void locks_remove_posix(struct file *filp, fl_owner_t owner)
 {
+	int error;
 	struct file_lock lock;
 	struct file_lock_context *ctx;
 
@@ -2416,10 +2448,11 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner)
 	lock.fl_ops = NULL;
 	lock.fl_lmops = NULL;
 
-	vfs_lock_file(filp, F_SETLK, &lock, NULL);
+	error = vfs_lock_file(filp, F_SETLK, &lock, NULL);
 
 	if (lock.fl_ops && lock.fl_ops->fl_release_private)
 		lock.fl_ops->fl_release_private(&lock);
+	trace_locks_remove_posix(file_inode(filp), &lock, error);
 }
 
 EXPORT_SYMBOL(locks_remove_posix);
@@ -2715,7 +2748,7 @@ static int __init proc_locks_init(void)
 	proc_create("locks", 0, NULL, &proc_locks_operations);
 	return 0;
 }
-module_init(proc_locks_init);
+fs_initcall(proc_locks_init);
 #endif
 
 static int __init filelock_init(void)
diff --git a/fs/logfs/Kconfig b/fs/logfs/Kconfig
index 09ed066c0..2b4503163 100644
--- a/fs/logfs/Kconfig
+++ b/fs/logfs/Kconfig
@@ -1,6 +1,6 @@
 config LOGFS
 	tristate "LogFS file system"
-	depends on (MTD || BLOCK)
+	depends on MTD || (!MTD && BLOCK)
 	select ZLIB_INFLATE
 	select ZLIB_DEFLATE
 	select CRC32
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
index f9b45d46d..542468e9b 100644
--- a/fs/logfs/dir.c
+++ b/fs/logfs/dir.c
@@ -528,7 +528,8 @@ static int logfs_symlink(struct inode *dir, struct dentry *dentry,
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
 
-	inode->i_op = &logfs_symlink_iops;
+	inode->i_op = &page_symlink_inode_operations;
+	inode_nohighmem(inode);
 	inode->i_mapping->a_ops = &logfs_reg_aops;
 
 	return __logfs_create(dir, dentry, inode, target, destlen);
@@ -776,12 +777,6 @@ fail:
 	return -EIO;
 }
 
-const struct inode_operations logfs_symlink_iops = {
-	.readlink	= generic_readlink,
-	.follow_link	= page_follow_link_light,
-	.put_link	= page_put_link,
-};
-
 const struct inode_operations logfs_dir_iops = {
 	.create		= logfs_create,
 	.link		= logfs_link,
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
index 1a6f0167b..61eaeb1b6 100644
--- a/fs/logfs/file.c
+++ b/fs/logfs/file.c
@@ -204,12 +204,12 @@ long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		if (err)
 			return err;
 
-		mutex_lock(&inode->i_mutex);
+		inode_lock(inode);
 		oldflags = li->li_flags;
 		flags &= LOGFS_FL_USER_MODIFIABLE;
 		flags |= oldflags & ~LOGFS_FL_USER_MODIFIABLE;
 		li->li_flags = flags;
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 
 		inode->i_ctime = CURRENT_TIME;
 		mark_inode_dirty_sync(inode);
@@ -230,11 +230,11 @@ int logfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 	if (ret)
 		return ret;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 	logfs_get_wblocks(sb, NULL, WF_LOCK);
 	logfs_write_anchor(sb);
 	logfs_put_wblocks(sb, NULL, WF_LOCK);
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 
 	return 0;
 }
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
index af49e2d69..db9cfc598 100644
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -64,7 +64,8 @@ static void logfs_inode_setops(struct inode *inode)
 		inode->i_mapping->a_ops = &logfs_reg_aops;
 		break;
 	case S_IFLNK:
-		inode->i_op = &logfs_symlink_iops;
+		inode->i_op = &page_symlink_inode_operations;
+		inode_nohighmem(inode);
 		inode->i_mapping->a_ops = &logfs_reg_aops;
 		break;
 	case S_IFSOCK:	/* fall through */
@@ -408,7 +409,8 @@ const struct super_operations logfs_super_operations = {
 int logfs_init_inode_cache(void)
 {
 	logfs_inode_cache = kmem_cache_create("logfs_inode_cache",
-			sizeof(struct logfs_inode), 0, SLAB_RECLAIM_ACCOUNT,
+			sizeof(struct logfs_inode), 0,
+			SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
 			logfs_init_once);
 	if (!logfs_inode_cache)
 		return -ENOMEM;
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
index 5f0937609..27d040e35 100644
--- a/fs/logfs/logfs.h
+++ b/fs/logfs/logfs.h
@@ -302,7 +302,7 @@ struct logfs_block {
 	struct inode *inode;
 	struct logfs_transaction *ta;
 	unsigned long alias_map[LOGFS_BLOCK_FACTOR / BITS_PER_LONG];
-	struct logfs_block_ops *ops;
+	const struct logfs_block_ops *ops;
 	int full;
 	int partial;
 	int reserved_bytes;
@@ -485,7 +485,7 @@ static inline int logfs_get_sb_bdev(struct logfs_super *s,
 #endif
 
 /* dev_mtd.c */
-#ifdef CONFIG_MTD
+#if IS_ENABLED(CONFIG_MTD)
 int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr);
 #else
 static inline int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
@@ -495,7 +495,6 @@ static inline int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
 #endif
 
 /* dir.c */
-extern const struct inode_operations logfs_symlink_iops;
 extern const struct inode_operations logfs_dir_iops;
 extern const struct file_operations logfs_dir_fops;
 int logfs_replay_journal(struct super_block *sb);
@@ -579,7 +578,7 @@ int logfs_exist_block(struct inode *inode, u64 bix);
 int get_page_reserve(struct inode *inode, struct page *page);
 void logfs_get_wblocks(struct super_block *sb, struct page *page, int lock);
 void logfs_put_wblocks(struct super_block *sb, struct page *page, int lock);
-extern struct logfs_block_ops indirect_block_ops;
+extern const struct logfs_block_ops indirect_block_ops;
 
 /* segment.c */
 int logfs_erase_segment(struct super_block *sb, u32 ofs, int ensure_erase);
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
index 380d86e1a..20973c9e5 100644
--- a/fs/logfs/readwrite.c
+++ b/fs/logfs/readwrite.c
@@ -569,13 +569,13 @@ static void indirect_free_block(struct super_block *sb,
 }
 
 
-static struct logfs_block_ops inode_block_ops = {
+static const struct logfs_block_ops inode_block_ops = {
 	.write_block = inode_write_block,
 	.free_block = inode_free_block,
 	.write_alias = inode_write_alias,
 };
 
-struct logfs_block_ops indirect_block_ops = {
+const struct logfs_block_ops indirect_block_ops = {
 	.write_block = indirect_write_block,
 	.free_block = indirect_free_block,
 	.write_alias = indirect_write_alias,
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
index 6de0fbfc6..d270e4b2a 100644
--- a/fs/logfs/segment.c
+++ b/fs/logfs/segment.c
@@ -197,7 +197,7 @@ static int btree_write_alias(struct super_block *sb, struct logfs_block *block,
 	return 0;
 }
 
-static struct logfs_block_ops btree_block_ops = {
+static const struct logfs_block_ops btree_block_ops = {
 	.write_block	= btree_write_block,
 	.free_block	= __free_block,
 	.write_alias	= btree_write_alias,
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 086cd0a61..f975d667c 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -91,7 +91,7 @@ static int __init init_inodecache(void)
 	minix_inode_cachep = kmem_cache_create("minix_inode_cache",
 					     sizeof(struct minix_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (minix_inode_cachep == NULL)
 		return -ENOMEM;
@@ -435,8 +435,7 @@ static const struct address_space_operations minix_aops = {
 
 static const struct inode_operations minix_symlink_inode_operations = {
 	.readlink	= generic_readlink,
-	.follow_link	= page_follow_link_light,
-	.put_link	= page_put_link,
+	.get_link	= page_get_link,
 	.getattr	= minix_getattr,
 };
 
@@ -452,6 +451,7 @@ void minix_set_inode(struct inode *inode, dev_t rdev)
 		inode->i_mapping->a_ops = &minix_aops;
 	} else if (S_ISLNK(inode->i_mode)) {
 		inode->i_op = &minix_symlink_inode_operations;
+		inode_nohighmem(inode);
 		inode->i_mapping->a_ops = &minix_aops;
 	} else
 		init_special_inode(inode, inode->i_mode, rdev);
diff --git a/fs/minix/itree_v1.c b/fs/minix/itree_v1.c
index 282e15ad8..46ca39d6c 100644
--- a/fs/minix/itree_v1.c
+++ b/fs/minix/itree_v1.c
@@ -24,16 +24,15 @@ static inline block_t *i_data(struct inode *inode)
 static int block_to_path(struct inode * inode, long block, int offsets[DEPTH])
 {
 	int n = 0;
-	char b[BDEVNAME_SIZE];
 
 	if (block < 0) {
-		printk("MINIX-fs: block_to_path: block %ld < 0 on dev %s\n",
-			block, bdevname(inode->i_sb->s_bdev, b));
+		printk("MINIX-fs: block_to_path: block %ld < 0 on dev %pg\n",
+			block, inode->i_sb->s_bdev);
 	} else if (block >= (minix_sb(inode->i_sb)->s_max_size/BLOCK_SIZE)) {
 		if (printk_ratelimit())
 			printk("MINIX-fs: block_to_path: "
-			       "block %ld too big on dev %s\n",
-				block, bdevname(inode->i_sb->s_bdev, b));
+			       "block %ld too big on dev %pg\n",
+				block, inode->i_sb->s_bdev);
 	} else if (block < 7) {
 		offsets[n++] = block;
 	} else if ((block -= 7) < 512) {
diff --git a/fs/minix/itree_v2.c b/fs/minix/itree_v2.c
index 78e2d93e5..1ee101352 100644
--- a/fs/minix/itree_v2.c
+++ b/fs/minix/itree_v2.c
@@ -26,18 +26,17 @@ static inline block_t *i_data(struct inode *inode)
 static int block_to_path(struct inode * inode, long block, int offsets[DEPTH])
 {
 	int n = 0;
-	char b[BDEVNAME_SIZE];
 	struct super_block *sb = inode->i_sb;
 
 	if (block < 0) {
-		printk("MINIX-fs: block_to_path: block %ld < 0 on dev %s\n",
-			block, bdevname(sb->s_bdev, b));
+		printk("MINIX-fs: block_to_path: block %ld < 0 on dev %pg\n",
+			block, sb->s_bdev);
 	} else if ((u64)block * (u64)sb->s_blocksize >=
 			minix_sb(sb)->s_max_size) {
 		if (printk_ratelimit())
 			printk("MINIX-fs: block_to_path: "
-			       "block %ld too big on dev %s\n",
-				block, bdevname(sb->s_bdev, b));
+			       "block %ld too big on dev %pg\n",
+				block, sb->s_bdev);
 	} else if (block < DIRCOUNT) {
 		offsets[n++] = block;
 	} else if ((block -= DIRCOUNT) < INDIRCOUNT(sb)) {
diff --git a/fs/namei.c b/fs/namei.c
index d8ee4da93..9c590e0f6 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -505,13 +505,13 @@ struct nameidata {
 	int		total_link_count;
 	struct saved {
 		struct path link;
-		void *cookie;
+		struct delayed_call done;
 		const char *name;
-		struct inode *inode;
 		unsigned seq;
 	} *stack, internal[EMBEDDED_LEVELS];
 	struct filename	*name;
 	struct nameidata *saved;
+	struct inode	*link_inode;
 	unsigned	root_seq;
 	int		dfd;
 };
@@ -534,10 +534,8 @@ static void restore_nameidata(void)
 	current->nameidata = old;
 	if (old)
 		old->total_link_count = now->total_link_count;
-	if (now->stack != now->internal) {
+	if (now->stack != now->internal)
 		kfree(now->stack);
-		now->stack = now->internal;
-	}
 }
 
 static int __nd_alloc_stack(struct nameidata *nd)
@@ -592,11 +590,8 @@ static void drop_links(struct nameidata *nd)
 	int i = nd->depth;
 	while (i--) {
 		struct saved *last = nd->stack + i;
-		struct inode *inode = last->inode;
-		if (last->cookie && inode->i_op->put_link) {
-			inode->i_op->put_link(inode, last->cookie);
-			last->cookie = NULL;
-		}
+		do_delayed_call(&last->done);
+		clear_delayed_call(&last->done);
 	}
 }
 
@@ -657,7 +652,7 @@ static bool legitimize_links(struct nameidata *nd)
  * Path walking has 2 modes, rcu-walk and ref-walk (see
  * Documentation/filesystems/path-lookup.txt).  In situations when we can't
  * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab
- * normal reference counts on dentries and vfsmounts to transition to rcu-walk
+ * normal reference counts on dentries and vfsmounts to transition to ref-walk
  * mode.  Refcounts are grabbed at the last known good point before rcu-walk
  * got stuck, so ref-walk may continue from there. If this is not successful
  * (eg. a seqcount has changed), then failure is returned and it's up to caller
@@ -807,19 +802,19 @@ static int complete_walk(struct nameidata *nd)
 
 static void set_root(struct nameidata *nd)
 {
-	get_fs_root(current->fs, &nd->root);
-}
-
-static void set_root_rcu(struct nameidata *nd)
-{
 	struct fs_struct *fs = current->fs;
-	unsigned seq;
 
-	do {
-		seq = read_seqcount_begin(&fs->seq);
-		nd->root = fs->root;
-		nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
-	} while (read_seqcount_retry(&fs->seq, seq));
+	if (nd->flags & LOOKUP_RCU) {
+		unsigned seq;
+
+		do {
+			seq = read_seqcount_begin(&fs->seq);
+			nd->root = fs->root;
+			nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
+		} while (read_seqcount_retry(&fs->seq, seq));
+	} else {
+		get_fs_root(fs, &nd->root);
+	}
 }
 
 static void path_put_conditional(struct path *path, struct nameidata *nd)
@@ -841,8 +836,28 @@ static inline void path_to_nameidata(const struct path *path,
 	nd->path.dentry = path->dentry;
 }
 
+/* Restart the walk at nd->root; -ECHILD if the RCU d_seq check fails. */
+static int nd_jump_root(struct nameidata *nd)
+{
+	if (!(nd->flags & LOOKUP_RCU)) {
+		path_put(&nd->path);
+		nd->path = nd->root;
+		path_get(&nd->path);
+		nd->inode = nd->path.dentry->d_inode;
+	} else {
+		struct dentry *root = nd->root.dentry;
+		nd->path = nd->root;
+		nd->inode = root->d_inode;
+		nd->seq = nd->root_seq;
+		if (unlikely(read_seqcount_retry(&root->d_seq, nd->seq)))
+			return -ECHILD;
+	}
+	nd->flags |= LOOKUP_JUMPED;
+	return 0;
+}
+
 /*
- * Helper to directly jump to a known parsed path from ->follow_link,
+ * Helper to directly jump to a known parsed path from ->get_link,
  * caller must have taken a reference to path beforehand.
  */
 void nd_jump_link(struct path *path)
@@ -858,9 +873,7 @@ void nd_jump_link(struct path *path)
 static inline void put_link(struct nameidata *nd)
 {
 	struct saved *last = nd->stack + --nd->depth;
-	struct inode *inode = last->inode;
-	if (last->cookie && inode->i_op->put_link)
-		inode->i_op->put_link(inode, last->cookie);
+	do_delayed_call(&last->done);
 	if (!(nd->flags & LOOKUP_RCU))
 		path_put(&last->link);
 }
@@ -892,7 +905,7 @@ static inline int may_follow_link(struct nameidata *nd)
 		return 0;
 
 	/* Allowed if owner and follower match. */
-	inode = nd->stack[0].inode;
+	inode = nd->link_inode;
 	if (uid_eq(current_cred()->fsuid, inode->i_uid))
 		return 0;
 
@@ -983,7 +996,7 @@ const char *get_link(struct nameidata *nd)
 {
 	struct saved *last = nd->stack + nd->depth - 1;
 	struct dentry *dentry = last->link.dentry;
-	struct inode *inode = last->inode;
+	struct inode *inode = nd->link_inode;
 	int error;
 	const char *res;
 
@@ -1004,36 +1017,27 @@ const char *get_link(struct nameidata *nd)
 	nd->last_type = LAST_BIND;
 	res = inode->i_link;
 	if (!res) {
+		const char * (*get)(struct dentry *, struct inode *,
+				struct delayed_call *);
+		get = inode->i_op->get_link;
 		if (nd->flags & LOOKUP_RCU) {
-			if (unlikely(unlazy_walk(nd, NULL, 0)))
-				return ERR_PTR(-ECHILD);
+			res = get(NULL, inode, &last->done);
+			if (res == ERR_PTR(-ECHILD)) {
+				if (unlikely(unlazy_walk(nd, NULL, 0)))
+					return ERR_PTR(-ECHILD);
+				res = get(dentry, inode, &last->done);
+			}
+		} else {
+			res = get(dentry, inode, &last->done);
 		}
-		res = inode->i_op->follow_link(dentry, &last->cookie);
-		if (IS_ERR_OR_NULL(res)) {
-			last->cookie = NULL;
+		if (IS_ERR_OR_NULL(res))
 			return res;
-		}
 	}
 	if (*res == '/') {
-		if (nd->flags & LOOKUP_RCU) {
-			struct dentry *d;
-			if (!nd->root.mnt)
-				set_root_rcu(nd);
-			nd->path = nd->root;
-			d = nd->path.dentry;
-			nd->inode = d->d_inode;
-			nd->seq = nd->root_seq;
-			if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq)))
-				return ERR_PTR(-ECHILD);
-		} else {
-			if (!nd->root.mnt)
-				set_root(nd);
-			path_put(&nd->path);
-			nd->path = nd->root;
-			path_get(&nd->root);
-			nd->inode = nd->path.dentry->d_inode;
-		}
-		nd->flags |= LOOKUP_JUMPED;
+		if (!nd->root.mnt)
+			set_root(nd);
+		if (unlikely(nd_jump_root(nd)))
+			return ERR_PTR(-ECHILD);
 		while (unlikely(*++res == '/'))
 			;
 	}
@@ -1294,8 +1298,6 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
 static int follow_dotdot_rcu(struct nameidata *nd)
 {
 	struct inode *inode = nd->inode;
-	if (!nd->root.mnt)
-		set_root_rcu(nd);
 
 	while (1) {
 		if (path_equal(&nd->path, &nd->root))
@@ -1415,9 +1417,6 @@ static void follow_mount(struct path *path)
 
 static int follow_dotdot(struct nameidata *nd)
 {
-	if (!nd->root.mnt)
-		set_root(nd);
-
 	while(1) {
 		struct dentry *old = nd->path.dentry;
 
@@ -1630,9 +1629,9 @@ static int lookup_slow(struct nameidata *nd, struct path *path)
 	parent = nd->path.dentry;
 	BUG_ON(nd->inode != parent->d_inode);
 
-	mutex_lock(&parent->d_inode->i_mutex);
+	inode_lock(parent->d_inode);
 	dentry = __lookup_hash(&nd->last, parent, nd->flags);
-	mutex_unlock(&parent->d_inode->i_mutex);
+	inode_unlock(parent->d_inode);
 	if (IS_ERR(dentry))
 		return PTR_ERR(dentry);
 	path->mnt = nd->path.mnt;
@@ -1655,6 +1654,8 @@ static inline int may_lookup(struct nameidata *nd)
 static inline int handle_dots(struct nameidata *nd, int type)
 {
 	if (type == LAST_DOTDOT) {
+		if (!nd->root.mnt)
+			set_root(nd);
 		if (nd->flags & LOOKUP_RCU) {
 			return follow_dotdot_rcu(nd);
 		} else
@@ -1691,8 +1692,8 @@ static int pick_link(struct nameidata *nd, struct path *link,
 
 	last = nd->stack + nd->depth++;
 	last->link = *link;
-	last->cookie = NULL;
-	last->inode = inode;
+	clear_delayed_call(&last->done);
+	nd->link_inode = inode;
 	last->seq = seq;
 	return 1;
 }
@@ -2025,18 +2026,19 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
 	}
 
 	nd->root.mnt = NULL;
+	nd->path.mnt = NULL;
+	nd->path.dentry = NULL;
 
 	nd->m_seq = read_seqbegin(&mount_lock);
 	if (*s == '/') {
-		if (flags & LOOKUP_RCU) {
+		if (flags & LOOKUP_RCU)
 			rcu_read_lock();
-			set_root_rcu(nd);
-			nd->seq = nd->root_seq;
-		} else {
-			set_root(nd);
-			path_get(&nd->root);
-		}
-		nd->path = nd->root;
+		set_root(nd);
+		if (likely(!nd_jump_root(nd)))
+			return s;
+		nd->root.mnt = NULL;
+		rcu_read_unlock();
+		return ERR_PTR(-ECHILD);
 	} else if (nd->dfd == AT_FDCWD) {
 		if (flags & LOOKUP_RCU) {
 			struct fs_struct *fs = current->fs;
@@ -2047,11 +2049,14 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
 			do {
 				seq = read_seqcount_begin(&fs->seq);
 				nd->path = fs->pwd;
+				nd->inode = nd->path.dentry->d_inode;
 				nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq);
 			} while (read_seqcount_retry(&fs->seq, seq));
 		} else {
 			get_fs_pwd(current->fs, &nd->path);
+			nd->inode = nd->path.dentry->d_inode;
 		}
+		return s;
 	} else {
 		/* Caller must check execute permissions on the starting path component */
 		struct fd f = fdget_raw(nd->dfd);
@@ -2081,16 +2086,6 @@ static const char *path_init(struct nameidata *nd, unsigned flags)
 		fdput(f);
 		return s;
 	}
-
-	nd->inode = nd->path.dentry->d_inode;
-	if (!(flags & LOOKUP_RCU))
-		return s;
-	if (likely(!read_seqcount_retry(&nd->path.dentry->d_seq, nd->seq)))
-		return s;
-	if (!(nd->flags & LOOKUP_ROOT))
-		nd->root.mnt = NULL;
-	rcu_read_unlock();
-	return ERR_PTR(-ECHILD);
 }
 
 static const char *trailing_symlink(struct nameidata *nd)
@@ -2239,10 +2234,10 @@ struct dentry *kern_path_locked(const char *name, struct path *path)
 		putname(filename);
 		return ERR_PTR(-EINVAL);
 	}
-	mutex_lock_nested(&path->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+	inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
 	d = __lookup_hash(&last, path->dentry, 0);
 	if (IS_ERR(d)) {
-		mutex_unlock(&path->dentry->d_inode->i_mutex);
+		inode_unlock(path->dentry->d_inode);
 		path_put(path);
 	}
 	putname(filename);
@@ -2283,6 +2278,8 @@ EXPORT_SYMBOL(vfs_path_lookup);
  *
  * Note that this routine is purely a helper for filesystem usage and should
  * not be called by generic code.
+ *
+ * The caller must hold base->i_mutex.
  */
 struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 {
@@ -2290,7 +2287,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 	unsigned int c;
 	int err;
 
-	WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex));
+	WARN_ON_ONCE(!inode_is_locked(base->d_inode));
 
 	this.name = name;
 	this.len = len;
@@ -2326,6 +2323,75 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
 }
 EXPORT_SYMBOL(lookup_one_len);
 
+/**
+ * lookup_one_len_unlocked - filesystem helper to lookup single pathname component
+ * @name:	pathname component to lookup
+ * @base:	base directory to lookup from
+ * @len:	length of @name in bytes
+ *
+ * Note that this routine is purely a helper for filesystem usage and should
+ * not be called by generic code.
+ *
+ * Unlike lookup_one_len, it should be called without the parent
+ * i_mutex held, and will take the i_mutex itself if necessary.
+ */
+struct dentry *lookup_one_len_unlocked(const char *name,
+				       struct dentry *base, int len)
+{
+	struct qstr this;
+	unsigned int c;
+	int err;
+	struct dentry *ret;
+
+	this.name = name;
+	this.len = len;
+	this.hash = full_name_hash(name, len);
+	if (!len)
+		return ERR_PTR(-EACCES);
+
+	if (unlikely(name[0] == '.')) {
+		if (len < 2 || (len == 2 && name[1] == '.'))
+			return ERR_PTR(-EACCES);
+	}
+
+	while (len--) {
+		c = *(const unsigned char *)name++;
+		if (c == '/' || c == '\0')
+			return ERR_PTR(-EACCES);
+	}
+	/*
+	 * See if the low-level filesystem might want
+	 * to use its own hash..
+	 */
+	if (base->d_flags & DCACHE_OP_HASH) {
+		err = base->d_op->d_hash(base, &this);
+		if (err < 0)
+			return ERR_PTR(err);
+	}
+
+	err = inode_permission(base->d_inode, MAY_EXEC);
+	if (err)
+		return ERR_PTR(err);
+
+	/*
+	 * __d_lookup() is used to try to get a quick answer and avoid the
+	 * mutex.  A false-negative does no harm.
+	 */
+	ret = __d_lookup(base, &this);
+	if (ret && unlikely(ret->d_flags & DCACHE_OP_REVALIDATE)) {
+		dput(ret);
+		ret = NULL;
+	}
+	if (ret)
+		return ret;
+
+	inode_lock(base->d_inode);
+	ret = __lookup_hash(&this, base, 0);
+	inode_unlock(base->d_inode);
+	return ret;
+}
+EXPORT_SYMBOL(lookup_one_len_unlocked);
+
 int user_path_at_empty(int dfd, const char __user *name, unsigned flags,
 		 struct path *path, int *empty)
 {
@@ -2402,7 +2468,7 @@ mountpoint_last(struct nameidata *nd, struct path *path)
 		goto done;
 	}
 
-	mutex_lock(&dir->d_inode->i_mutex);
+	inode_lock(dir->d_inode);
 	dentry = d_lookup(dir, &nd->last);
 	if (!dentry) {
 		/*
@@ -2412,16 +2478,16 @@ mountpoint_last(struct nameidata *nd, struct path *path)
 		 */
 		dentry = d_alloc(dir, &nd->last);
 		if (!dentry) {
-			mutex_unlock(&dir->d_inode->i_mutex);
+			inode_unlock(dir->d_inode);
 			return -ENOMEM;
 		}
 		dentry = lookup_real(dir->d_inode, dentry, nd->flags);
 		if (IS_ERR(dentry)) {
-			mutex_unlock(&dir->d_inode->i_mutex);
+			inode_unlock(dir->d_inode);
 			return PTR_ERR(dentry);
 		}
 	}
-	mutex_unlock(&dir->d_inode->i_mutex);
+	inode_unlock(dir->d_inode);
 
 done:
 	if (d_is_negative(dentry)) {
@@ -2611,7 +2677,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
 	struct dentry *p;
 
 	if (p1 == p2) {
-		mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
+		inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
 		return NULL;
 	}
 
@@ -2619,29 +2685,29 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2)
 
 	p = d_ancestor(p2, p1);
 	if (p) {
-		mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT);
-		mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD);
+		inode_lock_nested(p2->d_inode, I_MUTEX_PARENT);
+		inode_lock_nested(p1->d_inode, I_MUTEX_CHILD);
 		return p;
 	}
 
 	p = d_ancestor(p1, p2);
 	if (p) {
-		mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
-		mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD);
+		inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
+		inode_lock_nested(p2->d_inode, I_MUTEX_CHILD);
 		return p;
 	}
 
-	mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT);
-	mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT2);
+	inode_lock_nested(p1->d_inode, I_MUTEX_PARENT);
+	inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2);
 	return NULL;
 }
 EXPORT_SYMBOL(lock_rename);
 
 void unlock_rename(struct dentry *p1, struct dentry *p2)
 {
-	mutex_unlock(&p1->d_inode->i_mutex);
+	inode_unlock(p1->d_inode);
 	if (p1 != p2) {
-		mutex_unlock(&p2->d_inode->i_mutex);
+		inode_unlock(p2->d_inode);
 		mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex);
 	}
 }
@@ -2674,10 +2740,6 @@ static int may_open(struct path *path, int acc_mode, int flag)
 	struct inode *inode = dentry->d_inode;
 	int error;
 
-	/* O_PATH? */
-	if (!acc_mode)
-		return 0;
-
 	if (!inode)
 		return -ENOENT;
 
@@ -2699,7 +2761,7 @@ static int may_open(struct path *path, int acc_mode, int flag)
 		break;
 	}
 
-	error = inode_permission(inode, acc_mode);
+	error = inode_permission(inode, MAY_OPEN | acc_mode);
 	if (error)
 		return error;
 
@@ -2891,7 +2953,7 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
 	if (*opened & FILE_CREATED) {
 		WARN_ON(!(open_flag & O_CREAT));
 		fsnotify_create(dir, dentry);
-		acc_mode = MAY_OPEN;
+		acc_mode = 0;
 	}
 	error = may_open(&file->f_path, acc_mode, open_flag);
 	if (error)
@@ -3084,9 +3146,9 @@ retry_lookup:
 		 * dropping this one anyway.
 		 */
 	}
-	mutex_lock(&dir->d_inode->i_mutex);
+	inode_lock(dir->d_inode);
 	error = lookup_open(nd, &path, file, op, got_write, opened);
-	mutex_unlock(&dir->d_inode->i_mutex);
+	inode_unlock(dir->d_inode);
 
 	if (error <= 0) {
 		if (error)
@@ -3104,7 +3166,7 @@ retry_lookup:
 		/* Don't check for write permission, don't truncate */
 		open_flag &= ~O_TRUNC;
 		will_truncate = false;
-		acc_mode = MAY_OPEN;
+		acc_mode = 0;
 		path_to_nameidata(&path, nd);
 		goto finish_open_created;
 	}
@@ -3187,10 +3249,11 @@ finish_open:
 		got_write = true;
 	}
 finish_open_created:
-	error = may_open(&nd->path, acc_mode, open_flag);
-	if (error)
-		goto out;
-
+	if (likely(!(open_flag & O_PATH))) {
+		error = may_open(&nd->path, acc_mode, open_flag);
+		if (error)
+			goto out;
+	}
 	BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */
 	error = vfs_open(&nd->path, file, current_cred());
 	if (!error) {
@@ -3281,7 +3344,7 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags,
 		goto out2;
 	audit_inode(nd->name, child, 0);
 	/* Don't check for other permissions, the inode was just created */
-	error = may_open(&path, MAY_OPEN, op->open_flag);
+	error = may_open(&path, 0, op->open_flag);
 	if (error)
 		goto out2;
 	file->f_path.mnt = path.mnt;
@@ -3434,7 +3497,7 @@ static struct dentry *filename_create(int dfd, struct filename *name,
 	 * Do the final lookup.
 	 */
 	lookup_flags |= LOOKUP_CREATE | LOOKUP_EXCL;
-	mutex_lock_nested(&path->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+	inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT);
 	dentry = __lookup_hash(&last, path->dentry, lookup_flags);
 	if (IS_ERR(dentry))
 		goto unlock;
@@ -3463,7 +3526,7 @@ fail:
 	dput(dentry);
 	dentry = ERR_PTR(error);
 unlock:
-	mutex_unlock(&path->dentry->d_inode->i_mutex);
+	inode_unlock(path->dentry->d_inode);
 	if (!err2)
 		mnt_drop_write(path->mnt);
 out:
@@ -3483,7 +3546,7 @@ EXPORT_SYMBOL(kern_path_create);
 void done_path_create(struct path *path, struct dentry *dentry)
 {
 	dput(dentry);
-	mutex_unlock(&path->dentry->d_inode->i_mutex);
+	inode_unlock(path->dentry->d_inode);
 	mnt_drop_write(path->mnt);
 	path_put(path);
 }
@@ -3680,7 +3743,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
 		return -EPERM;
 
 	dget(dentry);
-	mutex_lock(&dentry->d_inode->i_mutex);
+	inode_lock(dentry->d_inode);
 
 	error = -EBUSY;
 	if (is_local_mountpoint(dentry))
@@ -3700,7 +3763,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry)
 	detach_mounts(dentry);
 
 out:
-	mutex_unlock(&dentry->d_inode->i_mutex);
+	inode_unlock(dentry->d_inode);
 	dput(dentry);
 	if (!error)
 		d_delete(dentry);
@@ -3739,7 +3802,7 @@ retry:
 	if (error)
 		goto exit1;
 
-	mutex_lock_nested(&path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+	inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
 	dentry = __lookup_hash(&last, path.dentry, lookup_flags);
 	error = PTR_ERR(dentry);
 	if (IS_ERR(dentry))
@@ -3755,7 +3818,7 @@ retry:
 exit3:
 	dput(dentry);
 exit2:
-	mutex_unlock(&path.dentry->d_inode->i_mutex);
+	inode_unlock(path.dentry->d_inode);
 	mnt_drop_write(path.mnt);
 exit1:
 	path_put(&path);
@@ -3801,7 +3864,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegate
 	if (!dir->i_op->unlink)
 		return -EPERM;
 
-	mutex_lock(&target->i_mutex);
+	inode_lock(target);
 	if (is_local_mountpoint(dentry))
 		error = -EBUSY;
 	else {
@@ -3818,7 +3881,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegate
 		}
 	}
 out:
-	mutex_unlock(&target->i_mutex);
+	inode_unlock(target);
 
 	/* We don't d_delete() NFS sillyrenamed files--they still exist. */
 	if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) {
@@ -3861,7 +3924,7 @@ retry:
 	if (error)
 		goto exit1;
 retry_deleg:
-	mutex_lock_nested(&path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+	inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT);
 	dentry = __lookup_hash(&last, path.dentry, lookup_flags);
 	error = PTR_ERR(dentry);
 	if (!IS_ERR(dentry)) {
@@ -3879,7 +3942,7 @@ retry_deleg:
 exit2:
 		dput(dentry);
 	}
-	mutex_unlock(&path.dentry->d_inode->i_mutex);
+	inode_unlock(path.dentry->d_inode);
 	if (inode)
 		iput(inode);	/* truncate the inode here */
 	inode = NULL;
@@ -4031,7 +4094,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
 	if (error)
 		return error;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 	/* Make sure we don't allow creating hardlink to an unlinked file */
 	if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE))
 		error =  -ENOENT;
@@ -4048,7 +4111,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
 		inode->i_state &= ~I_LINKABLE;
 		spin_unlock(&inode->i_lock);
 	}
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	if (!error)
 		fsnotify_link(dir, inode, new_dentry);
 	return error;
@@ -4248,7 +4311,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	if (!is_dir || (flags & RENAME_EXCHANGE))
 		lock_two_nondirectories(source, target);
 	else if (target)
-		mutex_lock(&target->i_mutex);
+		inode_lock(target);
 
 	error = -EBUSY;
 	if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry))
@@ -4301,7 +4364,7 @@ out:
 	if (!is_dir || (flags & RENAME_EXCHANGE))
 		unlock_two_nondirectories(source, target);
 	else if (target)
-		mutex_unlock(&target->i_mutex);
+		inode_unlock(target);
 	dput(new_dentry);
 	if (!error) {
 		fsnotify_move(old_dir, new_dir, old_name, is_dir,
@@ -4503,72 +4566,73 @@ EXPORT_SYMBOL(readlink_copy);
 
 /*
  * A helper for ->readlink().  This should be used *ONLY* for symlinks that
- * have ->follow_link() touching nd only in nd_set_link().  Using (or not
- * using) it for any given inode is up to filesystem.
+ * have ->get_link() not calling nd_jump_link().  Using (or not using) it
+ * for any given inode is up to filesystem.
  */
 int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
 {
-	void *cookie;
+	DEFINE_DELAYED_CALL(done);
 	struct inode *inode = d_inode(dentry);
 	const char *link = inode->i_link;
 	int res;
 
 	if (!link) {
-		link = inode->i_op->follow_link(dentry, &cookie);
+		link = inode->i_op->get_link(dentry, inode, &done);
 		if (IS_ERR(link))
 			return PTR_ERR(link);
 	}
 	res = readlink_copy(buffer, buflen, link);
-	if (inode->i_op->put_link)
-		inode->i_op->put_link(inode, cookie);
+	do_delayed_call(&done);
 	return res;
 }
 EXPORT_SYMBOL(generic_readlink);
 
 /* get the link contents into pagecache */
-static char *page_getlink(struct dentry * dentry, struct page **ppage)
+const char *page_get_link(struct dentry *dentry, struct inode *inode,
+			  struct delayed_call *callback)
 {
 	char *kaddr;
 	struct page *page;
-	struct address_space *mapping = dentry->d_inode->i_mapping;
-	page = read_mapping_page(mapping, 0, NULL);
-	if (IS_ERR(page))
-		return (char*)page;
-	*ppage = page;
-	kaddr = kmap(page);
-	nd_terminate_link(kaddr, dentry->d_inode->i_size, PAGE_SIZE - 1);
+	struct address_space *mapping = inode->i_mapping;
+
+	if (!dentry) {
+		page = find_get_page(mapping, 0);
+		if (!page)
+			return ERR_PTR(-ECHILD);
+		if (!PageUptodate(page)) {
+			put_page(page);
+			return ERR_PTR(-ECHILD);
+		}
+	} else {
+		page = read_mapping_page(mapping, 0, NULL);
+		if (IS_ERR(page))
+			return (char*)page;
+	}
+	set_delayed_call(callback, page_put_link, page);
+	BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM);
+	kaddr = page_address(page);
+	nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1);
 	return kaddr;
 }
 
-int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
-{
-	struct page *page = NULL;
-	int res = readlink_copy(buffer, buflen, page_getlink(dentry, &page));
-	if (page) {
-		kunmap(page);
-		page_cache_release(page);
-	}
-	return res;
-}
-EXPORT_SYMBOL(page_readlink);
+EXPORT_SYMBOL(page_get_link);
 
-const char *page_follow_link_light(struct dentry *dentry, void **cookie)
+void page_put_link(void *arg)
 {
-	struct page *page = NULL;
-	char *res = page_getlink(dentry, &page);
-	if (!IS_ERR(res))
-		*cookie = page;
-	return res;
+	put_page(arg);
 }
-EXPORT_SYMBOL(page_follow_link_light);
+EXPORT_SYMBOL(page_put_link);
 
-void page_put_link(struct inode *unused, void *cookie)
+int page_readlink(struct dentry *dentry, char __user *buffer, int buflen)
 {
-	struct page *page = cookie;
-	kunmap(page);
-	page_cache_release(page);
+	DEFINE_DELAYED_CALL(done);
+	int res = readlink_copy(buffer, buflen,
+				page_get_link(dentry, d_inode(dentry),
+					      &done));
+	do_delayed_call(&done);
+	return res;
 }
-EXPORT_SYMBOL(page_put_link);
+EXPORT_SYMBOL(page_readlink);
 
 /*
  * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS
@@ -4579,7 +4643,6 @@ int __page_symlink(struct inode *inode, const char *symname, int len, int nofs)
 	struct page *page;
 	void *fsdata;
 	int err;
-	char *kaddr;
 	unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE;
 	if (nofs)
 		flags |= AOP_FLAG_NOFS;
@@ -4590,9 +4653,7 @@ retry:
 	if (err)
 		goto fail;
 
-	kaddr = kmap_atomic(page);
-	memcpy(kaddr, symname, len-1);
-	kunmap_atomic(kaddr);
+	memcpy(page_address(page), symname, len-1);
 
 	err = pagecache_write_end(NULL, mapping, 0, len-1, len-1,
 							page, fsdata);
@@ -4617,7 +4678,6 @@ EXPORT_SYMBOL(page_symlink);
 
 const struct inode_operations page_symlink_inode_operations = {
 	.readlink	= generic_readlink,
-	.follow_link	= page_follow_link_light,
-	.put_link	= page_put_link,
+	.get_link	= page_get_link,
 };
 EXPORT_SYMBOL(page_symlink_inode_operations);
diff --git a/fs/namespace.c b/fs/namespace.c
index fc5002819..4fb1691b4 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -463,7 +463,6 @@ void __mnt_drop_write(struct vfsmount *mnt)
 	mnt_dec_writers(real_mount(mnt));
 	preempt_enable();
 }
-EXPORT_SYMBOL_GPL(__mnt_drop_write);
 
 /**
  * mnt_drop_write - give up write access to a mount
@@ -1585,6 +1584,14 @@ static inline bool may_mount(void)
 	return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN);
 }
 
+static inline bool may_mandlock(void)
+{
+#ifndef	CONFIG_MANDATORY_FILE_LOCKING
+	return false;
+#endif
+	return capable(CAP_SYS_ADMIN);
+}
+
 /*
  * Now umount can handle mount points as well as block devices.
  * This is important for filesystems which use unnamed block devices.
@@ -1804,7 +1811,6 @@ int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg,
 	}
 	return 0;
 }
-EXPORT_SYMBOL_GPL(iterate_mounts);
 
 static void cleanup_group_ids(struct mount *mnt, struct mount *end)
 {
@@ -1955,9 +1961,9 @@ static struct mountpoint *lock_mount(struct path *path)
 	struct vfsmount *mnt;
 	struct dentry *dentry = path->dentry;
 retry:
-	mutex_lock(&dentry->d_inode->i_mutex);
+	inode_lock(dentry->d_inode);
 	if (unlikely(cant_mount(dentry))) {
-		mutex_unlock(&dentry->d_inode->i_mutex);
+		inode_unlock(dentry->d_inode);
 		return ERR_PTR(-ENOENT);
 	}
 	namespace_lock();
@@ -1968,13 +1974,13 @@ retry:
 			mp = new_mountpoint(dentry);
 		if (IS_ERR(mp)) {
 			namespace_unlock();
-			mutex_unlock(&dentry->d_inode->i_mutex);
+			inode_unlock(dentry->d_inode);
 			return mp;
 		}
 		return mp;
 	}
 	namespace_unlock();
-	mutex_unlock(&path->dentry->d_inode->i_mutex);
+	inode_unlock(path->dentry->d_inode);
 	path_put(path);
 	path->mnt = mnt;
 	dentry = path->dentry = dget(mnt->mnt_root);
@@ -1986,7 +1992,7 @@ static void unlock_mount(struct mountpoint *where)
 	struct dentry *dentry = where->m_dentry;
 	put_mountpoint(where);
 	namespace_unlock();
-	mutex_unlock(&dentry->d_inode->i_mutex);
+	inode_unlock(dentry->d_inode);
 }
 
 static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp)
@@ -2603,18 +2609,18 @@ static long exact_copy_from_user(void *to, const void __user * from,
 	return n;
 }
 
-int copy_mount_options(const void __user * data, unsigned long *where)
+void *copy_mount_options(const void __user * data)
 {
 	int i;
-	unsigned long page;
 	unsigned long size;
+	char *copy;
 
-	*where = 0;
 	if (!data)
-		return 0;
+		return NULL;
 
-	if (!(page = __get_free_page(GFP_KERNEL)))
-		return -ENOMEM;
+	copy = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	if (!copy)
+		return ERR_PTR(-ENOMEM);
 
 	/* We only care that *some* data at the address the user
 	 * gave us is valid.  Just in case, we'll zero
@@ -2625,15 +2631,14 @@ int copy_mount_options(const void __user * data, unsigned long *where)
 	if (size > PAGE_SIZE)
 		size = PAGE_SIZE;
 
-	i = size - exact_copy_from_user((void *)page, data, size);
+	i = size - exact_copy_from_user(copy, data, size);
 	if (!i) {
-		free_page(page);
-		return -EFAULT;
+		kfree(copy);
+		return ERR_PTR(-EFAULT);
 	}
 	if (i != PAGE_SIZE)
-		memset((char *)page + i, 0, PAGE_SIZE - i);
-	*where = page;
-	return 0;
+		memset(copy + i, 0, PAGE_SIZE - i);
+	return copy;
 }
 
 char *copy_mount_string(const void __user *data)
@@ -2679,6 +2684,8 @@ long do_mount(const char *dev_name, const char __user *dir_name,
 				   type_page, flags, data_page);
 	if (!retval && !may_mount())
 		retval = -EPERM;
+	if (!retval && (flags & MS_MANDLOCK) && !may_mandlock())
+		retval = -EPERM;
 	if (retval)
 		goto dput_out;
 
@@ -2898,7 +2905,7 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
 	int ret;
 	char *kernel_type;
 	char *kernel_dev;
-	unsigned long data_page;
+	void *options;
 
 	kernel_type = copy_mount_string(type);
 	ret = PTR_ERR(kernel_type);
@@ -2910,14 +2917,14 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name,
 	if (IS_ERR(kernel_dev))
 		goto out_dev;
 
-	ret = copy_mount_options(data, &data_page);
-	if (ret < 0)
+	options = copy_mount_options(data);
+	ret = PTR_ERR(options);
+	if (IS_ERR(options))
 		goto out_data;
 
-	ret = do_mount(kernel_dev, dir_name, kernel_type, flags,
-		(void *) data_page);
+	ret = do_mount(kernel_dev, dir_name, kernel_type, flags, options);
 
-	free_page(data_page);
+	kfree(options);
 out_data:
 	kfree(kernel_dev);
 out_dev:
@@ -2941,9 +2948,9 @@ bool is_path_reachable(struct mount *mnt, struct dentry *dentry,
 	return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry);
 }
 
-int path_is_under(struct path *path1, struct path *path2)
+bool path_is_under(struct path *path1, struct path *path2)
 {
-	int res;
+	bool res;
 	read_seqlock_excl(&mount_lock);
 	res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2);
 	read_sequnlock_excl(&mount_lock);
diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c
index f0e3e9e74..b7f8eaeea 100644
--- a/fs/ncpfs/dir.c
+++ b/fs/ncpfs/dir.c
@@ -369,7 +369,7 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags)
 	if (!res) {
 		struct inode *inode = d_inode(dentry);
 
-		mutex_lock(&inode->i_mutex);
+		inode_lock(inode);
 		if (finfo.i.dirEntNum == NCP_FINFO(inode)->dirEntNum) {
 			ncp_new_dentry(dentry);
 			val=1;
@@ -377,7 +377,7 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags)
 			ncp_dbg(2, "found, but dirEntNum changed\n");
 
 		ncp_update_inode2(inode, &finfo);
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 	}
 
 finished:
@@ -633,15 +633,15 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx,
 				d_rehash(newdent);
 		} else {
 			spin_lock(&dentry->d_lock);
-			NCP_FINFO(inode)->flags &= ~NCPI_DIR_CACHE;
+			NCP_FINFO(dir)->flags &= ~NCPI_DIR_CACHE;
 			spin_unlock(&dentry->d_lock);
 		}
 	} else {
 		struct inode *inode = d_inode(newdent);
 
-		mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
+		inode_lock_nested(inode, I_MUTEX_CHILD);
 		ncp_update_inode2(inode, entry);
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 	}
 
 	if (ctl.idx >= NCP_DIRCACHE_SIZE) {
diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c
index 011324ce9..dd38ca1f2 100644
--- a/fs/ncpfs/file.c
+++ b/fs/ncpfs/file.c
@@ -224,10 +224,10 @@ ncp_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	iocb->ki_pos = pos;
 
 	if (pos > i_size_read(inode)) {
-		mutex_lock(&inode->i_mutex);
+		inode_lock(inode);
 		if (pos > i_size_read(inode))
 			i_size_write(inode, pos);
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 	}
 	ncp_dbg(1, "exit %pD2\n", file);
 outrel:
diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c
index 9605a2f63..1af15fcbe 100644
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -82,7 +82,7 @@ static int init_inodecache(void)
 	ncp_inode_cachep = kmem_cache_create("ncp_inode_cache",
 					     sizeof(struct ncp_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (ncp_inode_cachep == NULL)
 		return -ENOMEM;
@@ -244,8 +244,7 @@ static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo)
 #if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS)
 static const struct inode_operations ncp_symlink_inode_operations = {
 	.readlink	= generic_readlink,
-	.follow_link	= page_follow_link_light,
-	.put_link	= page_put_link,
+	.get_link	= page_get_link,
 	.setattr	= ncp_notify_change,
 };
 #endif
@@ -283,6 +282,7 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info)
 #if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS)
 		} else if (S_ISLNK(inode->i_mode)) {
 			inode->i_op = &ncp_symlink_inode_operations;
+			inode_nohighmem(inode);
 			inode->i_data.a_ops = &ncp_symlink_aops;
 #endif
 		} else {
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
index c59a59c37..35ab51c04 100644
--- a/fs/nfs/blocklayout/extent_tree.c
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -476,6 +476,7 @@ static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
 
 		for (i = 0; i < nr_pages; i++)
 			put_page(arg->layoutupdate_pages[i]);
+		vfree(arg->start_p);
 		kfree(arg->layoutupdate_pages);
 	} else {
 		put_page(arg->layoutupdate_page);
@@ -559,10 +560,15 @@ retry:
 
 	if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
 		void *p = start_p, *end = p + arg->layoutupdate_len;
+		struct page *page = NULL;
 		int i = 0;
 
-		for ( ; p < end; p += PAGE_SIZE)
-			arg->layoutupdate_pages[i++] = vmalloc_to_page(p);
+		arg->start_p = start_p;
+		for ( ; p < end; p += PAGE_SIZE) {
+			page = vmalloc_to_page(p);
+			arg->layoutupdate_pages[i++] = page;
+			get_page(page);
+		}
 	}
 
 	dprintk("%s found %zu ranges\n", __func__, count);
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 807eb6ef4..f0939d097 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -83,8 +83,11 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
 
 	res = htonl(NFS4ERR_BADHANDLE);
 	inode = nfs_delegation_find_inode(cps->clp, &args->fh);
-	if (inode == NULL)
+	if (inode == NULL) {
+		trace_nfs4_cb_recall(cps->clp, &args->fh, NULL,
+				&args->stateid, -ntohl(res));
 		goto out;
+	}
 	/* Set up a helper thread to actually return the delegation */
 	switch (nfs_async_inode_return_delegation(inode, &args->stateid)) {
 	case 0:
@@ -96,7 +99,8 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy,
 	default:
 		res = htonl(NFS4ERR_RESOURCE);
 	}
-	trace_nfs4_recall_delegation(inode, -ntohl(res));
+	trace_nfs4_cb_recall(cps->clp, &args->fh, inode,
+			&args->stateid, -ntohl(res));
 	iput(inode);
 out:
 	dprintk("%s: exit with status = %d\n", __func__, ntohl(res));
@@ -160,6 +164,22 @@ static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp,
 	return lo;
 }
 
+/*
+ * Enforce RFC5661 section 12.5.5.2.1. (Layout Recall and Return Sequencing)
+ */
+static bool pnfs_check_stateid_sequence(struct pnfs_layout_hdr *lo,
+					const nfs4_stateid *new)
+{
+	u32 oldseq, newseq;
+
+	oldseq = be32_to_cpu(lo->plh_stateid.seqid);
+	newseq = be32_to_cpu(new->seqid);
+
+	if (newseq > oldseq + 1)
+		return false;
+	return true;
+}
+
 static u32 initiate_file_draining(struct nfs_client *clp,
 				  struct cb_layoutrecallargs *args)
 {
@@ -169,34 +189,52 @@ static u32 initiate_file_draining(struct nfs_client *clp,
 	LIST_HEAD(free_me_list);
 
 	lo = get_layout_by_fh(clp, &args->cbl_fh, &args->cbl_stateid);
-	if (!lo)
+	if (!lo) {
+		trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, NULL,
+				&args->cbl_stateid, -rv);
 		goto out;
+	}
 
 	ino = lo->plh_inode;
 
 	spin_lock(&ino->i_lock);
+	if (!pnfs_check_stateid_sequence(lo, &args->cbl_stateid)) {
+		rv = NFS4ERR_DELAY;
+		goto unlock;
+	}
 	pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
 	spin_unlock(&ino->i_lock);
 
 	pnfs_layoutcommit_inode(ino, false);
 
 	spin_lock(&ino->i_lock);
-	if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) ||
-	    pnfs_mark_matching_lsegs_invalid(lo, &free_me_list,
-					&args->cbl_range)) {
+	/*
+	 * Enforce RFC5661 Section 12.5.5.2.1.5 (Bulk Recall and Return)
+	 */
+	if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
 		rv = NFS4ERR_DELAY;
 		goto unlock;
 	}
 
+	if (pnfs_mark_matching_lsegs_return(lo, &free_me_list,
+					&args->cbl_range)) {
+		rv = NFS4_OK;
+		goto unlock;
+	}
+
 	if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) {
 		NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo,
 			&args->cbl_range);
 	}
+	pnfs_mark_layout_returned_if_empty(lo);
 unlock:
 	spin_unlock(&ino->i_lock);
 	pnfs_free_lseg_list(&free_me_list);
+	/* Free all lsegs that are attached to commit buckets */
+	nfs_commit_inode(ino, 0);
 	pnfs_put_layout_hdr(lo);
-	trace_nfs4_cb_layoutrecall_inode(clp, &args->cbl_fh, ino, -rv);
+	trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, ino,
+			&args->cbl_stateid, -rv);
 	iput(ino);
 out:
 	return rv;
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index ce5a21861..9cce67043 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -940,7 +940,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
 	dfprintk(FILE, "NFS: llseek dir(%pD2, %lld, %d)\n",
 			filp, offset, whence);
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 	switch (whence) {
 		case 1:
 			offset += filp->f_pos;
@@ -957,7 +957,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
 		dir_ctx->duped = 0;
 	}
 out:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	return offset;
 }
 
@@ -972,9 +972,9 @@ static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end,
 
 	dfprintk(FILE, "NFS: fsync dir(%pD2) datasync %d\n", filp, datasync);
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 	nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	return 0;
 }
 
@@ -1894,15 +1894,14 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
 	attr.ia_mode = S_IFLNK | S_IRWXUGO;
 	attr.ia_valid = ATTR_MODE;
 
-	page = alloc_page(GFP_HIGHUSER);
+	page = alloc_page(GFP_USER);
 	if (!page)
 		return -ENOMEM;
 
-	kaddr = kmap_atomic(page);
+	kaddr = page_address(page);
 	memcpy(kaddr, symname, pathlen);
 	if (pathlen < PAGE_SIZE)
 		memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen);
-	kunmap_atomic(kaddr);
 
 	trace_nfs_symlink_enter(dir, dentry);
 	error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr);
@@ -2432,6 +2431,20 @@ int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags)
 }
 EXPORT_SYMBOL_GPL(nfs_may_open);
 
+static int nfs_execute_ok(struct inode *inode, int mask)
+{
+	struct nfs_server *server = NFS_SERVER(inode);
+	int ret;
+
+	if (mask & MAY_NOT_BLOCK)
+		ret = nfs_revalidate_inode_rcu(server, inode);
+	else
+		ret = nfs_revalidate_inode(server, inode);
+	if (ret == 0 && !execute_ok(inode))
+		ret = -EACCES;
+	return ret;
+}
+
 int nfs_permission(struct inode *inode, int mask)
 {
 	struct rpc_cred *cred;
@@ -2449,6 +2462,9 @@ int nfs_permission(struct inode *inode, int mask)
 		case S_IFLNK:
 			goto out;
 		case S_IFREG:
+			if ((mask & MAY_OPEN) &&
+			   nfs_server_capable(inode, NFS_CAP_ATOMIC_OPEN))
+				return 0;
 			break;
 		case S_IFDIR:
 			/*
@@ -2481,8 +2497,8 @@ force_lookup:
 			res = PTR_ERR(cred);
 	}
 out:
-	if (!res && (mask & MAY_EXEC) && !execute_ok(inode))
-		res = -EACCES;
+	if (!res && (mask & MAY_EXEC))
+		res = nfs_execute_ok(inode, mask);
 
 	dfprintk(VFS, "NFS: permission(%s/%lu), mask=0x%x, res=%d\n",
 		inode->i_sb->s_id, inode->i_ino, mask, res);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 4b1d08f56..7a0cfd326 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -117,12 +117,6 @@ static inline int put_dreq(struct nfs_direct_req *dreq)
 	return atomic_dec_and_test(&dreq->io_count);
 }
 
-void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq)
-{
-	dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
-}
-EXPORT_SYMBOL_GPL(nfs_direct_set_resched_writes);
-
 static void
 nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr)
 {
@@ -586,7 +580,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
 	if (!count)
 		goto out;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 	result = nfs_sync_mapping(mapping);
 	if (result)
 		goto out_unlock;
@@ -614,7 +608,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
 	NFS_I(inode)->read_io += count;
 	result = nfs_direct_read_schedule_iovec(dreq, iter, pos);
 
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 
 	if (!result) {
 		result = nfs_direct_wait(dreq);
@@ -628,7 +622,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
 out_release:
 	nfs_direct_req_release(dreq);
 out_unlock:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 out:
 	return result;
 }
@@ -670,6 +664,10 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 
 	req = nfs_list_entry(reqs.next);
 	nfs_direct_setup_mirroring(dreq, &desc, req);
+	if (desc.pg_error < 0) {
+		list_splice_init(&reqs, &failed);
+		goto out_failed;
+	}
 
 	list_for_each_entry_safe(req, tmp, &reqs, wb_list) {
 		if (!nfs_pageio_add_request(&desc, req)) {
@@ -677,13 +675,17 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
 			nfs_list_add_request(req, &failed);
 			spin_lock(cinfo.lock);
 			dreq->flags = 0;
-			dreq->error = -EIO;
+			if (desc.pg_error < 0)
+				dreq->error = desc.pg_error;
+			else
+				dreq->error = -EIO;
 			spin_unlock(cinfo.lock);
 		}
 		nfs_release_request(req);
 	}
 	nfs_pageio_complete(&desc);
 
+out_failed:
 	while (!list_empty(&failed)) {
 		req = nfs_list_entry(failed.next);
 		nfs_list_remove_request(req);
@@ -727,14 +729,20 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data)
 		nfs_direct_write_complete(dreq, data->inode);
 }
 
-static void nfs_direct_error_cleanup(struct nfs_inode *nfsi)
+static void nfs_direct_resched_write(struct nfs_commit_info *cinfo,
+		struct nfs_page *req)
 {
-	/* There is no lock to clear */
+	struct nfs_direct_req *dreq = cinfo->dreq;
+
+	spin_lock(&dreq->lock);
+	dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+	spin_unlock(&dreq->lock);
+	nfs_mark_request_commit(req, NULL, cinfo, 0);
 }
 
 static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops = {
 	.completion = nfs_direct_commit_complete,
-	.error_cleanup = nfs_direct_error_cleanup,
+	.resched_write = nfs_direct_resched_write,
 };
 
 static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
@@ -839,10 +847,25 @@ static void nfs_write_sync_pgio_error(struct list_head *head)
 	}
 }
 
+static void nfs_direct_write_reschedule_io(struct nfs_pgio_header *hdr)
+{
+	struct nfs_direct_req *dreq = hdr->dreq;
+
+	spin_lock(&dreq->lock);
+	if (dreq->error == 0) {
+		dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+		/* fake unstable write to let common nfs resend pages */
+		hdr->verf.committed = NFS_UNSTABLE;
+		hdr->good_bytes = hdr->args.count;
+	}
+	spin_unlock(&dreq->lock);
+}
+
 static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = {
 	.error_cleanup = nfs_write_sync_pgio_error,
 	.init_hdr = nfs_direct_pgio_init,
 	.completion = nfs_direct_write_completion,
+	.reschedule_io = nfs_direct_write_reschedule_io,
 };
 
 
@@ -900,6 +923,11 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
 			}
 
 			nfs_direct_setup_mirroring(dreq, &desc, req);
+			if (desc.pg_error < 0) {
+				nfs_free_request(req);
+				result = desc.pg_error;
+				break;
+			}
 
 			nfs_lock_request(req);
 			req->wb_index = pos >> PAGE_SHIFT;
@@ -977,7 +1005,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
 	pos = iocb->ki_pos;
 	end = (pos + iov_iter_count(iter) - 1) >> PAGE_CACHE_SHIFT;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	result = nfs_sync_mapping(mapping);
 	if (result)
@@ -1017,7 +1045,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
 					      pos >> PAGE_CACHE_SHIFT, end);
 	}
 
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 
 	if (!result) {
 		result = nfs_direct_wait(dreq);
@@ -1038,7 +1066,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
 out_release:
 	nfs_direct_req_release(dreq);
 out_unlock:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	return result;
 }
 
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 93e236429..748bb813b 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -278,9 +278,9 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 		ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
 		if (ret != 0)
 			break;
-		mutex_lock(&inode->i_mutex);
+		inode_lock(inode);
 		ret = nfs_file_fsync_commit(file, start, end, datasync);
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 		/*
 		 * If nfs_file_fsync_commit detected a server reboot, then
 		 * resend all dirty pages that might have been covered by
@@ -514,7 +514,7 @@ static void nfs_check_dirty_writeback(struct page *page,
 	 * so it will not block due to pages that will shortly be freeable.
 	 */
 	nfsi = NFS_I(mapping->host);
-	if (test_bit(NFS_INO_COMMIT, &nfsi->flags)) {
+	if (atomic_read(&nfsi->commit_info.rpcs_out)) {
 		*writeback = true;
 		return;
 	}
@@ -545,7 +545,7 @@ static int nfs_launder_page(struct page *page)
 		inode->i_ino, (long long)page_offset(page));
 
 	nfs_fscache_wait_on_page_write(nfsi, page);
-	return nfs_wb_page(inode, page);
+	return nfs_wb_launder_page(inode, page);
 }
 
 static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file,
@@ -756,7 +756,7 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
 
 	l_ctx = nfs_get_lock_context(nfs_file_open_context(filp));
 	if (!IS_ERR(l_ctx)) {
-		status = nfs_iocounter_wait(&l_ctx->io_count);
+		status = nfs_iocounter_wait(l_ctx);
 		nfs_put_lock_context(l_ctx);
 		if (status < 0)
 			return status;
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 02ec07973..3384dc8e6 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -202,6 +202,7 @@ static int filelayout_async_handle_error(struct rpc_task *task,
 			task->tk_status);
 		nfs4_mark_deviceid_unavailable(devid);
 		pnfs_error_mark_layout_for_return(inode, lseg);
+		pnfs_set_lo_fail(lseg);
 		rpc_wake_up(&tbl->slot_tbl_waitq);
 		/* fall through */
 	default:
@@ -883,13 +884,19 @@ static void
 filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio,
 			struct nfs_page *req)
 {
-	if (!pgio->pg_lseg)
+	if (!pgio->pg_lseg) {
 		pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
 					   req->wb_context,
 					   0,
 					   NFS4_MAX_UINT64,
 					   IOMODE_READ,
 					   GFP_KERNEL);
+		if (IS_ERR(pgio->pg_lseg)) {
+			pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+			pgio->pg_lseg = NULL;
+			return;
+		}
+	}
 	/* If no lseg, fall back to read through mds */
 	if (pgio->pg_lseg == NULL)
 		nfs_pageio_reset_read_mds(pgio);
@@ -902,13 +909,20 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
 	struct nfs_commit_info cinfo;
 	int status;
 
-	if (!pgio->pg_lseg)
+	if (!pgio->pg_lseg) {
 		pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
 					   req->wb_context,
 					   0,
 					   NFS4_MAX_UINT64,
 					   IOMODE_RW,
 					   GFP_NOFS);
+		if (IS_ERR(pgio->pg_lseg)) {
+			pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+			pgio->pg_lseg = NULL;
+			return;
+		}
+	}
+
 	/* If no lseg, fall back to write through mds */
 	if (pgio->pg_lseg == NULL)
 		goto out_mds;
@@ -957,7 +971,7 @@ filelayout_mark_request_commit(struct nfs_page *req,
 	u32 i, j;
 
 	if (fl->commit_through_mds) {
-		nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo);
+		nfs_request_add_commit_list(req, cinfo);
 	} else {
 		/* Note that we are calling nfs4_fl_calc_j_index on each page
 		 * that ends up being committed to a data server.  An attractive
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 2a2e2d8dd..0cb1abd53 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -505,9 +505,17 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
 	}
 
 	p = xdr_inline_decode(&stream, 4);
-	if (p)
-		fls->flags = be32_to_cpup(p);
+	if (!p)
+		goto out_sort_mirrors;
+	fls->flags = be32_to_cpup(p);
+
+	p = xdr_inline_decode(&stream, 4);
+	if (!p)
+		goto out_sort_mirrors;
+	for (i=0; i < fls->mirror_array_cnt; i++)
+		fls->mirror_array[i]->report_interval = be32_to_cpup(p);
 
+out_sort_mirrors:
 	ff_layout_sort_mirrors(fls);
 	rc = ff_layout_check_layout(lgr);
 	if (rc)
@@ -603,7 +611,9 @@ nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror,
 		mirror->start_time = now;
 	if (ktime_equal(mirror->last_report_time, notime))
 		mirror->last_report_time = now;
-	if (layoutstats_timer != 0)
+	if (mirror->report_interval != 0)
+		report_interval = (s64)mirror->report_interval * 1000LL;
+	else if (layoutstats_timer != 0)
 		report_interval = (s64)layoutstats_timer * 1000LL;
 	if (ktime_to_ms(ktime_sub(now, mirror->last_report_time)) >=
 			report_interval) {
@@ -785,13 +795,19 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
 	int ds_idx;
 
 	/* Use full layout for now */
-	if (!pgio->pg_lseg)
+	if (!pgio->pg_lseg) {
 		pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
 						   req->wb_context,
 						   0,
 						   NFS4_MAX_UINT64,
 						   IOMODE_READ,
 						   GFP_KERNEL);
+		if (IS_ERR(pgio->pg_lseg)) {
+			pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+			pgio->pg_lseg = NULL;
+			return;
+		}
+	}
 	/* If no lseg, fall back to read through mds */
 	if (pgio->pg_lseg == NULL)
 		goto out_mds;
@@ -825,13 +841,19 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
 	int i;
 	int status;
 
-	if (!pgio->pg_lseg)
+	if (!pgio->pg_lseg) {
 		pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
 						   req->wb_context,
 						   0,
 						   NFS4_MAX_UINT64,
 						   IOMODE_RW,
 						   GFP_NOFS);
+		if (IS_ERR(pgio->pg_lseg)) {
+			pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+			pgio->pg_lseg = NULL;
+			return;
+		}
+	}
 	/* If no lseg, fall back to write through mds */
 	if (pgio->pg_lseg == NULL)
 		goto out_mds;
@@ -867,18 +889,25 @@ static unsigned int
 ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio,
 				    struct nfs_page *req)
 {
-	if (!pgio->pg_lseg)
+	if (!pgio->pg_lseg) {
 		pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
 						   req->wb_context,
 						   0,
 						   NFS4_MAX_UINT64,
 						   IOMODE_RW,
 						   GFP_NOFS);
+		if (IS_ERR(pgio->pg_lseg)) {
+			pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+			pgio->pg_lseg = NULL;
+			goto out;
+		}
+	}
 	if (pgio->pg_lseg)
 		return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg);
 
 	/* no lseg means that pnfs is not in use, so no mirroring here */
 	nfs_pageio_reset_write_mds(pgio);
+out:
 	return 1;
 }
 
@@ -912,18 +941,7 @@ static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs)
 			hdr->args.count,
 			(unsigned long long)hdr->args.offset);
 
-		if (!hdr->dreq) {
-			struct nfs_open_context *ctx;
-
-			ctx = nfs_list_entry(hdr->pages.next)->wb_context;
-			set_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags);
-			hdr->completion_ops->error_cleanup(&hdr->pages);
-		} else {
-			nfs_direct_set_resched_writes(hdr->dreq);
-			/* fake unstable write to let common nfs resend pages */
-			hdr->verf.committed = NFS_UNSTABLE;
-			hdr->good_bytes = hdr->args.count;
-		}
+		hdr->completion_ops->reschedule_io(hdr);
 		return;
 	}
 
@@ -1101,7 +1119,7 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task,
 	return -NFS4ERR_RESET_TO_PNFS;
 out_retry:
 	task->tk_status = 0;
-	rpc_restart_call(task);
+	rpc_restart_call_prepare(task);
 	rpc_delay(task, NFS_JUKEBOX_RETRY_TIME);
 	return -EAGAIN;
 }
@@ -1159,6 +1177,14 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
 		}
 	}
 
+	switch (status) {
+	case NFS4ERR_DELAY:
+	case NFS4ERR_GRACE:
+		return;
+	default:
+		break;
+	}
+
 	mirror = FF_LAYOUT_COMP(lseg, idx);
 	err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
 				       mirror, offset, length, status, opnum,
@@ -1189,7 +1215,7 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
 					hdr->pgio_mirror_idx + 1,
 					&hdr->pgio_mirror_idx))
 			goto out_eagain;
-		set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+		set_bit(NFS_LAYOUT_RETURN_REQUESTED,
 			&hdr->lseg->pls_layout->plh_flags);
 		pnfs_read_resend_pnfs(hdr);
 		return task->tk_status;
@@ -1242,14 +1268,31 @@ ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx)
 	return ff_layout_test_devid_unavailable(node);
 }
 
-static int ff_layout_read_prepare_common(struct rpc_task *task,
-					 struct nfs_pgio_header *hdr)
+static void ff_layout_read_record_layoutstats_start(struct rpc_task *task,
+		struct nfs_pgio_header *hdr)
 {
+	if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags))
+		return;
 	nfs4_ff_layout_stat_io_start_read(hdr->inode,
 			FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
 			hdr->args.count,
 			task->tk_start);
+}
+
+static void ff_layout_read_record_layoutstats_done(struct rpc_task *task,
+		struct nfs_pgio_header *hdr)
+{
+	if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags))
+		return;
+	nfs4_ff_layout_stat_io_end_read(task,
+			FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
+			hdr->args.count,
+			hdr->res.count);
+}
 
+static int ff_layout_read_prepare_common(struct rpc_task *task,
+					 struct nfs_pgio_header *hdr)
+{
 	if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
 		rpc_exit(task, -EIO);
 		return -EIO;
@@ -1265,6 +1308,7 @@ static int ff_layout_read_prepare_common(struct rpc_task *task,
 	}
 	hdr->pgio_done_cb = ff_layout_read_done_cb;
 
+	ff_layout_read_record_layoutstats_start(task, hdr);
 	return 0;
 }
 
@@ -1323,10 +1367,6 @@ static void ff_layout_read_call_done(struct rpc_task *task, void *data)
 
 	dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status);
 
-	nfs4_ff_layout_stat_io_end_read(task,
-			FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
-			hdr->args.count, hdr->res.count);
-
 	if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
 	    task->tk_status == 0) {
 		nfs4_sequence_done(task, &hdr->res.seq_res);
@@ -1341,10 +1381,20 @@ static void ff_layout_read_count_stats(struct rpc_task *task, void *data)
 {
 	struct nfs_pgio_header *hdr = data;
 
+	ff_layout_read_record_layoutstats_done(task, hdr);
 	rpc_count_iostats_metrics(task,
 	    &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_READ]);
 }
 
+static void ff_layout_read_release(void *data)
+{
+	struct nfs_pgio_header *hdr = data;
+
+	ff_layout_read_record_layoutstats_done(&hdr->task, hdr);
+	pnfs_generic_rw_release(data);
+}
+
+
 static int ff_layout_write_done_cb(struct rpc_task *task,
 				struct nfs_pgio_header *hdr)
 {
@@ -1362,15 +1412,12 @@ static int ff_layout_write_done_cb(struct rpc_task *task,
 
 	switch (err) {
 	case -NFS4ERR_RESET_TO_PNFS:
-		pnfs_set_retry_layoutget(hdr->lseg->pls_layout);
 		ff_layout_reset_write(hdr, true);
 		return task->tk_status;
 	case -NFS4ERR_RESET_TO_MDS:
-		pnfs_clear_retry_layoutget(hdr->lseg->pls_layout);
 		ff_layout_reset_write(hdr, false);
 		return task->tk_status;
 	case -EAGAIN:
-		rpc_restart_call_prepare(task);
 		return -EAGAIN;
 	}
 
@@ -1402,11 +1449,9 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
 
 	switch (err) {
 	case -NFS4ERR_RESET_TO_PNFS:
-		pnfs_set_retry_layoutget(data->lseg->pls_layout);
 		pnfs_generic_prepare_to_resend_writes(data);
 		return -EAGAIN;
 	case -NFS4ERR_RESET_TO_MDS:
-		pnfs_clear_retry_layoutget(data->lseg->pls_layout);
 		pnfs_generic_prepare_to_resend_writes(data);
 		return -EAGAIN;
 	case -EAGAIN:
@@ -1421,14 +1466,31 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
 	return 0;
 }
 
-static int ff_layout_write_prepare_common(struct rpc_task *task,
-					  struct nfs_pgio_header *hdr)
+static void ff_layout_write_record_layoutstats_start(struct rpc_task *task,
+		struct nfs_pgio_header *hdr)
 {
+	if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags))
+		return;
 	nfs4_ff_layout_stat_io_start_write(hdr->inode,
 			FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
 			hdr->args.count,
 			task->tk_start);
+}
+
+static void ff_layout_write_record_layoutstats_done(struct rpc_task *task,
+		struct nfs_pgio_header *hdr)
+{
+	if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags))
+		return;
+	nfs4_ff_layout_stat_io_end_write(task,
+			FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
+			hdr->args.count, hdr->res.count,
+			hdr->res.verf->committed);
+}
 
+static int ff_layout_write_prepare_common(struct rpc_task *task,
+					  struct nfs_pgio_header *hdr)
+{
 	if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) {
 		rpc_exit(task, -EIO);
 		return -EIO;
@@ -1445,6 +1507,7 @@ static int ff_layout_write_prepare_common(struct rpc_task *task,
 		return -EAGAIN;
 	}
 
+	ff_layout_write_record_layoutstats_start(task, hdr);
 	return 0;
 }
 
@@ -1480,11 +1543,6 @@ static void ff_layout_write_call_done(struct rpc_task *task, void *data)
 {
 	struct nfs_pgio_header *hdr = data;
 
-	nfs4_ff_layout_stat_io_end_write(task,
-			FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx),
-			hdr->args.count, hdr->res.count,
-			hdr->res.verf->committed);
-
 	if (test_bit(NFS_IOHDR_REDO, &hdr->flags) &&
 	    task->tk_status == 0) {
 		nfs4_sequence_done(task, &hdr->res.seq_res);
@@ -1499,18 +1557,53 @@ static void ff_layout_write_count_stats(struct rpc_task *task, void *data)
 {
 	struct nfs_pgio_header *hdr = data;
 
+	ff_layout_write_record_layoutstats_done(task, hdr);
 	rpc_count_iostats_metrics(task,
 	    &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_WRITE]);
 }
 
-static void ff_layout_commit_prepare_common(struct rpc_task *task,
+static void ff_layout_write_release(void *data)
+{
+	struct nfs_pgio_header *hdr = data;
+
+	ff_layout_write_record_layoutstats_done(&hdr->task, hdr);
+	pnfs_generic_rw_release(data);
+}
+
+static void ff_layout_commit_record_layoutstats_start(struct rpc_task *task,
 		struct nfs_commit_data *cdata)
 {
+	if (test_and_set_bit(NFS_IOHDR_STAT, &cdata->flags))
+		return;
 	nfs4_ff_layout_stat_io_start_write(cdata->inode,
 			FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
 			0, task->tk_start);
 }
 
+static void ff_layout_commit_record_layoutstats_done(struct rpc_task *task,
+		struct nfs_commit_data *cdata)
+{
+	struct nfs_page *req;
+	__u64 count = 0;
+
+	if (!test_and_clear_bit(NFS_IOHDR_STAT, &cdata->flags))
+		return;
+
+	if (task->tk_status == 0) {
+		list_for_each_entry(req, &cdata->pages, wb_list)
+			count += req->wb_bytes;
+	}
+	nfs4_ff_layout_stat_io_end_write(task,
+			FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
+			count, count, NFS_FILE_SYNC);
+}
+
+static void ff_layout_commit_prepare_common(struct rpc_task *task,
+		struct nfs_commit_data *cdata)
+{
+	ff_layout_commit_record_layoutstats_start(task, cdata);
+}
+
 static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data)
 {
 	ff_layout_commit_prepare_common(task, data);
@@ -1531,19 +1624,6 @@ static void ff_layout_commit_prepare_v4(struct rpc_task *task, void *data)
 
 static void ff_layout_commit_done(struct rpc_task *task, void *data)
 {
-	struct nfs_commit_data *cdata = data;
-	struct nfs_page *req;
-	__u64 count = 0;
-
-	if (task->tk_status == 0) {
-		list_for_each_entry(req, &cdata->pages, wb_list)
-			count += req->wb_bytes;
-	}
-
-	nfs4_ff_layout_stat_io_end_write(task,
-			FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index),
-			count, count, NFS_FILE_SYNC);
-
 	pnfs_generic_write_commit_done(task, data);
 }
 
@@ -1551,50 +1631,59 @@ static void ff_layout_commit_count_stats(struct rpc_task *task, void *data)
 {
 	struct nfs_commit_data *cdata = data;
 
+	ff_layout_commit_record_layoutstats_done(task, cdata);
 	rpc_count_iostats_metrics(task,
 	    &NFS_CLIENT(cdata->inode)->cl_metrics[NFSPROC4_CLNT_COMMIT]);
 }
 
+static void ff_layout_commit_release(void *data)
+{
+	struct nfs_commit_data *cdata = data;
+
+	ff_layout_commit_record_layoutstats_done(&cdata->task, cdata);
+	pnfs_generic_commit_release(data);
+}
+
 static const struct rpc_call_ops ff_layout_read_call_ops_v3 = {
 	.rpc_call_prepare = ff_layout_read_prepare_v3,
 	.rpc_call_done = ff_layout_read_call_done,
 	.rpc_count_stats = ff_layout_read_count_stats,
-	.rpc_release = pnfs_generic_rw_release,
+	.rpc_release = ff_layout_read_release,
 };
 
 static const struct rpc_call_ops ff_layout_read_call_ops_v4 = {
 	.rpc_call_prepare = ff_layout_read_prepare_v4,
 	.rpc_call_done = ff_layout_read_call_done,
 	.rpc_count_stats = ff_layout_read_count_stats,
-	.rpc_release = pnfs_generic_rw_release,
+	.rpc_release = ff_layout_read_release,
 };
 
 static const struct rpc_call_ops ff_layout_write_call_ops_v3 = {
 	.rpc_call_prepare = ff_layout_write_prepare_v3,
 	.rpc_call_done = ff_layout_write_call_done,
 	.rpc_count_stats = ff_layout_write_count_stats,
-	.rpc_release = pnfs_generic_rw_release,
+	.rpc_release = ff_layout_write_release,
 };
 
 static const struct rpc_call_ops ff_layout_write_call_ops_v4 = {
 	.rpc_call_prepare = ff_layout_write_prepare_v4,
 	.rpc_call_done = ff_layout_write_call_done,
 	.rpc_count_stats = ff_layout_write_count_stats,
-	.rpc_release = pnfs_generic_rw_release,
+	.rpc_release = ff_layout_write_release,
 };
 
 static const struct rpc_call_ops ff_layout_commit_call_ops_v3 = {
 	.rpc_call_prepare = ff_layout_commit_prepare_v3,
 	.rpc_call_done = ff_layout_commit_done,
 	.rpc_count_stats = ff_layout_commit_count_stats,
-	.rpc_release = pnfs_generic_commit_release,
+	.rpc_release = ff_layout_commit_release,
 };
 
 static const struct rpc_call_ops ff_layout_commit_call_ops_v4 = {
 	.rpc_call_prepare = ff_layout_commit_prepare_v4,
 	.rpc_call_done = ff_layout_commit_done,
 	.rpc_count_stats = ff_layout_commit_count_stats,
-	.rpc_release = pnfs_generic_commit_release,
+	.rpc_release = ff_layout_commit_release,
 };
 
 static enum pnfs_try_status
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index 2bb08bc6a..dd353bb7d 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -85,6 +85,7 @@ struct nfs4_ff_layout_mirror {
 	struct nfs4_ff_layoutstat	write_stat;
 	ktime_t				start_time;
 	ktime_t				last_report_time;
+	u32				report_interval;
 };
 
 struct nfs4_ff_layout_segment {
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index e125e55de..eb370460c 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -218,63 +218,55 @@ static void extend_ds_error(struct nfs4_ff_layout_ds_err *err,
 	err->length = end - err->offset;
 }
 
-static bool ds_error_can_merge(struct nfs4_ff_layout_ds_err *err,  u64 offset,
-			       u64 length, int status, enum nfs_opnum4 opnum,
-			       nfs4_stateid *stateid,
-			       struct nfs4_deviceid *deviceid)
+static int
+ff_ds_error_match(const struct nfs4_ff_layout_ds_err *e1,
+		const struct nfs4_ff_layout_ds_err *e2)
 {
-	return err->status == status && err->opnum == opnum &&
-	       nfs4_stateid_match(&err->stateid, stateid) &&
-	       !memcmp(&err->deviceid, deviceid, sizeof(*deviceid)) &&
-	       end_offset(err->offset, err->length) >= offset &&
-	       err->offset <= end_offset(offset, length);
-}
-
-static bool merge_ds_error(struct nfs4_ff_layout_ds_err *old,
-			   struct nfs4_ff_layout_ds_err *new)
-{
-	if (!ds_error_can_merge(old, new->offset, new->length, new->status,
-				new->opnum, &new->stateid, &new->deviceid))
-		return false;
-
-	extend_ds_error(old, new->offset, new->length);
-	return true;
+	int ret;
+
+	if (e1->opnum != e2->opnum)
+		return e1->opnum < e2->opnum ? -1 : 1;
+	if (e1->status != e2->status)
+		return e1->status < e2->status ? -1 : 1;
+	ret = memcmp(&e1->stateid, &e2->stateid, sizeof(e1->stateid));
+	if (ret != 0)
+		return ret;
+	ret = memcmp(&e1->deviceid, &e2->deviceid, sizeof(e1->deviceid));
+	if (ret != 0)
+		return ret;
+	if (end_offset(e1->offset, e1->length) < e2->offset)
+		return -1;
+	if (e1->offset > end_offset(e2->offset, e2->length))
+		return 1;
+	/* If ranges overlap or are contiguous, they are the same */
+	return 0;
 }
 
-static bool
+static void
 ff_layout_add_ds_error_locked(struct nfs4_flexfile_layout *flo,
 			      struct nfs4_ff_layout_ds_err *dserr)
 {
-	struct nfs4_ff_layout_ds_err *err;
-
-	list_for_each_entry(err, &flo->error_list, list) {
-		if (merge_ds_error(err, dserr)) {
-			return true;
-		}
-	}
-
-	list_add(&dserr->list, &flo->error_list);
-	return false;
-}
-
-static bool
-ff_layout_update_ds_error(struct nfs4_flexfile_layout *flo, u64 offset,
-			  u64 length, int status, enum nfs_opnum4 opnum,
-			  nfs4_stateid *stateid, struct nfs4_deviceid *deviceid)
-{
-	bool found = false;
-	struct nfs4_ff_layout_ds_err *err;
-
-	list_for_each_entry(err, &flo->error_list, list) {
-		if (ds_error_can_merge(err, offset, length, status, opnum,
-				       stateid, deviceid)) {
-			found = true;
-			extend_ds_error(err, offset, length);
+	struct nfs4_ff_layout_ds_err *err, *tmp;
+	struct list_head *head = &flo->error_list;
+	int match;
+
+	/* Do insertion sort w/ merges */
+	list_for_each_entry_safe(err, tmp, &flo->error_list, list) {
+		match = ff_ds_error_match(err, dserr);
+		if (match < 0)
+			continue;
+		if (match > 0) {
+			/* Add entry "dserr" _before_ entry "err" */
+			head = &err->list;
 			break;
 		}
+		/* Entries match, so merge "err" into "dserr" */
+		extend_ds_error(dserr, err->offset, err->length);
+		list_del(&err->list);
+		kfree(err);
 	}
 
-	return found;
+	list_add_tail(&dserr->list, head);
 }
 
 int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
@@ -283,7 +275,6 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
 			     gfp_t gfp_flags)
 {
 	struct nfs4_ff_layout_ds_err *dserr;
-	bool needfree;
 
 	if (status == 0)
 		return 0;
@@ -291,14 +282,6 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
 	if (mirror->mirror_ds == NULL)
 		return -EINVAL;
 
-	spin_lock(&flo->generic_hdr.plh_inode->i_lock);
-	if (ff_layout_update_ds_error(flo, offset, length, status, opnum,
-				      &mirror->stateid,
-				      &mirror->mirror_ds->id_node.deviceid)) {
-		spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
-		return 0;
-	}
-	spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
 	dserr = kmalloc(sizeof(*dserr), gfp_flags);
 	if (!dserr)
 		return -ENOMEM;
@@ -313,10 +296,8 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo,
 	       NFS4_DEVICEID4_SIZE);
 
 	spin_lock(&flo->generic_hdr.plh_inode->i_lock);
-	needfree = ff_layout_add_ds_error_locked(flo, dserr);
+	ff_layout_add_ds_error_locked(flo, dserr);
 	spin_unlock(&flo->generic_hdr.plh_inode->i_lock);
-	if (needfree)
-		kfree(dserr);
 
 	return 0;
 }
@@ -429,22 +410,14 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx,
 					 mirror, lseg->pls_range.offset,
 					 lseg->pls_range.length, NFS4ERR_NXIO,
 					 OP_ILLEGAL, GFP_NOIO);
-		if (fail_return) {
-			pnfs_error_mark_layout_for_return(ino, lseg);
+		if (!fail_return) {
 			if (ff_layout_has_available_ds(lseg))
-				pnfs_set_retry_layoutget(lseg->pls_layout);
-			else
-				pnfs_clear_retry_layoutget(lseg->pls_layout);
-
-		} else {
-			if (ff_layout_has_available_ds(lseg))
-				set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+				set_bit(NFS_LAYOUT_RETURN_REQUESTED,
 					&lseg->pls_layout->plh_flags);
-			else {
+			else
 				pnfs_error_mark_layout_for_return(ino, lseg);
-				pnfs_clear_retry_layoutget(lseg->pls_layout);
-			}
-		}
+		} else
+			pnfs_error_mark_layout_for_return(ino, lseg);
 	}
 out_update_creds:
 	if (ff_layout_update_mirror_cred(mirror, ds))
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 3e2071a17..86faecf8f 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -71,19 +71,25 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
 	return nfs_fileid_to_ino_t(fattr->fileid);
 }
 
-/**
- * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks
- * @word: long word containing the bit lock
- */
-int nfs_wait_bit_killable(struct wait_bit_key *key, int mode)
+static int nfs_wait_killable(int mode)
 {
 	freezable_schedule_unsafe();
 	if (signal_pending_state(mode, current))
 		return -ERESTARTSYS;
 	return 0;
 }
+
+int nfs_wait_bit_killable(struct wait_bit_key *key, int mode)
+{
+	return nfs_wait_killable(mode);
+}
 EXPORT_SYMBOL_GPL(nfs_wait_bit_killable);
 
+int nfs_wait_atomic_killable(atomic_t *p)
+{
+	return nfs_wait_killable(TASK_KILLABLE);
+}
+
 /**
  * nfs_compat_user_ino64 - returns the user-visible inode number
  * @fileid: 64-bit fileid
@@ -408,9 +414,10 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
 				inode->i_fop = NULL;
 				inode->i_flags |= S_AUTOMOUNT;
 			}
-		} else if (S_ISLNK(inode->i_mode))
+		} else if (S_ISLNK(inode->i_mode)) {
 			inode->i_op = &nfs_symlink_inode_operations;
-		else
+			inode_nohighmem(inode);
+		} else
 			init_special_inode(inode, inode->i_mode, fattr->rdev);
 
 		memset(&inode->i_atime, 0, sizeof(inode->i_atime));
@@ -654,9 +661,9 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
 	trace_nfs_getattr_enter(inode);
 	/* Flush out writes to the server in order to update c/mtime.  */
 	if (S_ISREG(inode->i_mode)) {
-		mutex_lock(&inode->i_mutex);
+		inode_lock(inode);
 		err = nfs_sync_inode(inode);
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 		if (err)
 			goto out;
 	}
@@ -699,7 +706,7 @@ static void nfs_init_lock_context(struct nfs_lock_context *l_ctx)
 	l_ctx->lockowner.l_owner = current->files;
 	l_ctx->lockowner.l_pid = current->tgid;
 	INIT_LIST_HEAD(&l_ctx->list);
-	nfs_iocounter_init(&l_ctx->io_count);
+	atomic_set(&l_ctx->io_count, 0);
 }
 
 static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx)
@@ -912,6 +919,12 @@ void nfs_file_clear_open_context(struct file *filp)
 	if (ctx) {
 		struct inode *inode = d_inode(ctx->dentry);
 
+		/*
+		 * We hit a fatal error on a previous write. Try to write back
+		 * every page again.
+		 */
+		if (ctx->error < 0)
+			invalidate_inode_pages2(inode->i_mapping);
 		filp->private_data = NULL;
 		spin_lock(&inode->i_lock);
 		list_move_tail(&ctx->list, &NFS_I(inode)->open_files);
@@ -1086,6 +1099,27 @@ static bool nfs_mapping_need_revalidate_inode(struct inode *inode)
 		|| NFS_STALE(inode);
 }
 
+int nfs_revalidate_mapping_rcu(struct inode *inode)
+{
+	struct nfs_inode *nfsi = NFS_I(inode);
+	unsigned long *bitlock = &nfsi->flags;
+	int ret = 0;
+
+	if (IS_SWAPFILE(inode))
+		goto out;
+	if (nfs_mapping_need_revalidate_inode(inode)) {
+		ret = -ECHILD;
+		goto out;
+	}
+	spin_lock(&inode->i_lock);
+	if (test_bit(NFS_INO_INVALIDATING, bitlock) ||
+	    (nfsi->cache_validity & NFS_INO_INVALID_DATA))
+		ret = -ECHILD;
+	spin_unlock(&inode->i_lock);
+out:
+	return ret;
+}
+
 /**
  * __nfs_revalidate_mapping - Revalidate the pagecache
  * @inode - pointer to host inode
@@ -1144,9 +1178,9 @@ static int __nfs_revalidate_mapping(struct inode *inode,
 	spin_unlock(&inode->i_lock);
 	trace_nfs_invalidate_mapping_enter(inode);
 	if (may_lock) {
-		mutex_lock(&inode->i_mutex);
+		inode_lock(inode);
 		ret = nfs_invalidate_mapping(inode, mapping);
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 	} else
 		ret = nfs_invalidate_mapping(inode, mapping);
 	trace_nfs_invalidate_mapping_exit(inode, ret);
@@ -1935,7 +1969,7 @@ static int __init nfs_init_inodecache(void)
 	nfs_inode_cachep = kmem_cache_create("nfs_inode_cache",
 					     sizeof(struct nfs_inode),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (nfs_inode_cachep == NULL)
 		return -ENOMEM;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 9dea85f7f..9a547aa3e 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -238,7 +238,7 @@ extern void nfs_pgheader_init(struct nfs_pageio_descriptor *desc,
 			      struct nfs_pgio_header *hdr,
 			      void (*release)(struct nfs_pgio_header *hdr));
 void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos);
-int nfs_iocounter_wait(struct nfs_io_counter *c);
+int nfs_iocounter_wait(struct nfs_lock_context *l_ctx);
 
 extern const struct nfs_pageio_ops nfs_pgio_rw_ops;
 struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *);
@@ -252,18 +252,18 @@ void nfs_free_request(struct nfs_page *req);
 struct nfs_pgio_mirror *
 nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc);
 
-static inline void nfs_iocounter_init(struct nfs_io_counter *c)
-{
-	c->flags = 0;
-	atomic_set(&c->io_count, 0);
-}
-
 static inline bool nfs_pgio_has_mirroring(struct nfs_pageio_descriptor *desc)
 {
 	WARN_ON_ONCE(desc->pg_mirror_count < 1);
 	return desc->pg_mirror_count > 1;
 }
 
+static inline bool nfs_match_open_context(const struct nfs_open_context *ctx1,
+		const struct nfs_open_context *ctx2)
+{
+	return ctx1->cred == ctx2->cred && ctx1->state == ctx2->state;
+}
+
 /* nfs2xdr.c */
 extern struct rpc_procinfo nfs_procedures[];
 extern int nfs2_decode_dirent(struct xdr_stream *,
@@ -380,6 +380,7 @@ extern void nfs_clear_inode(struct inode *);
 extern void nfs_evict_inode(struct inode *);
 void nfs_zap_acl_cache(struct inode *inode);
 extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode);
+extern int nfs_wait_atomic_killable(atomic_t *p);
 
 /* super.c */
 extern const struct super_operations nfs_sops;
@@ -483,7 +484,7 @@ void nfs_retry_commit(struct list_head *page_list,
 		      struct nfs_commit_info *cinfo,
 		      u32 ds_commit_idx);
 void nfs_commitdata_release(struct nfs_commit_data *data);
-void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
+void nfs_request_add_commit_list(struct nfs_page *req,
 				 struct nfs_commit_info *cinfo);
 void nfs_request_add_commit_list_locked(struct nfs_page *req,
 		struct list_head *dst,
@@ -519,7 +520,6 @@ static inline void nfs_inode_dio_wait(struct inode *inode)
 	inode_dio_wait(inode);
 }
 extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
-extern void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq);
 
 /* nfs4proc.c */
 extern void __nfs4_read_done_cb(struct nfs_pgio_header *);
@@ -696,9 +696,32 @@ static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh)
 {
 	return ~crc32_le(0xFFFFFFFF, &fh->data[0], fh->size);
 }
+static inline u32 nfs_stateid_hash(const nfs4_stateid *stateid)
+{
+	return ~crc32_le(0xFFFFFFFF, &stateid->other[0],
+				NFS4_STATEID_OTHER_SIZE);
+}
 #else
 static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh)
 {
 	return 0;
 }
+static inline u32 nfs_stateid_hash(nfs4_stateid *stateid)
+{
+	return 0;
+}
 #endif
+
+static inline bool nfs_error_is_fatal(int err)
+{
+	switch (err) {
+	case -ERESTARTSYS:
+	case -EIO:
+	case -ENOSPC:
+	case -EROFS:
+	case -E2BIG:
+		return true;
+	default:
+		return false;
+	}
+}
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 1ebe2fc7c..17c0fa1ec 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -284,12 +284,12 @@ nfs3_listxattr(struct dentry *dentry, char *data, size_t size)
 	int error;
 
 	error = nfs3_list_one_acl(inode, ACL_TYPE_ACCESS,
-			POSIX_ACL_XATTR_ACCESS, data, size, &result);
+			XATTR_NAME_POSIX_ACL_ACCESS, data, size, &result);
 	if (error)
 		return error;
 
 	error = nfs3_list_one_acl(inode, ACL_TYPE_DEFAULT,
-			POSIX_ACL_XATTR_DEFAULT, data, size, &result);
+			XATTR_NAME_POSIX_ACL_DEFAULT, data, size, &result);
 	if (error)
 		return error;
 	return result;
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 6b1ce9825..dff83460e 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -16,29 +16,8 @@
 
 #define NFSDBG_FACILITY NFSDBG_PROC
 
-static int nfs42_set_rw_stateid(nfs4_stateid *dst, struct file *file,
-				fmode_t fmode)
-{
-	struct nfs_open_context *open;
-	struct nfs_lock_context *lock;
-	int ret;
-
-	open = get_nfs_open_context(nfs_file_open_context(file));
-	lock = nfs_get_lock_context(open);
-	if (IS_ERR(lock)) {
-		put_nfs_open_context(open);
-		return PTR_ERR(lock);
-	}
-
-	ret = nfs4_set_rw_stateid(dst, open, lock, fmode);
-
-	nfs_put_lock_context(lock);
-	put_nfs_open_context(open);
-	return ret;
-}
-
 static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
-				 loff_t offset, loff_t len)
+		struct nfs_lock_context *lock, loff_t offset, loff_t len)
 {
 	struct inode *inode = file_inode(filep);
 	struct nfs_server *server = NFS_SERVER(inode);
@@ -56,7 +35,8 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
 	msg->rpc_argp = &args;
 	msg->rpc_resp = &res;
 
-	status = nfs42_set_rw_stateid(&args.falloc_stateid, filep, FMODE_WRITE);
+	status = nfs4_set_rw_stateid(&args.falloc_stateid, lock->open_context,
+			lock, FMODE_WRITE);
 	if (status)
 		return status;
 
@@ -78,15 +58,26 @@ static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep,
 {
 	struct nfs_server *server = NFS_SERVER(file_inode(filep));
 	struct nfs4_exception exception = { };
+	struct nfs_lock_context *lock;
 	int err;
 
+	lock = nfs_get_lock_context(nfs_file_open_context(filep));
+	if (IS_ERR(lock))
+		return PTR_ERR(lock);
+
+	exception.inode = file_inode(filep);
+	exception.state = lock->open_context->state;
+
 	do {
-		err = _nfs42_proc_fallocate(msg, filep, offset, len);
-		if (err == -ENOTSUPP)
-			return -EOPNOTSUPP;
+		err = _nfs42_proc_fallocate(msg, filep, lock, offset, len);
+		if (err == -ENOTSUPP) {
+			err = -EOPNOTSUPP;
+			break;
+		}
 		err = nfs4_handle_exception(server, err, &exception);
 	} while (exception.retry);
 
+	nfs_put_lock_context(lock);
 	return err;
 }
 
@@ -101,13 +92,13 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len)
 	if (!nfs_server_capable(inode, NFS_CAP_ALLOCATE))
 		return -EOPNOTSUPP;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	err = nfs42_proc_fallocate(&msg, filep, offset, len);
 	if (err == -EOPNOTSUPP)
 		NFS_SERVER(inode)->caps &= ~NFS_CAP_ALLOCATE;
 
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	return err;
 }
 
@@ -123,7 +114,7 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
 		return -EOPNOTSUPP;
 
 	nfs_wb_all(inode);
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	err = nfs42_proc_fallocate(&msg, filep, offset, len);
 	if (err == 0)
@@ -131,11 +122,12 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
 	if (err == -EOPNOTSUPP)
 		NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE;
 
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	return err;
 }
 
-static loff_t _nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
+static loff_t _nfs42_proc_llseek(struct file *filep,
+		struct nfs_lock_context *lock, loff_t offset, int whence)
 {
 	struct inode *inode = file_inode(filep);
 	struct nfs42_seek_args args = {
@@ -156,7 +148,8 @@ static loff_t _nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
 	if (!nfs_server_capable(inode, NFS_CAP_SEEK))
 		return -ENOTSUPP;
 
-	status = nfs42_set_rw_stateid(&args.sa_stateid, filep, FMODE_READ);
+	status = nfs4_set_rw_stateid(&args.sa_stateid, lock->open_context,
+			lock, FMODE_READ);
 	if (status)
 		return status;
 
@@ -175,17 +168,28 @@ loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence)
 {
 	struct nfs_server *server = NFS_SERVER(file_inode(filep));
 	struct nfs4_exception exception = { };
+	struct nfs_lock_context *lock;
 	loff_t err;
 
+	lock = nfs_get_lock_context(nfs_file_open_context(filep));
+	if (IS_ERR(lock))
+		return PTR_ERR(lock);
+
+	exception.inode = file_inode(filep);
+	exception.state = lock->open_context->state;
+
 	do {
-		err = _nfs42_proc_llseek(filep, offset, whence);
+		err = _nfs42_proc_llseek(filep, lock, offset, whence);
 		if (err >= 0)
 			break;
-		if (err == -ENOTSUPP)
-			return -EOPNOTSUPP;
+		if (err == -ENOTSUPP) {
+			err = -EOPNOTSUPP;
+			break;
+		}
 		err = nfs4_handle_exception(server, err, &exception);
 	} while (exception.retry);
 
+	nfs_put_lock_context(lock);
 	return err;
 }
 
@@ -204,6 +208,8 @@ static void
 nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
 {
 	struct nfs42_layoutstat_data *data = calldata;
+	struct inode *inode = data->inode;
+	struct pnfs_layout_hdr *lo;
 
 	if (!nfs4_sequence_done(task, &data->res.seq_res))
 		return;
@@ -211,12 +217,35 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata)
 	switch (task->tk_status) {
 	case 0:
 		break;
+	case -NFS4ERR_EXPIRED:
+	case -NFS4ERR_STALE_STATEID:
+	case -NFS4ERR_OLD_STATEID:
+	case -NFS4ERR_BAD_STATEID:
+		spin_lock(&inode->i_lock);
+		lo = NFS_I(inode)->layout;
+		if (lo && nfs4_stateid_match(&data->args.stateid,
+					     &lo->plh_stateid)) {
+			LIST_HEAD(head);
+
+			/*
+			 * Mark the bad layout state as invalid, then retry
+			 * with the current stateid.
+			 */
+			set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
+			pnfs_mark_matching_lsegs_invalid(lo, &head, NULL);
+			spin_unlock(&inode->i_lock);
+			pnfs_free_lseg_list(&head);
+		} else
+			spin_unlock(&inode->i_lock);
+		break;
 	case -ENOTSUPP:
 	case -EOPNOTSUPP:
-		NFS_SERVER(data->inode)->caps &= ~NFS_CAP_LAYOUTSTATS;
+		NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTSTATS;
 	default:
-		dprintk("%s server returns %d\n", __func__, task->tk_status);
+		break;
 	}
+
+	dprintk("%s server returns %d\n", __func__, task->tk_status);
 }
 
 static void
@@ -273,8 +302,9 @@ int nfs42_proc_layoutstats_generic(struct nfs_server *server,
 }
 
 static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f,
-			     struct file *dst_f, loff_t src_offset,
-			     loff_t dst_offset, loff_t count)
+		struct file *dst_f, struct nfs_lock_context *src_lock,
+		struct nfs_lock_context *dst_lock, loff_t src_offset,
+		loff_t dst_offset, loff_t count)
 {
 	struct inode *src_inode = file_inode(src_f);
 	struct inode *dst_inode = file_inode(dst_f);
@@ -295,11 +325,13 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f,
 	msg->rpc_argp = &args;
 	msg->rpc_resp = &res;
 
-	status = nfs42_set_rw_stateid(&args.src_stateid, src_f, FMODE_READ);
+	status = nfs4_set_rw_stateid(&args.src_stateid, src_lock->open_context,
+			src_lock, FMODE_READ);
 	if (status)
 		return status;
 
-	status = nfs42_set_rw_stateid(&args.dst_stateid, dst_f, FMODE_WRITE);
+	status = nfs4_set_rw_stateid(&args.dst_stateid, dst_lock->open_context,
+			dst_lock, FMODE_WRITE);
 	if (status)
 		return status;
 
@@ -324,22 +356,48 @@ int nfs42_proc_clone(struct file *src_f, struct file *dst_f,
 	};
 	struct inode *inode = file_inode(src_f);
 	struct nfs_server *server = NFS_SERVER(file_inode(src_f));
-	struct nfs4_exception exception = { };
-	int err;
+	struct nfs_lock_context *src_lock;
+	struct nfs_lock_context *dst_lock;
+	struct nfs4_exception src_exception = { };
+	struct nfs4_exception dst_exception = { };
+	int err, err2;
 
 	if (!nfs_server_capable(inode, NFS_CAP_CLONE))
 		return -EOPNOTSUPP;
 
+	src_lock = nfs_get_lock_context(nfs_file_open_context(src_f));
+	if (IS_ERR(src_lock))
+		return PTR_ERR(src_lock);
+
+	src_exception.inode = file_inode(src_f);
+	src_exception.state = src_lock->open_context->state;
+
+	dst_lock = nfs_get_lock_context(nfs_file_open_context(dst_f));
+	if (IS_ERR(dst_lock)) {
+		err = PTR_ERR(dst_lock);
+		goto out_put_src_lock;
+	}
+
+	dst_exception.inode = file_inode(dst_f);
+	dst_exception.state = dst_lock->open_context->state;
+
 	do {
-		err = _nfs42_proc_clone(&msg, src_f, dst_f, src_offset,
-					dst_offset, count);
+		err = _nfs42_proc_clone(&msg, src_f, dst_f, src_lock, dst_lock,
+					src_offset, dst_offset, count);
 		if (err == -ENOTSUPP || err == -EOPNOTSUPP) {
 			NFS_SERVER(inode)->caps &= ~NFS_CAP_CLONE;
-			return -EOPNOTSUPP;
+			err = -EOPNOTSUPP;
+			break;
 		}
-		err = nfs4_handle_exception(server, err, &exception);
-	} while (exception.retry);
 
-	return err;
+		err2 = nfs4_handle_exception(server, err, &src_exception);
+		err = nfs4_handle_exception(server, err, &dst_exception);
+		if (!err)
+			err = err2;
+	} while (src_exception.retry || dst_exception.retry);
 
+	nfs_put_lock_context(dst_lock);
+out_put_src_lock:
+	nfs_put_lock_context(src_lock);
+	return err;
 }
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index db9b5fea5..57ca1c803 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -141,11 +141,11 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 		ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
 		if (ret != 0)
 			break;
-		mutex_lock(&inode->i_mutex);
+		inode_lock(inode);
 		ret = nfs_file_fsync_commit(file, start, end, datasync);
 		if (!ret)
 			ret = pnfs_sync_inode(inode, !!datasync);
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 		/*
 		 * If nfs_file_fsync_commit detected a server reboot, then
 		 * resend all dirty pages that might have been covered by
@@ -195,75 +195,37 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t
 	return nfs42_proc_allocate(filep, offset, len);
 }
 
-static noinline long
-nfs42_ioctl_clone(struct file *dst_file, unsigned long srcfd,
-		  u64 src_off, u64 dst_off, u64 count)
+static int nfs42_clone_file_range(struct file *src_file, loff_t src_off,
+		struct file *dst_file, loff_t dst_off, u64 count)
 {
 	struct inode *dst_inode = file_inode(dst_file);
 	struct nfs_server *server = NFS_SERVER(dst_inode);
-	struct fd src_file;
-	struct inode *src_inode;
+	struct inode *src_inode = file_inode(src_file);
 	unsigned int bs = server->clone_blksize;
 	bool same_inode = false;
 	int ret;
 
-	/* dst file must be opened for writing */
-	if (!(dst_file->f_mode & FMODE_WRITE))
-		return -EINVAL;
-
-	ret = mnt_want_write_file(dst_file);
-	if (ret)
-		return ret;
-
-	src_file = fdget(srcfd);
-	if (!src_file.file) {
-		ret = -EBADF;
-		goto out_drop_write;
-	}
-
-	src_inode = file_inode(src_file.file);
-
-	if (src_inode == dst_inode)
-		same_inode = true;
-
-	/* src file must be opened for reading */
-	if (!(src_file.file->f_mode & FMODE_READ))
-		goto out_fput;
-
-	/* src and dst must be regular files */
-	ret = -EISDIR;
-	if (!S_ISREG(src_inode->i_mode) || !S_ISREG(dst_inode->i_mode))
-		goto out_fput;
-
-	ret = -EXDEV;
-	if (src_file.file->f_path.mnt != dst_file->f_path.mnt ||
-	    src_inode->i_sb != dst_inode->i_sb)
-		goto out_fput;
-
 	/* check alignment w.r.t. clone_blksize */
 	ret = -EINVAL;
 	if (bs) {
 		if (!IS_ALIGNED(src_off, bs) || !IS_ALIGNED(dst_off, bs))
-			goto out_fput;
+			goto out;
 		if (!IS_ALIGNED(count, bs) && i_size_read(src_inode) != (src_off + count))
-			goto out_fput;
+			goto out;
 	}
 
-	/* verify if ranges are overlapped within the same file */
-	if (same_inode) {
-		if (dst_off + count > src_off && dst_off < src_off + count)
-			goto out_fput;
-	}
+	if (src_inode == dst_inode)
+		same_inode = true;
 
 	/* XXX: do we lock at all? what if server needs CB_RECALL_LAYOUT? */
 	if (same_inode) {
-		mutex_lock(&src_inode->i_mutex);
+		inode_lock(src_inode);
 	} else if (dst_inode < src_inode) {
-		mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_PARENT);
-		mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_CHILD);
+		inode_lock_nested(dst_inode, I_MUTEX_PARENT);
+		inode_lock_nested(src_inode, I_MUTEX_CHILD);
 	} else {
-		mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_PARENT);
-		mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_CHILD);
+		inode_lock_nested(src_inode, I_MUTEX_PARENT);
+		inode_lock_nested(dst_inode, I_MUTEX_CHILD);
 	}
 
 	/* flush all pending writes on both src and dst so that server
@@ -275,7 +237,7 @@ nfs42_ioctl_clone(struct file *dst_file, unsigned long srcfd,
 	if (ret)
 		goto out_unlock;
 
-	ret = nfs42_proc_clone(src_file.file, dst_file, src_off, dst_off, count);
+	ret = nfs42_proc_clone(src_file, dst_file, src_off, dst_off, count);
 
 	/* truncate inode page cache of the dst range so that future reads can fetch
 	 * new data from server */
@@ -284,45 +246,17 @@ nfs42_ioctl_clone(struct file *dst_file, unsigned long srcfd,
 
 out_unlock:
 	if (same_inode) {
-		mutex_unlock(&src_inode->i_mutex);
+		inode_unlock(src_inode);
 	} else if (dst_inode < src_inode) {
-		mutex_unlock(&src_inode->i_mutex);
-		mutex_unlock(&dst_inode->i_mutex);
+		inode_unlock(src_inode);
+		inode_unlock(dst_inode);
 	} else {
-		mutex_unlock(&dst_inode->i_mutex);
-		mutex_unlock(&src_inode->i_mutex);
+		inode_unlock(dst_inode);
+		inode_unlock(src_inode);
 	}
-out_fput:
-	fdput(src_file);
-out_drop_write:
-	mnt_drop_write_file(dst_file);
+out:
 	return ret;
 }
-
-static long nfs42_ioctl_clone_range(struct file *dst_file, void __user *argp)
-{
-	struct btrfs_ioctl_clone_range_args args;
-
-	if (copy_from_user(&args, argp, sizeof(args)))
-		return -EFAULT;
-
-	return nfs42_ioctl_clone(dst_file, args.src_fd, args.src_offset,
-				 args.dest_offset, args.src_length);
-}
-
-long nfs4_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
-{
-	void __user *argp = (void __user *)arg;
-
-	switch (cmd) {
-	case BTRFS_IOC_CLONE:
-		return nfs42_ioctl_clone(file, arg, 0, 0, 0);
-	case BTRFS_IOC_CLONE_RANGE:
-		return nfs42_ioctl_clone_range(file, argp);
-	}
-
-	return -ENOTTY;
-}
 #endif /* CONFIG_NFS_V4_2 */
 
 const struct file_operations nfs4_file_operations = {
@@ -342,8 +276,7 @@ const struct file_operations nfs4_file_operations = {
 #ifdef CONFIG_NFS_V4_2
 	.llseek		= nfs4_file_llseek,
 	.fallocate	= nfs42_fallocate,
-	.unlocked_ioctl = nfs4_ioctl,
-	.compat_ioctl	= nfs4_ioctl,
+	.clone_file_range = nfs42_clone_file_range,
 #else
 	.llseek		= nfs_file_llseek,
 #endif
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 98a441573..14881594d 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -208,6 +208,9 @@ static const u32 nfs4_pnfs_open_bitmap[3] = {
 	| FATTR4_WORD1_TIME_METADATA
 	| FATTR4_WORD1_TIME_MODIFY,
 	FATTR4_WORD2_MDSTHRESHOLD
+#ifdef CONFIG_NFS_V4_SECURITY_LABEL
+	| FATTR4_WORD2_SECURITY_LABEL
+#endif
 };
 
 static const u32 nfs4_open_noattr_bitmap[3] = {
@@ -1598,6 +1601,7 @@ _nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data)
 
 	if (!data->rpc_done) {
 		state = nfs4_try_open_cached(data);
+		trace_nfs4_cached_open(data->state);
 		goto out;
 	}
 
@@ -2015,6 +2019,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata)
 	}
 	return;
 unlock_no_action:
+	trace_nfs4_cached_open(data->state);
 	rcu_read_unlock();
 out_no_action:
 	task->tk_action = NULL;
@@ -2703,6 +2708,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
 	status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1);
 	if (status == 0 && state != NULL)
 		renew_lease(server, timestamp);
+	trace_nfs4_setattr(inode, &arg.stateid, status);
 	return status;
 }
 
@@ -2719,7 +2725,6 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
 	int err;
 	do {
 		err = _nfs4_do_setattr(inode, cred, fattr, sattr, state, ilabel, olabel);
-		trace_nfs4_setattr(inode, err);
 		switch (err) {
 		case -NFS4ERR_OPENMODE:
 			if (!(sattr->ia_valid & ATTR_SIZE)) {
@@ -5048,7 +5053,6 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp,
 static int
 nfs4_init_nonuniform_client_string(struct nfs_client *clp)
 {
-	int result;
 	size_t len;
 	char *str;
 
@@ -5076,7 +5080,7 @@ nfs4_init_nonuniform_client_string(struct nfs_client *clp)
 		return -ENOMEM;
 
 	rcu_read_lock();
-	result = scnprintf(str, len, "Linux NFSv4.0 %s/%s %s",
+	scnprintf(str, len, "Linux NFSv4.0 %s/%s %s",
 			clp->cl_ipaddr,
 			rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR),
 			rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO));
@@ -5089,7 +5093,6 @@ nfs4_init_nonuniform_client_string(struct nfs_client *clp)
 static int
 nfs4_init_uniquifier_client_string(struct nfs_client *clp)
 {
-	int result;
 	size_t len;
 	char *str;
 
@@ -5109,7 +5112,7 @@ nfs4_init_uniquifier_client_string(struct nfs_client *clp)
 	if (!str)
 		return -ENOMEM;
 
-	result = scnprintf(str, len, "Linux NFSv%u.%u %s/%s",
+	scnprintf(str, len, "Linux NFSv%u.%u %s/%s",
 			clp->rpc_ops->version, clp->cl_minorversion,
 			nfs4_client_id_uniquifier,
 			clp->cl_rpcclient->cl_nodename);
@@ -5120,7 +5123,6 @@ nfs4_init_uniquifier_client_string(struct nfs_client *clp)
 static int
 nfs4_init_uniform_client_string(struct nfs_client *clp)
 {
-	int result;
 	size_t len;
 	char *str;
 
@@ -5145,7 +5147,7 @@ nfs4_init_uniform_client_string(struct nfs_client *clp)
 	if (!str)
 		return -ENOMEM;
 
-	result = scnprintf(str, len, "Linux NFSv%u.%u %s",
+	scnprintf(str, len, "Linux NFSv%u.%u %s",
 			clp->rpc_ops->version, clp->cl_minorversion,
 			clp->cl_rpcclient->cl_nodename);
 	clp->cl_owner_id = str;
@@ -5384,6 +5386,11 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co
 	if (data == NULL)
 		return -ENOMEM;
 	nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
+
+	nfs4_state_protect(server->nfs_client,
+			NFS_SP4_MACH_CRED_CLEANUP,
+			&task_setup_data.rpc_client, &msg);
+
 	data->args.fhandle = &data->fh;
 	data->args.stateid = &data->stateid;
 	data->args.bitmask = server->cache_consistency_bitmask;
@@ -5426,7 +5433,7 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4
 	int err;
 	do {
 		err = _nfs4_proc_delegreturn(inode, cred, stateid, issync);
-		trace_nfs4_delegreturn(inode, err);
+		trace_nfs4_delegreturn(inode, stateid, err);
 		switch (err) {
 			case -NFS4ERR_STALE_STATEID:
 			case -NFS4ERR_EXPIRED:
@@ -5936,6 +5943,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
 		data->cancelled = 1;
 	rpc_put_task(task);
 	dprintk("%s: done, ret = %d!\n", __func__, ret);
+	trace_nfs4_set_lock(fl, state, &data->res.stateid, cmd, ret);
 	return ret;
 }
 
@@ -5952,7 +5960,6 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request
 		if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
 			return 0;
 		err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM);
-		trace_nfs4_lock_reclaim(request, state, F_SETLK, err);
 		if (err != -NFS4ERR_DELAY)
 			break;
 		nfs4_handle_exception(server, err, &exception);
@@ -5979,7 +5986,6 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request
 		if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0)
 			return 0;
 		err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_EXPIRED);
-		trace_nfs4_lock_expired(request, state, F_SETLK, err);
 		switch (err) {
 		default:
 			goto out;
@@ -6087,7 +6093,6 @@ static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *
 
 	do {
 		err = _nfs4_proc_setlk(state, cmd, request);
-		trace_nfs4_set_lock(request, state, cmd, err);
 		if (err == -NFS4ERR_DENIED)
 			err = -EAGAIN;
 		err = nfs4_handle_exception(NFS_SERVER(state->inode),
@@ -6253,9 +6258,6 @@ static int nfs4_xattr_set_nfs4_acl(const struct xattr_handler *handler,
 				   const void *buf, size_t buflen,
 				   int flags)
 {
-	if (strcmp(key, "") != 0)
-		return -EINVAL;
-
 	return nfs4_proc_set_acl(d_inode(dentry), buf, buflen);
 }
 
@@ -6263,32 +6265,15 @@ static int nfs4_xattr_get_nfs4_acl(const struct xattr_handler *handler,
 				   struct dentry *dentry, const char *key,
 				   void *buf, size_t buflen)
 {
-	if (strcmp(key, "") != 0)
-		return -EINVAL;
-
 	return nfs4_proc_get_acl(d_inode(dentry), buf, buflen);
 }
 
-static size_t nfs4_xattr_list_nfs4_acl(const struct xattr_handler *handler,
-				       struct dentry *dentry, char *list,
-				       size_t list_len, const char *name,
-				       size_t name_len)
+static bool nfs4_xattr_list_nfs4_acl(struct dentry *dentry)
 {
-	size_t len = sizeof(XATTR_NAME_NFSV4_ACL);
-
-	if (!nfs4_server_supports_acls(NFS_SERVER(d_inode(dentry))))
-		return 0;
-
-	if (list && len <= list_len)
-		memcpy(list, XATTR_NAME_NFSV4_ACL, len);
-	return len;
+	return nfs4_server_supports_acls(NFS_SERVER(d_inode(dentry)));
 }
 
 #ifdef CONFIG_NFS_V4_SECURITY_LABEL
-static inline int nfs4_server_supports_labels(struct nfs_server *server)
-{
-	return server->caps & NFS_CAP_SECURITY_LABEL;
-}
 
 static int nfs4_xattr_set_nfs4_label(const struct xattr_handler *handler,
 				     struct dentry *dentry, const char *key,
@@ -6310,29 +6295,34 @@ static int nfs4_xattr_get_nfs4_label(const struct xattr_handler *handler,
 	return -EOPNOTSUPP;
 }
 
-static size_t nfs4_xattr_list_nfs4_label(const struct xattr_handler *handler,
-					 struct dentry *dentry, char *list,
-					 size_t list_len, const char *name,
-					 size_t name_len)
+static ssize_t
+nfs4_listxattr_nfs4_label(struct inode *inode, char *list, size_t list_len)
 {
-	size_t len = 0;
+	int len = 0;
 
-	if (nfs_server_capable(d_inode(dentry), NFS_CAP_SECURITY_LABEL)) {
-		len = security_inode_listsecurity(d_inode(dentry), NULL, 0);
-		if (list && len <= list_len)
-			security_inode_listsecurity(d_inode(dentry), list, len);
+	if (nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL)) {
+		len = security_inode_listsecurity(inode, list, list_len);
+		if (len >= 0 && list_len && len > list_len)
+			return -ERANGE;
 	}
 	return len;
 }
 
 static const struct xattr_handler nfs4_xattr_nfs4_label_handler = {
 	.prefix = XATTR_SECURITY_PREFIX,
-	.list	= nfs4_xattr_list_nfs4_label,
 	.get	= nfs4_xattr_get_nfs4_label,
 	.set	= nfs4_xattr_set_nfs4_label,
 };
-#endif
 
+#else
+
+static ssize_t
+nfs4_listxattr_nfs4_label(struct inode *inode, char *list, size_t list_len)
+{
+	return 0;
+}
+
+#endif
 
 /*
  * nfs_fhget will use either the mounted_on_fileid or the fileid
@@ -6862,10 +6852,13 @@ static const struct nfs41_state_protection nfs4_sp4_mach_cred_request = {
 	},
 	.allow.u.words = {
 		[0] = 1 << (OP_CLOSE) |
+		      1 << (OP_OPEN_DOWNGRADE) |
 		      1 << (OP_LOCKU) |
+		      1 << (OP_DELEGRETURN) |
 		      1 << (OP_COMMIT),
 		[1] = 1 << (OP_SECINFO - 32) |
 		      1 << (OP_SECINFO_NO_NAME - 32) |
+		      1 << (OP_LAYOUTRETURN - 32) |
 		      1 << (OP_TEST_STATEID - 32) |
 		      1 << (OP_FREE_STATEID - 32) |
 		      1 << (OP_WRITE - 32)
@@ -6930,11 +6923,19 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp,
 		}
 
 		if (test_bit(OP_CLOSE, sp->allow.u.longs) &&
+		    test_bit(OP_OPEN_DOWNGRADE, sp->allow.u.longs) &&
+		    test_bit(OP_DELEGRETURN, sp->allow.u.longs) &&
 		    test_bit(OP_LOCKU, sp->allow.u.longs)) {
 			dfprintk(MOUNT, "  cleanup mode enabled\n");
 			set_bit(NFS_SP4_MACH_CRED_CLEANUP, &clp->cl_sp4_flags);
 		}
 
+		if (test_bit(OP_LAYOUTRETURN, sp->allow.u.longs)) {
+			dfprintk(MOUNT, "  pnfs cleanup mode enabled\n");
+			set_bit(NFS_SP4_MACH_CRED_PNFS_CLEANUP,
+				&clp->cl_sp4_flags);
+		}
+
 		if (test_bit(OP_SECINFO, sp->allow.u.longs) &&
 		    test_bit(OP_SECINFO_NO_NAME, sp->allow.u.longs)) {
 			dfprintk(MOUNT, "  secinfo mode enabled\n");
@@ -7763,6 +7764,7 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
 	struct nfs4_layoutget *lgp = calldata;
 	struct nfs_server *server = NFS_SERVER(lgp->args.inode);
 	struct nfs4_session *session = nfs4_get_session(server);
+	int ret;
 
 	dprintk("--> %s\n", __func__);
 	/* Note the is a race here, where a CB_LAYOUTRECALL can come in
@@ -7773,12 +7775,12 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata)
 	if (nfs41_setup_sequence(session, &lgp->args.seq_args,
 				&lgp->res.seq_res, task))
 		return;
-	if (pnfs_choose_layoutget_stateid(&lgp->args.stateid,
+	ret = pnfs_choose_layoutget_stateid(&lgp->args.stateid,
 					  NFS_I(lgp->args.inode)->layout,
 					  &lgp->args.range,
-					  lgp->args.ctx->state)) {
-		rpc_exit(task, NFS4_OK);
-	}
+					  lgp->args.ctx->state);
+	if (ret < 0)
+		rpc_exit(task, ret);
 }
 
 static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
@@ -7798,6 +7800,15 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata)
 	switch (task->tk_status) {
 	case 0:
 		goto out;
+
+	/*
+	 * NFS4ERR_LAYOUTUNAVAILABLE means we are not supposed to use pnfs
+	 * on the file. Set tk_status to -ENODATA to tell the upper layer
+	 * to retry in-band.
+	 */
+	case -NFS4ERR_LAYOUTUNAVAILABLE:
+		task->tk_status = -ENODATA;
+		goto out;
 	/*
 	 * NFS4ERR_BADLAYOUT means the MDS cannot return a layout of
 	 * length lgp->args.minlength != 0 (see RFC5661 section 18.43.3).
@@ -7994,6 +8005,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags)
 	trace_nfs4_layoutget(lgp->args.ctx,
 			&lgp->args.range,
 			&lgp->res.range,
+			&lgp->res.stateid,
 			status);
 	/* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */
 	if (status == 0 && lgp->res.layoutp->len)
@@ -8050,9 +8062,10 @@ static void nfs4_layoutreturn_release(void *calldata)
 
 	dprintk("--> %s\n", __func__);
 	spin_lock(&lo->plh_inode->i_lock);
+	pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range);
+	pnfs_mark_layout_returned_if_empty(lo);
 	if (lrp->res.lrs_present)
 		pnfs_set_layout_stateid(lo, &lrp->res.stateid, true);
-	pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range);
 	pnfs_clear_layoutreturn_waitbit(lo);
 	spin_unlock(&lo->plh_inode->i_lock);
 	pnfs_free_lseg_list(&freeme);
@@ -8085,6 +8098,10 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync)
 	};
 	int status = 0;
 
+	nfs4_state_protect(NFS_SERVER(lrp->args.inode)->nfs_client,
+			NFS_SP4_MACH_CRED_PNFS_CLEANUP,
+			&task_setup_data.rpc_client, &msg);
+
 	dprintk("--> %s\n", __func__);
 	if (!sync) {
 		lrp->inode = nfs_igrab_and_active(lrp->args.inode);
@@ -8100,7 +8117,7 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync)
 		return PTR_ERR(task);
 	if (sync)
 		status = task->tk_status;
-	trace_nfs4_layoutreturn(lrp->args.inode, status);
+	trace_nfs4_layoutreturn(lrp->args.inode, &lrp->args.stateid, status);
 	dprintk("<-- %s status=%d\n", __func__, status);
 	rpc_put_task(task);
 	return status;
@@ -8248,7 +8265,7 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync)
 		return PTR_ERR(task);
 	if (sync)
 		status = task->tk_status;
-	trace_nfs4_layoutcommit(data->args.inode, status);
+	trace_nfs4_layoutcommit(data->args.inode, &data->args.stateid, status);
 	dprintk("%s: status %d\n", __func__, status);
 	rpc_put_task(task);
 	return status;
@@ -8748,6 +8765,24 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = {
 #endif
 };
 
+ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size)
+{
+	ssize_t error, error2;
+
+	error = generic_listxattr(dentry, list, size);
+	if (error < 0)
+		return error;
+	if (list) {
+		list += error;
+		size -= error;
+	}
+
+	error2 = nfs4_listxattr_nfs4_label(d_inode(dentry), list, size);
+	if (error2 < 0)
+		return error2;
+	return error + error2;
+}
+
 static const struct inode_operations nfs4_dir_inode_operations = {
 	.create		= nfs_create,
 	.lookup		= nfs_lookup,
@@ -8764,7 +8799,7 @@ static const struct inode_operations nfs4_dir_inode_operations = {
 	.setattr	= nfs_setattr,
 	.getxattr	= generic_getxattr,
 	.setxattr	= generic_setxattr,
-	.listxattr	= generic_listxattr,
+	.listxattr	= nfs4_listxattr,
 	.removexattr	= generic_removexattr,
 };
 
@@ -8774,7 +8809,7 @@ static const struct inode_operations nfs4_file_inode_operations = {
 	.setattr	= nfs_setattr,
 	.getxattr	= generic_getxattr,
 	.setxattr	= generic_setxattr,
-	.listxattr	= generic_listxattr,
+	.listxattr	= nfs4_listxattr,
 	.removexattr	= generic_removexattr,
 };
 
@@ -8833,7 +8868,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
 };
 
 static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = {
-	.prefix	= XATTR_NAME_NFSV4_ACL,
+	.name	= XATTR_NAME_NFSV4_ACL,
 	.list	= nfs4_xattr_list_nfs4_acl,
 	.get	= nfs4_xattr_get_nfs4_acl,
 	.set	= nfs4_xattr_set_nfs4_acl,
diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c
index 0fbd3ab1b..8693d77c4 100644
--- a/fs/nfs/nfs4sysctl.c
+++ b/fs/nfs/nfs4sysctl.c
@@ -12,7 +12,7 @@
 #include "nfs4idmap.h"
 #include "callback.h"
 
-static const int nfs_set_port_min = 0;
+static const int nfs_set_port_min;
 static const int nfs_set_port_max = 65535;
 static struct ctl_table_header *nfs4_callback_sysctl_table;
 
diff --git a/fs/nfs/nfs4trace.c b/fs/nfs/nfs4trace.c
index d774335cc..2850bce19 100644
--- a/fs/nfs/nfs4trace.c
+++ b/fs/nfs/nfs4trace.c
@@ -6,6 +6,7 @@
 #include "internal.h"
 #include "nfs4session.h"
 #include "callback.h"
+#include "pnfs.h"
 
 #define CREATE_TRACE_POINTS
 #include "nfs4trace.h"
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 671cf68fe..2c8d05dae 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -321,6 +321,7 @@ TRACE_EVENT(nfs4_sequence_done,
 			__entry->highest_slotid = res->sr_highest_slotid;
 			__entry->target_highest_slotid =
 					res->sr_target_highest_slotid;
+			__entry->status_flags = res->sr_status_flags;
 			__entry->error = res->sr_status;
 		),
 		TP_printk(
@@ -399,6 +400,10 @@ DECLARE_EVENT_CLASS(nfs4_open_event,
 			__field(u64, fileid)
 			__field(u64, dir)
 			__string(name, ctx->dentry->d_name.name)
+			__field(int, stateid_seq)
+			__field(u32, stateid_hash)
+			__field(int, openstateid_seq)
+			__field(u32, openstateid_hash)
 		),
 
 		TP_fast_assign(
@@ -409,8 +414,22 @@ DECLARE_EVENT_CLASS(nfs4_open_event,
 			__entry->flags = flags;
 			__entry->fmode = (__force unsigned int)ctx->mode;
 			__entry->dev = ctx->dentry->d_sb->s_dev;
-			if (!IS_ERR_OR_NULL(state))
+			if (!IS_ERR_OR_NULL(state)) {
 				inode = state->inode;
+				__entry->stateid_seq =
+					be32_to_cpu(state->stateid.seqid);
+				__entry->stateid_hash =
+					nfs_stateid_hash(&state->stateid);
+				__entry->openstateid_seq =
+					be32_to_cpu(state->open_stateid.seqid);
+				__entry->openstateid_hash =
+					nfs_stateid_hash(&state->open_stateid);
+			} else {
+				__entry->stateid_seq = 0;
+				__entry->stateid_hash = 0;
+				__entry->openstateid_seq = 0;
+				__entry->openstateid_hash = 0;
+			}
 			if (inode != NULL) {
 				__entry->fileid = NFS_FILEID(inode);
 				__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
@@ -425,7 +444,8 @@ DECLARE_EVENT_CLASS(nfs4_open_event,
 		TP_printk(
 			"error=%d (%s) flags=%d (%s) fmode=%s "
 			"fileid=%02x:%02x:%llu fhandle=0x%08x "
-			"name=%02x:%02x:%llu/%s",
+			"name=%02x:%02x:%llu/%s stateid=%d:0x%08x "
+			"openstateid=%d:0x%08x",
 			 __entry->error,
 			 show_nfsv4_errors(__entry->error),
 			 __entry->flags,
@@ -436,7 +456,9 @@ DECLARE_EVENT_CLASS(nfs4_open_event,
 			 __entry->fhandle,
 			 MAJOR(__entry->dev), MINOR(__entry->dev),
 			 (unsigned long long)__entry->dir,
-			 __get_str(name)
+			 __get_str(name),
+			 __entry->stateid_seq, __entry->stateid_hash,
+			 __entry->openstateid_seq, __entry->openstateid_hash
 		)
 );
 
@@ -452,6 +474,45 @@ DEFINE_NFS4_OPEN_EVENT(nfs4_open_reclaim);
 DEFINE_NFS4_OPEN_EVENT(nfs4_open_expired);
 DEFINE_NFS4_OPEN_EVENT(nfs4_open_file);
 
+TRACE_EVENT(nfs4_cached_open,
+		TP_PROTO(
+			const struct nfs4_state *state
+		),
+		TP_ARGS(state),
+		TP_STRUCT__entry(
+			__field(dev_t, dev)
+			__field(u32, fhandle)
+			__field(u64, fileid)
+			__field(unsigned int, fmode)
+			__field(int, stateid_seq)
+			__field(u32, stateid_hash)
+		),
+
+		TP_fast_assign(
+			const struct inode *inode = state->inode;
+
+			__entry->dev = inode->i_sb->s_dev;
+			__entry->fileid = NFS_FILEID(inode);
+			__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+			__entry->fmode = (__force unsigned int)state->state;
+			__entry->stateid_seq =
+				be32_to_cpu(state->stateid.seqid);
+			__entry->stateid_hash =
+				nfs_stateid_hash(&state->stateid);
+		),
+
+		TP_printk(
+			"fmode=%s fileid=%02x:%02x:%llu "
+			"fhandle=0x%08x stateid=%d:0x%08x",
+			__entry->fmode ?  show_fmode_flags(__entry->fmode) :
+					  "closed",
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->fileid,
+			__entry->fhandle,
+			__entry->stateid_seq, __entry->stateid_hash
+		)
+);
+
 TRACE_EVENT(nfs4_close,
 		TP_PROTO(
 			const struct nfs4_state *state,
@@ -468,6 +529,8 @@ TRACE_EVENT(nfs4_close,
 			__field(u64, fileid)
 			__field(unsigned int, fmode)
 			__field(int, error)
+			__field(int, stateid_seq)
+			__field(u32, stateid_hash)
 		),
 
 		TP_fast_assign(
@@ -478,18 +541,23 @@ TRACE_EVENT(nfs4_close,
 			__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
 			__entry->fmode = (__force unsigned int)state->state;
 			__entry->error = error;
+			__entry->stateid_seq =
+				be32_to_cpu(args->stateid.seqid);
+			__entry->stateid_hash =
+				nfs_stateid_hash(&args->stateid);
 		),
 
 		TP_printk(
 			"error=%d (%s) fmode=%s fileid=%02x:%02x:%llu "
-			"fhandle=0x%08x",
+			"fhandle=0x%08x openstateid=%d:0x%08x",
 			__entry->error,
 			show_nfsv4_errors(__entry->error),
 			__entry->fmode ?  show_fmode_flags(__entry->fmode) :
 					  "closed",
 			MAJOR(__entry->dev), MINOR(__entry->dev),
 			(unsigned long long)__entry->fileid,
-			__entry->fhandle
+			__entry->fhandle,
+			__entry->stateid_seq, __entry->stateid_hash
 		)
 );
 
@@ -523,6 +591,8 @@ DECLARE_EVENT_CLASS(nfs4_lock_event,
 			__field(dev_t, dev)
 			__field(u32, fhandle)
 			__field(u64, fileid)
+			__field(int, stateid_seq)
+			__field(u32, stateid_hash)
 		),
 
 		TP_fast_assign(
@@ -536,11 +606,16 @@ DECLARE_EVENT_CLASS(nfs4_lock_event,
 			__entry->dev = inode->i_sb->s_dev;
 			__entry->fileid = NFS_FILEID(inode);
 			__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+			__entry->stateid_seq =
+				be32_to_cpu(state->stateid.seqid);
+			__entry->stateid_hash =
+				nfs_stateid_hash(&state->stateid);
 		),
 
 		TP_printk(
 			"error=%d (%s) cmd=%s:%s range=%lld:%lld "
-			"fileid=%02x:%02x:%llu fhandle=0x%08x",
+			"fileid=%02x:%02x:%llu fhandle=0x%08x "
+			"stateid=%d:0x%08x",
 			__entry->error,
 			show_nfsv4_errors(__entry->error),
 			show_lock_cmd(__entry->cmd),
@@ -549,7 +624,8 @@ DECLARE_EVENT_CLASS(nfs4_lock_event,
 			(long long)__entry->end,
 			MAJOR(__entry->dev), MINOR(__entry->dev),
 			(unsigned long long)__entry->fileid,
-			__entry->fhandle
+			__entry->fhandle,
+			__entry->stateid_seq, __entry->stateid_hash
 		)
 );
 
@@ -563,11 +639,73 @@ DECLARE_EVENT_CLASS(nfs4_lock_event,
 			), \
 			TP_ARGS(request, state, cmd, error))
 DEFINE_NFS4_LOCK_EVENT(nfs4_get_lock);
-DEFINE_NFS4_LOCK_EVENT(nfs4_set_lock);
-DEFINE_NFS4_LOCK_EVENT(nfs4_lock_reclaim);
-DEFINE_NFS4_LOCK_EVENT(nfs4_lock_expired);
 DEFINE_NFS4_LOCK_EVENT(nfs4_unlock);
 
+TRACE_EVENT(nfs4_set_lock,
+		TP_PROTO(
+			const struct file_lock *request,
+			const struct nfs4_state *state,
+			const nfs4_stateid *lockstateid,
+			int cmd,
+			int error
+		),
+
+		TP_ARGS(request, state, lockstateid, cmd, error),
+
+		TP_STRUCT__entry(
+			__field(int, error)
+			__field(int, cmd)
+			__field(char, type)
+			__field(loff_t, start)
+			__field(loff_t, end)
+			__field(dev_t, dev)
+			__field(u32, fhandle)
+			__field(u64, fileid)
+			__field(int, stateid_seq)
+			__field(u32, stateid_hash)
+			__field(int, lockstateid_seq)
+			__field(u32, lockstateid_hash)
+		),
+
+		TP_fast_assign(
+			const struct inode *inode = state->inode;
+
+			__entry->error = error;
+			__entry->cmd = cmd;
+			__entry->type = request->fl_type;
+			__entry->start = request->fl_start;
+			__entry->end = request->fl_end;
+			__entry->dev = inode->i_sb->s_dev;
+			__entry->fileid = NFS_FILEID(inode);
+			__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+			__entry->stateid_seq =
+				be32_to_cpu(state->stateid.seqid);
+			__entry->stateid_hash =
+				nfs_stateid_hash(&state->stateid);
+			__entry->lockstateid_seq =
+				be32_to_cpu(lockstateid->seqid);
+			__entry->lockstateid_hash =
+				nfs_stateid_hash(lockstateid);
+		),
+
+		TP_printk(
+			"error=%d (%s) cmd=%s:%s range=%lld:%lld "
+			"fileid=%02x:%02x:%llu fhandle=0x%08x "
+			"stateid=%d:0x%08x lockstateid=%d:0x%08x",
+			__entry->error,
+			show_nfsv4_errors(__entry->error),
+			show_lock_cmd(__entry->cmd),
+			show_lock_type(__entry->type),
+			(long long)__entry->start,
+			(long long)__entry->end,
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->fileid,
+			__entry->fhandle,
+			__entry->stateid_seq, __entry->stateid_hash,
+			__entry->lockstateid_seq, __entry->lockstateid_hash
+		)
+);
+
 DECLARE_EVENT_CLASS(nfs4_set_delegation_event,
 		TP_PROTO(
 			const struct inode *inode,
@@ -621,20 +759,28 @@ TRACE_EVENT(nfs4_delegreturn_exit,
 			__field(dev_t, dev)
 			__field(u32, fhandle)
 			__field(int, error)
+			__field(int, stateid_seq)
+			__field(u32, stateid_hash)
 		),
 
 		TP_fast_assign(
 			__entry->dev = res->server->s_dev;
 			__entry->fhandle = nfs_fhandle_hash(args->fhandle);
 			__entry->error = error;
+			__entry->stateid_seq =
+				be32_to_cpu(args->stateid->seqid);
+			__entry->stateid_hash =
+				nfs_stateid_hash(args->stateid);
 		),
 
 		TP_printk(
-			"error=%d (%s) dev=%02x:%02x fhandle=0x%08x",
+			"error=%d (%s) dev=%02x:%02x fhandle=0x%08x "
+			"stateid=%d:0x%08x",
 			__entry->error,
 			show_nfsv4_errors(__entry->error),
 			MAJOR(__entry->dev), MINOR(__entry->dev),
-			__entry->fhandle
+			__entry->fhandle,
+			__entry->stateid_seq, __entry->stateid_hash
 		)
 );
 
@@ -653,6 +799,8 @@ DECLARE_EVENT_CLASS(nfs4_test_stateid_event,
 			__field(dev_t, dev)
 			__field(u32, fhandle)
 			__field(u64, fileid)
+			__field(int, stateid_seq)
+			__field(u32, stateid_hash)
 		),
 
 		TP_fast_assign(
@@ -662,15 +810,21 @@ DECLARE_EVENT_CLASS(nfs4_test_stateid_event,
 			__entry->dev = inode->i_sb->s_dev;
 			__entry->fileid = NFS_FILEID(inode);
 			__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+			__entry->stateid_seq =
+				be32_to_cpu(state->stateid.seqid);
+			__entry->stateid_hash =
+				nfs_stateid_hash(&state->stateid);
 		),
 
 		TP_printk(
-			"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x",
+			"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+			"stateid=%d:0x%08x",
 			__entry->error,
 			show_nfsv4_errors(__entry->error),
 			MAJOR(__entry->dev), MINOR(__entry->dev),
 			(unsigned long long)__entry->fileid,
-			__entry->fhandle
+			__entry->fhandle,
+			__entry->stateid_seq, __entry->stateid_hash
 		)
 );
 
@@ -820,7 +974,6 @@ DECLARE_EVENT_CLASS(nfs4_inode_event,
 			), \
 			TP_ARGS(inode, error))
 
-DEFINE_NFS4_INODE_EVENT(nfs4_setattr);
 DEFINE_NFS4_INODE_EVENT(nfs4_access);
 DEFINE_NFS4_INODE_EVENT(nfs4_readlink);
 DEFINE_NFS4_INODE_EVENT(nfs4_readdir);
@@ -830,8 +983,59 @@ DEFINE_NFS4_INODE_EVENT(nfs4_set_acl);
 DEFINE_NFS4_INODE_EVENT(nfs4_get_security_label);
 DEFINE_NFS4_INODE_EVENT(nfs4_set_security_label);
 #endif /* CONFIG_NFS_V4_SECURITY_LABEL */
-DEFINE_NFS4_INODE_EVENT(nfs4_recall_delegation);
-DEFINE_NFS4_INODE_EVENT(nfs4_delegreturn);
+
+DECLARE_EVENT_CLASS(nfs4_inode_stateid_event,
+		TP_PROTO(
+			const struct inode *inode,
+			const nfs4_stateid *stateid,
+			int error
+		),
+
+		TP_ARGS(inode, stateid, error),
+
+		TP_STRUCT__entry(
+			__field(dev_t, dev)
+			__field(u32, fhandle)
+			__field(u64, fileid)
+			__field(int, error)
+			__field(int, stateid_seq)
+			__field(u32, stateid_hash)
+		),
+
+		TP_fast_assign(
+			__entry->dev = inode->i_sb->s_dev;
+			__entry->fileid = NFS_FILEID(inode);
+			__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+			__entry->error = error;
+			__entry->stateid_seq =
+				be32_to_cpu(stateid->seqid);
+			__entry->stateid_hash =
+				nfs_stateid_hash(stateid);
+		),
+
+		TP_printk(
+			"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+			"stateid=%d:0x%08x",
+			__entry->error,
+			show_nfsv4_errors(__entry->error),
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->fileid,
+			__entry->fhandle,
+			__entry->stateid_seq, __entry->stateid_hash
+		)
+);
+
+#define DEFINE_NFS4_INODE_STATEID_EVENT(name) \
+	DEFINE_EVENT(nfs4_inode_stateid_event, name, \
+			TP_PROTO( \
+				const struct inode *inode, \
+				const nfs4_stateid *stateid, \
+				int error \
+			), \
+			TP_ARGS(inode, stateid, error))
+
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_setattr);
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_delegreturn);
 
 DECLARE_EVENT_CLASS(nfs4_getattr_event,
 		TP_PROTO(
@@ -941,8 +1145,74 @@ DECLARE_EVENT_CLASS(nfs4_inode_callback_event,
 			), \
 			TP_ARGS(clp, fhandle, inode, error))
 DEFINE_NFS4_INODE_CALLBACK_EVENT(nfs4_cb_getattr);
-DEFINE_NFS4_INODE_CALLBACK_EVENT(nfs4_cb_layoutrecall_inode);
 
+DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event,
+		TP_PROTO(
+			const struct nfs_client *clp,
+			const struct nfs_fh *fhandle,
+			const struct inode *inode,
+			const nfs4_stateid *stateid,
+			int error
+		),
+
+		TP_ARGS(clp, fhandle, inode, stateid, error),
+
+		TP_STRUCT__entry(
+			__field(int, error)
+			__field(dev_t, dev)
+			__field(u32, fhandle)
+			__field(u64, fileid)
+			__string(dstaddr, clp ?
+				rpc_peeraddr2str(clp->cl_rpcclient,
+					RPC_DISPLAY_ADDR) : "unknown")
+			__field(int, stateid_seq)
+			__field(u32, stateid_hash)
+		),
+
+		TP_fast_assign(
+			__entry->error = error;
+			__entry->fhandle = nfs_fhandle_hash(fhandle);
+			if (inode != NULL) {
+				__entry->fileid = NFS_FILEID(inode);
+				__entry->dev = inode->i_sb->s_dev;
+			} else {
+				__entry->fileid = 0;
+				__entry->dev = 0;
+			}
+			__assign_str(dstaddr, clp ?
+				rpc_peeraddr2str(clp->cl_rpcclient,
+					RPC_DISPLAY_ADDR) : "unknown");
+			__entry->stateid_seq =
+				be32_to_cpu(stateid->seqid);
+			__entry->stateid_hash =
+				nfs_stateid_hash(stateid);
+		),
+
+		TP_printk(
+			"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
+			"stateid=%d:0x%08x dstaddr=%s",
+			__entry->error,
+			show_nfsv4_errors(__entry->error),
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->fileid,
+			__entry->fhandle,
+			__entry->stateid_seq, __entry->stateid_hash,
+			__get_str(dstaddr)
+		)
+);
+
+#define DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(name) \
+	DEFINE_EVENT(nfs4_inode_stateid_callback_event, name, \
+			TP_PROTO( \
+				const struct nfs_client *clp, \
+				const struct nfs_fh *fhandle, \
+				const struct inode *inode, \
+				const nfs4_stateid *stateid, \
+				int error \
+			), \
+			TP_ARGS(clp, fhandle, inode, stateid, error))
+DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(nfs4_cb_recall);
+DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(nfs4_cb_layoutrecall_file);
 
 DECLARE_EVENT_CLASS(nfs4_idmap_event,
 		TP_PROTO(
@@ -1005,28 +1275,37 @@ DECLARE_EVENT_CLASS(nfs4_read_event,
 			__field(loff_t, offset)
 			__field(size_t, count)
 			__field(int, error)
+			__field(int, stateid_seq)
+			__field(u32, stateid_hash)
 		),
 
 		TP_fast_assign(
 			const struct inode *inode = hdr->inode;
+			const struct nfs4_state *state =
+				hdr->args.context->state;
 			__entry->dev = inode->i_sb->s_dev;
 			__entry->fileid = NFS_FILEID(inode);
 			__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
 			__entry->offset = hdr->args.offset;
 			__entry->count = hdr->args.count;
 			__entry->error = error;
+			__entry->stateid_seq =
+				be32_to_cpu(state->stateid.seqid);
+			__entry->stateid_hash =
+				nfs_stateid_hash(&state->stateid);
 		),
 
 		TP_printk(
 			"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
-			"offset=%lld count=%zu",
+			"offset=%lld count=%zu stateid=%d:0x%08x",
 			__entry->error,
 			show_nfsv4_errors(__entry->error),
 			MAJOR(__entry->dev), MINOR(__entry->dev),
 			(unsigned long long)__entry->fileid,
 			__entry->fhandle,
 			(long long)__entry->offset,
-			__entry->count
+			__entry->count,
+			__entry->stateid_seq, __entry->stateid_hash
 		)
 );
 #define DEFINE_NFS4_READ_EVENT(name) \
@@ -1056,28 +1335,37 @@ DECLARE_EVENT_CLASS(nfs4_write_event,
 			__field(loff_t, offset)
 			__field(size_t, count)
 			__field(int, error)
+			__field(int, stateid_seq)
+			__field(u32, stateid_hash)
 		),
 
 		TP_fast_assign(
 			const struct inode *inode = hdr->inode;
+			const struct nfs4_state *state =
+				hdr->args.context->state;
 			__entry->dev = inode->i_sb->s_dev;
 			__entry->fileid = NFS_FILEID(inode);
 			__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
 			__entry->offset = hdr->args.offset;
 			__entry->count = hdr->args.count;
 			__entry->error = error;
+			__entry->stateid_seq =
+				be32_to_cpu(state->stateid.seqid);
+			__entry->stateid_hash =
+				nfs_stateid_hash(&state->stateid);
 		),
 
 		TP_printk(
 			"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
-			"offset=%lld count=%zu",
+			"offset=%lld count=%zu stateid=%d:0x%08x",
 			__entry->error,
 			show_nfsv4_errors(__entry->error),
 			MAJOR(__entry->dev), MINOR(__entry->dev),
 			(unsigned long long)__entry->fileid,
 			__entry->fhandle,
 			(long long)__entry->offset,
-			__entry->count
+			__entry->count,
+			__entry->stateid_seq, __entry->stateid_hash
 		)
 );
 
@@ -1154,10 +1442,11 @@ TRACE_EVENT(nfs4_layoutget,
 			const struct nfs_open_context *ctx,
 			const struct pnfs_layout_range *args,
 			const struct pnfs_layout_range *res,
+			const nfs4_stateid *layout_stateid,
 			int error
 		),
 
-		TP_ARGS(ctx, args, res, error),
+		TP_ARGS(ctx, args, res, layout_stateid, error),
 
 		TP_STRUCT__entry(
 			__field(dev_t, dev)
@@ -1167,10 +1456,15 @@ TRACE_EVENT(nfs4_layoutget,
 			__field(u64, offset)
 			__field(u64, count)
 			__field(int, error)
+			__field(int, stateid_seq)
+			__field(u32, stateid_hash)
+			__field(int, layoutstateid_seq)
+			__field(u32, layoutstateid_hash)
 		),
 
 		TP_fast_assign(
 			const struct inode *inode = d_inode(ctx->dentry);
+			const struct nfs4_state *state = ctx->state;
 			__entry->dev = inode->i_sb->s_dev;
 			__entry->fileid = NFS_FILEID(inode);
 			__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
@@ -1178,11 +1472,25 @@ TRACE_EVENT(nfs4_layoutget,
 			__entry->offset = args->offset;
 			__entry->count = args->length;
 			__entry->error = error;
+			__entry->stateid_seq =
+				be32_to_cpu(state->stateid.seqid);
+			__entry->stateid_hash =
+				nfs_stateid_hash(&state->stateid);
+			if (!error) {
+				__entry->layoutstateid_seq =
+				be32_to_cpu(layout_stateid->seqid);
+				__entry->layoutstateid_hash =
+				nfs_stateid_hash(layout_stateid);
+			} else {
+				__entry->layoutstateid_seq = 0;
+				__entry->layoutstateid_hash = 0;
+			}
 		),
 
 		TP_printk(
 			"error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
-			"iomode=%s offset=%llu count=%llu",
+			"iomode=%s offset=%llu count=%llu stateid=%d:0x%08x "
+			"layoutstateid=%d:0x%08x",
 			__entry->error,
 			show_nfsv4_errors(__entry->error),
 			MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -1190,14 +1498,83 @@ TRACE_EVENT(nfs4_layoutget,
 			__entry->fhandle,
 			show_pnfs_iomode(__entry->iomode),
 			(unsigned long long)__entry->offset,
-			(unsigned long long)__entry->count
+			(unsigned long long)__entry->count,
+			__entry->stateid_seq, __entry->stateid_hash,
+			__entry->layoutstateid_seq, __entry->layoutstateid_hash
 		)
 );
 
-DEFINE_NFS4_INODE_EVENT(nfs4_layoutcommit);
-DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn);
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutcommit);
+DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutreturn);
 DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn_on_close);
 
+#define show_pnfs_update_layout_reason(reason)				\
+	__print_symbolic(reason,					\
+		{ PNFS_UPDATE_LAYOUT_UNKNOWN, "unknown" },		\
+		{ PNFS_UPDATE_LAYOUT_NO_PNFS, "no pnfs" },		\
+		{ PNFS_UPDATE_LAYOUT_RD_ZEROLEN, "read+zerolen" },	\
+		{ PNFS_UPDATE_LAYOUT_MDSTHRESH, "mdsthresh" },		\
+		{ PNFS_UPDATE_LAYOUT_NOMEM, "nomem" },			\
+		{ PNFS_UPDATE_LAYOUT_BULK_RECALL, "bulk recall" },	\
+		{ PNFS_UPDATE_LAYOUT_IO_TEST_FAIL, "io test fail" },	\
+		{ PNFS_UPDATE_LAYOUT_FOUND_CACHED, "found cached" },	\
+		{ PNFS_UPDATE_LAYOUT_RETURN, "layoutreturn" },		\
+		{ PNFS_UPDATE_LAYOUT_BLOCKED, "layouts blocked" },	\
+		{ PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET, "sent layoutget" })
+
+TRACE_EVENT(pnfs_update_layout,
+		TP_PROTO(struct inode *inode,
+			loff_t pos,
+			u64 count,
+			enum pnfs_iomode iomode,
+			struct pnfs_layout_hdr *lo,
+			enum pnfs_update_layout_reason reason
+		),
+		TP_ARGS(inode, pos, count, iomode, lo, reason),
+		TP_STRUCT__entry(
+			__field(dev_t, dev)
+			__field(u64, fileid)
+			__field(u32, fhandle)
+			__field(loff_t, pos)
+			__field(u64, count)
+			__field(enum pnfs_iomode, iomode)
+			__field(int, layoutstateid_seq)
+			__field(u32, layoutstateid_hash)
+			__field(enum pnfs_update_layout_reason, reason)
+		),
+		TP_fast_assign(
+			__entry->dev = inode->i_sb->s_dev;
+			__entry->fileid = NFS_FILEID(inode);
+			__entry->fhandle = nfs_fhandle_hash(NFS_FH(inode));
+			__entry->pos = pos;
+			__entry->count = count;
+			__entry->iomode = iomode;
+			__entry->reason = reason;
+			if (lo != NULL) {
+				__entry->layoutstateid_seq =
+				be32_to_cpu(lo->plh_stateid.seqid);
+				__entry->layoutstateid_hash =
+				nfs_stateid_hash(&lo->plh_stateid);
+			} else {
+				__entry->layoutstateid_seq = 0;
+				__entry->layoutstateid_hash = 0;
+			}
+		),
+		TP_printk(
+			"fileid=%02x:%02x:%llu fhandle=0x%08x "
+			"iomode=%s pos=%llu count=%llu "
+			"layoutstateid=%d:0x%08x (%s)",
+			MAJOR(__entry->dev), MINOR(__entry->dev),
+			(unsigned long long)__entry->fileid,
+			__entry->fhandle,
+			show_pnfs_iomode(__entry->iomode),
+			(unsigned long long)__entry->pos,
+			(unsigned long long)__entry->count,
+			__entry->layoutstateid_seq, __entry->layoutstateid_hash,
+			show_pnfs_update_layout_reason(__entry->reason)
+		)
+);
+
 #endif /* CONFIG_NFS_V4_1 */
 
 #endif /* _TRACE_NFS4_H */
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index 59f838cdc..9f80a086b 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -39,7 +39,6 @@
 			{ 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \
 			{ 1 << NFS_INO_FLUSHING, "FLUSHING" }, \
 			{ 1 << NFS_INO_FSCACHE, "FSCACHE" }, \
-			{ 1 << NFS_INO_COMMIT, "COMMIT" }, \
 			{ 1 << NFS_INO_LAYOUTCOMMIT, "NEED_LAYOUTCOMMIT" }, \
 			{ 1 << NFS_INO_LAYOUTCOMMITTING, "LAYOUTCOMMIT" })
 
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 452a011ba..8ce4f61cb 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -101,53 +101,18 @@ nfs_page_free(struct nfs_page *p)
 	kmem_cache_free(nfs_page_cachep, p);
 }
 
-static void
-nfs_iocounter_inc(struct nfs_io_counter *c)
-{
-	atomic_inc(&c->io_count);
-}
-
-static void
-nfs_iocounter_dec(struct nfs_io_counter *c)
-{
-	if (atomic_dec_and_test(&c->io_count)) {
-		clear_bit(NFS_IO_INPROGRESS, &c->flags);
-		smp_mb__after_atomic();
-		wake_up_bit(&c->flags, NFS_IO_INPROGRESS);
-	}
-}
-
-static int
-__nfs_iocounter_wait(struct nfs_io_counter *c)
-{
-	wait_queue_head_t *wq = bit_waitqueue(&c->flags, NFS_IO_INPROGRESS);
-	DEFINE_WAIT_BIT(q, &c->flags, NFS_IO_INPROGRESS);
-	int ret = 0;
-
-	do {
-		prepare_to_wait(wq, &q.wait, TASK_KILLABLE);
-		set_bit(NFS_IO_INPROGRESS, &c->flags);
-		if (atomic_read(&c->io_count) == 0)
-			break;
-		ret = nfs_wait_bit_killable(&q.key, TASK_KILLABLE);
-	} while (atomic_read(&c->io_count) != 0 && !ret);
-	finish_wait(wq, &q.wait);
-	return ret;
-}
-
 /**
  * nfs_iocounter_wait - wait for i/o to complete
- * @c: nfs_io_counter to use
+ * @l_ctx: nfs_lock_context whose io_count is waited on
  *
  * returns -ERESTARTSYS if interrupted by a fatal signal.
  * Otherwise returns 0 once the io_count hits 0.
  */
 int
-nfs_iocounter_wait(struct nfs_io_counter *c)
+nfs_iocounter_wait(struct nfs_lock_context *l_ctx)
 {
-	if (atomic_read(&c->io_count) == 0)
-		return 0;
-	return __nfs_iocounter_wait(c);
+	return wait_on_atomic_t(&l_ctx->io_count, nfs_wait_atomic_killable,
+			TASK_KILLABLE);
 }
 
 /*
@@ -370,7 +335,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page,
 		return ERR_CAST(l_ctx);
 	}
 	req->wb_lock_context = l_ctx;
-	nfs_iocounter_inc(&l_ctx->io_count);
+	atomic_inc(&l_ctx->io_count);
 
 	/* Initialize the request struct. Initially, we assume a
 	 * long write-back delay. This will be adjusted in
@@ -431,7 +396,8 @@ static void nfs_clear_request(struct nfs_page *req)
 		req->wb_page = NULL;
 	}
 	if (l_ctx != NULL) {
-		nfs_iocounter_dec(&l_ctx->io_count);
+		if (atomic_dec_and_test(&l_ctx->io_count))
+			wake_up_atomic_t(&l_ctx->io_count);
 		nfs_put_lock_context(l_ctx);
 		req->wb_lock_context = NULL;
 	}
@@ -664,22 +630,11 @@ EXPORT_SYMBOL_GPL(nfs_initiate_pgio);
  * @desc: IO descriptor
  * @hdr: pageio header
  */
-static int nfs_pgio_error(struct nfs_pageio_descriptor *desc,
-			  struct nfs_pgio_header *hdr)
+static void nfs_pgio_error(struct nfs_pgio_header *hdr)
 {
-	struct nfs_pgio_mirror *mirror;
-	u32 midx;
-
 	set_bit(NFS_IOHDR_REDO, &hdr->flags);
 	nfs_pgio_data_destroy(hdr);
 	hdr->completion_ops->completion(hdr);
-	/* TODO: Make sure it's right to clean up all mirrors here
-	 *       and not just hdr->pgio_mirror_idx */
-	for (midx = 0; midx < desc->pg_mirror_count; midx++) {
-		mirror = &desc->pg_mirrors[midx];
-		desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
-	}
-	return -ENOMEM;
 }
 
 /**
@@ -800,8 +755,11 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
 	unsigned int pagecount, pageused;
 
 	pagecount = nfs_page_array_len(mirror->pg_base, mirror->pg_count);
-	if (!nfs_pgarray_set(&hdr->page_array, pagecount))
-		return nfs_pgio_error(desc, hdr);
+	if (!nfs_pgarray_set(&hdr->page_array, pagecount)) {
+		nfs_pgio_error(hdr);
+		desc->pg_error = -ENOMEM;
+		return desc->pg_error;
+	}
 
 	nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq);
 	pages = hdr->page_array.pagevec;
@@ -819,8 +777,11 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc,
 			*pages++ = last_page = req->wb_page;
 		}
 	}
-	if (WARN_ON_ONCE(pageused != pagecount))
-		return nfs_pgio_error(desc, hdr);
+	if (WARN_ON_ONCE(pageused != pagecount)) {
+		nfs_pgio_error(hdr);
+		desc->pg_error = -EINVAL;
+		return desc->pg_error;
+	}
 
 	if ((desc->pg_ioflags & FLUSH_COND_STABLE) &&
 	    (desc->pg_moreio || nfs_reqs_to_commit(&cinfo)))
@@ -835,18 +796,13 @@ EXPORT_SYMBOL_GPL(nfs_generic_pgio);
 
 static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc)
 {
-	struct nfs_pgio_mirror *mirror;
 	struct nfs_pgio_header *hdr;
 	int ret;
 
-	mirror = nfs_pgio_current_mirror(desc);
-
 	hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
 	if (!hdr) {
-		/* TODO: make sure this is right with mirroring - or
-		 *       should it back out all mirrors? */
-		desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
-		return -ENOMEM;
+		desc->pg_error = -ENOMEM;
+		return desc->pg_error;
 	}
 	nfs_pgheader_init(desc, hdr, nfs_pgio_header_free);
 	ret = nfs_generic_pgio(desc, hdr);
@@ -874,6 +830,9 @@ static int nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio,
 
 	mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req);
 
+	if (pgio->pg_error < 0)
+		return pgio->pg_error;
+
 	if (!mirror_count || mirror_count > NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX)
 		return -EINVAL;
 
@@ -903,12 +862,6 @@ static void nfs_pageio_cleanup_mirroring(struct nfs_pageio_descriptor *pgio)
 	pgio->pg_mirrors_dynamic = NULL;
 }
 
-static bool nfs_match_open_context(const struct nfs_open_context *ctx1,
-		const struct nfs_open_context *ctx2)
-{
-	return ctx1->cred == ctx2->cred && ctx1->state == ctx2->state;
-}
-
 static bool nfs_match_lock_context(const struct nfs_lock_context *l1,
 		const struct nfs_lock_context *l2)
 {
@@ -982,6 +935,8 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
 	} else {
 		if (desc->pg_ops->pg_init)
 			desc->pg_ops->pg_init(desc, req);
+		if (desc->pg_error < 0)
+			return 0;
 		mirror->pg_base = req->wb_pgbase;
 	}
 	if (!nfs_can_coalesce_requests(prev, req, desc))
@@ -1147,6 +1102,8 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
 	bytes = req->wb_bytes;
 
 	nfs_pageio_setup_mirroring(desc, req);
+	if (desc->pg_error < 0)
+		goto out_failed;
 
 	for (midx = 0; midx < desc->pg_mirror_count; midx++) {
 		if (midx) {
@@ -1163,7 +1120,8 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
 
 			if (IS_ERR(dupreq)) {
 				nfs_page_group_unlock(req);
-				return 0;
+				desc->pg_error = PTR_ERR(dupreq);
+				goto out_failed;
 			}
 
 			nfs_lock_request(dupreq);
@@ -1176,10 +1134,32 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
 		if (nfs_pgio_has_mirroring(desc))
 			desc->pg_mirror_idx = midx;
 		if (!nfs_pageio_add_request_mirror(desc, dupreq))
-			return 0;
+			goto out_failed;
 	}
 
 	return 1;
+
+out_failed:
+	/*
+	 * We might have failed before sending any reqs over the wire.
+	 * Clean up the rest of the reqs in each mirror's pg_list.
+	 */
+	if (desc->pg_error) {
+		struct nfs_pgio_mirror *mirror;
+		void (*func)(struct list_head *);
+
+		/* remember fatal errors */
+		if (nfs_error_is_fatal(desc->pg_error))
+			mapping_set_error(desc->pg_inode->i_mapping,
+					  desc->pg_error);
+
+		func = desc->pg_completion_ops->error_cleanup;
+		for (midx = 0; midx < desc->pg_mirror_count; midx++) {
+			mirror = &desc->pg_mirrors[midx];
+			func(&mirror->pg_list);
+		}
+	}
+	return 0;
 }
 
 /*
@@ -1232,7 +1212,7 @@ int nfs_pageio_resend(struct nfs_pageio_descriptor *desc,
 	nfs_pageio_complete(desc);
 	if (!list_empty(&failed)) {
 		list_move(&failed, &hdr->pages);
-		return -EIO;
+		return desc->pg_error < 0 ? desc->pg_error : -EIO;
 	}
 	return 0;
 }
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index bec038449..2fa483e6d 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -52,9 +52,7 @@ static DEFINE_SPINLOCK(pnfs_spinlock);
  */
 static LIST_HEAD(pnfs_modules_tbl);
 
-static int
-pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
-		       enum pnfs_iomode iomode, bool sync);
+static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo);
 
 /* Return the registered pnfs layout driver module matching given id */
 static struct pnfs_layoutdriver_type *
@@ -243,6 +241,8 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
 {
 	struct inode *inode = lo->plh_inode;
 
+	pnfs_layoutreturn_before_put_layout_hdr(lo);
+
 	if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) {
 		if (!list_empty(&lo->plh_segs))
 			WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n");
@@ -252,6 +252,27 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
 	}
 }
 
+/*
+ * Mark a pnfs_layout_hdr and all associated layout segments as invalid
+ *
+ * In order to continue using the pnfs_layout_hdr, a full recovery
+ * is required.
+ * Note that caller must hold inode->i_lock.
+ */
+static int
+pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo,
+		struct list_head *lseg_list)
+{
+	struct pnfs_layout_range range = {
+		.iomode = IOMODE_ANY,
+		.offset = 0,
+		.length = NFS4_MAX_UINT64,
+	};
+
+	set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
+	return pnfs_mark_matching_lsegs_invalid(lo, lseg_list, &range);
+}
+
 static int
 pnfs_iomode_to_fail_bit(u32 iomode)
 {
@@ -345,58 +366,6 @@ pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo,
 	rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq);
 }
 
-/* Return true if layoutreturn is needed */
-static bool
-pnfs_layout_need_return(struct pnfs_layout_hdr *lo,
-			struct pnfs_layout_segment *lseg)
-{
-	struct pnfs_layout_segment *s;
-
-	if (!test_and_clear_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
-		return false;
-
-	list_for_each_entry(s, &lo->plh_segs, pls_list)
-		if (s != lseg && test_bit(NFS_LSEG_LAYOUTRETURN, &s->pls_flags))
-			return false;
-
-	return true;
-}
-
-static bool
-pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo)
-{
-	if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
-		return false;
-	lo->plh_return_iomode = 0;
-	pnfs_get_layout_hdr(lo);
-	clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags);
-	return true;
-}
-
-static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg,
-		struct pnfs_layout_hdr *lo, struct inode *inode)
-{
-	lo = lseg->pls_layout;
-	inode = lo->plh_inode;
-
-	spin_lock(&inode->i_lock);
-	if (pnfs_layout_need_return(lo, lseg)) {
-		nfs4_stateid stateid;
-		enum pnfs_iomode iomode;
-		bool send;
-
-		stateid = lo->plh_stateid;
-		iomode = lo->plh_return_iomode;
-		send = pnfs_prepare_layoutreturn(lo);
-		spin_unlock(&inode->i_lock);
-		if (send) {
-			/* Send an async layoutreturn so we dont deadlock */
-			pnfs_send_layoutreturn(lo, stateid, iomode, false);
-		}
-	} else
-		spin_unlock(&inode->i_lock);
-}
-
 void
 pnfs_put_lseg(struct pnfs_layout_segment *lseg)
 {
@@ -410,15 +379,8 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg)
 		atomic_read(&lseg->pls_refcount),
 		test_bit(NFS_LSEG_VALID, &lseg->pls_flags));
 
-	/* Handle the case where refcount != 1 */
-	if (atomic_add_unless(&lseg->pls_refcount, -1, 1))
-		return;
-
 	lo = lseg->pls_layout;
 	inode = lo->plh_inode;
-	/* Do we need a layoutreturn? */
-	if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
-		pnfs_layoutreturn_before_put_lseg(lseg, lo, inode);
 
 	if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) {
 		if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) {
@@ -566,10 +528,10 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg,
 int
 pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
 			    struct list_head *tmp_list,
-			    struct pnfs_layout_range *recall_range)
+			    const struct pnfs_layout_range *recall_range)
 {
 	struct pnfs_layout_segment *lseg, *next;
-	int invalid = 0, removed = 0;
+	int remaining = 0;
 
 	dprintk("%s:Begin lo %p\n", __func__, lo);
 
@@ -582,11 +544,11 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
 				"offset %llu length %llu\n", __func__,
 				lseg, lseg->pls_range.iomode, lseg->pls_range.offset,
 				lseg->pls_range.length);
-			invalid++;
-			removed += mark_lseg_invalid(lseg, tmp_list);
+			if (!mark_lseg_invalid(lseg, tmp_list))
+				remaining++;
 		}
-	dprintk("%s:Return %i\n", __func__, invalid - removed);
-	return invalid - removed;
+	dprintk("%s:Return %i\n", __func__, remaining);
+	return remaining;
 }
 
 /* note free_me must contain lsegs from a single layout_hdr */
@@ -613,12 +575,10 @@ pnfs_destroy_layout(struct nfs_inode *nfsi)
 	spin_lock(&nfsi->vfs_inode.i_lock);
 	lo = nfsi->layout;
 	if (lo) {
-		lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
-		pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL);
 		pnfs_get_layout_hdr(lo);
+		pnfs_mark_layout_stateid_invalid(lo, &tmp_list);
 		pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED);
 		pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED);
-		pnfs_clear_retry_layoutget(lo);
 		spin_unlock(&nfsi->vfs_inode.i_lock);
 		pnfs_free_lseg_list(&tmp_list);
 		pnfs_put_layout_hdr(lo);
@@ -677,11 +637,6 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
 {
 	struct pnfs_layout_hdr *lo;
 	struct inode *inode;
-	struct pnfs_layout_range range = {
-		.iomode = IOMODE_ANY,
-		.offset = 0,
-		.length = NFS4_MAX_UINT64,
-	};
 	LIST_HEAD(lseg_list);
 	int ret = 0;
 
@@ -696,13 +651,15 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list,
 
 		spin_lock(&inode->i_lock);
 		list_del_init(&lo->plh_bulk_destroy);
-		lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */
-		if (is_bulk_recall)
-			set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
-		if (pnfs_mark_matching_lsegs_invalid(lo, &lseg_list, &range))
+		if (pnfs_mark_layout_stateid_invalid(lo, &lseg_list)) {
+			if (is_bulk_recall)
+				set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags);
 			ret = -EAGAIN;
+		}
 		spin_unlock(&inode->i_lock);
 		pnfs_free_lseg_list(&lseg_list);
+		/* Free all lsegs that are attached to commit buckets */
+		nfs_commit_inode(inode, 0);
 		pnfs_put_layout_hdr(lo);
 		iput(inode);
 	}
@@ -826,7 +783,7 @@ pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo)
 
 int
 pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
-			      struct pnfs_layout_range *range,
+			      const struct pnfs_layout_range *range,
 			      struct nfs4_state *open_state)
 {
 	int status = 0;
@@ -861,7 +818,7 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo,
 static struct pnfs_layout_segment *
 send_layoutget(struct pnfs_layout_hdr *lo,
 	   struct nfs_open_context *ctx,
-	   struct pnfs_layout_range *range,
+	   const struct pnfs_layout_range *range,
 	   gfp_t gfp_flags)
 {
 	struct inode *ino = lo->plh_inode;
@@ -894,7 +851,7 @@ send_layoutget(struct pnfs_layout_hdr *lo,
 				lgp->args.minlength = i_size - range->offset;
 		}
 		lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE;
-		lgp->args.range = *range;
+		pnfs_copy_range(&lgp->args.range, range);
 		lgp->args.type = server->pnfs_curr_ld->id;
 		lgp->args.inode = ino;
 		lgp->args.ctx = get_nfs_open_context(ctx);
@@ -904,17 +861,9 @@ send_layoutget(struct pnfs_layout_hdr *lo,
 		lseg = nfs4_proc_layoutget(lgp, gfp_flags);
 	} while (lseg == ERR_PTR(-EAGAIN));
 
-	if (IS_ERR(lseg)) {
-		switch (PTR_ERR(lseg)) {
-		case -ENOMEM:
-		case -ERESTARTSYS:
-			break;
-		default:
-			/* remember that LAYOUTGET failed and suspend trying */
-			pnfs_layout_io_set_failed(lo, range->iomode);
-		}
-		return NULL;
-	} else
+	if (IS_ERR(lseg) && !nfs_error_is_fatal(PTR_ERR(lseg)))
+		lseg = NULL;
+	else
 		pnfs_layout_clear_fail_bit(lo,
 				pnfs_iomode_to_fail_bit(range->iomode));
 
@@ -944,8 +893,19 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo)
 	rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq);
 }
 
+static bool
+pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo)
+{
+	if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags))
+		return false;
+	lo->plh_return_iomode = 0;
+	pnfs_get_layout_hdr(lo);
+	clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
+	return true;
+}
+
 static int
-pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
+pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid,
 		       enum pnfs_iomode iomode, bool sync)
 {
 	struct inode *ino = lo->plh_inode;
@@ -962,7 +922,7 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid,
 		goto out;
 	}
 
-	lrp->args.stateid = stateid;
+	nfs4_stateid_copy(&lrp->args.stateid, stateid);
 	lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id;
 	lrp->args.inode = ino;
 	lrp->args.range.iomode = iomode;
@@ -978,6 +938,48 @@ out:
 	return status;
 }
 
+/* Return true if layoutreturn is needed */
+static bool
+pnfs_layout_need_return(struct pnfs_layout_hdr *lo)
+{
+	struct pnfs_layout_segment *s;
+
+	if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
+		return false;
+
+	/* Defer layoutreturn until all lsegs are done */
+	list_for_each_entry(s, &lo->plh_segs, pls_list) {
+		if (test_bit(NFS_LSEG_LAYOUTRETURN, &s->pls_flags))
+			return false;
+	}
+
+	return true;
+}
+
+static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+	struct inode *inode= lo->plh_inode;
+
+	if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
+		return;
+	spin_lock(&inode->i_lock);
+	if (pnfs_layout_need_return(lo)) {
+		nfs4_stateid stateid;
+		enum pnfs_iomode iomode;
+		bool send;
+
+		nfs4_stateid_copy(&stateid, &lo->plh_stateid);
+		iomode = lo->plh_return_iomode;
+		send = pnfs_prepare_layoutreturn(lo);
+		spin_unlock(&inode->i_lock);
+		if (send) {
+			/* Send an async layoutreturn so we don't deadlock */
+			pnfs_send_layoutreturn(lo, &stateid, iomode, false);
+		}
+	} else
+		spin_unlock(&inode->i_lock);
+}
+
 /*
  * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr
  * when the layout segment list is empty.
@@ -1005,7 +1007,7 @@ _pnfs_return_layout(struct inode *ino)
 		dprintk("NFS: %s no layout to return\n", __func__);
 		goto out;
 	}
-	stateid = nfsi->layout->plh_stateid;
+	nfs4_stateid_copy(&stateid, &nfsi->layout->plh_stateid);
 	/* Reference matched in nfs4_layoutreturn_release */
 	pnfs_get_layout_hdr(lo);
 	empty = list_empty(&lo->plh_segs);
@@ -1033,7 +1035,7 @@ _pnfs_return_layout(struct inode *ino)
 	spin_unlock(&ino->i_lock);
 	pnfs_free_lseg_list(&tmp_list);
 	if (send)
-		status = pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
+		status = pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true);
 out_put_layout_hdr:
 	pnfs_put_layout_hdr(lo);
 out:
@@ -1096,13 +1098,12 @@ bool pnfs_roc(struct inode *ino)
 			goto out_noroc;
 	}
 
-	stateid = lo->plh_stateid;
+	nfs4_stateid_copy(&stateid, &lo->plh_stateid);
 	/* always send layoutreturn if being marked so */
-	if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
+	if (test_and_clear_bit(NFS_LAYOUT_RETURN_REQUESTED,
 				   &lo->plh_flags))
 		layoutreturn = pnfs_prepare_layoutreturn(lo);
 
-	pnfs_clear_retry_layoutget(lo);
 	list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list)
 		/* If we are sending layoutreturn, invalidate all valid lsegs */
 		if (layoutreturn || test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) {
@@ -1124,7 +1125,7 @@ out_noroc:
 	pnfs_free_lseg_list(&tmp_list);
 	pnfs_layoutcommit_inode(ino, true);
 	if (layoutreturn)
-		pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true);
+		pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true);
 	return roc;
 }
 
@@ -1149,6 +1150,7 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier)
 
 	spin_lock(&ino->i_lock);
 	lo = NFS_I(ino)->layout;
+	pnfs_mark_layout_returned_if_empty(lo);
 	if (pnfs_seqid_is_newer(barrier, lo->plh_barrier))
 		lo->plh_barrier = barrier;
 	spin_unlock(&ino->i_lock);
@@ -1465,25 +1467,15 @@ static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx,
 	return ret;
 }
 
-/* stop waiting if someone clears NFS_LAYOUT_RETRY_LAYOUTGET bit. */
-static int pnfs_layoutget_retry_bit_wait(struct wait_bit_key *key, int mode)
-{
-	if (!test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, key->flags))
-		return 1;
-	return nfs_wait_bit_killable(key, mode);
-}
-
 static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo)
 {
-	if (!pnfs_should_retry_layoutget(lo))
-		return false;
 	/*
 	 * send layoutcommit as it can hold up layoutreturn due to lseg
 	 * reference
 	 */
 	pnfs_layoutcommit_inode(lo->plh_inode, false);
 	return !wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN,
-				   pnfs_layoutget_retry_bit_wait,
+				   nfs_wait_bit_killable,
 				   TASK_UNINTERRUPTIBLE);
 }
 
@@ -1520,14 +1512,23 @@ pnfs_update_layout(struct inode *ino,
 	struct pnfs_layout_segment *lseg = NULL;
 	bool first;
 
-	if (!pnfs_enabled_sb(NFS_SERVER(ino)))
+	if (!pnfs_enabled_sb(NFS_SERVER(ino))) {
+		trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+				 PNFS_UPDATE_LAYOUT_NO_PNFS);
 		goto out;
+	}
 
-	if (iomode == IOMODE_READ && i_size_read(ino) == 0)
+	if (iomode == IOMODE_READ && i_size_read(ino) == 0) {
+		trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+				 PNFS_UPDATE_LAYOUT_RD_ZEROLEN);
 		goto out;
+	}
 
-	if (pnfs_within_mdsthreshold(ctx, ino, iomode))
+	if (pnfs_within_mdsthreshold(ctx, ino, iomode)) {
+		trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+				 PNFS_UPDATE_LAYOUT_MDSTHRESH);
 		goto out;
+	}
 
 lookup_again:
 	first = false;
@@ -1535,19 +1536,25 @@ lookup_again:
 	lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
 	if (lo == NULL) {
 		spin_unlock(&ino->i_lock);
+		trace_pnfs_update_layout(ino, pos, count, iomode, NULL,
+				 PNFS_UPDATE_LAYOUT_NOMEM);
 		goto out;
 	}
 
 	/* Do we even need to bother with this? */
 	if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) {
+		trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+				 PNFS_UPDATE_LAYOUT_BULK_RECALL);
 		dprintk("%s matches recall, use MDS\n", __func__);
 		goto out_unlock;
 	}
 
 	/* if LAYOUTGET already failed once we don't try again */
-	if (pnfs_layout_io_test_failed(lo, iomode) &&
-	    !pnfs_should_retry_layoutget(lo))
+	if (pnfs_layout_io_test_failed(lo, iomode)) {
+		trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+				 PNFS_UPDATE_LAYOUT_IO_TEST_FAIL);
 		goto out_unlock;
+	}
 
 	first = list_empty(&lo->plh_segs);
 	if (first) {
@@ -1567,8 +1574,11 @@ lookup_again:
 		 * already exists
 		 */
 		lseg = pnfs_find_lseg(lo, &arg);
-		if (lseg)
+		if (lseg) {
+			trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+					PNFS_UPDATE_LAYOUT_FOUND_CACHED);
 			goto out_unlock;
+		}
 	}
 
 	/*
@@ -1585,11 +1595,16 @@ lookup_again:
 			dprintk("%s retrying\n", __func__);
 			goto lookup_again;
 		}
+		trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+				PNFS_UPDATE_LAYOUT_RETURN);
 		goto out_put_layout_hdr;
 	}
 
-	if (pnfs_layoutgets_blocked(lo))
+	if (pnfs_layoutgets_blocked(lo)) {
+		trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+				PNFS_UPDATE_LAYOUT_BLOCKED);
 		goto out_unlock;
+	}
 	atomic_inc(&lo->plh_outstanding);
 	spin_unlock(&ino->i_lock);
 
@@ -1612,8 +1627,9 @@ lookup_again:
 		arg.length = PAGE_CACHE_ALIGN(arg.length);
 
 	lseg = send_layoutget(lo, ctx, &arg, gfp_flags);
-	pnfs_clear_retry_layoutget(lo);
 	atomic_dec(&lo->plh_outstanding);
+	trace_pnfs_update_layout(ino, pos, count, iomode, lo,
+				 PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET);
 out_put_layout_hdr:
 	if (first)
 		pnfs_clear_first_layoutget(lo);
@@ -1623,7 +1639,7 @@ out:
 			"(%s, offset: %llu, length: %llu)\n",
 			__func__, ino->i_sb->s_id,
 			(unsigned long long)NFS_FILEID(ino),
-			lseg == NULL ? "not found" : "found",
+			IS_ERR_OR_NULL(lseg) ? "not found" : "found",
 			iomode==IOMODE_RW ?  "read/write" : "read-only",
 			(unsigned long long)pos,
 			(unsigned long long)count);
@@ -1730,16 +1746,40 @@ out_forget_reply:
 }
 
 static void
+pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode)
+{
+	if (lo->plh_return_iomode == iomode)
+		return;
+	if (lo->plh_return_iomode != 0)
+		iomode = IOMODE_ANY;
+	lo->plh_return_iomode = iomode;
+	set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags);
+}
+
+/**
+ * pnfs_mark_matching_lsegs_return - Free or return matching layout segments
+ * @lo: pointer to layout header
+ * @tmp_list: list header to be used with pnfs_free_lseg_list()
+ * @return_range: describes layout segment ranges to be returned
+ *
+ * Mainly intended for use by layoutrecall. Each matching segment is freed
+ * immediately or, failing that, marked to be returned as soon as its refcount
+ * drops to zero. Returns the number of segments that were marked for return.
+ */
+int
 pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
 				struct list_head *tmp_list,
-				struct pnfs_layout_range *return_range)
+				const struct pnfs_layout_range *return_range)
 {
 	struct pnfs_layout_segment *lseg, *next;
+	int remaining = 0;
 
 	dprintk("%s:Begin lo %p\n", __func__, lo);
 
 	if (list_empty(&lo->plh_segs))
-		return;
+		return 0;
+
+	assert_spin_locked(&lo->plh_inode->i_lock);
 
 	list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list)
 		if (should_free_lseg(&lseg->pls_range, return_range)) {
@@ -1748,39 +1788,47 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
 				lseg, lseg->pls_range.iomode,
 				lseg->pls_range.offset,
 				lseg->pls_range.length);
+			if (mark_lseg_invalid(lseg, tmp_list))
+				continue;
+			remaining++;
 			set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags);
-			mark_lseg_invalid(lseg, tmp_list);
-			set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE,
-					&lo->plh_flags);
+			pnfs_set_plh_return_iomode(lo, return_range->iomode);
 		}
+	return remaining;
 }
 
 void pnfs_error_mark_layout_for_return(struct inode *inode,
 				       struct pnfs_layout_segment *lseg)
 {
 	struct pnfs_layout_hdr *lo = NFS_I(inode)->layout;
-	int iomode = pnfs_iomode_to_fail_bit(lseg->pls_range.iomode);
 	struct pnfs_layout_range range = {
 		.iomode = lseg->pls_range.iomode,
 		.offset = 0,
 		.length = NFS4_MAX_UINT64,
 	};
 	LIST_HEAD(free_me);
+	bool return_now = false;
 
 	spin_lock(&inode->i_lock);
-	/* set failure bit so that pnfs path will be retried later */
-	pnfs_layout_set_fail_bit(lo, iomode);
-	if (lo->plh_return_iomode == 0)
-		lo->plh_return_iomode = range.iomode;
-	else if (lo->plh_return_iomode != range.iomode)
-		lo->plh_return_iomode = IOMODE_ANY;
+	pnfs_set_plh_return_iomode(lo, range.iomode);
 	/*
 	 * mark all matching lsegs so that we are sure to have no live
 	 * segments at hand when sending layoutreturn. See pnfs_put_lseg()
 	 * for how it works.
 	 */
-	pnfs_mark_matching_lsegs_return(lo, &free_me, &range);
-	spin_unlock(&inode->i_lock);
+	if (!pnfs_mark_matching_lsegs_return(lo, &free_me, &range)) {
+		nfs4_stateid stateid;
+		enum pnfs_iomode iomode = lo->plh_return_iomode;
+
+		nfs4_stateid_copy(&stateid, &lo->plh_stateid);
+		return_now = pnfs_prepare_layoutreturn(lo);
+		spin_unlock(&inode->i_lock);
+		if (return_now)
+			pnfs_send_layoutreturn(lo, &stateid, iomode, false);
+	} else {
+		spin_unlock(&inode->i_lock);
+		nfs_commit_inode(inode, 0);
+	}
 	pnfs_free_lseg_list(&free_me);
 }
 EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return);
@@ -1802,6 +1850,11 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r
 						   rd_size,
 						   IOMODE_READ,
 						   GFP_KERNEL);
+		if (IS_ERR(pgio->pg_lseg)) {
+			pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+			pgio->pg_lseg = NULL;
+			return;
+		}
 	}
 	/* If no lseg, fall back to read through mds */
 	if (pgio->pg_lseg == NULL)
@@ -1814,13 +1867,19 @@ void
 pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
 			   struct nfs_page *req, u64 wb_size)
 {
-	if (pgio->pg_lseg == NULL)
+	if (pgio->pg_lseg == NULL) {
 		pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
 						   req->wb_context,
 						   req_offset(req),
 						   wb_size,
 						   IOMODE_RW,
 						   GFP_NOFS);
+		if (IS_ERR(pgio->pg_lseg)) {
+			pgio->pg_error = PTR_ERR(pgio->pg_lseg);
+			pgio->pg_lseg = NULL;
+			return;
+		}
+	}
 	/* If no lseg, fall back to write through mds */
 	if (pgio->pg_lseg == NULL)
 		nfs_pageio_reset_write_mds(pgio);
@@ -1988,15 +2047,13 @@ static void pnfs_writehdr_free(struct nfs_pgio_header *hdr)
 int
 pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc)
 {
-	struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
-
 	struct nfs_pgio_header *hdr;
 	int ret;
 
 	hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
 	if (!hdr) {
-		desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
-		return -ENOMEM;
+		desc->pg_error = -ENOMEM;
+		return desc->pg_error;
 	}
 	nfs_pgheader_init(desc, hdr, pnfs_writehdr_free);
 
@@ -2119,15 +2176,13 @@ static void pnfs_readhdr_free(struct nfs_pgio_header *hdr)
 int
 pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc)
 {
-	struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
-
 	struct nfs_pgio_header *hdr;
 	int ret;
 
 	hdr = nfs_pgio_header_alloc(desc->pg_rw_ops);
 	if (!hdr) {
-		desc->pg_completion_ops->error_cleanup(&mirror->pg_list);
-		return -ENOMEM;
+		desc->pg_error = -ENOMEM;
+		return desc->pg_error;
 	}
 	nfs_pgheader_init(desc, hdr, pnfs_readhdr_free);
 	hdr->lseg = pnfs_get_lseg(desc->pg_lseg);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index d1990e90e..1ac1db5f6 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -94,11 +94,10 @@ enum {
 	NFS_LAYOUT_RO_FAILED = 0,	/* get ro layout failed stop trying */
 	NFS_LAYOUT_RW_FAILED,		/* get rw layout failed stop trying */
 	NFS_LAYOUT_BULK_RECALL,		/* bulk recall affecting layout */
-	NFS_LAYOUT_RETURN,		/* Return this layout ASAP */
-	NFS_LAYOUT_RETURN_BEFORE_CLOSE,	/* Return this layout before close */
+	NFS_LAYOUT_RETURN,		/* layoutreturn in progress */
+	NFS_LAYOUT_RETURN_REQUESTED,	/* Return this layout ASAP */
 	NFS_LAYOUT_INVALID_STID,	/* layout stateid id is invalid */
 	NFS_LAYOUT_FIRST_LAYOUTGET,	/* Serialize first layoutget */
-	NFS_LAYOUT_RETRY_LAYOUTGET,	/* Retry layoutget */
 };
 
 enum layoutdriver_policy_flags {
@@ -261,11 +260,14 @@ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
 			     bool update_barrier);
 int pnfs_choose_layoutget_stateid(nfs4_stateid *dst,
 				  struct pnfs_layout_hdr *lo,
-				  struct pnfs_layout_range *range,
+				  const struct pnfs_layout_range *range,
 				  struct nfs4_state *open_state);
 int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
 				struct list_head *tmp_list,
-				struct pnfs_layout_range *recall_range);
+				const struct pnfs_layout_range *recall_range);
+int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
+				struct list_head *tmp_list,
+				const struct pnfs_layout_range *recall_range);
 bool pnfs_roc(struct inode *ino);
 void pnfs_roc_release(struct inode *ino);
 void pnfs_roc_set_barrier(struct inode *ino, u32 barrier);
@@ -379,26 +381,6 @@ nfs4_get_deviceid(struct nfs4_deviceid_node *d)
 	return d;
 }
 
-static inline void pnfs_set_retry_layoutget(struct pnfs_layout_hdr *lo)
-{
-	if (!test_and_set_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags))
-		atomic_inc(&lo->plh_refcount);
-}
-
-static inline void pnfs_clear_retry_layoutget(struct pnfs_layout_hdr *lo)
-{
-	if (test_and_clear_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags)) {
-		atomic_dec(&lo->plh_refcount);
-		/* wake up waiters for LAYOUTRETURN as that is not needed */
-		wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN);
-	}
-}
-
-static inline bool pnfs_should_retry_layoutget(struct pnfs_layout_hdr *lo)
-{
-	return test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags);
-}
-
 static inline struct pnfs_layout_segment *
 pnfs_get_lseg(struct pnfs_layout_segment *lseg)
 {
@@ -409,6 +391,12 @@ pnfs_get_lseg(struct pnfs_layout_segment *lseg)
 	return lseg;
 }
 
+static inline bool
+pnfs_is_valid_lseg(struct pnfs_layout_segment *lseg)
+{
+	return test_bit(NFS_LSEG_VALID, &lseg->pls_flags) != 0;
+}
+
 /* Return true if a layout driver is being used for this mountpoint */
 static inline int pnfs_enabled_sb(struct nfs_server *nfss)
 {
@@ -556,6 +544,26 @@ pnfs_calc_offset_length(u64 offset, u64 end)
 	return 1 + end - offset;
 }
 
+/**
+ * pnfs_mark_layout_returned_if_empty - marks the layout as returned
+ * @lo: layout header
+ *
+ * Note: Caller must hold inode->i_lock
+ */
+static inline void
+pnfs_mark_layout_returned_if_empty(struct pnfs_layout_hdr *lo)
+{
+	if (list_empty(&lo->plh_segs))
+		set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags);
+}
+
+static inline void
+pnfs_copy_range(struct pnfs_layout_range *dst,
+		const struct pnfs_layout_range *src)
+{
+	memcpy(dst, src, sizeof(*dst));
+}
+
 extern unsigned int layoutstats_timer;
 
 #ifdef NFS_DEBUG
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 24655b807..81ac6480f 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -266,17 +266,14 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
 		} else {
 			nfs_retry_commit(mds_pages, NULL, cinfo, 0);
 			pnfs_generic_retry_commit(cinfo, 0);
-			cinfo->completion_ops->error_cleanup(NFS_I(inode));
 			return -ENOMEM;
 		}
 	}
 
 	nreq += pnfs_generic_alloc_ds_commits(cinfo, &list);
 
-	if (nreq == 0) {
-		cinfo->completion_ops->error_cleanup(NFS_I(inode));
+	if (nreq == 0)
 		goto out;
-	}
 
 	atomic_add(nreq, &cinfo->mds->rpcs_out);
 
@@ -871,6 +868,11 @@ pnfs_layout_mark_request_commit(struct nfs_page *req,
 	buckets = cinfo->ds->buckets;
 	list = &buckets[ds_commit_idx].written;
 	if (list_empty(list)) {
+		if (!pnfs_is_valid_lseg(lseg)) {
+			spin_unlock(cinfo->lock);
+			cinfo->completion_ops->resched_write(cinfo, req);
+			return;
+		}
 		/* Non-empty buckets hold a reference on the lseg.  That ref
 		 * is normally transferred to the COMMIT call and released
 		 * there.  It could also be released if the last req is pulled
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 0a5e33f33..eb31e23e7 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -85,6 +85,23 @@ void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio)
 }
 EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds);
 
+static void nfs_readpage_release(struct nfs_page *req)
+{
+	struct inode *inode = d_inode(req->wb_context->dentry);
+
+	dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id,
+		(unsigned long long)NFS_FILEID(inode), req->wb_bytes,
+		(long long)req_offset(req));
+
+	if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
+		if (PageUptodate(req->wb_page))
+			nfs_readpage_to_fscache(inode, req->wb_page, 0);
+
+		unlock_page(req->wb_page);
+	}
+	nfs_release_request(req);
+}
+
 int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
 		       struct page *page)
 {
@@ -106,7 +123,10 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
 
 	nfs_pageio_init_read(&pgio, inode, false,
 			     &nfs_async_read_completion_ops);
-	nfs_pageio_add_request(&pgio, new);
+	if (!nfs_pageio_add_request(&pgio, new)) {
+		nfs_list_remove_request(new);
+		nfs_readpage_release(new);
+	}
 	nfs_pageio_complete(&pgio);
 
 	/* It doesn't make sense to do mirrored reads! */
@@ -115,24 +135,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
 	pgm = &pgio.pg_mirrors[0];
 	NFS_I(inode)->read_io += pgm->pg_bytes_written;
 
-	return 0;
-}
-
-static void nfs_readpage_release(struct nfs_page *req)
-{
-	struct inode *inode = d_inode(req->wb_context->dentry);
-
-	dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id,
-		(unsigned long long)NFS_FILEID(inode), req->wb_bytes,
-		(long long)req_offset(req));
-
-	if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) {
-		if (PageUptodate(req->wb_page))
-			nfs_readpage_to_fscache(inode, req->wb_page, 0);
-
-		unlock_page(req->wb_page);
-	}
-	nfs_release_request(req);
+	return pgio.pg_error < 0 ? pgio.pg_error : 0;
 }
 
 static void nfs_page_group_set_uptodate(struct nfs_page *req)
@@ -361,6 +364,8 @@ readpage_async_filler(void *data, struct page *page)
 	if (len < PAGE_CACHE_SIZE)
 		zero_user_segment(page, len, PAGE_CACHE_SIZE);
 	if (!nfs_pageio_add_request(desc->pgio, new)) {
+		nfs_list_remove_request(new);
+		nfs_readpage_release(new);
 		error = desc->pgio->pg_error;
 		goto out_unlock;
 	}
diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c
index b6de433da..4fe3eead3 100644
--- a/fs/nfs/symlink.c
+++ b/fs/nfs/symlink.c
@@ -42,21 +42,35 @@ error:
 	return -EIO;
 }
 
-static const char *nfs_follow_link(struct dentry *dentry, void **cookie)
+static const char *nfs_get_link(struct dentry *dentry,
+				struct inode *inode,
+				struct delayed_call *done)
 {
-	struct inode *inode = d_inode(dentry);
 	struct page *page;
 	void *err;
 
-	err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping));
-	if (err)
-		return err;
-	page = read_cache_page(&inode->i_data, 0,
-				(filler_t *)nfs_symlink_filler, inode);
-	if (IS_ERR(page))
-		return ERR_CAST(page);
-	*cookie = page;
-	return kmap(page);
+	if (!dentry) {
+		err = ERR_PTR(nfs_revalidate_mapping_rcu(inode));
+		if (err)
+			return err;
+		page = find_get_page(inode->i_mapping, 0);
+		if (!page)
+			return ERR_PTR(-ECHILD);
+		if (!PageUptodate(page)) {
+			put_page(page);
+			return ERR_PTR(-ECHILD);
+		}
+	} else {
+		err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping));
+		if (err)
+			return err;
+		page = read_cache_page(&inode->i_data, 0,
+					(filler_t *)nfs_symlink_filler, inode);
+		if (IS_ERR(page))
+			return ERR_CAST(page);
+	}
+	set_delayed_call(done, page_put_link, page);
+	return page_address(page);
 }
 
 /*
@@ -64,8 +78,7 @@ static const char *nfs_follow_link(struct dentry *dentry, void **cookie)
  */
 const struct inode_operations nfs_symlink_inode_operations = {
 	.readlink	= generic_readlink,
-	.follow_link	= nfs_follow_link,
-	.put_link	= page_put_link,
+	.get_link	= nfs_get_link,
 	.getattr	= nfs_getattr,
 	.setattr	= nfs_setattr,
 };
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 7b9316406..5754835a2 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -21,6 +21,8 @@
 #include <linux/nfs_page.h>
 #include <linux/backing-dev.h>
 #include <linux/export.h>
+#include <linux/freezer.h>
+#include <linux/wait.h>
 
 #include <asm/uaccess.h>
 
@@ -244,11 +246,9 @@ static int wb_priority(struct writeback_control *wbc)
 {
 	int ret = 0;
 	if (wbc->for_reclaim)
-		return FLUSH_HIGHPRI | FLUSH_STABLE;
+		return FLUSH_HIGHPRI | FLUSH_COND_STABLE;
 	if (wbc->sync_mode == WB_SYNC_ALL)
 		ret = FLUSH_COND_STABLE;
-	if (wbc->for_kupdate || wbc->for_background)
-		ret |= FLUSH_LOWPRI;
 	return ret;
 }
 
@@ -545,12 +545,22 @@ try_again:
 	return head;
 }
 
+static void nfs_write_error_remove_page(struct nfs_page *req)
+{
+	nfs_unlock_request(req);
+	nfs_end_page_writeback(req);
+	nfs_release_request(req);
+	generic_error_remove_page(page_file_mapping(req->wb_page),
+				  req->wb_page);
+}
+
 /*
  * Find an associated nfs write request, and prepare to flush it out
  * May return an error if the user signalled nfs_wait_on_request().
  */
 static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
-				struct page *page, bool nonblock)
+				struct page *page, bool nonblock,
+				bool launder)
 {
 	struct nfs_page *req;
 	int ret = 0;
@@ -567,8 +577,21 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio,
 
 	ret = 0;
 	if (!nfs_pageio_add_request(pgio, req)) {
-		nfs_redirty_request(req);
 		ret = pgio->pg_error;
+		/*
+		 * On a fatal error in the launder case, remove the
+		 * problematic req; other dirty pages may remain
+		 * until they get flushed.
+		 */
+		if (nfs_error_is_fatal(ret)) {
+			nfs_context_set_write_error(req->wb_context, ret);
+			if (launder) {
+				nfs_write_error_remove_page(req);
+				goto out;
+			}
+		}
+		nfs_redirty_request(req);
+		ret = -EAGAIN;
 	} else
 		nfs_add_stats(page_file_mapping(page)->host,
 				NFSIOS_WRITEPAGES, 1);
@@ -576,12 +599,14 @@ out:
 	return ret;
 }
 
-static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio)
+static int nfs_do_writepage(struct page *page, struct writeback_control *wbc,
+			    struct nfs_pageio_descriptor *pgio, bool launder)
 {
 	int ret;
 
 	nfs_pageio_cond_complete(pgio, page_file_index(page));
-	ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE);
+	ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE,
+				   launder);
 	if (ret == -EAGAIN) {
 		redirty_page_for_writepage(wbc, page);
 		ret = 0;
@@ -592,7 +617,9 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st
 /*
  * Write an mmapped page to the server.
  */
-static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc)
+static int nfs_writepage_locked(struct page *page,
+				struct writeback_control *wbc,
+				bool launder)
 {
 	struct nfs_pageio_descriptor pgio;
 	struct inode *inode = page_file_mapping(page)->host;
@@ -601,7 +628,7 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc
 	nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE);
 	nfs_pageio_init_write(&pgio, inode, wb_priority(wbc),
 				false, &nfs_async_write_completion_ops);
-	err = nfs_do_writepage(page, wbc, &pgio);
+	err = nfs_do_writepage(page, wbc, &pgio, launder);
 	nfs_pageio_complete(&pgio);
 	if (err < 0)
 		return err;
@@ -614,7 +641,7 @@ int nfs_writepage(struct page *page, struct writeback_control *wbc)
 {
 	int ret;
 
-	ret = nfs_writepage_locked(page, wbc);
+	ret = nfs_writepage_locked(page, wbc, false);
 	unlock_page(page);
 	return ret;
 }
@@ -623,7 +650,7 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control *
 {
 	int ret;
 
-	ret = nfs_do_writepage(page, wbc, data);
+	ret = nfs_do_writepage(page, wbc, data, false);
 	unlock_page(page);
 	return ret;
 }
@@ -803,11 +830,10 @@ EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked);
  * holding the nfs_page lock.
  */
 void
-nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst,
-			    struct nfs_commit_info *cinfo)
+nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo)
 {
 	spin_lock(cinfo->lock);
-	nfs_request_add_commit_list_locked(req, dst, cinfo);
+	nfs_request_add_commit_list_locked(req, &cinfo->mds->list, cinfo);
 	spin_unlock(cinfo->lock);
 	nfs_mark_page_unstable(req->wb_page, cinfo);
 }
@@ -865,7 +891,7 @@ nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
 {
 	if (pnfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx))
 		return;
-	nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo);
+	nfs_request_add_commit_list(req, cinfo);
 }
 
 static void
@@ -1128,7 +1154,8 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
 		if (req == NULL)
 			return 0;
 		l_ctx = req->wb_lock_context;
-		do_flush = req->wb_page != page || req->wb_context != ctx;
+		do_flush = req->wb_page != page ||
+			!nfs_match_open_context(req->wb_context, ctx);
 		/* for now, flush if more than 1 request in page_group */
 		do_flush |= req->wb_this_page != req;
 		if (l_ctx && flctx &&
@@ -1326,9 +1353,15 @@ static void nfs_async_write_error(struct list_head *head)
 	}
 }
 
+static void nfs_async_write_reschedule_io(struct nfs_pgio_header *hdr)
+{
+	nfs_async_write_error(&hdr->pages);
+}
+
 static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops = {
 	.error_cleanup = nfs_async_write_error,
 	.completion = nfs_write_completion,
+	.reschedule_io = nfs_async_write_reschedule_io,
 };
 
 void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio,
@@ -1529,27 +1562,21 @@ static void nfs_writeback_result(struct rpc_task *task,
 	}
 }
 
-
-static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait)
+static int wait_on_commit(struct nfs_mds_commit_info *cinfo)
 {
-	int ret;
+	return wait_on_atomic_t(&cinfo->rpcs_out,
+			nfs_wait_atomic_killable, TASK_KILLABLE);
+}
 
-	if (!test_and_set_bit(NFS_INO_COMMIT, &nfsi->flags))
-		return 1;
-	if (!may_wait)
-		return 0;
-	ret = out_of_line_wait_on_bit_lock(&nfsi->flags,
-				NFS_INO_COMMIT,
-				nfs_wait_bit_killable,
-				TASK_KILLABLE);
-	return (ret < 0) ? ret : 1;
+static void nfs_commit_begin(struct nfs_mds_commit_info *cinfo)
+{
+	atomic_inc(&cinfo->rpcs_out);
 }
 
-static void nfs_commit_clear_lock(struct nfs_inode *nfsi)
+static void nfs_commit_end(struct nfs_mds_commit_info *cinfo)
 {
-	clear_bit(NFS_INO_COMMIT, &nfsi->flags);
-	smp_mb__after_atomic();
-	wake_up_bit(&nfsi->flags, NFS_INO_COMMIT);
+	if (atomic_dec_and_test(&cinfo->rpcs_out))
+		wake_up_atomic_t(&cinfo->rpcs_out);
 }
 
 void nfs_commitdata_release(struct nfs_commit_data *data)
@@ -1666,6 +1693,13 @@ void nfs_retry_commit(struct list_head *page_list,
 }
 EXPORT_SYMBOL_GPL(nfs_retry_commit);
 
+static void
+nfs_commit_resched_write(struct nfs_commit_info *cinfo,
+		struct nfs_page *req)
+{
+	__set_page_dirty_nobuffers(req->wb_page);
+}
+
 /*
  * Commit dirty pages
  */
@@ -1687,7 +1721,6 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how,
 				   data->mds_ops, how, 0);
  out_bad:
 	nfs_retry_commit(head, NULL, cinfo, 0);
-	cinfo->completion_ops->error_cleanup(NFS_I(inode));
 	return -ENOMEM;
 }
 
@@ -1749,8 +1782,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
 		clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC);
 
 	nfs_init_cinfo(&cinfo, data->inode, data->dreq);
-	if (atomic_dec_and_test(&cinfo.mds->rpcs_out))
-		nfs_commit_clear_lock(NFS_I(data->inode));
+	nfs_commit_end(cinfo.mds);
 }
 
 static void nfs_commit_release(void *calldata)
@@ -1769,7 +1801,7 @@ static const struct rpc_call_ops nfs_commit_ops = {
 
 static const struct nfs_commit_completion_ops nfs_commit_completion_ops = {
 	.completion = nfs_commit_release_pages,
-	.error_cleanup = nfs_commit_clear_lock,
+	.resched_write = nfs_commit_resched_write,
 };
 
 int nfs_generic_commit_list(struct inode *inode, struct list_head *head,
@@ -1788,30 +1820,25 @@ int nfs_commit_inode(struct inode *inode, int how)
 	LIST_HEAD(head);
 	struct nfs_commit_info cinfo;
 	int may_wait = how & FLUSH_SYNC;
+	int error = 0;
 	int res;
 
-	res = nfs_commit_set_lock(NFS_I(inode), may_wait);
-	if (res <= 0)
-		goto out_mark_dirty;
 	nfs_init_cinfo_from_inode(&cinfo, inode);
+	nfs_commit_begin(cinfo.mds);
 	res = nfs_scan_commit(inode, &head, &cinfo);
-	if (res) {
-		int error;
-
+	if (res)
 		error = nfs_generic_commit_list(inode, &head, how, &cinfo);
-		if (error < 0)
-			return error;
-		if (!may_wait)
-			goto out_mark_dirty;
-		error = wait_on_bit_action(&NFS_I(inode)->flags,
-				NFS_INO_COMMIT,
-				nfs_wait_bit_killable,
-				TASK_KILLABLE);
-		if (error < 0)
-			return error;
-	} else
-		nfs_commit_clear_lock(NFS_I(inode));
+	nfs_commit_end(cinfo.mds);
+	if (error < 0)
+		goto out_error;
+	if (!may_wait)
+		goto out_mark_dirty;
+	error = wait_on_commit(cinfo.mds);
+	if (error < 0)
+		return error;
 	return res;
+out_error:
+	res = error;
 	/* Note: If we exit without ensuring that the commit is complete,
 	 * we must mark the inode as dirty. Otherwise, future calls to
 	 * sync_inode() with the WB_SYNC_ALL flag set will fail to ensure
@@ -1821,6 +1848,7 @@ out_mark_dirty:
 	__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
 	return res;
 }
+EXPORT_SYMBOL_GPL(nfs_commit_inode);
 
 int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
 {
@@ -1911,7 +1939,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page)
 /*
  * Write back all requests on one page - we do this before reading it.
  */
-int nfs_wb_page(struct inode *inode, struct page *page)
+int nfs_wb_single_page(struct inode *inode, struct page *page, bool launder)
 {
 	loff_t range_start = page_file_offset(page);
 	loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1);
@@ -1928,7 +1956,7 @@ int nfs_wb_page(struct inode *inode, struct page *page)
 	for (;;) {
 		wait_on_page_writeback(page);
 		if (clear_page_dirty_for_io(page)) {
-			ret = nfs_writepage_locked(page, &wbc);
+			ret = nfs_writepage_locked(page, &wbc, launder);
 			if (ret < 0)
 				goto out_error;
 			continue;
diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c
index 77e7a5cca..1a03bc305 100644
--- a/fs/nfsd/lockd.c
+++ b/fs/nfsd/lockd.c
@@ -58,7 +58,7 @@ nlm_fclose(struct file *filp)
 	fput(filp);
 }
 
-static struct nlmsvc_binding	nfsd_nlm_ops = {
+static const struct nlmsvc_binding nfsd_nlm_ops = {
 	.fopen		= nlm_fopen,		/* open file for locking */
 	.fclose		= nlm_fclose,		/* close file */
 };
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index d8b16c256..5fbf3bbd0 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -92,7 +92,7 @@ struct nfsd_net {
 
 	struct file *rec_file;
 	bool in_grace;
-	struct nfsd4_client_tracking_ops *client_tracking_ops;
+	const struct nfsd4_client_tracking_ops *client_tracking_ops;
 
 	time_t nfsd4_lease;
 	time_t nfsd4_grace;
diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c
index 00575d776..2246454de 100644
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -823,7 +823,7 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp,
 		} else
 			dchild = dget(dparent);
 	} else
-		dchild = lookup_one_len(name, dparent, namlen);
+		dchild = lookup_one_len_unlocked(name, dparent, namlen);
 	if (IS_ERR(dchild))
 		return rv;
 	if (d_mountpoint(dchild))
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index e7f50c408..7389cb1d7 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -792,12 +792,16 @@ static void warn_no_callback_path(struct nfs4_client *clp, int reason)
 
 static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason)
 {
+	if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags))
+		return;
 	clp->cl_cb_state = NFSD4_CB_DOWN;
 	warn_no_callback_path(clp, reason);
 }
 
 static void nfsd4_mark_cb_fault(struct nfs4_client *clp, int reason)
 {
+	if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags))
+		return;
 	clp->cl_cb_state = NFSD4_CB_FAULT;
 	warn_no_callback_path(clp, reason);
 }
@@ -1143,7 +1147,7 @@ nfsd4_run_cb_work(struct work_struct *work)
 }
 
 void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
-		struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op)
+		const struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op)
 {
 	cb->cb_clp = clp;
 	cb->cb_msg.rpc_proc = &nfs4_cb_procedures[op];
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index c9d6c715c..ce2d010d3 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -22,7 +22,7 @@ struct nfs4_layout {
 static struct kmem_cache *nfs4_layout_cache;
 static struct kmem_cache *nfs4_layout_stateid_cache;
 
-static struct nfsd4_callback_ops nfsd4_cb_layout_ops;
+static const struct nfsd4_callback_ops nfsd4_cb_layout_ops;
 static const struct lock_manager_operations nfsd4_layouts_lm_ops;
 
 const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] =  {
@@ -624,24 +624,39 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task)
 {
 	struct nfs4_layout_stateid *ls =
 		container_of(cb, struct nfs4_layout_stateid, ls_recall);
+	struct nfsd_net *nn;
+	ktime_t now, cutoff;
 	LIST_HEAD(reaplist);
 
+
 	switch (task->tk_status) {
 	case 0:
-		return 1;
+	case -NFS4ERR_DELAY:
+		/*
+		 * Anything left? If not, then call it done. Note that we don't
+		 * take the spinlock since this is an optimization and nothing
+		 * should get added until the cb counter goes to zero.
+		 */
+		if (list_empty(&ls->ls_layouts))
+			return 1;
+
+		/* Poll the client until it's done with the layout */
+		now = ktime_get();
+		nn = net_generic(ls->ls_stid.sc_client->net, nfsd_net_id);
+
+		/* Client gets 2 lease periods to return it */
+		cutoff = ktime_add_ns(task->tk_start,
+					 nn->nfsd4_lease * NSEC_PER_SEC * 2);
+
+		if (ktime_before(now, cutoff)) {
+			rpc_delay(task, HZ/100); /* 10 milliseconds */
+			return 0;
+		}
+		/* Fallthrough */
 	case -NFS4ERR_NOMATCHING_LAYOUT:
 		trace_layout_recall_done(&ls->ls_stid.sc_stateid);
 		task->tk_status = 0;
 		return 1;
-	case -NFS4ERR_DELAY:
-		/* Poll the client until it's done with the layout */
-		/* FIXME: cap number of retries.
-		 * The pnfs standard states that we need to only expire
-		 * the client after at-least "lease time" .eg lease-time * 2
-		 * when failing to communicate a recall
-		 */
-		rpc_delay(task, HZ/100); /* 10 mili-seconds */
-		return 0;
 	default:
 		/*
 		 * Unknown error or non-responding client, we'll need to fence.
@@ -665,7 +680,7 @@ nfsd4_cb_layout_release(struct nfsd4_callback *cb)
 	nfs4_put_stid(&ls->ls_stid);
 }
 
-static struct nfsd4_callback_ops nfsd4_cb_layout_ops = {
+static const struct nfsd4_callback_ops nfsd4_cb_layout_ops = {
 	.prepare	= nfsd4_cb_layout_prepare,
 	.done		= nfsd4_cb_layout_done,
 	.release	= nfsd4_cb_layout_release,
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index a9f096c7e..4cba7865f 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -55,10 +55,10 @@ nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u
 	struct inode *inode = d_inode(resfh->fh_dentry);
 	int status;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 	status = security_inode_setsecctx(resfh->fh_dentry,
 		label->data, label->len);
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 
 	if (status)
 		/*
@@ -774,8 +774,9 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags);
 
 	/* check stateid */
-	status = nfs4_preprocess_stateid_op(rqstp, cstate, &read->rd_stateid,
-			RD_STATE, &read->rd_filp, &read->rd_tmp_file);
+	status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
+					&read->rd_stateid, RD_STATE,
+					&read->rd_filp, &read->rd_tmp_file);
 	if (status) {
 		dprintk("NFSD: nfsd4_read: couldn't process stateid!\n");
 		goto out;
@@ -921,7 +922,8 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	if (setattr->sa_iattr.ia_valid & ATTR_SIZE) {
 		status = nfs4_preprocess_stateid_op(rqstp, cstate,
-			&setattr->sa_stateid, WR_STATE, NULL, NULL);
+				&cstate->current_fh, &setattr->sa_stateid,
+				WR_STATE, NULL, NULL);
 		if (status) {
 			dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n");
 			return status;
@@ -985,8 +987,8 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (write->wr_offset >= OFFSET_MAX)
 		return nfserr_inval;
 
-	status = nfs4_preprocess_stateid_op(rqstp, cstate, stateid, WR_STATE,
-			&filp, NULL);
+	status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
+						stateid, WR_STATE, &filp, NULL);
 	if (status) {
 		dprintk("NFSD: nfsd4_write: couldn't process stateid!\n");
 		return status;
@@ -1010,13 +1012,54 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 }
 
 static __be32
+nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+		struct nfsd4_clone *clone)
+{
+	struct file *src, *dst;
+	__be32 status;
+
+	status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->save_fh,
+					    &clone->cl_src_stateid, RD_STATE,
+					    &src, NULL);
+	if (status) {
+		dprintk("NFSD: %s: couldn't process src stateid!\n", __func__);
+		goto out;
+	}
+
+	status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
+					    &clone->cl_dst_stateid, WR_STATE,
+					    &dst, NULL);
+	if (status) {
+		dprintk("NFSD: %s: couldn't process dst stateid!\n", __func__);
+		goto out_put_src;
+	}
+
+	/* fix up for NFS-specific error code */
+	if (!S_ISREG(file_inode(src)->i_mode) ||
+	    !S_ISREG(file_inode(dst)->i_mode)) {
+		status = nfserr_wrong_type;
+		goto out_put_dst;
+	}
+
+	status = nfsd4_clone_file_range(src, clone->cl_src_pos,
+			dst, clone->cl_dst_pos, clone->cl_count);
+
+out_put_dst:
+	fput(dst);
+out_put_src:
+	fput(src);
+out:
+	return status;
+}
+
+static __be32
 nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		struct nfsd4_fallocate *fallocate, int flags)
 {
 	__be32 status = nfserr_notsupp;
 	struct file *file;
 
-	status = nfs4_preprocess_stateid_op(rqstp, cstate,
+	status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
 					    &fallocate->falloc_stateid,
 					    WR_STATE, &file, NULL);
 	if (status != nfs_ok) {
@@ -1055,7 +1098,7 @@ nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	__be32 status;
 	struct file *file;
 
-	status = nfs4_preprocess_stateid_op(rqstp, cstate,
+	status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
 					    &seek->seek_stateid,
 					    RD_STATE, &file, NULL);
 	if (status) {
@@ -2279,6 +2322,12 @@ static struct nfsd4_operation nfsd4_ops[] = {
 		.op_name = "OP_DEALLOCATE",
 		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
 	},
+	[OP_CLONE] = {
+		.op_func = (nfsd4op_func)nfsd4_clone,
+		.op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
+		.op_name = "OP_CLONE",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
+	},
 	[OP_SEEK] = {
 		.op_func = (nfsd4op_func)nfsd4_seek,
 		.op_name = "OP_SEEK",
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index e3d47091b..dc8ebecf5 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -192,7 +192,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
 
 	dir = nn->rec_file->f_path.dentry;
 	/* lock the parent */
-	mutex_lock(&d_inode(dir)->i_mutex);
+	inode_lock(d_inode(dir));
 
 	dentry = lookup_one_len(dname, dir, HEXDIR_LEN-1);
 	if (IS_ERR(dentry)) {
@@ -213,7 +213,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp)
 out_put:
 	dput(dentry);
 out_unlock:
-	mutex_unlock(&d_inode(dir)->i_mutex);
+	inode_unlock(d_inode(dir));
 	if (status == 0) {
 		if (nn->in_grace) {
 			crp = nfs4_client_to_reclaim(dname, nn);
@@ -286,7 +286,7 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn)
 	}
 
 	status = iterate_dir(nn->rec_file, &ctx.ctx);
-	mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT);
+	inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
 
 	list_for_each_entry_safe(entry, tmp, &ctx.names, list) {
 		if (!status) {
@@ -302,7 +302,7 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn)
 		list_del(&entry->list);
 		kfree(entry);
 	}
-	mutex_unlock(&d_inode(dir)->i_mutex);
+	inode_unlock(d_inode(dir));
 	nfs4_reset_creds(original_cred);
 
 	list_for_each_entry_safe(entry, tmp, &ctx.names, list) {
@@ -322,7 +322,7 @@ nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn)
 	dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name);
 
 	dir = nn->rec_file->f_path.dentry;
-	mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT);
+	inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
 	dentry = lookup_one_len(name, dir, namlen);
 	if (IS_ERR(dentry)) {
 		status = PTR_ERR(dentry);
@@ -335,7 +335,7 @@ nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn)
 out:
 	dput(dentry);
 out_unlock:
-	mutex_unlock(&d_inode(dir)->i_mutex);
+	inode_unlock(d_inode(dir));
 	return status;
 }
 
@@ -631,7 +631,7 @@ nfsd4_check_legacy_client(struct nfs4_client *clp)
 	return -ENOENT;
 }
 
-static struct nfsd4_client_tracking_ops nfsd4_legacy_tracking_ops = {
+static const struct nfsd4_client_tracking_ops nfsd4_legacy_tracking_ops = {
 	.init		= nfsd4_legacy_tracking_init,
 	.exit		= nfsd4_legacy_tracking_exit,
 	.create		= nfsd4_create_clid_dir,
@@ -1050,7 +1050,7 @@ out_err:
 		printk(KERN_ERR "NFSD: Unable to end grace period: %d\n", ret);
 }
 
-static struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = {
+static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = {
 	.init		= nfsd4_init_cld_pipe,
 	.exit		= nfsd4_remove_cld_pipe,
 	.create		= nfsd4_cld_create,
@@ -1394,7 +1394,7 @@ nfsd4_umh_cltrack_grace_done(struct nfsd_net *nn)
 	kfree(legacy);
 }
 
-static struct nfsd4_client_tracking_ops nfsd4_umh_tracking_ops = {
+static const struct nfsd4_client_tracking_ops nfsd4_umh_tracking_ops = {
 	.init		= nfsd4_umh_cltrack_init,
 	.exit		= NULL,
 	.create		= nfsd4_umh_cltrack_create,
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 6b800b5b8..c484a2b6c 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -98,7 +98,7 @@ static struct kmem_cache *odstate_slab;
 
 static void free_session(struct nfsd4_session *);
 
-static struct nfsd4_callback_ops nfsd4_cb_recall_ops;
+static const struct nfsd4_callback_ops nfsd4_cb_recall_ops;
 
 static bool is_session_dead(struct nfsd4_session *ses)
 {
@@ -1857,15 +1857,28 @@ static void copy_clid(struct nfs4_client *target, struct nfs4_client *source)
 	target->cl_clientid.cl_id = source->cl_clientid.cl_id; 
 }
 
-static int copy_cred(struct svc_cred *target, struct svc_cred *source)
+int strdup_if_nonnull(char **target, char *source)
 {
-	if (source->cr_principal) {
-		target->cr_principal =
-				kstrdup(source->cr_principal, GFP_KERNEL);
-		if (target->cr_principal == NULL)
+	if (source) {
+		*target = kstrdup(source, GFP_KERNEL);
+		if (!*target)
 			return -ENOMEM;
 	} else
-		target->cr_principal = NULL;
+		*target = NULL;
+	return 0;
+}
+
+static int copy_cred(struct svc_cred *target, struct svc_cred *source)
+{
+	int ret;
+
+	ret = strdup_if_nonnull(&target->cr_principal, source->cr_principal);
+	if (ret)
+		return ret;
+	ret = strdup_if_nonnull(&target->cr_raw_principal,
+					source->cr_raw_principal);
+	if (ret)
+		return ret;
 	target->cr_flavor = source->cr_flavor;
 	target->cr_uid = source->cr_uid;
 	target->cr_gid = source->cr_gid;
@@ -1969,6 +1982,9 @@ static bool mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp)
 		return false;
 	if (!svc_rqst_integrity_protected(rqstp))
 		return false;
+	if (cl->cl_cred.cr_raw_principal)
+		return 0 == strcmp(cl->cl_cred.cr_raw_principal,
+						cr->cr_raw_principal);
 	if (!cr->cr_principal)
 		return false;
 	return 0 == strcmp(cl->cl_cred.cr_principal, cr->cr_principal);
@@ -2240,7 +2256,8 @@ nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
 	base = resp->cstate.data_offset;
 	slot->sl_datalen = buf->len - base;
 	if (read_bytes_from_xdr_buf(buf, base, slot->sl_data, slot->sl_datalen))
-		WARN("%s: sessions DRC could not cache compound\n", __func__);
+		WARN(1, "%s: sessions DRC could not cache compound\n",
+		     __func__);
 	return;
 }
 
@@ -2365,10 +2382,27 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
 	if (exid->flags & ~EXCHGID4_FLAG_MASK_A)
 		return nfserr_inval;
 
+	new = create_client(exid->clname, rqstp, &verf);
+	if (new == NULL)
+		return nfserr_jukebox;
+
 	switch (exid->spa_how) {
 	case SP4_MACH_CRED:
-		if (!svc_rqst_integrity_protected(rqstp))
-			return nfserr_inval;
+		if (!svc_rqst_integrity_protected(rqstp)) {
+			status = nfserr_inval;
+			goto out_nolock;
+		}
+		/*
+		 * Sometimes userspace doesn't give us a principal.
+		 * Which is a bug, really.  Anyway, we can't enforce
+		 * MACH_CRED in that case, better to give up now:
+		 */
+		if (!new->cl_cred.cr_principal &&
+					!new->cl_cred.cr_raw_principal) {
+			status = nfserr_serverfault;
+			goto out_nolock;
+		}
+		new->cl_mach_cred = true;
 	case SP4_NONE:
 		break;
 	default:				/* checked by xdr code */
@@ -2377,10 +2411,6 @@ nfsd4_exchange_id(struct svc_rqst *rqstp,
 		return nfserr_encr_alg_unsupp;
 	}
 
-	new = create_client(exid->clname, rqstp, &verf);
-	if (new == NULL)
-		return nfserr_jukebox;
-
 	/* Cases below refer to rfc 5661 section 18.35.4: */
 	spin_lock(&nn->client_lock);
 	conf = find_confirmed_client_by_name(&exid->clname, nn);
@@ -2442,7 +2472,6 @@ out_new:
 			goto out;
 	}
 	new->cl_minorversion = cstate->minorversion;
-	new->cl_mach_cred = (exid->spa_how == SP4_MACH_CRED);
 
 	gen_clid(new, nn);
 	add_to_unconfirmed(new);
@@ -2460,6 +2489,7 @@ out_copy:
 
 out:
 	spin_unlock(&nn->client_lock);
+out_nolock:
 	if (new)
 		expire_client(new);
 	if (unconf)
@@ -3648,7 +3678,7 @@ static void nfsd4_cb_recall_release(struct nfsd4_callback *cb)
 	nfs4_put_stid(&dp->dl_stid);
 }
 
-static struct nfsd4_callback_ops nfsd4_cb_recall_ops = {
+static const struct nfsd4_callback_ops nfsd4_cb_recall_ops = {
 	.prepare	= nfsd4_cb_recall_prepare,
 	.done		= nfsd4_cb_recall_done,
 	.release	= nfsd4_cb_recall_release,
@@ -4541,8 +4571,7 @@ static void
 laundromat_main(struct work_struct *laundry)
 {
 	time_t t;
-	struct delayed_work *dwork = container_of(laundry, struct delayed_work,
-						  work);
+	struct delayed_work *dwork = to_delayed_work(laundry);
 	struct nfsd_net *nn = container_of(dwork, struct nfsd_net,
 					   laundromat_work);
 
@@ -4797,10 +4826,9 @@ nfs4_check_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfs4_stid *s,
  */
 __be32
 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
-		struct nfsd4_compound_state *cstate, stateid_t *stateid,
-		int flags, struct file **filpp, bool *tmp_file)
+		struct nfsd4_compound_state *cstate, struct svc_fh *fhp,
+		stateid_t *stateid, int flags, struct file **filpp, bool *tmp_file)
 {
-	struct svc_fh *fhp = &cstate->current_fh;
 	struct inode *ino = d_inode(fhp->fh_dentry);
 	struct net *net = SVC_NET(rqstp);
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 51c9e9ca3..d6ef0955a 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1675,6 +1675,25 @@ nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp,
 }
 
 static __be32
+nfsd4_decode_clone(struct nfsd4_compoundargs *argp, struct nfsd4_clone *clone)
+{
+	DECODE_HEAD;
+
+	status = nfsd4_decode_stateid(argp, &clone->cl_src_stateid);
+	if (status)
+		return status;
+	status = nfsd4_decode_stateid(argp, &clone->cl_dst_stateid);
+	if (status)
+		return status;
+
+	READ_BUF(8 + 8 + 8);
+	p = xdr_decode_hyper(p, &clone->cl_src_pos);
+	p = xdr_decode_hyper(p, &clone->cl_dst_pos);
+	p = xdr_decode_hyper(p, &clone->cl_count);
+	DECODE_TAIL;
+}
+
+static __be32
 nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek)
 {
 	DECODE_HEAD;
@@ -1785,6 +1804,7 @@ static nfsd4_dec nfsd4_dec_ops[] = {
 	[OP_READ_PLUS]		= (nfsd4_dec)nfsd4_decode_notsupp,
 	[OP_SEEK]		= (nfsd4_dec)nfsd4_decode_seek,
 	[OP_WRITE_SAME]		= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_CLONE]		= (nfsd4_dec)nfsd4_decode_clone,
 };
 
 static inline bool
@@ -2838,14 +2858,14 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd,
 	__be32 nfserr;
 	int ignore_crossmnt = 0;
 
-	dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen);
+	dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen);
 	if (IS_ERR(dentry))
 		return nfserrno(PTR_ERR(dentry));
 	if (d_really_is_negative(dentry)) {
 		/*
-		 * nfsd_buffered_readdir drops the i_mutex between
-		 * readdir and calling this callback, leaving a window
-		 * where this directory entry could have gone away.
+		 * we're not holding the i_mutex here, so there's
+		 * a window where this directory entry could have gone
+		 * away.
 		 */
 		dput(dentry);
 		return nfserr_noent;
@@ -4292,6 +4312,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
 	[OP_READ_PLUS]		= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_SEEK]		= (nfsd4_enc)nfsd4_encode_seek,
 	[OP_WRITE_SAME]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_CLONE]		= (nfsd4_enc)nfsd4_encode_noop,
 };
 
 /*
diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h
index 2087bae17..f84fe6bf9 100644
--- a/fs/nfsd/nfsfh.h
+++ b/fs/nfsd/nfsfh.h
@@ -7,6 +7,7 @@
 #ifndef _LINUX_NFSD_NFSFH_H
 #define _LINUX_NFSD_NFSFH_H
 
+#include <linux/crc32.h>
 #include <linux/sunrpc/svc.h>
 #include <uapi/linux/nfsd/nfsfh.h>
 
@@ -205,6 +206,28 @@ static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2)
 	return true;
 }
 
+#ifdef CONFIG_CRC32
+/**
+ * knfsd_fh_hash - calculate the crc32 hash for the filehandle
+ * @fh: pointer to filehandle
+ *
+ * returns a crc32 hash for the filehandle that is compatible with
+ * the one displayed by "wireshark".
+ */
+
+static inline u32
+knfsd_fh_hash(struct knfsd_fh *fh)
+{
+	return ~crc32_le(0xFFFFFFFF, (unsigned char *)&fh->fh_base, fh->fh_size);
+}
+#else
+static inline u32
+knfsd_fh_hash(struct knfsd_fh *fh)
+{
+	return 0;
+}
+#endif
+
 #ifdef CONFIG_NFSD_V3
 /*
  * The wcc data stored in current_fh should be cleared
@@ -265,7 +288,7 @@ fh_lock_nested(struct svc_fh *fhp, unsigned int subclass)
 	}
 
 	inode = d_inode(dentry);
-	mutex_lock_nested(&inode->i_mutex, subclass);
+	inode_lock_nested(inode, subclass);
 	fill_pre_wcc(fhp);
 	fhp->fh_locked = true;
 }
@@ -284,7 +307,7 @@ fh_unlock(struct svc_fh *fhp)
 {
 	if (fhp->fh_locked) {
 		fill_post_wcc(fhp);
-		mutex_unlock(&d_inode(fhp->fh_dentry)->i_mutex);
+		inode_unlock(d_inode(fhp->fh_dentry));
 		fhp->fh_locked = false;
 	}
 }
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index ad4e2377d..45007acaf 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -14,9 +14,13 @@
 
 #include <linux/sunrpc/stats.h>
 #include <linux/sunrpc/svcsock.h>
+#include <linux/sunrpc/svc_xprt.h>
 #include <linux/lockd/bind.h>
 #include <linux/nfsacl.h>
 #include <linux/seq_file.h>
+#include <linux/inetdevice.h>
+#include <net/addrconf.h>
+#include <net/ipv6.h>
 #include <net/net_namespace.h>
 #include "nfsd.h"
 #include "cache.h"
@@ -306,22 +310,81 @@ static void nfsd_shutdown_net(struct net *net)
 	nfsd_shutdown_generic();
 }
 
+static int nfsd_inetaddr_event(struct notifier_block *this, unsigned long event,
+	void *ptr)
+{
+	struct in_ifaddr *ifa = (struct in_ifaddr *)ptr;
+	struct net_device *dev = ifa->ifa_dev->dev;
+	struct net *net = dev_net(dev);
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	struct sockaddr_in sin;
+
+	if (event != NETDEV_DOWN)
+		goto out;
+
+	if (nn->nfsd_serv) {
+		dprintk("nfsd_inetaddr_event: removed %pI4\n", &ifa->ifa_local);
+		sin.sin_family = AF_INET;
+		sin.sin_addr.s_addr = ifa->ifa_local;
+		svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin);
+	}
+
+out:
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block nfsd_inetaddr_notifier = {
+	.notifier_call = nfsd_inetaddr_event,
+};
+
+#if IS_ENABLED(CONFIG_IPV6)
+static int nfsd_inet6addr_event(struct notifier_block *this,
+	unsigned long event, void *ptr)
+{
+	struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr;
+	struct net_device *dev = ifa->idev->dev;
+	struct net *net = dev_net(dev);
+	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	struct sockaddr_in6 sin6;
+
+	if (event != NETDEV_DOWN)
+		goto out;
+
+	if (nn->nfsd_serv) {
+		dprintk("nfsd_inet6addr_event: removed %pI6\n", &ifa->addr);
+		sin6.sin6_family = AF_INET6;
+		sin6.sin6_addr = ifa->addr;
+		svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin6);
+	}
+
+out:
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block nfsd_inet6addr_notifier = {
+	.notifier_call = nfsd_inet6addr_event,
+};
+#endif
+
 static void nfsd_last_thread(struct svc_serv *serv, struct net *net)
 {
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 
+	unregister_inetaddr_notifier(&nfsd_inetaddr_notifier);
+#if IS_ENABLED(CONFIG_IPV6)
+	unregister_inet6addr_notifier(&nfsd_inet6addr_notifier);
+#endif
 	/*
 	 * write_ports can create the server without actually starting
 	 * any threads--if we get shut down before any threads are
 	 * started, then nfsd_last_thread will be run before any of this
-	 * other initialization has been done.
+	 * other initialization has been done except the rpcb information.
 	 */
+	svc_rpcb_cleanup(serv, net);
 	if (!nn->nfsd_net_up)
 		return;
-	nfsd_shutdown_net(net);
-
-	svc_rpcb_cleanup(serv, net);
 
+	nfsd_shutdown_net(net);
 	printk(KERN_WARNING "nfsd: last server has exited, flushing export "
 			    "cache\n");
 	nfsd_export_flush(net);
@@ -425,6 +488,10 @@ int nfsd_create_serv(struct net *net)
 	}
 
 	set_max_drc();
+	register_inetaddr_notifier(&nfsd_inetaddr_notifier);
+#if IS_ENABLED(CONFIG_IPV6)
+	register_inet6addr_notifier(&nfsd_inet6addr_notifier);
+#endif
 	do_gettimeofday(&nn->nfssvc_boot);		/* record boot time */
 	return 0;
 }
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 77fdf4de9..c050c5303 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -65,7 +65,7 @@ struct nfsd4_callback {
 	struct nfs4_client *cb_clp;
 	u32 cb_minorversion;
 	struct rpc_message cb_msg;
-	struct nfsd4_callback_ops *cb_ops;
+	const struct nfsd4_callback_ops *cb_ops;
 	struct work_struct cb_work;
 	int cb_seq_status;
 	int cb_status;
@@ -578,8 +578,8 @@ struct nfsd4_compound_state;
 struct nfsd_net;
 
 extern __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp,
-		struct nfsd4_compound_state *cstate, stateid_t *stateid,
-		int flags, struct file **filp, bool *tmp_file);
+		struct nfsd4_compound_state *cstate, struct svc_fh *fhp,
+		stateid_t *stateid, int flags, struct file **filp, bool *tmp_file);
 __be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate,
 		     stateid_t *stateid, unsigned char typemask,
 		     struct nfs4_stid **s, struct nfsd_net *nn);
@@ -599,7 +599,7 @@ extern void nfsd4_probe_callback(struct nfs4_client *clp);
 extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
 extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
 extern void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
-		struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op);
+		const struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op);
 extern void nfsd4_run_cb(struct nfsd4_callback *cb);
 extern int nfsd4_create_callback_queue(void);
 extern void nfsd4_destroy_callback_queue(void);
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index 0befe7627..328704190 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -8,6 +8,47 @@
 #define _NFSD_TRACE_H
 
 #include <linux/tracepoint.h>
+#include "nfsfh.h"
+
+DECLARE_EVENT_CLASS(nfsd_io_class,
+	TP_PROTO(struct svc_rqst *rqstp,
+		 struct svc_fh	*fhp,
+		 loff_t		offset,
+		 int		len),
+	TP_ARGS(rqstp, fhp, offset, len),
+	TP_STRUCT__entry(
+		__field(__be32, xid)
+		__field_struct(struct knfsd_fh, fh)
+		__field(loff_t, offset)
+		__field(int, len)
+	),
+	TP_fast_assign(
+		__entry->xid = rqstp->rq_xid,
+		fh_copy_shallow(&__entry->fh, &fhp->fh_handle);
+		__entry->offset = offset;
+		__entry->len = len;
+	),
+	TP_printk("xid=0x%x fh=0x%x offset=%lld len=%d",
+		  __be32_to_cpu(__entry->xid), knfsd_fh_hash(&__entry->fh),
+		  __entry->offset, __entry->len)
+)
+
+#define DEFINE_NFSD_IO_EVENT(name)		\
+DEFINE_EVENT(nfsd_io_class, name,		\
+	TP_PROTO(struct svc_rqst *rqstp,	\
+		 struct svc_fh	*fhp,		\
+		 loff_t		offset,		\
+		 int		len),		\
+	TP_ARGS(rqstp, fhp, offset, len))
+
+DEFINE_NFSD_IO_EVENT(read_start);
+DEFINE_NFSD_IO_EVENT(read_opened);
+DEFINE_NFSD_IO_EVENT(read_io_done);
+DEFINE_NFSD_IO_EVENT(read_done);
+DEFINE_NFSD_IO_EVENT(write_start);
+DEFINE_NFSD_IO_EVENT(write_opened);
+DEFINE_NFSD_IO_EVENT(write_io_done);
+DEFINE_NFSD_IO_EVENT(write_done);
 
 #include "state.h"
 
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 994d66fbb..5d2a57e4c 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -36,12 +36,14 @@
 #endif /* CONFIG_NFSD_V3 */
 
 #ifdef CONFIG_NFSD_V4
+#include "../internal.h"
 #include "acl.h"
 #include "idmap.h"
 #endif /* CONFIG_NFSD_V4 */
 
 #include "nfsd.h"
 #include "vfs.h"
+#include "trace.h"
 
 #define NFSDDBG_FACILITY		NFSDDBG_FILEOP
 
@@ -217,10 +219,16 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
 		host_err = PTR_ERR(dentry);
 		if (IS_ERR(dentry))
 			goto out_nfserr;
-		/*
-		 * check if we have crossed a mount point ...
-		 */
 		if (nfsd_mountpoint(dentry, exp)) {
+			/*
+			 * We don't need the i_mutex after all.  It's
+			 * still possible we could open this (regular
+			 * files can be mountpoints too), but the
+			 * i_mutex is just there to prevent renames of
+			 * something that we might be about to delegate,
+			 * and a mountpoint won't be renamed:
+			 */
+			fh_unlock(fhp);
 			if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) {
 				dput(dentry);
 				goto out_nfserr;
@@ -485,9 +493,9 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
 
 	dentry = fhp->fh_dentry;
 
-	mutex_lock(&d_inode(dentry)->i_mutex);
+	inode_lock(d_inode(dentry));
 	host_error = security_inode_setsecctx(dentry, label->data, label->len);
-	mutex_unlock(&d_inode(dentry)->i_mutex);
+	inode_unlock(d_inode(dentry));
 	return nfserrno(host_error);
 }
 #else
@@ -498,6 +506,13 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp,
 }
 #endif
 
+__be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst,
+		u64 dst_pos, u64 count)
+{
+	return nfserrno(vfs_clone_file_range(src, src_pos, dst, dst_pos,
+			count));
+}
+
 __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
 			   struct file *file, loff_t offset, loff_t len,
 			   int flags)
@@ -983,16 +998,23 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
 	struct raparms	*ra;
 	__be32 err;
 
+	trace_read_start(rqstp, fhp, offset, vlen);
 	err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file);
 	if (err)
 		return err;
 
 	ra = nfsd_init_raparms(file);
+
+	trace_read_opened(rqstp, fhp, offset, vlen);
 	err = nfsd_vfs_read(rqstp, file, offset, vec, vlen, count);
+	trace_read_io_done(rqstp, fhp, offset, vlen);
+
 	if (ra)
 		nfsd_put_raparams(file, ra);
 	fput(file);
 
+	trace_read_done(rqstp, fhp, offset, vlen);
+
 	return err;
 }
 
@@ -1008,24 +1030,31 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
 {
 	__be32			err = 0;
 
+	trace_write_start(rqstp, fhp, offset, vlen);
+
 	if (file) {
 		err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry,
 				NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE);
 		if (err)
 			goto out;
+		trace_write_opened(rqstp, fhp, offset, vlen);
 		err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt,
 				stablep);
+		trace_write_io_done(rqstp, fhp, offset, vlen);
 	} else {
 		err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file);
 		if (err)
 			goto out;
 
+		trace_write_opened(rqstp, fhp, offset, vlen);
 		if (cnt)
 			err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen,
 					     cnt, stablep);
+		trace_write_io_done(rqstp, fhp, offset, vlen);
 		fput(file);
 	}
 out:
+	trace_write_done(rqstp, fhp, offset, vlen);
 	return err;
 }
 
@@ -1809,7 +1838,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
 	offset = *offsetp;
 
 	while (1) {
-		struct inode *dir_inode = file_inode(file);
 		unsigned int reclen;
 
 		cdp->err = nfserr_eof; /* will be cleared on successful read */
@@ -1828,15 +1856,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
 		if (!size)
 			break;
 
-		/*
-		 * Various filldir functions may end up calling back into
-		 * lookup_one_len() and the file system's ->lookup() method.
-		 * These expect i_mutex to be held, as it would within readdir.
-		 */
-		host_err = mutex_lock_killable(&dir_inode->i_mutex);
-		if (host_err)
-			break;
-
 		de = (struct buffered_dirent *)buf.dirent;
 		while (size > 0) {
 			offset = de->offset;
@@ -1853,7 +1872,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func,
 			size -= reclen;
 			de = (struct buffered_dirent *)((char *)de + reclen);
 		}
-		mutex_unlock(&dir_inode->i_mutex);
 		if (size > 0) /* We bailed out early */
 			break;
 
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index fcfc48cbe..c11ba316f 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -56,6 +56,8 @@ __be32          nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *,
 		    struct xdr_netobj *);
 __be32		nfsd4_vfs_fallocate(struct svc_rqst *, struct svc_fh *,
 				    struct file *, loff_t, loff_t, int);
+__be32		nfsd4_clone_file_range(struct file *, u64, struct file *,
+			u64, u64);
 #endif /* CONFIG_NFSD_V4 */
 __be32		nfsd_create(struct svc_rqst *, struct svc_fh *,
 				char *name, int len, struct iattr *attrs,
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index ce7362c88..d9554813e 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -491,6 +491,15 @@ struct nfsd4_fallocate {
 	u64		falloc_length;
 };
 
+struct nfsd4_clone {
+	/* request */
+	stateid_t	cl_src_stateid;
+	stateid_t	cl_dst_stateid;
+	u64		cl_src_pos;
+	u64		cl_dst_pos;
+	u64		cl_count;
+};
+
 struct nfsd4_seek {
 	/* request */
 	stateid_t	seek_stateid;
@@ -555,6 +564,7 @@ struct nfsd4_op {
 		/* NFSv4.2 */
 		struct nfsd4_fallocate		allocate;
 		struct nfsd4_fallocate		deallocate;
+		struct nfsd4_clone		clone;
 		struct nfsd4_seek		seek;
 	} u;
 	struct nfs4_replay *			replay;
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index ac2f64943..21a1e2e0d 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -510,6 +510,7 @@ static int __nilfs_read_inode(struct super_block *sb,
 		inode->i_mapping->a_ops = &nilfs_aops;
 	} else if (S_ISLNK(inode->i_mode)) {
 		inode->i_op = &nilfs_symlink_inode_operations;
+		inode_nohighmem(inode);
 		inode->i_mapping->a_ops = &nilfs_aops;
 	} else {
 		inode->i_op = &nilfs_special_inode_operations;
@@ -1002,7 +1003,7 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	if (ret)
 		return ret;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	isize = i_size_read(inode);
 
@@ -1112,6 +1113,6 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 	if (ret == 1)
 		ret = 0;
 
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	return ret;
 }
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
index aba43811d..e8fe24882 100644
--- a/fs/nilfs2/ioctl.c
+++ b/fs/nilfs2/ioctl.c
@@ -158,7 +158,7 @@ static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp,
 
 	flags = nilfs_mask_flags(inode->i_mode, flags);
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	oldflags = NILFS_I(inode)->i_flags;
 
@@ -186,7 +186,7 @@ static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp,
 	nilfs_mark_inode_dirty(inode);
 	ret = nilfs_transaction_commit(inode->i_sb);
 out:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	mnt_drop_write_file(filp);
 	return ret;
 }
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
index c9a1a491a..7ccdb961e 100644
--- a/fs/nilfs2/namei.c
+++ b/fs/nilfs2/namei.c
@@ -161,6 +161,7 @@ static int nilfs_symlink(struct inode *dir, struct dentry *dentry,
 
 	/* slow symlink */
 	inode->i_op = &nilfs_symlink_inode_operations;
+	inode_nohighmem(inode);
 	inode->i_mapping->a_ops = &nilfs_aops;
 	err = page_symlink(inode, symname, l);
 	if (err)
@@ -568,8 +569,7 @@ const struct inode_operations nilfs_special_inode_operations = {
 
 const struct inode_operations nilfs_symlink_inode_operations = {
 	.readlink	= generic_readlink,
-	.follow_link	= page_follow_link_light,
-	.put_link	= page_put_link,
+	.get_link	= page_get_link,
 	.permission     = nilfs_permission,
 };
 
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 354013ea2..7f5d3d9f1 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1316,13 +1316,11 @@ nilfs_mount(struct file_system_type *fs_type, int flags,
 	}
 
 	if (!s->s_root) {
-		char b[BDEVNAME_SIZE];
-
 		s_new = true;
 
 		/* New superblock instance created */
 		s->s_mode = mode;
-		strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id));
+		snprintf(s->s_id, sizeof(s->s_id), "%pg", sd.bdev);
 		sb_set_blocksize(s, block_size(sd.bdev));
 
 		err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0);
@@ -1418,7 +1416,8 @@ static int __init nilfs_init_cachep(void)
 {
 	nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache",
 			sizeof(struct nilfs_inode_info), 0,
-			SLAB_RECLAIM_ACCOUNT, nilfs_inode_init_once);
+			SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
+			nilfs_inode_init_once);
 	if (!nilfs_inode_cachep)
 		goto fail;
 
diff --git a/fs/notify/group.c b/fs/notify/group.c
index 53e45b61d..d16b62cb2 100644
--- a/fs/notify/group.c
+++ b/fs/notify/group.c
@@ -22,7 +22,6 @@
 #include <linux/srcu.h>
 #include <linux/rculist.h>
 #include <linux/wait.h>
-#include <linux/module.h>
 
 #include <linux/fsnotify_backend.h>
 #include "fsnotify.h"
@@ -73,7 +72,6 @@ void fsnotify_get_group(struct fsnotify_group *group)
 {
 	atomic_inc(&group->refcnt);
 }
-EXPORT_SYMBOL_GPL(fsnotify_get_group);
 
 /*
  * Drop a reference to a group.  Free it if it's through.
@@ -83,7 +81,6 @@ void fsnotify_put_group(struct fsnotify_group *group)
 	if (atomic_dec_and_test(&group->refcnt))
 		fsnotify_final_destroy_group(group);
 }
-EXPORT_SYMBOL_GPL(fsnotify_put_group);
 
 /*
  * Create a new fsnotify_group and hold a reference for the group returned.
@@ -112,7 +109,6 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops)
 
 	return group;
 }
-EXPORT_SYMBOL_GPL(fsnotify_alloc_group);
 
 int fsnotify_fasync(int fd, struct file *file, int on)
 {
diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c
index e785fd954..741077dee 100644
--- a/fs/notify/inode_mark.c
+++ b/fs/notify/inode_mark.c
@@ -199,8 +199,7 @@ void fsnotify_unmount_inodes(struct super_block *sb)
 				break;
 			}
 			spin_unlock(&next_i->i_lock);
-			next_i = list_entry(next_i->i_sb_list.next,
-						struct inode, i_sb_list);
+			next_i = list_next_entry(next_i, i_sb_list);
 		}
 
 		/*
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 8175f3cd4..7115c5d7d 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -91,10 +91,14 @@
 #include <linux/fsnotify_backend.h>
 #include "fsnotify.h"
 
+#define FSNOTIFY_REAPER_DELAY	(1)	/* 1 jiffy */
+
 struct srcu_struct fsnotify_mark_srcu;
 static DEFINE_SPINLOCK(destroy_lock);
 static LIST_HEAD(destroy_list);
-static DECLARE_WAIT_QUEUE_HEAD(destroy_waitq);
+
+static void fsnotify_mark_destroy(struct work_struct *work);
+static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy);
 
 void fsnotify_get_mark(struct fsnotify_mark *mark)
 {
@@ -109,7 +113,6 @@ void fsnotify_put_mark(struct fsnotify_mark *mark)
 		mark->free_mark(mark);
 	}
 }
-EXPORT_SYMBOL_GPL(fsnotify_put_mark);
 
 /* Calculate mask of events for a list of marks */
 u32 fsnotify_recalc_mask(struct hlist_head *head)
@@ -190,7 +193,8 @@ void fsnotify_free_mark(struct fsnotify_mark *mark)
 	spin_lock(&destroy_lock);
 	list_add(&mark->g_list, &destroy_list);
 	spin_unlock(&destroy_lock);
-	wake_up(&destroy_waitq);
+	queue_delayed_work(system_unbound_wq, &reaper_work,
+				FSNOTIFY_REAPER_DELAY);
 
 	/*
 	 * Some groups like to know that marks are being freed.  This is a
@@ -209,7 +213,6 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark,
 	mutex_unlock(&group->mark_mutex);
 	fsnotify_free_mark(mark);
 }
-EXPORT_SYMBOL_GPL(fsnotify_destroy_mark);
 
 void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock)
 {
@@ -390,11 +393,11 @@ err:
 	spin_lock(&destroy_lock);
 	list_add(&mark->g_list, &destroy_list);
 	spin_unlock(&destroy_lock);
-	wake_up(&destroy_waitq);
+	queue_delayed_work(system_unbound_wq, &reaper_work,
+				FSNOTIFY_REAPER_DELAY);
 
 	return ret;
 }
-EXPORT_SYMBOL_GPL(fsnotify_add_mark);
 
 int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group,
 		      struct inode *inode, struct vfsmount *mnt, int allow_dups)
@@ -495,41 +498,21 @@ void fsnotify_init_mark(struct fsnotify_mark *mark,
 	atomic_set(&mark->refcnt, 1);
 	mark->free_mark = free_mark;
 }
-EXPORT_SYMBOL_GPL(fsnotify_init_mark);
 
-static int fsnotify_mark_destroy(void *ignored)
+static void fsnotify_mark_destroy(struct work_struct *work)
 {
 	struct fsnotify_mark *mark, *next;
 	struct list_head private_destroy_list;
 
-	for (;;) {
-		spin_lock(&destroy_lock);
-		/* exchange the list head */
-		list_replace_init(&destroy_list, &private_destroy_list);
-		spin_unlock(&destroy_lock);
-
-		synchronize_srcu(&fsnotify_mark_srcu);
+	spin_lock(&destroy_lock);
+	/* exchange the list head */
+	list_replace_init(&destroy_list, &private_destroy_list);
+	spin_unlock(&destroy_lock);
 
-		list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) {
-			list_del_init(&mark->g_list);
-			fsnotify_put_mark(mark);
-		}
+	synchronize_srcu(&fsnotify_mark_srcu);
 
-		wait_event_interruptible(destroy_waitq, !list_empty(&destroy_list));
+	list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) {
+		list_del_init(&mark->g_list);
+		fsnotify_put_mark(mark);
 	}
-
-	return 0;
-}
-
-static int __init fsnotify_mark_init(void)
-{
-	struct task_struct *thread;
-
-	thread = kthread_run(fsnotify_mark_destroy, NULL,
-			     "fsnotify_mark");
-	if (IS_ERR(thread))
-		panic("unable to start fsnotify mark destruction thread.");
-
-	return 0;
 }
-device_initcall(fsnotify_mark_init);
diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c
index 9e38dafa3..b2eff5816 100644
--- a/fs/ntfs/dir.c
+++ b/fs/ntfs/dir.c
@@ -1509,7 +1509,7 @@ static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
 	err = filemap_write_and_wait_range(vi->i_mapping, start, end);
 	if (err)
 		return err;
-	mutex_lock(&vi->i_mutex);
+	inode_lock(vi);
 
 	BUG_ON(!S_ISDIR(vi->i_mode));
 	/* If the bitmap attribute inode is in memory sync it, too. */
@@ -1532,7 +1532,7 @@ static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
 	else
 		ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx.  Error "
 				"%u.", datasync ? "data" : "", vi->i_ino, -ret);
-	mutex_unlock(&vi->i_mutex);
+	inode_unlock(vi);
 	return ret;
 }
 
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index 9d383e5ef..bed4d427d 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1944,14 +1944,14 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	ssize_t written = 0;
 	ssize_t err;
 
-	mutex_lock(&vi->i_mutex);
+	inode_lock(vi);
 	/* We can write back this queue in page reclaim. */
 	current->backing_dev_info = inode_to_bdi(vi);
 	err = ntfs_prepare_file_for_write(iocb, from);
 	if (iov_iter_count(from) && !err)
 		written = ntfs_perform_write(file, from, iocb->ki_pos);
 	current->backing_dev_info = NULL;
-	mutex_unlock(&vi->i_mutex);
+	inode_unlock(vi);
 	if (likely(written > 0)) {
 		err = generic_write_sync(file, iocb->ki_pos, written);
 		if (err < 0)
@@ -1996,7 +1996,7 @@ static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end,
 	err = filemap_write_and_wait_range(vi->i_mapping, start, end);
 	if (err)
 		return err;
-	mutex_lock(&vi->i_mutex);
+	inode_lock(vi);
 
 	BUG_ON(S_ISDIR(vi->i_mode));
 	if (!datasync || !NInoNonResident(NTFS_I(vi)))
@@ -2015,7 +2015,7 @@ static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end,
 	else
 		ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx.  Error "
 				"%u.", datasync ? "data" : "", vi->i_ino, -ret);
-	mutex_unlock(&vi->i_mutex);
+	inode_unlock(vi);
 	return ret;
 }
 
diff --git a/fs/ntfs/quota.c b/fs/ntfs/quota.c
index d80e3315c..9793e68ba 100644
--- a/fs/ntfs/quota.c
+++ b/fs/ntfs/quota.c
@@ -48,7 +48,7 @@ bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol)
 		ntfs_error(vol->sb, "Quota inodes are not open.");
 		return false;
 	}
-	mutex_lock(&vol->quota_q_ino->i_mutex);
+	inode_lock(vol->quota_q_ino);
 	ictx = ntfs_index_ctx_get(NTFS_I(vol->quota_q_ino));
 	if (!ictx) {
 		ntfs_error(vol->sb, "Failed to get index context.");
@@ -98,7 +98,7 @@ bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol)
 	ntfs_index_entry_mark_dirty(ictx);
 set_done:
 	ntfs_index_ctx_put(ictx);
-	mutex_unlock(&vol->quota_q_ino->i_mutex);
+	inode_unlock(vol->quota_q_ino);
 	/*
 	 * We set the flag so we do not try to mark the quotas out of date
 	 * again on remount.
@@ -110,7 +110,7 @@ done:
 err_out:
 	if (ictx)
 		ntfs_index_ctx_put(ictx);
-	mutex_unlock(&vol->quota_q_ino->i_mutex);
+	inode_unlock(vol->quota_q_ino);
 	return false;
 }
 
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index d1a853585..1b38abdaa 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -1284,10 +1284,10 @@ static int check_windows_hibernation_status(ntfs_volume *vol)
 	 * Find the inode number for the hibernation file by looking up the
 	 * filename hiberfil.sys in the root directory.
 	 */
-	mutex_lock(&vol->root_ino->i_mutex);
+	inode_lock(vol->root_ino);
 	mref = ntfs_lookup_inode_by_name(NTFS_I(vol->root_ino), hiberfil, 12,
 			&name);
-	mutex_unlock(&vol->root_ino->i_mutex);
+	inode_unlock(vol->root_ino);
 	if (IS_ERR_MREF(mref)) {
 		ret = MREF_ERR(mref);
 		/* If the file does not exist, Windows is not hibernated. */
@@ -1377,10 +1377,10 @@ static bool load_and_init_quota(ntfs_volume *vol)
 	 * Find the inode number for the quota file by looking up the filename
 	 * $Quota in the extended system files directory $Extend.
 	 */
-	mutex_lock(&vol->extend_ino->i_mutex);
+	inode_lock(vol->extend_ino);
 	mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), Quota, 6,
 			&name);
-	mutex_unlock(&vol->extend_ino->i_mutex);
+	inode_unlock(vol->extend_ino);
 	if (IS_ERR_MREF(mref)) {
 		/*
 		 * If the file does not exist, quotas are disabled and have
@@ -1460,10 +1460,10 @@ static bool load_and_init_usnjrnl(ntfs_volume *vol)
 	 * Find the inode number for the transaction log file by looking up the
 	 * filename $UsnJrnl in the extended system files directory $Extend.
 	 */
-	mutex_lock(&vol->extend_ino->i_mutex);
+	inode_lock(vol->extend_ino);
 	mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), UsnJrnl, 8,
 			&name);
-	mutex_unlock(&vol->extend_ino->i_mutex);
+	inode_unlock(vol->extend_ino);
 	if (IS_ERR_MREF(mref)) {
 		/*
 		 * If the file does not exist, transaction logging is disabled,
@@ -3139,8 +3139,8 @@ static int __init init_ntfs_fs(void)
 
 	ntfs_big_inode_cache = kmem_cache_create(ntfs_big_inode_cache_name,
 			sizeof(big_ntfs_inode), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
-			ntfs_big_inode_init_once);
+			SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
+			SLAB_ACCOUNT, ntfs_big_inode_init_once);
 	if (!ntfs_big_inode_cache) {
 		pr_crit("Failed to create %s!\n", ntfs_big_inode_cache_name);
 		goto big_inode_err_out;
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index 86181d652..d002579c6 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -164,7 +164,7 @@ static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et,
 				     struct ocfs2_extent_rec *rec);
 static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et);
 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et);
-static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
+static const struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = {
 	.eo_set_last_eb_blk	= ocfs2_dinode_set_last_eb_blk,
 	.eo_get_last_eb_blk	= ocfs2_dinode_get_last_eb_blk,
 	.eo_update_clusters	= ocfs2_dinode_update_clusters,
@@ -286,7 +286,7 @@ static void ocfs2_xattr_value_update_clusters(struct ocfs2_extent_tree *et,
 	le32_add_cpu(&vb->vb_xv->xr_clusters, clusters);
 }
 
-static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
+static const struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = {
 	.eo_set_last_eb_blk	= ocfs2_xattr_value_set_last_eb_blk,
 	.eo_get_last_eb_blk	= ocfs2_xattr_value_get_last_eb_blk,
 	.eo_update_clusters	= ocfs2_xattr_value_update_clusters,
@@ -332,7 +332,7 @@ static void ocfs2_xattr_tree_update_clusters(struct ocfs2_extent_tree *et,
 	le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters);
 }
 
-static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
+static const struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = {
 	.eo_set_last_eb_blk	= ocfs2_xattr_tree_set_last_eb_blk,
 	.eo_get_last_eb_blk	= ocfs2_xattr_tree_get_last_eb_blk,
 	.eo_update_clusters	= ocfs2_xattr_tree_update_clusters,
@@ -379,7 +379,7 @@ static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et)
 	et->et_root_el = &dx_root->dr_list;
 }
 
-static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
+static const struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = {
 	.eo_set_last_eb_blk	= ocfs2_dx_root_set_last_eb_blk,
 	.eo_get_last_eb_blk	= ocfs2_dx_root_get_last_eb_blk,
 	.eo_update_clusters	= ocfs2_dx_root_update_clusters,
@@ -425,7 +425,7 @@ ocfs2_refcount_tree_extent_contig(struct ocfs2_extent_tree *et,
 	return CONTIG_NONE;
 }
 
-static struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = {
+static const struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = {
 	.eo_set_last_eb_blk	= ocfs2_refcount_tree_set_last_eb_blk,
 	.eo_get_last_eb_blk	= ocfs2_refcount_tree_get_last_eb_blk,
 	.eo_update_clusters	= ocfs2_refcount_tree_update_clusters,
@@ -438,7 +438,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et,
 				     struct buffer_head *bh,
 				     ocfs2_journal_access_func access,
 				     void *obj,
-				     struct ocfs2_extent_tree_operations *ops)
+				     const struct ocfs2_extent_tree_operations *ops)
 {
 	et->et_ops = ops;
 	et->et_root_bh = bh;
@@ -5719,7 +5719,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
 		goto bail;
 	}
 
-	mutex_lock(&tl_inode->i_mutex);
+	inode_lock(tl_inode);
 
 	if (ocfs2_truncate_log_needs_flush(osb)) {
 		ret = __ocfs2_flush_truncate_log(osb);
@@ -5776,7 +5776,7 @@ int ocfs2_remove_btree_range(struct inode *inode,
 out_commit:
 	ocfs2_commit_trans(osb, handle);
 out:
-	mutex_unlock(&tl_inode->i_mutex);
+	inode_unlock(tl_inode);
 bail:
 	if (meta_ac)
 		ocfs2_free_alloc_context(meta_ac);
@@ -5832,7 +5832,7 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb,
 	struct ocfs2_dinode *di;
 	struct ocfs2_truncate_log *tl;
 
-	BUG_ON(mutex_trylock(&tl_inode->i_mutex));
+	BUG_ON(inode_trylock(tl_inode));
 
 	start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk);
 
@@ -5980,7 +5980,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
 	struct ocfs2_dinode *di;
 	struct ocfs2_truncate_log *tl;
 
-	BUG_ON(mutex_trylock(&tl_inode->i_mutex));
+	BUG_ON(inode_trylock(tl_inode));
 
 	di = (struct ocfs2_dinode *) tl_bh->b_data;
 
@@ -6008,7 +6008,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb)
 		goto out;
 	}
 
-	mutex_lock(&data_alloc_inode->i_mutex);
+	inode_lock(data_alloc_inode);
 
 	status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1);
 	if (status < 0) {
@@ -6035,7 +6035,7 @@ out_unlock:
 	ocfs2_inode_unlock(data_alloc_inode, 1);
 
 out_mutex:
-	mutex_unlock(&data_alloc_inode->i_mutex);
+	inode_unlock(data_alloc_inode);
 	iput(data_alloc_inode);
 
 out:
@@ -6047,9 +6047,9 @@ int ocfs2_flush_truncate_log(struct ocfs2_super *osb)
 	int status;
 	struct inode *tl_inode = osb->osb_tl_inode;
 
-	mutex_lock(&tl_inode->i_mutex);
+	inode_lock(tl_inode);
 	status = __ocfs2_flush_truncate_log(osb);
-	mutex_unlock(&tl_inode->i_mutex);
+	inode_unlock(tl_inode);
 
 	return status;
 }
@@ -6174,8 +6174,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb,
 	}
 
 bail:
-	if (tl_inode)
-		iput(tl_inode);
+	iput(tl_inode);
 	brelse(tl_bh);
 
 	if (status < 0) {
@@ -6209,7 +6208,7 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
 		(unsigned long long)le64_to_cpu(tl_copy->i_blkno),
 		num_recs);
 
-	mutex_lock(&tl_inode->i_mutex);
+	inode_lock(tl_inode);
 	for(i = 0; i < num_recs; i++) {
 		if (ocfs2_truncate_log_needs_flush(osb)) {
 			status = __ocfs2_flush_truncate_log(osb);
@@ -6240,7 +6239,7 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb,
 	}
 
 bail_up:
-	mutex_unlock(&tl_inode->i_mutex);
+	inode_unlock(tl_inode);
 
 	return status;
 }
@@ -6347,7 +6346,7 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb,
 		goto out;
 	}
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	ret = ocfs2_inode_lock(inode, &di_bh, 1);
 	if (ret) {
@@ -6396,7 +6395,7 @@ out_unlock:
 	ocfs2_inode_unlock(inode, 1);
 	brelse(di_bh);
 out_mutex:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	iput(inode);
 out:
 	while(head) {
@@ -6440,7 +6439,7 @@ static int ocfs2_free_cached_clusters(struct ocfs2_super *osb,
 	handle_t *handle;
 	int ret = 0;
 
-	mutex_lock(&tl_inode->i_mutex);
+	inode_lock(tl_inode);
 
 	while (head) {
 		if (ocfs2_truncate_log_needs_flush(osb)) {
@@ -6472,7 +6471,7 @@ static int ocfs2_free_cached_clusters(struct ocfs2_super *osb,
 		}
 	}
 
-	mutex_unlock(&tl_inode->i_mutex);
+	inode_unlock(tl_inode);
 
 	while (head) {
 		/* Premature exit may have left some dangling items. */
@@ -7356,7 +7355,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range)
 		goto out;
 	}
 
-	mutex_lock(&main_bm_inode->i_mutex);
+	inode_lock(main_bm_inode);
 
 	ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0);
 	if (ret < 0) {
@@ -7423,7 +7422,7 @@ out_unlock:
 	ocfs2_inode_unlock(main_bm_inode, 0);
 	brelse(main_bm_bh);
 out_mutex:
-	mutex_unlock(&main_bm_inode->i_mutex);
+	inode_unlock(main_bm_inode);
 	iput(main_bm_inode);
 out:
 	return ret;
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index fb09b97db..f3dc1b0df 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -54,7 +54,7 @@
  */
 struct ocfs2_extent_tree_operations;
 struct ocfs2_extent_tree {
-	struct ocfs2_extent_tree_operations	*et_ops;
+	const struct ocfs2_extent_tree_operations *et_ops;
 	struct buffer_head			*et_root_bh;
 	struct ocfs2_extent_list		*et_root_el;
 	struct ocfs2_caching_info		*et_ci;
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index e6795c7c7..cda0361e9 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -2047,9 +2047,9 @@ static int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb,
 	int ret = 0;
 	unsigned int truncated_clusters;
 
-	mutex_lock(&osb->osb_tl_inode->i_mutex);
+	inode_lock(osb->osb_tl_inode);
 	truncated_clusters = osb->truncated_clusters;
-	mutex_unlock(&osb->osb_tl_inode->i_mutex);
+	inode_unlock(osb->osb_tl_inode);
 
 	/*
 	 * Check whether we can succeed in allocating if we free
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 709fbbd44..a76b9ea77 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1254,15 +1254,15 @@ static const struct file_operations o2hb_debug_fops = {
 
 void o2hb_exit(void)
 {
-	kfree(o2hb_db_livenodes);
-	kfree(o2hb_db_liveregions);
-	kfree(o2hb_db_quorumregions);
-	kfree(o2hb_db_failedregions);
 	debugfs_remove(o2hb_debug_failedregions);
 	debugfs_remove(o2hb_debug_quorumregions);
 	debugfs_remove(o2hb_debug_liveregions);
 	debugfs_remove(o2hb_debug_livenodes);
 	debugfs_remove(o2hb_debug_dir);
+	kfree(o2hb_db_livenodes);
+	kfree(o2hb_db_liveregions);
+	kfree(o2hb_db_quorumregions);
+	kfree(o2hb_db_failedregions);
 }
 
 static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir,
@@ -1438,13 +1438,15 @@ static void o2hb_region_release(struct config_item *item)
 
 	kfree(reg->hr_slots);
 
-	kfree(reg->hr_db_regnum);
-	kfree(reg->hr_db_livenodes);
 	debugfs_remove(reg->hr_debug_livenodes);
 	debugfs_remove(reg->hr_debug_regnum);
 	debugfs_remove(reg->hr_debug_elapsed_time);
 	debugfs_remove(reg->hr_debug_pinned);
 	debugfs_remove(reg->hr_debug_dir);
+	kfree(reg->hr_db_livenodes);
+	kfree(reg->hr_db_regnum);
+	kfree(reg->hr_db_elapsed_time); /* not the dentry */
+	kfree(reg->hr_db_pinned); /* not the dentry */
 
 	spin_lock(&o2hb_live_lock);
 	list_del(&reg->hr_all_item);
@@ -1780,8 +1782,8 @@ static ssize_t o2hb_region_dev_store(struct config_item *item,
 	}
 	++live_threshold;
 	atomic_set(&reg->hr_steady_iterations, live_threshold);
-	/* unsteady_iterations is double the steady_iterations */
-	atomic_set(&reg->hr_unsteady_iterations, (live_threshold << 1));
+	/* unsteady_iterations is triple the steady_iterations */
+	atomic_set(&reg->hr_unsteady_iterations, (live_threshold * 3));
 
 	hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s",
 			      reg->hr_item.ci_name);
diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c
index 72afdca3c..ebe543894 100644
--- a/fs/ocfs2/cluster/nodemanager.c
+++ b/fs/ocfs2/cluster/nodemanager.c
@@ -757,7 +757,7 @@ int o2nm_depend_item(struct config_item *item)
 
 void o2nm_undepend_item(struct config_item *item)
 {
-	configfs_undepend_item(&o2nm_cluster_group.cs_subsys, item);
+	configfs_undepend_item(item);
 }
 
 int o2nm_depend_this_node(void)
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index ffecf89c8..e1adf285f 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -4361,7 +4361,7 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir,
 		mlog_errno(ret);
 		goto out;
 	}
-	mutex_lock(&dx_alloc_inode->i_mutex);
+	inode_lock(dx_alloc_inode);
 
 	ret = ocfs2_inode_lock(dx_alloc_inode, &dx_alloc_bh, 1);
 	if (ret) {
@@ -4410,7 +4410,7 @@ out_unlock:
 	ocfs2_inode_unlock(dx_alloc_inode, 1);
 
 out_mutex:
-	mutex_unlock(&dx_alloc_inode->i_mutex);
+	inode_unlock(dx_alloc_inode);
 	brelse(dx_alloc_bh);
 out:
 	iput(dx_alloc_inode);
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index e88ccf8c8..68c607e63 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -376,17 +376,6 @@ struct dlm_lock
 		 lksb_kernel_allocated:1;
 };
 
-
-#define DLM_LKSB_UNUSED1           0x01
-#define DLM_LKSB_PUT_LVB           0x02
-#define DLM_LKSB_GET_LVB           0x04
-#define DLM_LKSB_UNUSED2           0x08
-#define DLM_LKSB_UNUSED3           0x10
-#define DLM_LKSB_UNUSED4           0x20
-#define DLM_LKSB_UNUSED5           0x40
-#define DLM_LKSB_UNUSED6           0x80
-
-
 enum dlm_lockres_list {
 	DLM_GRANTED_LIST = 0,
 	DLM_CONVERTING_LIST = 1,
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 4e2162b35..9477d6e1d 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2388,8 +2388,8 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
 
 	spin_lock(&res->spinlock);
 	BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
+	__dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
 	if (test_bit(node, res->refmap)) {
-		__dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
 		dlm_lockres_clear_refmap_bit(dlm, res, node);
 		cleared = 1;
 	}
@@ -2549,7 +2549,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm,
 	}
 
 fail:
-	if (oldmle) {
+	if (ret != -EEXIST && oldmle) {
 		/* master is known, detach if not already detached */
 		dlm_mle_detach_hb_events(dlm, oldmle);
 		dlm_put_mle(oldmle);
@@ -3045,7 +3045,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data,
 	int ret = 0;
 
 	if (!dlm_grab(dlm))
-		return -EINVAL;
+		return 0;
 
 	name = migrate->name;
 	namelen = migrate->namelen;
@@ -3136,7 +3136,8 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
 				mlog(0, "tried to migrate %.*s, but some "
 				     "process beat me to it\n",
 				     namelen, name);
-				ret = -EEXIST;
+				spin_unlock(&tmp->spinlock);
+				return -EEXIST;
 			} else {
 				/* bad.  2 NODES are trying to migrate! */
 				mlog(ML_ERROR, "migration error  mle: "
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 42f0cae93..b94a425f0 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -1373,6 +1373,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
 	char *buf = NULL;
 	struct dlm_work_item *item = NULL;
 	struct dlm_lock_resource *res = NULL;
+	unsigned int hash;
 
 	if (!dlm_grab(dlm))
 		return -EINVAL;
@@ -1400,7 +1401,10 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
 	/* lookup the lock to see if we have a secondary queue for this
 	 * already...  just add the locks in and this will have its owner
 	 * and RECOVERY flag changed when it completes. */
-	res = dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len);
+	hash = dlm_lockid_hash(mres->lockname, mres->lockname_len);
+	spin_lock(&dlm->spinlock);
+	res = __dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len,
+			hash);
 	if (res) {
 	 	/* this will get a ref on res */
 		/* mark it as recovering/migrating and hash it */
@@ -1421,13 +1425,16 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
 				     mres->lockname_len, mres->lockname);
 				ret = -EFAULT;
 				spin_unlock(&res->spinlock);
+				spin_unlock(&dlm->spinlock);
 				dlm_lockres_put(res);
 				goto leave;
 			}
 			res->state |= DLM_LOCK_RES_MIGRATING;
 		}
 		spin_unlock(&res->spinlock);
+		spin_unlock(&dlm->spinlock);
 	} else {
+		spin_unlock(&dlm->spinlock);
 		/* need to allocate, just like if it was
 		 * mastered here normally  */
 		res = dlm_new_lockres(dlm, mres->lockname, mres->lockname_len);
@@ -2452,11 +2459,7 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
 	 * perhaps later we can genericize this for other waiters. */
 	wake_up(&dlm->migration_wq);
 
-	if (test_bit(idx, dlm->recovery_map))
-		mlog(0, "domain %s, node %u already added "
-		     "to recovery map!\n", dlm->name, idx);
-	else
-		set_bit(idx, dlm->recovery_map);
+	set_bit(idx, dlm->recovery_map);
 }
 
 void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data)
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 2e3c9dbab..1082b2c30 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -421,7 +421,7 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data,
 	}
 
 	if (!dlm_grab(dlm))
-		return DLM_REJECTED;
+		return DLM_FORWARD;
 
 	mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
 			"Domain %s not fully joined!\n", dlm->name);
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index b5cf27dcb..03768bb3a 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -638,7 +638,7 @@ static int __init init_dlmfs_fs(void)
 	dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache",
 				sizeof(struct dlmfs_inode_private),
 				0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
-					SLAB_MEM_SPREAD),
+					SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 				dlmfs_init_once);
 	if (!dlmfs_inode_cache) {
 		status = -ENOMEM;
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index b002acf50..474e57f83 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -2438,12 +2438,6 @@ bail:
  * done this we have to return AOP_TRUNCATED_PAGE so the aop method
  * that called us can bubble that back up into the VFS who will then
  * immediately retry the aop call.
- *
- * We do a blocking lock and immediate unlock before returning, though, so that
- * the lock has a great chance of being cached on this node by the time the VFS
- * calls back to retry the aop.    This has a potential to livelock as nodes
- * ping locks back and forth, but that's a risk we're willing to take to avoid
- * the lock inversion simply.
  */
 int ocfs2_inode_lock_with_page(struct inode *inode,
 			      struct buffer_head **ret_bh,
@@ -2455,8 +2449,6 @@ int ocfs2_inode_lock_with_page(struct inode *inode,
 	ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK);
 	if (ret == -EAGAIN) {
 		unlock_page(page);
-		if (ocfs2_inode_lock(inode, ret_bh, ex) == 0)
-			ocfs2_inode_unlock(inode, ex);
 		ret = AOP_TRUNCATED_PAGE;
 	}
 
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 0e5b4515f..7cb38fdca 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1302,6 +1302,14 @@ int ocfs2_getattr(struct vfsmount *mnt,
 	}
 
 	generic_fillattr(inode, stat);
+	/*
+	 * If there is inline data in the inode, the inode will normally not
+	 * have data blocks allocated (it may have an external xattr block).
+	 * Report at least one sector for such files, so tools like tar, rsync,
+	 * others don't incorrectly think the file is completely sparse.
+	 */
+	if (unlikely(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL))
+		stat->blocks += (stat->size + 511)>>9;
 
 	/* We set the blksize from the cluster size for performance */
 	stat->blksize = osb->s_clustersize;
@@ -1864,7 +1872,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode,
 	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
 		return -EROFS;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	/*
 	 * This prevents concurrent writes on other nodes
@@ -1983,7 +1991,7 @@ out_rw_unlock:
 	ocfs2_rw_unlock(inode, 1);
 
 out:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	return ret;
 }
 
@@ -2291,7 +2299,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb,
 	appending = iocb->ki_flags & IOCB_APPEND ? 1 : 0;
 	direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 relock:
 	/*
@@ -2427,7 +2435,7 @@ out:
 		ocfs2_rw_unlock(inode, rw_level);
 
 out_mutex:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 
 	if (written)
 		ret = written;
@@ -2539,7 +2547,7 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
 	struct inode *inode = file->f_mapping->host;
 	int ret = 0;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	switch (whence) {
 	case SEEK_SET:
@@ -2577,7 +2585,7 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence)
 	offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
 
 out:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	if (ret)
 		return ret;
 	return offset;
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 8f87e05ee..36294446d 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -361,6 +361,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 		    break;
 	    case S_IFLNK:
 		    inode->i_op = &ocfs2_symlink_inode_operations;
+		    inode_nohighmem(inode);
 		    i_size_write(inode, le64_to_cpu(fe->i_size));
 		    break;
 	    default:
@@ -629,10 +630,10 @@ static int ocfs2_remove_inode(struct inode *inode,
 		goto bail;
 	}
 
-	mutex_lock(&inode_alloc_inode->i_mutex);
+	inode_lock(inode_alloc_inode);
 	status = ocfs2_inode_lock(inode_alloc_inode, &inode_alloc_bh, 1);
 	if (status < 0) {
-		mutex_unlock(&inode_alloc_inode->i_mutex);
+		inode_unlock(inode_alloc_inode);
 
 		mlog_errno(status);
 		goto bail;
@@ -679,7 +680,7 @@ bail_commit:
 	ocfs2_commit_trans(osb, handle);
 bail_unlock:
 	ocfs2_inode_unlock(inode_alloc_inode, 1);
-	mutex_unlock(&inode_alloc_inode->i_mutex);
+	inode_unlock(inode_alloc_inode);
 	brelse(inode_alloc_bh);
 bail:
 	iput(inode_alloc_inode);
@@ -750,10 +751,10 @@ static int ocfs2_wipe_inode(struct inode *inode,
 		/* Lock the orphan dir. The lock will be held for the entire
 		 * delete_inode operation. We do this now to avoid races with
 		 * recovery completion on other nodes. */
-		mutex_lock(&orphan_dir_inode->i_mutex);
+		inode_lock(orphan_dir_inode);
 		status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
 		if (status < 0) {
-			mutex_unlock(&orphan_dir_inode->i_mutex);
+			inode_unlock(orphan_dir_inode);
 
 			mlog_errno(status);
 			goto bail;
@@ -802,7 +803,7 @@ bail_unlock_dir:
 		return status;
 
 	ocfs2_inode_unlock(orphan_dir_inode, 1);
-	mutex_unlock(&orphan_dir_inode->i_mutex);
+	inode_unlock(orphan_dir_inode);
 	brelse(orphan_dir_bh);
 bail:
 	iput(orphan_dir_inode);
diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c
index 3cb097ccc..4506ec5ec 100644
--- a/fs/ocfs2/ioctl.c
+++ b/fs/ocfs2/ioctl.c
@@ -86,7 +86,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
 	unsigned oldflags;
 	int status;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	status = ocfs2_inode_lock(inode, &bh, 1);
 	if (status < 0) {
@@ -135,7 +135,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags,
 bail_unlock:
 	ocfs2_inode_unlock(inode, 1);
 bail:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 
 	brelse(bh);
 
@@ -287,7 +287,7 @@ static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb,
 	struct ocfs2_dinode *dinode_alloc = NULL;
 
 	if (inode_alloc)
-		mutex_lock(&inode_alloc->i_mutex);
+		inode_lock(inode_alloc);
 
 	if (o2info_coherent(&fi->ifi_req)) {
 		status = ocfs2_inode_lock(inode_alloc, &bh, 0);
@@ -317,7 +317,7 @@ bail:
 		ocfs2_inode_unlock(inode_alloc, 0);
 
 	if (inode_alloc)
-		mutex_unlock(&inode_alloc->i_mutex);
+		inode_unlock(inode_alloc);
 
 	brelse(bh);
 
@@ -547,7 +547,7 @@ static int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb,
 	struct ocfs2_dinode *gb_dinode = NULL;
 
 	if (gb_inode)
-		mutex_lock(&gb_inode->i_mutex);
+		inode_lock(gb_inode);
 
 	if (o2info_coherent(&ffg->iff_req)) {
 		status = ocfs2_inode_lock(gb_inode, &bh, 0);
@@ -604,11 +604,9 @@ bail:
 		ocfs2_inode_unlock(gb_inode, 0);
 
 	if (gb_inode)
-		mutex_unlock(&gb_inode->i_mutex);
-
-	if (gb_inode)
-		iput(gb_inode);
+		inode_unlock(gb_inode);
 
+	iput(gb_inode);
 	brelse(bh);
 
 	return status;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 13534f4fe..61b833b72 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1042,8 +1042,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
 
 //	up_write(&journal->j_trans_barrier);
 done:
-	if (inode)
-		iput(inode);
+	iput(inode);
 }
 
 static void ocfs2_clear_journal_error(struct super_block *sb,
@@ -1687,9 +1686,7 @@ done:
 	if (got_lock)
 		ocfs2_inode_unlock(inode, 1);
 
-	if (inode)
-		iput(inode);
-
+	iput(inode);
 	brelse(bh);
 
 	return status;
@@ -1796,8 +1793,7 @@ static int ocfs2_trylock_journal(struct ocfs2_super *osb,
 
 	ocfs2_inode_unlock(inode, 1);
 bail:
-	if (inode)
-		iput(inode);
+	iput(inode);
 
 	return status;
 }
@@ -2092,7 +2088,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
 		return status;
 	}
 
-	mutex_lock(&orphan_dir_inode->i_mutex);
+	inode_lock(orphan_dir_inode);
 	status = ocfs2_inode_lock(orphan_dir_inode, NULL, 0);
 	if (status < 0) {
 		mlog_errno(status);
@@ -2110,7 +2106,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
 out_cluster:
 	ocfs2_inode_unlock(orphan_dir_inode, 0);
 out:
-	mutex_unlock(&orphan_dir_inode->i_mutex);
+	inode_unlock(orphan_dir_inode);
 	iput(orphan_dir_inode);
 	return status;
 }
@@ -2200,7 +2196,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
 		oi->ip_next_orphan = NULL;
 
 		if (oi->ip_flags & OCFS2_INODE_DIO_ORPHAN_ENTRY) {
-			mutex_lock(&inode->i_mutex);
+			inode_lock(inode);
 			ret = ocfs2_rw_lock(inode, 1);
 			if (ret < 0) {
 				mlog_errno(ret);
@@ -2239,7 +2235,7 @@ unlock_inode:
 unlock_rw:
 			ocfs2_rw_unlock(inode, 1);
 unlock_mutex:
-			mutex_unlock(&inode->i_mutex);
+			inode_unlock(inode);
 
 			/* clear dio flag in ocfs2_inode_info */
 			oi->ip_flags &= ~OCFS2_INODE_DIO_ORPHAN_ENTRY;
diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c
index 0a4457fb0..7d62c43a2 100644
--- a/fs/ocfs2/localalloc.c
+++ b/fs/ocfs2/localalloc.c
@@ -358,8 +358,7 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb)
 bail:
 	if (status < 0)
 		brelse(alloc_bh);
-	if (inode)
-		iput(inode);
+	iput(inode);
 
 	trace_ocfs2_load_local_alloc(osb->local_alloc_bits);
 
@@ -415,7 +414,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb)
 		goto out;
 	}
 
-	mutex_lock(&main_bm_inode->i_mutex);
+	inode_lock(main_bm_inode);
 
 	status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
 	if (status < 0) {
@@ -469,12 +468,11 @@ out_unlock:
 	ocfs2_inode_unlock(main_bm_inode, 1);
 
 out_mutex:
-	mutex_unlock(&main_bm_inode->i_mutex);
+	inode_unlock(main_bm_inode);
 	iput(main_bm_inode);
 
 out:
-	if (local_alloc_inode)
-		iput(local_alloc_inode);
+	iput(local_alloc_inode);
 
 	kfree(alloc_copy);
 }
@@ -508,7 +506,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	status = ocfs2_read_inode_block_full(inode, &alloc_bh,
 					     OCFS2_BH_IGNORE_CACHE);
@@ -541,7 +539,7 @@ bail:
 	brelse(alloc_bh);
 
 	if (inode) {
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 		iput(inode);
 	}
 
@@ -573,7 +571,7 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb,
 		goto out;
 	}
 
-	mutex_lock(&main_bm_inode->i_mutex);
+	inode_lock(main_bm_inode);
 
 	status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
 	if (status < 0) {
@@ -603,7 +601,7 @@ out_unlock:
 	ocfs2_inode_unlock(main_bm_inode, 1);
 
 out_mutex:
-	mutex_unlock(&main_bm_inode->i_mutex);
+	inode_unlock(main_bm_inode);
 
 	brelse(main_bm_bh);
 
@@ -645,7 +643,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	mutex_lock(&local_alloc_inode->i_mutex);
+	inode_lock(local_alloc_inode);
 
 	/*
 	 * We must double check state and allocator bits because
@@ -711,7 +709,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb,
 	status = 0;
 bail:
 	if (status < 0 && local_alloc_inode) {
-		mutex_unlock(&local_alloc_inode->i_mutex);
+		inode_unlock(local_alloc_inode);
 		iput(local_alloc_inode);
 	}
 
@@ -1327,9 +1325,7 @@ bail:
 
 	brelse(main_bm_bh);
 
-	if (main_bm_inode)
-		iput(main_bm_inode);
-
+	iput(main_bm_inode);
 	kfree(alloc_copy);
 
 	if (ac)
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 9581d190f..77ebc2bc1 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -147,6 +147,10 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	ret = ocfs2_inode_lock(inode, &di_bh, 1);
 	if (ret < 0) {
 		mlog_errno(ret);
+		if (ret == -ENOMEM)
+			ret = VM_FAULT_OOM;
+		else
+			ret = VM_FAULT_SIGBUS;
 		goto out;
 	}
 
diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c
index 124471d26..e3d05d990 100644
--- a/fs/ocfs2/move_extents.c
+++ b/fs/ocfs2/move_extents.c
@@ -276,7 +276,7 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context,
 	 *	context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv;
 	 */
 
-	mutex_lock(&tl_inode->i_mutex);
+	inode_lock(tl_inode);
 
 	if (ocfs2_truncate_log_needs_flush(osb)) {
 		ret = __ocfs2_flush_truncate_log(osb);
@@ -338,7 +338,7 @@ out_commit:
 	ocfs2_commit_trans(osb, handle);
 
 out_unlock_mutex:
-	mutex_unlock(&tl_inode->i_mutex);
+	inode_unlock(tl_inode);
 
 	if (context->data_ac) {
 		ocfs2_free_alloc_context(context->data_ac);
@@ -632,7 +632,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
 		goto out;
 	}
 
-	mutex_lock(&gb_inode->i_mutex);
+	inode_lock(gb_inode);
 
 	ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1);
 	if (ret) {
@@ -640,7 +640,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context,
 		goto out_unlock_gb_mutex;
 	}
 
-	mutex_lock(&tl_inode->i_mutex);
+	inode_lock(tl_inode);
 
 	handle = ocfs2_start_trans(osb, credits);
 	if (IS_ERR(handle)) {
@@ -708,11 +708,11 @@ out_commit:
 	brelse(gd_bh);
 
 out_unlock_tl_inode:
-	mutex_unlock(&tl_inode->i_mutex);
+	inode_unlock(tl_inode);
 
 	ocfs2_inode_unlock(gb_inode, 1);
 out_unlock_gb_mutex:
-	mutex_unlock(&gb_inode->i_mutex);
+	inode_unlock(gb_inode);
 	brelse(gb_bh);
 	iput(gb_inode);
 
@@ -905,7 +905,7 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context)
 	if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb))
 		return -EROFS;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	/*
 	 * This prevents concurrent writes from other nodes
@@ -969,7 +969,7 @@ out_inode_unlock:
 out_rw_unlock:
 	ocfs2_rw_unlock(inode, 1);
 out:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 
 	return status;
 }
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 3123408da..6b3e87189 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -1045,7 +1045,7 @@ leave:
 	if (orphan_dir) {
 		/* This was locked for us in ocfs2_prepare_orphan_dir() */
 		ocfs2_inode_unlock(orphan_dir, 1);
-		mutex_unlock(&orphan_dir->i_mutex);
+		inode_unlock(orphan_dir);
 		iput(orphan_dir);
 	}
 
@@ -1664,7 +1664,7 @@ bail:
 	if (orphan_dir) {
 		/* This was locked for us in ocfs2_prepare_orphan_dir() */
 		ocfs2_inode_unlock(orphan_dir, 1);
-		mutex_unlock(&orphan_dir->i_mutex);
+		inode_unlock(orphan_dir);
 		iput(orphan_dir);
 	}
 
@@ -1683,8 +1683,7 @@ bail:
 	if (new_inode)
 		sync_mapping_buffers(old_inode->i_mapping);
 
-	if (new_inode)
-		iput(new_inode);
+	iput(new_inode);
 
 	ocfs2_free_dir_lookup_result(&target_lookup_res);
 	ocfs2_free_dir_lookup_result(&old_entry_lookup);
@@ -1958,6 +1957,7 @@ static int ocfs2_symlink(struct inode *dir,
 	inode->i_rdev = 0;
 	newsize = l - 1;
 	inode->i_op = &ocfs2_symlink_inode_operations;
+	inode_nohighmem(inode);
 	if (l > ocfs2_fast_symlink_chars(sb)) {
 		u32 offset = 0;
 
@@ -2121,11 +2121,11 @@ static int ocfs2_lookup_lock_orphan_dir(struct ocfs2_super *osb,
 		return ret;
 	}
 
-	mutex_lock(&orphan_dir_inode->i_mutex);
+	inode_lock(orphan_dir_inode);
 
 	ret = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
 	if (ret < 0) {
-		mutex_unlock(&orphan_dir_inode->i_mutex);
+		inode_unlock(orphan_dir_inode);
 		iput(orphan_dir_inode);
 
 		mlog_errno(ret);
@@ -2226,7 +2226,7 @@ out:
 
 	if (ret) {
 		ocfs2_inode_unlock(orphan_dir_inode, 1);
-		mutex_unlock(&orphan_dir_inode->i_mutex);
+		inode_unlock(orphan_dir_inode);
 		iput(orphan_dir_inode);
 	}
 
@@ -2372,6 +2372,15 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
 	     (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno,
 	     name, strlen(name));
 
+	status = ocfs2_journal_access_di(handle,
+					 INODE_CACHE(orphan_dir_inode),
+					 orphan_dir_bh,
+					 OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto leave;
+	}
+
 	/* find it's spot in the orphan directory */
 	status = ocfs2_find_entry(name, strlen(name), orphan_dir_inode,
 				  &lookup);
@@ -2387,15 +2396,6 @@ int ocfs2_orphan_del(struct ocfs2_super *osb,
 		goto leave;
 	}
 
-	status = ocfs2_journal_access_di(handle,
-					 INODE_CACHE(orphan_dir_inode),
-					 orphan_dir_bh,
-					 OCFS2_JOURNAL_ACCESS_WRITE);
-	if (status < 0) {
-		mlog_errno(status);
-		goto leave;
-	}
-
 	/* do the i_nlink dance! :) */
 	orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data;
 	if (S_ISDIR(inode->i_mode))
@@ -2495,7 +2495,7 @@ out:
 			ocfs2_free_alloc_context(inode_ac);
 
 		/* Unroll orphan dir locking */
-		mutex_unlock(&orphan_dir->i_mutex);
+		inode_unlock(orphan_dir);
 		ocfs2_inode_unlock(orphan_dir, 1);
 		iput(orphan_dir);
 	}
@@ -2602,7 +2602,7 @@ leave:
 	if (orphan_dir) {
 		/* This was locked for us in ocfs2_prepare_orphan_dir() */
 		ocfs2_inode_unlock(orphan_dir, 1);
-		mutex_unlock(&orphan_dir->i_mutex);
+		inode_unlock(orphan_dir);
 		iput(orphan_dir);
 	}
 
@@ -2689,7 +2689,7 @@ int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb,
 
 bail_unlock_orphan:
 	ocfs2_inode_unlock(orphan_dir_inode, 1);
-	mutex_unlock(&orphan_dir_inode->i_mutex);
+	inode_unlock(orphan_dir_inode);
 	iput(orphan_dir_inode);
 
 	ocfs2_free_dir_lookup_result(&orphan_insert);
@@ -2721,10 +2721,10 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	mutex_lock(&orphan_dir_inode->i_mutex);
+	inode_lock(orphan_dir_inode);
 	status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
 	if (status < 0) {
-		mutex_unlock(&orphan_dir_inode->i_mutex);
+		inode_unlock(orphan_dir_inode);
 		iput(orphan_dir_inode);
 		mlog_errno(status);
 		goto bail;
@@ -2770,7 +2770,7 @@ bail_commit:
 
 bail_unlock_orphan:
 	ocfs2_inode_unlock(orphan_dir_inode, 1);
-	mutex_unlock(&orphan_dir_inode->i_mutex);
+	inode_unlock(orphan_dir_inode);
 	brelse(orphan_dir_bh);
 	iput(orphan_dir_inode);
 
@@ -2834,12 +2834,12 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir,
 		goto leave;
 	}
 
-	mutex_lock(&orphan_dir_inode->i_mutex);
+	inode_lock(orphan_dir_inode);
 
 	status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1);
 	if (status < 0) {
 		mlog_errno(status);
-		mutex_unlock(&orphan_dir_inode->i_mutex);
+		inode_unlock(orphan_dir_inode);
 		iput(orphan_dir_inode);
 		goto leave;
 	}
@@ -2901,7 +2901,7 @@ out_commit:
 	ocfs2_commit_trans(osb, handle);
 orphan_unlock:
 	ocfs2_inode_unlock(orphan_dir_inode, 1);
-	mutex_unlock(&orphan_dir_inode->i_mutex);
+	inode_unlock(orphan_dir_inode);
 	iput(orphan_dir_inode);
 leave:
 
diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h
index b6d51333a..d153e6e31 100644
--- a/fs/ocfs2/quota.h
+++ b/fs/ocfs2/quota.h
@@ -82,7 +82,7 @@ struct ocfs2_quota_chunk {
 extern struct kmem_cache *ocfs2_dquot_cachep;
 extern struct kmem_cache *ocfs2_qf_chunk_cachep;
 
-extern struct qtree_fmt_operations ocfs2_global_ops;
+extern const struct qtree_fmt_operations ocfs2_global_ops;
 
 struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
 				struct ocfs2_super *osb, int slot_num);
diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c
index c93d67220..9c9dd30bc 100644
--- a/fs/ocfs2/quota_global.c
+++ b/fs/ocfs2/quota_global.c
@@ -123,7 +123,7 @@ static int ocfs2_global_is_id(void *dp, struct dquot *dquot)
 		      dquot->dq_id);
 }
 
-struct qtree_fmt_operations ocfs2_global_ops = {
+const struct qtree_fmt_operations ocfs2_global_ops = {
 	.mem2disk_dqblk = ocfs2_global_mem2diskdqb,
 	.disk2mem_dqblk = ocfs2_global_disk2memdqb,
 	.is_id = ocfs2_global_is_id,
@@ -308,7 +308,7 @@ int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
 		WARN_ON(bh != oinfo->dqi_gqi_bh);
 	spin_unlock(&dq_data_lock);
 	if (ex) {
-		mutex_lock(&oinfo->dqi_gqinode->i_mutex);
+		inode_lock(oinfo->dqi_gqinode);
 		down_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
 	} else {
 		down_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
@@ -320,7 +320,7 @@ void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex)
 {
 	if (ex) {
 		up_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
-		mutex_unlock(&oinfo->dqi_gqinode->i_mutex);
+		inode_unlock(oinfo->dqi_gqinode);
 	} else {
 		up_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem);
 	}
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index 252119860..3eff031aa 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -807,7 +807,7 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh)
 			mlog_errno(ret);
 			goto out;
 		}
-		mutex_lock(&alloc_inode->i_mutex);
+		inode_lock(alloc_inode);
 
 		ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1);
 		if (ret) {
@@ -867,7 +867,7 @@ out_unlock:
 	}
 out_mutex:
 	if (alloc_inode) {
-		mutex_unlock(&alloc_inode->i_mutex);
+		inode_unlock(alloc_inode);
 		iput(alloc_inode);
 	}
 out:
@@ -4197,7 +4197,7 @@ static int __ocfs2_reflink(struct dentry *old_dentry,
 		goto out;
 	}
 
-	mutex_lock_nested(&new_inode->i_mutex, I_MUTEX_CHILD);
+	inode_lock_nested(new_inode, I_MUTEX_CHILD);
 	ret = ocfs2_inode_lock_nested(new_inode, &new_bh, 1,
 				      OI_LS_REFLINK_TARGET);
 	if (ret) {
@@ -4231,7 +4231,7 @@ inode_unlock:
 	ocfs2_inode_unlock(new_inode, 1);
 	brelse(new_bh);
 out_unlock:
-	mutex_unlock(&new_inode->i_mutex);
+	inode_unlock(new_inode);
 out:
 	if (!ret) {
 		ret = filemap_fdatawait(inode->i_mapping);
@@ -4402,11 +4402,11 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
 			return error;
 	}
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 	error = dquot_initialize(dir);
 	if (!error)
 		error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve);
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	if (!error)
 		fsnotify_create(dir, new_dentry);
 	return error;
diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c
index 79b802130..576b9a048 100644
--- a/fs/ocfs2/resize.c
+++ b/fs/ocfs2/resize.c
@@ -301,7 +301,7 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters)
 		goto out;
 	}
 
-	mutex_lock(&main_bm_inode->i_mutex);
+	inode_lock(main_bm_inode);
 
 	ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
 	if (ret < 0) {
@@ -375,7 +375,7 @@ out_unlock:
 	ocfs2_inode_unlock(main_bm_inode, 1);
 
 out_mutex:
-	mutex_unlock(&main_bm_inode->i_mutex);
+	inode_unlock(main_bm_inode);
 	iput(main_bm_inode);
 
 out:
@@ -486,7 +486,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input)
 		goto out;
 	}
 
-	mutex_lock(&main_bm_inode->i_mutex);
+	inode_lock(main_bm_inode);
 
 	ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1);
 	if (ret < 0) {
@@ -590,7 +590,7 @@ out_unlock:
 	ocfs2_inode_unlock(main_bm_inode, 1);
 
 out_mutex:
-	mutex_unlock(&main_bm_inode->i_mutex);
+	inode_unlock(main_bm_inode);
 	iput(main_bm_inode);
 
 out:
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index e78a203d4..1e0959214 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -322,8 +322,7 @@ static void __ocfs2_free_slot_info(struct ocfs2_slot_info *si)
 	if (si == NULL)
 		return;
 
-	if (si->si_inode)
-		iput(si->si_inode);
+	iput(si->si_inode);
 	if (si->si_bh) {
 		for (i = 0; i < si->si_blocks; i++) {
 			if (si->si_bh[i]) {
@@ -503,8 +502,17 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
 	trace_ocfs2_find_slot(osb->slot_num);
 
 	status = ocfs2_update_disk_slot(osb, si, osb->slot_num);
-	if (status < 0)
+	if (status < 0) {
 		mlog_errno(status);
+		/*
+		 * if write block failed, invalidate slot to avoid overwrite
+		 * slot during dismount in case another node rightly has mounted
+		 */
+		spin_lock(&osb->osb_lock);
+		ocfs2_invalidate_slot(si, osb->slot_num);
+		osb->slot_num = OCFS2_INVALID_SLOT;
+		spin_unlock(&osb->osb_lock);
+	}
 
 bail:
 	return status;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index fc6d25f6d..2f19aeec5 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -141,7 +141,7 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac)
 		if (ac->ac_which != OCFS2_AC_USE_LOCAL)
 			ocfs2_inode_unlock(inode, 1);
 
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 
 		iput(inode);
 		ac->ac_inode = NULL;
@@ -797,11 +797,11 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb,
 		return -EINVAL;
 	}
 
-	mutex_lock(&alloc_inode->i_mutex);
+	inode_lock(alloc_inode);
 
 	status = ocfs2_inode_lock(alloc_inode, &bh, 1);
 	if (status < 0) {
-		mutex_unlock(&alloc_inode->i_mutex);
+		inode_unlock(alloc_inode);
 		iput(alloc_inode);
 
 		mlog_errno(status);
@@ -2875,10 +2875,10 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
 		goto bail;
 	}
 
-	mutex_lock(&inode_alloc_inode->i_mutex);
+	inode_lock(inode_alloc_inode);
 	status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0);
 	if (status < 0) {
-		mutex_unlock(&inode_alloc_inode->i_mutex);
+		inode_unlock(inode_alloc_inode);
 		iput(inode_alloc_inode);
 		mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n",
 		     (u32)suballoc_slot, status);
@@ -2891,7 +2891,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res)
 		mlog(ML_ERROR, "test suballoc bit failed %d\n", status);
 
 	ocfs2_inode_unlock(inode_alloc_inode, 0);
-	mutex_unlock(&inode_alloc_inode->i_mutex);
+	inode_unlock(inode_alloc_inode);
 
 	iput(inode_alloc_inode);
 	brelse(alloc_bh);
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 2de4c8a93..faa136509 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1280,6 +1280,8 @@ static int ocfs2_parse_options(struct super_block *sb,
 	int status, user_stack = 0;
 	char *p;
 	u32 tmp;
+	int token, option;
+	substring_t args[MAX_OPT_ARGS];
 
 	trace_ocfs2_parse_options(is_remount, options ? options : "(none)");
 
@@ -1298,9 +1300,6 @@ static int ocfs2_parse_options(struct super_block *sb,
 	}
 
 	while ((p = strsep(&options, ",")) != NULL) {
-		int token, option;
-		substring_t args[MAX_OPT_ARGS];
-
 		if (!*p)
 			continue;
 
@@ -1367,7 +1366,6 @@ static int ocfs2_parse_options(struct super_block *sb,
 				mopt->atime_quantum = option;
 			break;
 		case Opt_slot:
-			option = 0;
 			if (match_int(&args[0], &option)) {
 				status = 0;
 				goto bail;
@@ -1376,7 +1374,6 @@ static int ocfs2_parse_options(struct super_block *sb,
 				mopt->slot = (s16)option;
 			break;
 		case Opt_commit:
-			option = 0;
 			if (match_int(&args[0], &option)) {
 				status = 0;
 				goto bail;
@@ -1388,7 +1385,6 @@ static int ocfs2_parse_options(struct super_block *sb,
 			mopt->commit_interval = HZ * option;
 			break;
 		case Opt_localalloc:
-			option = 0;
 			if (match_int(&args[0], &option)) {
 				status = 0;
 				goto bail;
@@ -1726,8 +1722,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf)
 	ocfs2_inode_unlock(inode, 0);
 	status = 0;
 bail:
-	if (inode)
-		iput(inode);
+	iput(inode);
 
 	if (status)
 		mlog_errno(status);
@@ -1771,7 +1766,7 @@ static int ocfs2_initialize_mem_caches(void)
 				       sizeof(struct ocfs2_inode_info),
 				       0,
 				       (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 				       ocfs2_inode_init_once);
 	ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache",
 					sizeof(struct ocfs2_dquot),
diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c
index 66edce7ec..6c2a3e3c5 100644
--- a/fs/ocfs2/symlink.c
+++ b/fs/ocfs2/symlink.c
@@ -88,8 +88,7 @@ const struct address_space_operations ocfs2_fast_symlink_aops = {
 
 const struct inode_operations ocfs2_symlink_inode_operations = {
 	.readlink	= generic_readlink,
-	.follow_link	= page_follow_link_light,
-	.put_link	= page_put_link,
+	.get_link	= page_get_link,
 	.getattr	= ocfs2_getattr,
 	.setattr	= ocfs2_setattr,
 	.setxattr	= generic_setxattr,
diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
index e9164f098..7d3d979f5 100644
--- a/fs/ocfs2/xattr.c
+++ b/fs/ocfs2/xattr.c
@@ -544,8 +544,7 @@ static inline const char *ocfs2_xattr_prefix(int name_index)
 
 	if (name_index > 0 && name_index < OCFS2_XATTR_MAX)
 		handler = ocfs2_xattr_handler_map[name_index];
-
-	return handler ? handler->prefix : NULL;
+	return handler ? xattr_prefix(handler) : NULL;
 }
 
 static u32 ocfs2_xattr_name_hash(struct inode *inode,
@@ -884,14 +883,39 @@ static int ocfs2_xattr_value_truncate(struct inode *inode,
 	return ret;
 }
 
-static int ocfs2_xattr_list_entry(char *buffer, size_t size,
-				  size_t *result, const char *prefix,
+static int ocfs2_xattr_list_entry(struct super_block *sb,
+				  char *buffer, size_t size,
+				  size_t *result, int type,
 				  const char *name, int name_len)
 {
 	char *p = buffer + *result;
-	int prefix_len = strlen(prefix);
-	int total_len = prefix_len + name_len + 1;
+	const char *prefix;
+	int prefix_len;
+	int total_len;
 
+	switch(type) {
+	case OCFS2_XATTR_INDEX_USER:
+		if (OCFS2_SB(sb)->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
+			return 0;
+		break;
+
+	case OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS:
+	case OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT:
+		if (!(sb->s_flags & MS_POSIXACL))
+			return 0;
+		break;
+
+	case OCFS2_XATTR_INDEX_TRUSTED:
+		if (!capable(CAP_SYS_ADMIN))
+			return 0;
+		break;
+	}
+
+	prefix = ocfs2_xattr_prefix(type);
+	if (!prefix)
+		return 0;
+	prefix_len = strlen(prefix);
+	total_len = prefix_len + name_len + 1;
 	*result += total_len;
 
 	/* we are just looking for how big our buffer needs to be */
@@ -914,23 +938,20 @@ static int ocfs2_xattr_list_entries(struct inode *inode,
 {
 	size_t result = 0;
 	int i, type, ret;
-	const char *prefix, *name;
+	const char *name;
 
 	for (i = 0 ; i < le16_to_cpu(header->xh_count); i++) {
 		struct ocfs2_xattr_entry *entry = &header->xh_entries[i];
 		type = ocfs2_xattr_get_type(entry);
-		prefix = ocfs2_xattr_prefix(type);
-
-		if (prefix) {
-			name = (const char *)header +
-				le16_to_cpu(entry->xe_name_offset);
+		name = (const char *)header +
+			le16_to_cpu(entry->xe_name_offset);
 
-			ret = ocfs2_xattr_list_entry(buffer, buffer_size,
-						     &result, prefix, name,
-						     entry->xe_name_len);
-			if (ret)
-				return ret;
-		}
+		ret = ocfs2_xattr_list_entry(inode->i_sb,
+					     buffer, buffer_size,
+					     &result, type, name,
+					     entry->xe_name_len);
+		if (ret)
+			return ret;
 	}
 
 	return result;
@@ -2503,7 +2524,7 @@ static int ocfs2_xattr_free_block(struct inode *inode,
 		mlog_errno(ret);
 		goto out;
 	}
-	mutex_lock(&xb_alloc_inode->i_mutex);
+	inode_lock(xb_alloc_inode);
 
 	ret = ocfs2_inode_lock(xb_alloc_inode, &xb_alloc_bh, 1);
 	if (ret < 0) {
@@ -2528,7 +2549,7 @@ out_unlock:
 	ocfs2_inode_unlock(xb_alloc_inode, 1);
 	brelse(xb_alloc_bh);
 out_mutex:
-	mutex_unlock(&xb_alloc_inode->i_mutex);
+	inode_unlock(xb_alloc_inode);
 	iput(xb_alloc_inode);
 out:
 	brelse(blk_bh);
@@ -3598,17 +3619,17 @@ int ocfs2_xattr_set(struct inode *inode,
 		}
 	}
 
-	mutex_lock(&tl_inode->i_mutex);
+	inode_lock(tl_inode);
 
 	if (ocfs2_truncate_log_needs_flush(osb)) {
 		ret = __ocfs2_flush_truncate_log(osb);
 		if (ret < 0) {
-			mutex_unlock(&tl_inode->i_mutex);
+			inode_unlock(tl_inode);
 			mlog_errno(ret);
 			goto cleanup;
 		}
 	}
-	mutex_unlock(&tl_inode->i_mutex);
+	inode_unlock(tl_inode);
 
 	ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis,
 					&xbs, &ctxt, ref_meta, &credits);
@@ -4033,32 +4054,30 @@ static int ocfs2_list_xattr_bucket(struct inode *inode,
 	int ret = 0, type;
 	struct ocfs2_xattr_tree_list *xl = (struct ocfs2_xattr_tree_list *)para;
 	int i, block_off, new_offset;
-	const char *prefix, *name;
+	const char *name;
 
 	for (i = 0 ; i < le16_to_cpu(bucket_xh(bucket)->xh_count); i++) {
 		struct ocfs2_xattr_entry *entry = &bucket_xh(bucket)->xh_entries[i];
 		type = ocfs2_xattr_get_type(entry);
-		prefix = ocfs2_xattr_prefix(type);
 
-		if (prefix) {
-			ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
-								bucket_xh(bucket),
-								i,
-								&block_off,
-								&new_offset);
-			if (ret)
-				break;
+		ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb,
+							bucket_xh(bucket),
+							i,
+							&block_off,
+							&new_offset);
+		if (ret)
+			break;
 
-			name = (const char *)bucket_block(bucket, block_off) +
-				new_offset;
-			ret = ocfs2_xattr_list_entry(xl->buffer,
-						     xl->buffer_size,
-						     &xl->result,
-						     prefix, name,
-						     entry->xe_name_len);
-			if (ret)
-				break;
-		}
+		name = (const char *)bucket_block(bucket, block_off) +
+			new_offset;
+		ret = ocfs2_xattr_list_entry(inode->i_sb,
+					     xl->buffer,
+					     xl->buffer_size,
+					     &xl->result,
+					     type, name,
+					     entry->xe_name_len);
+		if (ret)
+			break;
 	}
 
 	return ret;
@@ -5441,7 +5460,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode,
 		return ret;
 	}
 
-	mutex_lock(&tl_inode->i_mutex);
+	inode_lock(tl_inode);
 
 	if (ocfs2_truncate_log_needs_flush(osb)) {
 		ret = __ocfs2_flush_truncate_log(osb);
@@ -5485,7 +5504,7 @@ out_commit:
 out:
 	ocfs2_schedule_truncate_log_flush(osb, 1);
 
-	mutex_unlock(&tl_inode->i_mutex);
+	inode_unlock(tl_inode);
 
 	if (meta_ac)
 		ocfs2_free_alloc_context(meta_ac);
@@ -7226,31 +7245,14 @@ int ocfs2_init_security_and_acl(struct inode *dir,
 leave:
 	return ret;
 }
+
 /*
  * 'security' attributes support
  */
-static size_t ocfs2_xattr_security_list(const struct xattr_handler *handler,
-					struct dentry *dentry, char *list,
-					size_t list_size, const char *name,
-					size_t name_len)
-{
-	const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN;
-	const size_t total_len = prefix_len + name_len + 1;
-
-	if (list && total_len <= list_size) {
-		memcpy(list, XATTR_SECURITY_PREFIX, prefix_len);
-		memcpy(list + prefix_len, name, name_len);
-		list[prefix_len + name_len] = '\0';
-	}
-	return total_len;
-}
-
 static int ocfs2_xattr_security_get(const struct xattr_handler *handler,
 				    struct dentry *dentry, const char *name,
 				    void *buffer, size_t size)
 {
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
 	return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_SECURITY,
 			       name, buffer, size);
 }
@@ -7259,9 +7261,6 @@ static int ocfs2_xattr_security_set(const struct xattr_handler *handler,
 				    struct dentry *dentry, const char *name,
 				    const void *value, size_t size, int flags)
 {
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
-
 	return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_SECURITY,
 			       name, value, size, flags);
 }
@@ -7314,7 +7313,6 @@ int ocfs2_init_security_set(handle_t *handle,
 
 const struct xattr_handler ocfs2_xattr_security_handler = {
 	.prefix	= XATTR_SECURITY_PREFIX,
-	.list	= ocfs2_xattr_security_list,
 	.get	= ocfs2_xattr_security_get,
 	.set	= ocfs2_xattr_security_set,
 };
@@ -7322,31 +7320,10 @@ const struct xattr_handler ocfs2_xattr_security_handler = {
 /*
  * 'trusted' attributes support
  */
-static size_t ocfs2_xattr_trusted_list(const struct xattr_handler *handler,
-				       struct dentry *dentry, char *list,
-				       size_t list_size, const char *name,
-				       size_t name_len)
-{
-	const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
-	const size_t total_len = prefix_len + name_len + 1;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return 0;
-
-	if (list && total_len <= list_size) {
-		memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len);
-		memcpy(list + prefix_len, name, name_len);
-		list[prefix_len + name_len] = '\0';
-	}
-	return total_len;
-}
-
 static int ocfs2_xattr_trusted_get(const struct xattr_handler *handler,
 				   struct dentry *dentry, const char *name,
 				   void *buffer, size_t size)
 {
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
 	return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_TRUSTED,
 			       name, buffer, size);
 }
@@ -7355,16 +7332,12 @@ static int ocfs2_xattr_trusted_set(const struct xattr_handler *handler,
 				   struct dentry *dentry, const char *name,
 				   const void *value, size_t size, int flags)
 {
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
-
 	return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_TRUSTED,
 			       name, value, size, flags);
 }
 
 const struct xattr_handler ocfs2_xattr_trusted_handler = {
 	.prefix	= XATTR_TRUSTED_PREFIX,
-	.list	= ocfs2_xattr_trusted_list,
 	.get	= ocfs2_xattr_trusted_get,
 	.set	= ocfs2_xattr_trusted_set,
 };
@@ -7372,34 +7345,12 @@ const struct xattr_handler ocfs2_xattr_trusted_handler = {
 /*
  * 'user' attributes support
  */
-static size_t ocfs2_xattr_user_list(const struct xattr_handler *handler,
-				    struct dentry *dentry, char *list,
-				    size_t list_size, const char *name,
-				    size_t name_len)
-{
-	const size_t prefix_len = XATTR_USER_PREFIX_LEN;
-	const size_t total_len = prefix_len + name_len + 1;
-	struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
-
-	if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
-		return 0;
-
-	if (list && total_len <= list_size) {
-		memcpy(list, XATTR_USER_PREFIX, prefix_len);
-		memcpy(list + prefix_len, name, name_len);
-		list[prefix_len + name_len] = '\0';
-	}
-	return total_len;
-}
-
 static int ocfs2_xattr_user_get(const struct xattr_handler *handler,
 				struct dentry *dentry, const char *name,
 				void *buffer, size_t size)
 {
 	struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
 
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
 	if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
 		return -EOPNOTSUPP;
 	return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_USER, name,
@@ -7412,8 +7363,6 @@ static int ocfs2_xattr_user_set(const struct xattr_handler *handler,
 {
 	struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb);
 
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
 	if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR)
 		return -EOPNOTSUPP;
 
@@ -7423,7 +7372,6 @@ static int ocfs2_xattr_user_set(const struct xattr_handler *handler,
 
 const struct xattr_handler ocfs2_xattr_user_handler = {
 	.prefix	= XATTR_USER_PREFIX,
-	.list	= ocfs2_xattr_user_list,
 	.get	= ocfs2_xattr_user_get,
 	.set	= ocfs2_xattr_user_set,
 };
diff --git a/fs/open.c b/fs/open.c
index 32dfe4f2a..16f561ea7 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -61,13 +61,12 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
 	if (ret)
 		newattrs.ia_valid |= ret | ATTR_FORCE;
 
-	mutex_lock(&dentry->d_inode->i_mutex);
+	inode_lock(dentry->d_inode);
 	/* Note any delegations or leases have already been broken: */
 	ret = notify_change(dentry, &newattrs, NULL);
-	mutex_unlock(&dentry->d_inode->i_mutex);
+	inode_unlock(dentry->d_inode);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(do_truncate);
 
 long vfs_truncate(struct path *path, loff_t length)
 {
@@ -514,7 +513,7 @@ static int chmod_common(struct path *path, umode_t mode)
 	if (error)
 		return error;
 retry_deleg:
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 	error = security_path_chmod(path, mode);
 	if (error)
 		goto out_unlock;
@@ -522,7 +521,7 @@ retry_deleg:
 	newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
 	error = notify_change(path->dentry, &newattrs, &delegated_inode);
 out_unlock:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	if (delegated_inode) {
 		error = break_deleg_wait(&delegated_inode);
 		if (!error)
@@ -597,11 +596,11 @@ retry_deleg:
 	if (!S_ISDIR(inode->i_mode))
 		newattrs.ia_valid |=
 			ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 	error = security_path_chown(path, uid, gid);
 	if (!error)
 		error = notify_change(path->dentry, &newattrs, &delegated_inode);
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	if (delegated_inode) {
 		error = break_deleg_wait(&delegated_inode);
 		if (!error)
@@ -682,7 +681,6 @@ int open_check_o_direct(struct file *f)
 	}
 	return 0;
 }
-EXPORT_SYMBOL_GPL(open_check_o_direct);
 
 static int do_dentry_open(struct file *f,
 			  struct inode *inode,
@@ -892,7 +890,7 @@ EXPORT_SYMBOL(dentry_open);
 static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op)
 {
 	int lookup_flags = 0;
-	int acc_mode;
+	int acc_mode = ACC_MODE(flags);
 
 	if (flags & (O_CREAT | __O_TMPFILE))
 		op->mode = (mode & S_IALLUGO) | S_IFREG;
@@ -914,7 +912,6 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
 	if (flags & __O_TMPFILE) {
 		if ((flags & O_TMPFILE_MASK) != O_TMPFILE)
 			return -EINVAL;
-		acc_mode = MAY_OPEN | ACC_MODE(flags);
 		if (!(acc_mode & MAY_WRITE))
 			return -EINVAL;
 	} else if (flags & O_PATH) {
@@ -924,8 +921,6 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o
 		 */
 		flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH;
 		acc_mode = 0;
-	} else {
-		acc_mode = MAY_OPEN | ACC_MODE(flags);
 	}
 
 	op->open_flag = flags;
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index 15e4500cd..b61b883c8 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -443,7 +443,7 @@ static int __init init_openprom_fs(void)
 					    sizeof(struct op_inode_info),
 					    0,
 					    (SLAB_RECLAIM_ACCOUNT |
-					     SLAB_MEM_SPREAD),
+					     SLAB_MEM_SPREAD | SLAB_ACCOUNT),
 					    op_inode_init_once);
 	if (!op_inode_cachep)
 		return -ENOMEM;
diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c
index eff6319d5..d894e7cd9 100644
--- a/fs/overlayfs/copy_up.c
+++ b/fs/overlayfs/copy_up.c
@@ -248,9 +248,9 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir,
 	if (err)
 		goto out_cleanup;
 
-	mutex_lock(&newdentry->d_inode->i_mutex);
+	inode_lock(newdentry->d_inode);
 	err = ovl_set_attr(newdentry, stat);
-	mutex_unlock(&newdentry->d_inode->i_mutex);
+	inode_unlock(newdentry->d_inode);
 	if (err)
 		goto out_cleanup;
 
diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c
index 692ceda3b..52f6de5d4 100644
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -167,7 +167,7 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
 	struct dentry *newdentry;
 	int err;
 
-	mutex_lock_nested(&udir->i_mutex, I_MUTEX_PARENT);
+	inode_lock_nested(udir, I_MUTEX_PARENT);
 	newdentry = lookup_one_len(dentry->d_name.name, upperdir,
 				   dentry->d_name.len);
 	err = PTR_ERR(newdentry);
@@ -185,7 +185,7 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode,
 out_dput:
 	dput(newdentry);
 out_unlock:
-	mutex_unlock(&udir->i_mutex);
+	inode_unlock(udir);
 	return err;
 }
 
@@ -258,9 +258,9 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry,
 	if (err)
 		goto out_cleanup;
 
-	mutex_lock(&opaquedir->d_inode->i_mutex);
+	inode_lock(opaquedir->d_inode);
 	err = ovl_set_attr(opaquedir, &stat);
-	mutex_unlock(&opaquedir->d_inode->i_mutex);
+	inode_unlock(opaquedir->d_inode);
 	if (err)
 		goto out_cleanup;
 
@@ -599,7 +599,7 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
 	struct dentry *upper = ovl_dentry_upper(dentry);
 	int err;
 
-	mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+	inode_lock_nested(dir, I_MUTEX_PARENT);
 	err = -ESTALE;
 	if (upper->d_parent == upperdir) {
 		/* Don't let d_delete() think it can reset d_inode */
@@ -618,8 +618,9 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir)
 	 * sole user of this dentry.  Too tricky...  Just unhash for
 	 * now.
 	 */
-	d_drop(dentry);
-	mutex_unlock(&dir->i_mutex);
+	if (!err)
+		d_drop(dentry);
+	inode_unlock(dir);
 
 	return err;
 }
@@ -903,6 +904,13 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old,
 	if (!overwrite && new_is_dir && !old_opaque && new_opaque)
 		ovl_remove_opaque(newdentry);
 
+	/*
+	 * Old dentry now lives in different location. Dentries in
+	 * lowerstack are stale. We cannot drop them here because
+	 * access to them is lockless. This could be only pure upper
+	 * or opaque directory - numlower is zero. Or upper non-dir
+	 * entry - its pureness is tracked by flag opaque.
+	 */
 	if (old_opaque != new_opaque) {
 		ovl_dentry_set_opaque(old, new_opaque);
 		if (!overwrite)
diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c
index b29036aa8..a4ff5d0d7 100644
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -63,9 +63,11 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr)
 	if (!err) {
 		upperdentry = ovl_dentry_upper(dentry);
 
-		mutex_lock(&upperdentry->d_inode->i_mutex);
+		inode_lock(upperdentry->d_inode);
 		err = notify_change(upperdentry, attr, NULL);
-		mutex_unlock(&upperdentry->d_inode->i_mutex);
+		if (!err)
+			ovl_copyattr(upperdentry->d_inode, dentry->d_inode);
+		inode_unlock(upperdentry->d_inode);
 	}
 	ovl_drop_write(dentry);
 out:
@@ -108,6 +110,29 @@ int ovl_permission(struct inode *inode, int mask)
 
 	realdentry = ovl_entry_real(oe, &is_upper);
 
+	if (ovl_is_default_permissions(inode)) {
+		struct kstat stat;
+		struct path realpath = { .dentry = realdentry };
+
+		if (mask & MAY_NOT_BLOCK)
+			return -ECHILD;
+
+		realpath.mnt = ovl_entry_mnt_real(oe, inode, is_upper);
+
+		err = vfs_getattr(&realpath, &stat);
+		if (err)
+			return err;
+
+		if ((stat.mode ^ inode->i_mode) & S_IFMT)
+			return -ESTALE;
+
+		inode->i_mode = stat.mode;
+		inode->i_uid = stat.uid;
+		inode->i_gid = stat.gid;
+
+		return generic_permission(inode, mask);
+	}
+
 	/* Careful in RCU walk mode */
 	realinode = ACCESS_ONCE(realdentry->d_inode);
 	if (!realinode) {
@@ -144,57 +169,23 @@ out_dput:
 	return err;
 }
 
-
-struct ovl_link_data {
-	struct dentry *realdentry;
-	void *cookie;
-};
-
-static const char *ovl_follow_link(struct dentry *dentry, void **cookie)
+static const char *ovl_get_link(struct dentry *dentry,
+				struct inode *inode,
+				struct delayed_call *done)
 {
 	struct dentry *realdentry;
 	struct inode *realinode;
-	struct ovl_link_data *data = NULL;
-	const char *ret;
+
+	if (!dentry)
+		return ERR_PTR(-ECHILD);
 
 	realdentry = ovl_dentry_real(dentry);
 	realinode = realdentry->d_inode;
 
-	if (WARN_ON(!realinode->i_op->follow_link))
+	if (WARN_ON(!realinode->i_op->get_link))
 		return ERR_PTR(-EPERM);
 
-	if (realinode->i_op->put_link) {
-		data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL);
-		if (!data)
-			return ERR_PTR(-ENOMEM);
-		data->realdentry = realdentry;
-	}
-
-	ret = realinode->i_op->follow_link(realdentry, cookie);
-	if (IS_ERR_OR_NULL(ret)) {
-		kfree(data);
-		return ret;
-	}
-
-	if (data)
-		data->cookie = *cookie;
-
-	*cookie = data;
-
-	return ret;
-}
-
-static void ovl_put_link(struct inode *unused, void *c)
-{
-	struct inode *realinode;
-	struct ovl_link_data *data = c;
-
-	if (!data)
-		return;
-
-	realinode = data->realdentry->d_inode;
-	realinode->i_op->put_link(realinode, data->cookie);
-	kfree(data);
+	return realinode->i_op->get_link(realdentry, realinode, done);
 }
 
 static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
@@ -391,8 +382,7 @@ static const struct inode_operations ovl_file_inode_operations = {
 
 static const struct inode_operations ovl_symlink_inode_operations = {
 	.setattr	= ovl_setattr,
-	.follow_link	= ovl_follow_link,
-	.put_link	= ovl_put_link,
+	.get_link	= ovl_get_link,
 	.readlink	= ovl_readlink,
 	.getattr	= ovl_getattr,
 	.setxattr	= ovl_setxattr,
diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h
index e17154aea..99b4168c3 100644
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -142,7 +142,10 @@ struct dentry *ovl_dentry_upper(struct dentry *dentry);
 struct dentry *ovl_dentry_lower(struct dentry *dentry);
 struct dentry *ovl_dentry_real(struct dentry *dentry);
 struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper);
+struct vfsmount *ovl_entry_mnt_real(struct ovl_entry *oe, struct inode *inode,
+				    bool is_upper);
 struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry);
+bool ovl_is_default_permissions(struct inode *inode);
 void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache);
 struct dentry *ovl_workdir(struct dentry *dentry);
 int ovl_want_write(struct dentry *dentry);
diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c
index adcb1398c..fdaf28f75 100644
--- a/fs/overlayfs/readdir.c
+++ b/fs/overlayfs/readdir.c
@@ -228,7 +228,7 @@ static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd)
 				dput(dentry);
 			}
 		}
-		mutex_unlock(&dir->d_inode->i_mutex);
+		inode_unlock(dir->d_inode);
 	}
 	revert_creds(old_cred);
 	put_cred(override_cred);
@@ -399,7 +399,7 @@ static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
 	loff_t res;
 	struct ovl_dir_file *od = file->private_data;
 
-	mutex_lock(&file_inode(file)->i_mutex);
+	inode_lock(file_inode(file));
 	if (!file->f_pos)
 		ovl_dir_reset(file);
 
@@ -429,7 +429,7 @@ static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin)
 		res = offset;
 	}
 out_unlock:
-	mutex_unlock(&file_inode(file)->i_mutex);
+	inode_unlock(file_inode(file));
 
 	return res;
 }
@@ -454,10 +454,10 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
 			ovl_path_upper(dentry, &upperpath);
 			realfile = ovl_path_open(&upperpath, O_RDONLY);
 			smp_mb__before_spinlock();
-			mutex_lock(&inode->i_mutex);
+			inode_lock(inode);
 			if (!od->upperfile) {
 				if (IS_ERR(realfile)) {
-					mutex_unlock(&inode->i_mutex);
+					inode_unlock(inode);
 					return PTR_ERR(realfile);
 				}
 				od->upperfile = realfile;
@@ -467,7 +467,7 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end,
 					fput(realfile);
 				realfile = od->upperfile;
 			}
-			mutex_unlock(&inode->i_mutex);
+			inode_unlock(inode);
 		}
 	}
 
@@ -479,9 +479,9 @@ static int ovl_dir_release(struct inode *inode, struct file *file)
 	struct ovl_dir_file *od = file->private_data;
 
 	if (od->cache) {
-		mutex_lock(&inode->i_mutex);
+		inode_lock(inode);
 		ovl_cache_put(od, file->f_path.dentry);
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 	}
 	fput(od->realfile);
 	if (od->upperfile)
@@ -557,7 +557,7 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
 {
 	struct ovl_cache_entry *p;
 
-	mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_CHILD);
+	inode_lock_nested(upper->d_inode, I_MUTEX_CHILD);
 	list_for_each_entry(p, list, l_node) {
 		struct dentry *dentry;
 
@@ -575,5 +575,5 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list)
 			ovl_cleanup(upper->d_inode, dentry);
 		dput(dentry);
 	}
-	mutex_unlock(&upper->d_inode->i_mutex);
+	inode_unlock(upper->d_inode);
 }
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index f42c9407f..619ad4b01 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -16,6 +16,7 @@
 #include <linux/slab.h>
 #include <linux/parser.h>
 #include <linux/module.h>
+#include <linux/pagemap.h>
 #include <linux/sched.h>
 #include <linux/statfs.h>
 #include <linux/seq_file.h>
@@ -25,12 +26,11 @@ MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
 MODULE_DESCRIPTION("Overlay filesystem");
 MODULE_LICENSE("GPL");
 
-#define OVERLAYFS_SUPER_MAGIC 0x794c7630
-
 struct ovl_config {
 	char *lowerdir;
 	char *upperdir;
 	char *workdir;
+	bool default_permissions;
 };
 
 /* private information held for overlayfs's superblock */
@@ -76,12 +76,14 @@ enum ovl_path_type ovl_path_type(struct dentry *dentry)
 	if (oe->__upperdentry) {
 		type = __OVL_PATH_UPPER;
 
-		if (oe->numlower) {
-			if (S_ISDIR(dentry->d_inode->i_mode))
-				type |= __OVL_PATH_MERGE;
-		} else if (!oe->opaque) {
+		/*
+		 * Non-dir dentry can hold lower dentry from previous
+		 * location. Its purity depends only on opaque flag.
+		 */
+		if (oe->numlower && S_ISDIR(dentry->d_inode->i_mode))
+			type |= __OVL_PATH_MERGE;
+		else if (!oe->opaque)
 			type |= __OVL_PATH_PURE;
-		}
 	} else {
 		if (oe->numlower > 1)
 			type |= __OVL_PATH_MERGE;
@@ -155,6 +157,18 @@ struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper)
 	return realdentry;
 }
 
+struct vfsmount *ovl_entry_mnt_real(struct ovl_entry *oe, struct inode *inode,
+				    bool is_upper)
+{
+	if (is_upper) {
+		struct ovl_fs *ofs = inode->i_sb->s_fs_info;
+
+		return ofs->upper_mnt;
+	} else {
+		return oe->numlower ? oe->lowerstack[0].mnt : NULL;
+	}
+}
+
 struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry)
 {
 	struct ovl_entry *oe = dentry->d_fsdata;
@@ -162,6 +176,13 @@ struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry)
 	return oe->cache;
 }
 
+bool ovl_is_default_permissions(struct inode *inode)
+{
+	struct ovl_fs *ofs = inode->i_sb->s_fs_info;
+
+	return ofs->config.default_permissions;
+}
+
 void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache)
 {
 	struct ovl_entry *oe = dentry->d_fsdata;
@@ -210,7 +231,7 @@ void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry)
 {
 	struct ovl_entry *oe = dentry->d_fsdata;
 
-	WARN_ON(!mutex_is_locked(&upperdentry->d_parent->d_inode->i_mutex));
+	WARN_ON(!inode_is_locked(upperdentry->d_parent->d_inode));
 	WARN_ON(oe->__upperdentry);
 	BUG_ON(!upperdentry->d_inode);
 	/*
@@ -225,7 +246,7 @@ void ovl_dentry_version_inc(struct dentry *dentry)
 {
 	struct ovl_entry *oe = dentry->d_fsdata;
 
-	WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
+	WARN_ON(!inode_is_locked(dentry->d_inode));
 	oe->version++;
 }
 
@@ -233,7 +254,7 @@ u64 ovl_dentry_version_get(struct dentry *dentry)
 {
 	struct ovl_entry *oe = dentry->d_fsdata;
 
-	WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex));
+	WARN_ON(!inode_is_locked(dentry->d_inode));
 	return oe->version;
 }
 
@@ -322,6 +343,7 @@ static const struct dentry_operations ovl_dentry_operations = {
 
 static const struct dentry_operations ovl_reval_dentry_operations = {
 	.d_release = ovl_dentry_release,
+	.d_select_inode = ovl_d_select_inode,
 	.d_revalidate = ovl_dentry_revalidate,
 	.d_weak_revalidate = ovl_dentry_weak_revalidate,
 };
@@ -356,9 +378,9 @@ static inline struct dentry *ovl_lookup_real(struct dentry *dir,
 {
 	struct dentry *dentry;
 
-	mutex_lock(&dir->d_inode->i_mutex);
+	inode_lock(dir->d_inode);
 	dentry = lookup_one_len(name->name, dir, name->len);
-	mutex_unlock(&dir->d_inode->i_mutex);
+	inode_unlock(dir->d_inode);
 
 	if (IS_ERR(dentry)) {
 		if (PTR_ERR(dentry) == -ENOENT)
@@ -595,6 +617,8 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry)
 		seq_show_option(m, "upperdir", ufs->config.upperdir);
 		seq_show_option(m, "workdir", ufs->config.workdir);
 	}
+	if (ufs->config.default_permissions)
+		seq_puts(m, ",default_permissions");
 	return 0;
 }
 
@@ -619,6 +643,7 @@ enum {
 	OPT_LOWERDIR,
 	OPT_UPPERDIR,
 	OPT_WORKDIR,
+	OPT_DEFAULT_PERMISSIONS,
 	OPT_ERR,
 };
 
@@ -626,6 +651,7 @@ static const match_table_t ovl_tokens = {
 	{OPT_LOWERDIR,			"lowerdir=%s"},
 	{OPT_UPPERDIR,			"upperdir=%s"},
 	{OPT_WORKDIR,			"workdir=%s"},
+	{OPT_DEFAULT_PERMISSIONS,	"default_permissions"},
 	{OPT_ERR,			NULL}
 };
 
@@ -686,6 +712,10 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config)
 				return -ENOMEM;
 			break;
 
+		case OPT_DEFAULT_PERMISSIONS:
+			config->default_permissions = true;
+			break;
+
 		default:
 			pr_err("overlayfs: unrecognized mount option \"%s\" or missing value\n", p);
 			return -EINVAL;
@@ -717,7 +747,7 @@ static struct dentry *ovl_workdir_create(struct vfsmount *mnt,
 	if (err)
 		return ERR_PTR(err);
 
-	mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+	inode_lock_nested(dir, I_MUTEX_PARENT);
 retry:
 	work = lookup_one_len(OVL_WORKDIR_NAME, dentry,
 			      strlen(OVL_WORKDIR_NAME));
@@ -743,7 +773,7 @@ retry:
 			goto out_dput;
 	}
 out_unlock:
-	mutex_unlock(&dir->i_mutex);
+	inode_unlock(dir);
 	mnt_drop_write(mnt);
 
 	return work;
diff --git a/fs/pipe.c b/fs/pipe.c
index 42cf8ddf0..ab8dad3cc 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -38,6 +38,12 @@ unsigned int pipe_max_size = 1048576;
  */
 unsigned int pipe_min_size = PAGE_SIZE;
 
+/* Maximum allocatable pages per user. Hard limit is unset by default, soft
+ * matches default values.
+ */
+unsigned long pipe_user_pages_hard;
+unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR;
+
 /*
  * We use a start+len construction, which provides full use of the 
  * allocated memory.
@@ -583,20 +589,49 @@ pipe_fasync(int fd, struct file *filp, int on)
 	return retval;
 }
 
+static void account_pipe_buffers(struct pipe_inode_info *pipe,
+                                 unsigned long old, unsigned long new)
+{
+	atomic_long_add(new - old, &pipe->user->pipe_bufs);
+}
+
+static bool too_many_pipe_buffers_soft(struct user_struct *user)
+{
+	return pipe_user_pages_soft &&
+	       atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_soft;
+}
+
+static bool too_many_pipe_buffers_hard(struct user_struct *user)
+{
+	return pipe_user_pages_hard &&
+	       atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_hard;
+}
+
 struct pipe_inode_info *alloc_pipe_info(void)
 {
 	struct pipe_inode_info *pipe;
 
 	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
 	if (pipe) {
-		pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL);
+		unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
+		struct user_struct *user = get_current_user();
+
+		if (!too_many_pipe_buffers_hard(user)) {
+			if (too_many_pipe_buffers_soft(user))
+				pipe_bufs = 1;
+			pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * pipe_bufs, GFP_KERNEL);
+		}
+
 		if (pipe->bufs) {
 			init_waitqueue_head(&pipe->wait);
 			pipe->r_counter = pipe->w_counter = 1;
-			pipe->buffers = PIPE_DEF_BUFFERS;
+			pipe->buffers = pipe_bufs;
+			pipe->user = user;
+			account_pipe_buffers(pipe, 0, pipe_bufs);
 			mutex_init(&pipe->mutex);
 			return pipe;
 		}
+		free_uid(user);
 		kfree(pipe);
 	}
 
@@ -607,6 +642,8 @@ void free_pipe_info(struct pipe_inode_info *pipe)
 {
 	int i;
 
+	account_pipe_buffers(pipe, pipe->buffers, 0);
+	free_uid(pipe->user);
 	for (i = 0; i < pipe->buffers; i++) {
 		struct pipe_buffer *buf = pipe->bufs + i;
 		if (buf->ops)
@@ -998,6 +1035,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages)
 			memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer));
 	}
 
+	account_pipe_buffers(pipe, pipe->buffers, nr_pages);
 	pipe->curbuf = 0;
 	kfree(pipe->bufs);
 	pipe->bufs = bufs;
@@ -1069,6 +1107,11 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
 		if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) {
 			ret = -EPERM;
 			goto out;
+		} else if ((too_many_pipe_buffers_hard(pipe->user) ||
+			    too_many_pipe_buffers_soft(pipe->user)) &&
+		           !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) {
+			ret = -EPERM;
+			goto out;
 		}
 		ret = pipe_set_size(pipe, nr_pages);
 		break;
diff --git a/fs/pnode.c b/fs/pnode.c
index 6367e1e43..c524fdddc 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -202,6 +202,11 @@ static struct mount *last_dest, *last_source, *dest_master;
 static struct mountpoint *mp;
 static struct hlist_head *list;
 
+static inline bool peers(struct mount *m1, struct mount *m2)
+{
+	return m1->mnt_group_id == m2->mnt_group_id && m1->mnt_group_id;
+}
+
 static int propagate_one(struct mount *m)
 {
 	struct mount *child;
@@ -212,7 +217,7 @@ static int propagate_one(struct mount *m)
 	/* skip if mountpoint isn't covered by it */
 	if (!is_subdir(mp->m_dentry, m->mnt.mnt_root))
 		return 0;
-	if (m->mnt_group_id == last_dest->mnt_group_id) {
+	if (peers(m, last_dest)) {
 		type = CL_MAKE_SHARED;
 	} else {
 		struct mount *n, *p;
@@ -223,7 +228,7 @@ static int propagate_one(struct mount *m)
 					last_source = last_source->mnt_master;
 					last_dest = last_source->mnt_parent;
 				}
-				if (n->mnt_group_id != last_dest->mnt_group_id) {
+				if (!peers(n, last_dest)) {
 					last_source = last_source->mnt_master;
 					last_dest = last_source->mnt_parent;
 				}
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 4adde1e2c..711dd5170 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -769,8 +769,6 @@ posix_acl_xattr_get(const struct xattr_handler *handler,
 	struct posix_acl *acl;
 	int error;
 
-	if (strcmp(name, "") != 0)
-		return -EINVAL;
 	if (!IS_POSIXACL(d_backing_inode(dentry)))
 		return -EOPNOTSUPP;
 	if (d_is_symlink(dentry))
@@ -797,8 +795,6 @@ posix_acl_xattr_set(const struct xattr_handler *handler,
 	struct posix_acl *acl = NULL;
 	int ret;
 
-	if (strcmp(name, "") != 0)
-		return -EINVAL;
 	if (!IS_POSIXACL(inode))
 		return -EOPNOTSUPP;
 	if (!inode->i_op->set_acl)
@@ -827,25 +823,14 @@ out:
 	return ret;
 }
 
-static size_t
-posix_acl_xattr_list(const struct xattr_handler *handler,
-		     struct dentry *dentry, char *list, size_t list_size,
-		     const char *name, size_t name_len)
+static bool
+posix_acl_xattr_list(struct dentry *dentry)
 {
-	const char *xname = handler->prefix;
-	size_t size;
-
-	if (!IS_POSIXACL(d_backing_inode(dentry)))
-		return 0;
-
-	size = strlen(xname) + 1;
-	if (list && size <= list_size)
-		memcpy(list, xname, size);
-	return size;
+	return IS_POSIXACL(d_backing_inode(dentry));
 }
 
 const struct xattr_handler posix_acl_access_xattr_handler = {
-	.prefix = POSIX_ACL_XATTR_ACCESS,
+	.name = XATTR_NAME_POSIX_ACL_ACCESS,
 	.flags = ACL_TYPE_ACCESS,
 	.list = posix_acl_xattr_list,
 	.get = posix_acl_xattr_get,
@@ -854,7 +839,7 @@ const struct xattr_handler posix_acl_access_xattr_handler = {
 EXPORT_SYMBOL_GPL(posix_acl_access_xattr_handler);
 
 const struct xattr_handler posix_acl_default_xattr_handler = {
-	.prefix = POSIX_ACL_XATTR_DEFAULT,
+	.name = XATTR_NAME_POSIX_ACL_DEFAULT,
 	.flags = ACL_TYPE_DEFAULT,
 	.list = posix_acl_xattr_list,
 	.get = posix_acl_xattr_get,
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 8be0e4cd2..4f764c2ac 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -953,6 +953,7 @@ static ssize_t environ_read(struct file *file, char __user *buf,
 	unsigned long src = *ppos;
 	int ret = 0;
 	struct mm_struct *mm = file->private_data;
+	unsigned long env_start, env_end;
 
 	if (!mm)
 		return 0;
@@ -964,19 +965,25 @@ static ssize_t environ_read(struct file *file, char __user *buf,
 	ret = 0;
 	if (!atomic_inc_not_zero(&mm->mm_users))
 		goto free;
+
+	down_read(&mm->mmap_sem);
+	env_start = mm->env_start;
+	env_end = mm->env_end;
+	up_read(&mm->mmap_sem);
+
 	while (count > 0) {
 		size_t this_len, max_len;
 		int retval;
 
-		if (src >= (mm->env_end - mm->env_start))
+		if (src >= (env_end - env_start))
 			break;
 
-		this_len = mm->env_end - (mm->env_start + src);
+		this_len = env_end - (env_start + src);
 
 		max_len = min_t(size_t, PAGE_SIZE, count);
 		this_len = min(max_len, this_len);
 
-		retval = access_remote_vm(mm, (mm->env_start + src),
+		retval = access_remote_vm(mm, (env_start + src),
 			page, this_len, 0);
 
 		if (retval <= 0) {
@@ -1565,12 +1572,16 @@ static int proc_exe_link(struct dentry *dentry, struct path *exe_path)
 		return -ENOENT;
 }
 
-static const char *proc_pid_follow_link(struct dentry *dentry, void **cookie)
+static const char *proc_pid_get_link(struct dentry *dentry,
+				     struct inode *inode,
+				     struct delayed_call *done)
 {
-	struct inode *inode = d_inode(dentry);
 	struct path path;
 	int error = -EACCES;
 
+	if (!dentry)
+		return ERR_PTR(-ECHILD);
+
 	/* Are we allowed to snoop on the tasks file descriptors? */
 	if (!proc_fd_access_allowed(inode))
 		goto out;
@@ -1631,7 +1642,7 @@ out:
 
 const struct inode_operations proc_pid_link_inode_operations = {
 	.readlink	= proc_pid_readlink,
-	.follow_link	= proc_pid_follow_link,
+	.get_link	= proc_pid_get_link,
 	.setattr	= proc_setattr,
 };
 
@@ -1896,7 +1907,7 @@ static const struct dentry_operations tid_map_files_dentry_operations = {
 	.d_delete	= pid_delete_dentry,
 };
 
-static int proc_map_files_get_link(struct dentry *dentry, struct path *path)
+static int map_files_get_link(struct dentry *dentry, struct path *path)
 {
 	unsigned long vm_start, vm_end;
 	struct vm_area_struct *vma;
@@ -1922,7 +1933,7 @@ static int proc_map_files_get_link(struct dentry *dentry, struct path *path)
 	down_read(&mm->mmap_sem);
 	vma = find_exact_vma(mm, vm_start, vm_end);
 	if (vma && vma->vm_file) {
-		*path = vma_pr_or_file(vma)->f_path;
+		*path = vma->vm_file->f_path;
 		path_get(path);
 		rc = 0;
 	}
@@ -1946,20 +1957,22 @@ struct map_files_info {
  * path to the file in question.
  */
 static const char *
-proc_map_files_follow_link(struct dentry *dentry, void **cookie)
+proc_map_files_get_link(struct dentry *dentry,
+			struct inode *inode,
+		        struct delayed_call *done)
 {
 	if (!capable(CAP_SYS_ADMIN))
 		return ERR_PTR(-EPERM);
 
-	return proc_pid_follow_link(dentry, NULL);
+	return proc_pid_get_link(dentry, inode, done);
 }
 
 /*
- * Identical to proc_pid_link_inode_operations except for follow_link()
+ * Identical to proc_pid_link_inode_operations except for get_link()
  */
 static const struct inode_operations proc_map_files_link_inode_operations = {
 	.readlink	= proc_pid_readlink,
-	.follow_link	= proc_map_files_follow_link,
+	.get_link	= proc_map_files_get_link,
 	.setattr	= proc_setattr,
 };
 
@@ -1976,7 +1989,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry,
 		return -ENOENT;
 
 	ei = PROC_I(inode);
-	ei->op.proc_get_link = proc_map_files_get_link;
+	ei->op.proc_get_link = map_files_get_link;
 
 	inode->i_op = &proc_map_files_link_inode_operations;
 	inode->i_size = 64;
@@ -2360,7 +2373,7 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
 				   size_t count, loff_t *ppos)
 {
 	struct inode * inode = file_inode(file);
-	char *page;
+	void *page;
 	ssize_t length;
 	struct task_struct *task = get_proc_task(inode);
 
@@ -2375,14 +2388,11 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
 	if (*ppos != 0)
 		goto out;
 
-	length = -ENOMEM;
-	page = (char*)__get_free_page(GFP_TEMPORARY);
-	if (!page)
+	page = memdup_user(buf, count);
+	if (IS_ERR(page)) {
+		length = PTR_ERR(page);
 		goto out;
-
-	length = -EFAULT;
-	if (copy_from_user(page, buf, count))
-		goto out_free;
+	}
 
 	/* Guard against adverse ptrace interaction */
 	length = mutex_lock_interruptible(&task->signal->cred_guard_mutex);
@@ -2391,10 +2401,10 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
 
 	length = security_setprocattr(task,
 				      (char*)file->f_path.dentry->d_name.name,
-				      (void*)page, count);
+				      page, count);
 	mutex_unlock(&task->signal->cred_guard_mutex);
 out_free:
-	free_page((unsigned long) page);
+	kfree(page);
 out:
 	put_task_struct(task);
 out_no_task:
diff --git a/fs/proc/fd.c b/fs/proc/fd.c
index 3c2a915c6..56afa5ef0 100644
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -258,6 +258,7 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx,
 				     name, len, instantiate, p,
 				     (void *)(unsigned long)fd))
 			goto out_fd_loop;
+		cond_resched();
 		rcu_read_lock();
 	}
 	rcu_read_unlock();
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index bd95b9fde..42305ddcb 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -95,7 +95,8 @@ void __init proc_init_inodecache(void)
 	proc_inode_cachep = kmem_cache_create("proc_inode_cache",
 					     sizeof(struct proc_inode),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD|SLAB_PANIC),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT|
+						SLAB_PANIC),
 					     init_once);
 }
 
@@ -393,24 +394,25 @@ static const struct file_operations proc_reg_file_ops_no_compat = {
 };
 #endif
 
-static const char *proc_follow_link(struct dentry *dentry, void **cookie)
+static void proc_put_link(void *p)
 {
-	struct proc_dir_entry *pde = PDE(d_inode(dentry));
-	if (unlikely(!use_pde(pde)))
-		return ERR_PTR(-EINVAL);
-	*cookie = pde;
-	return pde->data;
+	unuse_pde(p);
 }
 
-static void proc_put_link(struct inode *unused, void *p)
+static const char *proc_get_link(struct dentry *dentry,
+				 struct inode *inode,
+				 struct delayed_call *done)
 {
-	unuse_pde(p);
+	struct proc_dir_entry *pde = PDE(inode);
+	if (unlikely(!use_pde(pde)))
+		return ERR_PTR(-EINVAL);
+	set_delayed_call(done, proc_put_link, pde);
+	return pde->data;
 }
 
 const struct inode_operations proc_link_inode_operations = {
 	.readlink	= generic_readlink,
-	.follow_link	= proc_follow_link,
-	.put_link	= proc_put_link,
+	.get_link	= proc_get_link,
 };
 
 struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index 92e6726f6..a939f5ed7 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -552,9 +552,9 @@ static int open_kcore(struct inode *inode, struct file *filp)
 	if (kcore_need_update)
 		kcore_update_ram();
 	if (i_size_read(inode) != proc_root_kcore->size) {
-		mutex_lock(&inode->i_mutex);
+		inode_lock(inode);
 		i_size_write(inode, proc_root_kcore->size);
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 	}
 	return 0;
 }
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index ae836fbb0..70b9f953b 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -57,11 +57,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 	/*
 	 * Estimate the amount of memory available for userspace allocations,
 	 * without causing swapping.
-	 *
-	 * Free memory cannot be taken below the low watermark, before the
-	 * system starts swapping.
 	 */
-	available = i.freeram - wmark_low;
+	available = i.freeram - totalreserve_pages;
 
 	/*
 	 * Not all the page cache can be freed, otherwise the system will
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 1b0ea4a5d..276f12431 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -30,14 +30,18 @@ static const struct proc_ns_operations *ns_entries[] = {
 	&mntns_operations,
 };
 
-static const char *proc_ns_follow_link(struct dentry *dentry, void **cookie)
+static const char *proc_ns_get_link(struct dentry *dentry,
+				    struct inode *inode,
+				    struct delayed_call *done)
 {
-	struct inode *inode = d_inode(dentry);
 	const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops;
 	struct task_struct *task;
 	struct path ns_path;
 	void *error = ERR_PTR(-EACCES);
 
+	if (!dentry)
+		return ERR_PTR(-ECHILD);
+
 	task = get_proc_task(inode);
 	if (!task)
 		return error;
@@ -74,7 +78,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl
 
 static const struct inode_operations proc_ns_link_inode_operations = {
 	.readlink	= proc_ns_readlink,
-	.follow_link	= proc_ns_follow_link,
+	.get_link	= proc_ns_get_link,
 	.setattr	= proc_setattr,
 };
 
diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c
index cb8eda060..f8595e8b5 100644
--- a/fs/proc/nommu.c
+++ b/fs/proc/nommu.c
@@ -45,10 +45,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region)
 	file = region->vm_file;
 
 	if (file) {
-		struct inode *inode;
-
-		file = vmr_pr_or_file(region);
-		inode = file_inode(file);
+		struct inode *inode = file_inode(region->vm_file);
 		dev = inode->i_sb->s_dev;
 		ino = inode->i_ino;
 	}
diff --git a/fs/proc/page.c b/fs/proc/page.c
index 93484034a..b2855eea5 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -103,9 +103,9 @@ u64 stable_page_flags(struct page *page)
 	 * pseudo flags for the well known (anonymous) memory mapped pages
 	 *
 	 * Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the
-	 * simple test in page_mapped() is not enough.
+	 * simple test in page_mapcount() is not enough.
 	 */
-	if (!PageSlab(page) && page_mapped(page))
+	if (!PageSlab(page) && page_mapcount(page))
 		u |= 1 << KPF_MMAP;
 	if (PageAnon(page))
 		u |= 1 << KPF_ANON;
diff --git a/fs/proc/self.c b/fs/proc/self.c
index 113b8d061..b6a8d3529 100644
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -18,26 +18,28 @@ static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
 	return readlink_copy(buffer, buflen, tmp);
 }
 
-static const char *proc_self_follow_link(struct dentry *dentry, void **cookie)
+static const char *proc_self_get_link(struct dentry *dentry,
+				      struct inode *inode,
+				      struct delayed_call *done)
 {
-	struct pid_namespace *ns = dentry->d_sb->s_fs_info;
+	struct pid_namespace *ns = inode->i_sb->s_fs_info;
 	pid_t tgid = task_tgid_nr_ns(current, ns);
 	char *name;
 
 	if (!tgid)
 		return ERR_PTR(-ENOENT);
 	/* 11 for max length of signed int in decimal + NULL term */
-	name = kmalloc(12, GFP_KERNEL);
-	if (!name)
-		return ERR_PTR(-ENOMEM);
+	name = kmalloc(12, dentry ? GFP_KERNEL : GFP_ATOMIC);
+	if (unlikely(!name))
+		return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD);
 	sprintf(name, "%d", tgid);
-	return *cookie = name;
+	set_delayed_call(done, kfree_link, name);
+	return name;
 }
 
 static const struct inode_operations proc_self_inode_operations = {
 	.readlink	= proc_self_readlink,
-	.follow_link	= proc_self_follow_link,
-	.put_link	= kfree_put_link,
+	.get_link	= proc_self_get_link,
 };
 
 static unsigned self_inum;
@@ -48,7 +50,7 @@ int proc_setup_self(struct super_block *s)
 	struct pid_namespace *ns = s->s_fs_info;
 	struct dentry *self;
 	
-	mutex_lock(&root_inode->i_mutex);
+	inode_lock(root_inode);
 	self = d_alloc_name(s->s_root, "self");
 	if (self) {
 		struct inode *inode = new_inode_pseudo(s);
@@ -67,7 +69,7 @@ int proc_setup_self(struct super_block *s)
 	} else {
 		self = ERR_PTR(-ENOMEM);
 	}
-	mutex_unlock(&root_inode->i_mutex);
+	inode_unlock(root_inode);
 	if (IS_ERR(self)) {
 		pr_err("proc_fill_super: can't allocate /proc/self\n");
 		return PTR_ERR(self);
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index e4733feb0..fa95ab2d3 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -14,6 +14,7 @@
 #include <linux/swapops.h>
 #include <linux/mmu_notifier.h>
 #include <linux/page_idle.h>
+#include <linux/shmem_fs.h>
 
 #include <asm/elf.h>
 #include <asm/uaccess.h>
@@ -22,9 +23,13 @@
 
 void task_mem(struct seq_file *m, struct mm_struct *mm)
 {
-	unsigned long data, text, lib, swap, ptes, pmds;
+	unsigned long text, lib, swap, ptes, pmds, anon, file, shmem;
 	unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
 
+	anon = get_mm_counter(mm, MM_ANONPAGES);
+	file = get_mm_counter(mm, MM_FILEPAGES);
+	shmem = get_mm_counter(mm, MM_SHMEMPAGES);
+
 	/*
 	 * Note: to minimize their overhead, mm maintains hiwater_vm and
 	 * hiwater_rss only when about to *lower* total_vm or rss.  Any
@@ -35,11 +40,10 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 	hiwater_vm = total_vm = mm->total_vm;
 	if (hiwater_vm < mm->hiwater_vm)
 		hiwater_vm = mm->hiwater_vm;
-	hiwater_rss = total_rss = get_mm_rss(mm);
+	hiwater_rss = total_rss = anon + file + shmem;
 	if (hiwater_rss < mm->hiwater_rss)
 		hiwater_rss = mm->hiwater_rss;
 
-	data = mm->total_vm - mm->shared_vm - mm->stack_vm;
 	text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
 	lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
 	swap = get_mm_counter(mm, MM_SWAPENTS);
@@ -52,6 +56,9 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 		"VmPin:\t%8lu kB\n"
 		"VmHWM:\t%8lu kB\n"
 		"VmRSS:\t%8lu kB\n"
+		"RssAnon:\t%8lu kB\n"
+		"RssFile:\t%8lu kB\n"
+		"RssShmem:\t%8lu kB\n"
 		"VmData:\t%8lu kB\n"
 		"VmStk:\t%8lu kB\n"
 		"VmExe:\t%8lu kB\n"
@@ -65,7 +72,10 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 		mm->pinned_vm << (PAGE_SHIFT-10),
 		hiwater_rss << (PAGE_SHIFT-10),
 		total_rss << (PAGE_SHIFT-10),
-		data << (PAGE_SHIFT-10),
+		anon << (PAGE_SHIFT-10),
+		file << (PAGE_SHIFT-10),
+		shmem << (PAGE_SHIFT-10),
+		mm->data_vm << (PAGE_SHIFT-10),
 		mm->stack_vm << (PAGE_SHIFT-10), text, lib,
 		ptes >> 10,
 		pmds >> 10,
@@ -82,10 +92,11 @@ unsigned long task_statm(struct mm_struct *mm,
 			 unsigned long *shared, unsigned long *text,
 			 unsigned long *data, unsigned long *resident)
 {
-	*shared = get_mm_counter(mm, MM_FILEPAGES);
+	*shared = get_mm_counter(mm, MM_FILEPAGES) +
+			get_mm_counter(mm, MM_SHMEMPAGES);
 	*text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK))
 								>> PAGE_SHIFT;
-	*data = mm->total_vm - mm->shared_vm;
+	*data = mm->data_vm + mm->stack_vm;
 	*resident = *shared + get_mm_counter(mm, MM_ANONPAGES);
 	return mm->total_vm;
 }
@@ -248,23 +259,29 @@ static int do_maps_open(struct inode *inode, struct file *file,
 				sizeof(struct proc_maps_private));
 }
 
-static pid_t pid_of_stack(struct proc_maps_private *priv,
-				struct vm_area_struct *vma, bool is_pid)
+/*
+ * Indicate if the VMA is a stack for the given task; for
+ * /proc/PID/maps that is the stack of the main task.
+ */
+static int is_stack(struct proc_maps_private *priv,
+		    struct vm_area_struct *vma, int is_pid)
 {
-	struct inode *inode = priv->inode;
-	struct task_struct *task;
-	pid_t ret = 0;
+	int stack = 0;
 
-	rcu_read_lock();
-	task = pid_task(proc_pid(inode), PIDTYPE_PID);
-	if (task) {
-		task = task_of_stack(task, vma, is_pid);
+	if (is_pid) {
+		stack = vma->vm_start <= vma->vm_mm->start_stack &&
+			vma->vm_end >= vma->vm_mm->start_stack;
+	} else {
+		struct inode *inode = priv->inode;
+		struct task_struct *task;
+
+		rcu_read_lock();
+		task = pid_task(proc_pid(inode), PIDTYPE_PID);
 		if (task)
-			ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info);
+			stack = vma_is_stack_for_task(vma, task);
+		rcu_read_unlock();
 	}
-	rcu_read_unlock();
-
-	return ret;
+	return stack;
 }
 
 static void
@@ -281,10 +298,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
 	const char *name = NULL;
 
 	if (file) {
-		struct inode *inode;
-
-		file = vma_pr_or_file(vma);
-		inode = file_inode(file);
+		struct inode *inode = file_inode(vma->vm_file);
 		dev = inode->i_sb->s_dev;
 		ino = inode->i_ino;
 		pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
@@ -327,8 +341,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
 
 	name = arch_vma_name(vma);
 	if (!name) {
-		pid_t tid;
-
 		if (!mm) {
 			name = "[vdso]";
 			goto done;
@@ -340,21 +352,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
 			goto done;
 		}
 
-		tid = pid_of_stack(priv, vma, is_pid);
-		if (tid != 0) {
-			/*
-			 * Thread stack in /proc/PID/task/TID/maps or
-			 * the main process stack.
-			 */
-			if (!is_pid || (vma->vm_start <= mm->start_stack &&
-			    vma->vm_end >= mm->start_stack)) {
-				name = "[stack]";
-			} else {
-				/* Thread stack in /proc/PID/maps */
-				seq_pad(m, ' ');
-				seq_printf(m, "[stack:%d]", tid);
-			}
-		}
+		if (is_stack(priv, vma, is_pid))
+			name = "[stack]";
 	}
 
 done:
@@ -454,12 +453,14 @@ struct mem_size_stats {
 	unsigned long private_hugetlb;
 	u64 pss;
 	u64 swap_pss;
+	bool check_shmem_swap;
 };
 
 static void smaps_account(struct mem_size_stats *mss, struct page *page,
-		unsigned long size, bool young, bool dirty)
+		bool compound, bool young, bool dirty)
 {
-	int mapcount;
+	int i, nr = compound ? 1 << compound_order(page) : 1;
+	unsigned long size = nr * PAGE_SIZE;
 
 	if (PageAnon(page))
 		mss->anonymous += size;
@@ -468,26 +469,53 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page,
 	/* Accumulate the size in pages that have been accessed. */
 	if (young || page_is_young(page) || PageReferenced(page))
 		mss->referenced += size;
-	mapcount = page_mapcount(page);
-	if (mapcount >= 2) {
-		u64 pss_delta;
 
-		if (dirty || PageDirty(page))
-			mss->shared_dirty += size;
-		else
-			mss->shared_clean += size;
-		pss_delta = (u64)size << PSS_SHIFT;
-		do_div(pss_delta, mapcount);
-		mss->pss += pss_delta;
-	} else {
+	/*
+	 * page_count(page) == 1 guarantees the page is mapped exactly once.
+	 * If any subpage of the compound page mapped with PTE it would elevate
+	 * page_count().
+	 */
+	if (page_count(page) == 1) {
 		if (dirty || PageDirty(page))
 			mss->private_dirty += size;
 		else
 			mss->private_clean += size;
 		mss->pss += (u64)size << PSS_SHIFT;
+		return;
+	}
+
+	for (i = 0; i < nr; i++, page++) {
+		int mapcount = page_mapcount(page);
+
+		if (mapcount >= 2) {
+			if (dirty || PageDirty(page))
+				mss->shared_dirty += PAGE_SIZE;
+			else
+				mss->shared_clean += PAGE_SIZE;
+			mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount;
+		} else {
+			if (dirty || PageDirty(page))
+				mss->private_dirty += PAGE_SIZE;
+			else
+				mss->private_clean += PAGE_SIZE;
+			mss->pss += PAGE_SIZE << PSS_SHIFT;
+		}
 	}
 }
 
+#ifdef CONFIG_SHMEM
+static int smaps_pte_hole(unsigned long addr, unsigned long end,
+		struct mm_walk *walk)
+{
+	struct mem_size_stats *mss = walk->private;
+
+	mss->swap += shmem_partial_swap_usage(
+			walk->vma->vm_file->f_mapping, addr, end);
+
+	return 0;
+}
+#endif
+
 static void smaps_pte_entry(pte_t *pte, unsigned long addr,
 		struct mm_walk *walk)
 {
@@ -515,11 +543,25 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
 			}
 		} else if (is_migration_entry(swpent))
 			page = migration_entry_to_page(swpent);
+	} else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap
+							&& pte_none(*pte))) {
+		page = find_get_entry(vma->vm_file->f_mapping,
+						linear_page_index(vma, addr));
+		if (!page)
+			return;
+
+		if (radix_tree_exceptional_entry(page))
+			mss->swap += PAGE_SIZE;
+		else
+			page_cache_release(page);
+
+		return;
 	}
 
 	if (!page)
 		return;
-	smaps_account(mss, page, PAGE_SIZE, pte_young(*pte), pte_dirty(*pte));
+
+	smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte));
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -535,8 +577,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
 	if (IS_ERR_OR_NULL(page))
 		return;
 	mss->anonymous_thp += HPAGE_PMD_SIZE;
-	smaps_account(mss, page, HPAGE_PMD_SIZE,
-			pmd_young(*pmd), pmd_dirty(*pmd));
+	smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd));
 }
 #else
 static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
@@ -552,7 +593,8 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	pte_t *pte;
 	spinlock_t *ptl;
 
-	if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+	ptl = pmd_trans_huge_lock(pmd, vma);
+	if (ptl) {
 		smaps_pmd_entry(pmd, addr, walk);
 		spin_unlock(ptl);
 		return 0;
@@ -674,6 +716,31 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
 	};
 
 	memset(&mss, 0, sizeof mss);
+
+#ifdef CONFIG_SHMEM
+	if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) {
+		/*
+		 * For shared or readonly shmem mappings we know that all
+		 * swapped out pages belong to the shmem object, and we can
+		 * obtain the swap value much more efficiently. For private
+		 * writable mappings, we might have COW pages that are
+		 * not affected by the parent swapped out pages of the shmem
+		 * object, so we have to distinguish them during the page walk.
+		 * Unless we know that the shmem object (or the part mapped by
+		 * our VMA) has no swapped out pages at all.
+		 */
+		unsigned long shmem_swapped = shmem_swap_usage(vma);
+
+		if (!shmem_swapped || (vma->vm_flags & VM_SHARED) ||
+					!(vma->vm_flags & VM_WRITE)) {
+			mss.swap = shmem_swapped;
+		} else {
+			mss.check_shmem_swap = true;
+			smaps_walk.pte_hole = smaps_pte_hole;
+		}
+	}
+#endif
+
 	/* mmap_sem is held in m_start */
 	walk_page_vma(vma, &smaps_walk);
 
@@ -820,9 +887,6 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
 	pmd = pmd_wrprotect(pmd);
 	pmd = pmd_clear_soft_dirty(pmd);
 
-	if (vma->vm_flags & VM_SOFTDIRTY)
-		vma->vm_flags &= ~VM_SOFTDIRTY;
-
 	set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
 }
 #else
@@ -841,7 +905,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 	spinlock_t *ptl;
 	struct page *page;
 
-	if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+	ptl = pmd_trans_huge_lock(pmd, vma);
+	if (ptl) {
 		if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
 			clear_soft_dirty_pmd(vma, addr, pmd);
 			goto out;
@@ -1115,7 +1180,8 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
 	int err = 0;
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	if (pmd_trans_huge_lock(pmdp, vma, &ptl) == 1) {
+	ptl = pmd_trans_huge_lock(pmdp, vma);
+	if (ptl) {
 		u64 flags = 0, frame = 0;
 		pmd_t pmd = *pmdp;
 
@@ -1447,7 +1513,8 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
 	pte_t *orig_pte;
 	pte_t *pte;
 
-	if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+	ptl = pmd_trans_huge_lock(pmd, vma);
+	if (ptl) {
 		pte_t huge_pte = *(pte_t *)pmd;
 		struct page *page;
 
@@ -1509,7 +1576,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
 	struct proc_maps_private *proc_priv = &numa_priv->proc_maps;
 	struct vm_area_struct *vma = v;
 	struct numa_maps *md = &numa_priv->md;
-	struct file *file = vma_pr_or_file(vma);
+	struct file *file = vma->vm_file;
 	struct mm_struct *mm = vma->vm_mm;
 	struct mm_walk walk = {
 		.hugetlb_entry = gather_hugetlb_stats,
@@ -1542,19 +1609,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
 		seq_file_path(m, file, "\n\t= ");
 	} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
 		seq_puts(m, " heap");
-	} else {
-		pid_t tid = pid_of_stack(proc_priv, vma, is_pid);
-		if (tid != 0) {
-			/*
-			 * Thread stack in /proc/PID/task/TID/maps or
-			 * the main process stack.
-			 */
-			if (!is_pid || (vma->vm_start <= mm->start_stack &&
-			    vma->vm_end >= mm->start_stack))
-				seq_puts(m, " stack");
-			else
-				seq_printf(m, " stack:%d", tid);
-		}
+	} else if (is_stack(proc_priv, vma, is_pid)) {
+		seq_puts(m, " stack");
 	}
 
 	if (is_vm_hugetlb_page(vma))
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 7aa92dbf9..faacb0c0d 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -123,23 +123,26 @@ unsigned long task_statm(struct mm_struct *mm,
 	return size;
 }
 
-static pid_t pid_of_stack(struct proc_maps_private *priv,
-				struct vm_area_struct *vma, bool is_pid)
+static int is_stack(struct proc_maps_private *priv,
+		    struct vm_area_struct *vma, int is_pid)
 {
-	struct inode *inode = priv->inode;
-	struct task_struct *task;
-	pid_t ret = 0;
-
-	rcu_read_lock();
-	task = pid_task(proc_pid(inode), PIDTYPE_PID);
-	if (task) {
-		task = task_of_stack(task, vma, is_pid);
+	struct mm_struct *mm = vma->vm_mm;
+	int stack = 0;
+
+	if (is_pid) {
+		stack = vma->vm_start <= mm->start_stack &&
+			vma->vm_end >= mm->start_stack;
+	} else {
+		struct inode *inode = priv->inode;
+		struct task_struct *task;
+
+		rcu_read_lock();
+		task = pid_task(proc_pid(inode), PIDTYPE_PID);
 		if (task)
-			ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info);
+			stack = vma_is_stack_for_task(vma, task);
+		rcu_read_unlock();
 	}
-	rcu_read_unlock();
-
-	return ret;
+	return stack;
 }
 
 /*
@@ -160,10 +163,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
 	file = vma->vm_file;
 
 	if (file) {
-		struct inode *inode;
-
-		file = vma_pr_or_file(vma);
-		inode = file_inode(file);
+		struct inode *inode = file_inode(vma->vm_file);
 		dev = inode->i_sb->s_dev;
 		ino = inode->i_ino;
 		pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
@@ -184,21 +184,9 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma,
 	if (file) {
 		seq_pad(m, ' ');
 		seq_file_path(m, file, "");
-	} else if (mm) {
-		pid_t tid = pid_of_stack(priv, vma, is_pid);
-
-		if (tid != 0) {
-			seq_pad(m, ' ');
-			/*
-			 * Thread stack in /proc/PID/task/TID/maps or
-			 * the main process stack.
-			 */
-			if (!is_pid || (vma->vm_start <= mm->start_stack &&
-			    vma->vm_end >= mm->start_stack))
-				seq_printf(m, "[stack]");
-			else
-				seq_printf(m, "[stack:%d]", tid);
-		}
+	} else if (mm && is_stack(priv, vma, is_pid)) {
+		seq_pad(m, ' ');
+		seq_printf(m, "[stack]");
 	}
 
 	seq_putc(m, '\n');
diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c
index 947b0f4fd..e58a31e8f 100644
--- a/fs/proc/thread_self.c
+++ b/fs/proc/thread_self.c
@@ -19,26 +19,29 @@ static int proc_thread_self_readlink(struct dentry *dentry, char __user *buffer,
 	return readlink_copy(buffer, buflen, tmp);
 }
 
-static const char *proc_thread_self_follow_link(struct dentry *dentry, void **cookie)
+static const char *proc_thread_self_get_link(struct dentry *dentry,
+					     struct inode *inode,
+					     struct delayed_call *done)
 {
-	struct pid_namespace *ns = dentry->d_sb->s_fs_info;
+	struct pid_namespace *ns = inode->i_sb->s_fs_info;
 	pid_t tgid = task_tgid_nr_ns(current, ns);
 	pid_t pid = task_pid_nr_ns(current, ns);
 	char *name;
 
 	if (!pid)
 		return ERR_PTR(-ENOENT);
-	name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF, GFP_KERNEL);
-	if (!name)
-		return ERR_PTR(-ENOMEM);
+	name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF,
+				dentry ? GFP_KERNEL : GFP_ATOMIC);
+	if (unlikely(!name))
+		return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD);
 	sprintf(name, "%d/task/%d", tgid, pid);
-	return *cookie = name;
+	set_delayed_call(done, kfree_link, name);
+	return name;
 }
 
 static const struct inode_operations proc_thread_self_inode_operations = {
 	.readlink	= proc_thread_self_readlink,
-	.follow_link	= proc_thread_self_follow_link,
-	.put_link	= kfree_put_link,
+	.get_link	= proc_thread_self_get_link,
 };
 
 static unsigned thread_self_inum;
@@ -49,7 +52,7 @@ int proc_setup_thread_self(struct super_block *s)
 	struct pid_namespace *ns = s->s_fs_info;
 	struct dentry *thread_self;
 
-	mutex_lock(&root_inode->i_mutex);
+	inode_lock(root_inode);
 	thread_self = d_alloc_name(s->s_root, "thread-self");
 	if (thread_self) {
 		struct inode *inode = new_inode_pseudo(s);
@@ -68,7 +71,7 @@ int proc_setup_thread_self(struct super_block *s)
 	} else {
 		thread_self = ERR_PTR(-ENOMEM);
 	}
-	mutex_unlock(&root_inode->i_mutex);
+	inode_unlock(root_inode);
 	if (IS_ERR(thread_self)) {
 		pr_err("proc_fill_super: can't allocate /proc/thread_self\n");
 		return PTR_ERR(thread_self);
diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c
index 8ebd9a334..2256e7e23 100644
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -95,9 +95,9 @@ static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt)
 {
 	struct proc_mounts *p = m->private;
 	struct mount *r = real_mount(mnt);
-	int err = 0;
 	struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
 	struct super_block *sb = mnt_path.dentry->d_sb;
+	int err;
 
 	if (sb->s_op->show_devname) {
 		err = sb->s_op->show_devname(m, mnt_path.dentry);
@@ -131,16 +131,17 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
 	struct mount *r = real_mount(mnt);
 	struct super_block *sb = mnt->mnt_sb;
 	struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
-	int err = 0;
+	int err;
 
 	seq_printf(m, "%i %i %u:%u ", r->mnt_id, r->mnt_parent->mnt_id,
 		   MAJOR(sb->s_dev), MINOR(sb->s_dev));
-	if (sb->s_op->show_path)
+	if (sb->s_op->show_path) {
 		err = sb->s_op->show_path(m, mnt->mnt_root);
-	else
+		if (err)
+			goto out;
+	} else {
 		seq_dentry(m, mnt->mnt_root, " \t\n\\");
-	if (err)
-		goto out;
+	}
 	seq_putc(m, ' ');
 
 	/* mountpoints outside of chroot jail will give SEQ_SKIP on this */
@@ -168,12 +169,13 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
 	seq_puts(m, " - ");
 	show_type(m, sb);
 	seq_putc(m, ' ');
-	if (sb->s_op->show_devname)
+	if (sb->s_op->show_devname) {
 		err = sb->s_op->show_devname(m, mnt->mnt_root);
-	else
+		if (err)
+			goto out;
+	} else {
 		mangle(m, r->mnt_devname ? r->mnt_devname : "none");
-	if (err)
-		goto out;
+	}
 	seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw");
 	err = show_sb_opts(m, sb);
 	if (err)
@@ -191,7 +193,7 @@ static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt)
 	struct mount *r = real_mount(mnt);
 	struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt };
 	struct super_block *sb = mnt_path.dentry->d_sb;
-	int err = 0;
+	int err;
 
 	/* device */
 	if (sb->s_op->show_devname) {
@@ -220,8 +222,7 @@ static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt)
 	/* optional statistics */
 	if (sb->s_op->show_stats) {
 		seq_putc(m, ' ');
-		if (!err)
-			err = sb->s_op->show_stats(m, mnt_path.dentry);
+		err = sb->s_op->show_stats(m, mnt_path.dentry);
 	}
 
 	seq_putc(m, '\n');
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index d8c439d81..dc645b66c 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -377,7 +377,7 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
 		break;
 	}
 
-	mutex_lock(&d_inode(root)->i_mutex);
+	inode_lock(d_inode(root));
 
 	dentry = d_alloc_name(root, name);
 	if (!dentry)
@@ -397,12 +397,12 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count,
 	list_add(&private->list, &allpstore);
 	spin_unlock_irqrestore(&allpstore_lock, flags);
 
-	mutex_unlock(&d_inode(root)->i_mutex);
+	inode_unlock(d_inode(root));
 
 	return 0;
 
 fail_lockedalloc:
-	mutex_unlock(&d_inode(root)->i_mutex);
+	inode_unlock(d_inode(root));
 	kfree(private);
 fail_alloc:
 	iput(inode);
diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c
index c4bcb7788..3a67cfb14 100644
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -316,6 +316,7 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino)
 		inode->i_fop = &qnx4_dir_operations;
 	} else if (S_ISLNK(inode->i_mode)) {
 		inode->i_op = &page_symlink_inode_operations;
+		inode_nohighmem(inode);
 		inode->i_mapping->a_ops = &qnx4_aops;
 		qnx4_i(inode)->mmu_private = inode->i_size;
 	} else {
@@ -364,7 +365,7 @@ static int init_inodecache(void)
 	qnx4_inode_cachep = kmem_cache_create("qnx4_inode_cache",
 					     sizeof(struct qnx4_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (qnx4_inode_cachep == NULL)
 		return -ENOMEM;
diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c
index 32d2e1a97..47bb1de07 100644
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -582,6 +582,7 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino)
 		inode->i_mapping->a_ops = &qnx6_aops;
 	} else if (S_ISLNK(inode->i_mode)) {
 		inode->i_op = &page_symlink_inode_operations;
+		inode_nohighmem(inode);
 		inode->i_mapping->a_ops = &qnx6_aops;
 	} else
 		init_special_inode(inode, inode->i_mode, 0);
@@ -624,7 +625,7 @@ static int init_inodecache(void)
 	qnx6_inode_cachep = kmem_cache_create("qnx6_inode_cache",
 					     sizeof(struct qnx6_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (!qnx6_inode_cachep)
 		return -ENOMEM;
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index ef0d64b2a..3c3b81bb6 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -682,9 +682,9 @@ int dquot_quota_sync(struct super_block *sb, int type)
 			continue;
 		if (!sb_has_quota_active(sb, cnt))
 			continue;
-		mutex_lock(&dqopt->files[cnt]->i_mutex);
+		inode_lock(dqopt->files[cnt]);
 		truncate_inode_pages(&dqopt->files[cnt]->i_data, 0);
-		mutex_unlock(&dqopt->files[cnt]->i_mutex);
+		inode_unlock(dqopt->files[cnt]);
 	}
 	mutex_unlock(&dqopt->dqonoff_mutex);
 
@@ -2162,12 +2162,12 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags)
 			/* If quota was reenabled in the meantime, we have
 			 * nothing to do */
 			if (!sb_has_quota_loaded(sb, cnt)) {
-				mutex_lock(&toputinode[cnt]->i_mutex);
+				inode_lock(toputinode[cnt]);
 				toputinode[cnt]->i_flags &= ~(S_IMMUTABLE |
 				  S_NOATIME | S_NOQUOTA);
 				truncate_inode_pages(&toputinode[cnt]->i_data,
 						     0);
-				mutex_unlock(&toputinode[cnt]->i_mutex);
+				inode_unlock(toputinode[cnt]);
 				mark_inode_dirty_sync(toputinode[cnt]);
 			}
 			mutex_unlock(&dqopt->dqonoff_mutex);
@@ -2258,11 +2258,11 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id,
 		/* We don't want quota and atime on quota files (deadlocks
 		 * possible) Also nobody should write to the file - we use
 		 * special IO operations which ignore the immutable bit. */
-		mutex_lock(&inode->i_mutex);
+		inode_lock(inode);
 		oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE |
 					     S_NOQUOTA);
 		inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE;
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 		/*
 		 * When S_NOQUOTA is set, remove dquot references as no more
 		 * references can be added
@@ -2305,12 +2305,12 @@ out_file_init:
 	iput(inode);
 out_lock:
 	if (oldflags != -1) {
-		mutex_lock(&inode->i_mutex);
+		inode_lock(inode);
 		/* Set the flags back (in the case of accidental quotaon()
 		 * on a wrong file we don't want to mess up the flags) */
 		inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE);
 		inode->i_flags |= oldflags;
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 	}
 	mutex_unlock(&dqopt->dqonoff_mutex);
 out_fmt:
@@ -2430,9 +2430,9 @@ int dquot_quota_on_mount(struct super_block *sb, char *qf_name,
 	struct dentry *dentry;
 	int error;
 
-	mutex_lock(&d_inode(sb->s_root)->i_mutex);
+	inode_lock(d_inode(sb->s_root));
 	dentry = lookup_one_len(qf_name, sb->s_root, strlen(qf_name));
-	mutex_unlock(&d_inode(sb->s_root)->i_mutex);
+	inode_unlock(d_inode(sb->s_root));
 	if (IS_ERR(dentry))
 		return PTR_ERR(dentry);
 
@@ -2924,4 +2924,4 @@ static int __init dquot_init(void)
 
 	return 0;
 }
-module_init(dquot_init);
+fs_initcall(dquot_init);
diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c
index bb2869f5d..d07a2f91d 100644
--- a/fs/quota/netlink.c
+++ b/fs/quota/netlink.c
@@ -1,7 +1,5 @@
-
 #include <linux/cred.h>
 #include <linux/init.h>
-#include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/quotaops.h>
 #include <linux/sched.h>
@@ -105,5 +103,4 @@ static int __init quota_init(void)
 		       "VFS: Failed to create quota netlink interface.\n");
 	return 0;
 };
-
-module_init(quota_init);
+fs_initcall(quota_init);
diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c
index 2aa012a68..ed85d4f35 100644
--- a/fs/quota/quota_v2.c
+++ b/fs/quota/quota_v2.c
@@ -30,13 +30,13 @@ static void v2r1_mem2diskdqb(void *dp, struct dquot *dquot);
 static void v2r1_disk2memdqb(struct dquot *dquot, void *dp);
 static int v2r1_is_id(void *dp, struct dquot *dquot);
 
-static struct qtree_fmt_operations v2r0_qtree_ops = {
+static const struct qtree_fmt_operations v2r0_qtree_ops = {
 	.mem2disk_dqblk = v2r0_mem2diskdqb,
 	.disk2mem_dqblk = v2r0_disk2memdqb,
 	.is_id = v2r0_is_id,
 };
 
-static struct qtree_fmt_operations v2r1_qtree_ops = {
+static const struct qtree_fmt_operations v2r1_qtree_ops = {
 	.mem2disk_dqblk = v2r1_mem2diskdqb,
 	.disk2mem_dqblk = v2r1_disk2memdqb,
 	.is_id = v2r1_is_id,
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index 889d558b4..38981b037 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -79,6 +79,7 @@ struct inode *ramfs_get_inode(struct super_block *sb,
 			break;
 		case S_IFLNK:
 			inode->i_op = &page_symlink_inode_operations;
+			inode_nohighmem(inode);
 			break;
 		}
 	}
diff --git a/fs/read_write.c b/fs/read_write.c
index 0a2893354..dadf24e5c 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -16,6 +16,8 @@
 #include <linux/pagemap.h>
 #include <linux/splice.h>
 #include <linux/compat.h>
+#include <linux/mount.h>
+#include <linux/fs.h>
 #include "internal.h"
 
 #include <asm/uaccess.h>
@@ -171,6 +173,45 @@ loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t si
 EXPORT_SYMBOL(fixed_size_llseek);
 
 /**
+ * no_seek_end_llseek - llseek that accepts only SEEK_SET and SEEK_CUR
+ * @file:	file structure to seek on
+ * @offset:	file offset to seek to
+ * @whence:	type of seek
+ *
+ */
+loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence)
+{
+	switch (whence) {
+	case SEEK_SET: case SEEK_CUR:
+		return generic_file_llseek_size(file, offset, whence,
+						OFFSET_MAX, 0);
+	default:	/* SEEK_END and any other whence value */
+		return -EINVAL;
+	}
+}
+EXPORT_SYMBOL(no_seek_end_llseek);
+
+/**
+ * no_seek_end_llseek_size - SEEK_SET/SEEK_CUR-only llseek with an offset limit
+ * @file:	file structure to seek on
+ * @offset:	file offset to seek to
+ * @whence:	type of seek
+ * @size:	maximal offset allowed
+ *
+ */
+loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size)
+{
+	switch (whence) {
+	case SEEK_SET: case SEEK_CUR:
+		return generic_file_llseek_size(file, offset, whence,
+						size, 0);
+	default:	/* SEEK_END and any other whence value */
+		return -EINVAL;
+	}
+}
+EXPORT_SYMBOL(no_seek_end_llseek_size);
+
+/**
  * noop_llseek - No Operation Performed llseek implementation
  * @file:	file structure to seek on
  * @offset:	file offset to seek to
@@ -198,7 +239,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int whence)
 	struct inode *inode = file_inode(file);
 	loff_t retval;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 	switch (whence) {
 		case SEEK_END:
 			offset += i_size_read(inode);
@@ -243,7 +284,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int whence)
 		retval = offset;
 	}
 out:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	return retval;
 }
 EXPORT_SYMBOL(default_llseek);
@@ -395,9 +436,8 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t
 	}
 
 	if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
-		retval = locks_mandatory_area(
-			read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
-			inode, file, pos, count);
+		retval = locks_mandatory_area(inode, file, pos, pos + count - 1,
+				read_write == READ ? F_RDLCK : F_WRLCK);
 		if (retval < 0)
 			return retval;
 	}
@@ -494,30 +534,6 @@ ssize_t __vfs_write(struct file *file, const char __user *p, size_t count,
 }
 EXPORT_SYMBOL(__vfs_write);
 
-vfs_readf_t vfs_readf(struct file *file)
-{
-	const struct file_operations *fop = file->f_op;
-
-	if (fop->read)
-		return fop->read;
-	if (fop->read_iter)
-		return new_sync_read;
-	return ERR_PTR(-ENOSYS);
-}
-EXPORT_SYMBOL_GPL(vfs_readf);
-
-vfs_writef_t vfs_writef(struct file *file)
-{
-	const struct file_operations *fop = file->f_op;
-
-	if (fop->write)
-		return fop->write;
-	if (fop->write_iter)
-		return new_sync_write;
-	return ERR_PTR(-ENOSYS);
-}
-EXPORT_SYMBOL_GPL(vfs_writef);
-
 ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos)
 {
 	mm_segment_t old_fs;
@@ -1351,3 +1367,304 @@ COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd,
 	return do_sendfile(out_fd, in_fd, NULL, count, 0);
 }
 #endif
+
+/*
+ * copy_file_range() differs from regular file read and write in that it
+ * specifically allows returning partial success.  When it does so is up to
+ * the copy_file_range method.
+ */
+ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
+			    struct file *file_out, loff_t pos_out,
+			    size_t len, unsigned int flags)
+{
+	struct inode *inode_in = file_inode(file_in);
+	struct inode *inode_out = file_inode(file_out);
+	ssize_t ret;
+
+	if (flags != 0)	/* any set flag bit is rejected */
+		return -EINVAL;
+
+	/* copy_file_range allows full ssize_t len, ignoring MAX_RW_COUNT  */
+	ret = rw_verify_area(READ, file_in, &pos_in, len);
+	if (ret >= 0)
+		ret = rw_verify_area(WRITE, file_out, &pos_out, len);
+	if (ret < 0)
+		return ret;
+
+	if (!(file_in->f_mode & FMODE_READ) ||
+	    !(file_out->f_mode & FMODE_WRITE) ||
+	    (file_out->f_flags & O_APPEND))
+		return -EBADF;
+
+	/* this could be relaxed once a method supports cross-fs copies */
+	if (inode_in->i_sb != inode_out->i_sb)
+		return -EXDEV;
+
+	if (len == 0)
+		return 0;
+
+	ret = mnt_want_write_file(file_out);
+	if (ret)
+		return ret;
+
+	ret = -EOPNOTSUPP;	/* so a missing method falls through to splice */
+	if (file_out->f_op->copy_file_range)
+		ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out,
+						      pos_out, len, flags);
+	if (ret == -EOPNOTSUPP)
+		ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
+				len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0);
+
+	if (ret > 0) {
+		fsnotify_access(file_in);
+		add_rchar(current, ret);
+		fsnotify_modify(file_out);
+		add_wchar(current, ret);
+	}
+	inc_syscr(current);	/* counted whether or not bytes were copied */
+	inc_syscw(current);
+
+	mnt_drop_write_file(file_out);
+
+	return ret;
+}
+EXPORT_SYMBOL(vfs_copy_file_range);
+
+SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in,
+		int, fd_out, loff_t __user *, off_out,
+		size_t, len, unsigned int, flags)
+{
+	loff_t pos_in;
+	loff_t pos_out;
+	struct fd f_in;
+	struct fd f_out;
+	ssize_t ret = -EBADF;	/* returned when either fd lookup fails */
+
+	f_in = fdget(fd_in);
+	if (!f_in.file)
+		goto out2;
+
+	f_out = fdget(fd_out);
+	if (!f_out.file)
+		goto out1;
+
+	ret = -EFAULT;	/* returned when a user offset cannot be read */
+	if (off_in) {
+		if (copy_from_user(&pos_in, off_in, sizeof(loff_t)))
+			goto out;
+	} else {
+		pos_in = f_in.file->f_pos;	/* NULL off_in: use the file position */
+	}
+
+	if (off_out) {
+		if (copy_from_user(&pos_out, off_out, sizeof(loff_t)))
+			goto out;
+	} else {
+		pos_out = f_out.file->f_pos;	/* NULL off_out: use the file position */
+	}
+
+	ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len,
+				  flags);
+	if (ret > 0) {	/* offsets advance only when bytes were copied */
+		pos_in += ret;
+		pos_out += ret;
+
+		if (off_in) {
+			if (copy_to_user(off_in, &pos_in, sizeof(loff_t)))
+				ret = -EFAULT;	/* NOTE(review): copy already done */
+		} else {
+			f_in.file->f_pos = pos_in;
+		}
+
+		if (off_out) {
+			if (copy_to_user(off_out, &pos_out, sizeof(loff_t)))
+				ret = -EFAULT;	/* NOTE(review): copy already done */
+		} else {
+			f_out.file->f_pos = pos_out;
+		}
+	}
+
+out:
+	fdput(f_out);
+out1:
+	fdput(f_in);
+out2:
+	return ret;
+}
+
+static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write)
+{	/* offset-range, mandatory-lock and security checks for one file */
+	struct inode *inode = file_inode(file);
+
+	if (unlikely(pos < 0))
+		return -EINVAL;
+
+	 if (unlikely((loff_t) (pos + len) < 0))
+		return -EINVAL;	/* pos + len is negative once cast to loff_t */
+
+	if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
+		loff_t end = len ? pos + len - 1 : OFFSET_MAX; /* len 0: OFFSET_MAX */
+		int retval;
+
+		retval = locks_mandatory_area(inode, file, pos, end,
+				write ? F_WRLCK : F_RDLCK);
+		if (retval < 0)
+			return retval;
+	}
+
+	return security_file_permission(file, write ? MAY_WRITE : MAY_READ);
+}
+
+int vfs_clone_file_range(struct file *file_in, loff_t pos_in,
+		struct file *file_out, loff_t pos_out, u64 len)
+{	/* validate both files and ranges, then call ->clone_file_range */
+	struct inode *inode_in = file_inode(file_in);
+	struct inode *inode_out = file_inode(file_out);
+	int ret;
+
+	if (inode_in->i_sb != inode_out->i_sb ||
+	    file_in->f_path.mnt != file_out->f_path.mnt)
+		return -EXDEV;	/* different superblock or different mount */
+
+	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
+		return -EISDIR;
+	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
+		return -EINVAL;
+
+	if (!(file_in->f_mode & FMODE_READ) ||
+	    !(file_out->f_mode & FMODE_WRITE) ||
+	    (file_out->f_flags & O_APPEND))
+		return -EBADF;
+
+	if (!file_in->f_op->clone_file_range)
+		return -EOPNOTSUPP;
+
+	ret = clone_verify_area(file_in, pos_in, len, false);
+	if (ret)
+		return ret;
+
+	ret = clone_verify_area(file_out, pos_out, len, true);
+	if (ret)
+		return ret;
+
+	if (pos_in + len > i_size_read(inode_in))
+		return -EINVAL;	/* source range extends past i_size */
+
+	ret = mnt_want_write_file(file_out);
+	if (ret)
+		return ret;
+
+	ret = file_in->f_op->clone_file_range(file_in, pos_in,
+			file_out, pos_out, len);
+	if (!ret) {	/* notify only after a successful clone */
+		fsnotify_access(file_in);
+		fsnotify_modify(file_out);
+	}
+
+	mnt_drop_write_file(file_out);
+	return ret;
+}
+EXPORT_SYMBOL(vfs_clone_file_range);
+
+int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
+{	/* per-destination results are stored in same->info[i] */
+	struct file_dedupe_range_info *info;
+	struct inode *src = file_inode(file);
+	u64 off;
+	u64 len;
+	int i;
+	int ret;
+	bool is_admin = capable(CAP_SYS_ADMIN);
+	u16 count = same->dest_count;
+	struct file *dst_file;
+	loff_t dst_off;
+	ssize_t deduped;
+
+	if (!(file->f_mode & FMODE_READ))
+		return -EINVAL;
+
+	if (same->reserved1 || same->reserved2)
+		return -EINVAL;
+
+	off = same->src_offset;
+	len = same->src_length;
+
+	ret = -EISDIR;
+	if (S_ISDIR(src->i_mode))
+		goto out;
+
+	ret = -EINVAL;
+	if (!S_ISREG(src->i_mode))
+		goto out;
+
+	ret = clone_verify_area(file, off, len, false);
+	if (ret < 0)
+		goto out;
+	ret = 0;
+
+	/* pre-format output fields to sane values */
+	for (i = 0; i < count; i++) {
+		same->info[i].bytes_deduped = 0ULL;
+		same->info[i].status = FILE_DEDUPE_RANGE_SAME;
+	}
+
+	for (i = 0, info = same->info; i < count; i++, info++) {
+		struct inode *dst;
+		struct fd dst_fd = fdget(info->dest_fd);
+
+		dst_file = dst_fd.file;
+		if (!dst_file) {
+			info->status = -EBADF;
+			goto next_loop;
+		}
+		dst = file_inode(dst_file);
+
+		ret = mnt_want_write_file(dst_file);
+		if (ret) {
+			info->status = ret;
+			goto next_loop;	/* NOTE(review): ret is left nonzero */
+		}
+
+		dst_off = info->dest_offset;
+		ret = clone_verify_area(dst_file, dst_off, len, true);
+		if (ret < 0) {
+			info->status = ret;
+			goto next_file;	/* NOTE(review): ret is left negative */
+		}
+		ret = 0;
+
+		if (info->reserved) {
+			info->status = -EINVAL;
+		} else if (!(is_admin || (dst_file->f_mode & FMODE_WRITE))) {
+			info->status = -EINVAL;
+		} else if (file->f_path.mnt != dst_file->f_path.mnt) {
+			info->status = -EXDEV;
+		} else if (S_ISDIR(dst->i_mode)) {
+			info->status = -EISDIR;
+		} else if (dst_file->f_op->dedupe_file_range == NULL) {
+			info->status = -EINVAL;
+		} else {
+			deduped = dst_file->f_op->dedupe_file_range(file, off,
+							len, dst_file,
+							info->dest_offset);
+			if (deduped == -EBADE)	/* stored as DIFFERS, not an errno */
+				info->status = FILE_DEDUPE_RANGE_DIFFERS;
+			else if (deduped < 0)
+				info->status = deduped;
+			else
+				info->bytes_deduped += deduped;
+		}
+
+next_file:
+		mnt_drop_write_file(dst_file);
+next_loop:
+		fdput(dst_fd);
+
+		if (fatal_signal_pending(current))
+			goto out;
+	}
+
+out:
+	return ret;
+}
+EXPORT_SYMBOL(vfs_dedupe_file_range);
diff --git a/fs/readdir.c b/fs/readdir.c
index ced679179..e69ef3b79 100644
--- a/fs/readdir.c
+++ b/fs/readdir.c
@@ -44,7 +44,7 @@ int iterate_dir(struct file *file, struct dir_context *ctx)
 		fsnotify_access(file);
 		file_accessed(file);
 	}
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 out:
 	return res;
 }
diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c
index 4a024e2ce..3abd40041 100644
--- a/fs/reiserfs/dir.c
+++ b/fs/reiserfs/dir.c
@@ -38,11 +38,11 @@ static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end,
 	if (err)
 		return err;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 	reiserfs_write_lock(inode->i_sb);
 	err = reiserfs_commit_for_inode(inode);
 	reiserfs_write_unlock(inode->i_sb);
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	if (err < 0)
 		return err;
 	return 0;
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 96a1bcf33..9424a4ba9 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -158,7 +158,7 @@ static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end,
 	if (err)
 		return err;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 	BUG_ON(!S_ISREG(inode->i_mode));
 	err = sync_mapping_buffers(inode->i_mapping);
 	reiserfs_write_lock(inode->i_sb);
@@ -166,7 +166,7 @@ static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end,
 	reiserfs_write_unlock(inode->i_sb);
 	if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb))
 		blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL);
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	if (barrier_done < 0)
 		return barrier_done;
 	return (err < 0) ? -EIO : 0;
diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c
index 3d8e7e671..ae9e5b308 100644
--- a/fs/reiserfs/inode.c
+++ b/fs/reiserfs/inode.c
@@ -1361,6 +1361,7 @@ static void init_inode(struct inode *inode, struct treepath *path)
 		inode->i_fop = &reiserfs_dir_operations;
 	} else if (S_ISLNK(inode->i_mode)) {
 		inode->i_op = &reiserfs_symlink_inode_operations;
+		inode_nohighmem(inode);
 		inode->i_mapping->a_ops = &reiserfs_address_space_operations;
 	} else {
 		inode->i_blocks = 0;
diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c
index 6ec8a30a0..036a1fc0a 100644
--- a/fs/reiserfs/ioctl.c
+++ b/fs/reiserfs/ioctl.c
@@ -224,7 +224,7 @@ out_unlock:
 	page_cache_release(page);
 
 out:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	reiserfs_write_unlock(inode->i_sb);
 	return retval;
 }
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 9d6486d41..44c2bdced 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -618,12 +618,10 @@ static void release_buffer_page(struct buffer_head *bh)
 
 static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 {
-	char b[BDEVNAME_SIZE];
-
 	if (buffer_journaled(bh)) {
 		reiserfs_warning(NULL, "clm-2084",
-				 "pinned buffer %lu:%s sent to disk",
-				 bh->b_blocknr, bdevname(bh->b_bdev, b));
+				 "pinned buffer %lu:%pg sent to disk",
+				 bh->b_blocknr, bh->b_bdev);
 	}
 	if (uptodate)
 		set_buffer_uptodate(bh);
@@ -2387,11 +2385,10 @@ static int journal_read(struct super_block *sb)
 	int replay_count = 0;
 	int continue_replay = 1;
 	int ret;
-	char b[BDEVNAME_SIZE];
 
 	cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(sb);
-	reiserfs_info(sb, "checking transaction log (%s)\n",
-		      bdevname(journal->j_dev_bd, b));
+	reiserfs_info(sb, "checking transaction log (%pg)\n",
+		      journal->j_dev_bd);
 	start = get_seconds();
 
 	/*
@@ -2651,8 +2648,8 @@ static int journal_init_dev(struct super_block *super,
 
 	set_blocksize(journal->j_dev_bd, super->s_blocksize);
 	reiserfs_info(super,
-		      "journal_init_dev: journal device: %s\n",
-		      bdevname(journal->j_dev_bd, b));
+		      "journal_init_dev: journal device: %pg\n",
+		      journal->j_dev_bd);
 	return 0;
 }
 
@@ -2724,7 +2721,6 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
 	struct reiserfs_journal_header *jh;
 	struct reiserfs_journal *journal;
 	struct reiserfs_journal_list *jl;
-	char b[BDEVNAME_SIZE];
 	int ret;
 
 	journal = SB_JOURNAL(sb) = vzalloc(sizeof(struct reiserfs_journal));
@@ -2794,10 +2790,10 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
 	    && (le32_to_cpu(jh->jh_journal.jp_journal_magic) !=
 		sb_jp_journal_magic(rs))) {
 		reiserfs_warning(sb, "sh-460",
-				 "journal header magic %x (device %s) does "
+				 "journal header magic %x (device %pg) does "
 				 "not match to magic found in super block %x",
 				 jh->jh_journal.jp_journal_magic,
-				 bdevname(journal->j_dev_bd, b),
+				 journal->j_dev_bd,
 				 sb_jp_journal_magic(rs));
 		brelse(bhjh);
 		goto free_and_return;
@@ -2818,10 +2814,10 @@ int journal_init(struct super_block *sb, const char *j_dev_name,
 		journal->j_max_trans_age = commit_max_age;
 	}
 
-	reiserfs_info(sb, "journal params: device %s, size %u, "
+	reiserfs_info(sb, "journal params: device %pg, size %u, "
 		      "journal first block %u, max trans len %u, max batch %u, "
 		      "max commit age %u, max trans age %u\n",
-		      bdevname(journal->j_dev_bd, b),
+		      journal->j_dev_bd,
 		      SB_ONDISK_JOURNAL_SIZE(sb),
 		      SB_ONDISK_JOURNAL_1st_BLOCK(sb),
 		      journal->j_trans_max,
diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c
index 47f96988f..2a12d46d7 100644
--- a/fs/reiserfs/namei.c
+++ b/fs/reiserfs/namei.c
@@ -1170,6 +1170,7 @@ static int reiserfs_symlink(struct inode *parent_dir,
 	reiserfs_update_inode_transaction(parent_dir);
 
 	inode->i_op = &reiserfs_symlink_inode_operations;
+	inode_nohighmem(inode);
 	inode->i_mapping->a_ops = &reiserfs_address_space_operations;
 
 	retval = reiserfs_add_entry(&th, parent_dir, dentry->d_name.name,
@@ -1664,8 +1665,7 @@ const struct inode_operations reiserfs_dir_inode_operations = {
  */
 const struct inode_operations reiserfs_symlink_inode_operations = {
 	.readlink = generic_readlink,
-	.follow_link = page_follow_link_light,
-	.put_link = page_put_link,
+	.get_link	= page_get_link,
 	.setattr = reiserfs_setattr,
 	.setxattr = reiserfs_setxattr,
 	.getxattr = reiserfs_getxattr,
diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c
index ae1dc841d..4f3f92807 100644
--- a/fs/reiserfs/prints.c
+++ b/fs/reiserfs/prints.c
@@ -139,11 +139,9 @@ static void sprintf_block_head(char *buf, struct buffer_head *bh)
 
 static void sprintf_buffer_head(char *buf, struct buffer_head *bh)
 {
-	char b[BDEVNAME_SIZE];
-
 	sprintf(buf,
-		"dev %s, size %zd, blocknr %llu, count %d, state 0x%lx, page %p, (%s, %s, %s)",
-		bdevname(bh->b_bdev, b), bh->b_size,
+		"dev %pg, size %zd, blocknr %llu, count %d, state 0x%lx, page %p, (%s, %s, %s)",
+		bh->b_bdev, bh->b_size,
 		(unsigned long long)bh->b_blocknr, atomic_read(&(bh->b_count)),
 		bh->b_state, bh->b_page,
 		buffer_uptodate(bh) ? "UPTODATE" : "!UPTODATE",
@@ -530,7 +528,6 @@ static int print_super_block(struct buffer_head *bh)
 	    (struct reiserfs_super_block *)(bh->b_data);
 	int skipped, data_blocks;
 	char *version;
-	char b[BDEVNAME_SIZE];
 
 	if (is_reiserfs_3_5(rs)) {
 		version = "3.5";
@@ -543,7 +540,7 @@ static int print_super_block(struct buffer_head *bh)
 		return 1;
 	}
 
-	printk("%s\'s super block is in block %llu\n", bdevname(bh->b_bdev, b),
+	printk("%pg\'s super block is in block %llu\n", bh->b_bdev,
 	       (unsigned long long)bh->b_blocknr);
 	printk("Reiserfs version %s\n", version);
 	printk("Block count %u\n", sb_block_count(rs));
diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c
index 621b9f381..fe999157d 100644
--- a/fs/reiserfs/procfs.c
+++ b/fs/reiserfs/procfs.c
@@ -303,11 +303,10 @@ static int show_journal(struct seq_file *m, void *unused)
 	struct reiserfs_sb_info *r = REISERFS_SB(sb);
 	struct reiserfs_super_block *rs = r->s_rs;
 	struct journal_params *jp = &rs->s_v1.s_journal;
-	char b[BDEVNAME_SIZE];
 
 	seq_printf(m,		/* on-disk fields */
 		   "jp_journal_1st_block: \t%i\n"
-		   "jp_journal_dev: \t%s[%x]\n"
+		   "jp_journal_dev: \t%pg[%x]\n"
 		   "jp_journal_size: \t%i\n"
 		   "jp_journal_trans_max: \t%i\n"
 		   "jp_journal_magic: \t%i\n"
@@ -348,7 +347,7 @@ static int show_journal(struct seq_file *m, void *unused)
 		   "prepare: \t%12lu\n"
 		   "prepare_retry: \t%12lu\n",
 		   DJP(jp_journal_1st_block),
-		   bdevname(SB_JOURNAL(sb)->j_dev_bd, b),
+		   SB_JOURNAL(sb)->j_dev_bd,
 		   DJP(jp_journal_dev),
 		   DJP(jp_journal_size),
 		   DJP(jp_journal_trans_max),
diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c
index 4a62fe8cc..c0306ec8e 100644
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -288,7 +288,7 @@ static int finish_unfinished(struct super_block *s)
 		pathrelse(&path);
 
 		inode = reiserfs_iget(s, &obj_key);
-		if (!inode) {
+		if (IS_ERR_OR_NULL(inode)) {
 			/*
 			 * the unlink almost completed, it just did not
 			 * manage to remove "save" link and release objectid
@@ -626,7 +626,8 @@ static int __init init_inodecache(void)
 						  sizeof(struct
 							 reiserfs_inode_info),
 						  0, (SLAB_RECLAIM_ACCOUNT|
-							SLAB_MEM_SPREAD),
+						      SLAB_MEM_SPREAD|
+						      SLAB_ACCOUNT),
 						  init_once);
 	if (reiserfs_inode_cachep == NULL)
 		return -ENOMEM;
diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c
index 66b26fdff..57e0b2310 100644
--- a/fs/reiserfs/xattr.c
+++ b/fs/reiserfs/xattr.c
@@ -64,14 +64,14 @@
 #ifdef CONFIG_REISERFS_FS_XATTR
 static int xattr_create(struct inode *dir, struct dentry *dentry, int mode)
 {
-	BUG_ON(!mutex_is_locked(&dir->i_mutex));
+	BUG_ON(!inode_is_locked(dir));
 	return dir->i_op->create(dir, dentry, mode, true);
 }
 #endif
 
 static int xattr_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
-	BUG_ON(!mutex_is_locked(&dir->i_mutex));
+	BUG_ON(!inode_is_locked(dir));
 	return dir->i_op->mkdir(dir, dentry, mode);
 }
 
@@ -85,11 +85,11 @@ static int xattr_unlink(struct inode *dir, struct dentry *dentry)
 {
 	int error;
 
-	BUG_ON(!mutex_is_locked(&dir->i_mutex));
+	BUG_ON(!inode_is_locked(dir));
 
-	mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD);
+	inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD);
 	error = dir->i_op->unlink(dir, dentry);
-	mutex_unlock(&d_inode(dentry)->i_mutex);
+	inode_unlock(d_inode(dentry));
 
 	if (!error)
 		d_delete(dentry);
@@ -100,13 +100,13 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry)
 {
 	int error;
 
-	BUG_ON(!mutex_is_locked(&dir->i_mutex));
+	BUG_ON(!inode_is_locked(dir));
 
-	mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD);
+	inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD);
 	error = dir->i_op->rmdir(dir, dentry);
 	if (!error)
 		d_inode(dentry)->i_flags |= S_DEAD;
-	mutex_unlock(&d_inode(dentry)->i_mutex);
+	inode_unlock(d_inode(dentry));
 	if (!error)
 		d_delete(dentry);
 
@@ -123,7 +123,7 @@ static struct dentry *open_xa_root(struct super_block *sb, int flags)
 	if (d_really_is_negative(privroot))
 		return ERR_PTR(-ENODATA);
 
-	mutex_lock_nested(&d_inode(privroot)->i_mutex, I_MUTEX_XATTR);
+	inode_lock_nested(d_inode(privroot), I_MUTEX_XATTR);
 
 	xaroot = dget(REISERFS_SB(sb)->xattr_root);
 	if (!xaroot)
@@ -139,7 +139,7 @@ static struct dentry *open_xa_root(struct super_block *sb, int flags)
 		}
 	}
 
-	mutex_unlock(&d_inode(privroot)->i_mutex);
+	inode_unlock(d_inode(privroot));
 	return xaroot;
 }
 
@@ -156,7 +156,7 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags)
 		 le32_to_cpu(INODE_PKEY(inode)->k_objectid),
 		 inode->i_generation);
 
-	mutex_lock_nested(&d_inode(xaroot)->i_mutex, I_MUTEX_XATTR);
+	inode_lock_nested(d_inode(xaroot), I_MUTEX_XATTR);
 
 	xadir = lookup_one_len(namebuf, xaroot, strlen(namebuf));
 	if (!IS_ERR(xadir) && d_really_is_negative(xadir)) {
@@ -170,7 +170,7 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags)
 		}
 	}
 
-	mutex_unlock(&d_inode(xaroot)->i_mutex);
+	inode_unlock(d_inode(xaroot));
 	dput(xaroot);
 	return xadir;
 }
@@ -195,7 +195,7 @@ fill_with_dentries(struct dir_context *ctx, const char *name, int namelen,
 		container_of(ctx, struct reiserfs_dentry_buf, ctx);
 	struct dentry *dentry;
 
-	WARN_ON_ONCE(!mutex_is_locked(&d_inode(dbuf->xadir)->i_mutex));
+	WARN_ON_ONCE(!inode_is_locked(d_inode(dbuf->xadir)));
 
 	if (dbuf->count == ARRAY_SIZE(dbuf->dentries))
 		return -ENOSPC;
@@ -254,7 +254,7 @@ static int reiserfs_for_each_xattr(struct inode *inode,
 		goto out_dir;
 	}
 
-	mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_XATTR);
+	inode_lock_nested(d_inode(dir), I_MUTEX_XATTR);
 
 	buf.xadir = dir;
 	while (1) {
@@ -276,7 +276,7 @@ static int reiserfs_for_each_xattr(struct inode *inode,
 			break;
 		buf.count = 0;
 	}
-	mutex_unlock(&d_inode(dir)->i_mutex);
+	inode_unlock(d_inode(dir));
 
 	cleanup_dentry_buf(&buf);
 
@@ -298,13 +298,13 @@ static int reiserfs_for_each_xattr(struct inode *inode,
 		if (!err) {
 			int jerror;
 
-			mutex_lock_nested(&d_inode(dir->d_parent)->i_mutex,
+			inode_lock_nested(d_inode(dir->d_parent),
 					  I_MUTEX_XATTR);
 			err = action(dir, data);
 			reiserfs_write_lock(inode->i_sb);
 			jerror = journal_end(&th);
 			reiserfs_write_unlock(inode->i_sb);
-			mutex_unlock(&d_inode(dir->d_parent)->i_mutex);
+			inode_unlock(d_inode(dir->d_parent));
 			err = jerror ?: err;
 		}
 	}
@@ -384,7 +384,7 @@ static struct dentry *xattr_lookup(struct inode *inode, const char *name,
 	if (IS_ERR(xadir))
 		return ERR_CAST(xadir);
 
-	mutex_lock_nested(&d_inode(xadir)->i_mutex, I_MUTEX_XATTR);
+	inode_lock_nested(d_inode(xadir), I_MUTEX_XATTR);
 	xafile = lookup_one_len(name, xadir, strlen(name));
 	if (IS_ERR(xafile)) {
 		err = PTR_ERR(xafile);
@@ -404,7 +404,7 @@ static struct dentry *xattr_lookup(struct inode *inode, const char *name,
 	if (err)
 		dput(xafile);
 out:
-	mutex_unlock(&d_inode(xadir)->i_mutex);
+	inode_unlock(d_inode(xadir));
 	dput(xadir);
 	if (err)
 		return ERR_PTR(err);
@@ -469,7 +469,7 @@ static int lookup_and_delete_xattr(struct inode *inode, const char *name)
 	if (IS_ERR(xadir))
 		return PTR_ERR(xadir);
 
-	mutex_lock_nested(&d_inode(xadir)->i_mutex, I_MUTEX_XATTR);
+	inode_lock_nested(d_inode(xadir), I_MUTEX_XATTR);
 	dentry = lookup_one_len(name, xadir, strlen(name));
 	if (IS_ERR(dentry)) {
 		err = PTR_ERR(dentry);
@@ -483,7 +483,7 @@ static int lookup_and_delete_xattr(struct inode *inode, const char *name)
 
 	dput(dentry);
 out_dput:
-	mutex_unlock(&d_inode(xadir)->i_mutex);
+	inode_unlock(d_inode(xadir));
 	dput(xadir);
 	return err;
 }
@@ -580,11 +580,11 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th,
 			.ia_valid = ATTR_SIZE | ATTR_CTIME,
 		};
 
-		mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_XATTR);
+		inode_lock_nested(d_inode(dentry), I_MUTEX_XATTR);
 		inode_dio_wait(d_inode(dentry));
 
 		err = reiserfs_setattr(dentry, &newattrs);
-		mutex_unlock(&d_inode(dentry)->i_mutex);
+		inode_unlock(d_inode(dentry));
 	} else
 		update_ctime(inode);
 out_unlock:
@@ -756,7 +756,8 @@ find_xattr_handler_prefix(const struct xattr_handler **handlers,
 		return NULL;
 
 	for_each_xattr_handler(handlers, xah) {
-		if (strncmp(xah->prefix, name, strlen(xah->prefix)) == 0)
+		const char *prefix = xattr_prefix(xah);
+		if (strncmp(prefix, name, strlen(prefix)) == 0)
 			break;
 	}
 
@@ -839,19 +840,16 @@ static int listxattr_filler(struct dir_context *ctx, const char *name,
 
 		handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr,
 						    name);
-		if (!handler)	/* Unsupported xattr name */
+		if (!handler /* Unsupported xattr name */ ||
+		    (handler->list && !handler->list(b->dentry)))
 			return 0;
+		size = namelen + 1;
 		if (b->buf) {
-			size = handler->list(handler, b->dentry,
-					     b->buf + b->pos, b->size, name,
-					     namelen);
 			if (size > b->size)
 				return -ERANGE;
-		} else {
-			size = handler->list(handler, b->dentry,
-					     NULL, 0, name, namelen);
+			memcpy(b->buf + b->pos, name, namelen);
+			b->buf[b->pos + namelen] = 0;
 		}
-
 		b->pos += size;
 	}
 	return 0;
@@ -890,9 +888,9 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size)
 		goto out;
 	}
 
-	mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_XATTR);
+	inode_lock_nested(d_inode(dir), I_MUTEX_XATTR);
 	err = reiserfs_readdir_inode(d_inode(dir), &buf.ctx);
-	mutex_unlock(&d_inode(dir)->i_mutex);
+	inode_unlock(d_inode(dir));
 
 	if (!err)
 		err = buf.pos;
@@ -907,7 +905,7 @@ static int create_privroot(struct dentry *dentry)
 	int err;
 	struct inode *inode = d_inode(dentry->d_parent);
 
-	WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex));
+	WARN_ON_ONCE(!inode_is_locked(inode));
 
 	err = xattr_mkdir(inode, dentry, 0700);
 	if (err || d_really_is_negative(dentry)) {
@@ -997,7 +995,7 @@ int reiserfs_lookup_privroot(struct super_block *s)
 	int err = 0;
 
 	/* If we don't have the privroot located yet - go find it */
-	mutex_lock(&d_inode(s->s_root)->i_mutex);
+	inode_lock(d_inode(s->s_root));
 	dentry = lookup_one_len(PRIVROOT_NAME, s->s_root,
 				strlen(PRIVROOT_NAME));
 	if (!IS_ERR(dentry)) {
@@ -1007,7 +1005,7 @@ int reiserfs_lookup_privroot(struct super_block *s)
 			d_inode(dentry)->i_flags |= S_PRIVATE;
 	} else
 		err = PTR_ERR(dentry);
-	mutex_unlock(&d_inode(s->s_root)->i_mutex);
+	inode_unlock(d_inode(s->s_root));
 
 	return err;
 }
@@ -1027,14 +1025,14 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags)
 		goto error;
 
 	if (d_really_is_negative(privroot) && !(mount_flags & MS_RDONLY)) {
-		mutex_lock(&d_inode(s->s_root)->i_mutex);
+		inode_lock(d_inode(s->s_root));
 		err = create_privroot(REISERFS_SB(s)->priv_root);
-		mutex_unlock(&d_inode(s->s_root)->i_mutex);
+		inode_unlock(d_inode(s->s_root));
 	}
 
 	if (d_really_is_positive(privroot)) {
 		s->s_xattr = reiserfs_xattr_handlers;
-		mutex_lock(&d_inode(privroot)->i_mutex);
+		inode_lock(d_inode(privroot));
 		if (!REISERFS_SB(s)->xattr_root) {
 			struct dentry *dentry;
 
@@ -1045,7 +1043,7 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags)
 			else
 				err = PTR_ERR(dentry);
 		}
-		mutex_unlock(&d_inode(privroot)->i_mutex);
+		inode_unlock(d_inode(privroot));
 	}
 
 error:
diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c
index 4b34b9dc0..558a16bea 100644
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -186,10 +186,10 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type)
 
 	switch (type) {
 	case ACL_TYPE_ACCESS:
-		name = POSIX_ACL_XATTR_ACCESS;
+		name = XATTR_NAME_POSIX_ACL_ACCESS;
 		break;
 	case ACL_TYPE_DEFAULT:
-		name = POSIX_ACL_XATTR_DEFAULT;
+		name = XATTR_NAME_POSIX_ACL_DEFAULT;
 		break;
 	default:
 		BUG();
@@ -244,7 +244,7 @@ __reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
 
 	switch (type) {
 	case ACL_TYPE_ACCESS:
-		name = POSIX_ACL_XATTR_ACCESS;
+		name = XATTR_NAME_POSIX_ACL_ACCESS;
 		if (acl) {
 			error = posix_acl_equiv_mode(acl, &inode->i_mode);
 			if (error < 0)
@@ -256,7 +256,7 @@ __reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
 		}
 		break;
 	case ACL_TYPE_DEFAULT:
-		name = POSIX_ACL_XATTR_DEFAULT;
+		name = XATTR_NAME_POSIX_ACL_DEFAULT;
 		if (!S_ISDIR(inode->i_mode))
 			return acl ? -EACCES : 0;
 		break;
diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c
index ac659af43..ab0217d32 100644
--- a/fs/reiserfs/xattr_security.c
+++ b/fs/reiserfs/xattr_security.c
@@ -34,21 +34,9 @@ security_set(const struct xattr_handler *handler, struct dentry *dentry,
 	return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags);
 }
 
-static size_t security_list(const struct xattr_handler *handler,
-			    struct dentry *dentry, char *list, size_t list_len,
-			    const char *name, size_t namelen)
+static bool security_list(struct dentry *dentry)
 {
-	const size_t len = namelen + 1;
-
-	if (IS_PRIVATE(d_inode(dentry)))
-		return 0;
-
-	if (list && len <= list_len) {
-		memcpy(list, name, namelen);
-		list[namelen] = '\0';
-	}
-
-	return len;
+	return !IS_PRIVATE(d_inode(dentry));
 }
 
 /* Initializes the security context for a new inode and returns the number
diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c
index a338adf1b..64b67aa64 100644
--- a/fs/reiserfs/xattr_trusted.c
+++ b/fs/reiserfs/xattr_trusted.c
@@ -33,20 +33,9 @@ trusted_set(const struct xattr_handler *handler, struct dentry *dentry,
 	return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags);
 }
 
-static size_t trusted_list(const struct xattr_handler *handler,
-			   struct dentry *dentry, char *list, size_t list_size,
-			   const char *name, size_t name_len)
+static bool trusted_list(struct dentry *dentry)
 {
-	const size_t len = name_len + 1;
-
-	if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry)))
-		return 0;
-
-	if (list && len <= list_size) {
-		memcpy(list, name, name_len);
-		list[name_len] = '\0';
-	}
-	return len;
+	return capable(CAP_SYS_ADMIN) && !IS_PRIVATE(d_inode(dentry));
 }
 
 const struct xattr_handler reiserfs_xattr_trusted_handler = {
diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c
index 39c966719..12e6306f5 100644
--- a/fs/reiserfs/xattr_user.c
+++ b/fs/reiserfs/xattr_user.c
@@ -30,19 +30,9 @@ user_set(const struct xattr_handler *handler, struct dentry *dentry,
 	return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags);
 }
 
-static size_t user_list(const struct xattr_handler *handler,
-			struct dentry *dentry, char *list, size_t list_size,
-			const char *name, size_t name_len)
+static bool user_list(struct dentry *dentry)
 {
-	const size_t len = name_len + 1;
-
-	if (!reiserfs_xattrs_user(dentry->d_sb))
-		return 0;
-	if (list && len <= list_size) {
-		memcpy(list, name, name_len);
-		list[name_len] = '\0';
-	}
-	return len;
+	return reiserfs_xattrs_user(dentry->d_sb);
 }
 
 const struct xattr_handler reiserfs_xattr_user_handler = {
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
index 268733cda..6b00ca357 100644
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -360,6 +360,7 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos)
 		break;
 	case ROMFH_SYM:
 		i->i_op = &page_symlink_inode_operations;
+		inode_nohighmem(i);
 		i->i_data.a_ops = &romfs_aops;
 		mode |= S_IRWXUGO;
 		break;
@@ -618,8 +619,8 @@ static int __init init_romfs_fs(void)
 	romfs_inode_cachep =
 		kmem_cache_create("romfs_i",
 				  sizeof(struct romfs_inode_info), 0,
-				  SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
-				  romfs_i_init_once);
+				  SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
+				  SLAB_ACCOUNT, romfs_i_init_once);
 
 	if (!romfs_inode_cachep) {
 		pr_err("Failed to initialise inode cache\n");
diff --git a/fs/select.c b/fs/select.c
index 015547330..79d0d4953 100644
--- a/fs/select.c
+++ b/fs/select.c
@@ -778,8 +778,8 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait,
 	return mask;
 }
 
-static int do_poll(unsigned int nfds,  struct poll_list *list,
-		   struct poll_wqueues *wait, struct timespec *end_time)
+static int do_poll(struct poll_list *list, struct poll_wqueues *wait,
+		   struct timespec *end_time)
 {
 	poll_table* pt = &wait->pt;
 	ktime_t expire, *to = NULL;
@@ -908,7 +908,7 @@ int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
 	}
 
 	poll_initwait(&table);
-	fdcount = do_poll(nfds, head, &table, end_time);
+	fdcount = do_poll(head, &table, end_time);
 	poll_freewait(&table);
 
 	for (walk = head; walk; walk = walk->next) {
diff --git a/fs/splice.c b/fs/splice.c
index b0ade1fd4..82bc0d64f 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -415,6 +415,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 			 */
 			if (!page->mapping) {
 				unlock_page(page);
+retry_lookup:
 				page = find_or_create_page(mapping, index,
 						mapping_gfp_mask(mapping));
 
@@ -439,13 +440,10 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 			error = mapping->a_ops->readpage(in, page);
 			if (unlikely(error)) {
 				/*
-				 * We really should re-lookup the page here,
-				 * but it complicates things a lot. Instead
-				 * lets just do what we already stored, and
-				 * we'll get it the next time we are called.
+				 * Re-lookup the page
 				 */
 				if (error == AOP_TRUNCATED_PAGE)
-					error = 0;
+					goto retry_lookup;
 
 				break;
 			}
@@ -1110,8 +1108,8 @@ EXPORT_SYMBOL(generic_splice_sendpage);
 /*
  * Attempt to initiate a splice from pipe to file.
  */
-long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
-		    loff_t *ppos, size_t len, unsigned int flags)
+static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
+			   loff_t *ppos, size_t len, unsigned int flags)
 {
 	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *,
 				loff_t *, size_t, unsigned int);
@@ -1123,14 +1121,13 @@ long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
 
 	return splice_write(pipe, out, ppos, len, flags);
 }
-EXPORT_SYMBOL_GPL(do_splice_from);
 
 /*
  * Attempt to initiate a splice from a file to a pipe.
  */
-long do_splice_to(struct file *in, loff_t *ppos,
-		  struct pipe_inode_info *pipe, size_t len,
-		  unsigned int flags)
+static long do_splice_to(struct file *in, loff_t *ppos,
+			 struct pipe_inode_info *pipe, size_t len,
+			 unsigned int flags)
 {
 	ssize_t (*splice_read)(struct file *, loff_t *,
 			       struct pipe_inode_info *, size_t, unsigned int);
@@ -1150,7 +1147,6 @@ long do_splice_to(struct file *in, loff_t *ppos,
 
 	return splice_read(in, ppos, pipe, len, flags);
 }
-EXPORT_SYMBOL_GPL(do_splice_to);
 
 /**
  * splice_direct_to_actor - splices data directly between two non-pipes
diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c
index a1ce5ce60..0927b1e80 100644
--- a/fs/squashfs/inode.c
+++ b/fs/squashfs/inode.c
@@ -41,6 +41,7 @@
 #include <linux/fs.h>
 #include <linux/vfs.h>
 #include <linux/xattr.h>
+#include <linux/pagemap.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
@@ -291,6 +292,7 @@ int squashfs_read_inode(struct inode *inode, long long ino)
 		set_nlink(inode, le32_to_cpu(sqsh_ino->nlink));
 		inode->i_size = le32_to_cpu(sqsh_ino->symlink_size);
 		inode->i_op = &squashfs_symlink_inode_ops;
+		inode_nohighmem(inode);
 		inode->i_data.a_ops = &squashfs_symlink_aops;
 		inode->i_mode |= S_IFLNK;
 		squashfs_i(inode)->start = block;
diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c
index 5056babe0..5e79bfa4f 100644
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -80,7 +80,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
 {
 	struct squashfs_sb_info *msblk;
 	struct squashfs_super_block *sblk = NULL;
-	char b[BDEVNAME_SIZE];
 	struct inode *root;
 	long long root_inode;
 	unsigned short flags;
@@ -124,8 +123,8 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
 	sb->s_magic = le32_to_cpu(sblk->s_magic);
 	if (sb->s_magic != SQUASHFS_MAGIC) {
 		if (!silent)
-			ERROR("Can't find a SQUASHFS superblock on %s\n",
-						bdevname(sb->s_bdev, b));
+			ERROR("Can't find a SQUASHFS superblock on %pg\n",
+						sb->s_bdev);
 		goto failed_mount;
 	}
 
@@ -178,7 +177,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent)
 	msblk->inodes = le32_to_cpu(sblk->inodes);
 	flags = le16_to_cpu(sblk->flags);
 
-	TRACE("Found valid superblock on %s\n", bdevname(sb->s_bdev, b));
+	TRACE("Found valid superblock on %pg\n", sb->s_bdev);
 	TRACE("Inodes are %scompressed\n", SQUASHFS_UNCOMPRESSED_INODES(flags)
 				? "un" : "");
 	TRACE("Data is %scompressed\n", SQUASHFS_UNCOMPRESSED_DATA(flags)
@@ -420,7 +419,8 @@ static int __init init_inodecache(void)
 {
 	squashfs_inode_cachep = kmem_cache_create("squashfs_inode_cache",
 		sizeof(struct squashfs_inode_info), 0,
-		SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, init_once);
+		SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
+		init_once);
 
 	return squashfs_inode_cachep ? 0 : -ENOMEM;
 }
diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c
index 12806dffb..dbcc2f54b 100644
--- a/fs/squashfs/symlink.c
+++ b/fs/squashfs/symlink.c
@@ -119,8 +119,7 @@ const struct address_space_operations squashfs_symlink_aops = {
 
 const struct inode_operations squashfs_symlink_inode_ops = {
 	.readlink = generic_readlink,
-	.follow_link = page_follow_link_light,
-	.put_link = page_put_link,
+	.get_link = page_get_link,
 	.getxattr = generic_getxattr,
 	.listxattr = squashfs_listxattr
 };
diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c
index 6a4cc3440..1e9de9628 100644
--- a/fs/squashfs/xattr.c
+++ b/fs/squashfs/xattr.c
@@ -58,7 +58,7 @@ ssize_t squashfs_listxattr(struct dentry *d, char *buffer,
 		struct squashfs_xattr_entry entry;
 		struct squashfs_xattr_val val;
 		const struct xattr_handler *handler;
-		int name_size, prefix_size = 0;
+		int name_size;
 
 		err = squashfs_read_metadata(sb, &entry, &start, &offset,
 							sizeof(entry));
@@ -67,15 +67,16 @@ ssize_t squashfs_listxattr(struct dentry *d, char *buffer,
 
 		name_size = le16_to_cpu(entry.size);
 		handler = squashfs_xattr_handler(le16_to_cpu(entry.type));
-		if (handler)
-			prefix_size = handler->list(handler, d, buffer, rest,
-						    NULL, name_size);
-		if (prefix_size) {
+		if (handler && (!handler->list || handler->list(d))) {
+			const char *prefix = handler->prefix ?: handler->name;
+			size_t prefix_size = strlen(prefix);
+
 			if (buffer) {
 				if (prefix_size + name_size + 1 > rest) {
 					err = -ERANGE;
 					goto failed;
 				}
+				memcpy(buffer, prefix, prefix_size);
 				buffer += prefix_size;
 			}
 			err = squashfs_read_metadata(sb, buffer, &start,
@@ -212,25 +213,10 @@ failed:
 }
 
 
-static size_t squashfs_xattr_handler_list(const struct xattr_handler *handler,
-					  struct dentry *d, char *list,
-					  size_t list_size, const char *name,
-					  size_t name_len)
-{
-	int len = strlen(handler->prefix);
-
-	if (list && len <= list_size)
-		memcpy(list, handler->prefix, len);
-	return len;
-}
-
 static int squashfs_xattr_handler_get(const struct xattr_handler *handler,
 				      struct dentry *d, const char *name,
 				      void *buffer, size_t size)
 {
-	if (name[0] == '\0')
-		return  -EINVAL;
-
 	return squashfs_xattr_get(d_inode(d), handler->flags, name,
 		buffer, size);
 }
@@ -241,22 +227,15 @@ static int squashfs_xattr_handler_get(const struct xattr_handler *handler,
 static const struct xattr_handler squashfs_xattr_user_handler = {
 	.prefix	= XATTR_USER_PREFIX,
 	.flags	= SQUASHFS_XATTR_USER,
-	.list	= squashfs_xattr_handler_list,
 	.get	= squashfs_xattr_handler_get
 };
 
 /*
  * Trusted namespace support
  */
-static size_t squashfs_trusted_xattr_handler_list(const struct xattr_handler *handler,
-						  struct dentry *d, char *list,
-						  size_t list_size, const char *name,
-						  size_t name_len)
+static bool squashfs_trusted_xattr_handler_list(struct dentry *d)
 {
-	if (!capable(CAP_SYS_ADMIN))
-		return 0;
-	return squashfs_xattr_handler_list(handler, d, list, list_size, name,
-					   name_len);
+	return capable(CAP_SYS_ADMIN);
 }
 
 static const struct xattr_handler squashfs_xattr_trusted_handler = {
@@ -272,7 +251,6 @@ static const struct xattr_handler squashfs_xattr_trusted_handler = {
 static const struct xattr_handler squashfs_xattr_security_handler = {
 	.prefix	= XATTR_SECURITY_PREFIX,
 	.flags	= SQUASHFS_XATTR_SECURITY,
-	.list	= squashfs_xattr_handler_list,
 	.get	= squashfs_xattr_handler_get
 };
 
diff --git a/fs/stat.c b/fs/stat.c
index d4a61d8dc..bc045c799 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -219,7 +219,7 @@ SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, stat
 #  define choose_32_64(a,b) b
 #endif
 
-#define valid_dev(x)  choose_32_64(old_valid_dev,new_valid_dev)(x)
+#define valid_dev(x)  choose_32_64(old_valid_dev(x),true)
 #define encode_dev(x) choose_32_64(old_encode_dev,new_encode_dev)(x)
 
 #ifndef INIT_STRUCT_STAT_PADDING
diff --git a/fs/super.c b/fs/super.c
index 6746d6087..74914b1ba 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -36,7 +36,7 @@
 #include "internal.h"
 
 
-LIST_HEAD(super_blocks);
+static LIST_HEAD(super_blocks);
 static DEFINE_SPINLOCK(sb_lock);
 
 static char *sb_writers_name[SB_FREEZE_LEVELS] = {
@@ -1013,10 +1013,8 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
 		blkdev_put(bdev, mode);
 		down_write(&s->s_umount);
 	} else {
-		char b[BDEVNAME_SIZE];
-
 		s->s_mode = mode;
-		strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
+		snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev);
 		sb_set_blocksize(s, block_size(bdev));
 		error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
 		if (error) {
@@ -1200,7 +1198,7 @@ int __sb_start_write(struct super_block *sb, int level, bool wait)
 	else
 		ret = percpu_down_read_trylock(sb->s_writers.rw_sem + level-1);
 
-	WARN_ON(force_trylock & !ret);
+	WARN_ON(force_trylock && !ret);
 	return ret;
 }
 EXPORT_SYMBOL(__sb_start_write);
diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c
index 02fa1dcc5..d62c423a5 100644
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -146,8 +146,7 @@ static inline void write3byte(struct sysv_sb_info *sbi,
 
 static const struct inode_operations sysv_symlink_inode_operations = {
 	.readlink	= generic_readlink,
-	.follow_link	= page_follow_link_light,
-	.put_link	= page_put_link,
+	.get_link	= page_get_link,
 	.getattr	= sysv_getattr,
 };
 
@@ -163,6 +162,7 @@ void sysv_set_inode(struct inode *inode, dev_t rdev)
 		inode->i_mapping->a_ops = &sysv_aops;
 	} else if (S_ISLNK(inode->i_mode)) {
 		inode->i_op = &sysv_symlink_inode_operations;
+		inode_nohighmem(inode);
 		inode->i_mapping->a_ops = &sysv_aops;
 	} else
 		init_special_inode(inode, inode->i_mode, rdev);
@@ -346,7 +346,7 @@ int __init sysv_init_icache(void)
 {
 	sysv_inode_cachep = kmem_cache_create("sysv_inode_cache",
 			sizeof(struct sysv_inode_info), 0,
-			SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
+			SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT,
 			init_once);
 	if (!sysv_inode_cachep)
 		return -ENOMEM;
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index c66f2423e..4a0e48f92 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -84,9 +84,9 @@ static int tracefs_syscall_mkdir(struct inode *inode, struct dentry *dentry, umo
 	 * the files within the tracefs system. It is up to the individual
 	 * mkdir routine to handle races.
 	 */
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	ret = tracefs_ops.mkdir(name);
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	kfree(name);
 
@@ -109,13 +109,13 @@ static int tracefs_syscall_rmdir(struct inode *inode, struct dentry *dentry)
 	 * This time we need to unlock not only the parent (inode) but
 	 * also the directory that is being deleted.
 	 */
-	mutex_unlock(&inode->i_mutex);
-	mutex_unlock(&dentry->d_inode->i_mutex);
+	inode_unlock(inode);
+	inode_unlock(dentry->d_inode);
 
 	ret = tracefs_ops.rmdir(name);
 
-	mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
-	mutex_lock(&dentry->d_inode->i_mutex);
+	inode_lock_nested(inode, I_MUTEX_PARENT);
+	inode_lock(dentry->d_inode);
 
 	kfree(name);
 
@@ -334,7 +334,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
 	if (!parent)
 		parent = tracefs_mount->mnt_root;
 
-	mutex_lock(&parent->d_inode->i_mutex);
+	inode_lock(parent->d_inode);
 	dentry = lookup_one_len(name, parent, strlen(name));
 	if (!IS_ERR(dentry) && dentry->d_inode) {
 		dput(dentry);
@@ -342,7 +342,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
 	}
 
 	if (IS_ERR(dentry)) {
-		mutex_unlock(&parent->d_inode->i_mutex);
+		inode_unlock(parent->d_inode);
 		simple_release_fs(&tracefs_mount, &tracefs_mount_count);
 	}
 
@@ -351,7 +351,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent)
 
 static struct dentry *failed_creating(struct dentry *dentry)
 {
-	mutex_unlock(&dentry->d_parent->d_inode->i_mutex);
+	inode_unlock(dentry->d_parent->d_inode);
 	dput(dentry);
 	simple_release_fs(&tracefs_mount, &tracefs_mount_count);
 	return NULL;
@@ -359,7 +359,7 @@ static struct dentry *failed_creating(struct dentry *dentry)
 
 static struct dentry *end_creating(struct dentry *dentry)
 {
-	mutex_unlock(&dentry->d_parent->d_inode->i_mutex);
+	inode_unlock(dentry->d_parent->d_inode);
 	return dentry;
 }
 
@@ -544,9 +544,9 @@ void tracefs_remove(struct dentry *dentry)
 	if (!parent || !parent->d_inode)
 		return;
 
-	mutex_lock(&parent->d_inode->i_mutex);
+	inode_lock(parent->d_inode);
 	ret = __tracefs_remove(dentry, parent);
-	mutex_unlock(&parent->d_inode->i_mutex);
+	inode_unlock(parent->d_inode);
 	if (!ret)
 		simple_release_fs(&tracefs_mount, &tracefs_mount_count);
 }
@@ -572,7 +572,7 @@ void tracefs_remove_recursive(struct dentry *dentry)
 
 	parent = dentry;
  down:
-	mutex_lock(&parent->d_inode->i_mutex);
+	inode_lock(parent->d_inode);
  loop:
 	/*
 	 * The parent->d_subdirs is protected by the d_lock. Outside that
@@ -587,7 +587,7 @@ void tracefs_remove_recursive(struct dentry *dentry)
 		/* perhaps simple_empty(child) makes more sense */
 		if (!list_empty(&child->d_subdirs)) {
 			spin_unlock(&parent->d_lock);
-			mutex_unlock(&parent->d_inode->i_mutex);
+			inode_unlock(parent->d_inode);
 			parent = child;
 			goto down;
 		}
@@ -608,10 +608,10 @@ void tracefs_remove_recursive(struct dentry *dentry)
 	}
 	spin_unlock(&parent->d_lock);
 
-	mutex_unlock(&parent->d_inode->i_mutex);
+	inode_unlock(parent->d_inode);
 	child = parent;
 	parent = parent->d_parent;
-	mutex_lock(&parent->d_inode->i_mutex);
+	inode_lock(parent->d_inode);
 
 	if (child != dentry)
 		/* go up */
@@ -619,7 +619,7 @@ void tracefs_remove_recursive(struct dentry *dentry)
 
 	if (!__tracefs_remove(child, parent))
 		simple_release_fs(&tracefs_mount, &tracefs_mount_count);
-	mutex_unlock(&parent->d_inode->i_mutex);
+	inode_unlock(parent->d_inode);
 }
 
 /**
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index e49bd2808..795992a83 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -515,8 +515,8 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir,
 	dbg_gen("dent '%pd' to ino %lu (nlink %d) in dir ino %lu",
 		dentry, inode->i_ino,
 		inode->i_nlink, dir->i_ino);
-	ubifs_assert(mutex_is_locked(&dir->i_mutex));
-	ubifs_assert(mutex_is_locked(&inode->i_mutex));
+	ubifs_assert(inode_is_locked(dir));
+	ubifs_assert(inode_is_locked(inode));
 
 	err = dbg_check_synced_i_size(c, inode);
 	if (err)
@@ -572,8 +572,8 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry)
 	dbg_gen("dent '%pd' from ino %lu (nlink %d) in dir ino %lu",
 		dentry, inode->i_ino,
 		inode->i_nlink, dir->i_ino);
-	ubifs_assert(mutex_is_locked(&dir->i_mutex));
-	ubifs_assert(mutex_is_locked(&inode->i_mutex));
+	ubifs_assert(inode_is_locked(dir));
+	ubifs_assert(inode_is_locked(inode));
 	err = dbg_check_synced_i_size(c, inode);
 	if (err)
 		return err;
@@ -661,8 +661,8 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry)
 
 	dbg_gen("directory '%pd', ino %lu in dir ino %lu", dentry,
 		inode->i_ino, dir->i_ino);
-	ubifs_assert(mutex_is_locked(&dir->i_mutex));
-	ubifs_assert(mutex_is_locked(&inode->i_mutex));
+	ubifs_assert(inode_is_locked(dir));
+	ubifs_assert(inode_is_locked(inode));
 	err = check_dir_empty(c, d_inode(dentry));
 	if (err)
 		return err;
@@ -996,10 +996,10 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry,
 	dbg_gen("dent '%pd' ino %lu in dir ino %lu to dent '%pd' in dir ino %lu",
 		old_dentry, old_inode->i_ino, old_dir->i_ino,
 		new_dentry, new_dir->i_ino);
-	ubifs_assert(mutex_is_locked(&old_dir->i_mutex));
-	ubifs_assert(mutex_is_locked(&new_dir->i_mutex));
+	ubifs_assert(inode_is_locked(old_dir));
+	ubifs_assert(inode_is_locked(new_dir));
 	if (unlink)
-		ubifs_assert(mutex_is_locked(&new_inode->i_mutex));
+		ubifs_assert(inode_is_locked(new_inode));
 
 
 	if (unlink && is_dir) {
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 0edc12856..065c88f8e 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1317,7 +1317,7 @@ int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 	err = filemap_write_and_wait_range(inode->i_mapping, start, end);
 	if (err)
 		return err;
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	/* Synchronize the inode unless this is a 'datasync()' call. */
 	if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) {
@@ -1332,7 +1332,7 @@ int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 	 */
 	err = ubifs_sync_wbufs_by_inode(c, inode);
 out:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	return err;
 }
 
@@ -1608,7 +1608,7 @@ const struct inode_operations ubifs_file_inode_operations = {
 
 const struct inode_operations ubifs_symlink_inode_operations = {
 	.readlink    = generic_readlink,
-	.follow_link = simple_follow_link,
+	.get_link    = simple_get_link,
 	.setattr     = ubifs_setattr,
 	.getattr     = ubifs_getattr,
 	.setxattr    = ubifs_setxattr,
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index 92a8491a8..c0a95e393 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -34,6 +34,12 @@
  * node. We use "r5" hash borrowed from reiserfs.
  */
 
+/*
+ * Lots of the key helpers require a struct ubifs_info *c as the first parameter.
+ * But we are not using it at all currently. That's designed for future extensions of
+ * different c->key_format. But right now, there is only one key type, UBIFS_SIMPLE_KEY_FMT.
+ */
+
 #ifndef __UBIFS_KEY_H__
 #define __UBIFS_KEY_H__
 
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 1fd90c079..a233ba913 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -2248,8 +2248,8 @@ static int __init ubifs_init(void)
 
 	ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab",
 				sizeof(struct ubifs_inode), 0,
-				SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT,
-				&inode_slab_ctor);
+				SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT |
+				SLAB_ACCOUNT, &inode_slab_ctor);
 	if (!ubifs_inode_slab)
 		return -ENOMEM;
 
diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c
index e8b01b721..c7f4d434d 100644
--- a/fs/ubifs/xattr.c
+++ b/fs/ubifs/xattr.c
@@ -267,7 +267,7 @@ static int check_namespace(const struct qstr *nm)
 
 	if (!strncmp(nm->name, XATTR_TRUSTED_PREFIX,
 		     XATTR_TRUSTED_PREFIX_LEN)) {
-		if (nm->name[sizeof(XATTR_TRUSTED_PREFIX) - 1] == '\0')
+		if (nm->name[XATTR_TRUSTED_PREFIX_LEN] == '\0')
 			return -EINVAL;
 		type = TRUSTED_XATTR;
 	} else if (!strncmp(nm->name, XATTR_USER_PREFIX,
@@ -277,7 +277,7 @@ static int check_namespace(const struct qstr *nm)
 		type = USER_XATTR;
 	} else if (!strncmp(nm->name, XATTR_SECURITY_PREFIX,
 				     XATTR_SECURITY_PREFIX_LEN)) {
-		if (nm->name[sizeof(XATTR_SECURITY_PREFIX) - 1] == '\0')
+		if (nm->name[XATTR_SECURITY_PREFIX_LEN] == '\0')
 			return -EINVAL;
 		type = SECURITY_XATTR;
 	} else
@@ -313,7 +313,7 @@ static int setxattr(struct inode *host, const char *name, const void *value,
 	union ubifs_key key;
 	int err, type;
 
-	ubifs_assert(mutex_is_locked(&host->i_mutex));
+	ubifs_assert(inode_is_locked(host));
 
 	if (size > UBIFS_MAX_INO_DATA)
 		return -ERANGE;
@@ -550,7 +550,7 @@ int ubifs_removexattr(struct dentry *dentry, const char *name)
 
 	dbg_gen("xattr '%s', ino %lu ('%pd')", name,
 		host->i_ino, dentry);
-	ubifs_assert(mutex_is_locked(&host->i_mutex));
+	ubifs_assert(inode_is_locked(host));
 
 	err = check_namespace(&nm);
 	if (err < 0)
diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c
index 6d6a96b4e..e0fd65fe7 100644
--- a/fs/udf/balloc.c
+++ b/fs/udf/balloc.c
@@ -447,9 +447,6 @@ static void udf_table_free_blocks(struct super_block *sb,
 		 */
 
 		int adsize;
-		struct short_ad *sad = NULL;
-		struct long_ad *lad = NULL;
-		struct allocExtDesc *aed;
 
 		eloc.logicalBlockNum = start;
 		elen = EXT_RECORDED_ALLOCATED |
@@ -466,102 +463,17 @@ static void udf_table_free_blocks(struct super_block *sb,
 		}
 
 		if (epos.offset + (2 * adsize) > sb->s_blocksize) {
-			unsigned char *sptr, *dptr;
-			int loffset;
-
-			brelse(oepos.bh);
-			oepos = epos;
-
 			/* Steal a block from the extent being free'd */
-			epos.block.logicalBlockNum = eloc.logicalBlockNum;
+			udf_setup_indirect_aext(table, eloc.logicalBlockNum,
+						&epos);
+
 			eloc.logicalBlockNum++;
 			elen -= sb->s_blocksize;
-
-			epos.bh = udf_tread(sb,
-					udf_get_lb_pblock(sb, &epos.block, 0));
-			if (!epos.bh) {
-				brelse(oepos.bh);
-				goto error_return;
-			}
-			aed = (struct allocExtDesc *)(epos.bh->b_data);
-			aed->previousAllocExtLocation =
-				cpu_to_le32(oepos.block.logicalBlockNum);
-			if (epos.offset + adsize > sb->s_blocksize) {
-				loffset = epos.offset;
-				aed->lengthAllocDescs = cpu_to_le32(adsize);
-				sptr = iinfo->i_ext.i_data + epos.offset
-								- adsize;
-				dptr = epos.bh->b_data +
-					sizeof(struct allocExtDesc);
-				memcpy(dptr, sptr, adsize);
-				epos.offset = sizeof(struct allocExtDesc) +
-						adsize;
-			} else {
-				loffset = epos.offset + adsize;
-				aed->lengthAllocDescs = cpu_to_le32(0);
-				if (oepos.bh) {
-					sptr = oepos.bh->b_data + epos.offset;
-					aed = (struct allocExtDesc *)
-						oepos.bh->b_data;
-					le32_add_cpu(&aed->lengthAllocDescs,
-							adsize);
-				} else {
-					sptr = iinfo->i_ext.i_data +
-								epos.offset;
-					iinfo->i_lenAlloc += adsize;
-					mark_inode_dirty(table);
-				}
-				epos.offset = sizeof(struct allocExtDesc);
-			}
-			if (sbi->s_udfrev >= 0x0200)
-				udf_new_tag(epos.bh->b_data, TAG_IDENT_AED,
-					    3, 1, epos.block.logicalBlockNum,
-					    sizeof(struct tag));
-			else
-				udf_new_tag(epos.bh->b_data, TAG_IDENT_AED,
-					    2, 1, epos.block.logicalBlockNum,
-					    sizeof(struct tag));
-
-			switch (iinfo->i_alloc_type) {
-			case ICBTAG_FLAG_AD_SHORT:
-				sad = (struct short_ad *)sptr;
-				sad->extLength = cpu_to_le32(
-					EXT_NEXT_EXTENT_ALLOCDECS |
-					sb->s_blocksize);
-				sad->extPosition =
-					cpu_to_le32(epos.block.logicalBlockNum);
-				break;
-			case ICBTAG_FLAG_AD_LONG:
-				lad = (struct long_ad *)sptr;
-				lad->extLength = cpu_to_le32(
-					EXT_NEXT_EXTENT_ALLOCDECS |
-					sb->s_blocksize);
-				lad->extLocation =
-					cpu_to_lelb(epos.block);
-				break;
-			}
-			if (oepos.bh) {
-				udf_update_tag(oepos.bh->b_data, loffset);
-				mark_buffer_dirty(oepos.bh);
-			} else {
-				mark_inode_dirty(table);
-			}
 		}
 
 		/* It's possible that stealing the block emptied the extent */
-		if (elen) {
-			udf_write_aext(table, &epos, &eloc, elen, 1);
-
-			if (!epos.bh) {
-				iinfo->i_lenAlloc += adsize;
-				mark_inode_dirty(table);
-			} else {
-				aed = (struct allocExtDesc *)epos.bh->b_data;
-				le32_add_cpu(&aed->lengthAllocDescs, adsize);
-				udf_update_tag(epos.bh->b_data, epos.offset);
-				mark_buffer_dirty(epos.bh);
-			}
-		}
+		if (elen)
+			__udf_add_aext(table, &epos, &eloc, elen, 1);
 	}
 
 	brelse(epos.bh);
diff --git a/fs/udf/file.c b/fs/udf/file.c
index bddf3d071..1af98963d 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -122,7 +122,7 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 	struct udf_inode_info *iinfo = UDF_I(inode);
 	int err;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 
 	retval = generic_write_checks(iocb, from);
 	if (retval <= 0)
@@ -136,7 +136,7 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 				(udf_file_entry_alloc_offset(inode) + end)) {
 			err = udf_expand_file_adinicb(inode);
 			if (err) {
-				mutex_unlock(&inode->i_mutex);
+				inode_unlock(inode);
 				udf_debug("udf_expand_adinicb: err=%d\n", err);
 				return err;
 			}
@@ -149,7 +149,7 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
 
 	retval = __generic_file_write_iter(iocb, from);
 out:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 
 	if (retval > 0) {
 		mark_inode_dirty(inode);
@@ -223,12 +223,12 @@ static int udf_release_file(struct inode *inode, struct file *filp)
 		 * Grab i_mutex to avoid races with writes changing i_size
 		 * while we are running.
 		 */
-		mutex_lock(&inode->i_mutex);
+		inode_lock(inode);
 		down_write(&UDF_I(inode)->i_data_sem);
 		udf_discard_prealloc(inode);
 		udf_truncate_tail_extent(inode);
 		up_write(&UDF_I(inode)->i_data_sem);
-		mutex_unlock(&inode->i_mutex);
+		inode_unlock(inode);
 	}
 	return 0;
 }
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 566df9b5a..166d3ed32 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -262,7 +262,7 @@ int udf_expand_file_adinicb(struct inode *inode)
 		.nr_to_write = 1,
 	};
 
-	WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex));
+	WARN_ON_ONCE(!inode_is_locked(inode));
 	if (!iinfo->i_lenAlloc) {
 		if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD))
 			iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT;
@@ -539,9 +539,18 @@ static int udf_do_extend_file(struct inode *inode,
 		udf_add_aext(inode, last_pos, &last_ext->extLocation,
 			     last_ext->extLength, 1);
 		count++;
-	} else
+	} else {
+		struct kernel_lb_addr tmploc;
+		uint32_t tmplen;
+
 		udf_write_aext(inode, last_pos, &last_ext->extLocation,
 				last_ext->extLength, 1);
+		/*
+		 * We've rewritten the last extent but there may be empty
+		 * indirect extent after it - enter it.
+		 */
+		udf_next_aext(inode, last_pos, &tmploc, &tmplen, 0);
+	}
 
 	/* Managed to do everything necessary? */
 	if (!blocks)
@@ -1540,7 +1549,8 @@ reread:
 		break;
 	case ICBTAG_FILE_TYPE_SYMLINK:
 		inode->i_data.a_ops = &udf_symlink_aops;
-		inode->i_op = &udf_symlink_inode_operations;
+		inode->i_op = &page_symlink_inode_operations;
+		inode_nohighmem(inode);
 		inode->i_mode = S_IFLNK | S_IRWXUGO;
 		break;
 	case ICBTAG_FILE_TYPE_MAIN:
@@ -1866,22 +1876,90 @@ struct inode *__udf_iget(struct super_block *sb, struct kernel_lb_addr *ino,
 	return inode;
 }
 
-int udf_add_aext(struct inode *inode, struct extent_position *epos,
-		 struct kernel_lb_addr *eloc, uint32_t elen, int inc)
+int udf_setup_indirect_aext(struct inode *inode, int block,
+			    struct extent_position *epos)
 {
-	int adsize;
-	struct short_ad *sad = NULL;
-	struct long_ad *lad = NULL;
+	struct super_block *sb = inode->i_sb;
+	struct buffer_head *bh;
 	struct allocExtDesc *aed;
-	uint8_t *ptr;
-	struct udf_inode_info *iinfo = UDF_I(inode);
+	struct extent_position nepos;
+	struct kernel_lb_addr neloc;
+	int ver, adsize;
 
-	if (!epos->bh)
-		ptr = iinfo->i_ext.i_data + epos->offset -
-			udf_file_entry_alloc_offset(inode) +
-			iinfo->i_lenEAttr;
+	if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
+		adsize = sizeof(struct short_ad);
+	else if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_LONG)
+		adsize = sizeof(struct long_ad);
 	else
-		ptr = epos->bh->b_data + epos->offset;
+		return -EIO;
+
+	neloc.logicalBlockNum = block;
+	neloc.partitionReferenceNum = epos->block.partitionReferenceNum;
+
+	bh = udf_tgetblk(sb, udf_get_lb_pblock(sb, &neloc, 0));
+	if (!bh)
+		return -EIO;
+	lock_buffer(bh);
+	memset(bh->b_data, 0x00, sb->s_blocksize);
+	set_buffer_uptodate(bh);
+	unlock_buffer(bh);
+	mark_buffer_dirty_inode(bh, inode);
+
+	aed = (struct allocExtDesc *)(bh->b_data);
+	if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT)) {
+		aed->previousAllocExtLocation =
+				cpu_to_le32(epos->block.logicalBlockNum);
+	}
+	aed->lengthAllocDescs = cpu_to_le32(0);
+	if (UDF_SB(sb)->s_udfrev >= 0x0200)
+		ver = 3;
+	else
+		ver = 2;
+	udf_new_tag(bh->b_data, TAG_IDENT_AED, ver, 1, block,
+		    sizeof(struct tag));
+
+	nepos.block = neloc;
+	nepos.offset = sizeof(struct allocExtDesc);
+	nepos.bh = bh;
+
+	/*
+	 * Do we have to copy current last extent to make space for indirect
+	 * one?
+	 */
+	if (epos->offset + adsize > sb->s_blocksize) {
+		struct kernel_lb_addr cp_loc;
+		uint32_t cp_len;
+		int cp_type;
+
+		epos->offset -= adsize;
+		cp_type = udf_current_aext(inode, epos, &cp_loc, &cp_len, 0);
+		cp_len |= ((uint32_t)cp_type) << 30;
+
+		__udf_add_aext(inode, &nepos, &cp_loc, cp_len, 1);
+		udf_write_aext(inode, epos, &nepos.block,
+			       sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDECS, 0);
+	} else {
+		__udf_add_aext(inode, epos, &nepos.block,
+			       sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDECS, 0);
+	}
+
+	brelse(epos->bh);
+	*epos = nepos;
+
+	return 0;
+}
+
+/*
+ * Append extent at the given position - should be the first free one in inode
+ * / indirect extent. This function assumes there is enough space in the inode
+ * or indirect extent. Use udf_add_aext() if you didn't check for this before.
+ */
+int __udf_add_aext(struct inode *inode, struct extent_position *epos,
+		   struct kernel_lb_addr *eloc, uint32_t elen, int inc)
+{
+	struct udf_inode_info *iinfo = UDF_I(inode);
+	struct allocExtDesc *aed;
+	int adsize;
 
 	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
 		adsize = sizeof(struct short_ad);
@@ -1890,88 +1968,14 @@ int udf_add_aext(struct inode *inode, struct extent_position *epos,
 	else
 		return -EIO;
 
-	if (epos->offset + (2 * adsize) > inode->i_sb->s_blocksize) {
-		unsigned char *sptr, *dptr;
-		struct buffer_head *nbh;
-		int err, loffset;
-		struct kernel_lb_addr obloc = epos->block;
-
-		epos->block.logicalBlockNum = udf_new_block(inode->i_sb, NULL,
-						obloc.partitionReferenceNum,
-						obloc.logicalBlockNum, &err);
-		if (!epos->block.logicalBlockNum)
-			return -ENOSPC;
-		nbh = udf_tgetblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb,
-								 &epos->block,
-								 0));
-		if (!nbh)
-			return -EIO;
-		lock_buffer(nbh);
-		memset(nbh->b_data, 0x00, inode->i_sb->s_blocksize);
-		set_buffer_uptodate(nbh);
-		unlock_buffer(nbh);
-		mark_buffer_dirty_inode(nbh, inode);
-
-		aed = (struct allocExtDesc *)(nbh->b_data);
-		if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT))
-			aed->previousAllocExtLocation =
-					cpu_to_le32(obloc.logicalBlockNum);
-		if (epos->offset + adsize > inode->i_sb->s_blocksize) {
-			loffset = epos->offset;
-			aed->lengthAllocDescs = cpu_to_le32(adsize);
-			sptr = ptr - adsize;
-			dptr = nbh->b_data + sizeof(struct allocExtDesc);
-			memcpy(dptr, sptr, adsize);
-			epos->offset = sizeof(struct allocExtDesc) + adsize;
-		} else {
-			loffset = epos->offset + adsize;
-			aed->lengthAllocDescs = cpu_to_le32(0);
-			sptr = ptr;
-			epos->offset = sizeof(struct allocExtDesc);
-
-			if (epos->bh) {
-				aed = (struct allocExtDesc *)epos->bh->b_data;
-				le32_add_cpu(&aed->lengthAllocDescs, adsize);
-			} else {
-				iinfo->i_lenAlloc += adsize;
-				mark_inode_dirty(inode);
-			}
-		}
-		if (UDF_SB(inode->i_sb)->s_udfrev >= 0x0200)
-			udf_new_tag(nbh->b_data, TAG_IDENT_AED, 3, 1,
-				    epos->block.logicalBlockNum, sizeof(struct tag));
-		else
-			udf_new_tag(nbh->b_data, TAG_IDENT_AED, 2, 1,
-				    epos->block.logicalBlockNum, sizeof(struct tag));
-		switch (iinfo->i_alloc_type) {
-		case ICBTAG_FLAG_AD_SHORT:
-			sad = (struct short_ad *)sptr;
-			sad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS |
-						     inode->i_sb->s_blocksize);
-			sad->extPosition =
-				cpu_to_le32(epos->block.logicalBlockNum);
-			break;
-		case ICBTAG_FLAG_AD_LONG:
-			lad = (struct long_ad *)sptr;
-			lad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS |
-						     inode->i_sb->s_blocksize);
-			lad->extLocation = cpu_to_lelb(epos->block);
-			memset(lad->impUse, 0x00, sizeof(lad->impUse));
-			break;
-		}
-		if (epos->bh) {
-			if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) ||
-			    UDF_SB(inode->i_sb)->s_udfrev >= 0x0201)
-				udf_update_tag(epos->bh->b_data, loffset);
-			else
-				udf_update_tag(epos->bh->b_data,
-						sizeof(struct allocExtDesc));
-			mark_buffer_dirty_inode(epos->bh, inode);
-			brelse(epos->bh);
-		} else {
-			mark_inode_dirty(inode);
-		}
-		epos->bh = nbh;
+	if (!epos->bh) {
+		WARN_ON(iinfo->i_lenAlloc !=
+			epos->offset - udf_file_entry_alloc_offset(inode));
+	} else {
+		aed = (struct allocExtDesc *)epos->bh->b_data;
+		WARN_ON(le32_to_cpu(aed->lengthAllocDescs) !=
+			epos->offset - sizeof(struct allocExtDesc));
+		WARN_ON(epos->offset + adsize > inode->i_sb->s_blocksize);
 	}
 
 	udf_write_aext(inode, epos, eloc, elen, inc);
@@ -1995,6 +1999,41 @@ int udf_add_aext(struct inode *inode, struct extent_position *epos,
 	return 0;
 }
 
+/*
+ * Append extent at given position - should be the first free one in inode
+ * / indirect extent. Takes care of allocating and linking indirect blocks.
+ */
+int udf_add_aext(struct inode *inode, struct extent_position *epos,
+		 struct kernel_lb_addr *eloc, uint32_t elen, int inc)
+{
+	int adsize;
+	struct super_block *sb = inode->i_sb;
+
+	if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_SHORT)
+		adsize = sizeof(struct short_ad);
+	else if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_LONG)
+		adsize = sizeof(struct long_ad);
+	else
+		return -EIO;
+
+	if (epos->offset + (2 * adsize) > sb->s_blocksize) {
+		int err;
+		int new_block;
+
+		new_block = udf_new_block(sb, NULL,
+					  epos->block.partitionReferenceNum,
+					  epos->block.logicalBlockNum, &err);
+		if (!new_block)
+			return -ENOSPC;
+
+		err = udf_setup_indirect_aext(inode, new_block, epos);
+		if (err)
+			return err;
+	}
+
+	return __udf_add_aext(inode, epos, eloc, elen, inc);
+}
+
 void udf_write_aext(struct inode *inode, struct extent_position *epos,
 		    struct kernel_lb_addr *eloc, uint32_t elen, int inc)
 {
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index c97b5a8d1..42eafb91f 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -921,7 +921,8 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
 	}
 
 	inode->i_data.a_ops = &udf_symlink_aops;
-	inode->i_op = &udf_symlink_inode_operations;
+	inode->i_op = &page_symlink_inode_operations;
+	inode_nohighmem(inode);
 
 	if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
 		struct kernel_lb_addr eloc;
@@ -1344,8 +1345,3 @@ const struct inode_operations udf_dir_inode_operations = {
 	.rename				= udf_rename,
 	.tmpfile			= udf_tmpfile,
 };
-const struct inode_operations udf_symlink_inode_operations = {
-	.readlink	= generic_readlink,
-	.follow_link	= page_follow_link_light,
-	.put_link	= page_put_link,
-};
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 81155b9b4..a522c15a0 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -179,7 +179,8 @@ static int __init init_inodecache(void)
 	udf_inode_cachep = kmem_cache_create("udf_inode_cache",
 					     sizeof(struct udf_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT |
-						 SLAB_MEM_SPREAD),
+						 SLAB_MEM_SPREAD |
+						 SLAB_ACCOUNT),
 					     init_once);
 	if (!udf_inode_cachep)
 		return -ENOMEM;
@@ -278,17 +279,12 @@ static void udf_sb_free_bitmap(struct udf_bitmap *bitmap)
 {
 	int i;
 	int nr_groups = bitmap->s_nr_groups;
-	int size = sizeof(struct udf_bitmap) + (sizeof(struct buffer_head *) *
-						nr_groups);
 
 	for (i = 0; i < nr_groups; i++)
 		if (bitmap->s_block_bitmap[i])
 			brelse(bitmap->s_block_bitmap[i]);
 
-	if (size <= PAGE_SIZE)
-		kfree(bitmap);
-	else
-		vfree(bitmap);
+	kvfree(bitmap);
 }
 
 static void udf_free_partition(struct udf_part_map *map)
@@ -1586,6 +1582,13 @@ static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_
 }
 
 /*
+ * Maximum number of Terminating Descriptor redirections. The chosen number is
+ * arbitrary - just that we hopefully don't limit any real use of rewritten
+ * inode on write-once media but avoid looping for too long on corrupted media.
+ */
+#define UDF_MAX_TD_NESTING 64
+
+/*
  * Process a main/reserve volume descriptor sequence.
  *   @block		First block of first extent of the sequence.
  *   @lastblock		Lastblock of first extent of the sequence.
@@ -1609,6 +1612,7 @@ static noinline int udf_process_sequence(
 	uint16_t ident;
 	long next_s = 0, next_e = 0;
 	int ret;
+	unsigned int indirections = 0;
 
 	memset(vds, 0, sizeof(struct udf_vds_record) * VDS_POS_LENGTH);
 
@@ -1679,6 +1683,12 @@ static noinline int udf_process_sequence(
 			}
 			break;
 		case TAG_IDENT_TD: /* ISO 13346 3/10.9 */
+			if (++indirections > UDF_MAX_TD_NESTING) {
+				udf_err(sb, "too many TDs (max %u supported)\n", UDF_MAX_TD_NESTING);
+				brelse(bh);
+				return -EIO;
+			}
+
 			vds[VDS_POS_TERMINATING_DESC].block = block;
 			if (next_e) {
 				block = next_s;
diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c
index 862535b3b..8d6197730 100644
--- a/fs/udf/symlink.c
+++ b/fs/udf/symlink.c
@@ -107,7 +107,7 @@ static int udf_symlink_filler(struct file *file, struct page *page)
 	struct buffer_head *bh = NULL;
 	unsigned char *symlink;
 	int err;
-	unsigned char *p = kmap(page);
+	unsigned char *p = page_address(page);
 	struct udf_inode_info *iinfo;
 	uint32_t pos;
 
@@ -141,7 +141,6 @@ static int udf_symlink_filler(struct file *file, struct page *page)
 
 	up_read(&iinfo->i_data_sem);
 	SetPageUptodate(page);
-	kunmap(page);
 	unlock_page(page);
 	return 0;
 
@@ -149,7 +148,6 @@ out_unlock_inode:
 	up_read(&iinfo->i_data_sem);
 	SetPageError(page);
 out_unmap:
-	kunmap(page);
 	unlock_page(page);
 	return err;
 }
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 47bb3f5ca..fa0044b6b 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -85,7 +85,6 @@ extern const struct inode_operations udf_dir_inode_operations;
 extern const struct file_operations udf_dir_operations;
 extern const struct inode_operations udf_file_inode_operations;
 extern const struct file_operations udf_file_operations;
-extern const struct inode_operations udf_symlink_inode_operations;
 extern const struct address_space_operations udf_aops;
 extern const struct address_space_operations udf_adinicb_aops;
 extern const struct address_space_operations udf_symlink_aops;
@@ -159,6 +158,10 @@ extern int udf_write_inode(struct inode *, struct writeback_control *wbc);
 extern long udf_block_map(struct inode *, sector_t);
 extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *,
 			 struct kernel_lb_addr *, uint32_t *, sector_t *);
+extern int udf_setup_indirect_aext(struct inode *inode, int block,
+				   struct extent_position *epos);
+extern int __udf_add_aext(struct inode *inode, struct extent_position *epos,
+			  struct kernel_lb_addr *eloc, uint32_t elen, int inc);
 extern int udf_add_aext(struct inode *, struct extent_position *,
 			struct kernel_lb_addr *, uint32_t, int);
 extern void udf_write_aext(struct inode *, struct extent_position *,
diff --git a/fs/ufs/Makefile b/fs/ufs/Makefile
index 392db25c0..ec4a6b49f 100644
--- a/fs/ufs/Makefile
+++ b/fs/ufs/Makefile
@@ -5,5 +5,5 @@
 obj-$(CONFIG_UFS_FS) += ufs.o
 
 ufs-objs := balloc.o cylinder.o dir.o file.o ialloc.o inode.o \
-	    namei.o super.o symlink.o util.o
+	    namei.o super.o util.o
 ccflags-$(CONFIG_UFS_DEBUG)    += -DDEBUG
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index a064cf44b..d897e169a 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -528,11 +528,12 @@ static void ufs_set_inode_ops(struct inode *inode)
 		inode->i_mapping->a_ops = &ufs_aops;
 	} else if (S_ISLNK(inode->i_mode)) {
 		if (!inode->i_blocks) {
-			inode->i_op = &ufs_fast_symlink_inode_operations;
 			inode->i_link = (char *)UFS_I(inode)->i_u1.i_symlink;
+			inode->i_op = &simple_symlink_inode_operations;
 		} else {
-			inode->i_op = &ufs_symlink_inode_operations;
 			inode->i_mapping->a_ops = &ufs_aops;
+			inode->i_op = &page_symlink_inode_operations;
+			inode_nohighmem(inode);
 		}
 	} else
 		init_special_inode(inode, inode->i_mode,
diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c
index 479665543..acf4a3b61 100644
--- a/fs/ufs/namei.c
+++ b/fs/ufs/namei.c
@@ -123,14 +123,15 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry,
 
 	if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) {
 		/* slow symlink */
-		inode->i_op = &ufs_symlink_inode_operations;
+		inode->i_op = &page_symlink_inode_operations;
+		inode_nohighmem(inode);
 		inode->i_mapping->a_ops = &ufs_aops;
 		err = page_symlink(inode, symname, l);
 		if (err)
 			goto out_fail;
 	} else {
 		/* fast symlink */
-		inode->i_op = &ufs_fast_symlink_inode_operations;
+		inode->i_op = &simple_symlink_inode_operations;
 		inode->i_link = (char *)UFS_I(inode)->i_u1.i_symlink;
 		memcpy(inode->i_link, symname, l);
 		inode->i_size = l-1;
diff --git a/fs/ufs/super.c b/fs/ufs/super.c
index f6390eec0..442fd52eb 100644
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1427,7 +1427,7 @@ static int __init init_inodecache(void)
 	ufs_inode_cachep = kmem_cache_create("ufs_inode_cache",
 					     sizeof(struct ufs_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (ufs_inode_cachep == NULL)
 		return -ENOMEM;
diff --git a/fs/ufs/symlink.c b/fs/ufs/symlink.c
deleted file mode 100644
index 874480bb4..000000000
--- a/fs/ufs/symlink.c
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- *  linux/fs/ufs/symlink.c
- *
- * Only fast symlinks left here - the rest is done by generic code. AV, 1999
- *
- * Copyright (C) 1998
- * Daniel Pirkl <daniel.pirkl@emai.cz>
- * Charles University, Faculty of Mathematics and Physics
- *
- *  from
- *
- *  linux/fs/ext2/symlink.c
- *
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- *
- *  from
- *
- *  linux/fs/minix/symlink.c
- *
- *  Copyright (C) 1991, 1992  Linus Torvalds
- *
- *  ext2 symlink handling code
- */
-
-#include "ufs_fs.h"
-#include "ufs.h"
-
-const struct inode_operations ufs_fast_symlink_inode_operations = {
-	.readlink	= generic_readlink,
-	.follow_link	= simple_follow_link,
-	.setattr	= ufs_setattr,
-};
-
-const struct inode_operations ufs_symlink_inode_operations = {
-	.readlink	= generic_readlink,
-	.follow_link	= page_follow_link_light,
-	.put_link	= page_put_link,
-	.setattr	= ufs_setattr,
-};
diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h
index 7da4aca86..c87f4c3fa 100644
--- a/fs/ufs/ufs.h
+++ b/fs/ufs/ufs.h
@@ -136,10 +136,6 @@ extern __printf(3, 4)
 void ufs_panic(struct super_block *, const char *, const char *, ...);
 void ufs_mark_sb_dirty(struct super_block *sb);
 
-/* symlink.c */
-extern const struct inode_operations ufs_fast_symlink_inode_operations;
-extern const struct inode_operations ufs_symlink_inode_operations;
-
 static inline struct ufs_sb_info *UFS_SB(struct super_block *sb)
 {
 	return sb->s_fs_info;
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index 503117031..66cdb4461 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -287,6 +287,12 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address,
 		goto out;
 
 	/*
+	 * We don't do userfault handling for the final child pid update.
+	 */
+	if (current->flags & PF_EXITING)
+		goto out;
+
+	/*
 	 * Check that we can return VM_FAULT_RETRY.
 	 *
 	 * NOTE: it should become possible to return VM_FAULT_RETRY
diff --git a/fs/utimes.c b/fs/utimes.c
index aa138d645..85c40f4f3 100644
--- a/fs/utimes.c
+++ b/fs/utimes.c
@@ -103,9 +103,9 @@ static int utimes_common(struct path *path, struct timespec *times)
 		}
 	}
 retry_deleg:
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 	error = notify_change(path->dentry, &newattrs, &delegated_inode);
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	if (delegated_inode) {
 		error = break_deleg_wait(&delegated_inode);
 		if (!error)
diff --git a/fs/xattr.c b/fs/xattr.c
index 0c317c4fd..4861322e2 100644
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -129,7 +129,7 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 	if (error)
 		return error;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 	error = security_inode_setxattr(dentry, name, value, size, flags);
 	if (error)
 		goto out;
@@ -137,7 +137,7 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value,
 	error = __vfs_setxattr_noperm(dentry, name, value, size, flags);
 
 out:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	return error;
 }
 EXPORT_SYMBOL_GPL(vfs_setxattr);
@@ -207,26 +207,6 @@ vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value,
 	*xattr_value = value;
 	return error;
 }
-EXPORT_SYMBOL_GPL(vfs_getxattr_alloc);
-
-/* Compare an extended attribute value with the given value */
-int vfs_xattr_cmp(struct dentry *dentry, const char *xattr_name,
-		  const char *value, size_t size, gfp_t flags)
-{
-	char *xattr_value = NULL;
-	int rc;
-
-	rc = vfs_getxattr_alloc(dentry, xattr_name, &xattr_value, 0, flags);
-	if (rc < 0)
-		return rc;
-
-	if ((rc != size) || (memcmp(xattr_value, value, rc) != 0))
-		rc = -EINVAL;
-	else
-		rc = 0;
-	kfree(xattr_value);
-	return rc;
-}
 
 ssize_t
 vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size)
@@ -297,7 +277,7 @@ vfs_removexattr(struct dentry *dentry, const char *name)
 	if (error)
 		return error;
 
-	mutex_lock(&inode->i_mutex);
+	inode_lock(inode);
 	error = security_inode_removexattr(dentry, name);
 	if (error)
 		goto out;
@@ -310,7 +290,7 @@ vfs_removexattr(struct dentry *dentry, const char *name)
 	}
 
 out:
-	mutex_unlock(&inode->i_mutex);
+	inode_unlock(inode);
 	return error;
 }
 EXPORT_SYMBOL_GPL(vfs_removexattr);
@@ -325,7 +305,6 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value,
 {
 	int error;
 	void *kvalue = NULL;
-	void *vvalue = NULL;	/* If non-NULL, we used vmalloc() */
 	char kname[XATTR_NAME_MAX + 1];
 
 	if (flags & ~(XATTR_CREATE|XATTR_REPLACE))
@@ -342,10 +321,9 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value,
 			return -E2BIG;
 		kvalue = kmalloc(size, GFP_KERNEL | __GFP_NOWARN);
 		if (!kvalue) {
-			vvalue = vmalloc(size);
-			if (!vvalue)
+			kvalue = vmalloc(size);
+			if (!kvalue)
 				return -ENOMEM;
-			kvalue = vvalue;
 		}
 		if (copy_from_user(kvalue, value, size)) {
 			error = -EFAULT;
@@ -358,10 +336,8 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value,
 
 	error = vfs_setxattr(d, kname, kvalue, size, flags);
 out:
-	if (vvalue)
-		vfree(vvalue);
-	else
-		kfree(kvalue);
+	kvfree(kvalue);
+
 	return error;
 }
 
@@ -429,7 +405,6 @@ getxattr(struct dentry *d, const char __user *name, void __user *value,
 {
 	ssize_t error;
 	void *kvalue = NULL;
-	void *vvalue = NULL;
 	char kname[XATTR_NAME_MAX + 1];
 
 	error = strncpy_from_user(kname, name, sizeof(kname));
@@ -443,10 +418,9 @@ getxattr(struct dentry *d, const char __user *name, void __user *value,
 			size = XATTR_SIZE_MAX;
 		kvalue = kzalloc(size, GFP_KERNEL | __GFP_NOWARN);
 		if (!kvalue) {
-			vvalue = vmalloc(size);
-			if (!vvalue)
+			kvalue = vmalloc(size);
+			if (!kvalue)
 				return -ENOMEM;
-			kvalue = vvalue;
 		}
 	}
 
@@ -462,10 +436,9 @@ getxattr(struct dentry *d, const char __user *name, void __user *value,
 		   than XATTR_SIZE_MAX bytes. Not possible. */
 		error = -E2BIG;
 	}
-	if (vvalue)
-		vfree(vvalue);
-	else
-		kfree(kvalue);
+
+	kvfree(kvalue);
+
 	return error;
 }
 
@@ -522,17 +495,15 @@ listxattr(struct dentry *d, char __user *list, size_t size)
 {
 	ssize_t error;
 	char *klist = NULL;
-	char *vlist = NULL;	/* If non-NULL, we used vmalloc() */
 
 	if (size) {
 		if (size > XATTR_LIST_MAX)
 			size = XATTR_LIST_MAX;
 		klist = kmalloc(size, __GFP_NOWARN | GFP_KERNEL);
 		if (!klist) {
-			vlist = vmalloc(size);
-			if (!vlist)
+			klist = vmalloc(size);
+			if (!klist)
 				return -ENOMEM;
-			klist = vlist;
 		}
 	}
 
@@ -545,10 +516,9 @@ listxattr(struct dentry *d, char __user *list, size_t size)
 		   than XATTR_LIST_MAX bytes. Not possible. */
 		error = -E2BIG;
 	}
-	if (vlist)
-		vfree(vlist);
-	else
-		kfree(klist);
+
+	kvfree(klist);
+
 	return error;
 }
 
@@ -701,13 +671,20 @@ xattr_resolve_name(const struct xattr_handler **handlers, const char **name)
 		return NULL;
 
 	for_each_xattr_handler(handlers, handler) {
-		const char *n = strcmp_prefix(*name, handler->prefix);
+		const char *n;
+
+		n = strcmp_prefix(*name, xattr_prefix(handler));
 		if (n) {
+			if (!handler->prefix ^ !*n) {
+				if (*n)
+					continue;
+				return ERR_PTR(-EINVAL);
+			}
 			*name = n;
-			break;
+			return handler;
 		}
 	}
-	return handler;
+	return ERR_PTR(-EOPNOTSUPP);
 }
 
 /*
@@ -719,8 +696,8 @@ generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t s
 	const struct xattr_handler *handler;
 
 	handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
-	if (!handler)
-		return -EOPNOTSUPP;
+	if (IS_ERR(handler))
+		return PTR_ERR(handler);
 	return handler->get(handler, dentry, name, buffer, size);
 }
 
@@ -736,19 +713,25 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
 
 	if (!buffer) {
 		for_each_xattr_handler(handlers, handler) {
-			size += handler->list(handler, dentry, NULL, 0,
-					      NULL, 0);
+			if (!handler->name ||
+			    (handler->list && !handler->list(dentry)))
+				continue;
+			size += strlen(handler->name) + 1;
 		}
 	} else {
 		char *buf = buffer;
+		size_t len;
 
 		for_each_xattr_handler(handlers, handler) {
-			size = handler->list(handler, dentry, buf, buffer_size,
-					     NULL, 0);
-			if (size > buffer_size)
+			if (!handler->name ||
+			    (handler->list && !handler->list(dentry)))
+				continue;
+			len = strlen(handler->name);
+			if (len + 1 > buffer_size)
 				return -ERANGE;
-			buf += size;
-			buffer_size -= size;
+			memcpy(buf, handler->name, len + 1);
+			buf += len + 1;
+			buffer_size -= len + 1;
 		}
 		size = buf - buffer;
 	}
@@ -766,8 +749,8 @@ generic_setxattr(struct dentry *dentry, const char *name, const void *value, siz
 	if (size == 0)
 		value = "";  /* empty EA, do not remove */
 	handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
-	if (!handler)
-		return -EOPNOTSUPP;
+	if (IS_ERR(handler))
+		return PTR_ERR(handler);
 	return handler->set(handler, dentry, name, value, size, flags);
 }
 
@@ -781,8 +764,8 @@ generic_removexattr(struct dentry *dentry, const char *name)
 	const struct xattr_handler *handler;
 
 	handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name);
-	if (!handler)
-		return -EOPNOTSUPP;
+	if (IS_ERR(handler))
+		return PTR_ERR(handler);
 	return handler->set(handler, dentry, name, NULL, 0, XATTR_REPLACE);
 }
 
@@ -809,7 +792,7 @@ EXPORT_SYMBOL(generic_removexattr);
 const char *xattr_full_name(const struct xattr_handler *handler,
 			    const char *name)
 {
-	size_t prefix_len = strlen(handler->prefix);
+	size_t prefix_len = strlen(xattr_prefix(handler));
 
 	return name - prefix_len;
 }
@@ -864,8 +847,22 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name,
 	return ret;
 }
 
-static int __simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
-			      const void *value, size_t size, int flags)
+/**
+ * simple_xattr_set - xattr SET operation for in-memory/pseudo filesystems
+ * @xattrs: target simple_xattr list
+ * @name: name of the extended attribute
+ * @value: value of the xattr. If %NULL, will remove the attribute.
+ * @size: size of the new xattr
+ * @flags: %XATTR_{CREATE|REPLACE}
+ *
+ * If %XATTR_CREATE is set, the xattr shouldn't exist already; otherwise fails
+ * with -EEXIST.  If %XATTR_REPLACE is set, the xattr should exist;
+ * otherwise, fails with -ENODATA.
+ *
+ * Returns 0 on success, -errno on failure.
+ */
+int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
+		     const void *value, size_t size, int flags)
 {
 	struct simple_xattr *xattr;
 	struct simple_xattr *new_xattr = NULL;
@@ -915,73 +912,64 @@ out:
 
 }
 
-/**
- * simple_xattr_set - xattr SET operation for in-memory/pseudo filesystems
- * @xattrs: target simple_xattr list
- * @name: name of the new extended attribute
- * @value: value of the new xattr. If %NULL, will remove the attribute
- * @size: size of the new xattr
- * @flags: %XATTR_{CREATE|REPLACE}
- *
- * %XATTR_CREATE is set, the xattr shouldn't exist already; otherwise fails
- * with -EEXIST.  If %XATTR_REPLACE is set, the xattr should exist;
- * otherwise, fails with -ENODATA.
- *
- * Returns 0 on success, -errno on failure.
- */
-int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
-		     const void *value, size_t size, int flags)
-{
-	if (size == 0)
-		value = ""; /* empty EA, do not remove */
-	return __simple_xattr_set(xattrs, name, value, size, flags);
-}
-
-/*
- * xattr REMOVE operation for in-memory/pseudo filesystems
- */
-int simple_xattr_remove(struct simple_xattrs *xattrs, const char *name)
+static bool xattr_is_trusted(const char *name)
 {
-	return __simple_xattr_set(xattrs, name, NULL, 0, XATTR_REPLACE);
+	return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
 }
 
-static bool xattr_is_trusted(const char *name)
+static int xattr_list_one(char **buffer, ssize_t *remaining_size,
+			  const char *name)
 {
-	return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN);
+	size_t len = strlen(name) + 1;
+	if (*buffer) {
+		if (*remaining_size < len)
+			return -ERANGE;
+		memcpy(*buffer, name, len);
+		*buffer += len;
+	}
+	*remaining_size -= len;
+	return 0;
 }
 
 /*
  * xattr LIST operation for in-memory/pseudo filesystems
  */
-ssize_t simple_xattr_list(struct simple_xattrs *xattrs, char *buffer,
-			  size_t size)
+ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs,
+			  char *buffer, size_t size)
 {
 	bool trusted = capable(CAP_SYS_ADMIN);
 	struct simple_xattr *xattr;
-	size_t used = 0;
+	ssize_t remaining_size = size;
+	int err = 0;
+
+#ifdef CONFIG_FS_POSIX_ACL
+	if (inode->i_acl) {
+		err = xattr_list_one(&buffer, &remaining_size,
+				     XATTR_NAME_POSIX_ACL_ACCESS);
+		if (err)
+			return err;
+	}
+	if (inode->i_default_acl) {
+		err = xattr_list_one(&buffer, &remaining_size,
+				     XATTR_NAME_POSIX_ACL_DEFAULT);
+		if (err)
+			return err;
+	}
+#endif
 
 	spin_lock(&xattrs->lock);
 	list_for_each_entry(xattr, &xattrs->head, list) {
-		size_t len;
-
 		/* skip "trusted." attributes for unprivileged callers */
 		if (!trusted && xattr_is_trusted(xattr->name))
 			continue;
 
-		len = strlen(xattr->name) + 1;
-		used += len;
-		if (buffer) {
-			if (size < used) {
-				used = -ERANGE;
-				break;
-			}
-			memcpy(buffer, xattr->name, len);
-			buffer += len;
-		}
+		err = xattr_list_one(&buffer, &remaining_size, xattr->name);
+		if (err)
+			break;
 	}
 	spin_unlock(&xattrs->lock);
 
-	return used;
+	return err ? err : size - remaining_size;
 }
 
 /*
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index cc6b768fc..d1c66e465 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -84,6 +84,7 @@ kmem_zalloc(size_t size, xfs_km_flags_t flags)
 #define KM_ZONE_HWALIGN	SLAB_HWCACHE_ALIGN
 #define KM_ZONE_RECLAIM	SLAB_RECLAIM_ACCOUNT
 #define KM_ZONE_SPREAD	SLAB_MEM_SPREAD
+#define KM_ZONE_ACCOUNT	SLAB_ACCOUNT
 
 #define kmem_zone	kmem_cache
 #define kmem_zone_t	struct kmem_cache
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index 3479294c1..a708e38b4 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -535,6 +535,7 @@ xfs_agfl_write_verify(
 }
 
 const struct xfs_buf_ops xfs_agfl_buf_ops = {
+	.name = "xfs_agfl",
 	.verify_read = xfs_agfl_read_verify,
 	.verify_write = xfs_agfl_write_verify,
 };
@@ -1926,7 +1927,7 @@ xfs_alloc_space_available(
  * Decide whether to use this allocation group for this allocation.
  * If so, fix up the btree freelist's size.
  */
-STATIC int			/* error */
+int			/* error */
 xfs_alloc_fix_freelist(
 	struct xfs_alloc_arg	*args,	/* allocation argument structure */
 	int			flags)	/* XFS_ALLOC_FLAG_... */
@@ -2339,6 +2340,7 @@ xfs_agf_write_verify(
 }
 
 const struct xfs_buf_ops xfs_agf_buf_ops = {
+	.name = "xfs_agf",
 	.verify_read = xfs_agf_read_verify,
 	.verify_write = xfs_agf_write_verify,
 };
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h
index 0ecde4d5c..135eb3d24 100644
--- a/fs/xfs/libxfs/xfs_alloc.h
+++ b/fs/xfs/libxfs/xfs_alloc.h
@@ -235,5 +235,6 @@ xfs_alloc_get_rec(
 
 int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp,
 			xfs_agnumber_t agno, int flags, struct xfs_buf **bpp);
+int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags);
 
 #endif	/* __XFS_ALLOC_H__ */
diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c
index 90de071dd..444626ddb 100644
--- a/fs/xfs/libxfs/xfs_alloc_btree.c
+++ b/fs/xfs/libxfs/xfs_alloc_btree.c
@@ -293,14 +293,7 @@ xfs_allocbt_verify(
 	level = be16_to_cpu(block->bb_level);
 	switch (block->bb_magic) {
 	case cpu_to_be32(XFS_ABTB_CRC_MAGIC):
-		if (!xfs_sb_version_hascrc(&mp->m_sb))
-			return false;
-		if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
-			return false;
-		if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
-			return false;
-		if (pag &&
-		    be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+		if (!xfs_btree_sblock_v5hdr_verify(bp))
 			return false;
 		/* fall through */
 	case cpu_to_be32(XFS_ABTB_MAGIC):
@@ -311,14 +304,7 @@ xfs_allocbt_verify(
 			return false;
 		break;
 	case cpu_to_be32(XFS_ABTC_CRC_MAGIC):
-		if (!xfs_sb_version_hascrc(&mp->m_sb))
-			return false;
-		if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
-			return false;
-		if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
-			return false;
-		if (pag &&
-		    be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+		if (!xfs_btree_sblock_v5hdr_verify(bp))
 			return false;
 		/* fall through */
 	case cpu_to_be32(XFS_ABTC_MAGIC):
@@ -332,21 +318,7 @@ xfs_allocbt_verify(
 		return false;
 	}
 
-	/* numrecs verification */
-	if (be16_to_cpu(block->bb_numrecs) > mp->m_alloc_mxr[level != 0])
-		return false;
-
-	/* sibling pointer verification */
-	if (!block->bb_u.s.bb_leftsib ||
-	    (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks &&
-	     block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK)))
-		return false;
-	if (!block->bb_u.s.bb_rightsib ||
-	    (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks &&
-	     block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK)))
-		return false;
-
-	return true;
+	return xfs_btree_sblock_verify(bp, mp->m_alloc_mxr[level != 0]);
 }
 
 static void
@@ -379,6 +351,7 @@ xfs_allocbt_write_verify(
 }
 
 const struct xfs_buf_ops xfs_allocbt_buf_ops = {
+	.name = "xfs_allocbt",
 	.verify_read = xfs_allocbt_read_verify,
 	.verify_write = xfs_allocbt_write_verify,
 };
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index f949818fa..fa3b948ef 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -207,7 +207,7 @@ xfs_attr_set(
 	struct xfs_trans_res	tres;
 	xfs_fsblock_t		firstblock;
 	int			rsvd = (flags & ATTR_ROOT) != 0;
-	int			error, err2, committed, local;
+	int			error, err2, local;
 
 	XFS_STATS_INC(mp, xs_attr_set);
 
@@ -334,25 +334,15 @@ xfs_attr_set(
 		 */
 		xfs_bmap_init(args.flist, args.firstblock);
 		error = xfs_attr_shortform_to_leaf(&args);
-		if (!error) {
-			error = xfs_bmap_finish(&args.trans, args.flist,
-						&committed);
-		}
+		if (!error)
+			error = xfs_bmap_finish(&args.trans, args.flist, dp);
 		if (error) {
-			ASSERT(committed);
 			args.trans = NULL;
 			xfs_bmap_cancel(&flist);
 			goto out;
 		}
 
 		/*
-		 * bmap_finish() may have committed the last trans and started
-		 * a new one.  We need the inode to be in all transactions.
-		 */
-		if (committed)
-			xfs_trans_ijoin(args.trans, dp, 0);
-
-		/*
 		 * Commit the leaf transformation.  We'll need another (linked)
 		 * transaction to add the new attribute to the leaf.
 		 */
@@ -568,7 +558,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 {
 	xfs_inode_t *dp;
 	struct xfs_buf *bp;
-	int retval, error, committed, forkoff;
+	int retval, error, forkoff;
 
 	trace_xfs_attr_leaf_addname(args);
 
@@ -628,25 +618,15 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 		 */
 		xfs_bmap_init(args->flist, args->firstblock);
 		error = xfs_attr3_leaf_to_node(args);
-		if (!error) {
-			error = xfs_bmap_finish(&args->trans, args->flist,
-						&committed);
-		}
+		if (!error)
+			error = xfs_bmap_finish(&args->trans, args->flist, dp);
 		if (error) {
-			ASSERT(committed);
 			args->trans = NULL;
 			xfs_bmap_cancel(args->flist);
 			return error;
 		}
 
 		/*
-		 * bmap_finish() may have committed the last trans and started
-		 * a new one.  We need the inode to be in all transactions.
-		 */
-		if (committed)
-			xfs_trans_ijoin(args->trans, dp, 0);
-
-		/*
 		 * Commit the current trans (including the inode) and start
 		 * a new one.
 		 */
@@ -729,25 +709,14 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 			xfs_bmap_init(args->flist, args->firstblock);
 			error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
 			/* bp is gone due to xfs_da_shrink_inode */
-			if (!error) {
+			if (!error)
 				error = xfs_bmap_finish(&args->trans,
-							args->flist,
-							&committed);
-			}
+							args->flist, dp);
 			if (error) {
-				ASSERT(committed);
 				args->trans = NULL;
 				xfs_bmap_cancel(args->flist);
 				return error;
 			}
-
-			/*
-			 * bmap_finish() may have committed the last trans
-			 * and started a new one.  We need the inode to be
-			 * in all transactions.
-			 */
-			if (committed)
-				xfs_trans_ijoin(args->trans, dp, 0);
 		}
 
 		/*
@@ -775,7 +744,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
 {
 	xfs_inode_t *dp;
 	struct xfs_buf *bp;
-	int error, committed, forkoff;
+	int error, forkoff;
 
 	trace_xfs_attr_leaf_removename(args);
 
@@ -803,23 +772,13 @@ xfs_attr_leaf_removename(xfs_da_args_t *args)
 		xfs_bmap_init(args->flist, args->firstblock);
 		error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
 		/* bp is gone due to xfs_da_shrink_inode */
-		if (!error) {
-			error = xfs_bmap_finish(&args->trans, args->flist,
-						&committed);
-		}
+		if (!error)
+			error = xfs_bmap_finish(&args->trans, args->flist, dp);
 		if (error) {
-			ASSERT(committed);
 			args->trans = NULL;
 			xfs_bmap_cancel(args->flist);
 			return error;
 		}
-
-		/*
-		 * bmap_finish() may have committed the last trans and started
-		 * a new one.  We need the inode to be in all transactions.
-		 */
-		if (committed)
-			xfs_trans_ijoin(args->trans, dp, 0);
 	}
 	return 0;
 }
@@ -877,7 +836,7 @@ xfs_attr_node_addname(xfs_da_args_t *args)
 	xfs_da_state_blk_t *blk;
 	xfs_inode_t *dp;
 	xfs_mount_t *mp;
-	int committed, retval, error;
+	int retval, error;
 
 	trace_xfs_attr_node_addname(args);
 
@@ -938,27 +897,16 @@ restart:
 			state = NULL;
 			xfs_bmap_init(args->flist, args->firstblock);
 			error = xfs_attr3_leaf_to_node(args);
-			if (!error) {
+			if (!error)
 				error = xfs_bmap_finish(&args->trans,
-							args->flist,
-							&committed);
-			}
+							args->flist, dp);
 			if (error) {
-				ASSERT(committed);
 				args->trans = NULL;
 				xfs_bmap_cancel(args->flist);
 				goto out;
 			}
 
 			/*
-			 * bmap_finish() may have committed the last trans
-			 * and started a new one.  We need the inode to be
-			 * in all transactions.
-			 */
-			if (committed)
-				xfs_trans_ijoin(args->trans, dp, 0);
-
-			/*
 			 * Commit the node conversion and start the next
 			 * trans in the chain.
 			 */
@@ -977,23 +925,13 @@ restart:
 		 */
 		xfs_bmap_init(args->flist, args->firstblock);
 		error = xfs_da3_split(state);
-		if (!error) {
-			error = xfs_bmap_finish(&args->trans, args->flist,
-						&committed);
-		}
+		if (!error)
+			error = xfs_bmap_finish(&args->trans, args->flist, dp);
 		if (error) {
-			ASSERT(committed);
 			args->trans = NULL;
 			xfs_bmap_cancel(args->flist);
 			goto out;
 		}
-
-		/*
-		 * bmap_finish() may have committed the last trans and started
-		 * a new one.  We need the inode to be in all transactions.
-		 */
-		if (committed)
-			xfs_trans_ijoin(args->trans, dp, 0);
 	} else {
 		/*
 		 * Addition succeeded, update Btree hashvals.
@@ -1086,25 +1024,14 @@ restart:
 		if (retval && (state->path.active > 1)) {
 			xfs_bmap_init(args->flist, args->firstblock);
 			error = xfs_da3_join(state);
-			if (!error) {
+			if (!error)
 				error = xfs_bmap_finish(&args->trans,
-							args->flist,
-							&committed);
-			}
+							args->flist, dp);
 			if (error) {
-				ASSERT(committed);
 				args->trans = NULL;
 				xfs_bmap_cancel(args->flist);
 				goto out;
 			}
-
-			/*
-			 * bmap_finish() may have committed the last trans
-			 * and started a new one.  We need the inode to be
-			 * in all transactions.
-			 */
-			if (committed)
-				xfs_trans_ijoin(args->trans, dp, 0);
 		}
 
 		/*
@@ -1146,7 +1073,7 @@ xfs_attr_node_removename(xfs_da_args_t *args)
 	xfs_da_state_blk_t *blk;
 	xfs_inode_t *dp;
 	struct xfs_buf *bp;
-	int retval, error, committed, forkoff;
+	int retval, error, forkoff;
 
 	trace_xfs_attr_node_removename(args);
 
@@ -1220,24 +1147,13 @@ xfs_attr_node_removename(xfs_da_args_t *args)
 	if (retval && (state->path.active > 1)) {
 		xfs_bmap_init(args->flist, args->firstblock);
 		error = xfs_da3_join(state);
-		if (!error) {
-			error = xfs_bmap_finish(&args->trans, args->flist,
-						&committed);
-		}
+		if (!error)
+			error = xfs_bmap_finish(&args->trans, args->flist, dp);
 		if (error) {
-			ASSERT(committed);
 			args->trans = NULL;
 			xfs_bmap_cancel(args->flist);
 			goto out;
 		}
-
-		/*
-		 * bmap_finish() may have committed the last trans and started
-		 * a new one.  We need the inode to be in all transactions.
-		 */
-		if (committed)
-			xfs_trans_ijoin(args->trans, dp, 0);
-
 		/*
 		 * Commit the Btree join operation and start a new trans.
 		 */
@@ -1265,25 +1181,14 @@ xfs_attr_node_removename(xfs_da_args_t *args)
 			xfs_bmap_init(args->flist, args->firstblock);
 			error = xfs_attr3_leaf_to_shortform(bp, args, forkoff);
 			/* bp is gone due to xfs_da_shrink_inode */
-			if (!error) {
+			if (!error)
 				error = xfs_bmap_finish(&args->trans,
-							args->flist,
-							&committed);
-			}
+							args->flist, dp);
 			if (error) {
-				ASSERT(committed);
 				args->trans = NULL;
 				xfs_bmap_cancel(args->flist);
 				goto out;
 			}
-
-			/*
-			 * bmap_finish() may have committed the last trans
-			 * and started a new one.  We need the inode to be
-			 * in all transactions.
-			 */
-			if (committed)
-				xfs_trans_ijoin(args->trans, dp, 0);
 		} else
 			xfs_trans_brelse(args->trans, bp);
 	}
diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c
index aa187f7ba..01a5ecfed 100644
--- a/fs/xfs/libxfs/xfs_attr_leaf.c
+++ b/fs/xfs/libxfs/xfs_attr_leaf.c
@@ -328,6 +328,7 @@ xfs_attr3_leaf_read_verify(
 }
 
 const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = {
+	.name = "xfs_attr3_leaf",
 	.verify_read = xfs_attr3_leaf_read_verify,
 	.verify_write = xfs_attr3_leaf_write_verify,
 };
diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c
index 5ab95ffa4..a572532a5 100644
--- a/fs/xfs/libxfs/xfs_attr_remote.c
+++ b/fs/xfs/libxfs/xfs_attr_remote.c
@@ -201,6 +201,7 @@ xfs_attr3_rmt_write_verify(
 }
 
 const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = {
+	.name = "xfs_attr3_rmt",
 	.verify_read = xfs_attr3_rmt_read_verify,
 	.verify_write = xfs_attr3_rmt_write_verify,
 };
@@ -447,8 +448,6 @@ xfs_attr_rmtval_set(
 	 * Roll through the "value", allocating blocks on disk as required.
 	 */
 	while (blkcnt > 0) {
-		int	committed;
-
 		/*
 		 * Allocate a single extent, up to the size of the value.
 		 *
@@ -466,24 +465,14 @@ xfs_attr_rmtval_set(
 		error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno,
 				  blkcnt, XFS_BMAPI_ATTRFORK, args->firstblock,
 				  args->total, &map, &nmap, args->flist);
-		if (!error) {
-			error = xfs_bmap_finish(&args->trans, args->flist,
-						&committed);
-		}
+		if (!error)
+			error = xfs_bmap_finish(&args->trans, args->flist, dp);
 		if (error) {
-			ASSERT(committed);
 			args->trans = NULL;
 			xfs_bmap_cancel(args->flist);
 			return error;
 		}
 
-		/*
-		 * bmap_finish() may have committed the last trans and started
-		 * a new one.  We need the inode to be in all transactions.
-		 */
-		if (committed)
-			xfs_trans_ijoin(args->trans, dp, 0);
-
 		ASSERT(nmap == 1);
 		ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
 		       (map.br_startblock != HOLESTARTBLOCK));
@@ -614,31 +603,20 @@ xfs_attr_rmtval_remove(
 	blkcnt = args->rmtblkcnt;
 	done = 0;
 	while (!done) {
-		int committed;
-
 		xfs_bmap_init(args->flist, args->firstblock);
 		error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt,
 				    XFS_BMAPI_ATTRFORK, 1, args->firstblock,
 				    args->flist, &done);
-		if (!error) {
+		if (!error)
 			error = xfs_bmap_finish(&args->trans, args->flist,
-						&committed);
-		}
+						args->dp);
 		if (error) {
-			ASSERT(committed);
 			args->trans = NULL;
 			xfs_bmap_cancel(args->flist);
 			return error;
 		}
 
 		/*
-		 * bmap_finish() may have committed the last trans and started
-		 * a new one.  We need the inode to be in all transactions.
-		 */
-		if (committed)
-			xfs_trans_ijoin(args->trans, args->dp, 0);
-
-		/*
 		 * Close out trans and start the next one in the chain.
 		 */
 		error = xfs_trans_roll(&args->trans, args->dp);
diff --git a/fs/xfs/libxfs/xfs_bit.c b/fs/xfs/libxfs/xfs_bit.c
index 0e8885a59..0a94cce5e 100644
--- a/fs/xfs/libxfs/xfs_bit.c
+++ b/fs/xfs/libxfs/xfs_bit.c
@@ -32,13 +32,13 @@ int
 xfs_bitmap_empty(uint *map, uint size)
 {
 	uint i;
-	uint ret = 0;
 
 	for (i = 0; i < size; i++) {
-		ret |= map[i];
+		if (map[i] != 0)
+			return 0;
 	}
 
-	return (ret == 0);
+	return 1;
 }
 
 /*
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 119c2422a..ef00156f4 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -325,9 +325,11 @@ xfs_check_block(
 
 /*
  * Check that the extents for the inode ip are in the right order in all
- * btree leaves.
+ * btree leaves. This becomes prohibitively expensive for large extent count
+ * files, so don't bother with inodes that have more than 10,000 extents in
+ * them. The btree record ordering checks will still be done, so for such large
+ * bmapbt constructs that is going to catch most corruptions.
  */
-
 STATIC void
 xfs_bmap_check_leaf_extents(
 	xfs_btree_cur_t		*cur,	/* btree cursor or null */
@@ -352,6 +354,10 @@ xfs_bmap_check_leaf_extents(
 		return;
 	}
 
+	/* skip large extent count inodes */
+	if (ip->i_d.di_nextents > 10000)
+		return;
+
 	bno = NULLFSBLOCK;
 	mp = ip->i_mount;
 	ifp = XFS_IFORK_PTR(ip, whichfork);
@@ -1111,7 +1117,6 @@ xfs_bmap_add_attrfork(
 	xfs_trans_t		*tp;		/* transaction pointer */
 	int			blks;		/* space reservation */
 	int			version = 1;	/* superblock attr version */
-	int			committed;	/* xaction was committed */
 	int			logflags;	/* logging flags */
 	int			error;		/* error return value */
 
@@ -1214,7 +1219,7 @@ xfs_bmap_add_attrfork(
 			xfs_log_sb(tp);
 	}
 
-	error = xfs_bmap_finish(&tp, &flist, &committed);
+	error = xfs_bmap_finish(&tp, &flist, NULL);
 	if (error)
 		goto bmap_cancel;
 	error = xfs_trans_commit(tp);
@@ -1723,10 +1728,11 @@ xfs_bmap_add_extent_delay_real(
 	xfs_filblks_t		temp=0;	/* value for da_new calculations */
 	xfs_filblks_t		temp2=0;/* value for da_new calculations */
 	int			tmp_rval;	/* partial logging flags */
+	int			whichfork = XFS_DATA_FORK;
 	struct xfs_mount	*mp;
 
-	mp  = bma->tp ? bma->tp->t_mountp : NULL;
-	ifp = XFS_IFORK_PTR(bma->ip, XFS_DATA_FORK);
+	mp = bma->ip->i_mount;
+	ifp = XFS_IFORK_PTR(bma->ip, whichfork);
 
 	ASSERT(bma->idx >= 0);
 	ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
@@ -1785,7 +1791,7 @@ xfs_bmap_add_extent_delay_real(
 	 * Don't set contiguous if the combined extent would be too large.
 	 * Also check for all-three-contiguous being too large.
 	 */
-	if (bma->idx < bma->ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
+	if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) {
 		state |= BMAP_RIGHT_VALID;
 		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx + 1), &RIGHT);
 
@@ -2016,10 +2022,10 @@ xfs_bmap_add_extent_delay_real(
 			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
 		}
 
-		if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
+		if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
 			error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
 					bma->firstblock, bma->flist,
-					&bma->cur, 1, &tmp_rval, XFS_DATA_FORK);
+					&bma->cur, 1, &tmp_rval, whichfork);
 			rval |= tmp_rval;
 			if (error)
 				goto done;
@@ -2100,10 +2106,10 @@ xfs_bmap_add_extent_delay_real(
 			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
 		}
 
-		if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
+		if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
 			error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
 				bma->firstblock, bma->flist, &bma->cur, 1,
-				&tmp_rval, XFS_DATA_FORK);
+				&tmp_rval, whichfork);
 			rval |= tmp_rval;
 			if (error)
 				goto done;
@@ -2169,10 +2175,10 @@ xfs_bmap_add_extent_delay_real(
 			XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
 		}
 
-		if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
+		if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
 			error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
 					bma->firstblock, bma->flist, &bma->cur,
-					1, &tmp_rval, XFS_DATA_FORK);
+					1, &tmp_rval, whichfork);
 			rval |= tmp_rval;
 			if (error)
 				goto done;
@@ -2215,13 +2221,13 @@ xfs_bmap_add_extent_delay_real(
 	}
 
 	/* convert to a btree if necessary */
-	if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) {
+	if (xfs_bmap_needs_btree(bma->ip, whichfork)) {
 		int	tmp_logflags;	/* partial log flag return val */
 
 		ASSERT(bma->cur == NULL);
 		error = xfs_bmap_extents_to_btree(bma->tp, bma->ip,
 				bma->firstblock, bma->flist, &bma->cur,
-				da_old > 0, &tmp_logflags, XFS_DATA_FORK);
+				da_old > 0, &tmp_logflags, whichfork);
 		bma->logflags |= tmp_logflags;
 		if (error)
 			goto done;
@@ -2242,7 +2248,7 @@ xfs_bmap_add_extent_delay_real(
 	if (bma->cur)
 		bma->cur->bc_private.b.allocated = 0;
 
-	xfs_bmap_check_leaf_extents(bma->cur, bma->ip, XFS_DATA_FORK);
+	xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork);
 done:
 	bma->logflags |= rval;
 	return error;
@@ -2939,7 +2945,7 @@ xfs_bmap_add_extent_hole_real(
 	int			state;	/* state bits, accessed thru macros */
 	struct xfs_mount	*mp;
 
-	mp = bma->tp ? bma->tp->t_mountp : NULL;
+	mp = bma->ip->i_mount;
 	ifp = XFS_IFORK_PTR(bma->ip, whichfork);
 
 	ASSERT(bma->idx >= 0);
@@ -5950,7 +5956,6 @@ xfs_bmap_split_extent(
 	struct xfs_trans        *tp;
 	struct xfs_bmap_free    free_list;
 	xfs_fsblock_t           firstfsb;
-	int                     committed;
 	int                     error;
 
 	tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT);
@@ -5971,7 +5976,7 @@ xfs_bmap_split_extent(
 	if (error)
 		goto out;
 
-	error = xfs_bmap_finish(&tp, &free_list, &committed);
+	error = xfs_bmap_finish(&tp, &free_list, NULL);
 	if (error)
 		goto out;
 
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index a160f8a5a..423a34e83 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -195,7 +195,7 @@ void	xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len,
 		struct xfs_bmap_free *flist, struct xfs_mount *mp);
 void	xfs_bmap_cancel(struct xfs_bmap_free *flist);
 int	xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist,
-			int *committed);
+			struct xfs_inode *ip);
 void	xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork);
 int	xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip,
 		xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork);
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index 6b0cf6546..1637c37bf 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -720,6 +720,7 @@ xfs_bmbt_write_verify(
 }
 
 const struct xfs_buf_ops xfs_bmbt_buf_ops = {
+	.name = "xfs_bmbt",
 	.verify_read = xfs_bmbt_read_verify,
 	.verify_write = xfs_bmbt_write_verify,
 };
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index af1bbee55..a0eb18ce3 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -4080,3 +4080,61 @@ xfs_btree_change_owner(
 
 	return 0;
 }
+
+/**
+ * xfs_btree_sblock_v5hdr_verify() -- verify the v5 fields of a short-format
+ *				      btree block
+ *
+ * @bp: buffer containing the btree block
+ * Return: true if the superblock has the CRC feature and the block's uuid,
+ *         blkno and (when @bp has a perag attached) owner AG number all match.
+ */
+bool
+xfs_btree_sblock_v5hdr_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+	struct xfs_perag	*pag = bp->b_pag;
+
+	if (!xfs_sb_version_hascrc(&mp->m_sb))
+		return false;
+	if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
+		return false;
+	if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
+		return false;
+	if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+		return false;
+	return true;
+}
+
+/**
+ * xfs_btree_sblock_verify() -- verify a short-format btree block
+ *
+ * @bp: buffer containing the btree block
+ * @max_recs: maximum records allowed in this btree node
+ */
+bool
+xfs_btree_sblock_verify(
+	struct xfs_buf		*bp,
+	unsigned int		max_recs)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+
+	/* numrecs verification */
+	if (be16_to_cpu(block->bb_numrecs) > max_recs)
+		return false;
+
+	/* sibling pointer verification */
+	if (!block->bb_u.s.bb_leftsib ||
+	    (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks &&
+	     block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK)))
+		return false;
+	if (!block->bb_u.s.bb_rightsib ||
+	    (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks &&
+	     block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK)))
+		return false;
+
+	return true;
+}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 992dec063..2e874be70 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -472,4 +472,7 @@ static inline int xfs_btree_get_level(struct xfs_btree_block *block)
 #define XFS_BTREE_TRACE_ARGR(c, r)
 #define	XFS_BTREE_TRACE_CURSOR(c, t)
 
+bool xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp);
+bool xfs_btree_sblock_verify(struct xfs_buf *bp, unsigned int max_recs);
+
 #endif	/* __XFS_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c
index e89a0f8f8..097bf7717 100644
--- a/fs/xfs/libxfs/xfs_da_btree.c
+++ b/fs/xfs/libxfs/xfs_da_btree.c
@@ -245,6 +245,7 @@ xfs_da3_node_read_verify(
 }
 
 const struct xfs_buf_ops xfs_da3_node_buf_ops = {
+	.name = "xfs_da3_node",
 	.verify_read = xfs_da3_node_read_verify,
 	.verify_write = xfs_da3_node_write_verify,
 };
diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c
index 9c10e2b8c..aa17cb788 100644
--- a/fs/xfs/libxfs/xfs_dir2_block.c
+++ b/fs/xfs/libxfs/xfs_dir2_block.c
@@ -123,6 +123,7 @@ xfs_dir3_block_write_verify(
 }
 
 const struct xfs_buf_ops xfs_dir3_block_buf_ops = {
+	.name = "xfs_dir3_block",
 	.verify_read = xfs_dir3_block_read_verify,
 	.verify_write = xfs_dir3_block_write_verify,
 };
diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c
index af71a84f3..725fc7841 100644
--- a/fs/xfs/libxfs/xfs_dir2_data.c
+++ b/fs/xfs/libxfs/xfs_dir2_data.c
@@ -305,11 +305,13 @@ xfs_dir3_data_write_verify(
 }
 
 const struct xfs_buf_ops xfs_dir3_data_buf_ops = {
+	.name = "xfs_dir3_data",
 	.verify_read = xfs_dir3_data_read_verify,
 	.verify_write = xfs_dir3_data_write_verify,
 };
 
 static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = {
+	.name = "xfs_dir3_data_reada",
 	.verify_read = xfs_dir3_data_reada_verify,
 	.verify_write = xfs_dir3_data_write_verify,
 };
diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c
index 3923e1f94..b887fb2a2 100644
--- a/fs/xfs/libxfs/xfs_dir2_leaf.c
+++ b/fs/xfs/libxfs/xfs_dir2_leaf.c
@@ -245,11 +245,13 @@ xfs_dir3_leafn_write_verify(
 }
 
 const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = {
+	.name = "xfs_dir3_leaf1",
 	.verify_read = xfs_dir3_leaf1_read_verify,
 	.verify_write = xfs_dir3_leaf1_write_verify,
 };
 
 const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = {
+	.name = "xfs_dir3_leafn",
 	.verify_read = xfs_dir3_leafn_read_verify,
 	.verify_write = xfs_dir3_leafn_write_verify,
 };
diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c
index 70b0cb2fd..63ee03db7 100644
--- a/fs/xfs/libxfs/xfs_dir2_node.c
+++ b/fs/xfs/libxfs/xfs_dir2_node.c
@@ -150,6 +150,7 @@ xfs_dir3_free_write_verify(
 }
 
 const struct xfs_buf_ops xfs_dir3_free_buf_ops = {
+	.name = "xfs_dir3_free",
 	.verify_read = xfs_dir3_free_read_verify,
 	.verify_write = xfs_dir3_free_write_verify,
 };
diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c
index 5331b7f04..3cc3cf767 100644
--- a/fs/xfs/libxfs/xfs_dquot_buf.c
+++ b/fs/xfs/libxfs/xfs_dquot_buf.c
@@ -54,7 +54,7 @@ xfs_dqcheck(
 	xfs_dqid_t	 id,
 	uint		 type,	  /* used only when IO_dorepair is true */
 	uint		 flags,
-	char		 *str)
+	const char	 *str)
 {
 	xfs_dqblk_t	 *d = (xfs_dqblk_t *)ddq;
 	int		errs = 0;
@@ -207,7 +207,8 @@ xfs_dquot_buf_verify_crc(
 STATIC bool
 xfs_dquot_buf_verify(
 	struct xfs_mount	*mp,
-	struct xfs_buf		*bp)
+	struct xfs_buf		*bp,
+	int			warn)
 {
 	struct xfs_dqblk	*d = (struct xfs_dqblk *)bp->b_addr;
 	xfs_dqid_t		id = 0;
@@ -240,8 +241,7 @@ xfs_dquot_buf_verify(
 		if (i == 0)
 			id = be32_to_cpu(ddq->d_id);
 
-		error = xfs_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN,
-				       "xfs_dquot_buf_verify");
+		error = xfs_dqcheck(mp, ddq, id + i, 0, warn, __func__);
 		if (error)
 			return false;
 	}
@@ -256,7 +256,7 @@ xfs_dquot_buf_read_verify(
 
 	if (!xfs_dquot_buf_verify_crc(mp, bp))
 		xfs_buf_ioerror(bp, -EFSBADCRC);
-	else if (!xfs_dquot_buf_verify(mp, bp))
+	else if (!xfs_dquot_buf_verify(mp, bp, XFS_QMOPT_DOWARN))
 		xfs_buf_ioerror(bp, -EFSCORRUPTED);
 
 	if (bp->b_error)
@@ -264,6 +264,25 @@ xfs_dquot_buf_read_verify(
 }
 
 /*
+ * readahead errors are silent and simply leave the buffer as !done so a real
+ * read will then be run with the xfs_dquot_buf_ops verifier. See
+ * xfs_inode_buf_verify() for why we use EIO and ~XBF_DONE here rather than
+ * reporting the failure.
+ */
+static void
+xfs_dquot_buf_readahead_verify(
+	struct xfs_buf	*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+
+	if (!xfs_dquot_buf_verify_crc(mp, bp) ||
+	    !xfs_dquot_buf_verify(mp, bp, 0)) {
+		xfs_buf_ioerror(bp, -EIO);
+		bp->b_flags &= ~XBF_DONE;
+	}
+}
+
+/*
  * we don't calculate the CRC here as that is done when the dquot is flushed to
  * the buffer after the update is done. This ensures that the dquot in the
  * buffer always has an up-to-date CRC value.
@@ -274,7 +293,7 @@ xfs_dquot_buf_write_verify(
 {
 	struct xfs_mount	*mp = bp->b_target->bt_mount;
 
-	if (!xfs_dquot_buf_verify(mp, bp)) {
+	if (!xfs_dquot_buf_verify(mp, bp, XFS_QMOPT_DOWARN)) {
 		xfs_buf_ioerror(bp, -EFSCORRUPTED);
 		xfs_verifier_error(bp);
 		return;
@@ -282,7 +301,13 @@ xfs_dquot_buf_write_verify(
 }
 
 const struct xfs_buf_ops xfs_dquot_buf_ops = {
+	.name = "xfs_dquot",
 	.verify_read = xfs_dquot_buf_read_verify,
 	.verify_write = xfs_dquot_buf_write_verify,
 };
 
+const struct xfs_buf_ops xfs_dquot_buf_ra_ops = {
+	.name = "xfs_dquot_ra",
+	.verify_read = xfs_dquot_buf_readahead_verify,
+	.verify_write = xfs_dquot_buf_write_verify,
+};
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index e2536bb1c..dc97eb21a 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -984,8 +984,6 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
 
 /*
  * Values for di_flags
- * There should be a one-to-one correspondence between these flags and the
- * XFS_XFLAG_s.
  */
 #define XFS_DIFLAG_REALTIME_BIT  0	/* file's blocks come from rt area */
 #define XFS_DIFLAG_PREALLOC_BIT  1	/* file space has been preallocated */
@@ -1026,6 +1024,15 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
 	 XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM)
 
 /*
+ * Values for di_flags2. These start by being exposed to userspace in the upper
+ * 16 bits of the XFS_XFLAG_s range.
+ */
+#define XFS_DIFLAG2_DAX_BIT	0	/* use DAX for this inode */
+#define XFS_DIFLAG2_DAX		(1 << XFS_DIFLAG2_DAX_BIT)
+
+#define XFS_DIFLAG2_ANY		(XFS_DIFLAG2_DAX)
+
+/*
  * Inode number format:
  * low inopblog bits - offset in block
  * next agblklog bits - block number in ag
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index b2b73a998..fffe3d01b 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -36,40 +36,6 @@ struct dioattr {
 #endif
 
 /*
- * Structure for XFS_IOC_FSGETXATTR[A] and XFS_IOC_FSSETXATTR.
- */
-#ifndef HAVE_FSXATTR
-struct fsxattr {
-	__u32		fsx_xflags;	/* xflags field value (get/set) */
-	__u32		fsx_extsize;	/* extsize field value (get/set)*/
-	__u32		fsx_nextents;	/* nextents field value (get)	*/
-	__u32		fsx_projid;	/* project identifier (get/set) */
-	unsigned char	fsx_pad[12];
-};
-#endif
-
-/*
- * Flags for the bs_xflags/fsx_xflags field
- * There should be a one-to-one correspondence between these flags and the
- * XFS_DIFLAG_s.
- */
-#define XFS_XFLAG_REALTIME	0x00000001	/* data in realtime volume */
-#define XFS_XFLAG_PREALLOC	0x00000002	/* preallocated file extents */
-#define XFS_XFLAG_IMMUTABLE	0x00000008	/* file cannot be modified */
-#define XFS_XFLAG_APPEND	0x00000010	/* all writes append */
-#define XFS_XFLAG_SYNC		0x00000020	/* all writes synchronous */
-#define XFS_XFLAG_NOATIME	0x00000040	/* do not update access time */
-#define XFS_XFLAG_NODUMP	0x00000080	/* do not include in backups */
-#define XFS_XFLAG_RTINHERIT	0x00000100	/* create with rt bit set */
-#define XFS_XFLAG_PROJINHERIT	0x00000200	/* create with parents projid */
-#define XFS_XFLAG_NOSYMLINKS	0x00000400	/* disallow symlink creation */
-#define XFS_XFLAG_EXTSIZE	0x00000800	/* extent size allocator hint */
-#define XFS_XFLAG_EXTSZINHERIT	0x00001000	/* inherit inode extent size */
-#define XFS_XFLAG_NODEFRAG	0x00002000  	/* do not defragment */
-#define XFS_XFLAG_FILESTREAM	0x00004000	/* use filestream allocator */
-#define XFS_XFLAG_HASATTR	0x80000000	/* no DIFLAG for this	*/
-
-/*
  * Structure for XFS_IOC_GETBMAP.
  * On input, fill in bmv_offset and bmv_length of the first structure
  * to indicate the area of interest in the file, and bmv_entries with
@@ -514,8 +480,8 @@ typedef struct xfs_swapext
 #define XFS_IOC_ALLOCSP		_IOW ('X', 10, struct xfs_flock64)
 #define XFS_IOC_FREESP		_IOW ('X', 11, struct xfs_flock64)
 #define XFS_IOC_DIOINFO		_IOR ('X', 30, struct dioattr)
-#define XFS_IOC_FSGETXATTR	_IOR ('X', 31, struct fsxattr)
-#define XFS_IOC_FSSETXATTR	_IOW ('X', 32, struct fsxattr)
+#define XFS_IOC_FSGETXATTR	FS_IOC_FSGETXATTR
+#define XFS_IOC_FSSETXATTR	FS_IOC_FSSETXATTR
 #define XFS_IOC_ALLOCSP64	_IOW ('X', 36, struct xfs_flock64)
 #define XFS_IOC_FREESP64	_IOW ('X', 37, struct xfs_flock64)
 #define XFS_IOC_GETBMAP		_IOWR('X', 38, struct getbmap)
diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c
index 70c1db99f..66d702e6b 100644
--- a/fs/xfs/libxfs/xfs_ialloc.c
+++ b/fs/xfs/libxfs/xfs_ialloc.c
@@ -2572,6 +2572,7 @@ xfs_agi_write_verify(
 }
 
 const struct xfs_buf_ops xfs_agi_buf_ops = {
+	.name = "xfs_agi",
 	.verify_read = xfs_agi_read_verify,
 	.verify_write = xfs_agi_write_verify,
 };
diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c
index f39b285be..c679f3c05 100644
--- a/fs/xfs/libxfs/xfs_ialloc_btree.c
+++ b/fs/xfs/libxfs/xfs_ialloc_btree.c
@@ -221,7 +221,6 @@ xfs_inobt_verify(
 {
 	struct xfs_mount	*mp = bp->b_target->bt_mount;
 	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
-	struct xfs_perag	*pag = bp->b_pag;
 	unsigned int		level;
 
 	/*
@@ -237,14 +236,7 @@ xfs_inobt_verify(
 	switch (block->bb_magic) {
 	case cpu_to_be32(XFS_IBT_CRC_MAGIC):
 	case cpu_to_be32(XFS_FIBT_CRC_MAGIC):
-		if (!xfs_sb_version_hascrc(&mp->m_sb))
-			return false;
-		if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
-			return false;
-		if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn))
-			return false;
-		if (pag &&
-		    be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
+		if (!xfs_btree_sblock_v5hdr_verify(bp))
 			return false;
 		/* fall through */
 	case cpu_to_be32(XFS_IBT_MAGIC):
@@ -254,24 +246,12 @@ xfs_inobt_verify(
 		return 0;
 	}
 
-	/* numrecs and level verification */
+	/* level verification */
 	level = be16_to_cpu(block->bb_level);
 	if (level >= mp->m_in_maxlevels)
 		return false;
-	if (be16_to_cpu(block->bb_numrecs) > mp->m_inobt_mxr[level != 0])
-		return false;
-
-	/* sibling pointer verification */
-	if (!block->bb_u.s.bb_leftsib ||
-	    (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks &&
-	     block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK)))
-		return false;
-	if (!block->bb_u.s.bb_rightsib ||
-	    (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks &&
-	     block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK)))
-		return false;
 
-	return true;
+	return xfs_btree_sblock_verify(bp, mp->m_inobt_mxr[level != 0]);
 }
 
 static void
@@ -304,6 +284,7 @@ xfs_inobt_write_verify(
 }
 
 const struct xfs_buf_ops xfs_inobt_buf_ops = {
+	.name = "xfs_inobt",
 	.verify_read = xfs_inobt_read_verify,
 	.verify_write = xfs_inobt_write_verify,
 };
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 65485cfc4..1aabfda66 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -68,6 +68,8 @@ xfs_inobp_check(
  * recovery and we don't get unnecssary panics on debug kernels. We use EIO here
  * because all we want to do is say readahead failed; there is no-one to report
  * the error to, so this will distinguish it from a non-ra verifier failure.
+ * Changes to this readahead error behaviour also need to be reflected in
+ * xfs_dquot_buf_readahead_verify().
  */
 static void
 xfs_inode_buf_verify(
@@ -134,11 +136,13 @@ xfs_inode_buf_write_verify(
 }
 
 const struct xfs_buf_ops xfs_inode_buf_ops = {
+	.name = "xfs_inode",
 	.verify_read = xfs_inode_buf_read_verify,
 	.verify_write = xfs_inode_buf_write_verify,
 };
 
 const struct xfs_buf_ops xfs_inode_buf_ra_ops = {
+	.name = "xfs_inode_ra",
 	.verify_read = xfs_inode_buf_readahead_verify,
 	.verify_write = xfs_inode_buf_write_verify,
 };
diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h
index 1c55ccbb3..8e385f91d 100644
--- a/fs/xfs/libxfs/xfs_log_recover.h
+++ b/fs/xfs/libxfs/xfs_log_recover.h
@@ -60,6 +60,7 @@ typedef struct xlog_recover {
  */
 #define	XLOG_BC_TABLE_SIZE	64
 
+#define	XLOG_RECOVER_CRCPASS	0
 #define	XLOG_RECOVER_PASS1	1
 #define	XLOG_RECOVER_PASS2	2
 
diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h
index 1b0a08379..f51078f1e 100644
--- a/fs/xfs/libxfs/xfs_quota_defs.h
+++ b/fs/xfs/libxfs/xfs_quota_defs.h
@@ -153,7 +153,7 @@ typedef __uint16_t	xfs_qwarncnt_t;
 #define XFS_QMOPT_RESBLK_MASK	(XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS)
 
 extern int xfs_dqcheck(struct xfs_mount *mp, xfs_disk_dquot_t *ddq,
-		       xfs_dqid_t id, uint type, uint flags, char *str);
+		       xfs_dqid_t id, uint type, uint flags, const char *str);
 extern int xfs_calc_dquots_per_chunk(unsigned int nbblks);
 
 #endif	/* __XFS_QUOTA_H__ */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index a0b071d88..8a53eaa34 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -679,11 +679,13 @@ xfs_sb_write_verify(
 }
 
 const struct xfs_buf_ops xfs_sb_buf_ops = {
+	.name = "xfs_sb",
 	.verify_read = xfs_sb_read_verify,
 	.verify_write = xfs_sb_write_verify,
 };
 
 const struct xfs_buf_ops xfs_sb_quiet_buf_ops = {
+	.name = "xfs_sb_quiet",
 	.verify_read = xfs_sb_quiet_read_verify,
 	.verify_write = xfs_sb_write_verify,
 };
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 5be529707..15c3ceb84 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -49,6 +49,7 @@ extern const struct xfs_buf_ops xfs_inobt_buf_ops;
 extern const struct xfs_buf_ops xfs_inode_buf_ops;
 extern const struct xfs_buf_ops xfs_inode_buf_ra_ops;
 extern const struct xfs_buf_ops xfs_dquot_buf_ops;
+extern const struct xfs_buf_ops xfs_dquot_buf_ra_ops;
 extern const struct xfs_buf_ops xfs_sb_buf_ops;
 extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops;
 extern const struct xfs_buf_ops xfs_symlink_buf_ops;
diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c
index cb6fd20a4..2e2c6716b 100644
--- a/fs/xfs/libxfs/xfs_symlink_remote.c
+++ b/fs/xfs/libxfs/xfs_symlink_remote.c
@@ -168,6 +168,7 @@ xfs_symlink_write_verify(
 }
 
 const struct xfs_buf_ops xfs_symlink_buf_ops = {
+	.name = "xfs_symlink",
 	.verify_read = xfs_symlink_read_verify,
 	.verify_write = xfs_symlink_write_verify,
 };
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 6bb470fbb..2d5df1f23 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -252,29 +252,6 @@ xfs_set_mode(struct inode *inode, umode_t mode)
 	return error;
 }
 
-static int
-xfs_acl_exists(struct inode *inode, unsigned char *name)
-{
-	int len = XFS_ACL_MAX_SIZE(XFS_M(inode->i_sb));
-
-	return (xfs_attr_get(XFS_I(inode), name, NULL, &len,
-			    ATTR_ROOT|ATTR_KERNOVAL) == 0);
-}
-
-int
-posix_acl_access_exists(struct inode *inode)
-{
-	return xfs_acl_exists(inode, SGI_ACL_FILE);
-}
-
-int
-posix_acl_default_exists(struct inode *inode)
-{
-	if (!S_ISDIR(inode->i_mode))
-		return 0;
-	return xfs_acl_exists(inode, SGI_ACL_DEFAULT);
-}
-
 int
 xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 {
diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h
index 52f8255d6..286fa8921 100644
--- a/fs/xfs/xfs_acl.h
+++ b/fs/xfs/xfs_acl.h
@@ -24,16 +24,12 @@ struct posix_acl;
 #ifdef CONFIG_XFS_POSIX_ACL
 extern struct posix_acl *xfs_get_acl(struct inode *inode, int type);
 extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type);
-extern int posix_acl_access_exists(struct inode *inode);
-extern int posix_acl_default_exists(struct inode *inode);
 #else
 static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type)
 {
 	return NULL;
 }
 # define xfs_set_acl					NULL
-# define posix_acl_access_exists(inode)			0
-# define posix_acl_default_exists(inode)		0
 #endif /* CONFIG_XFS_POSIX_ACL */
 
 extern void xfs_forget_acl(struct inode *inode, const char *name, int xflags);
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 29e7e5dd5..a9ebabfe7 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -55,7 +55,7 @@ xfs_count_page_state(
 	} while ((bh = bh->b_this_page) != head);
 }
 
-STATIC struct block_device *
+struct block_device *
 xfs_find_bdev_for_inode(
 	struct inode		*inode)
 {
@@ -1208,6 +1208,10 @@ xfs_vm_writepages(
 	struct writeback_control *wbc)
 {
 	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
+	if (dax_mapping(mapping))
+		return dax_writeback_mapping_range(mapping,
+				xfs_find_bdev_for_inode(mapping->host), wbc);
+
 	return generic_writepages(mapping, wbc);
 }
 
@@ -1917,6 +1921,7 @@ xfs_vm_readpage(
 	struct file		*unused,
 	struct page		*page)
 {
+	trace_xfs_vm_readpage(page->mapping->host, 1);
 	return mpage_readpage(page, xfs_get_blocks);
 }
 
@@ -1927,6 +1932,7 @@ xfs_vm_readpages(
 	struct list_head	*pages,
 	unsigned		nr_pages)
 {
+	trace_xfs_vm_readpages(mapping->host, nr_pages);
 	return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
 }
 
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index f6ffc9ae5..a4343c63f 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -62,5 +62,6 @@ int	xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset,
 			         struct buffer_head *map_bh, int create);
 
 extern void xfs_count_page_state(struct page *, int *, int *);
+extern struct block_device *xfs_find_bdev_for_inode(struct inode *);
 
 #endif /* __XFS_AOPS_H__ */
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index dbae6490a..6c876012b 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -75,7 +75,8 @@ xfs_zero_extent(
 	ssize_t		size = XFS_FSB_TO_B(mp, count_fsb);
 
 	if (IS_DAX(VFS_I(ip)))
-		return dax_clear_blocks(VFS_I(ip), block, size);
+		return dax_clear_sectors(xfs_find_bdev_for_inode(VFS_I(ip)),
+				sector, size);
 
 	/*
 	 * let the block layer decide on the fastest method of
@@ -91,32 +92,32 @@ xfs_zero_extent(
  * last due to locking considerations.  We never free any extents in
  * the first transaction.
  *
- * Return 1 if the given transaction was committed and a new one
- * started, and 0 otherwise in the committed parameter.
+ * If an inode *ip is provided, rejoin it to the transaction if
+ * the transaction was committed.
  */
 int						/* error */
 xfs_bmap_finish(
 	struct xfs_trans		**tp,	/* transaction pointer addr */
 	struct xfs_bmap_free		*flist,	/* i/o: list extents to free */
-	int				*committed)/* xact committed or not */
+	struct xfs_inode		*ip)
 {
 	struct xfs_efd_log_item		*efd;	/* extent free data */
 	struct xfs_efi_log_item		*efi;	/* extent free intention */
 	int				error;	/* error return value */
+	int				committed;/* xact committed or not */
 	struct xfs_bmap_free_item	*free;	/* free extent item */
 	struct xfs_bmap_free_item	*next;	/* next item on free list */
 
 	ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES);
-	if (flist->xbf_count == 0) {
-		*committed = 0;
+	if (flist->xbf_count == 0)
 		return 0;
-	}
+
 	efi = xfs_trans_get_efi(*tp, flist->xbf_count);
 	for (free = flist->xbf_first; free; free = free->xbfi_next)
 		xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock,
 			free->xbfi_blockcount);
 
-	error = __xfs_trans_roll(tp, NULL, committed);
+	error = __xfs_trans_roll(tp, ip, &committed);
 	if (error) {
 		/*
 		 * If the transaction was committed, drop the EFD reference
@@ -128,16 +129,13 @@ xfs_bmap_finish(
 		 * transaction so we should return committed=1 even though we're
 		 * returning an error.
 		 */
-		if (*committed) {
+		if (committed) {
 			xfs_efi_release(efi);
 			xfs_force_shutdown((*tp)->t_mountp,
 				(error == -EFSCORRUPTED) ?
 					SHUTDOWN_CORRUPT_INCORE :
 					SHUTDOWN_META_IO_ERROR);
-		} else {
-			*committed = 1;
 		}
-
 		return error;
 	}
 
@@ -969,7 +967,6 @@ xfs_alloc_file_space(
 	xfs_bmbt_irec_t		imaps[1], *imapp;
 	xfs_bmap_free_t		free_list;
 	uint			qblocks, resblks, resrtextents;
-	int			committed;
 	int			error;
 
 	trace_xfs_alloc_file_space(ip);
@@ -1064,23 +1061,20 @@ xfs_alloc_file_space(
 		error = xfs_bmapi_write(tp, ip, startoffset_fsb,
 					allocatesize_fsb, alloc_type, &firstfsb,
 					resblks, imapp, &nimaps, &free_list);
-		if (error) {
+		if (error)
 			goto error0;
-		}
 
 		/*
 		 * Complete the transaction
 		 */
-		error = xfs_bmap_finish(&tp, &free_list, &committed);
-		if (error) {
+		error = xfs_bmap_finish(&tp, &free_list, NULL);
+		if (error)
 			goto error0;
-		}
 
 		error = xfs_trans_commit(tp);
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		if (error) {
+		if (error)
 			break;
-		}
 
 		allocated_fsb = imapp->br_blockcount;
 
@@ -1206,7 +1200,6 @@ xfs_free_file_space(
 	xfs_off_t		offset,
 	xfs_off_t		len)
 {
-	int			committed;
 	int			done;
 	xfs_fileoff_t		endoffset_fsb;
 	int			error;
@@ -1346,17 +1339,15 @@ xfs_free_file_space(
 		error = xfs_bunmapi(tp, ip, startoffset_fsb,
 				  endoffset_fsb - startoffset_fsb,
 				  0, 2, &firstfsb, &free_list, &done);
-		if (error) {
+		if (error)
 			goto error0;
-		}
 
 		/*
 		 * complete the transaction
 		 */
-		error = xfs_bmap_finish(&tp, &free_list, &committed);
-		if (error) {
+		error = xfs_bmap_finish(&tp, &free_list, NULL);
+		if (error)
 			goto error0;
-		}
 
 		error = xfs_trans_commit(tp);
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
@@ -1434,7 +1425,6 @@ xfs_shift_file_space(
 	int			error;
 	struct xfs_bmap_free	free_list;
 	xfs_fsblock_t		first_block;
-	int			committed;
 	xfs_fileoff_t		stop_fsb;
 	xfs_fileoff_t		next_fsb;
 	xfs_fileoff_t		shift_fsb;
@@ -1526,7 +1516,7 @@ xfs_shift_file_space(
 		if (error)
 			goto out_bmap_cancel;
 
-		error = xfs_bmap_finish(&tp, &free_list, &committed);
+		error = xfs_bmap_finish(&tp, &free_list, NULL);
 		if (error)
 			goto out_bmap_cancel;
 
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 39090fc56..435c7de42 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -1052,7 +1052,7 @@ xfs_buf_ioend_work(
 	xfs_buf_ioend(bp);
 }
 
-void
+static void
 xfs_buf_ioend_async(
 	struct xfs_buf	*bp)
 {
@@ -1649,13 +1649,9 @@ xfs_setsize_buftarg(
 	btp->bt_meta_sectormask = sectorsize - 1;
 
 	if (set_blocksize(btp->bt_bdev, sectorsize)) {
-		char name[BDEVNAME_SIZE];
-
-		bdevname(btp->bt_bdev, name);
-
 		xfs_warn(btp->bt_mount,
-			"Cannot set_blocksize to %u on device %s",
-			sectorsize, name);
+			"Cannot set_blocksize to %u on device %pg",
+			sectorsize, btp->bt_bdev);
 		return -EINVAL;
 	}
 
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index c79b717d9..c75721acd 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -132,6 +132,7 @@ struct xfs_buf_map {
 	struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) };
 
 struct xfs_buf_ops {
+	char *name;
 	void (*verify_read)(struct xfs_buf *);
 	void (*verify_write)(struct xfs_buf *);
 };
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 7ac6c5c58..9c44d38dc 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -306,7 +306,7 @@ xfs_qm_dqalloc(
 	xfs_fsblock_t	firstblock;
 	xfs_bmap_free_t flist;
 	xfs_bmbt_irec_t map;
-	int		nmaps, error, committed;
+	int		nmaps, error;
 	xfs_buf_t	*bp;
 	xfs_trans_t	*tp = *tpp;
 
@@ -379,11 +379,12 @@ xfs_qm_dqalloc(
 
 	xfs_trans_bhold(tp, bp);
 
-	if ((error = xfs_bmap_finish(tpp, &flist, &committed))) {
+	error = xfs_bmap_finish(tpp, &flist, NULL);
+	if (error)
 		goto error1;
-	}
 
-	if (committed) {
+	/* Transaction was committed? */
+	if (*tpp != tp) {
 		tp = *tpp;
 		xfs_trans_bjoin(tp, bp);
 	} else {
@@ -393,9 +394,9 @@ xfs_qm_dqalloc(
 	*O_bpp = bp;
 	return 0;
 
-      error1:
+error1:
 	xfs_bmap_cancel(&flist);
-      error0:
+error0:
 	xfs_iunlock(quotip, XFS_ILOCK_EXCL);
 
 	return error;
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 74d0e5966..88693a98f 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -164,9 +164,9 @@ xfs_verifier_error(
 {
 	struct xfs_mount *mp = bp->b_target->bt_mount;
 
-	xfs_alert(mp, "Metadata %s detected at %pF, block 0x%llx",
+	xfs_alert(mp, "Metadata %s detected at %pF, %s block 0x%llx",
 		  bp->b_error == -EFSBADCRC ? "CRC error" : "corruption",
-		  __return_address, bp->b_bn);
+		  __return_address, bp->b_ops->name, bp->b_bn);
 
 	xfs_alert(mp, "Unmount and run xfs_repair");
 
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index f5392ab2d..52883ac3c 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -55,7 +55,7 @@ xfs_rw_ilock(
 	int			type)
 {
 	if (type & XFS_IOLOCK_EXCL)
-		mutex_lock(&VFS_I(ip)->i_mutex);
+		inode_lock(VFS_I(ip));
 	xfs_ilock(ip, type);
 }
 
@@ -66,7 +66,7 @@ xfs_rw_iunlock(
 {
 	xfs_iunlock(ip, type);
 	if (type & XFS_IOLOCK_EXCL)
-		mutex_unlock(&VFS_I(ip)->i_mutex);
+		inode_unlock(VFS_I(ip));
 }
 
 static inline void
@@ -76,7 +76,7 @@ xfs_rw_ilock_demote(
 {
 	xfs_ilock_demote(ip, type);
 	if (type & XFS_IOLOCK_EXCL)
-		mutex_unlock(&VFS_I(ip)->i_mutex);
+		inode_unlock(VFS_I(ip));
 }
 
 /*
@@ -402,19 +402,26 @@ xfs_file_splice_read(
 	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 		return -EIO;
 
-	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
-
 	trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
 
-	/* for dax, we need to avoid the page cache */
-	if (IS_DAX(VFS_I(ip)))
-		ret = default_file_splice_read(infilp, ppos, pipe, count, flags);
-	else
-		ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
-	if (ret > 0)
-		XFS_STATS_ADD(ip->i_mount, xs_read_bytes, ret);
+	/*
+	 * DAX inodes cannot use the page cache for splice, so we have to push
+	 * them through the VFS IO path. This means it goes through
+	 * ->read_iter, which for us takes the XFS_IOLOCK_SHARED. Hence we
+	 * cannot lock the splice operation at this level for DAX inodes.
+	 */
+	if (IS_DAX(VFS_I(ip))) {
+		ret = default_file_splice_read(infilp, ppos, pipe, count,
+					       flags);
+		goto out;
+	}
 
+	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
+	ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
 	xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
+out:
+	if (ret > 0)
+		XFS_STATS_ADD(ip->i_mount, xs_read_bytes, ret);
 	return ret;
 }
 
@@ -1603,9 +1610,8 @@ xfs_filemap_pmd_fault(
 /*
  * pfn_mkwrite was originally inteneded to ensure we capture time stamp
  * updates on write faults. In reality, it's need to serialise against
- * truncate similar to page_mkwrite. Hence we open-code dax_pfn_mkwrite()
- * here and cycle the XFS_MMAPLOCK_SHARED to ensure we serialise the fault
- * barrier in place.
+ * truncate similar to page_mkwrite. Hence we cycle the XFS_MMAPLOCK_SHARED
+ * to ensure we serialise the fault barrier in place.
  */
 static int
 xfs_filemap_pfn_mkwrite(
@@ -1628,6 +1634,8 @@ xfs_filemap_pfn_mkwrite(
 	size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	if (vmf->pgoff >= size)
 		ret = VM_FAULT_SIGBUS;
+	else if (IS_DAX(inode))
+		ret = dax_pfn_mkwrite(vma, vmf);
 	xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
 	sb_end_pagefault(inode->i_sb);
 	return ret;
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 8ee393996..ceba1a83c 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -610,60 +610,69 @@ __xfs_iflock(
 
 STATIC uint
 _xfs_dic2xflags(
-	__uint16_t		di_flags)
+	__uint16_t		di_flags,
+	uint64_t		di_flags2,
+	bool			has_attr)
 {
 	uint			flags = 0;
 
 	if (di_flags & XFS_DIFLAG_ANY) {
 		if (di_flags & XFS_DIFLAG_REALTIME)
-			flags |= XFS_XFLAG_REALTIME;
+			flags |= FS_XFLAG_REALTIME;
 		if (di_flags & XFS_DIFLAG_PREALLOC)
-			flags |= XFS_XFLAG_PREALLOC;
+			flags |= FS_XFLAG_PREALLOC;
 		if (di_flags & XFS_DIFLAG_IMMUTABLE)
-			flags |= XFS_XFLAG_IMMUTABLE;
+			flags |= FS_XFLAG_IMMUTABLE;
 		if (di_flags & XFS_DIFLAG_APPEND)
-			flags |= XFS_XFLAG_APPEND;
+			flags |= FS_XFLAG_APPEND;
 		if (di_flags & XFS_DIFLAG_SYNC)
-			flags |= XFS_XFLAG_SYNC;
+			flags |= FS_XFLAG_SYNC;
 		if (di_flags & XFS_DIFLAG_NOATIME)
-			flags |= XFS_XFLAG_NOATIME;
+			flags |= FS_XFLAG_NOATIME;
 		if (di_flags & XFS_DIFLAG_NODUMP)
-			flags |= XFS_XFLAG_NODUMP;
+			flags |= FS_XFLAG_NODUMP;
 		if (di_flags & XFS_DIFLAG_RTINHERIT)
-			flags |= XFS_XFLAG_RTINHERIT;
+			flags |= FS_XFLAG_RTINHERIT;
 		if (di_flags & XFS_DIFLAG_PROJINHERIT)
-			flags |= XFS_XFLAG_PROJINHERIT;
+			flags |= FS_XFLAG_PROJINHERIT;
 		if (di_flags & XFS_DIFLAG_NOSYMLINKS)
-			flags |= XFS_XFLAG_NOSYMLINKS;
+			flags |= FS_XFLAG_NOSYMLINKS;
 		if (di_flags & XFS_DIFLAG_EXTSIZE)
-			flags |= XFS_XFLAG_EXTSIZE;
+			flags |= FS_XFLAG_EXTSIZE;
 		if (di_flags & XFS_DIFLAG_EXTSZINHERIT)
-			flags |= XFS_XFLAG_EXTSZINHERIT;
+			flags |= FS_XFLAG_EXTSZINHERIT;
 		if (di_flags & XFS_DIFLAG_NODEFRAG)
-			flags |= XFS_XFLAG_NODEFRAG;
+			flags |= FS_XFLAG_NODEFRAG;
 		if (di_flags & XFS_DIFLAG_FILESTREAM)
-			flags |= XFS_XFLAG_FILESTREAM;
+			flags |= FS_XFLAG_FILESTREAM;
 	}
 
+	if (di_flags2 & XFS_DIFLAG2_ANY) {
+		if (di_flags2 & XFS_DIFLAG2_DAX)
+			flags |= FS_XFLAG_DAX;
+	}
+
+	if (has_attr)
+		flags |= FS_XFLAG_HASATTR;
+
 	return flags;
 }
 
 uint
 xfs_ip2xflags(
-	xfs_inode_t		*ip)
+	struct xfs_inode	*ip)
 {
-	xfs_icdinode_t		*dic = &ip->i_d;
+	struct xfs_icdinode	*dic = &ip->i_d;
 
-	return _xfs_dic2xflags(dic->di_flags) |
-				(XFS_IFORK_Q(ip) ? XFS_XFLAG_HASATTR : 0);
+	return _xfs_dic2xflags(dic->di_flags, dic->di_flags2, XFS_IFORK_Q(ip));
 }
 
 uint
 xfs_dic2xflags(
-	xfs_dinode_t		*dip)
+	struct xfs_dinode	*dip)
 {
-	return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) |
-				(XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0);
+	return _xfs_dic2xflags(be16_to_cpu(dip->di_flags),
+				be64_to_cpu(dip->di_flags2), XFS_DFORK_Q(dip));
 }
 
 /*
@@ -862,7 +871,8 @@ xfs_ialloc(
 	case S_IFREG:
 	case S_IFDIR:
 		if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) {
-			uint	di_flags = 0;
+			uint64_t	di_flags2 = 0;
+			uint		di_flags = 0;
 
 			if (S_ISDIR(mode)) {
 				if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT)
@@ -898,7 +908,11 @@ xfs_ialloc(
 				di_flags |= XFS_DIFLAG_NODEFRAG;
 			if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM)
 				di_flags |= XFS_DIFLAG_FILESTREAM;
+			if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX)
+				di_flags2 |= XFS_DIFLAG2_DAX;
+
 			ip->i_d.di_flags |= di_flags;
+			ip->i_d.di_flags2 |= di_flags2;
 		}
 		/* FALLTHROUGH */
 	case S_IFLNK:
@@ -1143,7 +1157,6 @@ xfs_create(
 	xfs_bmap_free_t		free_list;
 	xfs_fsblock_t		first_block;
 	bool                    unlock_dp_on_error = false;
-	int			committed;
 	prid_t			prid;
 	struct xfs_dquot	*udqp = NULL;
 	struct xfs_dquot	*gdqp = NULL;
@@ -1226,7 +1239,7 @@ xfs_create(
 	 * pointing to itself.
 	 */
 	error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev,
-			       prid, resblks > 0, &ip, &committed);
+			       prid, resblks > 0, &ip, NULL);
 	if (error)
 		goto out_trans_cancel;
 
@@ -1275,7 +1288,7 @@ xfs_create(
 	 */
 	xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp);
 
-	error = xfs_bmap_finish(&tp, &free_list, &committed);
+	error = xfs_bmap_finish(&tp, &free_list, NULL);
 	if (error)
 		goto out_bmap_cancel;
 
@@ -1427,7 +1440,6 @@ xfs_link(
 	int			error;
 	xfs_bmap_free_t         free_list;
 	xfs_fsblock_t           first_block;
-	int			committed;
 	int			resblks;
 
 	trace_xfs_link(tdp, target_name);
@@ -1502,11 +1514,10 @@ xfs_link(
 	 * link transaction goes to disk before returning to
 	 * the user.
 	 */
-	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) {
+	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
 		xfs_trans_set_sync(tp);
-	}
 
-	error = xfs_bmap_finish (&tp, &free_list, &committed);
+	error = xfs_bmap_finish(&tp, &free_list, NULL);
 	if (error) {
 		xfs_bmap_cancel(&free_list);
 		goto error_return;
@@ -1555,7 +1566,6 @@ xfs_itruncate_extents(
 	xfs_fileoff_t		first_unmap_block;
 	xfs_fileoff_t		last_block;
 	xfs_filblks_t		unmap_len;
-	int			committed;
 	int			error = 0;
 	int			done = 0;
 
@@ -1601,9 +1611,7 @@ xfs_itruncate_extents(
 		 * Duplicate the transaction that has the permanent
 		 * reservation and commit the old transaction.
 		 */
-		error = xfs_bmap_finish(&tp, &free_list, &committed);
-		if (committed)
-			xfs_trans_ijoin(tp, ip, 0);
+		error = xfs_bmap_finish(&tp, &free_list, ip);
 		if (error)
 			goto out_bmap_cancel;
 
@@ -1774,7 +1782,6 @@ xfs_inactive_ifree(
 {
 	xfs_bmap_free_t		free_list;
 	xfs_fsblock_t		first_block;
-	int			committed;
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_trans	*tp;
 	int			error;
@@ -1841,7 +1848,7 @@ xfs_inactive_ifree(
 	 * Just ignore errors at this point.  There is nothing we can do except
 	 * to try to keep going. Make sure it's not a silent error.
 	 */
-	error = xfs_bmap_finish(&tp,  &free_list, &committed);
+	error = xfs_bmap_finish(&tp, &free_list, NULL);
 	if (error) {
 		xfs_notice(mp, "%s: xfs_bmap_finish returned error %d",
 			__func__, error);
@@ -2523,7 +2530,6 @@ xfs_remove(
 	int                     error = 0;
 	xfs_bmap_free_t         free_list;
 	xfs_fsblock_t           first_block;
-	int			committed;
 	uint			resblks;
 
 	trace_xfs_remove(dp, name);
@@ -2624,7 +2630,7 @@ xfs_remove(
 	if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
 		xfs_trans_set_sync(tp);
 
-	error = xfs_bmap_finish(&tp, &free_list, &committed);
+	error = xfs_bmap_finish(&tp, &free_list, NULL);
 	if (error)
 		goto out_bmap_cancel;
 
@@ -2701,7 +2707,6 @@ xfs_finish_rename(
 	struct xfs_trans	*tp,
 	struct xfs_bmap_free	*free_list)
 {
-	int			committed = 0;
 	int			error;
 
 	/*
@@ -2711,7 +2716,7 @@ xfs_finish_rename(
 	if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC))
 		xfs_trans_set_sync(tp);
 
-	error = xfs_bmap_finish(&tp, free_list, &committed);
+	error = xfs_bmap_finish(&tp, free_list, NULL);
 	if (error) {
 		xfs_bmap_cancel(free_list);
 		xfs_trans_cancel(tp);
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index d42738dee..478d04e07 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -859,25 +859,25 @@ xfs_merge_ioc_xflags(
 	unsigned int	xflags = start;
 
 	if (flags & FS_IMMUTABLE_FL)
-		xflags |= XFS_XFLAG_IMMUTABLE;
+		xflags |= FS_XFLAG_IMMUTABLE;
 	else
-		xflags &= ~XFS_XFLAG_IMMUTABLE;
+		xflags &= ~FS_XFLAG_IMMUTABLE;
 	if (flags & FS_APPEND_FL)
-		xflags |= XFS_XFLAG_APPEND;
+		xflags |= FS_XFLAG_APPEND;
 	else
-		xflags &= ~XFS_XFLAG_APPEND;
+		xflags &= ~FS_XFLAG_APPEND;
 	if (flags & FS_SYNC_FL)
-		xflags |= XFS_XFLAG_SYNC;
+		xflags |= FS_XFLAG_SYNC;
 	else
-		xflags &= ~XFS_XFLAG_SYNC;
+		xflags &= ~FS_XFLAG_SYNC;
 	if (flags & FS_NOATIME_FL)
-		xflags |= XFS_XFLAG_NOATIME;
+		xflags |= FS_XFLAG_NOATIME;
 	else
-		xflags &= ~XFS_XFLAG_NOATIME;
+		xflags &= ~FS_XFLAG_NOATIME;
 	if (flags & FS_NODUMP_FL)
-		xflags |= XFS_XFLAG_NODUMP;
+		xflags |= FS_XFLAG_NODUMP;
 	else
-		xflags &= ~XFS_XFLAG_NODUMP;
+		xflags &= ~FS_XFLAG_NODUMP;
 
 	return xflags;
 }
@@ -945,40 +945,51 @@ xfs_set_diflags(
 	unsigned int		xflags)
 {
 	unsigned int		di_flags;
+	uint64_t		di_flags2;
 
 	/* can't set PREALLOC this way, just preserve it */
 	di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC);
-	if (xflags & XFS_XFLAG_IMMUTABLE)
+	if (xflags & FS_XFLAG_IMMUTABLE)
 		di_flags |= XFS_DIFLAG_IMMUTABLE;
-	if (xflags & XFS_XFLAG_APPEND)
+	if (xflags & FS_XFLAG_APPEND)
 		di_flags |= XFS_DIFLAG_APPEND;
-	if (xflags & XFS_XFLAG_SYNC)
+	if (xflags & FS_XFLAG_SYNC)
 		di_flags |= XFS_DIFLAG_SYNC;
-	if (xflags & XFS_XFLAG_NOATIME)
+	if (xflags & FS_XFLAG_NOATIME)
 		di_flags |= XFS_DIFLAG_NOATIME;
-	if (xflags & XFS_XFLAG_NODUMP)
+	if (xflags & FS_XFLAG_NODUMP)
 		di_flags |= XFS_DIFLAG_NODUMP;
-	if (xflags & XFS_XFLAG_NODEFRAG)
+	if (xflags & FS_XFLAG_NODEFRAG)
 		di_flags |= XFS_DIFLAG_NODEFRAG;
-	if (xflags & XFS_XFLAG_FILESTREAM)
+	if (xflags & FS_XFLAG_FILESTREAM)
 		di_flags |= XFS_DIFLAG_FILESTREAM;
 	if (S_ISDIR(ip->i_d.di_mode)) {
-		if (xflags & XFS_XFLAG_RTINHERIT)
+		if (xflags & FS_XFLAG_RTINHERIT)
 			di_flags |= XFS_DIFLAG_RTINHERIT;
-		if (xflags & XFS_XFLAG_NOSYMLINKS)
+		if (xflags & FS_XFLAG_NOSYMLINKS)
 			di_flags |= XFS_DIFLAG_NOSYMLINKS;
-		if (xflags & XFS_XFLAG_EXTSZINHERIT)
+		if (xflags & FS_XFLAG_EXTSZINHERIT)
 			di_flags |= XFS_DIFLAG_EXTSZINHERIT;
-		if (xflags & XFS_XFLAG_PROJINHERIT)
+		if (xflags & FS_XFLAG_PROJINHERIT)
 			di_flags |= XFS_DIFLAG_PROJINHERIT;
 	} else if (S_ISREG(ip->i_d.di_mode)) {
-		if (xflags & XFS_XFLAG_REALTIME)
+		if (xflags & FS_XFLAG_REALTIME)
 			di_flags |= XFS_DIFLAG_REALTIME;
-		if (xflags & XFS_XFLAG_EXTSIZE)
+		if (xflags & FS_XFLAG_EXTSIZE)
 			di_flags |= XFS_DIFLAG_EXTSIZE;
 	}
-
 	ip->i_d.di_flags = di_flags;
+
+	/* diflags2 only valid for v3 inodes. */
+	if (ip->i_d.di_version < 3)
+		return;
+
+	di_flags2 = 0;
+	if (xflags & FS_XFLAG_DAX)
+		di_flags2 |= XFS_DIFLAG2_DAX;
+
+	ip->i_d.di_flags2 = di_flags2;
+
 }
 
 STATIC void
@@ -988,22 +999,27 @@ xfs_diflags_to_linux(
 	struct inode		*inode = VFS_I(ip);
 	unsigned int		xflags = xfs_ip2xflags(ip);
 
-	if (xflags & XFS_XFLAG_IMMUTABLE)
+	if (xflags & FS_XFLAG_IMMUTABLE)
 		inode->i_flags |= S_IMMUTABLE;
 	else
 		inode->i_flags &= ~S_IMMUTABLE;
-	if (xflags & XFS_XFLAG_APPEND)
+	if (xflags & FS_XFLAG_APPEND)
 		inode->i_flags |= S_APPEND;
 	else
 		inode->i_flags &= ~S_APPEND;
-	if (xflags & XFS_XFLAG_SYNC)
+	if (xflags & FS_XFLAG_SYNC)
 		inode->i_flags |= S_SYNC;
 	else
 		inode->i_flags &= ~S_SYNC;
-	if (xflags & XFS_XFLAG_NOATIME)
+	if (xflags & FS_XFLAG_NOATIME)
 		inode->i_flags |= S_NOATIME;
 	else
 		inode->i_flags &= ~S_NOATIME;
+	if (xflags & FS_XFLAG_DAX)
+		inode->i_flags |= S_DAX;
+	else
+		inode->i_flags &= ~S_DAX;
+
 }
 
 static int
@@ -1016,11 +1032,11 @@ xfs_ioctl_setattr_xflags(
 
 	/* Can't change realtime flag if any extents are allocated. */
 	if ((ip->i_d.di_nextents || ip->i_delayed_blks) &&
-	    XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & XFS_XFLAG_REALTIME))
+	    XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & FS_XFLAG_REALTIME))
 		return -EINVAL;
 
 	/* If realtime flag is set then must have realtime device */
-	if (fa->fsx_xflags & XFS_XFLAG_REALTIME) {
+	if (fa->fsx_xflags & FS_XFLAG_REALTIME) {
 		if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 ||
 		    (ip->i_d.di_extsize % mp->m_sb.sb_rextsize))
 			return -EINVAL;
@@ -1031,7 +1047,7 @@ xfs_ioctl_setattr_xflags(
 	 * we have appropriate permission.
 	 */
 	if (((ip->i_d.di_flags & (XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND)) ||
-	     (fa->fsx_xflags & (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) &&
+	     (fa->fsx_xflags & (FS_XFLAG_IMMUTABLE | FS_XFLAG_APPEND))) &&
 	    !capable(CAP_LINUX_IMMUTABLE))
 		return -EPERM;
 
@@ -1095,8 +1111,8 @@ out_cancel:
  * extent size hint validation is somewhat cumbersome. Rules are:
  *
  * 1. extent size hint is only valid for directories and regular files
- * 2. XFS_XFLAG_EXTSIZE is only valid for regular files
- * 3. XFS_XFLAG_EXTSZINHERIT is only valid for directories.
+ * 2. FS_XFLAG_EXTSIZE is only valid for regular files
+ * 3. FS_XFLAG_EXTSZINHERIT is only valid for directories.
  * 4. can only be changed on regular files if no extents are allocated
  * 5. can be changed on directories at any time
  * 6. extsize hint of 0 turns off hints, clears inode flags.
@@ -1112,10 +1128,10 @@ xfs_ioctl_setattr_check_extsize(
 {
 	struct xfs_mount	*mp = ip->i_mount;
 
-	if ((fa->fsx_xflags & XFS_XFLAG_EXTSIZE) && !S_ISREG(ip->i_d.di_mode))
+	if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(ip->i_d.di_mode))
 		return -EINVAL;
 
-	if ((fa->fsx_xflags & XFS_XFLAG_EXTSZINHERIT) &&
+	if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) &&
 	    !S_ISDIR(ip->i_d.di_mode))
 		return -EINVAL;
 
@@ -1132,7 +1148,7 @@ xfs_ioctl_setattr_check_extsize(
 			return -EINVAL;
 
 		if (XFS_IS_REALTIME_INODE(ip) ||
-		    (fa->fsx_xflags & XFS_XFLAG_REALTIME)) {
+		    (fa->fsx_xflags & FS_XFLAG_REALTIME)) {
 			size = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog;
 		} else {
 			size = mp->m_sb.sb_blocksize;
@@ -1143,7 +1159,7 @@ xfs_ioctl_setattr_check_extsize(
 		if (fa->fsx_extsize % size)
 			return -EINVAL;
 	} else
-		fa->fsx_xflags &= ~(XFS_XFLAG_EXTSIZE | XFS_XFLAG_EXTSZINHERIT);
+		fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT);
 
 	return 0;
 }
@@ -1168,7 +1184,7 @@ xfs_ioctl_setattr_check_projid(
 
 	if (xfs_get_projid(ip) != fa->fsx_projid)
 		return -EINVAL;
-	if ((fa->fsx_xflags & XFS_XFLAG_PROJINHERIT) !=
+	if ((fa->fsx_xflags & FS_XFLAG_PROJINHERIT) !=
 	    (ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT))
 		return -EINVAL;
 
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index f4f5b43cf..d81bdc080 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -129,7 +129,6 @@ xfs_iomap_write_direct(
 	xfs_trans_t	*tp;
 	xfs_bmap_free_t free_list;
 	uint		qblocks, resblks, resrtextents;
-	int		committed;
 	int		error;
 	int		lockmode;
 	int		bmapi_flags = XFS_BMAPI_PREALLOC;
@@ -203,15 +202,20 @@ xfs_iomap_write_direct(
 	 * this outside the transaction context, but if we commit and then crash
 	 * we may not have zeroed the blocks and this will be exposed on
 	 * recovery of the allocation. Hence we must zero before commit.
+	 *
 	 * Further, if we are mapping unwritten extents here, we need to zero
 	 * and convert them to written so that we don't need an unwritten extent
 	 * callback for DAX. This also means that we need to be able to dip into
-	 * the reserve block pool if there is no space left but we need to do
-	 * unwritten extent conversion.
+	 * the reserve block pool for bmbt block allocation if there is no space
+	 * left but we need to do unwritten extent conversion.
 	 */
+
 	if (IS_DAX(VFS_I(ip))) {
 		bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO;
-		tp->t_flags |= XFS_TRANS_RESERVE;
+		if (ISUNWRITTEN(imap)) {
+			tp->t_flags |= XFS_TRANS_RESERVE;
+			resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1;
+		}
 	}
 	error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write,
 				  resblks, resrtextents);
@@ -247,7 +251,7 @@ xfs_iomap_write_direct(
 	/*
 	 * Complete the transaction
 	 */
-	error = xfs_bmap_finish(&tp, &free_list, &committed);
+	error = xfs_bmap_finish(&tp, &free_list, NULL);
 	if (error)
 		goto out_bmap_cancel;
 
@@ -693,7 +697,7 @@ xfs_iomap_write_allocate(
 	xfs_bmap_free_t	free_list;
 	xfs_filblks_t	count_fsb;
 	xfs_trans_t	*tp;
-	int		nimaps, committed;
+	int		nimaps;
 	int		error = 0;
 	int		nres;
 
@@ -794,7 +798,7 @@ xfs_iomap_write_allocate(
 			if (error)
 				goto trans_cancel;
 
-			error = xfs_bmap_finish(&tp, &free_list, &committed);
+			error = xfs_bmap_finish(&tp, &free_list, NULL);
 			if (error)
 				goto trans_cancel;
 
@@ -852,7 +856,6 @@ xfs_iomap_write_unwritten(
 	xfs_bmap_free_t free_list;
 	xfs_fsize_t	i_size;
 	uint		resblks;
-	int		committed;
 	int		error;
 
 	trace_xfs_unwritten_convert(ip, offset, count);
@@ -924,7 +927,7 @@ xfs_iomap_write_unwritten(
 			xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 		}
 
-		error = xfs_bmap_finish(&tp, &free_list, &committed);
+		error = xfs_bmap_finish(&tp, &free_list, NULL);
 		if (error)
 			goto error_on_bmapi_transaction;
 
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index 245268a0c..76b71a1c6 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -414,13 +414,17 @@ xfs_vn_rename(
  * uio is kmalloced for this reason...
  */
 STATIC const char *
-xfs_vn_follow_link(
+xfs_vn_get_link(
 	struct dentry		*dentry,
-	void			**cookie)
+	struct inode		*inode,
+	struct delayed_call	*done)
 {
 	char			*link;
 	int			error = -ENOMEM;
 
+	if (!dentry)
+		return ERR_PTR(-ECHILD);
+
 	link = kmalloc(MAXPATHLEN+1, GFP_KERNEL);
 	if (!link)
 		goto out_err;
@@ -429,7 +433,8 @@ xfs_vn_follow_link(
 	if (unlikely(error))
 		goto out_kfree;
 
-	return *cookie = link;
+	set_delayed_call(done, kfree_link, link);
+	return link;
 
  out_kfree:
 	kfree(link);
@@ -1172,8 +1177,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = {
 
 static const struct inode_operations xfs_symlink_inode_operations = {
 	.readlink		= generic_readlink,
-	.follow_link		= xfs_vn_follow_link,
-	.put_link		= kfree_put_link,
+	.get_link		= xfs_vn_get_link,
 	.getattr		= xfs_vn_getattr,
 	.setattr		= xfs_vn_setattr,
 	.setxattr		= generic_setxattr,
@@ -1201,8 +1205,8 @@ xfs_diflags_to_iflags(
 		inode->i_flags |= S_SYNC;
 	if (flags & XFS_DIFLAG_NOATIME)
 		inode->i_flags |= S_NOATIME;
-	/* XXX: Also needs an on-disk per inode flag! */
-	if (ip->i_mount->m_flags & XFS_MOUNT_DAX)
+	if (ip->i_mount->m_flags & XFS_MOUNT_DAX ||
+	    ip->i_d.di_flags2 & XFS_DIFLAG2_DAX)
 		inode->i_flags |= S_DAX;
 }
 
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index f52c72a1a..9c9a1c9bc 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -1188,10 +1188,16 @@ xlog_iodone(xfs_buf_t *bp)
 	int			aborted = 0;
 
 	/*
-	 * Race to shutdown the filesystem if we see an error.
+	 * Race to shutdown the filesystem if we see an error or the iclog is in
+	 * IOABORT state. The IOABORT state is only set in DEBUG mode to inject
+	 * CRC errors into log recovery.
 	 */
-	if (XFS_TEST_ERROR(bp->b_error, l->l_mp,
-			XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) {
+	if (XFS_TEST_ERROR(bp->b_error, l->l_mp, XFS_ERRTAG_IODONE_IOERR,
+			   XFS_RANDOM_IODONE_IOERR) ||
+	    iclog->ic_state & XLOG_STATE_IOABORT) {
+		if (iclog->ic_state & XLOG_STATE_IOABORT)
+			iclog->ic_state &= ~XLOG_STATE_IOABORT;
+
 		xfs_buf_ioerror_alert(bp, __func__);
 		xfs_buf_stale(bp);
 		xfs_force_shutdown(l->l_mp, SHUTDOWN_LOG_IO_ERROR);
@@ -1838,6 +1844,23 @@ xlog_sync(
 	/* calculcate the checksum */
 	iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header,
 					    iclog->ic_datap, size);
+#ifdef DEBUG
+	/*
+	 * Intentionally corrupt the log record CRC based on the error injection
+	 * frequency, if defined. This facilitates testing log recovery in the
+	 * event of torn writes. Hence, set the IOABORT state to abort the log
+	 * write on I/O completion and shutdown the fs. The subsequent mount
+	 * detects the bad CRC and attempts to recover.
+	 */
+	if (log->l_badcrc_factor &&
+	    (prandom_u32() % log->l_badcrc_factor == 0)) {
+		iclog->ic_header.h_crc &= 0xAAAAAAAA;
+		iclog->ic_state |= XLOG_STATE_IOABORT;
+		xfs_warn(log->l_mp,
+	"Intentionally corrupted log record at LSN 0x%llx. Shutdown imminent.",
+			 be64_to_cpu(iclog->ic_header.h_lsn));
+	}
+#endif
 
 	bp->b_io_length = BTOBB(count);
 	bp->b_fspriv = iclog;
@@ -2045,12 +2068,14 @@ xlog_print_tic_res(
 	    "QM_DQCLUSTER",
 	    "QM_QINOCREATE",
 	    "QM_QUOTAOFF_END",
-	    "SB_UNIT",
 	    "FSYNC_TS",
 	    "GROWFSRT_ALLOC",
 	    "GROWFSRT_ZERO",
 	    "GROWFSRT_FREE",
-	    "SWAPEXT"
+	    "SWAPEXT",
+	    "CHECKPOINT",
+	    "ICREATE",
+	    "CREATE_TMPFILE"
 	};
 
 	xfs_warn(mp, "xlog_write: reservation summary:");
@@ -2791,11 +2816,19 @@ xlog_state_do_callback(
 		}
 	} while (!ioerrors && loopdidcallbacks);
 
+#ifdef DEBUG
 	/*
-	 * make one last gasp attempt to see if iclogs are being left in
-	 * limbo..
+	 * Make one last gasp attempt to see if iclogs are being left in limbo.
+	 * If the above loop finds an iclog earlier than the current iclog and
+	 * in one of the syncing states, the current iclog is put into
+	 * DO_CALLBACK and the callbacks are deferred to the completion of the
+	 * earlier iclog. Walk the iclogs in order and make sure that no iclog
+	 * is in DO_CALLBACK unless an earlier iclog is in one of the syncing
+	 * states.
+	 *
+	 * Note that SYNCING|IOABORT is a valid state so we cannot just check
+	 * for ic_state == SYNCING.
 	 */
-#ifdef DEBUG
 	if (funcdidcallbacks) {
 		first_iclog = iclog = log->l_iclog;
 		do {
@@ -2810,7 +2843,7 @@ xlog_state_do_callback(
 			 * IOERROR - give up hope all ye who enter here
 			 */
 			if (iclog->ic_state == XLOG_STATE_WANT_SYNC ||
-			    iclog->ic_state == XLOG_STATE_SYNCING ||
+			    iclog->ic_state & XLOG_STATE_SYNCING ||
 			    iclog->ic_state == XLOG_STATE_DONE_SYNC ||
 			    iclog->ic_state == XLOG_STATE_IOERROR )
 				break;
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 8daba7491..ed8896310 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -62,6 +62,7 @@ static inline uint xlog_get_client_id(__be32 i)
 #define XLOG_STATE_CALLBACK  0x0020 /* Callback functions now */
 #define XLOG_STATE_DIRTY     0x0040 /* Dirty IC log, not ready for ACTIVE status*/
 #define XLOG_STATE_IOERROR   0x0080 /* IO error happened in sync'ing log */
+#define XLOG_STATE_IOABORT   0x0100 /* force abort on I/O completion (debug) */
 #define XLOG_STATE_ALL	     0x7FFF /* All possible valid flags */
 #define XLOG_STATE_NOTUSED   0x8000 /* This IC log not being used */
 
@@ -410,6 +411,8 @@ struct xlog {
 	/* The following field are used for debugging; need to hold icloglock */
 #ifdef DEBUG
 	void			*l_iclog_bak[XLOG_MAX_ICLOGS];
+	/* log record crc error injection factor */
+	uint32_t		l_badcrc_factor;
 #endif
 
 };
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index c5ecaacdd..be5568839 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -61,6 +61,9 @@ xlog_recover_check_summary(
 #else
 #define	xlog_recover_check_summary(log)
 #endif
+STATIC int
+xlog_do_recovery_pass(
+        struct xlog *, xfs_daddr_t, xfs_daddr_t, int, xfs_daddr_t *);
 
 /*
  * This structure is used during recovery to record the buf log items which
@@ -868,136 +871,365 @@ validate_head:
 }
 
 /*
- * Find the sync block number or the tail of the log.
- *
- * This will be the block number of the last record to have its
- * associated buffers synced to disk.  Every log record header has
- * a sync lsn embedded in it.  LSNs hold block numbers, so it is easy
- * to get a sync block number.  The only concern is to figure out which
- * log record header to believe.
- *
- * The following algorithm uses the log record header with the largest
- * lsn.  The entire log record does not need to be valid.  We only care
- * that the header is valid.
+ * Seek backwards in the log for log record headers.
  *
- * We could speed up search by using current head_blk buffer, but it is not
- * available.
+ * Given a starting log block, walk backwards until we find the provided number
+ * of records or hit the provided tail block. The return value is the number of
+ * records encountered or a negative error code. The log block and buffer
+ * pointer of the last record seen are returned in rblk and rhead respectively.
  */
 STATIC int
-xlog_find_tail(
+xlog_rseek_logrec_hdr(
 	struct xlog		*log,
-	xfs_daddr_t		*head_blk,
-	xfs_daddr_t		*tail_blk)
+	xfs_daddr_t		head_blk,
+	xfs_daddr_t		tail_blk,
+	int			count,
+	struct xfs_buf		*bp,
+	xfs_daddr_t		*rblk,
+	struct xlog_rec_header	**rhead,
+	bool			*wrapped)
 {
-	xlog_rec_header_t	*rhead;
-	xlog_op_header_t	*op_head;
+	int			i;
+	int			error;
+	int			found = 0;
 	char			*offset = NULL;
-	xfs_buf_t		*bp;
-	int			error, i, found;
-	xfs_daddr_t		umount_data_blk;
-	xfs_daddr_t		after_umount_blk;
-	xfs_lsn_t		tail_lsn;
-	int			hblks;
+	xfs_daddr_t		end_blk;
 
-	found = 0;
+	*wrapped = false;
 
 	/*
-	 * Find previous log record
+	 * Walk backwards from the head block until we hit the tail or the first
+	 * block in the log.
 	 */
-	if ((error = xlog_find_head(log, head_blk)))
-		return error;
-
-	bp = xlog_get_bp(log, 1);
-	if (!bp)
-		return -ENOMEM;
-	if (*head_blk == 0) {				/* special case */
-		error = xlog_bread(log, 0, 1, bp, &offset);
+	end_blk = head_blk > tail_blk ? tail_blk : 0;
+	for (i = (int) head_blk - 1; i >= end_blk; i--) {
+		error = xlog_bread(log, i, 1, bp, &offset);
 		if (error)
-			goto done;
+			goto out_error;
 
-		if (xlog_get_cycle(offset) == 0) {
-			*tail_blk = 0;
-			/* leave all other log inited values alone */
-			goto done;
+		if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
+			*rblk = i;
+			*rhead = (struct xlog_rec_header *) offset;
+			if (++found == count)
+				break;
 		}
 	}
 
 	/*
-	 * Search backwards looking for log record header block
+	 * If we haven't hit the tail block or the log record header count,
+	 * start looking again from the end of the physical log. Note that
+	 * callers can pass head == tail if the tail is not yet known.
 	 */
-	ASSERT(*head_blk < INT_MAX);
-	for (i = (int)(*head_blk) - 1; i >= 0; i--) {
+	if (tail_blk >= head_blk && found != count) {
+		for (i = log->l_logBBsize - 1; i >= (int) tail_blk; i--) {
+			error = xlog_bread(log, i, 1, bp, &offset);
+			if (error)
+				goto out_error;
+
+			if (*(__be32 *)offset ==
+			    cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
+				*wrapped = true;
+				*rblk = i;
+				*rhead = (struct xlog_rec_header *) offset;
+				if (++found == count)
+					break;
+			}
+		}
+	}
+
+	return found;
+
+out_error:
+	return error;
+}
+
+/*
+ * Seek forward in the log for log record headers.
+ *
+ * Given head and tail blocks, walk forward from the tail block until we find
+ * the provided number of records or hit the head block. The return value is the
+ * number of records encountered or a negative error code. The log block and
+ * buffer pointer of the last record seen are returned in rblk and rhead
+ * respectively.
+ */
+STATIC int
+xlog_seek_logrec_hdr(
+	struct xlog		*log,
+	xfs_daddr_t		head_blk,
+	xfs_daddr_t		tail_blk,
+	int			count,
+	struct xfs_buf		*bp,
+	xfs_daddr_t		*rblk,
+	struct xlog_rec_header	**rhead,
+	bool			*wrapped)
+{
+	int			i;
+	int			error;
+	int			found = 0;
+	char			*offset = NULL;
+	xfs_daddr_t		end_blk;
+
+	*wrapped = false;
+
+	/*
+	 * Walk forward from the tail block until we hit the head or the last
+	 * block in the log.
+	 */
+	end_blk = head_blk > tail_blk ? head_blk : log->l_logBBsize - 1;
+	for (i = (int) tail_blk; i <= end_blk; i++) {
 		error = xlog_bread(log, i, 1, bp, &offset);
 		if (error)
-			goto done;
+			goto out_error;
 
-		if (*(__be32 *)offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
-			found = 1;
-			break;
+		if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
+			*rblk = i;
+			*rhead = (struct xlog_rec_header *) offset;
+			if (++found == count)
+				break;
 		}
 	}
+
 	/*
-	 * If we haven't found the log record header block, start looking
-	 * again from the end of the physical log.  XXXmiken: There should be
-	 * a check here to make sure we didn't search more than N blocks in
-	 * the previous code.
+	 * If we haven't hit the head block or the log record header count,
+	 * start looking again from the start of the physical log.
 	 */
-	if (!found) {
-		for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
+	if (tail_blk > head_blk && found != count) {
+		for (i = 0; i < (int) head_blk; i++) {
 			error = xlog_bread(log, i, 1, bp, &offset);
 			if (error)
-				goto done;
+				goto out_error;
 
 			if (*(__be32 *)offset ==
 			    cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) {
-				found = 2;
-				break;
+				*wrapped = true;
+				*rblk = i;
+				*rhead = (struct xlog_rec_header *) offset;
+				if (++found == count)
+					break;
 			}
 		}
 	}
-	if (!found) {
-		xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
-		xlog_put_bp(bp);
-		ASSERT(0);
-		return -EIO;
+
+	return found;
+
+out_error:
+	return error;
+}
+
+/*
+ * Check the log tail for torn writes. This is required when torn writes are
+ * detected at the head and the head had to be walked back to a previous record.
+ * The tail of the previous record must now be verified to ensure the torn
+ * writes didn't corrupt the previous tail.
+ *
+ * Return an error if CRC verification fails as recovery cannot proceed.
+ */
+STATIC int
+xlog_verify_tail(
+	struct xlog		*log,
+	xfs_daddr_t		head_blk,
+	xfs_daddr_t		tail_blk)
+{
+	struct xlog_rec_header	*thead;
+	struct xfs_buf		*bp;
+	xfs_daddr_t		first_bad;
+	int			count;
+	int			error = 0;
+	bool			wrapped;
+	xfs_daddr_t		tmp_head;
+
+	bp = xlog_get_bp(log, 1);
+	if (!bp)
+		return -ENOMEM;
+
+	/*
+	 * Seek XLOG_MAX_ICLOGS + 1 records past the current tail record to get
+	 * a temporary head block that points after the last possible
+	 * concurrently written record of the tail.
+	 */
+	count = xlog_seek_logrec_hdr(log, head_blk, tail_blk,
+				     XLOG_MAX_ICLOGS + 1, bp, &tmp_head, &thead,
+				     &wrapped);
+	if (count < 0) {
+		error = count;
+		goto out;
 	}
 
-	/* find blk_no of tail of log */
-	rhead = (xlog_rec_header_t *)offset;
-	*tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
+	/*
+	 * If the call above didn't find XLOG_MAX_ICLOGS + 1 records, we ran
+	 * into the actual log head. tmp_head points to the start of the record
+	 * so update it to the actual head block.
+	 */
+	if (count < XLOG_MAX_ICLOGS + 1)
+		tmp_head = head_blk;
 
 	/*
-	 * Reset log values according to the state of the log when we
-	 * crashed.  In the case where head_blk == 0, we bump curr_cycle
-	 * one because the next write starts a new cycle rather than
-	 * continuing the cycle of the last good log record.  At this
-	 * point we have guaranteed that all partial log records have been
-	 * accounted for.  Therefore, we know that the last good log record
-	 * written was complete and ended exactly on the end boundary
-	 * of the physical log.
+	 * We now have a tail and temporary head block that covers at least
+	 * XLOG_MAX_ICLOGS records from the tail. We need to verify that these
+	 * records were completely written. Run a CRC verification pass from
+	 * tail to head and return the result.
 	 */
-	log->l_prev_block = i;
-	log->l_curr_block = (int)*head_blk;
-	log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
-	if (found == 2)
-		log->l_curr_cycle++;
-	atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
-	atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
-	xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
-					BBTOB(log->l_curr_block));
-	xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
-					BBTOB(log->l_curr_block));
+	error = xlog_do_recovery_pass(log, tmp_head, tail_blk,
+				      XLOG_RECOVER_CRCPASS, &first_bad);
+
+out:
+	xlog_put_bp(bp);
+	return error;
+}
+
+/*
+ * Detect and trim torn writes from the head of the log.
+ *
+ * Storage without sector atomicity guarantees can result in torn writes in the
+ * log in the event of a crash. Our only means to detect this scenario is via
+ * CRC verification. While we can't always be certain that CRC verification
+ * failure is due to a torn write vs. an unrelated corruption, we do know that
+ * only a certain number (XLOG_MAX_ICLOGS) of log records can be written out at
+ * one time. Therefore, CRC verify up to XLOG_MAX_ICLOGS records at the head of
+ * the log and treat failures in this range as torn writes as a matter of
+ * policy. In the event of CRC failure, the head is walked back to the last good
+ * record in the log and the tail is updated from that record and verified.
+ */
+STATIC int
+xlog_verify_head(
+	struct xlog		*log,
+	xfs_daddr_t		*head_blk,	/* in/out: unverified head */
+	xfs_daddr_t		*tail_blk,	/* out: tail block */
+	struct xfs_buf		*bp,
+	xfs_daddr_t		*rhead_blk,	/* start blk of last record */
+	struct xlog_rec_header	**rhead,	/* ptr to last record */
+	bool			*wrapped)	/* last rec. wraps phys. log */
+{
+	struct xlog_rec_header	*tmp_rhead;
+	struct xfs_buf		*tmp_bp;
+	xfs_daddr_t		first_bad;
+	xfs_daddr_t		tmp_rhead_blk;
+	int			found;
+	int			error;
+	bool			tmp_wrapped;
 
 	/*
-	 * Look for unmount record.  If we find it, then we know there
-	 * was a clean unmount.  Since 'i' could be the last block in
-	 * the physical log, we convert to a log block before comparing
-	 * to the head_blk.
+	 * Check the head of the log for torn writes. Search backwards from the
+	 * head until we hit the tail or the maximum number of log record I/Os
+	 * that could have been in flight at one time. Use a temporary buffer so
+	 * we don't trash the rhead/bp pointers from the caller.
+	 */
+	tmp_bp = xlog_get_bp(log, 1);
+	if (!tmp_bp)
+		return -ENOMEM;
+	error = xlog_rseek_logrec_hdr(log, *head_blk, *tail_blk,
+				      XLOG_MAX_ICLOGS, tmp_bp, &tmp_rhead_blk,
+				      &tmp_rhead, &tmp_wrapped);
+	xlog_put_bp(tmp_bp);
+	if (error < 0)
+		return error;
+
+	/*
+	 * Now run a CRC verification pass over the records starting at the
+	 * block found above to the current head. If a CRC failure occurs, the
+	 * log block of the first bad record is saved in first_bad.
+	 */
+	error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk,
+				      XLOG_RECOVER_CRCPASS, &first_bad);
+	if (error == -EFSBADCRC) {
+		/*
+		 * We've hit a potential torn write. Reset the error and warn
+		 * about it.
+		 */
+		error = 0;
+		xfs_warn(log->l_mp,
+"Torn write (CRC failure) detected at log block 0x%llx. Truncating head block from 0x%llx.",
+			 first_bad, *head_blk);
+
+		/*
+		 * Get the header block and buffer pointer for the last good
+		 * record before the bad record.
+		 *
+		 * Note that xlog_find_tail() clears the blocks at the new head
+		 * (i.e., the records with invalid CRC) if the cycle number
+		 * matches the current cycle.
+		 */
+		found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1, bp,
+					      rhead_blk, rhead, wrapped);
+		if (found < 0)
+			return found;
+		if (found == 0)		/* XXX: right thing to do here? */
+			return -EIO;
+
+		/*
+		 * Reset the head block to the starting block of the first bad
+		 * log record and set the tail block based on the last good
+		 * record.
+		 *
+		 * Bail out if the updated head/tail match as this indicates
+		 * possible corruption outside of the acceptable
+		 * (XLOG_MAX_ICLOGS) range. This is a job for xfs_repair...
+		 */
+		*head_blk = first_bad;
+		*tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn));
+		if (*head_blk == *tail_blk) {
+			ASSERT(0);
+			return 0;
+		}
+
+		/*
+		 * Now verify the tail based on the updated head. This is
+		 * required because the torn writes trimmed from the head could
+		 * have been written over the tail of a previous record. Return
+		 * any errors since recovery cannot proceed if the tail is
+		 * corrupt.
+		 *
+		 * XXX: This leaves a gap in truly robust protection from torn
+		 * writes in the log. If the head is behind the tail, the tail
+		 * pushes forward to create some space and then a crash occurs
+		 * causing the writes into the previous record's tail region to
+		 * tear, log recovery isn't able to recover.
+		 *
+		 * How likely is this to occur? If possible, can we do something
+		 * more intelligent here? Is it safe to push the tail forward if
+		 * we can determine that the tail is within the range of the
+		 * torn write (e.g., the kernel can only overwrite the tail if
+		 * it has actually been pushed forward)? Alternatively, could we
+		 * somehow prevent this condition at runtime?
+		 */
+		error = xlog_verify_tail(log, *head_blk, *tail_blk);
+	}
+
+	return error;
+}
+
+/*
+ * Check whether the head of the log points to an unmount record. In other
+ * words, determine whether the log is clean. If so, update the in-core state
+ * appropriately.
+ */
+static int
+xlog_check_unmount_rec(
+	struct xlog		*log,
+	xfs_daddr_t		*head_blk,
+	xfs_daddr_t		*tail_blk,
+	struct xlog_rec_header	*rhead,
+	xfs_daddr_t		rhead_blk,
+	struct xfs_buf		*bp,
+	bool			*clean)
+{
+	struct xlog_op_header	*op_head;
+	xfs_daddr_t		umount_data_blk;
+	xfs_daddr_t		after_umount_blk;
+	int			hblks;
+	int			error;
+	char			*offset;
+
+	*clean = false;
+
+	/*
+	 * Look for unmount record. If we find it, then we know there was a
+	 * clean unmount. Since rhead_blk may be the last block in the physical
+	 * log, we convert to a log block before comparing to the head_blk.
 	 *
-	 * Save the current tail lsn to use to pass to
-	 * xlog_clear_stale_blocks() below.  We won't want to clear the
-	 * unmount record if there is one, so we pass the lsn of the
-	 * unmount record rather than the block after it.
+	 * Save the current tail lsn to use to pass to xlog_clear_stale_blocks()
+	 * below. We won't want to clear the unmount record if there is one, so
+	 * we pass the lsn of the unmount record rather than the block after it.
 	 */
 	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
 		int	h_size = be32_to_cpu(rhead->h_size);
@@ -1014,22 +1246,22 @@ xlog_find_tail(
 	} else {
 		hblks = 1;
 	}
-	after_umount_blk = (i + hblks + (int)
-		BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
-	tail_lsn = atomic64_read(&log->l_tail_lsn);
+	after_umount_blk = rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len));
+	after_umount_blk = do_mod(after_umount_blk, log->l_logBBsize);
 	if (*head_blk == after_umount_blk &&
 	    be32_to_cpu(rhead->h_num_logops) == 1) {
-		umount_data_blk = (i + hblks) % log->l_logBBsize;
+		umount_data_blk = rhead_blk + hblks;
+		umount_data_blk = do_mod(umount_data_blk, log->l_logBBsize);
 		error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
 		if (error)
-			goto done;
+			return error;
 
-		op_head = (xlog_op_header_t *)offset;
+		op_head = (struct xlog_op_header *)offset;
 		if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
 			/*
-			 * Set tail and last sync so that newly written
-			 * log records will point recovery to after the
-			 * current unmount record.
+			 * Set tail and last sync so that newly written log
+			 * records will point recovery to after the current
+			 * unmount record.
 			 */
 			xlog_assign_atomic_lsn(&log->l_tail_lsn,
 					log->l_curr_cycle, after_umount_blk);
@@ -1037,16 +1269,166 @@ xlog_find_tail(
 					log->l_curr_cycle, after_umount_blk);
 			*tail_blk = after_umount_blk;
 
-			/*
-			 * Note that the unmount was clean. If the unmount
-			 * was not clean, we need to know this to rebuild the
-			 * superblock counters from the perag headers if we
-			 * have a filesystem using non-persistent counters.
-			 */
-			log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
+			*clean = true;
 		}
 	}
 
+	return 0;
+}
+
+static void
+xlog_set_state(
+	struct xlog		*log,
+	xfs_daddr_t		head_blk,
+	struct xlog_rec_header	*rhead,
+	xfs_daddr_t		rhead_blk,
+	bool			bump_cycle)
+{
+	/*
+	 * Reset log values according to the state of the log when we
+	 * crashed.  In the case where head_blk == 0, we bump curr_cycle
+	 * one because the next write starts a new cycle rather than
+	 * continuing the cycle of the last good log record.  At this
+	 * point we have guaranteed that all partial log records have been
+	 * accounted for.  Therefore, we know that the last good log record
+	 * written was complete and ended exactly on the end boundary
+	 * of the physical log.
+	 */
+	log->l_prev_block = rhead_blk;
+	log->l_curr_block = (int)head_blk;
+	log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
+	if (bump_cycle)
+		log->l_curr_cycle++;
+	atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn));
+	atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn));
+	xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle,
+					BBTOB(log->l_curr_block));
+	xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle,
+					BBTOB(log->l_curr_block));
+}
+
+/*
+ * Find the sync block number or the tail of the log.
+ *
+ * This will be the block number of the last record to have its
+ * associated buffers synced to disk.  Every log record header has
+ * a sync lsn embedded in it.  LSNs hold block numbers, so it is easy
+ * to get a sync block number.  The only concern is to figure out which
+ * log record header to believe.
+ *
+ * The following algorithm uses the log record header with the largest
+ * lsn.  The entire log record does not need to be valid.  We only care
+ * that the header is valid.
+ *
+ * We could speed up search by using current head_blk buffer, but it is not
+ * available.
+ */
+STATIC int
+xlog_find_tail(
+	struct xlog		*log,
+	xfs_daddr_t		*head_blk,
+	xfs_daddr_t		*tail_blk)
+{
+	xlog_rec_header_t	*rhead;
+	char			*offset = NULL;
+	xfs_buf_t		*bp;
+	int			error;
+	xfs_daddr_t		rhead_blk;
+	xfs_lsn_t		tail_lsn;
+	bool			wrapped = false;
+	bool			clean = false;
+
+	/*
+	 * Find previous log record
+	 */
+	if ((error = xlog_find_head(log, head_blk)))
+		return error;
+	ASSERT(*head_blk < INT_MAX);
+
+	bp = xlog_get_bp(log, 1);
+	if (!bp)
+		return -ENOMEM;
+	if (*head_blk == 0) {				/* special case */
+		error = xlog_bread(log, 0, 1, bp, &offset);
+		if (error)
+			goto done;
+
+		if (xlog_get_cycle(offset) == 0) {
+			*tail_blk = 0;
+			/* leave all other log inited values alone */
+			goto done;
+		}
+	}
+
+	/*
+	 * Search backwards through the log looking for the log record header
+	 * block. This wraps all the way back around to the head so something is
+	 * seriously wrong if we can't find it.
+	 */
+	error = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, bp,
+				      &rhead_blk, &rhead, &wrapped);
+	if (error < 0)
+		return error;
+	if (!error) {
+		xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__);
+		return -EIO;
+	}
+	*tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
+
+	/*
+	 * Set the log state based on the current head record.
+	 */
+	xlog_set_state(log, *head_blk, rhead, rhead_blk, wrapped);
+	tail_lsn = atomic64_read(&log->l_tail_lsn);
+
+	/*
+	 * Look for an unmount record at the head of the log. This sets the log
+	 * state to determine whether recovery is necessary.
+	 */
+	error = xlog_check_unmount_rec(log, head_blk, tail_blk, rhead,
+				       rhead_blk, bp, &clean);
+	if (error)
+		goto done;
+
+	/*
+	 * Verify the log head if the log is not clean (i.e., we have anything
+	 * but an unmount record at the head). This uses CRC verification to
+	 * detect and trim torn writes. If discovered, CRC failures are
+	 * considered torn writes and the log head is trimmed accordingly.
+	 *
+	 * Note that we can only run CRC verification when the log is dirty
+	 * because there's no guarantee that the log data behind an unmount
+	 * record is compatible with the current architecture.
+	 */
+	if (!clean) {
+		xfs_daddr_t	orig_head = *head_blk;
+
+		error = xlog_verify_head(log, head_blk, tail_blk, bp,
+					 &rhead_blk, &rhead, &wrapped);
+		if (error)
+			goto done;
+
+		/* update in-core state again if the head changed */
+		if (*head_blk != orig_head) {
+			xlog_set_state(log, *head_blk, rhead, rhead_blk,
+				       wrapped);
+			tail_lsn = atomic64_read(&log->l_tail_lsn);
+			error = xlog_check_unmount_rec(log, head_blk, tail_blk,
+						       rhead, rhead_blk, bp,
+						       &clean);
+			if (error)
+				goto done;
+		}
+	}
+
+	/*
+	 * Note that the unmount was clean. If the unmount was not clean, we
+	 * need to know this to rebuild the superblock counters from the perag
+	 * headers if we have a filesystem using non-persistent counters.
+	 */
+	if (clean)
+		log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
+
 	/*
 	 * Make sure that there are no blocks in front of the head
 	 * with the same cycle number as the head.  This can happen
@@ -3204,6 +3586,7 @@ xlog_recover_dquot_ra_pass2(
 	struct xfs_disk_dquot	*recddq;
 	struct xfs_dq_logformat	*dq_f;
 	uint			type;
+	int			len;
 
 
 	if (mp->m_qflags == 0)
@@ -3224,8 +3607,12 @@ xlog_recover_dquot_ra_pass2(
 	ASSERT(dq_f);
 	ASSERT(dq_f->qlf_len == 1);
 
-	xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno,
-			  XFS_FSB_TO_BB(mp, dq_f->qlf_len), NULL);
+	len = XFS_FSB_TO_BB(mp, dq_f->qlf_len);
+	if (xlog_peek_buffer_cancelled(log, dq_f->qlf_blkno, len, 0))
+		return;
+
+	xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno, len,
+			  &xfs_dquot_buf_ra_ops);
 }
 
 STATIC void
@@ -4118,25 +4505,68 @@ xlog_recover_process_iunlinks(
 	mp->m_dmevmask = mp_dmevmask;
 }
 
+STATIC int
+xlog_unpack_data(
+	struct xlog_rec_header	*rhead,
+	char			*dp,
+	struct xlog		*log)
+{
+	int			i, j, k;
+
+	for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
+		  i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
+		*(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
+		dp += BBSIZE;
+	}
+
+	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
+		xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
+		for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
+			j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+			k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
+			*(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
+			dp += BBSIZE;
+		}
+	}
+
+	return 0;
+}
+
 /*
- * Upack the log buffer data and crc check it. If the check fails, issue a
- * warning if and only if the CRC in the header is non-zero. This makes the
- * check an advisory warning, and the zero CRC check will prevent failure
- * warnings from being emitted when upgrading the kernel from one that does not
- * add CRCs by default.
- *
- * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log
- * corruption failure
+ * CRC check, unpack and process a log record.
  */
 STATIC int
-xlog_unpack_data_crc(
+xlog_recover_process(
+	struct xlog		*log,
+	struct hlist_head	rhash[],
 	struct xlog_rec_header	*rhead,
 	char			*dp,
-	struct xlog		*log)
+	int			pass)
 {
+	int			error;
 	__le32			crc;
 
 	crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len));
+
+	/*
+	 * Nothing else to do if this is a CRC verification pass. Just return
+	 * success if the record has a zero CRC: unfortunately, mkfs always
+	 * sets h_crc to 0 so we must consider this valid even on v5 supers.
+	 * Otherwise, return EFSBADCRC on a mismatch so the callers up the
+	 * stack know precisely what failed.
+	 */
+	if (pass == XLOG_RECOVER_CRCPASS) {
+		if (rhead->h_crc && crc != rhead->h_crc)
+			return -EFSBADCRC;
+		return 0;
+	}
+
+	/*
+	 * We're in the normal recovery path. Issue a warning if and only if the
+	 * CRC in the header is non-zero. This is an advisory warning and the
+	 * zero CRC check prevents warnings from being emitted when upgrading
+	 * the kernel from one that does not add CRCs by default.
+	 */
 	if (crc != rhead->h_crc) {
 		if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) {
 			xfs_alert(log->l_mp,
@@ -4147,47 +4577,18 @@ xlog_unpack_data_crc(
 		}
 
 		/*
-		 * If we've detected a log record corruption, then we can't
-		 * recover past this point. Abort recovery if we are enforcing
-		 * CRC protection by punting an error back up the stack.
+		 * If the filesystem is CRC enabled, this mismatch becomes a
+		 * fatal log corruption failure.
 		 */
 		if (xfs_sb_version_hascrc(&log->l_mp->m_sb))
 			return -EFSCORRUPTED;
 	}
 
-	return 0;
-}
-
-STATIC int
-xlog_unpack_data(
-	struct xlog_rec_header	*rhead,
-	char			*dp,
-	struct xlog		*log)
-{
-	int			i, j, k;
-	int			error;
-
-	error = xlog_unpack_data_crc(rhead, dp, log);
+	error = xlog_unpack_data(rhead, dp, log);
 	if (error)
 		return error;
 
-	for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
-		  i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
-		*(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
-		dp += BBSIZE;
-	}
-
-	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
-		xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
-		for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
-			j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
-			k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
-			*(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
-			dp += BBSIZE;
-		}
-	}
-
-	return 0;
+	return xlog_recover_process_data(log, rhash, rhead, dp, pass);
 }
 
 STATIC int
@@ -4239,18 +4640,21 @@ xlog_do_recovery_pass(
 	struct xlog		*log,
 	xfs_daddr_t		head_blk,
 	xfs_daddr_t		tail_blk,
-	int			pass)
+	int			pass,
+	xfs_daddr_t		*first_bad)	/* out: first bad log rec */
 {
 	xlog_rec_header_t	*rhead;
 	xfs_daddr_t		blk_no;
+	xfs_daddr_t		rhead_blk;
 	char			*offset;
 	xfs_buf_t		*hbp, *dbp;
-	int			error = 0, h_size;
+	int			error = 0, h_size, h_len;
 	int			bblks, split_bblks;
 	int			hblks, split_hblks, wrapped_hblks;
 	struct hlist_head	rhash[XLOG_RHASH_SIZE];
 
 	ASSERT(head_blk != tail_blk);
+	rhead_blk = 0;
 
 	/*
 	 * Read the header of the tail block and get the iclog buffer size from
@@ -4274,7 +4678,31 @@ xlog_do_recovery_pass(
 		error = xlog_valid_rec_header(log, rhead, tail_blk);
 		if (error)
 			goto bread_err1;
+
+		/*
+		 * xfsprogs has a bug where record length is based on lsunit but
+		 * h_size (iclog size) is hardcoded to 32k. Now that we
+		 * unconditionally CRC verify the unmount record, this means the
+		 * log buffer can be too small for the record and cause an
+		 * overrun.
+		 *
+		 * Detect this condition here. Use lsunit for the buffer size as
+		 * long as this looks like the mkfs case. Otherwise, return an
+		 * error to avoid a buffer overrun.
+		 */
 		h_size = be32_to_cpu(rhead->h_size);
+		h_len = be32_to_cpu(rhead->h_len);
+		if (h_len > h_size) {
+			if (h_len <= log->l_mp->m_logbsize &&
+			    be32_to_cpu(rhead->h_num_logops) == 1) {
+				xfs_warn(log->l_mp,
+		"invalid iclog size (%d bytes), using lsunit (%d bytes)",
+					 h_size, log->l_mp->m_logbsize);
+				h_size = log->l_mp->m_logbsize;
+			} else
+				return -EFSCORRUPTED;
+		}
+
 		if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) &&
 		    (h_size > XLOG_HEADER_CYCLE_SIZE)) {
 			hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
@@ -4301,7 +4729,7 @@ xlog_do_recovery_pass(
 	}
 
 	memset(rhash, 0, sizeof(rhash));
-	blk_no = tail_blk;
+	blk_no = rhead_blk = tail_blk;
 	if (tail_blk > head_blk) {
 		/*
 		 * Perform recovery around the end of the physical log.
@@ -4408,19 +4836,18 @@ xlog_do_recovery_pass(
 					goto bread_err2;
 			}
 
-			error = xlog_unpack_data(rhead, offset, log);
+			error = xlog_recover_process(log, rhash, rhead, offset,
+						     pass);
 			if (error)
 				goto bread_err2;
 
-			error = xlog_recover_process_data(log, rhash,
-							rhead, offset, pass);
-			if (error)
-				goto bread_err2;
 			blk_no += bblks;
+			rhead_blk = blk_no;
 		}
 
 		ASSERT(blk_no >= log->l_logBBsize);
 		blk_no -= log->l_logBBsize;
+		rhead_blk = blk_no;
 	}
 
 	/* read first part of physical log */
@@ -4441,21 +4868,22 @@ xlog_do_recovery_pass(
 		if (error)
 			goto bread_err2;
 
-		error = xlog_unpack_data(rhead, offset, log);
+		error = xlog_recover_process(log, rhash, rhead, offset, pass);
 		if (error)
 			goto bread_err2;
 
-		error = xlog_recover_process_data(log, rhash,
-						rhead, offset, pass);
-		if (error)
-			goto bread_err2;
 		blk_no += bblks + hblks;
+		rhead_blk = blk_no;
 	}
 
  bread_err2:
 	xlog_put_bp(dbp);
  bread_err1:
 	xlog_put_bp(hbp);
+
+	if (error && first_bad)
+		*first_bad = rhead_blk;
+
 	return error;
 }
 
@@ -4493,7 +4921,7 @@ xlog_do_log_recovery(
 		INIT_LIST_HEAD(&log->l_buf_cancel_table[i]);
 
 	error = xlog_do_recovery_pass(log, head_blk, tail_blk,
-				      XLOG_RECOVER_PASS1);
+				      XLOG_RECOVER_PASS1, NULL);
 	if (error != 0) {
 		kmem_free(log->l_buf_cancel_table);
 		log->l_buf_cancel_table = NULL;
@@ -4504,7 +4932,7 @@ xlog_do_log_recovery(
 	 * When it is complete free the table of buf cancel items.
 	 */
 	error = xlog_do_recovery_pass(log, head_blk, tail_blk,
-				      XLOG_RECOVER_PASS2);
+				      XLOG_RECOVER_PASS2, NULL);
 #ifdef DEBUG
 	if (!error) {
 		int	i;
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index dc6221942..ade236e90 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -42,11 +42,11 @@ xfs_break_layouts(
 	while ((error = break_layout(inode, false) == -EWOULDBLOCK)) {
 		xfs_iunlock(ip, *iolock);
 		if (with_imutex && (*iolock & XFS_IOLOCK_EXCL))
-			mutex_unlock(&inode->i_mutex);
+			inode_unlock(inode);
 		error = break_layout(inode, true);
 		*iolock = XFS_IOLOCK_EXCL;
 		if (with_imutex)
-			mutex_lock(&inode->i_mutex);
+			inode_lock(inode);
 		xfs_ilock(ip, *iolock);
 	}
 
diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c
index ab1bac6a3..be02a68b2 100644
--- a/fs/xfs/xfs_rtalloc.c
+++ b/fs/xfs/xfs_rtalloc.c
@@ -766,7 +766,6 @@ xfs_growfs_rt_alloc(
 {
 	xfs_fileoff_t		bno;		/* block number in file */
 	struct xfs_buf		*bp;	/* temporary buffer for zeroing */
-	int			committed;	/* transaction committed flag */
 	xfs_daddr_t		d;		/* disk block address */
 	int			error;		/* error return value */
 	xfs_fsblock_t		firstblock;/* first block allocated in xaction */
@@ -811,7 +810,7 @@ xfs_growfs_rt_alloc(
 		/*
 		 * Free any blocks freed up in the transaction, then commit.
 		 */
-		error = xfs_bmap_finish(&tp, &flist, &committed);
+		error = xfs_bmap_finish(&tp, &flist, NULL);
 		if (error)
 			goto out_bmap_cancel;
 		error = xfs_trans_commit(tp);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 36bd8825b..59c9b7bd9 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -137,7 +137,7 @@ static const match_table_t tokens = {
 };
 
 
-STATIC unsigned long
+STATIC int
 suffix_kstrtoint(char *s, unsigned int base, int *res)
 {
 	int	last, shift_left_factor = 0, _res;
@@ -1714,8 +1714,8 @@ xfs_init_zones(void)
 
 	xfs_inode_zone =
 		kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode",
-			KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD,
-			xfs_fs_inode_init_once);
+			KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD |
+			KM_ZONE_ACCOUNT, xfs_fs_inode_init_once);
 	if (!xfs_inode_zone)
 		goto out_destroy_efi_zone;
 
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 996481eeb..b44284c1a 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -178,7 +178,6 @@ xfs_symlink(
 	struct xfs_bmap_free	free_list;
 	xfs_fsblock_t		first_block;
 	bool                    unlock_dp_on_error = false;
-	int			committed;
 	xfs_fileoff_t		first_fsb;
 	xfs_filblks_t		fs_blocks;
 	int			nmaps;
@@ -387,7 +386,7 @@ xfs_symlink(
 		xfs_trans_set_sync(tp);
 	}
 
-	error = xfs_bmap_finish(&tp, &free_list, &committed);
+	error = xfs_bmap_finish(&tp, &free_list, NULL);
 	if (error)
 		goto out_bmap_cancel;
 
@@ -434,7 +433,6 @@ xfs_inactive_symlink_rmt(
 	struct xfs_inode *ip)
 {
 	xfs_buf_t	*bp;
-	int		committed;
 	int		done;
 	int		error;
 	xfs_fsblock_t	first_block;
@@ -510,16 +508,10 @@ xfs_inactive_symlink_rmt(
 	/*
 	 * Commit the first transaction.  This logs the EFI and the inode.
 	 */
-	error = xfs_bmap_finish(&tp, &free_list, &committed);
+	error = xfs_bmap_finish(&tp, &free_list, ip);
 	if (error)
 		goto error_bmap_cancel;
 	/*
-	 * The transaction must have been committed, since there were
-	 * actually extents freed by xfs_bunmapi.  See xfs_bmap_finish.
-	 * The new tp has the extent freeing and EFDs.
-	 */
-	ASSERT(committed);
-	/*
 	 * The first xact was committed, so add the inode to the new one.
 	 * Mark it dirty so it will be logged and moved forward in the log as
 	 * part of every commit.
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index ee70f5dec..641d625eb 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -255,11 +255,47 @@ write_grant_head_show(
 }
 XFS_SYSFS_ATTR_RO(write_grant_head);
 
+#ifdef DEBUG
+STATIC ssize_t
+log_badcrc_factor_store(
+	struct kobject	*kobject,
+	const char	*buf,
+	size_t		count)
+{
+	struct xlog	*log = to_xlog(kobject);
+	int		ret;
+	uint32_t	val;
+
+	ret = kstrtouint(buf, 0, &val);
+	if (ret)
+		return ret;
+
+	log->l_badcrc_factor = val;
+
+	return count;
+}
+
+STATIC ssize_t
+log_badcrc_factor_show(
+	struct kobject	*kobject,
+	char		*buf)
+{
+	struct xlog	*log = to_xlog(kobject);
+
+	return snprintf(buf, PAGE_SIZE, "%d\n", log->l_badcrc_factor);
+}
+
+XFS_SYSFS_ATTR_RW(log_badcrc_factor);
+#endif	/* DEBUG */
+
 static struct attribute *xfs_log_attrs[] = {
 	ATTR_LIST(log_head_lsn),
 	ATTR_LIST(log_tail_lsn),
 	ATTR_LIST(reserve_grant_head),
 	ATTR_LIST(write_grant_head),
+#ifdef DEBUG
+	ATTR_LIST(log_badcrc_factor),
+#endif
 	NULL,
 };
 
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 877079eb0..391d797cb 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1222,6 +1222,32 @@ DEFINE_PAGE_EVENT(xfs_writepage);
 DEFINE_PAGE_EVENT(xfs_releasepage);
 DEFINE_PAGE_EVENT(xfs_invalidatepage);
 
+DECLARE_EVENT_CLASS(xfs_readpage_class,
+	TP_PROTO(struct inode *inode, int nr_pages),
+	TP_ARGS(inode, nr_pages),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(int, nr_pages)
+	),
+	TP_fast_assign(
+		__entry->dev = inode->i_sb->s_dev;
+		__entry->ino = inode->i_ino;
+		__entry->nr_pages = nr_pages;
+	),
+	TP_printk("dev %d:%d ino 0x%llx nr_pages %d",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->nr_pages)
+)
+
+#define DEFINE_READPAGE_EVENT(name)		\
+DEFINE_EVENT(xfs_readpage_class, name,	\
+	TP_PROTO(struct inode *inode, int nr_pages), \
+	TP_ARGS(inode, nr_pages))
+DEFINE_READPAGE_EVENT(xfs_vm_readpage);
+DEFINE_READPAGE_EVENT(xfs_vm_readpages);
+
 DECLARE_EVENT_CLASS(xfs_imap_class,
 	TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count,
 		 int type, struct xfs_bmbt_irec *irec),
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index ce78534a0..995170194 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -572,12 +572,16 @@ xfs_quota_warn(
 	struct xfs_dquot	*dqp,
 	int			type)
 {
-	/* no warnings for project quotas - we just return ENOSPC later */
+	enum quota_type qtype;
+
 	if (dqp->dq_flags & XFS_DQ_PROJ)
-		return;
-	quota_send_warning(make_kqid(&init_user_ns,
-				     (dqp->dq_flags & XFS_DQ_USER) ?
-				     USRQUOTA : GRPQUOTA,
+		qtype = PRJQUOTA;
+	else if (dqp->dq_flags & XFS_DQ_USER)
+		qtype = USRQUOTA;
+	else
+		qtype = GRPQUOTA;
+
+	quota_send_warning(make_kqid(&init_user_ns, qtype,
 				     be32_to_cpu(dqp->q_core.d_id)),
 			   mp->m_super->s_dev, type);
 }
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 839b35ca2..110f1d7d8 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -39,9 +39,6 @@ xfs_xattr_get(const struct xattr_handler *handler, struct dentry *dentry,
 	struct xfs_inode *ip = XFS_I(d_inode(dentry));
 	int error, asize = size;
 
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
-
 	/* Convert Linux syscall to XFS internal ATTR flags */
 	if (!size) {
 		xflags |= ATTR_KERNOVAL;
@@ -84,9 +81,6 @@ xfs_xattr_set(const struct xattr_handler *handler, struct dentry *dentry,
 	struct xfs_inode	*ip = XFS_I(d_inode(dentry));
 	int			error;
 
-	if (strcmp(name, "") == 0)
-		return -EINVAL;
-
 	/* Convert Linux syscall to XFS internal ATTR flags */
 	if (flags & XATTR_CREATE)
 		xflags |= ATTR_CREATE;
@@ -135,47 +129,19 @@ const struct xattr_handler *xfs_xattr_handlers[] = {
 	NULL
 };
 
-static unsigned int xfs_xattr_prefix_len(int flags)
-{
-	if (flags & XFS_ATTR_SECURE)
-		return sizeof("security");
-	else if (flags & XFS_ATTR_ROOT)
-		return sizeof("trusted");
-	else
-		return sizeof("user");
-}
-
-static const char *xfs_xattr_prefix(int flags)
-{
-	if (flags & XFS_ATTR_SECURE)
-		return xfs_xattr_security_handler.prefix;
-	else if (flags & XFS_ATTR_ROOT)
-		return xfs_xattr_trusted_handler.prefix;
-	else
-		return xfs_xattr_user_handler.prefix;
-}
-
 static int
-xfs_xattr_put_listent(
+__xfs_xattr_put_listent(
 	struct xfs_attr_list_context *context,
-	int		flags,
-	unsigned char	*name,
-	int		namelen,
-	int		valuelen,
-	unsigned char	*value)
+	char *prefix,
+	int prefix_len,
+	unsigned char *name,
+	int namelen)
 {
-	unsigned int prefix_len = xfs_xattr_prefix_len(flags);
 	char *offset;
 	int arraytop;
 
-	ASSERT(context->count >= 0);
-
-	/*
-	 * Only show root namespace entries if we are actually allowed to
-	 * see them.
-	 */
-	if ((flags & XFS_ATTR_ROOT) && !capable(CAP_SYS_ADMIN))
-		return 0;
+	if (!context->alist)
+		goto compute_size;
 
 	arraytop = context->count + prefix_len + namelen + 1;
 	if (arraytop > context->firstu) {
@@ -183,17 +149,19 @@ xfs_xattr_put_listent(
 		return 1;
 	}
 	offset = (char *)context->alist + context->count;
-	strncpy(offset, xfs_xattr_prefix(flags), prefix_len);
+	strncpy(offset, prefix, prefix_len);
 	offset += prefix_len;
 	strncpy(offset, (char *)name, namelen);			/* real name */
 	offset += namelen;
 	*offset = '\0';
+
+compute_size:
 	context->count += prefix_len + namelen + 1;
 	return 0;
 }
 
 static int
-xfs_xattr_put_listent_sizes(
+xfs_xattr_put_listent(
 	struct xfs_attr_list_context *context,
 	int		flags,
 	unsigned char	*name,
@@ -201,24 +169,55 @@ xfs_xattr_put_listent_sizes(
 	int		valuelen,
 	unsigned char	*value)
 {
-	context->count += xfs_xattr_prefix_len(flags) + namelen + 1;
-	return 0;
-}
+	char *prefix;
+	int prefix_len;
 
-static int
-list_one_attr(const char *name, const size_t len, void *data,
-		size_t size, ssize_t *result)
-{
-	char *p = data + *result;
+	ASSERT(context->count >= 0);
 
-	*result += len;
-	if (!size)
-		return 0;
-	if (*result > size)
-		return -ERANGE;
+	if (flags & XFS_ATTR_ROOT) {
+#ifdef CONFIG_XFS_POSIX_ACL
+		if (namelen == SGI_ACL_FILE_SIZE &&
+		    strncmp(name, SGI_ACL_FILE,
+			    SGI_ACL_FILE_SIZE) == 0) {
+			int ret = __xfs_xattr_put_listent(
+					context, XATTR_SYSTEM_PREFIX,
+					XATTR_SYSTEM_PREFIX_LEN,
+					XATTR_POSIX_ACL_ACCESS,
+					strlen(XATTR_POSIX_ACL_ACCESS));
+			if (ret)
+				return ret;
+		} else if (namelen == SGI_ACL_DEFAULT_SIZE &&
+			 strncmp(name, SGI_ACL_DEFAULT,
+				 SGI_ACL_DEFAULT_SIZE) == 0) {
+			int ret = __xfs_xattr_put_listent(
+					context, XATTR_SYSTEM_PREFIX,
+					XATTR_SYSTEM_PREFIX_LEN,
+					XATTR_POSIX_ACL_DEFAULT,
+					strlen(XATTR_POSIX_ACL_DEFAULT));
+			if (ret)
+				return ret;
+		}
+#endif
 
-	strcpy(p, name);
-	return 0;
+		/*
+		 * Only show root namespace entries if we are actually allowed to
+		 * see them.
+		 */
+		if (!capable(CAP_SYS_ADMIN))
+			return 0;
+
+		prefix = XATTR_TRUSTED_PREFIX;
+		prefix_len = XATTR_TRUSTED_PREFIX_LEN;
+	} else if (flags & XFS_ATTR_SECURE) {
+		prefix = XATTR_SECURITY_PREFIX;
+		prefix_len = XATTR_SECURITY_PREFIX_LEN;
+	} else {
+		prefix = XATTR_USER_PREFIX;
+		prefix_len = XATTR_USER_PREFIX_LEN;
+	}
+
+	return __xfs_xattr_put_listent(context, prefix, prefix_len, name,
+				       namelen);
 }
 
 ssize_t
@@ -227,7 +226,6 @@ xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size)
 	struct xfs_attr_list_context context;
 	struct attrlist_cursor_kern cursor = { 0 };
 	struct inode		*inode = d_inode(dentry);
-	int			error;
 
 	/*
 	 * First read the regular on-disk attributes.
@@ -236,37 +234,14 @@ xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size)
 	context.dp = XFS_I(inode);
 	context.cursor = &cursor;
 	context.resynch = 1;
-	context.alist = data;
+	context.alist = size ? data : NULL;
 	context.bufsize = size;
 	context.firstu = context.bufsize;
-
-	if (size)
-		context.put_listent = xfs_xattr_put_listent;
-	else
-		context.put_listent = xfs_xattr_put_listent_sizes;
+	context.put_listent = xfs_xattr_put_listent;
 
 	xfs_attr_list_int(&context);
 	if (context.count < 0)
 		return -ERANGE;
 
-	/*
-	 * Then add the two synthetic ACL attributes.
-	 */
-	if (posix_acl_access_exists(inode)) {
-		error = list_one_attr(POSIX_ACL_XATTR_ACCESS,
-				strlen(POSIX_ACL_XATTR_ACCESS) + 1,
-				data, size, &context.count);
-		if (error)
-			return error;
-	}
-
-	if (posix_acl_default_exists(inode)) {
-		error = list_one_attr(POSIX_ACL_XATTR_DEFAULT,
-				strlen(POSIX_ACL_XATTR_DEFAULT) + 1,
-				data, size, &context.count);
-		if (error)
-			return error;
-	}
-
 	return context.count;
 }
author	André Fabian Silva Delgado <emulatorman@parabola.nu>	2016-03-25 03:53:42 -0300
committer	André Fabian Silva Delgado <emulatorman@parabola.nu>	2016-03-25 03:53:42 -0300
commit	03dd4cb26d967f9588437b0fc9cc0e8353322bb7 (patch)
tree	fa581f6dc1c0596391690d1f67eceef3af8246dc /fs
parent	d4e493caf788ef44982e131ff9c786546904d934 (diff)