diff options
Diffstat (limited to 'fs')
490 files changed, 17340 insertions, 41977 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 620d93489..8aa56bb6e 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -320,31 +320,21 @@ fail_option_alloc: struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, const char *dev_name, char *data) { - int retval = -EINVAL; struct p9_fid *fid; - int rc; + int rc = -ENOMEM; v9ses->uname = kstrdup(V9FS_DEFUSER, GFP_KERNEL); if (!v9ses->uname) - return ERR_PTR(-ENOMEM); + goto err_names; v9ses->aname = kstrdup(V9FS_DEFANAME, GFP_KERNEL); - if (!v9ses->aname) { - kfree(v9ses->uname); - return ERR_PTR(-ENOMEM); - } + if (!v9ses->aname) + goto err_names; init_rwsem(&v9ses->rename_sem); rc = bdi_setup_and_register(&v9ses->bdi, "9p"); - if (rc) { - kfree(v9ses->aname); - kfree(v9ses->uname); - return ERR_PTR(rc); - } - - spin_lock(&v9fs_sessionlist_lock); - list_add(&v9ses->slist, &v9fs_sessionlist); - spin_unlock(&v9fs_sessionlist_lock); + if (rc) + goto err_names; v9ses->uid = INVALID_UID; v9ses->dfltuid = V9FS_DEFUID; @@ -352,10 +342,9 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, v9ses->clnt = p9_client_create(dev_name, data); if (IS_ERR(v9ses->clnt)) { - retval = PTR_ERR(v9ses->clnt); - v9ses->clnt = NULL; + rc = PTR_ERR(v9ses->clnt); p9_debug(P9_DEBUG_ERROR, "problem initializing 9p client\n"); - goto error; + goto err_bdi; } v9ses->flags = V9FS_ACCESS_USER; @@ -368,10 +357,8 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, } rc = v9fs_parse_options(v9ses, data); - if (rc < 0) { - retval = rc; - goto error; - } + if (rc < 0) + goto err_clnt; v9ses->maxdata = v9ses->clnt->msize - P9_IOHDRSZ; @@ -405,10 +392,9 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, fid = p9_client_attach(v9ses->clnt, NULL, v9ses->uname, INVALID_UID, v9ses->aname); if (IS_ERR(fid)) { - retval = PTR_ERR(fid); - fid = NULL; + rc = PTR_ERR(fid); p9_debug(P9_DEBUG_ERROR, "cannot attach\n"); - goto error; + goto err_clnt; } if ((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_SINGLE) @@ -420,12 +406,20 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, /* register the session for caching */ v9fs_cache_session_get_cookie(v9ses); #endif + spin_lock(&v9fs_sessionlist_lock); + list_add(&v9ses->slist, &v9fs_sessionlist); + spin_unlock(&v9fs_sessionlist_lock); return fid; -error: +err_clnt: + p9_client_destroy(v9ses->clnt); +err_bdi: bdi_destroy(&v9ses->bdi); - return ERR_PTR(retval); +err_names: + kfree(v9ses->uname); + kfree(v9ses->aname); + return ERR_PTR(rc); } /** diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index fb9ffcb43..0923f2cf3 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -149,8 +149,6 @@ extern int v9fs_vfs_unlink(struct inode *i, struct dentry *d); extern int v9fs_vfs_rmdir(struct inode *i, struct dentry *d); extern int v9fs_vfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry); -extern void v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, - void *p); extern struct inode *v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, struct super_block *sb, int new); diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 53f1e8a21..b1dc51888 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -1223,100 +1223,43 @@ ino_t v9fs_qid2ino(struct p9_qid *qid) } /** - * v9fs_readlink - read a symlink's location (internal version) + * v9fs_vfs_follow_link - follow a symlink path * @dentry: dentry for symlink - * @buffer: buffer to load symlink location into - * @buflen: length of buffer - * + * @cookie: place to pass the data to put_link() */ -static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen) +static const char *v9fs_vfs_follow_link(struct dentry *dentry, void **cookie) { - int retval; - - struct v9fs_session_info *v9ses; - struct p9_fid *fid; + struct v9fs_session_info *v9ses = v9fs_dentry2v9ses(dentry); + struct p9_fid *fid = v9fs_fid_lookup(dentry); struct p9_wstat *st; + char *res; + + p9_debug(P9_DEBUG_VFS, "%pd\n", dentry); - p9_debug(P9_DEBUG_VFS, " %pd\n", dentry); - retval = -EPERM; - v9ses = v9fs_dentry2v9ses(dentry); - fid = v9fs_fid_lookup(dentry); if (IS_ERR(fid)) - return PTR_ERR(fid); + return ERR_CAST(fid); if (!v9fs_proto_dotu(v9ses)) - return -EBADF; + return ERR_PTR(-EBADF); st = p9_client_stat(fid); if (IS_ERR(st)) - return PTR_ERR(st); + return ERR_CAST(st); if (!(st->mode & P9_DMSYMLINK)) { - retval = -EINVAL; - goto done; + p9stat_free(st); + kfree(st); + return ERR_PTR(-EINVAL); } + res = st->extension; + st->extension = NULL; + if (strlen(res) >= PATH_MAX) + res[PATH_MAX - 1] = '\0'; - /* copy extension buffer into buffer */ - retval = min(strlen(st->extension)+1, (size_t)buflen); - memcpy(buffer, st->extension, retval); - - p9_debug(P9_DEBUG_VFS, "%pd -> %s (%.*s)\n", - dentry, st->extension, buflen, buffer); - -done: p9stat_free(st); kfree(st); - return retval; -} - -/** - * v9fs_vfs_follow_link - follow a symlink path - * @dentry: dentry for symlink - * @nd: nameidata - * - */ - -static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - int len = 0; - char *link = __getname(); - - p9_debug(P9_DEBUG_VFS, "%pd\n", dentry); - - if (!link) - link = ERR_PTR(-ENOMEM); - else { - len = v9fs_readlink(dentry, link, PATH_MAX); - - if (len < 0) { - __putname(link); - link = ERR_PTR(len); - } else - link[min(len, PATH_MAX-1)] = 0; - } - nd_set_link(nd, link); - - return NULL; -} - -/** - * v9fs_vfs_put_link - release a symlink path - * @dentry: dentry for symlink - * @nd: nameidata - * @p: unused - * - */ - -void -v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p) -{ - char *s = nd_get_link(nd); - - p9_debug(P9_DEBUG_VFS, " %pd %s\n", - dentry, IS_ERR(s) ? "<error>" : s); - if (!IS_ERR(s)) - __putname(s); + return *cookie = res; } /** @@ -1369,6 +1312,8 @@ v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) return v9fs_vfs_mkspecial(dir, dentry, P9_DMSYMLINK, symname); } +#define U32_MAX_DIGITS 10 + /** * v9fs_vfs_link - create a hardlink * @old_dentry: dentry for file to link to @@ -1382,7 +1327,7 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { int retval; - char *name; + char name[1 + U32_MAX_DIGITS + 2]; /* sign + number + \n + \0 */ struct p9_fid *oldfid; p9_debug(P9_DEBUG_VFS, " %lu,%pd,%pd\n", @@ -1392,20 +1337,12 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, if (IS_ERR(oldfid)) return PTR_ERR(oldfid); - name = __getname(); - if (unlikely(!name)) { - retval = -ENOMEM; - goto clunk_fid; - } - sprintf(name, "%d\n", oldfid->fid); retval = v9fs_vfs_mkspecial(dir, dentry, P9_DMLINK, name); - __putname(name); if (!retval) { v9fs_refresh_inode(oldfid, d_inode(old_dentry)); v9fs_invalidate_inode_attr(dir); } -clunk_fid: p9_client_clunk(oldfid); return retval; } @@ -1424,7 +1361,7 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rde { struct v9fs_session_info *v9ses = v9fs_inode2v9ses(dir); int retval; - char *name; + char name[2 + U32_MAX_DIGITS + 1 + U32_MAX_DIGITS + 1]; u32 perm; p9_debug(P9_DEBUG_VFS, " %lu,%pd mode: %hx MAJOR: %u MINOR: %u\n", @@ -1434,26 +1371,16 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rde if (!new_valid_dev(rdev)) return -EINVAL; - name = __getname(); - if (!name) - return -ENOMEM; /* build extension */ if (S_ISBLK(mode)) sprintf(name, "b %u %u", MAJOR(rdev), MINOR(rdev)); else if (S_ISCHR(mode)) sprintf(name, "c %u %u", MAJOR(rdev), MINOR(rdev)); - else if (S_ISFIFO(mode)) - *name = 0; - else if (S_ISSOCK(mode)) + else *name = 0; - else { - __putname(name); - return -EINVAL; - } perm = unixmode2p9mode(v9ses, mode); retval = v9fs_vfs_mkspecial(dir, dentry, perm, name); - __putname(name); return retval; } @@ -1529,7 +1456,7 @@ static const struct inode_operations v9fs_file_inode_operations = { static const struct inode_operations v9fs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = v9fs_vfs_follow_link, - .put_link = v9fs_vfs_put_link, + .put_link = kfree_put_link, .getattr = v9fs_vfs_getattr, .setattr = v9fs_vfs_setattr, }; diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 4d3ecfb55..e8aa57dc8 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -904,41 +904,24 @@ error: /** * v9fs_vfs_follow_link_dotl - follow a symlink path * @dentry: dentry for symlink - * @nd: nameidata - * + * @cookie: place to pass the data to put_link() */ -static void * -v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd) +static const char * +v9fs_vfs_follow_link_dotl(struct dentry *dentry, void **cookie) { - int retval; - struct p9_fid *fid; - char *link = __getname(); + struct p9_fid *fid = v9fs_fid_lookup(dentry); char *target; + int retval; p9_debug(P9_DEBUG_VFS, "%pd\n", dentry); - if (!link) { - link = ERR_PTR(-ENOMEM); - goto ndset; - } - fid = v9fs_fid_lookup(dentry); - if (IS_ERR(fid)) { - __putname(link); - link = ERR_CAST(fid); - goto ndset; - } + if (IS_ERR(fid)) + return ERR_CAST(fid); retval = p9_client_readlink(fid, &target); - if (!retval) { - strcpy(link, target); - kfree(target); - goto ndset; - } - __putname(link); - link = ERR_PTR(retval); -ndset: - nd_set_link(nd, link); - return NULL; + if (retval) + return ERR_PTR(retval); + return *cookie = target; } int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) @@ -1005,7 +988,7 @@ const struct inode_operations v9fs_file_inode_operations_dotl = { const struct inode_operations v9fs_symlink_inode_operations_dotl = { .readlink = generic_readlink, .follow_link = v9fs_vfs_follow_link_dotl, - .put_link = v9fs_vfs_put_link, + .put_link = kfree_put_link, .getattr = v9fs_vfs_getattr_dotl, .setattr = v9fs_vfs_setattr_dotl, .setxattr = generic_setxattr, diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index e99a338a4..bf495cede 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -130,11 +130,7 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, fid = v9fs_session_init(v9ses, dev_name, data); if (IS_ERR(fid)) { retval = PTR_ERR(fid); - /* - * we need to call session_close to tear down some - * of the data structure setup by session_init - */ - goto close_session; + goto free_session; } sb = sget(fs_type, NULL, v9fs_set_super, flags, v9ses); @@ -195,8 +191,8 @@ static struct dentry *v9fs_mount(struct file_system_type *fs_type, int flags, clunk_fid: p9_client_clunk(fid); -close_session: v9fs_session_close(v9ses); +free_session: kfree(v9ses); return ERR_PTR(retval); diff --git a/fs/Kconfig b/fs/Kconfig index b1083f6b6..011f43365 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -218,7 +218,6 @@ source "fs/pstore/Kconfig" source "fs/sysv/Kconfig" source "fs/ufs/Kconfig" source "fs/exofs/Kconfig" -source "fs/aufs/Kconfig" endif # MISC_FILESYSTEMS diff --git a/fs/Makefile b/fs/Makefile index 8c2df123d..cb20e4bf2 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -115,7 +115,6 @@ obj-$(CONFIG_AFS_FS) += afs/ obj-$(CONFIG_NILFS2_FS) += nilfs2/ obj-$(CONFIG_BEFS_FS) += befs/ obj-$(CONFIG_HOSTFS) += hostfs/ -obj-$(CONFIG_HPPFS) += hppfs/ obj-$(CONFIG_CACHEFILES) += cachefiles/ obj-$(CONFIG_DEBUG_FS) += debugfs/ obj-$(CONFIG_TRACING) += tracefs/ @@ -127,4 +126,3 @@ obj-y += exofs/ # Multiple modules obj-$(CONFIG_CEPH_FS) += ceph/ obj-$(CONFIG_PSTORE) += pstore/ obj-$(CONFIG_EFIVAR_FS) += efivarfs/ -obj-$(CONFIG_AUFS_FS) += aufs/ diff --git a/fs/adfs/super.c b/fs/adfs/super.c index a19c31d3f..4d4a0df83 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -242,7 +242,7 @@ static struct kmem_cache *adfs_inode_cachep; static struct inode *adfs_alloc_inode(struct super_block *sb) { struct adfs_inode_info *ei; - ei = (struct adfs_inode_info *)kmem_cache_alloc(adfs_inode_cachep, GFP_KERNEL); + ei = kmem_cache_alloc(adfs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; diff --git a/fs/affs/affs.h b/fs/affs/affs.h index cffe8370f..c69a87eaf 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -64,7 +64,7 @@ struct affs_inode_info { /* short cut to get to the affs specific inode data */ static inline struct affs_inode_info *AFFS_I(struct inode *inode) { - return list_entry(inode, struct affs_inode_info, vfs_inode); + return container_of(inode, struct affs_inode_info, vfs_inode); } /* diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index a8f463c02..5fa92bc79 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -140,7 +140,7 @@ affs_remove_link(struct dentry *dentry) { struct inode *dir, *inode = d_inode(dentry); struct super_block *sb = inode->i_sb; - struct buffer_head *bh = NULL, *link_bh = NULL; + struct buffer_head *bh, *link_bh = NULL; u32 link_ino, ino; int retval; diff --git a/fs/affs/inode.c b/fs/affs/inode.c index a022f4acc..173495005 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -346,7 +346,7 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3 { struct super_block *sb = dir->i_sb; struct buffer_head *inode_bh = NULL; - struct buffer_head *bh = NULL; + struct buffer_head *bh; u32 block = 0; int retval; diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c index f39b71c39..ea5b69a18 100644 --- a/fs/affs/symlink.c +++ b/fs/affs/symlink.c @@ -16,14 +16,12 @@ static int affs_symlink_readpage(struct file *file, struct page *page) struct inode *inode = page->mapping->host; char *link = kmap(page); struct slink_front *lf; - int err; int i, j; char c; char lc; pr_debug("follow_link(ino=%lu)\n", inode->i_ino); - err = -EIO; bh = affs_bread(inode->i_sb, inode->i_ino); if (!bh) goto fail; @@ -66,7 +64,7 @@ fail: SetPageError(page); kunmap(page); unlock_page(page); - return err; + return -EIO; } const struct address_space_operations affs_symlink_aops = { diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 3a57a1b0f..b50642870 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -85,7 +85,7 @@ int afs_open_socket(void) return -ENOMEM; } - ret = sock_create_kern(AF_RXRPC, SOCK_DGRAM, PF_INET, &socket); + ret = sock_create_kern(&init_net, AF_RXRPC, SOCK_DGRAM, PF_INET, &socket); if (ret < 0) { destroy_workqueue(afs_async_calls); _leave(" = %d [socket]", ret); diff --git a/fs/aufs/Kconfig b/fs/aufs/Kconfig deleted file mode 100644 index 63560ceda..000000000 --- a/fs/aufs/Kconfig +++ /dev/null @@ -1,185 +0,0 @@ -config AUFS_FS - tristate "Aufs (Advanced multi layered unification filesystem) support" - help - Aufs is a stackable unification filesystem such as Unionfs, - which unifies several directories and provides a merged single - directory. - In the early days, aufs was entirely re-designed and - re-implemented Unionfs Version 1.x series. Introducing many - original ideas, approaches and improvements, it becomes totally - different from Unionfs while keeping the basic features. - -if AUFS_FS -choice - prompt "Maximum number of branches" - default AUFS_BRANCH_MAX_127 - help - Specifies the maximum number of branches (or member directories) - in a single aufs. The larger value consumes more system - resources and has a minor impact to performance. -config AUFS_BRANCH_MAX_127 - bool "127" - help - Specifies the maximum number of branches (or member directories) - in a single aufs. The larger value consumes more system - resources and has a minor impact to performance. -config AUFS_BRANCH_MAX_511 - bool "511" - help - Specifies the maximum number of branches (or member directories) - in a single aufs. The larger value consumes more system - resources and has a minor impact to performance. -config AUFS_BRANCH_MAX_1023 - bool "1023" - help - Specifies the maximum number of branches (or member directories) - in a single aufs. The larger value consumes more system - resources and has a minor impact to performance. -config AUFS_BRANCH_MAX_32767 - bool "32767" - help - Specifies the maximum number of branches (or member directories) - in a single aufs. The larger value consumes more system - resources and has a minor impact to performance. -endchoice - -config AUFS_SBILIST - bool - depends on AUFS_MAGIC_SYSRQ || PROC_FS - default y - help - Automatic configuration for internal use. - When aufs supports Magic SysRq or /proc, enabled automatically. - -config AUFS_HNOTIFY - bool "Detect direct branch access (bypassing aufs)" - help - If you want to modify files on branches directly, eg. bypassing aufs, - and want aufs to detect the changes of them fully, then enable this - option and use 'udba=notify' mount option. - Currently there is only one available configuration, "fsnotify". - It will have a negative impact to the performance. - See detail in aufs.5. - -choice - prompt "method" if AUFS_HNOTIFY - default AUFS_HFSNOTIFY -config AUFS_HFSNOTIFY - bool "fsnotify" - select FSNOTIFY -endchoice - -config AUFS_EXPORT - bool "NFS-exportable aufs" - depends on EXPORTFS - help - If you want to export your mounted aufs via NFS, then enable this - option. There are several requirements for this configuration. - See detail in aufs.5. - -config AUFS_INO_T_64 - bool - depends on AUFS_EXPORT - depends on 64BIT && !(ALPHA || S390) - default y - help - Automatic configuration for internal use. - /* typedef unsigned long/int __kernel_ino_t */ - /* alpha and s390x are int */ - -config AUFS_XATTR - bool "support for XATTR/EA (including Security Labels)" - help - If your branch fs supports XATTR/EA and you want to make them - available in aufs too, then enable this opsion and specify the - branch attributes for EA. - See detail in aufs.5. - -config AUFS_FHSM - bool "File-based Hierarchical Storage Management" - help - Hierarchical Storage Management (or HSM) is a well-known feature - in the storage world. Aufs provides this feature as file-based. - with multiple branches. - These multiple branches are prioritized, ie. the topmost one - should be the fastest drive and be used heavily. - -config AUFS_RDU - bool "Readdir in userspace" - help - Aufs has two methods to provide a merged view for a directory, - by a user-space library and by kernel-space natively. The latter - is always enabled but sometimes large and slow. - If you enable this option, install the library in aufs2-util - package, and set some environment variables for your readdir(3), - then the work will be handled in user-space which generally - shows better performance in most cases. - See detail in aufs.5. - -config AUFS_SHWH - bool "Show whiteouts" - help - If you want to make the whiteouts in aufs visible, then enable - this option and specify 'shwh' mount option. Although it may - sounds like philosophy or something, but in technically it - simply shows the name of whiteout with keeping its behaviour. - -config AUFS_BR_RAMFS - bool "Ramfs (initramfs/rootfs) as an aufs branch" - help - If you want to use ramfs as an aufs branch fs, then enable this - option. Generally tmpfs is recommended. - Aufs prohibited them to be a branch fs by default, because - initramfs becomes unusable after switch_root or something - generally. If you sets initramfs as an aufs branch and boot your - system by switch_root, you will meet a problem easily since the - files in initramfs may be inaccessible. - Unless you are going to use ramfs as an aufs branch fs without - switch_root or something, leave it N. - -config AUFS_BR_FUSE - bool "Fuse fs as an aufs branch" - depends on FUSE_FS - select AUFS_POLL - help - If you want to use fuse-based userspace filesystem as an aufs - branch fs, then enable this option. - It implements the internal poll(2) operation which is - implemented by fuse only (curretnly). - -config AUFS_POLL - bool - help - Automatic configuration for internal use. - -config AUFS_BR_HFSPLUS - bool "Hfsplus as an aufs branch" - depends on HFSPLUS_FS - default y - help - If you want to use hfsplus fs as an aufs branch fs, then enable - this option. This option introduces a small overhead at - copying-up a file on hfsplus. - -config AUFS_BDEV_LOOP - bool - depends on BLK_DEV_LOOP - default y - help - Automatic configuration for internal use. - Convert =[ym] into =y. - -config AUFS_DEBUG - bool "Debug aufs" - help - Enable this to compile aufs internal debug code. - It will have a negative impact to the performance. - -config AUFS_MAGIC_SYSRQ - bool - depends on AUFS_DEBUG && MAGIC_SYSRQ - default y - help - Automatic configuration for internal use. - When aufs supports Magic SysRq, enabled automatically. -endif diff --git a/fs/aufs/Makefile b/fs/aufs/Makefile deleted file mode 100644 index c7a501e37..000000000 --- a/fs/aufs/Makefile +++ /dev/null @@ -1,44 +0,0 @@ - -include ${src}/magic.mk -ifeq (${CONFIG_AUFS_FS},m) -include ${src}/conf.mk -endif --include ${src}/priv_def.mk - -# cf. include/linux/kernel.h -# enable pr_debug -ccflags-y += -DDEBUG -# sparse requires the full pathname -ifdef M -ccflags-y += -include ${M}/../../include/uapi/linux/aufs_type.h -else -ccflags-y += -include ${srctree}/include/uapi/linux/aufs_type.h -endif - -obj-$(CONFIG_AUFS_FS) += aufs.o -aufs-y := module.o sbinfo.o super.o branch.o xino.o sysaufs.o opts.o \ - wkq.o vfsub.o dcsub.o \ - cpup.o whout.o wbr_policy.o \ - dinfo.o dentry.o \ - dynop.o \ - finfo.o file.o f_op.o \ - dir.o vdir.o \ - iinfo.o inode.o i_op.o i_op_add.o i_op_del.o i_op_ren.o \ - mvdown.o ioctl.o - -# all are boolean -aufs-$(CONFIG_PROC_FS) += procfs.o plink.o -aufs-$(CONFIG_SYSFS) += sysfs.o -aufs-$(CONFIG_DEBUG_FS) += dbgaufs.o -aufs-$(CONFIG_AUFS_BDEV_LOOP) += loop.o -aufs-$(CONFIG_AUFS_HNOTIFY) += hnotify.o -aufs-$(CONFIG_AUFS_HFSNOTIFY) += hfsnotify.o -aufs-$(CONFIG_AUFS_EXPORT) += export.o -aufs-$(CONFIG_AUFS_XATTR) += xattr.o -aufs-$(CONFIG_FS_POSIX_ACL) += posix_acl.o -aufs-$(CONFIG_AUFS_FHSM) += fhsm.o -aufs-$(CONFIG_AUFS_POLL) += poll.o -aufs-$(CONFIG_AUFS_RDU) += rdu.o -aufs-$(CONFIG_AUFS_BR_HFSPLUS) += hfsplus.o -aufs-$(CONFIG_AUFS_DEBUG) += debug.o -aufs-$(CONFIG_AUFS_MAGIC_SYSRQ) += sysrq.o diff --git a/fs/aufs/aufs.h b/fs/aufs/aufs.h deleted file mode 100644 index d82e9743f..000000000 --- a/fs/aufs/aufs.h +++ /dev/null @@ -1,59 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * all header files - */ - -#ifndef __AUFS_H__ -#define __AUFS_H__ - -#ifdef __KERNEL__ - -#define AuStub(type, name, body, ...) \ - static inline type name(__VA_ARGS__) { body; } - -#define AuStubVoid(name, ...) \ - AuStub(void, name, , __VA_ARGS__) -#define AuStubInt0(name, ...) \ - AuStub(int, name, return 0, __VA_ARGS__) - -#include "debug.h" - -#include "branch.h" -#include "cpup.h" -#include "dcsub.h" -#include "dbgaufs.h" -#include "dentry.h" -#include "dir.h" -#include "dynop.h" -#include "file.h" -#include "fstype.h" -#include "inode.h" -#include "loop.h" -#include "module.h" -#include "opts.h" -#include "rwsem.h" -#include "spl.h" -#include "super.h" -#include "sysaufs.h" -#include "vfsub.h" -#include "whout.h" -#include "wkq.h" - -#endif /* __KERNEL__ */ -#endif /* __AUFS_H__ */ diff --git a/fs/aufs/branch.c b/fs/aufs/branch.c deleted file mode 100644 index 2ec496438..000000000 --- a/fs/aufs/branch.c +++ /dev/null @@ -1,1414 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * branch management - */ - -#include <linux/compat.h> -#include <linux/statfs.h> -#include "aufs.h" - -/* - * free a single branch - */ -static void au_br_do_free(struct au_branch *br) -{ - int i; - struct au_wbr *wbr; - struct au_dykey **key; - - au_hnotify_fin_br(br); - - if (br->br_xino.xi_file) - fput(br->br_xino.xi_file); - mutex_destroy(&br->br_xino.xi_nondir_mtx); - - AuDebugOn(atomic_read(&br->br_count)); - - wbr = br->br_wbr; - if (wbr) { - for (i = 0; i < AuBrWh_Last; i++) - dput(wbr->wbr_wh[i]); - AuDebugOn(atomic_read(&wbr->wbr_wh_running)); - AuRwDestroy(&wbr->wbr_wh_rwsem); - } - - if (br->br_fhsm) { - au_br_fhsm_fin(br->br_fhsm); - kfree(br->br_fhsm); - } - - key = br->br_dykey; - for (i = 0; i < AuBrDynOp; i++, key++) - if (*key) - au_dy_put(*key); - else - break; - - /* recursive lock, s_umount of branch's */ - lockdep_off(); - path_put(&br->br_path); - lockdep_on(); - kfree(wbr); - kfree(br); -} - -/* - * frees all branches - */ -void au_br_free(struct au_sbinfo *sbinfo) -{ - aufs_bindex_t bmax; - struct au_branch **br; - - AuRwMustWriteLock(&sbinfo->si_rwsem); - - bmax = sbinfo->si_bend + 1; - br = sbinfo->si_branch; - while (bmax--) - au_br_do_free(*br++); -} - -/* - * find the index of a branch which is specified by @br_id. - */ -int au_br_index(struct super_block *sb, aufs_bindex_t br_id) -{ - aufs_bindex_t bindex, bend; - - bend = au_sbend(sb); - for (bindex = 0; bindex <= bend; bindex++) - if (au_sbr_id(sb, bindex) == br_id) - return bindex; - return -1; -} - -/* ---------------------------------------------------------------------- */ - -/* - * add a branch - */ - -static int test_overlap(struct super_block *sb, struct dentry *h_adding, - struct dentry *h_root) -{ - if (unlikely(h_adding == h_root - || au_test_loopback_overlap(sb, h_adding))) - return 1; - if (h_adding->d_sb != h_root->d_sb) - return 0; - return au_test_subdir(h_adding, h_root) - || au_test_subdir(h_root, h_adding); -} - -/* - * returns a newly allocated branch. @new_nbranch is a number of branches - * after adding a branch. - */ -static struct au_branch *au_br_alloc(struct super_block *sb, int new_nbranch, - int perm) -{ - struct au_branch *add_branch; - struct dentry *root; - struct inode *inode; - int err; - - err = -ENOMEM; - root = sb->s_root; - add_branch = kmalloc(sizeof(*add_branch), GFP_NOFS); - if (unlikely(!add_branch)) - goto out; - - err = au_hnotify_init_br(add_branch, perm); - if (unlikely(err)) - goto out_br; - - add_branch->br_wbr = NULL; - if (au_br_writable(perm)) { - /* may be freed separately at changing the branch permission */ - add_branch->br_wbr = kmalloc(sizeof(*add_branch->br_wbr), - GFP_NOFS); - if (unlikely(!add_branch->br_wbr)) - goto out_hnotify; - } - - add_branch->br_fhsm = NULL; - if (au_br_fhsm(perm)) { - err = au_fhsm_br_alloc(add_branch); - if (unlikely(err)) - goto out_wbr; - } - - err = au_sbr_realloc(au_sbi(sb), new_nbranch); - if (!err) - err = au_di_realloc(au_di(root), new_nbranch); - if (!err) { - inode = d_inode(root); - err = au_ii_realloc(au_ii(inode), new_nbranch); - } - if (!err) - return add_branch; /* success */ - -out_wbr: - kfree(add_branch->br_wbr); -out_hnotify: - au_hnotify_fin_br(add_branch); -out_br: - kfree(add_branch); -out: - return ERR_PTR(err); -} - -/* - * test if the branch permission is legal or not. - */ -static int test_br(struct inode *inode, int brperm, char *path) -{ - int err; - - err = (au_br_writable(brperm) && IS_RDONLY(inode)); - if (!err) - goto out; - - err = -EINVAL; - pr_err("write permission for readonly mount or inode, %s\n", path); - -out: - return err; -} - -/* - * returns: - * 0: success, the caller will add it - * plus: success, it is already unified, the caller should ignore it - * minus: error - */ -static int test_add(struct super_block *sb, struct au_opt_add *add, int remount) -{ - int err; - aufs_bindex_t bend, bindex; - struct dentry *root, *h_dentry; - struct inode *inode, *h_inode; - - root = sb->s_root; - bend = au_sbend(sb); - if (unlikely(bend >= 0 - && au_find_dbindex(root, add->path.dentry) >= 0)) { - err = 1; - if (!remount) { - err = -EINVAL; - pr_err("%s duplicated\n", add->pathname); - } - goto out; - } - - err = -ENOSPC; /* -E2BIG; */ - if (unlikely(AUFS_BRANCH_MAX <= add->bindex - || AUFS_BRANCH_MAX - 1 <= bend)) { - pr_err("number of branches exceeded %s\n", add->pathname); - goto out; - } - - err = -EDOM; - if (unlikely(add->bindex < 0 || bend + 1 < add->bindex)) { - pr_err("bad index %d\n", add->bindex); - goto out; - } - - inode = d_inode(add->path.dentry); - err = -ENOENT; - if (unlikely(!inode->i_nlink)) { - pr_err("no existence %s\n", add->pathname); - goto out; - } - - err = -EINVAL; - if (unlikely(inode->i_sb == sb)) { - pr_err("%s must be outside\n", add->pathname); - goto out; - } - - if (unlikely(au_test_fs_unsuppoted(inode->i_sb))) { - pr_err("unsupported filesystem, %s (%s)\n", - add->pathname, au_sbtype(inode->i_sb)); - goto out; - } - - if (unlikely(inode->i_sb->s_stack_depth)) { - pr_err("already stacked, %s (%s)\n", - add->pathname, au_sbtype(inode->i_sb)); - goto out; - } - - err = test_br(d_inode(add->path.dentry), add->perm, add->pathname); - if (unlikely(err)) - goto out; - - if (bend < 0) - return 0; /* success */ - - err = -EINVAL; - for (bindex = 0; bindex <= bend; bindex++) - if (unlikely(test_overlap(sb, add->path.dentry, - au_h_dptr(root, bindex)))) { - pr_err("%s is overlapped\n", add->pathname); - goto out; - } - - err = 0; - if (au_opt_test(au_mntflags(sb), WARN_PERM)) { - h_dentry = au_h_dptr(root, 0); - h_inode = d_inode(h_dentry); - if ((h_inode->i_mode & S_IALLUGO) != (inode->i_mode & S_IALLUGO) - || !uid_eq(h_inode->i_uid, inode->i_uid) - || !gid_eq(h_inode->i_gid, inode->i_gid)) - pr_warn("uid/gid/perm %s %u/%u/0%o, %u/%u/0%o\n", - add->pathname, - i_uid_read(inode), i_gid_read(inode), - (inode->i_mode & S_IALLUGO), - i_uid_read(h_inode), i_gid_read(h_inode), - (h_inode->i_mode & S_IALLUGO)); - } - -out: - return err; -} - -/* - * initialize or clean the whiteouts for an adding branch - */ -static int au_br_init_wh(struct super_block *sb, struct au_branch *br, - int new_perm) -{ - int err, old_perm; - aufs_bindex_t bindex; - struct mutex *h_mtx; - struct au_wbr *wbr; - struct au_hinode *hdir; - struct dentry *h_dentry; - - err = vfsub_mnt_want_write(au_br_mnt(br)); - if (unlikely(err)) - goto out; - - wbr = br->br_wbr; - old_perm = br->br_perm; - br->br_perm = new_perm; - hdir = NULL; - h_mtx = NULL; - bindex = au_br_index(sb, br->br_id); - if (0 <= bindex) { - hdir = au_hi(d_inode(sb->s_root), bindex); - au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT); - } else { - h_dentry = au_br_dentry(br); - h_mtx = &d_inode(h_dentry)->i_mutex; - mutex_lock_nested(h_mtx, AuLsc_I_PARENT); - } - if (!wbr) - err = au_wh_init(br, sb); - else { - wbr_wh_write_lock(wbr); - err = au_wh_init(br, sb); - wbr_wh_write_unlock(wbr); - } - if (hdir) - au_hn_imtx_unlock(hdir); - else - mutex_unlock(h_mtx); - vfsub_mnt_drop_write(au_br_mnt(br)); - br->br_perm = old_perm; - - if (!err && wbr && !au_br_writable(new_perm)) { - kfree(wbr); - br->br_wbr = NULL; - } - -out: - return err; -} - -static int au_wbr_init(struct au_branch *br, struct super_block *sb, - int perm) -{ - int err; - struct kstatfs kst; - struct au_wbr *wbr; - - wbr = br->br_wbr; - au_rw_init(&wbr->wbr_wh_rwsem); - memset(wbr->wbr_wh, 0, sizeof(wbr->wbr_wh)); - atomic_set(&wbr->wbr_wh_running, 0); - wbr->wbr_bytes = 0; - - /* - * a limit for rmdir/rename a dir - * cf. AUFS_MAX_NAMELEN in include/uapi/linux/aufs_type.h - */ - err = vfs_statfs(&br->br_path, &kst); - if (unlikely(err)) - goto out; - err = -EINVAL; - if (kst.f_namelen >= NAME_MAX) - err = au_br_init_wh(sb, br, perm); - else - pr_err("%pd(%s), unsupported namelen %ld\n", - au_br_dentry(br), - au_sbtype(au_br_dentry(br)->d_sb), kst.f_namelen); - -out: - return err; -} - -/* initialize a new branch */ -static int au_br_init(struct au_branch *br, struct super_block *sb, - struct au_opt_add *add) -{ - int err; - struct inode *h_inode; - - err = 0; - memset(&br->br_xino, 0, sizeof(br->br_xino)); - mutex_init(&br->br_xino.xi_nondir_mtx); - br->br_perm = add->perm; - br->br_path = add->path; /* set first, path_get() later */ - spin_lock_init(&br->br_dykey_lock); - memset(br->br_dykey, 0, sizeof(br->br_dykey)); - atomic_set(&br->br_count, 0); - atomic_set(&br->br_xino_running, 0); - br->br_id = au_new_br_id(sb); - AuDebugOn(br->br_id < 0); - - if (au_br_writable(add->perm)) { - err = au_wbr_init(br, sb, add->perm); - if (unlikely(err)) - goto out_err; - } - - if (au_opt_test(au_mntflags(sb), XINO)) { - h_inode = d_inode(add->path.dentry); - err = au_xino_br(sb, br, h_inode->i_ino, - au_sbr(sb, 0)->br_xino.xi_file, /*do_test*/1); - if (unlikely(err)) { - AuDebugOn(br->br_xino.xi_file); - goto out_err; - } - } - - sysaufs_br_init(br); - path_get(&br->br_path); - goto out; /* success */ - -out_err: - memset(&br->br_path, 0, sizeof(br->br_path)); -out: - return err; -} - -static void au_br_do_add_brp(struct au_sbinfo *sbinfo, aufs_bindex_t bindex, - struct au_branch *br, aufs_bindex_t bend, - aufs_bindex_t amount) -{ - struct au_branch **brp; - - AuRwMustWriteLock(&sbinfo->si_rwsem); - - brp = sbinfo->si_branch + bindex; - memmove(brp + 1, brp, sizeof(*brp) * amount); - *brp = br; - sbinfo->si_bend++; - if (unlikely(bend < 0)) - sbinfo->si_bend = 0; -} - -static void au_br_do_add_hdp(struct au_dinfo *dinfo, aufs_bindex_t bindex, - aufs_bindex_t bend, aufs_bindex_t amount) -{ - struct au_hdentry *hdp; - - AuRwMustWriteLock(&dinfo->di_rwsem); - - hdp = dinfo->di_hdentry + bindex; - memmove(hdp + 1, hdp, sizeof(*hdp) * amount); - au_h_dentry_init(hdp); - dinfo->di_bend++; - if (unlikely(bend < 0)) - dinfo->di_bstart = 0; -} - -static void au_br_do_add_hip(struct au_iinfo *iinfo, aufs_bindex_t bindex, - aufs_bindex_t bend, aufs_bindex_t amount) -{ - struct au_hinode *hip; - - AuRwMustWriteLock(&iinfo->ii_rwsem); - - hip = iinfo->ii_hinode + bindex; - memmove(hip + 1, hip, sizeof(*hip) * amount); - hip->hi_inode = NULL; - au_hn_init(hip); - iinfo->ii_bend++; - if (unlikely(bend < 0)) - iinfo->ii_bstart = 0; -} - -static void au_br_do_add(struct super_block *sb, struct au_branch *br, - aufs_bindex_t bindex) -{ - struct dentry *root, *h_dentry; - struct inode *root_inode, *h_inode; - aufs_bindex_t bend, amount; - - root = sb->s_root; - root_inode = d_inode(root); - bend = au_sbend(sb); - amount = bend + 1 - bindex; - h_dentry = au_br_dentry(br); - au_sbilist_lock(); - au_br_do_add_brp(au_sbi(sb), bindex, br, bend, amount); - au_br_do_add_hdp(au_di(root), bindex, bend, amount); - au_br_do_add_hip(au_ii(root_inode), bindex, bend, amount); - au_set_h_dptr(root, bindex, dget(h_dentry)); - h_inode = d_inode(h_dentry); - au_set_h_iptr(root_inode, bindex, au_igrab(h_inode), /*flags*/0); - au_sbilist_unlock(); -} - -int au_br_add(struct super_block *sb, struct au_opt_add *add, int remount) -{ - int err; - aufs_bindex_t bend, add_bindex; - struct dentry *root, *h_dentry; - struct inode *root_inode; - struct au_branch *add_branch; - - root = sb->s_root; - root_inode = d_inode(root); - IMustLock(root_inode); - err = test_add(sb, add, remount); - if (unlikely(err < 0)) - goto out; - if (err) { - err = 0; - goto out; /* success */ - } - - bend = au_sbend(sb); - add_branch = au_br_alloc(sb, bend + 2, add->perm); - err = PTR_ERR(add_branch); - if (IS_ERR(add_branch)) - goto out; - - err = au_br_init(add_branch, sb, add); - if (unlikely(err)) { - au_br_do_free(add_branch); - goto out; - } - - add_bindex = add->bindex; - if (!remount) - au_br_do_add(sb, add_branch, add_bindex); - else { - sysaufs_brs_del(sb, add_bindex); - au_br_do_add(sb, add_branch, add_bindex); - sysaufs_brs_add(sb, add_bindex); - } - - h_dentry = add->path.dentry; - if (!add_bindex) { - au_cpup_attr_all(root_inode, /*force*/1); - sb->s_maxbytes = h_dentry->d_sb->s_maxbytes; - } else - au_add_nlink(root_inode, d_inode(h_dentry)); - - /* - * this test/set prevents aufs from handling unnecesary notify events - * of xino files, in case of re-adding a writable branch which was - * once detached from aufs. - */ - if (au_xino_brid(sb) < 0 - && au_br_writable(add_branch->br_perm) - && !au_test_fs_bad_xino(h_dentry->d_sb) - && add_branch->br_xino.xi_file - && add_branch->br_xino.xi_file->f_path.dentry->d_parent == h_dentry) - au_xino_brid_set(sb, add_branch->br_id); - -out: - return err; -} - -/* ---------------------------------------------------------------------- */ - -static unsigned long long au_farray_cb(void *a, - unsigned long long max __maybe_unused, - void *arg) -{ - unsigned long long n; - struct file **p, *f; - struct au_sphlhead *files; - struct au_finfo *finfo; - struct super_block *sb = arg; - - n = 0; - p = a; - files = &au_sbi(sb)->si_files; - spin_lock(&files->spin); - hlist_for_each_entry(finfo, &files->head, fi_hlist) { - f = finfo->fi_file; - if (file_count(f) - && !special_file(file_inode(f)->i_mode)) { - get_file(f); - *p++ = f; - n++; - AuDebugOn(n > max); - } - } - spin_unlock(&files->spin); - - return n; -} - -static struct file **au_farray_alloc(struct super_block *sb, - unsigned long long *max) -{ - *max = atomic_long_read(&au_sbi(sb)->si_nfiles); - return au_array_alloc(max, au_farray_cb, sb); -} - -static void au_farray_free(struct file **a, unsigned long long max) -{ - unsigned long long ull; - - for (ull = 0; ull < max; ull++) - if (a[ull]) - fput(a[ull]); - au_array_free(a); -} - -/* ---------------------------------------------------------------------- */ - -/* - * delete a branch - */ - -/* to show the line number, do not make it inlined function */ -#define AuVerbose(do_info, fmt, ...) do { \ - if (do_info) \ - pr_info(fmt, ##__VA_ARGS__); \ -} while (0) - -static int au_test_ibusy(struct inode *inode, aufs_bindex_t bstart, - aufs_bindex_t bend) -{ - return (inode && !S_ISDIR(inode->i_mode)) || bstart == bend; -} - -static int au_test_dbusy(struct dentry *dentry, aufs_bindex_t bstart, - aufs_bindex_t bend) -{ - return au_test_ibusy(d_inode(dentry), bstart, bend); -} - -/* - * test if the branch is deletable or not. - */ -static int test_dentry_busy(struct dentry *root, aufs_bindex_t bindex, - unsigned int sigen, const unsigned int verbose) -{ - int err, i, j, ndentry; - aufs_bindex_t bstart, bend; - struct au_dcsub_pages dpages; - struct au_dpage *dpage; - struct dentry *d; - - err = au_dpages_init(&dpages, GFP_NOFS); - if (unlikely(err)) - goto out; - err = au_dcsub_pages(&dpages, root, NULL, NULL); - if (unlikely(err)) - goto out_dpages; - - for (i = 0; !err && i < dpages.ndpage; i++) { - dpage = dpages.dpages + i; - ndentry = dpage->ndentry; - for (j = 0; !err && j < ndentry; j++) { - d = dpage->dentries[j]; - AuDebugOn(au_dcount(d) <= 0); - if (!au_digen_test(d, sigen)) { - di_read_lock_child(d, AuLock_IR); - if (unlikely(au_dbrange_test(d))) { - di_read_unlock(d, AuLock_IR); - continue; - } - } else { - di_write_lock_child(d); - if (unlikely(au_dbrange_test(d))) { - di_write_unlock(d); - continue; - } - err = au_reval_dpath(d, sigen); - if (!err) - di_downgrade_lock(d, AuLock_IR); - else { - di_write_unlock(d); - break; - } - } - - /* AuDbgDentry(d); */ - bstart = au_dbstart(d); - bend = au_dbend(d); - if (bstart <= bindex - && bindex <= bend - && au_h_dptr(d, bindex) - && au_test_dbusy(d, bstart, bend)) { - err = -EBUSY; - AuVerbose(verbose, "busy %pd\n", d); - AuDbgDentry(d); - } - di_read_unlock(d, AuLock_IR); - } - } - -out_dpages: - au_dpages_free(&dpages); -out: - return err; -} - -static int test_inode_busy(struct super_block *sb, aufs_bindex_t bindex, - unsigned int sigen, const unsigned int verbose) -{ - int err; - unsigned long long max, ull; - struct inode *i, **array; - aufs_bindex_t bstart, bend; - - array = au_iarray_alloc(sb, &max); - err = PTR_ERR(array); - if (IS_ERR(array)) - goto out; - - err = 0; - AuDbg("b%d\n", bindex); - for (ull = 0; !err && ull < max; ull++) { - i = array[ull]; - if (unlikely(!i)) - break; - if (i->i_ino == AUFS_ROOT_INO) - continue; - - /* AuDbgInode(i); */ - if (au_iigen(i, NULL) == sigen) - ii_read_lock_child(i); - else { - ii_write_lock_child(i); - err = au_refresh_hinode_self(i); - au_iigen_dec(i); - if (!err) - ii_downgrade_lock(i); - else { - ii_write_unlock(i); - break; - } - } - - bstart = au_ibstart(i); - bend = au_ibend(i); - if (bstart <= bindex - && bindex <= bend - && au_h_iptr(i, bindex) - && au_test_ibusy(i, bstart, bend)) { - err = -EBUSY; - AuVerbose(verbose, "busy i%lu\n", i->i_ino); - AuDbgInode(i); - } - ii_read_unlock(i); - } - au_iarray_free(array, max); - -out: - return err; -} - -static int test_children_busy(struct dentry *root, aufs_bindex_t bindex, - const unsigned int verbose) -{ - int err; - unsigned int sigen; - - sigen = au_sigen(root->d_sb); - DiMustNoWaiters(root); - IiMustNoWaiters(d_inode(root)); - di_write_unlock(root); - err = test_dentry_busy(root, bindex, sigen, verbose); - if (!err) - err = test_inode_busy(root->d_sb, bindex, sigen, verbose); - di_write_lock_child(root); /* aufs_write_lock() calls ..._child() */ - - return err; -} - -static int test_dir_busy(struct file *file, aufs_bindex_t br_id, - struct file **to_free, int *idx) -{ - int err; - unsigned char matched, root; - aufs_bindex_t bindex, bend; - struct au_fidir *fidir; - struct au_hfile *hfile; - - err = 0; - root = IS_ROOT(file->f_path.dentry); - if (root) { - get_file(file); - to_free[*idx] = file; - (*idx)++; - goto out; - } - - matched = 0; - fidir = au_fi(file)->fi_hdir; - AuDebugOn(!fidir); - bend = au_fbend_dir(file); - for (bindex = au_fbstart(file); bindex <= bend; bindex++) { - hfile = fidir->fd_hfile + bindex; - if (!hfile->hf_file) - continue; - - if (hfile->hf_br->br_id == br_id) { - matched = 1; - break; - } - } - if (matched) - err = -EBUSY; - -out: - return err; -} - -static int test_file_busy(struct super_block *sb, aufs_bindex_t br_id, - struct file **to_free, int opened) -{ - int err, idx; - unsigned long long ull, max; - aufs_bindex_t bstart; - struct file *file, **array; - struct dentry *root; - struct au_hfile *hfile; - - array = au_farray_alloc(sb, &max); - err = PTR_ERR(array); - if (IS_ERR(array)) - goto out; - - err = 0; - idx = 0; - root = sb->s_root; - di_write_unlock(root); - for (ull = 0; ull < max; ull++) { - file = array[ull]; - if (unlikely(!file)) - break; - - /* AuDbg("%pD\n", file); */ - fi_read_lock(file); - bstart = au_fbstart(file); - if (!d_is_dir(file->f_path.dentry)) { - hfile = &au_fi(file)->fi_htop; - if (hfile->hf_br->br_id == br_id) - err = -EBUSY; - } else - err = test_dir_busy(file, br_id, to_free, &idx); - fi_read_unlock(file); - if (unlikely(err)) - break; - } - di_write_lock_child(root); - au_farray_free(array, max); - AuDebugOn(idx > opened); - -out: - return err; -} - -static void br_del_file(struct file **to_free, unsigned long long opened, - aufs_bindex_t br_id) -{ - unsigned long long ull; - aufs_bindex_t bindex, bstart, bend, bfound; - struct file *file; - struct au_fidir *fidir; - struct au_hfile *hfile; - - for (ull = 0; ull < opened; ull++) { - file = to_free[ull]; - if (unlikely(!file)) - break; - - /* AuDbg("%pD\n", file); */ - AuDebugOn(!d_is_dir(file->f_path.dentry)); - bfound = -1; - fidir = au_fi(file)->fi_hdir; - AuDebugOn(!fidir); - fi_write_lock(file); - bstart = au_fbstart(file); - bend = au_fbend_dir(file); - for (bindex = bstart; bindex <= bend; bindex++) { - hfile = fidir->fd_hfile + bindex; - if (!hfile->hf_file) - continue; - - if (hfile->hf_br->br_id == br_id) { - bfound = bindex; - break; - } - } - AuDebugOn(bfound < 0); - au_set_h_fptr(file, bfound, NULL); - if (bfound == bstart) { - for (bstart++; bstart <= bend; bstart++) - if (au_hf_dir(file, bstart)) { - au_set_fbstart(file, bstart); - break; - } - } - fi_write_unlock(file); - } -} - -static void au_br_do_del_brp(struct au_sbinfo *sbinfo, - const aufs_bindex_t bindex, - const aufs_bindex_t bend) -{ - struct au_branch **brp, **p; - - AuRwMustWriteLock(&sbinfo->si_rwsem); - - brp = sbinfo->si_branch + bindex; - if (bindex < bend) - memmove(brp, brp + 1, sizeof(*brp) * (bend - bindex)); - sbinfo->si_branch[0 + bend] = NULL; - sbinfo->si_bend--; - - p = krealloc(sbinfo->si_branch, sizeof(*p) * bend, AuGFP_SBILIST); - if (p) - sbinfo->si_branch = p; - /* harmless error */ -} - -static void au_br_do_del_hdp(struct au_dinfo *dinfo, const aufs_bindex_t bindex, - const aufs_bindex_t bend) -{ - struct au_hdentry *hdp, *p; - - AuRwMustWriteLock(&dinfo->di_rwsem); - - hdp = dinfo->di_hdentry; - if (bindex < bend) - memmove(hdp + bindex, hdp + bindex + 1, - sizeof(*hdp) * (bend - bindex)); - hdp[0 + bend].hd_dentry = NULL; - dinfo->di_bend--; - - p = krealloc(hdp, sizeof(*p) * bend, AuGFP_SBILIST); - if (p) - dinfo->di_hdentry = p; - /* harmless error */ -} - -static void au_br_do_del_hip(struct au_iinfo *iinfo, const aufs_bindex_t bindex, - const aufs_bindex_t bend) -{ - struct au_hinode *hip, *p; - - AuRwMustWriteLock(&iinfo->ii_rwsem); - - hip = iinfo->ii_hinode + bindex; - if (bindex < bend) - memmove(hip, hip + 1, sizeof(*hip) * (bend - bindex)); - iinfo->ii_hinode[0 + bend].hi_inode = NULL; - au_hn_init(iinfo->ii_hinode + bend); - iinfo->ii_bend--; - - p = krealloc(iinfo->ii_hinode, sizeof(*p) * bend, AuGFP_SBILIST); - if (p) - iinfo->ii_hinode = p; - /* harmless error */ -} - -static void au_br_do_del(struct super_block *sb, aufs_bindex_t bindex, - struct au_branch *br) -{ - aufs_bindex_t bend; - struct au_sbinfo *sbinfo; - struct dentry *root, *h_root; - struct inode *inode, *h_inode; - struct au_hinode *hinode; - - SiMustWriteLock(sb); - - root = sb->s_root; - inode = d_inode(root); - sbinfo = au_sbi(sb); - bend = sbinfo->si_bend; - - h_root = au_h_dptr(root, bindex); - hinode = au_hi(inode, bindex); - h_inode = au_igrab(hinode->hi_inode); - au_hiput(hinode); - - au_sbilist_lock(); - au_br_do_del_brp(sbinfo, bindex, bend); - au_br_do_del_hdp(au_di(root), bindex, bend); - au_br_do_del_hip(au_ii(inode), bindex, bend); - au_sbilist_unlock(); - - dput(h_root); - iput(h_inode); - au_br_do_free(br); -} - -static unsigned long long empty_cb(void *array, unsigned long long max, - void *arg) -{ - return max; -} - -int au_br_del(struct super_block *sb, struct au_opt_del *del, int remount) -{ - int err, rerr, i; - unsigned long long opened; - unsigned int mnt_flags; - aufs_bindex_t bindex, bend, br_id; - unsigned char do_wh, verbose; - struct au_branch *br; - struct au_wbr *wbr; - struct dentry *root; - struct file **to_free; - - err = 0; - opened = 0; - to_free = NULL; - root = sb->s_root; - bindex = au_find_dbindex(root, del->h_path.dentry); - if (bindex < 0) { - if (remount) - goto out; /* success */ - err = -ENOENT; - pr_err("%s no such branch\n", del->pathname); - goto out; - } - AuDbg("bindex b%d\n", bindex); - - err = -EBUSY; - mnt_flags = au_mntflags(sb); - verbose = !!au_opt_test(mnt_flags, VERBOSE); - bend = au_sbend(sb); - if (unlikely(!bend)) { - AuVerbose(verbose, "no more branches left\n"); - goto out; - } - br = au_sbr(sb, bindex); - AuDebugOn(!path_equal(&br->br_path, &del->h_path)); - - br_id = br->br_id; - opened = atomic_read(&br->br_count); - if (unlikely(opened)) { - to_free = au_array_alloc(&opened, empty_cb, NULL); - err = PTR_ERR(to_free); - if (IS_ERR(to_free)) - goto out; - - err = test_file_busy(sb, br_id, to_free, opened); - if (unlikely(err)) { - AuVerbose(verbose, "%llu file(s) opened\n", opened); - goto out; - } - } - - wbr = br->br_wbr; - do_wh = wbr && (wbr->wbr_whbase || wbr->wbr_plink || wbr->wbr_orph); - if (do_wh) { - /* instead of WbrWhMustWriteLock(wbr) */ - SiMustWriteLock(sb); - for (i = 0; i < AuBrWh_Last; i++) { - dput(wbr->wbr_wh[i]); - wbr->wbr_wh[i] = NULL; - } - } - - err = test_children_busy(root, bindex, verbose); - if (unlikely(err)) { - if (do_wh) - goto out_wh; - goto out; - } - - err = 0; - if (to_free) { - /* - * now we confirmed the branch is deletable. - * let's free the remaining opened dirs on the branch. - */ - di_write_unlock(root); - br_del_file(to_free, opened, br_id); - di_write_lock_child(root); - } - - if (!remount) - au_br_do_del(sb, bindex, br); - else { - sysaufs_brs_del(sb, bindex); - au_br_do_del(sb, bindex, br); - sysaufs_brs_add(sb, bindex); - } - - if (!bindex) { - au_cpup_attr_all(d_inode(root), /*force*/1); - sb->s_maxbytes = au_sbr_sb(sb, 0)->s_maxbytes; - } else - au_sub_nlink(d_inode(root), d_inode(del->h_path.dentry)); - if (au_opt_test(mnt_flags, PLINK)) - au_plink_half_refresh(sb, br_id); - - if (au_xino_brid(sb) == br_id) - au_xino_brid_set(sb, -1); - goto out; /* success */ - -out_wh: - /* revert */ - rerr = au_br_init_wh(sb, br, br->br_perm); - if (rerr) - pr_warn("failed re-creating base whiteout, %s. (%d)\n", - del->pathname, rerr); -out: - if (to_free) - au_farray_free(to_free, opened); - return err; -} - -/* ---------------------------------------------------------------------- */ - -static int au_ibusy(struct super_block *sb, struct aufs_ibusy __user *arg) -{ - int err; - aufs_bindex_t bstart, bend; - struct aufs_ibusy ibusy; - struct inode *inode, *h_inode; - - err = -EPERM; - if (unlikely(!capable(CAP_SYS_ADMIN))) - goto out; - - err = copy_from_user(&ibusy, arg, sizeof(ibusy)); - if (!err) - err = !access_ok(VERIFY_WRITE, &arg->h_ino, sizeof(arg->h_ino)); - if (unlikely(err)) { - err = -EFAULT; - AuTraceErr(err); - goto out; - } - - err = -EINVAL; - si_read_lock(sb, AuLock_FLUSH); - if (unlikely(ibusy.bindex < 0 || ibusy.bindex > au_sbend(sb))) - goto out_unlock; - - err = 0; - ibusy.h_ino = 0; /* invalid */ - inode = ilookup(sb, ibusy.ino); - if (!inode - || inode->i_ino == AUFS_ROOT_INO - || is_bad_inode(inode)) - goto out_unlock; - - ii_read_lock_child(inode); - bstart = au_ibstart(inode); - bend = au_ibend(inode); - if (bstart <= ibusy.bindex && ibusy.bindex <= bend) { - h_inode = au_h_iptr(inode, ibusy.bindex); - if (h_inode && au_test_ibusy(inode, bstart, bend)) - ibusy.h_ino = h_inode->i_ino; - } - ii_read_unlock(inode); - iput(inode); - -out_unlock: - si_read_unlock(sb); - if (!err) { - err = __put_user(ibusy.h_ino, &arg->h_ino); - if (unlikely(err)) { - err = -EFAULT; - AuTraceErr(err); - } - } -out: - return err; -} - -long au_ibusy_ioctl(struct file *file, unsigned long arg) -{ - return au_ibusy(file->f_path.dentry->d_sb, (void __user *)arg); -} - -#ifdef CONFIG_COMPAT -long au_ibusy_compat_ioctl(struct file *file, unsigned long arg) -{ - return au_ibusy(file->f_path.dentry->d_sb, compat_ptr(arg)); -} -#endif - -/* ---------------------------------------------------------------------- */ - -/* - * change a branch permission - */ - -static void au_warn_ima(void) -{ -#ifdef CONFIG_IMA - /* since it doesn't support mark_files_ro() */ - AuWarn1("RW -> RO makes IMA to produce wrong message\n"); -#endif -} - -static int do_need_sigen_inc(int a, int b) -{ - return au_br_whable(a) && !au_br_whable(b); -} - -static int need_sigen_inc(int old, int new) -{ - return do_need_sigen_inc(old, new) - || do_need_sigen_inc(new, old); -} - -static int au_br_mod_files_ro(struct super_block *sb, aufs_bindex_t bindex) -{ - int err, do_warn; - unsigned int mnt_flags; - unsigned long long ull, max; - aufs_bindex_t br_id; - unsigned char verbose, writer; - struct file *file, *hf, **array; - struct au_hfile *hfile; - - mnt_flags = au_mntflags(sb); - verbose = !!au_opt_test(mnt_flags, VERBOSE); - - array = au_farray_alloc(sb, &max); - err = PTR_ERR(array); - if (IS_ERR(array)) - goto out; - - do_warn = 0; - br_id = au_sbr_id(sb, bindex); - for (ull = 0; ull < max; ull++) { - file = array[ull]; - if (unlikely(!file)) - break; - - /* AuDbg("%pD\n", file); */ - fi_read_lock(file); - if (unlikely(au_test_mmapped(file))) { - err = -EBUSY; - AuVerbose(verbose, "mmapped %pD\n", file); - AuDbgFile(file); - FiMustNoWaiters(file); - fi_read_unlock(file); - goto out_array; - } - - hfile = &au_fi(file)->fi_htop; - hf = hfile->hf_file; - if (!d_is_reg(file->f_path.dentry) - || !(file->f_mode & FMODE_WRITE) - || hfile->hf_br->br_id != br_id - || !(hf->f_mode & FMODE_WRITE)) - array[ull] = NULL; - else { - do_warn = 1; - get_file(file); - } - - FiMustNoWaiters(file); - fi_read_unlock(file); - fput(file); - } - - err = 0; - if (do_warn) - au_warn_ima(); - - for (ull = 0; ull < max; ull++) { - file = array[ull]; - if (!file) - continue; - - /* todo: already flushed? */ - /* - * fs/super.c:mark_files_ro() is gone, but aufs keeps its - * approach which resets f_mode and calls mnt_drop_write() and - * file_release_write() for each file, because the branch - * attribute in aufs world is totally different from the native - * fs rw/ro mode. - */ - /* fi_read_lock(file); */ - hfile = &au_fi(file)->fi_htop; - hf = hfile->hf_file; - /* fi_read_unlock(file); */ - spin_lock(&hf->f_lock); - writer = !!(hf->f_mode & FMODE_WRITER); - hf->f_mode &= ~(FMODE_WRITE | FMODE_WRITER); - spin_unlock(&hf->f_lock); - if (writer) { - put_write_access(file_inode(hf)); - __mnt_drop_write(hf->f_path.mnt); - } - } - -out_array: - au_farray_free(array, max); -out: - AuTraceErr(err); - return err; -} - -int au_br_mod(struct super_block *sb, struct au_opt_mod *mod, int remount, - int *do_refresh) -{ - int err, rerr; - aufs_bindex_t bindex; - struct dentry *root; - struct au_branch *br; - struct au_br_fhsm *bf; - - root = sb->s_root; - bindex = au_find_dbindex(root, mod->h_root); - if (bindex < 0) { - if (remount) - return 0; /* success */ - err = -ENOENT; - pr_err("%s no such branch\n", mod->path); - goto out; - } - AuDbg("bindex b%d\n", bindex); - - err = test_br(d_inode(mod->h_root), mod->perm, mod->path); - if (unlikely(err)) - goto out; - - br = au_sbr(sb, bindex); - AuDebugOn(mod->h_root != au_br_dentry(br)); - if (br->br_perm == mod->perm) - return 0; /* success */ - - /* pre-allocate for non-fhsm --> fhsm */ - bf = NULL; - if (!au_br_fhsm(br->br_perm) && au_br_fhsm(mod->perm)) { - err = au_fhsm_br_alloc(br); - if (unlikely(err)) - goto out; - bf = br->br_fhsm; - br->br_fhsm = NULL; - } - - if (au_br_writable(br->br_perm)) { - /* remove whiteout base */ - err = au_br_init_wh(sb, br, mod->perm); - if (unlikely(err)) - goto out_bf; - - if (!au_br_writable(mod->perm)) { - /* rw --> ro, file might be mmapped */ - DiMustNoWaiters(root); - IiMustNoWaiters(d_inode(root)); - di_write_unlock(root); - err = au_br_mod_files_ro(sb, bindex); - /* aufs_write_lock() calls ..._child() */ - di_write_lock_child(root); - - if (unlikely(err)) { - rerr = -ENOMEM; - br->br_wbr = kmalloc(sizeof(*br->br_wbr), - GFP_NOFS); - if (br->br_wbr) - rerr = au_wbr_init(br, sb, br->br_perm); - if (unlikely(rerr)) { - AuIOErr("nested error %d (%d)\n", - rerr, err); - br->br_perm = mod->perm; - } - } - } - } else if (au_br_writable(mod->perm)) { - /* ro --> rw */ - err = -ENOMEM; - br->br_wbr = kmalloc(sizeof(*br->br_wbr), GFP_NOFS); - if (br->br_wbr) { - err = au_wbr_init(br, sb, mod->perm); - if (unlikely(err)) { - kfree(br->br_wbr); - br->br_wbr = NULL; - } - } - } - if (unlikely(err)) - goto out_bf; - - if (au_br_fhsm(br->br_perm)) { - if (!au_br_fhsm(mod->perm)) { - /* fhsm --> non-fhsm */ - au_br_fhsm_fin(br->br_fhsm); - kfree(br->br_fhsm); - br->br_fhsm = NULL; - } - } else if (au_br_fhsm(mod->perm)) - /* non-fhsm --> fhsm */ - br->br_fhsm = bf; - - *do_refresh |= need_sigen_inc(br->br_perm, mod->perm); - br->br_perm = mod->perm; - goto out; /* success */ - -out_bf: - kfree(bf); -out: - AuTraceErr(err); - return err; -} - -/* ---------------------------------------------------------------------- */ - -int au_br_stfs(struct au_branch *br, struct aufs_stfs *stfs) -{ - int err; - struct kstatfs kstfs; - - err = vfs_statfs(&br->br_path, &kstfs); - if (!err) { - stfs->f_blocks = kstfs.f_blocks; - stfs->f_bavail = kstfs.f_bavail; - stfs->f_files = kstfs.f_files; - stfs->f_ffree = kstfs.f_ffree; - } - - return err; -} diff --git a/fs/aufs/branch.h b/fs/aufs/branch.h deleted file mode 100644 index 3d026f57a..000000000 --- a/fs/aufs/branch.h +++ /dev/null @@ -1,279 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * branch filesystems and xino for them - */ - -#ifndef __AUFS_BRANCH_H__ -#define __AUFS_BRANCH_H__ - -#ifdef __KERNEL__ - -#include <linux/mount.h> -#include "dynop.h" -#include "rwsem.h" -#include "super.h" - -/* ---------------------------------------------------------------------- */ - -/* a xino file */ -struct au_xino_file { - struct file *xi_file; - struct mutex xi_nondir_mtx; - - /* todo: make xino files an array to support huge inode number */ - -#ifdef CONFIG_DEBUG_FS - struct dentry *xi_dbgaufs; -#endif -}; - -/* File-based Hierarchical Storage Management */ -struct au_br_fhsm { -#ifdef CONFIG_AUFS_FHSM - struct mutex bf_lock; - unsigned long bf_jiffy; - struct aufs_stfs bf_stfs; - int bf_readable; -#endif -}; - -/* members for writable branch only */ -enum {AuBrWh_BASE, AuBrWh_PLINK, AuBrWh_ORPH, AuBrWh_Last}; -struct au_wbr { - struct au_rwsem wbr_wh_rwsem; - struct dentry *wbr_wh[AuBrWh_Last]; - atomic_t wbr_wh_running; -#define wbr_whbase wbr_wh[AuBrWh_BASE] /* whiteout base */ -#define wbr_plink wbr_wh[AuBrWh_PLINK] /* pseudo-link dir */ -#define wbr_orph wbr_wh[AuBrWh_ORPH] /* dir for orphans */ - - /* mfs mode */ - unsigned long long wbr_bytes; -}; - -/* ext2 has 3 types of operations at least, ext3 has 4 */ -#define AuBrDynOp (AuDyLast * 4) - -#ifdef CONFIG_AUFS_HFSNOTIFY -/* support for asynchronous destruction */ -struct au_br_hfsnotify { - struct fsnotify_group *hfsn_group; -}; -#endif - -/* sysfs entries */ -struct au_brsysfs { - char name[16]; - struct attribute attr; -}; - -enum { - AuBrSysfs_BR, - AuBrSysfs_BRID, - AuBrSysfs_Last -}; - -/* protected by superblock rwsem */ -struct au_branch { - struct au_xino_file br_xino; - - aufs_bindex_t br_id; - - int br_perm; - struct path br_path; - spinlock_t br_dykey_lock; - struct au_dykey *br_dykey[AuBrDynOp]; - atomic_t br_count; - - struct au_wbr *br_wbr; - struct au_br_fhsm *br_fhsm; - - /* xino truncation */ - atomic_t br_xino_running; - -#ifdef CONFIG_AUFS_HFSNOTIFY - struct au_br_hfsnotify *br_hfsn; -#endif - -#ifdef CONFIG_SYSFS - /* entries under sysfs per mount-point */ - struct au_brsysfs br_sysfs[AuBrSysfs_Last]; -#endif -}; - -/* ---------------------------------------------------------------------- */ - -static inline struct vfsmount *au_br_mnt(struct au_branch *br) -{ - return br->br_path.mnt; -} - -static inline struct dentry *au_br_dentry(struct au_branch *br) -{ - return br->br_path.dentry; -} - -static inline struct super_block *au_br_sb(struct au_branch *br) -{ - return au_br_mnt(br)->mnt_sb; -} - -static inline int au_br_rdonly(struct au_branch *br) -{ - return ((au_br_sb(br)->s_flags & MS_RDONLY) - || !au_br_writable(br->br_perm)) - ? -EROFS : 0; -} - -static inline int au_br_hnotifyable(int brperm __maybe_unused) -{ -#ifdef CONFIG_AUFS_HNOTIFY - return !(brperm & AuBrPerm_RR); -#else - return 0; -#endif -} - -static inline int au_br_test_oflag(int oflag, struct au_branch *br) -{ - int err, exec_flag; - - err = 0; - exec_flag = oflag & __FMODE_EXEC; - if (unlikely(exec_flag && (au_br_mnt(br)->mnt_flags & MNT_NOEXEC))) - err = -EACCES; - - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* branch.c */ -struct au_sbinfo; -void au_br_free(struct au_sbinfo *sinfo); -int au_br_index(struct super_block *sb, aufs_bindex_t br_id); -struct au_opt_add; -int au_br_add(struct super_block *sb, struct au_opt_add *add, int remount); -struct au_opt_del; -int au_br_del(struct super_block *sb, struct au_opt_del *del, int remount); -long au_ibusy_ioctl(struct file *file, unsigned long arg); -#ifdef CONFIG_COMPAT -long au_ibusy_compat_ioctl(struct file *file, unsigned long arg); -#endif -struct au_opt_mod; -int au_br_mod(struct super_block *sb, struct au_opt_mod *mod, int remount, - int *do_refresh); -struct aufs_stfs; -int au_br_stfs(struct au_branch *br, struct aufs_stfs *stfs); - -/* xino.c */ -static const loff_t au_loff_max = LLONG_MAX; - -int au_xib_trunc(struct super_block *sb); -ssize_t xino_fread(vfs_readf_t func, struct file *file, void *buf, size_t size, - loff_t *pos); -ssize_t xino_fwrite(vfs_writef_t func, struct file *file, void *buf, - size_t size, loff_t *pos); -struct file *au_xino_create2(struct file *base_file, struct file *copy_src); -struct file *au_xino_create(struct super_block *sb, char *fname, int silent); -ino_t au_xino_new_ino(struct super_block *sb); -void au_xino_delete_inode(struct inode *inode, const int unlinked); -int au_xino_write(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, - ino_t ino); -int au_xino_read(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, - ino_t *ino); -int au_xino_br(struct super_block *sb, struct au_branch *br, ino_t hino, - struct file *base_file, int do_test); -int au_xino_trunc(struct super_block *sb, aufs_bindex_t bindex); - -struct au_opt_xino; -int au_xino_set(struct super_block *sb, struct au_opt_xino *xino, int remount); -void au_xino_clr(struct super_block *sb); -struct file *au_xino_def(struct super_block *sb); -int au_xino_path(struct seq_file *seq, struct file *file); - -/* ---------------------------------------------------------------------- */ - -/* Superblock to branch */ -static inline -aufs_bindex_t au_sbr_id(struct super_block *sb, aufs_bindex_t bindex) -{ - return au_sbr(sb, bindex)->br_id; -} - -static inline -struct vfsmount *au_sbr_mnt(struct super_block *sb, aufs_bindex_t bindex) -{ - return au_br_mnt(au_sbr(sb, bindex)); -} - -static inline -struct super_block *au_sbr_sb(struct super_block *sb, aufs_bindex_t bindex) -{ - return au_br_sb(au_sbr(sb, bindex)); -} - -static inline void au_sbr_put(struct super_block *sb, aufs_bindex_t bindex) -{ - atomic_dec(&au_sbr(sb, bindex)->br_count); -} - -static inline int au_sbr_perm(struct super_block *sb, aufs_bindex_t bindex) -{ - return au_sbr(sb, bindex)->br_perm; -} - -static inline int au_sbr_whable(struct super_block *sb, aufs_bindex_t bindex) -{ - return au_br_whable(au_sbr_perm(sb, bindex)); -} - -/* ---------------------------------------------------------------------- */ - -/* - * wbr_wh_read_lock, wbr_wh_write_lock - * wbr_wh_read_unlock, wbr_wh_write_unlock, wbr_wh_downgrade_lock - */ -AuSimpleRwsemFuncs(wbr_wh, struct au_wbr *wbr, &wbr->wbr_wh_rwsem); - -#define WbrWhMustNoWaiters(wbr) AuRwMustNoWaiters(&wbr->wbr_wh_rwsem) -#define WbrWhMustAnyLock(wbr) AuRwMustAnyLock(&wbr->wbr_wh_rwsem) -#define WbrWhMustWriteLock(wbr) AuRwMustWriteLock(&wbr->wbr_wh_rwsem) - -/* ---------------------------------------------------------------------- */ - -#ifdef CONFIG_AUFS_FHSM -static inline void au_br_fhsm_init(struct au_br_fhsm *brfhsm) -{ - mutex_init(&brfhsm->bf_lock); - brfhsm->bf_jiffy = 0; - brfhsm->bf_readable = 0; -} - -static inline void au_br_fhsm_fin(struct au_br_fhsm *brfhsm) -{ - mutex_destroy(&brfhsm->bf_lock); -} -#else -AuStubVoid(au_br_fhsm_init, struct au_br_fhsm *brfhsm) -AuStubVoid(au_br_fhsm_fin, struct au_br_fhsm *brfhsm) -#endif - -#endif /* __KERNEL__ */ -#endif /* __AUFS_BRANCH_H__ */ diff --git a/fs/aufs/conf.mk b/fs/aufs/conf.mk deleted file mode 100644 index 0bbb2d3a5..000000000 --- a/fs/aufs/conf.mk +++ /dev/null @@ -1,38 +0,0 @@ - -AuConfStr = CONFIG_AUFS_FS=${CONFIG_AUFS_FS} - -define AuConf -ifdef ${1} -AuConfStr += ${1}=${${1}} -endif -endef - -AuConfAll = BRANCH_MAX_127 BRANCH_MAX_511 BRANCH_MAX_1023 BRANCH_MAX_32767 \ - SBILIST \ - HNOTIFY HFSNOTIFY \ - EXPORT INO_T_64 \ - XATTR \ - FHSM \ - RDU \ - SHWH \ - BR_RAMFS \ - BR_FUSE POLL \ - BR_HFSPLUS \ - BDEV_LOOP \ - DEBUG MAGIC_SYSRQ -$(foreach i, ${AuConfAll}, \ - $(eval $(call AuConf,CONFIG_AUFS_${i}))) - -AuConfName = ${obj}/conf.str -${AuConfName}.tmp: FORCE - @echo ${AuConfStr} | tr ' ' '\n' | sed -e 's/^/"/' -e 's/$$/\\n"/' > $@ -${AuConfName}: ${AuConfName}.tmp - @diff -q $< $@ > /dev/null 2>&1 || { \ - echo ' GEN ' $@; \ - cp -p $< $@; \ - } -FORCE: -clean-files += ${AuConfName} ${AuConfName}.tmp -${obj}/sysfs.o: ${AuConfName} - --include ${srctree}/${src}/conf_priv.mk diff --git a/fs/aufs/cpup.c b/fs/aufs/cpup.c deleted file mode 100644 index 3ea177a5d..000000000 --- a/fs/aufs/cpup.c +++ /dev/null @@ -1,1319 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * copy-up functions, see wbr_policy.c for copy-down - */ - -#include <linux/fs_stack.h> -#include <linux/mm.h> -#include "aufs.h" - -void au_cpup_attr_flags(struct inode *dst, unsigned int iflags) -{ - const unsigned int mask = S_DEAD | S_SWAPFILE | S_PRIVATE - | S_NOATIME | S_NOCMTIME | S_AUTOMOUNT; - - BUILD_BUG_ON(sizeof(iflags) != sizeof(dst->i_flags)); - - dst->i_flags |= iflags & ~mask; - if (au_test_fs_notime(dst->i_sb)) - dst->i_flags |= S_NOATIME | S_NOCMTIME; -} - -void au_cpup_attr_timesizes(struct inode *inode) -{ - struct inode *h_inode; - - h_inode = au_h_iptr(inode, au_ibstart(inode)); - fsstack_copy_attr_times(inode, h_inode); - fsstack_copy_inode_size(inode, h_inode); -} - -void au_cpup_attr_nlink(struct inode *inode, int force) -{ - struct inode *h_inode; - struct super_block *sb; - aufs_bindex_t bindex, bend; - - sb = inode->i_sb; - bindex = au_ibstart(inode); - h_inode = au_h_iptr(inode, bindex); - if (!force - && !S_ISDIR(h_inode->i_mode) - && au_opt_test(au_mntflags(sb), PLINK) - && au_plink_test(inode)) - return; - - /* - * 0 can happen in revalidating. - * h_inode->i_mutex may not be held here, but it is harmless since once - * i_nlink reaches 0, it will never become positive except O_TMPFILE - * case. - * todo: O_TMPFILE+linkat(AT_SYMLINK_FOLLOW) bypassing aufs may cause - * the incorrect link count. - */ - set_nlink(inode, h_inode->i_nlink); - - /* - * fewer nlink makes find(1) noisy, but larger nlink doesn't. - * it may includes whplink directory. - */ - if (S_ISDIR(h_inode->i_mode)) { - bend = au_ibend(inode); - for (bindex++; bindex <= bend; bindex++) { - h_inode = au_h_iptr(inode, bindex); - if (h_inode) - au_add_nlink(inode, h_inode); - } - } -} - -void au_cpup_attr_changeable(struct inode *inode) -{ - struct inode *h_inode; - - h_inode = au_h_iptr(inode, au_ibstart(inode)); - inode->i_mode = h_inode->i_mode; - inode->i_uid = h_inode->i_uid; - inode->i_gid = h_inode->i_gid; - au_cpup_attr_timesizes(inode); - au_cpup_attr_flags(inode, h_inode->i_flags); -} - -void au_cpup_igen(struct inode *inode, struct inode *h_inode) -{ - struct au_iinfo *iinfo = au_ii(inode); - - IiMustWriteLock(inode); - - iinfo->ii_higen = h_inode->i_generation; - iinfo->ii_hsb1 = h_inode->i_sb; -} - -void au_cpup_attr_all(struct inode *inode, int force) -{ - struct inode *h_inode; - - h_inode = au_h_iptr(inode, au_ibstart(inode)); - au_cpup_attr_changeable(inode); - if (inode->i_nlink > 0) - au_cpup_attr_nlink(inode, force); - inode->i_rdev = h_inode->i_rdev; - inode->i_blkbits = h_inode->i_blkbits; - au_cpup_igen(inode, h_inode); -} - -/* ---------------------------------------------------------------------- */ - -/* Note: dt_dentry and dt_h_dentry are not dget/dput-ed */ - -/* keep the timestamps of the parent dir when cpup */ -void au_dtime_store(struct au_dtime *dt, struct dentry *dentry, - struct path *h_path) -{ - struct inode *h_inode; - - dt->dt_dentry = dentry; - dt->dt_h_path = *h_path; - h_inode = d_inode(h_path->dentry); - dt->dt_atime = h_inode->i_atime; - dt->dt_mtime = h_inode->i_mtime; - /* smp_mb(); */ -} - -void au_dtime_revert(struct au_dtime *dt) -{ - struct iattr attr; - int err; - - attr.ia_atime = dt->dt_atime; - attr.ia_mtime = dt->dt_mtime; - attr.ia_valid = ATTR_FORCE | ATTR_MTIME | ATTR_MTIME_SET - | ATTR_ATIME | ATTR_ATIME_SET; - - /* no delegation since this is a directory */ - err = vfsub_notify_change(&dt->dt_h_path, &attr, /*delegated*/NULL); - if (unlikely(err)) - pr_warn("restoring timestamps failed(%d). ignored\n", err); -} - -/* ---------------------------------------------------------------------- */ - -/* internal use only */ -struct au_cpup_reg_attr { - int valid; - struct kstat st; - unsigned int iflags; /* inode->i_flags */ -}; - -static noinline_for_stack -int cpup_iattr(struct dentry *dst, aufs_bindex_t bindex, struct dentry *h_src, - struct au_cpup_reg_attr *h_src_attr) -{ - int err, sbits, icex; - unsigned int mnt_flags; - unsigned char verbose; - struct iattr ia; - struct path h_path; - struct inode *h_isrc, *h_idst; - struct kstat *h_st; - struct au_branch *br; - - h_path.dentry = au_h_dptr(dst, bindex); - h_idst = d_inode(h_path.dentry); - br = au_sbr(dst->d_sb, bindex); - h_path.mnt = au_br_mnt(br); - h_isrc = d_inode(h_src); - ia.ia_valid = ATTR_FORCE | ATTR_UID | ATTR_GID - | ATTR_ATIME | ATTR_MTIME - | ATTR_ATIME_SET | ATTR_MTIME_SET; - if (h_src_attr && h_src_attr->valid) { - h_st = &h_src_attr->st; - ia.ia_uid = h_st->uid; - ia.ia_gid = h_st->gid; - ia.ia_atime = h_st->atime; - ia.ia_mtime = h_st->mtime; - if (h_idst->i_mode != h_st->mode - && !S_ISLNK(h_idst->i_mode)) { - ia.ia_valid |= ATTR_MODE; - ia.ia_mode = h_st->mode; - } - sbits = !!(h_st->mode & (S_ISUID | S_ISGID)); - au_cpup_attr_flags(h_idst, h_src_attr->iflags); - } else { - ia.ia_uid = h_isrc->i_uid; - ia.ia_gid = h_isrc->i_gid; - ia.ia_atime = h_isrc->i_atime; - ia.ia_mtime = h_isrc->i_mtime; - if (h_idst->i_mode != h_isrc->i_mode - && !S_ISLNK(h_idst->i_mode)) { - ia.ia_valid |= ATTR_MODE; - ia.ia_mode = h_isrc->i_mode; - } - sbits = !!(h_isrc->i_mode & (S_ISUID | S_ISGID)); - au_cpup_attr_flags(h_idst, h_isrc->i_flags); - } - /* no delegation since it is just created */ - err = vfsub_notify_change(&h_path, &ia, /*delegated*/NULL); - - /* is this nfs only? */ - if (!err && sbits && au_test_nfs(h_path.dentry->d_sb)) { - ia.ia_valid = ATTR_FORCE | ATTR_MODE; - ia.ia_mode = h_isrc->i_mode; - err = vfsub_notify_change(&h_path, &ia, /*delegated*/NULL); - } - - icex = br->br_perm & AuBrAttr_ICEX; - if (!err) { - mnt_flags = au_mntflags(dst->d_sb); - verbose = !!au_opt_test(mnt_flags, VERBOSE); - err = au_cpup_xattr(h_path.dentry, h_src, icex, verbose); - } - - return err; -} - -/* ---------------------------------------------------------------------- */ - -static int au_do_copy_file(struct file *dst, struct file *src, loff_t len, - char *buf, unsigned long blksize) -{ - int err; - size_t sz, rbytes, wbytes; - unsigned char all_zero; - char *p, *zp; - struct mutex *h_mtx; - /* reduce stack usage */ - struct iattr *ia; - - zp = page_address(ZERO_PAGE(0)); - if (unlikely(!zp)) - return -ENOMEM; /* possible? */ - - err = 0; - all_zero = 0; - while (len) { - AuDbg("len %lld\n", len); - sz = blksize; - if (len < blksize) - sz = len; - - rbytes = 0; - /* todo: signal_pending? */ - while (!rbytes || err == -EAGAIN || err == -EINTR) { - rbytes = vfsub_read_k(src, buf, sz, &src->f_pos); - err = rbytes; - } - if (unlikely(err < 0)) - break; - - all_zero = 0; - if (len >= rbytes && rbytes == blksize) - all_zero = !memcmp(buf, zp, rbytes); - if (!all_zero) { - wbytes = rbytes; - p = buf; - while (wbytes) { - size_t b; - - b = vfsub_write_k(dst, p, wbytes, &dst->f_pos); - err = b; - /* todo: signal_pending? */ - if (unlikely(err == -EAGAIN || err == -EINTR)) - continue; - if (unlikely(err < 0)) - break; - wbytes -= b; - p += b; - } - if (unlikely(err < 0)) - break; - } else { - loff_t res; - - AuLabel(hole); - res = vfsub_llseek(dst, rbytes, SEEK_CUR); - err = res; - if (unlikely(res < 0)) - break; - } - len -= rbytes; - err = 0; - } - - /* the last block may be a hole */ - if (!err && all_zero) { - AuLabel(last hole); - - err = 1; - if (au_test_nfs(dst->f_path.dentry->d_sb)) { - /* nfs requires this step to make last hole */ - /* is this only nfs? */ - do { - /* todo: signal_pending? */ - err = vfsub_write_k(dst, "\0", 1, &dst->f_pos); - } while (err == -EAGAIN || err == -EINTR); - if (err == 1) - dst->f_pos--; - } - - if (err == 1) { - ia = (void *)buf; - ia->ia_size = dst->f_pos; - ia->ia_valid = ATTR_SIZE | ATTR_FILE; - ia->ia_file = dst; - h_mtx = &file_inode(dst)->i_mutex; - mutex_lock_nested(h_mtx, AuLsc_I_CHILD2); - /* no delegation since it is just created */ - err = vfsub_notify_change(&dst->f_path, ia, - /*delegated*/NULL); - mutex_unlock(h_mtx); - } - } - - return err; -} - -int au_copy_file(struct file *dst, struct file *src, loff_t len) -{ - int err; - unsigned long blksize; - unsigned char do_kfree; - char *buf; - - err = -ENOMEM; - blksize = dst->f_path.dentry->d_sb->s_blocksize; - if (!blksize || PAGE_SIZE < blksize) - blksize = PAGE_SIZE; - AuDbg("blksize %lu\n", blksize); - do_kfree = (blksize != PAGE_SIZE && blksize >= sizeof(struct iattr *)); - if (do_kfree) - buf = kmalloc(blksize, GFP_NOFS); - else - buf = (void *)__get_free_page(GFP_NOFS); - if (unlikely(!buf)) - goto out; - - if (len > (1 << 22)) - AuDbg("copying a large file %lld\n", (long long)len); - - src->f_pos = 0; - dst->f_pos = 0; - err = au_do_copy_file(dst, src, len, buf, blksize); - if (do_kfree) - kfree(buf); - else - free_page((unsigned long)buf); - -out: - return err; -} - -/* - * to support a sparse file which is opened with O_APPEND, - * we need to close the file. - */ -static int au_cp_regular(struct au_cp_generic *cpg) -{ - int err, i; - enum { SRC, DST }; - struct { - aufs_bindex_t bindex; - unsigned int flags; - struct dentry *dentry; - int force_wr; - struct file *file; - void *label; - } *f, file[] = { - { - .bindex = cpg->bsrc, - .flags = O_RDONLY | O_NOATIME | O_LARGEFILE, - .label = &&out - }, - { - .bindex = cpg->bdst, - .flags = O_WRONLY | O_NOATIME | O_LARGEFILE, - .force_wr = !!au_ftest_cpup(cpg->flags, RWDST), - .label = &&out_src - } - }; - struct super_block *sb; - - /* bsrc branch can be ro/rw. */ - sb = cpg->dentry->d_sb; - f = file; - for (i = 0; i < 2; i++, f++) { - f->dentry = au_h_dptr(cpg->dentry, f->bindex); - f->file = au_h_open(cpg->dentry, f->bindex, f->flags, - /*file*/NULL, f->force_wr); - err = PTR_ERR(f->file); - if (IS_ERR(f->file)) - goto *f->label; - } - - /* try stopping to update while we copyup */ - IMustLock(d_inode(file[SRC].dentry)); - err = au_copy_file(file[DST].file, file[SRC].file, cpg->len); - - fput(file[DST].file); - au_sbr_put(sb, file[DST].bindex); - -out_src: - fput(file[SRC].file); - au_sbr_put(sb, file[SRC].bindex); -out: - return err; -} - -static int au_do_cpup_regular(struct au_cp_generic *cpg, - struct au_cpup_reg_attr *h_src_attr) -{ - int err, rerr; - loff_t l; - struct path h_path; - struct inode *h_src_inode, *h_dst_inode; - - err = 0; - h_src_inode = au_h_iptr(d_inode(cpg->dentry), cpg->bsrc); - l = i_size_read(h_src_inode); - if (cpg->len == -1 || l < cpg->len) - cpg->len = l; - if (cpg->len) { - /* try stopping to update while we are referencing */ - mutex_lock_nested(&h_src_inode->i_mutex, AuLsc_I_CHILD); - au_pin_hdir_unlock(cpg->pin); - - h_path.dentry = au_h_dptr(cpg->dentry, cpg->bsrc); - h_path.mnt = au_sbr_mnt(cpg->dentry->d_sb, cpg->bsrc); - h_src_attr->iflags = h_src_inode->i_flags; - if (!au_test_nfs(h_src_inode->i_sb)) - err = vfs_getattr(&h_path, &h_src_attr->st); - else { - mutex_unlock(&h_src_inode->i_mutex); - err = vfs_getattr(&h_path, &h_src_attr->st); - mutex_lock_nested(&h_src_inode->i_mutex, AuLsc_I_CHILD); - } - if (unlikely(err)) { - mutex_unlock(&h_src_inode->i_mutex); - goto out; - } - h_src_attr->valid = 1; - err = au_cp_regular(cpg); - mutex_unlock(&h_src_inode->i_mutex); - rerr = au_pin_hdir_relock(cpg->pin); - if (!err && rerr) - err = rerr; - } - if (!err && (h_src_inode->i_state & I_LINKABLE)) { - h_path.dentry = au_h_dptr(cpg->dentry, cpg->bdst); - h_dst_inode = d_inode(h_path.dentry); - spin_lock(&h_dst_inode->i_lock); - h_dst_inode->i_state |= I_LINKABLE; - spin_unlock(&h_dst_inode->i_lock); - } - -out: - return err; -} - -static int au_do_cpup_symlink(struct path *h_path, struct dentry *h_src, - struct inode *h_dir) -{ - int err, symlen; - mm_segment_t old_fs; - union { - char *k; - char __user *u; - } sym; - struct inode *h_inode = d_inode(h_src); - const struct inode_operations *h_iop = h_inode->i_op; - - err = -ENOSYS; - if (unlikely(!h_iop->readlink)) - goto out; - - err = -ENOMEM; - sym.k = (void *)__get_free_page(GFP_NOFS); - if (unlikely(!sym.k)) - goto out; - - /* unnecessary to support mmap_sem since symlink is not mmap-able */ - old_fs = get_fs(); - set_fs(KERNEL_DS); - symlen = h_iop->readlink(h_src, sym.u, PATH_MAX); - err = symlen; - set_fs(old_fs); - - if (symlen > 0) { - sym.k[symlen] = 0; - err = vfsub_symlink(h_dir, h_path, sym.k); - } - free_page((unsigned long)sym.k); - -out: - return err; -} - -static noinline_for_stack -int cpup_entry(struct au_cp_generic *cpg, struct dentry *dst_parent, - struct au_cpup_reg_attr *h_src_attr) -{ - int err; - umode_t mode; - unsigned int mnt_flags; - unsigned char isdir, isreg, force; - const unsigned char do_dt = !!au_ftest_cpup(cpg->flags, DTIME); - struct au_dtime dt; - struct path h_path; - struct dentry *h_src, *h_dst, *h_parent; - struct inode *h_inode, *h_dir, *dir, *inode; - struct super_block *sb; - - /* bsrc branch can be ro/rw. */ - h_src = au_h_dptr(cpg->dentry, cpg->bsrc); - h_inode = d_inode(h_src); - AuDebugOn(h_inode != au_h_iptr(d_inode(cpg->dentry), cpg->bsrc)); - - /* try stopping to be referenced while we are creating */ - h_dst = au_h_dptr(cpg->dentry, cpg->bdst); - if (au_ftest_cpup(cpg->flags, RENAME)) - AuDebugOn(strncmp(h_dst->d_name.name, AUFS_WH_PFX, - AUFS_WH_PFX_LEN)); - h_parent = h_dst->d_parent; /* dir inode is locked */ - h_dir = d_inode(h_parent); - IMustLock(h_dir); - AuDebugOn(h_parent != h_dst->d_parent); - - sb = cpg->dentry->d_sb; - h_path.mnt = au_sbr_mnt(sb, cpg->bdst); - if (do_dt) { - h_path.dentry = h_parent; - au_dtime_store(&dt, dst_parent, &h_path); - } - h_path.dentry = h_dst; - - isreg = 0; - isdir = 0; - mode = h_inode->i_mode; - switch (mode & S_IFMT) { - case S_IFREG: - isreg = 1; - err = vfsub_create(h_dir, &h_path, mode | S_IWUSR, - /*want_excl*/true); - if (!err) - err = au_do_cpup_regular(cpg, h_src_attr); - break; - case S_IFDIR: - isdir = 1; - err = vfsub_mkdir(h_dir, &h_path, mode); - if (!err) { - /* - * strange behaviour from the users view, - * particularry setattr case - */ - dir = d_inode(dst_parent); - if (au_ibstart(dir) == cpg->bdst) - au_cpup_attr_nlink(dir, /*force*/1); - inode = d_inode(cpg->dentry); - au_cpup_attr_nlink(inode, /*force*/1); - } - break; - case S_IFLNK: - err = au_do_cpup_symlink(&h_path, h_src, h_dir); - break; - case S_IFCHR: - case S_IFBLK: - AuDebugOn(!capable(CAP_MKNOD)); - /*FALLTHROUGH*/ - case S_IFIFO: - case S_IFSOCK: - err = vfsub_mknod(h_dir, &h_path, mode, h_inode->i_rdev); - break; - default: - AuIOErr("Unknown inode type 0%o\n", mode); - err = -EIO; - } - - mnt_flags = au_mntflags(sb); - if (!au_opt_test(mnt_flags, UDBA_NONE) - && !isdir - && au_opt_test(mnt_flags, XINO) - && (h_inode->i_nlink == 1 - || (h_inode->i_state & I_LINKABLE)) - /* todo: unnecessary? */ - /* && d_inode(cpg->dentry)->i_nlink == 1 */ - && cpg->bdst < cpg->bsrc - && !au_ftest_cpup(cpg->flags, KEEPLINO)) - au_xino_write(sb, cpg->bsrc, h_inode->i_ino, /*ino*/0); - /* ignore this error */ - - if (!err) { - force = 0; - if (isreg) { - force = !!cpg->len; - if (cpg->len == -1) - force = !!i_size_read(h_inode); - } - au_fhsm_wrote(sb, cpg->bdst, force); - } - - if (do_dt) - au_dtime_revert(&dt); - return err; -} - -static int au_do_ren_after_cpup(struct au_cp_generic *cpg, struct path *h_path) -{ - int err; - struct dentry *dentry, *h_dentry, *h_parent, *parent; - struct inode *h_dir; - aufs_bindex_t bdst; - - dentry = cpg->dentry; - bdst = cpg->bdst; - h_dentry = au_h_dptr(dentry, bdst); - if (!au_ftest_cpup(cpg->flags, OVERWRITE)) { - dget(h_dentry); - au_set_h_dptr(dentry, bdst, NULL); - err = au_lkup_neg(dentry, bdst, /*wh*/0); - if (!err) - h_path->dentry = dget(au_h_dptr(dentry, bdst)); - au_set_h_dptr(dentry, bdst, h_dentry); - } else { - err = 0; - parent = dget_parent(dentry); - h_parent = au_h_dptr(parent, bdst); - dput(parent); - h_path->dentry = vfsub_lkup_one(&dentry->d_name, h_parent); - if (IS_ERR(h_path->dentry)) - err = PTR_ERR(h_path->dentry); - } - if (unlikely(err)) - goto out; - - h_parent = h_dentry->d_parent; /* dir inode is locked */ - h_dir = d_inode(h_parent); - IMustLock(h_dir); - AuDbg("%pd %pd\n", h_dentry, h_path->dentry); - /* no delegation since it is just created */ - err = vfsub_rename(h_dir, h_dentry, h_dir, h_path, /*delegated*/NULL); - dput(h_path->dentry); - -out: - return err; -} - -/* - * copyup the @dentry from @bsrc to @bdst. - * the caller must set the both of lower dentries. - * @len is for truncating when it is -1 copyup the entire file. - * in link/rename cases, @dst_parent may be different from the real one. - * basic->bsrc can be larger than basic->bdst. - */ -static int au_cpup_single(struct au_cp_generic *cpg, struct dentry *dst_parent) -{ - int err, rerr; - aufs_bindex_t old_ibstart; - unsigned char isdir, plink; - struct dentry *h_src, *h_dst, *h_parent; - struct inode *dst_inode, *h_dir, *inode, *delegated, *src_inode; - struct super_block *sb; - struct au_branch *br; - /* to reuduce stack size */ - struct { - struct au_dtime dt; - struct path h_path; - struct au_cpup_reg_attr h_src_attr; - } *a; - - err = -ENOMEM; - a = kmalloc(sizeof(*a), GFP_NOFS); - if (unlikely(!a)) - goto out; - a->h_src_attr.valid = 0; - - sb = cpg->dentry->d_sb; - br = au_sbr(sb, cpg->bdst); - a->h_path.mnt = au_br_mnt(br); - h_dst = au_h_dptr(cpg->dentry, cpg->bdst); - h_parent = h_dst->d_parent; /* dir inode is locked */ - h_dir = d_inode(h_parent); - IMustLock(h_dir); - - h_src = au_h_dptr(cpg->dentry, cpg->bsrc); - inode = d_inode(cpg->dentry); - - if (!dst_parent) - dst_parent = dget_parent(cpg->dentry); - else - dget(dst_parent); - - plink = !!au_opt_test(au_mntflags(sb), PLINK); - dst_inode = au_h_iptr(inode, cpg->bdst); - if (dst_inode) { - if (unlikely(!plink)) { - err = -EIO; - AuIOErr("hi%lu(i%lu) exists on b%d " - "but plink is disabled\n", - dst_inode->i_ino, inode->i_ino, cpg->bdst); - goto out_parent; - } - - if (dst_inode->i_nlink) { - const int do_dt = au_ftest_cpup(cpg->flags, DTIME); - - h_src = au_plink_lkup(inode, cpg->bdst); - err = PTR_ERR(h_src); - if (IS_ERR(h_src)) - goto out_parent; - if (unlikely(d_is_negative(h_src))) { - err = -EIO; - AuIOErr("i%lu exists on a upper branch " - "but not pseudo-linked\n", - inode->i_ino); - dput(h_src); - goto out_parent; - } - - if (do_dt) { - a->h_path.dentry = h_parent; - au_dtime_store(&a->dt, dst_parent, &a->h_path); - } - - a->h_path.dentry = h_dst; - delegated = NULL; - err = vfsub_link(h_src, h_dir, &a->h_path, &delegated); - if (!err && au_ftest_cpup(cpg->flags, RENAME)) - err = au_do_ren_after_cpup(cpg, &a->h_path); - if (do_dt) - au_dtime_revert(&a->dt); - if (unlikely(err == -EWOULDBLOCK)) { - pr_warn("cannot retry for NFSv4 delegation" - " for an internal link\n"); - iput(delegated); - } - dput(h_src); - goto out_parent; - } else - /* todo: cpup_wh_file? */ - /* udba work */ - au_update_ibrange(inode, /*do_put_zero*/1); - } - - isdir = S_ISDIR(inode->i_mode); - old_ibstart = au_ibstart(inode); - err = cpup_entry(cpg, dst_parent, &a->h_src_attr); - if (unlikely(err)) - goto out_rev; - dst_inode = d_inode(h_dst); - mutex_lock_nested(&dst_inode->i_mutex, AuLsc_I_CHILD2); - /* todo: necessary? */ - /* au_pin_hdir_unlock(cpg->pin); */ - - err = cpup_iattr(cpg->dentry, cpg->bdst, h_src, &a->h_src_attr); - if (unlikely(err)) { - /* todo: necessary? */ - /* au_pin_hdir_relock(cpg->pin); */ /* ignore an error */ - mutex_unlock(&dst_inode->i_mutex); - goto out_rev; - } - - if (cpg->bdst < old_ibstart) { - if (S_ISREG(inode->i_mode)) { - err = au_dy_iaop(inode, cpg->bdst, dst_inode); - if (unlikely(err)) { - /* ignore an error */ - /* au_pin_hdir_relock(cpg->pin); */ - mutex_unlock(&dst_inode->i_mutex); - goto out_rev; - } - } - au_set_ibstart(inode, cpg->bdst); - } else - au_set_ibend(inode, cpg->bdst); - au_set_h_iptr(inode, cpg->bdst, au_igrab(dst_inode), - au_hi_flags(inode, isdir)); - - /* todo: necessary? */ - /* err = au_pin_hdir_relock(cpg->pin); */ - mutex_unlock(&dst_inode->i_mutex); - if (unlikely(err)) - goto out_rev; - - src_inode = d_inode(h_src); - if (!isdir - && (src_inode->i_nlink > 1 - || src_inode->i_state & I_LINKABLE) - && plink) - au_plink_append(inode, cpg->bdst, h_dst); - - if (au_ftest_cpup(cpg->flags, RENAME)) { - a->h_path.dentry = h_dst; - err = au_do_ren_after_cpup(cpg, &a->h_path); - } - if (!err) - goto out_parent; /* success */ - - /* revert */ -out_rev: - a->h_path.dentry = h_parent; - au_dtime_store(&a->dt, dst_parent, &a->h_path); - a->h_path.dentry = h_dst; - rerr = 0; - if (d_is_positive(h_dst)) { - if (!isdir) { - /* no delegation since it is just created */ - rerr = vfsub_unlink(h_dir, &a->h_path, - /*delegated*/NULL, /*force*/0); - } else - rerr = vfsub_rmdir(h_dir, &a->h_path); - } - au_dtime_revert(&a->dt); - if (rerr) { - AuIOErr("failed removing broken entry(%d, %d)\n", err, rerr); - err = -EIO; - } -out_parent: - dput(dst_parent); - kfree(a); -out: - return err; -} - -#if 0 /* reserved */ -struct au_cpup_single_args { - int *errp; - struct au_cp_generic *cpg; - struct dentry *dst_parent; -}; - -static void au_call_cpup_single(void *args) -{ - struct au_cpup_single_args *a = args; - - au_pin_hdir_acquire_nest(a->cpg->pin); - *a->errp = au_cpup_single(a->cpg, a->dst_parent); - au_pin_hdir_release(a->cpg->pin); -} -#endif - -/* - * prevent SIGXFSZ in copy-up. - * testing CAP_MKNOD is for generic fs, - * but CAP_FSETID is for xfs only, currently. - */ -static int au_cpup_sio_test(struct au_pin *pin, umode_t mode) -{ - int do_sio; - struct super_block *sb; - struct inode *h_dir; - - do_sio = 0; - sb = au_pinned_parent(pin)->d_sb; - if (!au_wkq_test() - && (!au_sbi(sb)->si_plink_maint_pid - || au_plink_maint(sb, AuLock_NOPLM))) { - switch (mode & S_IFMT) { - case S_IFREG: - /* no condition about RLIMIT_FSIZE and the file size */ - do_sio = 1; - break; - case S_IFCHR: - case S_IFBLK: - do_sio = !capable(CAP_MKNOD); - break; - } - if (!do_sio) - do_sio = ((mode & (S_ISUID | S_ISGID)) - && !capable(CAP_FSETID)); - /* this workaround may be removed in the future */ - if (!do_sio) { - h_dir = au_pinned_h_dir(pin); - do_sio = h_dir->i_mode & S_ISVTX; - } - } - - return do_sio; -} - -#if 0 /* reserved */ -int au_sio_cpup_single(struct au_cp_generic *cpg, struct dentry *dst_parent) -{ - int err, wkq_err; - struct dentry *h_dentry; - - h_dentry = au_h_dptr(cpg->dentry, cpg->bsrc); - if (!au_cpup_sio_test(pin, d_inode(h_dentry)->i_mode)) - err = au_cpup_single(cpg, dst_parent); - else { - struct au_cpup_single_args args = { - .errp = &err, - .cpg = cpg, - .dst_parent = dst_parent - }; - wkq_err = au_wkq_wait(au_call_cpup_single, &args); - if (unlikely(wkq_err)) - err = wkq_err; - } - - return err; -} -#endif - -/* - * copyup the @dentry from the first active lower branch to @bdst, - * using au_cpup_single(). - */ -static int au_cpup_simple(struct au_cp_generic *cpg) -{ - int err; - unsigned int flags_orig; - struct dentry *dentry; - - AuDebugOn(cpg->bsrc < 0); - - dentry = cpg->dentry; - DiMustWriteLock(dentry); - - err = au_lkup_neg(dentry, cpg->bdst, /*wh*/1); - if (!err) { - flags_orig = cpg->flags; - au_fset_cpup(cpg->flags, RENAME); - err = au_cpup_single(cpg, NULL); - cpg->flags = flags_orig; - if (!err) - return 0; /* success */ - - /* revert */ - au_set_h_dptr(dentry, cpg->bdst, NULL); - au_set_dbstart(dentry, cpg->bsrc); - } - - return err; -} - -struct au_cpup_simple_args { - int *errp; - struct au_cp_generic *cpg; -}; - -static void au_call_cpup_simple(void *args) -{ - struct au_cpup_simple_args *a = args; - - au_pin_hdir_acquire_nest(a->cpg->pin); - *a->errp = au_cpup_simple(a->cpg); - au_pin_hdir_release(a->cpg->pin); -} - -static int au_do_sio_cpup_simple(struct au_cp_generic *cpg) -{ - int err, wkq_err; - struct dentry *dentry, *parent; - struct file *h_file; - struct inode *h_dir; - - dentry = cpg->dentry; - h_file = NULL; - if (au_ftest_cpup(cpg->flags, HOPEN)) { - AuDebugOn(cpg->bsrc < 0); - h_file = au_h_open_pre(dentry, cpg->bsrc, /*force_wr*/0); - err = PTR_ERR(h_file); - if (IS_ERR(h_file)) - goto out; - } - - parent = dget_parent(dentry); - h_dir = au_h_iptr(d_inode(parent), cpg->bdst); - if (!au_test_h_perm_sio(h_dir, MAY_EXEC | MAY_WRITE) - && !au_cpup_sio_test(cpg->pin, d_inode(dentry)->i_mode)) - err = au_cpup_simple(cpg); - else { - struct au_cpup_simple_args args = { - .errp = &err, - .cpg = cpg - }; - wkq_err = au_wkq_wait(au_call_cpup_simple, &args); - if (unlikely(wkq_err)) - err = wkq_err; - } - - dput(parent); - if (h_file) - au_h_open_post(dentry, cpg->bsrc, h_file); - -out: - return err; -} - -int au_sio_cpup_simple(struct au_cp_generic *cpg) -{ - aufs_bindex_t bsrc, bend; - struct dentry *dentry, *h_dentry; - - if (cpg->bsrc < 0) { - dentry = cpg->dentry; - bend = au_dbend(dentry); - for (bsrc = cpg->bdst + 1; bsrc <= bend; bsrc++) { - h_dentry = au_h_dptr(dentry, bsrc); - if (h_dentry) { - AuDebugOn(d_is_negative(h_dentry)); - break; - } - } - AuDebugOn(bsrc > bend); - cpg->bsrc = bsrc; - } - AuDebugOn(cpg->bsrc <= cpg->bdst); - return au_do_sio_cpup_simple(cpg); -} - -int au_sio_cpdown_simple(struct au_cp_generic *cpg) -{ - AuDebugOn(cpg->bdst <= cpg->bsrc); - return au_do_sio_cpup_simple(cpg); -} - -/* ---------------------------------------------------------------------- */ - -/* - * copyup the deleted file for writing. - */ -static int au_do_cpup_wh(struct au_cp_generic *cpg, struct dentry *wh_dentry, - struct file *file) -{ - int err; - unsigned int flags_orig; - aufs_bindex_t bsrc_orig; - struct dentry *h_d_dst, *h_d_start; - struct au_dinfo *dinfo; - struct au_hdentry *hdp; - - dinfo = au_di(cpg->dentry); - AuRwMustWriteLock(&dinfo->di_rwsem); - - bsrc_orig = cpg->bsrc; - cpg->bsrc = dinfo->di_bstart; - hdp = dinfo->di_hdentry; - h_d_dst = hdp[0 + cpg->bdst].hd_dentry; - dinfo->di_bstart = cpg->bdst; - hdp[0 + cpg->bdst].hd_dentry = wh_dentry; - h_d_start = NULL; - if (file) { - h_d_start = hdp[0 + cpg->bsrc].hd_dentry; - hdp[0 + cpg->bsrc].hd_dentry = au_hf_top(file)->f_path.dentry; - } - flags_orig = cpg->flags; - cpg->flags = !AuCpup_DTIME; - err = au_cpup_single(cpg, /*h_parent*/NULL); - cpg->flags = flags_orig; - if (file) { - if (!err) - err = au_reopen_nondir(file); - hdp[0 + cpg->bsrc].hd_dentry = h_d_start; - } - hdp[0 + cpg->bdst].hd_dentry = h_d_dst; - dinfo->di_bstart = cpg->bsrc; - cpg->bsrc = bsrc_orig; - - return err; -} - -static int au_cpup_wh(struct au_cp_generic *cpg, struct file *file) -{ - int err; - aufs_bindex_t bdst; - struct au_dtime dt; - struct dentry *dentry, *parent, *h_parent, *wh_dentry; - struct au_branch *br; - struct path h_path; - - dentry = cpg->dentry; - bdst = cpg->bdst; - br = au_sbr(dentry->d_sb, bdst); - parent = dget_parent(dentry); - h_parent = au_h_dptr(parent, bdst); - wh_dentry = au_whtmp_lkup(h_parent, br, &dentry->d_name); - err = PTR_ERR(wh_dentry); - if (IS_ERR(wh_dentry)) - goto out; - - h_path.dentry = h_parent; - h_path.mnt = au_br_mnt(br); - au_dtime_store(&dt, parent, &h_path); - err = au_do_cpup_wh(cpg, wh_dentry, file); - if (unlikely(err)) - goto out_wh; - - dget(wh_dentry); - h_path.dentry = wh_dentry; - if (!d_is_dir(wh_dentry)) { - /* no delegation since it is just created */ - err = vfsub_unlink(d_inode(h_parent), &h_path, - /*delegated*/NULL, /*force*/0); - } else - err = vfsub_rmdir(d_inode(h_parent), &h_path); - if (unlikely(err)) { - AuIOErr("failed remove copied-up tmp file %pd(%d)\n", - wh_dentry, err); - err = -EIO; - } - au_dtime_revert(&dt); - au_set_hi_wh(d_inode(dentry), bdst, wh_dentry); - -out_wh: - dput(wh_dentry); -out: - dput(parent); - return err; -} - -struct au_cpup_wh_args { - int *errp; - struct au_cp_generic *cpg; - struct file *file; -}; - -static void au_call_cpup_wh(void *args) -{ - struct au_cpup_wh_args *a = args; - - au_pin_hdir_acquire_nest(a->cpg->pin); - *a->errp = au_cpup_wh(a->cpg, a->file); - au_pin_hdir_release(a->cpg->pin); -} - -int au_sio_cpup_wh(struct au_cp_generic *cpg, struct file *file) -{ - int err, wkq_err; - aufs_bindex_t bdst; - struct dentry *dentry, *parent, *h_orph, *h_parent; - struct inode *dir, *h_dir, *h_tmpdir; - struct au_wbr *wbr; - struct au_pin wh_pin, *pin_orig; - - dentry = cpg->dentry; - bdst = cpg->bdst; - parent = dget_parent(dentry); - dir = d_inode(parent); - h_orph = NULL; - h_parent = NULL; - h_dir = au_igrab(au_h_iptr(dir, bdst)); - h_tmpdir = h_dir; - pin_orig = NULL; - if (!h_dir->i_nlink) { - wbr = au_sbr(dentry->d_sb, bdst)->br_wbr; - h_orph = wbr->wbr_orph; - - h_parent = dget(au_h_dptr(parent, bdst)); - au_set_h_dptr(parent, bdst, dget(h_orph)); - h_tmpdir = d_inode(h_orph); - au_set_h_iptr(dir, bdst, au_igrab(h_tmpdir), /*flags*/0); - - mutex_lock_nested(&h_tmpdir->i_mutex, AuLsc_I_PARENT3); - /* todo: au_h_open_pre()? */ - - pin_orig = cpg->pin; - au_pin_init(&wh_pin, dentry, bdst, AuLsc_DI_PARENT, - AuLsc_I_PARENT3, cpg->pin->udba, AuPin_DI_LOCKED); - cpg->pin = &wh_pin; - } - - if (!au_test_h_perm_sio(h_tmpdir, MAY_EXEC | MAY_WRITE) - && !au_cpup_sio_test(cpg->pin, d_inode(dentry)->i_mode)) - err = au_cpup_wh(cpg, file); - else { - struct au_cpup_wh_args args = { - .errp = &err, - .cpg = cpg, - .file = file - }; - wkq_err = au_wkq_wait(au_call_cpup_wh, &args); - if (unlikely(wkq_err)) - err = wkq_err; - } - - if (h_orph) { - mutex_unlock(&h_tmpdir->i_mutex); - /* todo: au_h_open_post()? */ - au_set_h_iptr(dir, bdst, au_igrab(h_dir), /*flags*/0); - au_set_h_dptr(parent, bdst, h_parent); - AuDebugOn(!pin_orig); - cpg->pin = pin_orig; - } - iput(h_dir); - dput(parent); - - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* - * generic routine for both of copy-up and copy-down. - */ -/* cf. revalidate function in file.c */ -int au_cp_dirs(struct dentry *dentry, aufs_bindex_t bdst, - int (*cp)(struct dentry *dentry, aufs_bindex_t bdst, - struct au_pin *pin, - struct dentry *h_parent, void *arg), - void *arg) -{ - int err; - struct au_pin pin; - struct dentry *d, *parent, *h_parent, *real_parent, *h_dentry; - - err = 0; - parent = dget_parent(dentry); - if (IS_ROOT(parent)) - goto out; - - au_pin_init(&pin, dentry, bdst, AuLsc_DI_PARENT2, AuLsc_I_PARENT2, - au_opt_udba(dentry->d_sb), AuPin_MNT_WRITE); - - /* do not use au_dpage */ - real_parent = parent; - while (1) { - dput(parent); - parent = dget_parent(dentry); - h_parent = au_h_dptr(parent, bdst); - if (h_parent) - goto out; /* success */ - - /* find top dir which is necessary to cpup */ - do { - d = parent; - dput(parent); - parent = dget_parent(d); - di_read_lock_parent3(parent, !AuLock_IR); - h_parent = au_h_dptr(parent, bdst); - di_read_unlock(parent, !AuLock_IR); - } while (!h_parent); - - if (d != real_parent) - di_write_lock_child3(d); - - /* somebody else might create while we were sleeping */ - h_dentry = au_h_dptr(d, bdst); - if (!h_dentry || d_is_negative(h_dentry)) { - if (h_dentry) - au_update_dbstart(d); - - au_pin_set_dentry(&pin, d); - err = au_do_pin(&pin); - if (!err) { - err = cp(d, bdst, &pin, h_parent, arg); - au_unpin(&pin); - } - } - - if (d != real_parent) - di_write_unlock(d); - if (unlikely(err)) - break; - } - -out: - dput(parent); - return err; -} - -static int au_cpup_dir(struct dentry *dentry, aufs_bindex_t bdst, - struct au_pin *pin, - struct dentry *h_parent __maybe_unused, - void *arg __maybe_unused) -{ - struct au_cp_generic cpg = { - .dentry = dentry, - .bdst = bdst, - .bsrc = -1, - .len = 0, - .pin = pin, - .flags = AuCpup_DTIME - }; - return au_sio_cpup_simple(&cpg); -} - -int au_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst) -{ - return au_cp_dirs(dentry, bdst, au_cpup_dir, NULL); -} - -int au_test_and_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst) -{ - int err; - struct dentry *parent; - struct inode *dir; - - parent = dget_parent(dentry); - dir = d_inode(parent); - err = 0; - if (au_h_iptr(dir, bdst)) - goto out; - - di_read_unlock(parent, AuLock_IR); - di_write_lock_parent(parent); - /* someone else might change our inode while we were sleeping */ - if (!au_h_iptr(dir, bdst)) - err = au_cpup_dirs(dentry, bdst); - di_downgrade_lock(parent, AuLock_IR); - -out: - dput(parent); - return err; -} diff --git a/fs/aufs/cpup.h b/fs/aufs/cpup.h deleted file mode 100644 index a5da7dbe9..000000000 --- a/fs/aufs/cpup.h +++ /dev/null @@ -1,94 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * copy-up/down functions - */ - -#ifndef __AUFS_CPUP_H__ -#define __AUFS_CPUP_H__ - -#ifdef __KERNEL__ - -#include <linux/path.h> - -struct inode; -struct file; -struct au_pin; - -void au_cpup_attr_flags(struct inode *dst, unsigned int iflags); -void au_cpup_attr_timesizes(struct inode *inode); -void au_cpup_attr_nlink(struct inode *inode, int force); -void au_cpup_attr_changeable(struct inode *inode); -void au_cpup_igen(struct inode *inode, struct inode *h_inode); -void au_cpup_attr_all(struct inode *inode, int force); - -/* ---------------------------------------------------------------------- */ - -struct au_cp_generic { - struct dentry *dentry; - aufs_bindex_t bdst, bsrc; - loff_t len; - struct au_pin *pin; - unsigned int flags; -}; - -/* cpup flags */ -#define AuCpup_DTIME 1 /* do dtime_store/revert */ -#define AuCpup_KEEPLINO (1 << 1) /* do not clear the lower xino, - for link(2) */ -#define AuCpup_RENAME (1 << 2) /* rename after cpup */ -#define AuCpup_HOPEN (1 << 3) /* call h_open_pre/post() in - cpup */ -#define AuCpup_OVERWRITE (1 << 4) /* allow overwriting the - existing entry */ -#define AuCpup_RWDST (1 << 5) /* force write target even if - the branch is marked as RO */ - -#define au_ftest_cpup(flags, name) ((flags) & AuCpup_##name) -#define au_fset_cpup(flags, name) \ - do { (flags) |= AuCpup_##name; } while (0) -#define au_fclr_cpup(flags, name) \ - do { (flags) &= ~AuCpup_##name; } while (0) - -int au_copy_file(struct file *dst, struct file *src, loff_t len); -int au_sio_cpup_simple(struct au_cp_generic *cpg); -int au_sio_cpdown_simple(struct au_cp_generic *cpg); -int au_sio_cpup_wh(struct au_cp_generic *cpg, struct file *file); - -int au_cp_dirs(struct dentry *dentry, aufs_bindex_t bdst, - int (*cp)(struct dentry *dentry, aufs_bindex_t bdst, - struct au_pin *pin, - struct dentry *h_parent, void *arg), - void *arg); -int au_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst); -int au_test_and_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst); - -/* ---------------------------------------------------------------------- */ - -/* keep timestamps when copyup */ -struct au_dtime { - struct dentry *dt_dentry; - struct path dt_h_path; - struct timespec dt_atime, dt_mtime; -}; -void au_dtime_store(struct au_dtime *dt, struct dentry *dentry, - struct path *h_path); -void au_dtime_revert(struct au_dtime *dt); - -#endif /* __KERNEL__ */ -#endif /* __AUFS_CPUP_H__ */ diff --git a/fs/aufs/dbgaufs.c b/fs/aufs/dbgaufs.c deleted file mode 100644 index df4c65635..000000000 --- a/fs/aufs/dbgaufs.c +++ /dev/null @@ -1,432 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * debugfs interface - */ - -#include <linux/debugfs.h> -#include "aufs.h" - -#ifndef CONFIG_SYSFS -#error DEBUG_FS depends upon SYSFS -#endif - -static struct dentry *dbgaufs; -static const mode_t dbgaufs_mode = S_IRUSR | S_IRGRP | S_IROTH; - -/* 20 is max digits length of ulong 64 */ -struct dbgaufs_arg { - int n; - char a[20 * 4]; -}; - -/* - * common function for all XINO files - */ -static int dbgaufs_xi_release(struct inode *inode __maybe_unused, - struct file *file) -{ - kfree(file->private_data); - return 0; -} - -static int dbgaufs_xi_open(struct file *xf, struct file *file, int do_fcnt) -{ - int err; - struct kstat st; - struct dbgaufs_arg *p; - - err = -ENOMEM; - p = kmalloc(sizeof(*p), GFP_NOFS); - if (unlikely(!p)) - goto out; - - err = 0; - p->n = 0; - file->private_data = p; - if (!xf) - goto out; - - err = vfs_getattr(&xf->f_path, &st); - if (!err) { - if (do_fcnt) - p->n = snprintf - (p->a, sizeof(p->a), "%ld, %llux%lu %lld\n", - (long)file_count(xf), st.blocks, st.blksize, - (long long)st.size); - else - p->n = snprintf(p->a, sizeof(p->a), "%llux%lu %lld\n", - st.blocks, st.blksize, - (long long)st.size); - AuDebugOn(p->n >= sizeof(p->a)); - } else { - p->n = snprintf(p->a, sizeof(p->a), "err %d\n", err); - err = 0; - } - -out: - return err; - -} - -static ssize_t dbgaufs_xi_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) -{ - struct dbgaufs_arg *p; - - p = file->private_data; - return simple_read_from_buffer(buf, count, ppos, p->a, p->n); -} - -/* ---------------------------------------------------------------------- */ - -struct dbgaufs_plink_arg { - int n; - char a[]; -}; - -static int dbgaufs_plink_release(struct inode *inode __maybe_unused, - struct file *file) -{ - free_page((unsigned long)file->private_data); - return 0; -} - -static int dbgaufs_plink_open(struct inode *inode, struct file *file) -{ - int err, i, limit; - unsigned long n, sum; - struct dbgaufs_plink_arg *p; - struct au_sbinfo *sbinfo; - struct super_block *sb; - struct au_sphlhead *sphl; - - err = -ENOMEM; - p = (void *)get_zeroed_page(GFP_NOFS); - if (unlikely(!p)) - goto out; - - err = -EFBIG; - sbinfo = inode->i_private; - sb = sbinfo->si_sb; - si_noflush_read_lock(sb); - if (au_opt_test(au_mntflags(sb), PLINK)) { - limit = PAGE_SIZE - sizeof(p->n); - - /* the number of buckets */ - n = snprintf(p->a + p->n, limit, "%d\n", AuPlink_NHASH); - p->n += n; - limit -= n; - - sum = 0; - for (i = 0, sphl = sbinfo->si_plink; - i < AuPlink_NHASH; - i++, sphl++) { - n = au_sphl_count(sphl); - sum += n; - - n = snprintf(p->a + p->n, limit, "%lu ", n); - p->n += n; - limit -= n; - if (unlikely(limit <= 0)) - goto out_free; - } - p->a[p->n - 1] = '\n'; - - /* the sum of plinks */ - n = snprintf(p->a + p->n, limit, "%lu\n", sum); - p->n += n; - limit -= n; - if (unlikely(limit <= 0)) - goto out_free; - } else { -#define str "1\n0\n0\n" - p->n = sizeof(str) - 1; - strcpy(p->a, str); -#undef str - } - si_read_unlock(sb); - - err = 0; - file->private_data = p; - goto out; /* success */ - -out_free: - free_page((unsigned long)p); -out: - return err; -} - -static ssize_t dbgaufs_plink_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) -{ - struct dbgaufs_plink_arg *p; - - p = file->private_data; - return simple_read_from_buffer(buf, count, ppos, p->a, p->n); -} - -static const struct file_operations dbgaufs_plink_fop = { - .owner = THIS_MODULE, - .open = dbgaufs_plink_open, - .release = dbgaufs_plink_release, - .read = dbgaufs_plink_read -}; - -/* ---------------------------------------------------------------------- */ - -static int dbgaufs_xib_open(struct inode *inode, struct file *file) -{ - int err; - struct au_sbinfo *sbinfo; - struct super_block *sb; - - sbinfo = inode->i_private; - sb = sbinfo->si_sb; - si_noflush_read_lock(sb); - err = dbgaufs_xi_open(sbinfo->si_xib, file, /*do_fcnt*/0); - si_read_unlock(sb); - return err; -} - -static const struct file_operations dbgaufs_xib_fop = { - .owner = THIS_MODULE, - .open = dbgaufs_xib_open, - .release = dbgaufs_xi_release, - .read = dbgaufs_xi_read -}; - -/* ---------------------------------------------------------------------- */ - -#define DbgaufsXi_PREFIX "xi" - -static int dbgaufs_xino_open(struct inode *inode, struct file *file) -{ - int err; - long l; - struct au_sbinfo *sbinfo; - struct super_block *sb; - struct file *xf; - struct qstr *name; - - err = -ENOENT; - xf = NULL; - name = &file->f_path.dentry->d_name; - if (unlikely(name->len < sizeof(DbgaufsXi_PREFIX) - || memcmp(name->name, DbgaufsXi_PREFIX, - sizeof(DbgaufsXi_PREFIX) - 1))) - goto out; - err = kstrtol(name->name + sizeof(DbgaufsXi_PREFIX) - 1, 10, &l); - if (unlikely(err)) - goto out; - - sbinfo = inode->i_private; - sb = sbinfo->si_sb; - si_noflush_read_lock(sb); - if (l <= au_sbend(sb)) { - xf = au_sbr(sb, (aufs_bindex_t)l)->br_xino.xi_file; - err = dbgaufs_xi_open(xf, file, /*do_fcnt*/1); - } else - err = -ENOENT; - si_read_unlock(sb); - -out: - return err; -} - -static const struct file_operations dbgaufs_xino_fop = { - .owner = THIS_MODULE, - .open = dbgaufs_xino_open, - .release = dbgaufs_xi_release, - .read = dbgaufs_xi_read -}; - -void dbgaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex) -{ - aufs_bindex_t bend; - struct au_branch *br; - struct au_xino_file *xi; - - if (!au_sbi(sb)->si_dbgaufs) - return; - - bend = au_sbend(sb); - for (; bindex <= bend; bindex++) { - br = au_sbr(sb, bindex); - xi = &br->br_xino; - debugfs_remove(xi->xi_dbgaufs); - xi->xi_dbgaufs = NULL; - } -} - -void dbgaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex) -{ - struct au_sbinfo *sbinfo; - struct dentry *parent; - struct au_branch *br; - struct au_xino_file *xi; - aufs_bindex_t bend; - char name[sizeof(DbgaufsXi_PREFIX) + 5]; /* "xi" bindex NULL */ - - sbinfo = au_sbi(sb); - parent = sbinfo->si_dbgaufs; - if (!parent) - return; - - bend = au_sbend(sb); - for (; bindex <= bend; bindex++) { - snprintf(name, sizeof(name), DbgaufsXi_PREFIX "%d", bindex); - br = au_sbr(sb, bindex); - xi = &br->br_xino; - AuDebugOn(xi->xi_dbgaufs); - xi->xi_dbgaufs = debugfs_create_file(name, dbgaufs_mode, parent, - sbinfo, &dbgaufs_xino_fop); - /* ignore an error */ - if (unlikely(!xi->xi_dbgaufs)) - AuWarn1("failed %s under debugfs\n", name); - } -} - -/* ---------------------------------------------------------------------- */ - -#ifdef CONFIG_AUFS_EXPORT -static int dbgaufs_xigen_open(struct inode *inode, struct file *file) -{ - int err; - struct au_sbinfo *sbinfo; - struct super_block *sb; - - sbinfo = inode->i_private; - sb = sbinfo->si_sb; - si_noflush_read_lock(sb); - err = dbgaufs_xi_open(sbinfo->si_xigen, file, /*do_fcnt*/0); - si_read_unlock(sb); - return err; -} - -static const struct file_operations dbgaufs_xigen_fop = { - .owner = THIS_MODULE, - .open = dbgaufs_xigen_open, - .release = dbgaufs_xi_release, - .read = dbgaufs_xi_read -}; - -static int dbgaufs_xigen_init(struct au_sbinfo *sbinfo) -{ - int err; - - /* - * This function is a dynamic '__init' function actually, - * so the tiny check for si_rwsem is unnecessary. - */ - /* AuRwMustWriteLock(&sbinfo->si_rwsem); */ - - err = -EIO; - sbinfo->si_dbgaufs_xigen = debugfs_create_file - ("xigen", dbgaufs_mode, sbinfo->si_dbgaufs, sbinfo, - &dbgaufs_xigen_fop); - if (sbinfo->si_dbgaufs_xigen) - err = 0; - - return err; -} -#else -static int dbgaufs_xigen_init(struct au_sbinfo *sbinfo) -{ - return 0; -} -#endif /* CONFIG_AUFS_EXPORT */ - -/* ---------------------------------------------------------------------- */ - -void dbgaufs_si_fin(struct au_sbinfo *sbinfo) -{ - /* - * This function is a dynamic '__fin' function actually, - * so the tiny check for si_rwsem is unnecessary. - */ - /* AuRwMustWriteLock(&sbinfo->si_rwsem); */ - - debugfs_remove_recursive(sbinfo->si_dbgaufs); - sbinfo->si_dbgaufs = NULL; - kobject_put(&sbinfo->si_kobj); -} - -int dbgaufs_si_init(struct au_sbinfo *sbinfo) -{ - int err; - char name[SysaufsSiNameLen]; - - /* - * This function is a dynamic '__init' function actually, - * so the tiny check for si_rwsem is unnecessary. - */ - /* AuRwMustWriteLock(&sbinfo->si_rwsem); */ - - err = -ENOENT; - if (!dbgaufs) { - AuErr1("/debug/aufs is uninitialized\n"); - goto out; - } - - err = -EIO; - sysaufs_name(sbinfo, name); - sbinfo->si_dbgaufs = debugfs_create_dir(name, dbgaufs); - if (unlikely(!sbinfo->si_dbgaufs)) - goto out; - kobject_get(&sbinfo->si_kobj); - - sbinfo->si_dbgaufs_xib = debugfs_create_file - ("xib", dbgaufs_mode, sbinfo->si_dbgaufs, sbinfo, - &dbgaufs_xib_fop); - if (unlikely(!sbinfo->si_dbgaufs_xib)) - goto out_dir; - - sbinfo->si_dbgaufs_plink = debugfs_create_file - ("plink", dbgaufs_mode, sbinfo->si_dbgaufs, sbinfo, - &dbgaufs_plink_fop); - if (unlikely(!sbinfo->si_dbgaufs_plink)) - goto out_dir; - - err = dbgaufs_xigen_init(sbinfo); - if (!err) - goto out; /* success */ - -out_dir: - dbgaufs_si_fin(sbinfo); -out: - return err; -} - -/* ---------------------------------------------------------------------- */ - -void dbgaufs_fin(void) -{ - debugfs_remove(dbgaufs); -} - -int __init dbgaufs_init(void) -{ - int err; - - err = -EIO; - dbgaufs = debugfs_create_dir(AUFS_NAME, NULL); - if (dbgaufs) - err = 0; - return err; -} diff --git a/fs/aufs/dbgaufs.h b/fs/aufs/dbgaufs.h deleted file mode 100644 index 03d47b80b..000000000 --- a/fs/aufs/dbgaufs.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * debugfs interface - */ - -#ifndef __DBGAUFS_H__ -#define __DBGAUFS_H__ - -#ifdef __KERNEL__ - -struct super_block; -struct au_sbinfo; - -#ifdef CONFIG_DEBUG_FS -/* dbgaufs.c */ -void dbgaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex); -void dbgaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex); -void dbgaufs_si_fin(struct au_sbinfo *sbinfo); -int dbgaufs_si_init(struct au_sbinfo *sbinfo); -void dbgaufs_fin(void); -int __init dbgaufs_init(void); -#else -AuStubVoid(dbgaufs_brs_del, struct super_block *sb, aufs_bindex_t bindex) -AuStubVoid(dbgaufs_brs_add, struct super_block *sb, aufs_bindex_t bindex) -AuStubVoid(dbgaufs_si_fin, struct au_sbinfo *sbinfo) -AuStubInt0(dbgaufs_si_init, struct au_sbinfo *sbinfo) -AuStubVoid(dbgaufs_fin, void) -AuStubInt0(__init dbgaufs_init, void) -#endif /* CONFIG_DEBUG_FS */ - -#endif /* __KERNEL__ */ -#endif /* __DBGAUFS_H__ */ diff --git a/fs/aufs/dcsub.c b/fs/aufs/dcsub.c deleted file mode 100644 index 910a2e870..000000000 --- a/fs/aufs/dcsub.c +++ /dev/null @@ -1,224 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * sub-routines for dentry cache - */ - -#include "aufs.h" - -static void au_dpage_free(struct au_dpage *dpage) -{ - int i; - struct dentry **p; - - p = dpage->dentries; - for (i = 0; i < dpage->ndentry; i++) - dput(*p++); - free_page((unsigned long)dpage->dentries); -} - -int au_dpages_init(struct au_dcsub_pages *dpages, gfp_t gfp) -{ - int err; - void *p; - - err = -ENOMEM; - dpages->dpages = kmalloc(sizeof(*dpages->dpages), gfp); - if (unlikely(!dpages->dpages)) - goto out; - - p = (void *)__get_free_page(gfp); - if (unlikely(!p)) - goto out_dpages; - - dpages->dpages[0].ndentry = 0; - dpages->dpages[0].dentries = p; - dpages->ndpage = 1; - return 0; /* success */ - -out_dpages: - kfree(dpages->dpages); -out: - return err; -} - -void au_dpages_free(struct au_dcsub_pages *dpages) -{ - int i; - struct au_dpage *p; - - p = dpages->dpages; - for (i = 0; i < dpages->ndpage; i++) - au_dpage_free(p++); - kfree(dpages->dpages); -} - -static int au_dpages_append(struct au_dcsub_pages *dpages, - struct dentry *dentry, gfp_t gfp) -{ - int err, sz; - struct au_dpage *dpage; - void *p; - - dpage = dpages->dpages + dpages->ndpage - 1; - sz = PAGE_SIZE / sizeof(dentry); - if (unlikely(dpage->ndentry >= sz)) { - AuLabel(new dpage); - err = -ENOMEM; - sz = dpages->ndpage * sizeof(*dpages->dpages); - p = au_kzrealloc(dpages->dpages, sz, - sz + sizeof(*dpages->dpages), gfp); - if (unlikely(!p)) - goto out; - - dpages->dpages = p; - dpage = dpages->dpages + dpages->ndpage; - p = (void *)__get_free_page(gfp); - if (unlikely(!p)) - goto out; - - dpage->ndentry = 0; - dpage->dentries = p; - dpages->ndpage++; - } - - AuDebugOn(au_dcount(dentry) <= 0); - dpage->dentries[dpage->ndentry++] = dget_dlock(dentry); - return 0; /* success */ - -out: - return err; -} - -/* todo: BAD approach */ -/* copied from linux/fs/dcache.c */ -enum d_walk_ret { - D_WALK_CONTINUE, - D_WALK_QUIT, - D_WALK_NORETRY, - D_WALK_SKIP, -}; - -extern void d_walk(struct dentry *parent, void *data, - enum d_walk_ret (*enter)(void *, struct dentry *), - void (*finish)(void *)); - -struct ac_dpages_arg { - int err; - struct au_dcsub_pages *dpages; - struct super_block *sb; - au_dpages_test test; - void *arg; -}; - -static enum d_walk_ret au_call_dpages_append(void *_arg, struct dentry *dentry) -{ - enum d_walk_ret ret; - struct ac_dpages_arg *arg = _arg; - - ret = D_WALK_CONTINUE; - if (dentry->d_sb == arg->sb - && !IS_ROOT(dentry) - && au_dcount(dentry) > 0 - && au_di(dentry) - && (!arg->test || arg->test(dentry, arg->arg))) { - arg->err = au_dpages_append(arg->dpages, dentry, GFP_ATOMIC); - if (unlikely(arg->err)) - ret = D_WALK_QUIT; - } - - return ret; -} - -int au_dcsub_pages(struct au_dcsub_pages *dpages, struct dentry *root, - au_dpages_test test, void *arg) -{ - struct ac_dpages_arg args = { - .err = 0, - .dpages = dpages, - .sb = root->d_sb, - .test = test, - .arg = arg - }; - - d_walk(root, &args, au_call_dpages_append, NULL); - - return args.err; -} - -int au_dcsub_pages_rev(struct au_dcsub_pages *dpages, struct dentry *dentry, - int do_include, au_dpages_test test, void *arg) -{ - int err; - - err = 0; - write_seqlock(&rename_lock); - spin_lock(&dentry->d_lock); - if (do_include - && au_dcount(dentry) > 0 - && (!test || test(dentry, arg))) - err = au_dpages_append(dpages, dentry, GFP_ATOMIC); - spin_unlock(&dentry->d_lock); - if (unlikely(err)) - goto out; - - /* - * RCU for vfsmount is unnecessary since this is a traverse in a single - * mount - */ - while (!IS_ROOT(dentry)) { - dentry = dentry->d_parent; /* rename_lock is locked */ - spin_lock(&dentry->d_lock); - if (au_dcount(dentry) > 0 - && (!test || test(dentry, arg))) - err = au_dpages_append(dpages, dentry, GFP_ATOMIC); - spin_unlock(&dentry->d_lock); - if (unlikely(err)) - break; - } - -out: - write_sequnlock(&rename_lock); - return err; -} - -static inline int au_dcsub_dpages_aufs(struct dentry *dentry, void *arg) -{ - return au_di(dentry) && dentry->d_sb == arg; -} - -int au_dcsub_pages_rev_aufs(struct au_dcsub_pages *dpages, - struct dentry *dentry, int do_include) -{ - return au_dcsub_pages_rev(dpages, dentry, do_include, - au_dcsub_dpages_aufs, dentry->d_sb); -} - -int au_test_subdir(struct dentry *d1, struct dentry *d2) -{ - struct path path[2] = { - { - .dentry = d1 - }, - { - .dentry = d2 - } - }; - - return path_is_under(path + 0, path + 1); -} diff --git a/fs/aufs/dcsub.h b/fs/aufs/dcsub.h deleted file mode 100644 index d372325b2..000000000 --- a/fs/aufs/dcsub.h +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * sub-routines for dentry cache - */ - -#ifndef __AUFS_DCSUB_H__ -#define __AUFS_DCSUB_H__ - -#ifdef __KERNEL__ - -#include <linux/dcache.h> -#include <linux/fs.h> - -struct au_dpage { - int ndentry; - struct dentry **dentries; -}; - -struct au_dcsub_pages { - int ndpage; - struct au_dpage *dpages; -}; - -/* ---------------------------------------------------------------------- */ - -/* dcsub.c */ -int au_dpages_init(struct au_dcsub_pages *dpages, gfp_t gfp); -void au_dpages_free(struct au_dcsub_pages *dpages); -typedef int (*au_dpages_test)(struct dentry *dentry, void *arg); -int au_dcsub_pages(struct au_dcsub_pages *dpages, struct dentry *root, - au_dpages_test test, void *arg); -int au_dcsub_pages_rev(struct au_dcsub_pages *dpages, struct dentry *dentry, - int do_include, au_dpages_test test, void *arg); -int au_dcsub_pages_rev_aufs(struct au_dcsub_pages *dpages, - struct dentry *dentry, int do_include); -int au_test_subdir(struct dentry *d1, struct dentry *d2); - -/* ---------------------------------------------------------------------- */ - -/* - * todo: in linux-3.13, several similar (but faster) helpers are added to - * include/linux/dcache.h. Try them (in the future). - */ - -static inline int au_d_hashed_positive(struct dentry *d) -{ - int err; - struct inode *inode = d_inode(d); - - err = 0; - if (unlikely(d_unhashed(d) - || d_is_negative(d) - || !inode->i_nlink)) - err = -ENOENT; - return err; -} - -static inline int au_d_linkable(struct dentry *d) -{ - int err; - struct inode *inode = d_inode(d); - - err = au_d_hashed_positive(d); - if (err - && d_is_positive(d) - && (inode->i_state & I_LINKABLE)) - err = 0; - return err; -} - -static inline int au_d_alive(struct dentry *d) -{ - int err; - struct inode *inode; - - err = 0; - if (!IS_ROOT(d)) - err = au_d_hashed_positive(d); - else { - inode = d_inode(d); - if (unlikely(d_unlinked(d) - || d_is_negative(d) - || !inode->i_nlink)) - err = -ENOENT; - } - return err; -} - -static inline int au_alive_dir(struct dentry *d) -{ - int err; - - err = au_d_alive(d); - if (unlikely(err || IS_DEADDIR(d_inode(d)))) - err = -ENOENT; - return err; -} - -static inline int au_qstreq(struct qstr *a, struct qstr *b) -{ - return a->len == b->len - && !memcmp(a->name, b->name, a->len); -} - -/* - * by the commit - * 360f547 2015-01-25 dcache: let the dentry count go down to zero without - * taking d_lock - * the type of d_lockref.count became int, but the inlined function d_count() - * still returns unsigned int. - * I don't know why. Maybe it is for every d_count() users? - * Anyway au_dcount() lives on. - */ -static inline int au_dcount(struct dentry *d) -{ - return (int)d_count(d); -} - -#endif /* __KERNEL__ */ -#endif /* __AUFS_DCSUB_H__ */ diff --git a/fs/aufs/debug.c b/fs/aufs/debug.c deleted file mode 100644 index a1154d6de..000000000 --- a/fs/aufs/debug.c +++ /dev/null @@ -1,440 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * debug print functions - */ - -#include "aufs.h" - -/* Returns 0, or -errno. arg is in kp->arg. */ -static int param_atomic_t_set(const char *val, const struct kernel_param *kp) -{ - int err, n; - - err = kstrtoint(val, 0, &n); - if (!err) { - if (n > 0) - au_debug_on(); - else - au_debug_off(); - } - return err; -} - -/* Returns length written or -errno. Buffer is 4k (ie. be short!) */ -static int param_atomic_t_get(char *buffer, const struct kernel_param *kp) -{ - atomic_t *a; - - a = kp->arg; - return sprintf(buffer, "%d", atomic_read(a)); -} - -static struct kernel_param_ops param_ops_atomic_t = { - .set = param_atomic_t_set, - .get = param_atomic_t_get - /* void (*free)(void *arg) */ -}; - -atomic_t aufs_debug = ATOMIC_INIT(0); -MODULE_PARM_DESC(debug, "debug print"); -module_param_named(debug, aufs_debug, atomic_t, S_IRUGO | S_IWUSR | S_IWGRP); - -DEFINE_MUTEX(au_dbg_mtx); /* just to serialize the dbg msgs */ -char *au_plevel = KERN_DEBUG; -#define dpri(fmt, ...) do { \ - if ((au_plevel \ - && strcmp(au_plevel, KERN_DEBUG)) \ - || au_debug_test()) \ - printk("%s" fmt, au_plevel, ##__VA_ARGS__); \ -} while (0) - -/* ---------------------------------------------------------------------- */ - -void au_dpri_whlist(struct au_nhash *whlist) -{ - unsigned long ul, n; - struct hlist_head *head; - struct au_vdir_wh *pos; - - n = whlist->nh_num; - head = whlist->nh_head; - for (ul = 0; ul < n; ul++) { - hlist_for_each_entry(pos, head, wh_hash) - dpri("b%d, %.*s, %d\n", - pos->wh_bindex, - pos->wh_str.len, pos->wh_str.name, - pos->wh_str.len); - head++; - } -} - -void au_dpri_vdir(struct au_vdir *vdir) -{ - unsigned long ul; - union au_vdir_deblk_p p; - unsigned char *o; - - if (!vdir || IS_ERR(vdir)) { - dpri("err %ld\n", PTR_ERR(vdir)); - return; - } - - dpri("deblk %u, nblk %lu, deblk %p, last{%lu, %p}, ver %lu\n", - vdir->vd_deblk_sz, vdir->vd_nblk, vdir->vd_deblk, - vdir->vd_last.ul, vdir->vd_last.p.deblk, vdir->vd_version); - for (ul = 0; ul < vdir->vd_nblk; ul++) { - p.deblk = vdir->vd_deblk[ul]; - o = p.deblk; - dpri("[%lu]: %p\n", ul, o); - } -} - -static int do_pri_inode(aufs_bindex_t bindex, struct inode *inode, int hn, - struct dentry *wh) -{ - char *n = NULL; - int l = 0; - - if (!inode || IS_ERR(inode)) { - dpri("i%d: err %ld\n", bindex, PTR_ERR(inode)); - return -1; - } - - /* the type of i_blocks depends upon CONFIG_LBDAF */ - BUILD_BUG_ON(sizeof(inode->i_blocks) != sizeof(unsigned long) - && sizeof(inode->i_blocks) != sizeof(u64)); - if (wh) { - n = (void *)wh->d_name.name; - l = wh->d_name.len; - } - - dpri("i%d: %p, i%lu, %s, cnt %d, nl %u, 0%o, sz %llu, blk %llu," - " hn %d, ct %lld, np %lu, st 0x%lx, f 0x%x, v %llu, g %x%s%.*s\n", - bindex, inode, - inode->i_ino, inode->i_sb ? au_sbtype(inode->i_sb) : "??", - atomic_read(&inode->i_count), inode->i_nlink, inode->i_mode, - i_size_read(inode), (unsigned long long)inode->i_blocks, - hn, (long long)timespec_to_ns(&inode->i_ctime) & 0x0ffff, - inode->i_mapping ? inode->i_mapping->nrpages : 0, - inode->i_state, inode->i_flags, inode->i_version, - inode->i_generation, - l ? ", wh " : "", l, n); - return 0; -} - -void au_dpri_inode(struct inode *inode) -{ - struct au_iinfo *iinfo; - aufs_bindex_t bindex; - int err, hn; - - err = do_pri_inode(-1, inode, -1, NULL); - if (err || !au_test_aufs(inode->i_sb)) - return; - - iinfo = au_ii(inode); - if (!iinfo) - return; - dpri("i-1: bstart %d, bend %d, gen %d\n", - iinfo->ii_bstart, iinfo->ii_bend, au_iigen(inode, NULL)); - if (iinfo->ii_bstart < 0) - return; - hn = 0; - for (bindex = iinfo->ii_bstart; bindex <= iinfo->ii_bend; bindex++) { - hn = !!au_hn(iinfo->ii_hinode + bindex); - do_pri_inode(bindex, iinfo->ii_hinode[0 + bindex].hi_inode, hn, - iinfo->ii_hinode[0 + bindex].hi_whdentry); - } -} - -void au_dpri_dalias(struct inode *inode) -{ - struct dentry *d; - - spin_lock(&inode->i_lock); - hlist_for_each_entry(d, &inode->i_dentry, d_u.d_alias) - au_dpri_dentry(d); - spin_unlock(&inode->i_lock); -} - -static int do_pri_dentry(aufs_bindex_t bindex, struct dentry *dentry) -{ - struct dentry *wh = NULL; - int hn; - struct au_iinfo *iinfo; - - if (!dentry || IS_ERR(dentry)) { - dpri("d%d: err %ld\n", bindex, PTR_ERR(dentry)); - return -1; - } - /* do not call dget_parent() here */ - /* note: access d_xxx without d_lock */ - dpri("d%d: %p, %pd2?, %s, cnt %d, flags 0x%x, %shashed\n", - bindex, dentry, dentry, - dentry->d_sb ? au_sbtype(dentry->d_sb) : "??", - au_dcount(dentry), dentry->d_flags, - d_unhashed(dentry) ? "un" : ""); - hn = -1; - if (bindex >= 0 - && d_is_positive(dentry) - && au_test_aufs(dentry->d_sb)) { - iinfo = au_ii(d_inode(dentry)); - if (iinfo) { - hn = !!au_hn(iinfo->ii_hinode + bindex); - wh = iinfo->ii_hinode[0 + bindex].hi_whdentry; - } - } - do_pri_inode(bindex, d_inode(dentry), hn, wh); - return 0; -} - -void au_dpri_dentry(struct dentry *dentry) -{ - struct au_dinfo *dinfo; - aufs_bindex_t bindex; - int err; - struct au_hdentry *hdp; - - err = do_pri_dentry(-1, dentry); - if (err || !au_test_aufs(dentry->d_sb)) - return; - - dinfo = au_di(dentry); - if (!dinfo) - return; - dpri("d-1: bstart %d, bend %d, bwh %d, bdiropq %d, gen %d, tmp %d\n", - dinfo->di_bstart, dinfo->di_bend, - dinfo->di_bwh, dinfo->di_bdiropq, au_digen(dentry), - dinfo->di_tmpfile); - if (dinfo->di_bstart < 0) - return; - hdp = dinfo->di_hdentry; - for (bindex = dinfo->di_bstart; bindex <= dinfo->di_bend; bindex++) - do_pri_dentry(bindex, hdp[0 + bindex].hd_dentry); -} - -static int do_pri_file(aufs_bindex_t bindex, struct file *file) -{ - char a[32]; - - if (!file || IS_ERR(file)) { - dpri("f%d: err %ld\n", bindex, PTR_ERR(file)); - return -1; - } - a[0] = 0; - if (bindex < 0 - && !IS_ERR_OR_NULL(file->f_path.dentry) - && au_test_aufs(file->f_path.dentry->d_sb) - && au_fi(file)) - snprintf(a, sizeof(a), ", gen %d, mmapped %d", - au_figen(file), atomic_read(&au_fi(file)->fi_mmapped)); - dpri("f%d: mode 0x%x, flags 0%o, cnt %ld, v %llu, pos %llu%s\n", - bindex, file->f_mode, file->f_flags, (long)file_count(file), - file->f_version, file->f_pos, a); - if (!IS_ERR_OR_NULL(file->f_path.dentry)) - do_pri_dentry(bindex, file->f_path.dentry); - return 0; -} - -void au_dpri_file(struct file *file) -{ - struct au_finfo *finfo; - struct au_fidir *fidir; - struct au_hfile *hfile; - aufs_bindex_t bindex; - int err; - - err = do_pri_file(-1, file); - if (err - || IS_ERR_OR_NULL(file->f_path.dentry) - || !au_test_aufs(file->f_path.dentry->d_sb)) - return; - - finfo = au_fi(file); - if (!finfo) - return; - if (finfo->fi_btop < 0) - return; - fidir = finfo->fi_hdir; - if (!fidir) - do_pri_file(finfo->fi_btop, finfo->fi_htop.hf_file); - else - for (bindex = finfo->fi_btop; - bindex >= 0 && bindex <= fidir->fd_bbot; - bindex++) { - hfile = fidir->fd_hfile + bindex; - do_pri_file(bindex, hfile ? hfile->hf_file : NULL); - } -} - -static int do_pri_br(aufs_bindex_t bindex, struct au_branch *br) -{ - struct vfsmount *mnt; - struct super_block *sb; - - if (!br || IS_ERR(br)) - goto out; - mnt = au_br_mnt(br); - if (!mnt || IS_ERR(mnt)) - goto out; - sb = mnt->mnt_sb; - if (!sb || IS_ERR(sb)) - goto out; - - dpri("s%d: {perm 0x%x, id %d, cnt %d, wbr %p}, " - "%s, dev 0x%02x%02x, flags 0x%lx, cnt %d, active %d, " - "xino %d\n", - bindex, br->br_perm, br->br_id, atomic_read(&br->br_count), - br->br_wbr, au_sbtype(sb), MAJOR(sb->s_dev), MINOR(sb->s_dev), - sb->s_flags, sb->s_count, - atomic_read(&sb->s_active), !!br->br_xino.xi_file); - return 0; - -out: - dpri("s%d: err %ld\n", bindex, PTR_ERR(br)); - return -1; -} - -void au_dpri_sb(struct super_block *sb) -{ - struct au_sbinfo *sbinfo; - aufs_bindex_t bindex; - int err; - /* to reuduce stack size */ - struct { - struct vfsmount mnt; - struct au_branch fake; - } *a; - - /* this function can be called from magic sysrq */ - a = kzalloc(sizeof(*a), GFP_ATOMIC); - if (unlikely(!a)) { - dpri("no memory\n"); - return; - } - - a->mnt.mnt_sb = sb; - a->fake.br_perm = 0; - a->fake.br_path.mnt = &a->mnt; - a->fake.br_xino.xi_file = NULL; - atomic_set(&a->fake.br_count, 0); - smp_mb(); /* atomic_set */ - err = do_pri_br(-1, &a->fake); - kfree(a); - dpri("dev 0x%x\n", sb->s_dev); - if (err || !au_test_aufs(sb)) - return; - - sbinfo = au_sbi(sb); - if (!sbinfo) - return; - dpri("nw %d, gen %u, kobj %d\n", - atomic_read(&sbinfo->si_nowait.nw_len), sbinfo->si_generation, - atomic_read(&sbinfo->si_kobj.kref.refcount)); - for (bindex = 0; bindex <= sbinfo->si_bend; bindex++) - do_pri_br(bindex, sbinfo->si_branch[0 + bindex]); -} - -/* ---------------------------------------------------------------------- */ - -void __au_dbg_verify_dinode(struct dentry *dentry, const char *func, int line) -{ - struct inode *h_inode, *inode = d_inode(dentry); - struct dentry *h_dentry; - aufs_bindex_t bindex, bend, bi; - - if (!inode /* || au_di(dentry)->di_lsc == AuLsc_DI_TMP */) - return; - - bend = au_dbend(dentry); - bi = au_ibend(inode); - if (bi < bend) - bend = bi; - bindex = au_dbstart(dentry); - bi = au_ibstart(inode); - if (bi > bindex) - bindex = bi; - - for (; bindex <= bend; bindex++) { - h_dentry = au_h_dptr(dentry, bindex); - if (!h_dentry) - continue; - h_inode = au_h_iptr(inode, bindex); - if (unlikely(h_inode != d_inode(h_dentry))) { - au_debug_on(); - AuDbg("b%d, %s:%d\n", bindex, func, line); - AuDbgDentry(dentry); - AuDbgInode(inode); - au_debug_off(); - BUG(); - } - } -} - -void au_dbg_verify_gen(struct dentry *parent, unsigned int sigen) -{ - int err, i, j; - struct au_dcsub_pages dpages; - struct au_dpage *dpage; - struct dentry **dentries; - - err = au_dpages_init(&dpages, GFP_NOFS); - AuDebugOn(err); - err = au_dcsub_pages_rev_aufs(&dpages, parent, /*do_include*/1); - AuDebugOn(err); - for (i = dpages.ndpage - 1; !err && i >= 0; i--) { - dpage = dpages.dpages + i; - dentries = dpage->dentries; - for (j = dpage->ndentry - 1; !err && j >= 0; j--) - AuDebugOn(au_digen_test(dentries[j], sigen)); - } - au_dpages_free(&dpages); -} - -void au_dbg_verify_kthread(void) -{ - if (au_wkq_test()) { - au_dbg_blocked(); - /* - * It may be recursive, but udba=notify between two aufs mounts, - * where a single ro branch is shared, is not a problem. - */ - /* WARN_ON(1); */ - } -} - -/* ---------------------------------------------------------------------- */ - -int __init au_debug_init(void) -{ - aufs_bindex_t bindex; - struct au_vdir_destr destr; - - bindex = -1; - AuDebugOn(bindex >= 0); - - destr.len = -1; - AuDebugOn(destr.len < NAME_MAX); - -#ifdef CONFIG_4KSTACKS - pr_warn("CONFIG_4KSTACKS is defined.\n"); -#endif - - return 0; -} diff --git a/fs/aufs/debug.h b/fs/aufs/debug.h deleted file mode 100644 index de6c2a682..000000000 --- a/fs/aufs/debug.h +++ /dev/null @@ -1,225 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * debug print functions - */ - -#ifndef __AUFS_DEBUG_H__ -#define __AUFS_DEBUG_H__ - -#ifdef __KERNEL__ - -#include <linux/atomic.h> -#include <linux/module.h> -#include <linux/kallsyms.h> -#include <linux/sysrq.h> - -#ifdef CONFIG_AUFS_DEBUG -#define AuDebugOn(a) BUG_ON(a) - -/* module parameter */ -extern atomic_t aufs_debug; -static inline void au_debug_on(void) -{ - atomic_inc(&aufs_debug); -} -static inline void au_debug_off(void) -{ - atomic_dec_if_positive(&aufs_debug); -} - -static inline int au_debug_test(void) -{ - return atomic_read(&aufs_debug) > 0; -} -#else -#define AuDebugOn(a) do {} while (0) -AuStubVoid(au_debug_on, void) -AuStubVoid(au_debug_off, void) -AuStubInt0(au_debug_test, void) -#endif /* CONFIG_AUFS_DEBUG */ - -#define param_check_atomic_t(name, p) __param_check(name, p, atomic_t) - -/* ---------------------------------------------------------------------- */ - -/* debug print */ - -#define AuDbg(fmt, ...) do { \ - if (au_debug_test()) \ - pr_debug("DEBUG: " fmt, ##__VA_ARGS__); \ -} while (0) -#define AuLabel(l) AuDbg(#l "\n") -#define AuIOErr(fmt, ...) pr_err("I/O Error, " fmt, ##__VA_ARGS__) -#define AuWarn1(fmt, ...) do { \ - static unsigned char _c; \ - if (!_c++) \ - pr_warn(fmt, ##__VA_ARGS__); \ -} while (0) - -#define AuErr1(fmt, ...) do { \ - static unsigned char _c; \ - if (!_c++) \ - pr_err(fmt, ##__VA_ARGS__); \ -} while (0) - -#define AuIOErr1(fmt, ...) do { \ - static unsigned char _c; \ - if (!_c++) \ - AuIOErr(fmt, ##__VA_ARGS__); \ -} while (0) - -#define AuUnsupportMsg "This operation is not supported." \ - " Please report this application to aufs-users ML." -#define AuUnsupport(fmt, ...) do { \ - pr_err(AuUnsupportMsg "\n" fmt, ##__VA_ARGS__); \ - dump_stack(); \ -} while (0) - -#define AuTraceErr(e) do { \ - if (unlikely((e) < 0)) \ - AuDbg("err %d\n", (int)(e)); \ -} while (0) - -#define AuTraceErrPtr(p) do { \ - if (IS_ERR(p)) \ - AuDbg("err %ld\n", PTR_ERR(p)); \ -} while (0) - -/* dirty macros for debug print, use with "%.*s" and caution */ -#define AuLNPair(qstr) (qstr)->len, (qstr)->name - -/* ---------------------------------------------------------------------- */ - -struct dentry; -#ifdef CONFIG_AUFS_DEBUG -extern struct mutex au_dbg_mtx; -extern char *au_plevel; -struct au_nhash; -void au_dpri_whlist(struct au_nhash *whlist); -struct au_vdir; -void au_dpri_vdir(struct au_vdir *vdir); -struct inode; -void au_dpri_inode(struct inode *inode); -void au_dpri_dalias(struct inode *inode); -void au_dpri_dentry(struct dentry *dentry); -struct file; -void au_dpri_file(struct file *filp); -struct super_block; -void au_dpri_sb(struct super_block *sb); - -#define au_dbg_verify_dinode(d) __au_dbg_verify_dinode(d, __func__, __LINE__) -void __au_dbg_verify_dinode(struct dentry *dentry, const char *func, int line); -void au_dbg_verify_gen(struct dentry *parent, unsigned int sigen); -void au_dbg_verify_kthread(void); - -int __init au_debug_init(void); - -#define AuDbgWhlist(w) do { \ - mutex_lock(&au_dbg_mtx); \ - AuDbg(#w "\n"); \ - au_dpri_whlist(w); \ - mutex_unlock(&au_dbg_mtx); \ -} while (0) - -#define AuDbgVdir(v) do { \ - mutex_lock(&au_dbg_mtx); \ - AuDbg(#v "\n"); \ - au_dpri_vdir(v); \ - mutex_unlock(&au_dbg_mtx); \ -} while (0) - -#define AuDbgInode(i) do { \ - mutex_lock(&au_dbg_mtx); \ - AuDbg(#i "\n"); \ - au_dpri_inode(i); \ - mutex_unlock(&au_dbg_mtx); \ -} while (0) - -#define AuDbgDAlias(i) do { \ - mutex_lock(&au_dbg_mtx); \ - AuDbg(#i "\n"); \ - au_dpri_dalias(i); \ - mutex_unlock(&au_dbg_mtx); \ -} while (0) - -#define AuDbgDentry(d) do { \ - mutex_lock(&au_dbg_mtx); \ - AuDbg(#d "\n"); \ - au_dpri_dentry(d); \ - mutex_unlock(&au_dbg_mtx); \ -} while (0) - -#define AuDbgFile(f) do { \ - mutex_lock(&au_dbg_mtx); \ - AuDbg(#f "\n"); \ - au_dpri_file(f); \ - mutex_unlock(&au_dbg_mtx); \ -} while (0) - -#define AuDbgSb(sb) do { \ - mutex_lock(&au_dbg_mtx); \ - AuDbg(#sb "\n"); \ - au_dpri_sb(sb); \ - mutex_unlock(&au_dbg_mtx); \ -} while (0) - -#define AuDbgSym(addr) do { \ - char sym[KSYM_SYMBOL_LEN]; \ - sprint_symbol(sym, (unsigned long)addr); \ - AuDbg("%s\n", sym); \ -} while (0) -#else -AuStubVoid(au_dbg_verify_dinode, struct dentry *dentry) -AuStubVoid(au_dbg_verify_gen, struct dentry *parent, unsigned int sigen) -AuStubVoid(au_dbg_verify_kthread, void) -AuStubInt0(__init au_debug_init, void) - -#define AuDbgWhlist(w) do {} while (0) -#define AuDbgVdir(v) do {} while (0) -#define AuDbgInode(i) do {} while (0) -#define AuDbgDAlias(i) do {} while (0) -#define AuDbgDentry(d) do {} while (0) -#define AuDbgFile(f) do {} while (0) -#define AuDbgSb(sb) do {} while (0) -#define AuDbgSym(addr) do {} while (0) -#endif /* CONFIG_AUFS_DEBUG */ - -/* ---------------------------------------------------------------------- */ - -#ifdef CONFIG_AUFS_MAGIC_SYSRQ -int __init au_sysrq_init(void); -void au_sysrq_fin(void); - -#ifdef CONFIG_HW_CONSOLE -#define au_dbg_blocked() do { \ - WARN_ON(1); \ - handle_sysrq('w'); \ -} while (0) -#else -AuStubVoid(au_dbg_blocked, void) -#endif - -#else -AuStubInt0(__init au_sysrq_init, void) -AuStubVoid(au_sysrq_fin, void) -AuStubVoid(au_dbg_blocked, void) -#endif /* CONFIG_AUFS_MAGIC_SYSRQ */ - -#endif /* __KERNEL__ */ -#endif /* __AUFS_DEBUG_H__ */ diff --git a/fs/aufs/dentry.c b/fs/aufs/dentry.c deleted file mode 100644 index 1191a6c93..000000000 --- a/fs/aufs/dentry.c +++ /dev/null @@ -1,1105 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * lookup and dentry operations - */ - -#include <linux/namei.h> -#include "aufs.h" - -#define AuLkup_ALLOW_NEG 1 -#define AuLkup_IGNORE_PERM (1 << 1) -#define au_ftest_lkup(flags, name) ((flags) & AuLkup_##name) -#define au_fset_lkup(flags, name) \ - do { (flags) |= AuLkup_##name; } while (0) -#define au_fclr_lkup(flags, name) \ - do { (flags) &= ~AuLkup_##name; } while (0) - -struct au_do_lookup_args { - unsigned int flags; - mode_t type; -}; - -/* - * returns positive/negative dentry, NULL or an error. - * NULL means whiteout-ed or not-found. - */ -static struct dentry* -au_do_lookup(struct dentry *h_parent, struct dentry *dentry, - aufs_bindex_t bindex, struct qstr *wh_name, - struct au_do_lookup_args *args) -{ - struct dentry *h_dentry; - struct inode *h_inode; - struct au_branch *br; - int wh_found, opq; - unsigned char wh_able; - const unsigned char allow_neg = !!au_ftest_lkup(args->flags, ALLOW_NEG); - const unsigned char ignore_perm = !!au_ftest_lkup(args->flags, - IGNORE_PERM); - - wh_found = 0; - br = au_sbr(dentry->d_sb, bindex); - wh_able = !!au_br_whable(br->br_perm); - if (wh_able) - wh_found = au_wh_test(h_parent, wh_name, /*try_sio*/0); - h_dentry = ERR_PTR(wh_found); - if (!wh_found) - goto real_lookup; - if (unlikely(wh_found < 0)) - goto out; - - /* We found a whiteout */ - /* au_set_dbend(dentry, bindex); */ - au_set_dbwh(dentry, bindex); - if (!allow_neg) - return NULL; /* success */ - -real_lookup: - if (!ignore_perm) - h_dentry = vfsub_lkup_one(&dentry->d_name, h_parent); - else - h_dentry = au_sio_lkup_one(&dentry->d_name, h_parent); - if (IS_ERR(h_dentry)) { - if (PTR_ERR(h_dentry) == -ENAMETOOLONG - && !allow_neg) - h_dentry = NULL; - goto out; - } - - h_inode = d_inode(h_dentry); - if (d_is_negative(h_dentry)) { - if (!allow_neg) - goto out_neg; - } else if (wh_found - || (args->type && args->type != (h_inode->i_mode & S_IFMT))) - goto out_neg; - - if (au_dbend(dentry) <= bindex) - au_set_dbend(dentry, bindex); - if (au_dbstart(dentry) < 0 || bindex < au_dbstart(dentry)) - au_set_dbstart(dentry, bindex); - au_set_h_dptr(dentry, bindex, h_dentry); - - if (!d_is_dir(h_dentry) - || !wh_able - || (d_really_is_positive(dentry) && !d_is_dir(dentry))) - goto out; /* success */ - - mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); - opq = au_diropq_test(h_dentry); - mutex_unlock(&h_inode->i_mutex); - if (opq > 0) - au_set_dbdiropq(dentry, bindex); - else if (unlikely(opq < 0)) { - au_set_h_dptr(dentry, bindex, NULL); - h_dentry = ERR_PTR(opq); - } - goto out; - -out_neg: - dput(h_dentry); - h_dentry = NULL; -out: - return h_dentry; -} - -static int au_test_shwh(struct super_block *sb, const struct qstr *name) -{ - if (unlikely(!au_opt_test(au_mntflags(sb), SHWH) - && !strncmp(name->name, AUFS_WH_PFX, AUFS_WH_PFX_LEN))) - return -EPERM; - return 0; -} - -/* - * returns the number of lower positive dentries, - * otherwise an error. - * can be called at unlinking with @type is zero. - */ -int au_lkup_dentry(struct dentry *dentry, aufs_bindex_t bstart, mode_t type) -{ - int npositive, err; - aufs_bindex_t bindex, btail, bdiropq; - unsigned char isdir, dirperm1; - struct qstr whname; - struct au_do_lookup_args args = { - .flags = 0, - .type = type - }; - const struct qstr *name = &dentry->d_name; - struct dentry *parent; - struct super_block *sb; - - sb = dentry->d_sb; - err = au_test_shwh(sb, name); - if (unlikely(err)) - goto out; - - err = au_wh_name_alloc(&whname, name); - if (unlikely(err)) - goto out; - - isdir = !!d_is_dir(dentry); - if (!type) - au_fset_lkup(args.flags, ALLOW_NEG); - dirperm1 = !!au_opt_test(au_mntflags(sb), DIRPERM1); - - npositive = 0; - parent = dget_parent(dentry); - btail = au_dbtaildir(parent); - for (bindex = bstart; bindex <= btail; bindex++) { - struct dentry *h_parent, *h_dentry; - struct inode *h_inode, *h_dir; - - h_dentry = au_h_dptr(dentry, bindex); - if (h_dentry) { - if (d_is_positive(h_dentry)) - npositive++; - if (type != S_IFDIR) - break; - continue; - } - h_parent = au_h_dptr(parent, bindex); - if (!h_parent || !d_is_dir(h_parent)) - continue; - - h_dir = d_inode(h_parent); - mutex_lock_nested(&h_dir->i_mutex, AuLsc_I_PARENT); - h_dentry = au_do_lookup(h_parent, dentry, bindex, &whname, - &args); - mutex_unlock(&h_dir->i_mutex); - err = PTR_ERR(h_dentry); - if (IS_ERR(h_dentry)) - goto out_parent; - if (h_dentry) - au_fclr_lkup(args.flags, ALLOW_NEG); - if (dirperm1) - au_fset_lkup(args.flags, IGNORE_PERM); - - if (au_dbwh(dentry) >= 0) - break; - if (!h_dentry) - continue; - if (d_is_negative(h_dentry)) - continue; - h_inode = d_inode(h_dentry); - npositive++; - if (!args.type) - args.type = h_inode->i_mode & S_IFMT; - if (args.type != S_IFDIR) - break; - else if (isdir) { - /* the type of lower may be different */ - bdiropq = au_dbdiropq(dentry); - if (bdiropq >= 0 && bdiropq <= bindex) - break; - } - } - - if (npositive) { - AuLabel(positive); - au_update_dbstart(dentry); - } - err = npositive; - if (unlikely(!au_opt_test(au_mntflags(sb), UDBA_NONE) - && au_dbstart(dentry) < 0)) { - err = -EIO; - AuIOErr("both of real entry and whiteout found, %pd, err %d\n", - dentry, err); - } - -out_parent: - dput(parent); - kfree(whname.name); -out: - return err; -} - -struct dentry *au_sio_lkup_one(struct qstr *name, struct dentry *parent) -{ - struct dentry *dentry; - int wkq_err; - - if (!au_test_h_perm_sio(d_inode(parent), MAY_EXEC)) - dentry = vfsub_lkup_one(name, parent); - else { - struct vfsub_lkup_one_args args = { - .errp = &dentry, - .name = name, - .parent = parent - }; - - wkq_err = au_wkq_wait(vfsub_call_lkup_one, &args); - if (unlikely(wkq_err)) - dentry = ERR_PTR(wkq_err); - } - - return dentry; -} - -/* - * lookup @dentry on @bindex which should be negative. - */ -int au_lkup_neg(struct dentry *dentry, aufs_bindex_t bindex, int wh) -{ - int err; - struct dentry *parent, *h_parent, *h_dentry; - struct au_branch *br; - - parent = dget_parent(dentry); - h_parent = au_h_dptr(parent, bindex); - br = au_sbr(dentry->d_sb, bindex); - if (wh) - h_dentry = au_whtmp_lkup(h_parent, br, &dentry->d_name); - else - h_dentry = au_sio_lkup_one(&dentry->d_name, h_parent); - err = PTR_ERR(h_dentry); - if (IS_ERR(h_dentry)) - goto out; - if (unlikely(d_is_positive(h_dentry))) { - err = -EIO; - AuIOErr("%pd should be negative on b%d.\n", h_dentry, bindex); - dput(h_dentry); - goto out; - } - - err = 0; - if (bindex < au_dbstart(dentry)) - au_set_dbstart(dentry, bindex); - if (au_dbend(dentry) < bindex) - au_set_dbend(dentry, bindex); - au_set_h_dptr(dentry, bindex, h_dentry); - -out: - dput(parent); - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* subset of struct inode */ -struct au_iattr { - unsigned long i_ino; - /* unsigned int i_nlink; */ - kuid_t i_uid; - kgid_t i_gid; - u64 i_version; -/* - loff_t i_size; - blkcnt_t i_blocks; -*/ - umode_t i_mode; -}; - -static void au_iattr_save(struct au_iattr *ia, struct inode *h_inode) -{ - ia->i_ino = h_inode->i_ino; - /* ia->i_nlink = h_inode->i_nlink; */ - ia->i_uid = h_inode->i_uid; - ia->i_gid = h_inode->i_gid; - ia->i_version = h_inode->i_version; -/* - ia->i_size = h_inode->i_size; - ia->i_blocks = h_inode->i_blocks; -*/ - ia->i_mode = (h_inode->i_mode & S_IFMT); -} - -static int au_iattr_test(struct au_iattr *ia, struct inode *h_inode) -{ - return ia->i_ino != h_inode->i_ino - /* || ia->i_nlink != h_inode->i_nlink */ - || !uid_eq(ia->i_uid, h_inode->i_uid) - || !gid_eq(ia->i_gid, h_inode->i_gid) - || ia->i_version != h_inode->i_version -/* - || ia->i_size != h_inode->i_size - || ia->i_blocks != h_inode->i_blocks -*/ - || ia->i_mode != (h_inode->i_mode & S_IFMT); -} - -static int au_h_verify_dentry(struct dentry *h_dentry, struct dentry *h_parent, - struct au_branch *br) -{ - int err; - struct au_iattr ia; - struct inode *h_inode; - struct dentry *h_d; - struct super_block *h_sb; - - err = 0; - memset(&ia, -1, sizeof(ia)); - h_sb = h_dentry->d_sb; - h_inode = NULL; - if (d_is_positive(h_dentry)) { - h_inode = d_inode(h_dentry); - au_iattr_save(&ia, h_inode); - } else if (au_test_nfs(h_sb) || au_test_fuse(h_sb)) - /* nfs d_revalidate may return 0 for negative dentry */ - /* fuse d_revalidate always return 0 for negative dentry */ - goto out; - - /* main purpose is namei.c:cached_lookup() and d_revalidate */ - h_d = vfsub_lkup_one(&h_dentry->d_name, h_parent); - err = PTR_ERR(h_d); - if (IS_ERR(h_d)) - goto out; - - err = 0; - if (unlikely(h_d != h_dentry - || d_inode(h_d) != h_inode - || (h_inode && au_iattr_test(&ia, h_inode)))) - err = au_busy_or_stale(); - dput(h_d); - -out: - AuTraceErr(err); - return err; -} - -int au_h_verify(struct dentry *h_dentry, unsigned int udba, struct inode *h_dir, - struct dentry *h_parent, struct au_branch *br) -{ - int err; - - err = 0; - if (udba == AuOpt_UDBA_REVAL - && !au_test_fs_remote(h_dentry->d_sb)) { - IMustLock(h_dir); - err = (d_inode(h_dentry->d_parent) != h_dir); - } else if (udba != AuOpt_UDBA_NONE) - err = au_h_verify_dentry(h_dentry, h_parent, br); - - return err; -} - -/* ---------------------------------------------------------------------- */ - -static int au_do_refresh_hdentry(struct dentry *dentry, struct dentry *parent) -{ - int err; - aufs_bindex_t new_bindex, bindex, bend, bwh, bdiropq; - struct au_hdentry tmp, *p, *q; - struct au_dinfo *dinfo; - struct super_block *sb; - - DiMustWriteLock(dentry); - - sb = dentry->d_sb; - dinfo = au_di(dentry); - bend = dinfo->di_bend; - bwh = dinfo->di_bwh; - bdiropq = dinfo->di_bdiropq; - p = dinfo->di_hdentry + dinfo->di_bstart; - for (bindex = dinfo->di_bstart; bindex <= bend; bindex++, p++) { - if (!p->hd_dentry) - continue; - - new_bindex = au_br_index(sb, p->hd_id); - if (new_bindex == bindex) - continue; - - if (dinfo->di_bwh == bindex) - bwh = new_bindex; - if (dinfo->di_bdiropq == bindex) - bdiropq = new_bindex; - if (new_bindex < 0) { - au_hdput(p); - p->hd_dentry = NULL; - continue; - } - - /* swap two lower dentries, and loop again */ - q = dinfo->di_hdentry + new_bindex; - tmp = *q; - *q = *p; - *p = tmp; - if (tmp.hd_dentry) { - bindex--; - p--; - } - } - - dinfo->di_bwh = -1; - if (bwh >= 0 && bwh <= au_sbend(sb) && au_sbr_whable(sb, bwh)) - dinfo->di_bwh = bwh; - - dinfo->di_bdiropq = -1; - if (bdiropq >= 0 - && bdiropq <= au_sbend(sb) - && au_sbr_whable(sb, bdiropq)) - dinfo->di_bdiropq = bdiropq; - - err = -EIO; - dinfo->di_bstart = -1; - dinfo->di_bend = -1; - bend = au_dbend(parent); - p = dinfo->di_hdentry; - for (bindex = 0; bindex <= bend; bindex++, p++) - if (p->hd_dentry) { - dinfo->di_bstart = bindex; - break; - } - - if (dinfo->di_bstart >= 0) { - p = dinfo->di_hdentry + bend; - for (bindex = bend; bindex >= 0; bindex--, p--) - if (p->hd_dentry) { - dinfo->di_bend = bindex; - err = 0; - break; - } - } - - return err; -} - -static void au_do_hide(struct dentry *dentry) -{ - struct inode *inode; - - if (d_really_is_positive(dentry)) { - inode = d_inode(dentry); - if (!d_is_dir(dentry)) { - if (inode->i_nlink && !d_unhashed(dentry)) - drop_nlink(inode); - } else { - clear_nlink(inode); - /* stop next lookup */ - inode->i_flags |= S_DEAD; - } - smp_mb(); /* necessary? */ - } - d_drop(dentry); -} - -static int au_hide_children(struct dentry *parent) -{ - int err, i, j, ndentry; - struct au_dcsub_pages dpages; - struct au_dpage *dpage; - struct dentry *dentry; - - err = au_dpages_init(&dpages, GFP_NOFS); - if (unlikely(err)) - goto out; - err = au_dcsub_pages(&dpages, parent, NULL, NULL); - if (unlikely(err)) - goto out_dpages; - - /* in reverse order */ - for (i = dpages.ndpage - 1; i >= 0; i--) { - dpage = dpages.dpages + i; - ndentry = dpage->ndentry; - for (j = ndentry - 1; j >= 0; j--) { - dentry = dpage->dentries[j]; - if (dentry != parent) - au_do_hide(dentry); - } - } - -out_dpages: - au_dpages_free(&dpages); -out: - return err; -} - -static void au_hide(struct dentry *dentry) -{ - int err; - - AuDbgDentry(dentry); - if (d_is_dir(dentry)) { - /* shrink_dcache_parent(dentry); */ - err = au_hide_children(dentry); - if (unlikely(err)) - AuIOErr("%pd, failed hiding children, ignored %d\n", - dentry, err); - } - au_do_hide(dentry); -} - -/* - * By adding a dirty branch, a cached dentry may be affected in various ways. - * - * a dirty branch is added - * - on the top of layers - * - in the middle of layers - * - to the bottom of layers - * - * on the added branch there exists - * - a whiteout - * - a diropq - * - a same named entry - * + exist - * * negative --> positive - * * positive --> positive - * - type is unchanged - * - type is changed - * + doesn't exist - * * negative --> negative - * * positive --> negative (rejected by au_br_del() for non-dir case) - * - none - */ -static int au_refresh_by_dinfo(struct dentry *dentry, struct au_dinfo *dinfo, - struct au_dinfo *tmp) -{ - int err; - aufs_bindex_t bindex, bend; - struct { - struct dentry *dentry; - struct inode *inode; - mode_t mode; - } orig_h, tmp_h; - struct au_hdentry *hd; - struct inode *inode, *h_inode; - struct dentry *h_dentry; - - err = 0; - AuDebugOn(dinfo->di_bstart < 0); - orig_h.mode = 0; - orig_h.dentry = dinfo->di_hdentry[dinfo->di_bstart].hd_dentry; - orig_h.inode = NULL; - if (d_is_positive(orig_h.dentry)) { - orig_h.inode = d_inode(orig_h.dentry); - orig_h.mode = orig_h.inode->i_mode & S_IFMT; - } - memset(&tmp_h, 0, sizeof(tmp_h)); - if (tmp->di_bstart >= 0) { - tmp_h.dentry = tmp->di_hdentry[tmp->di_bstart].hd_dentry; - tmp_h.inode = NULL; - if (d_is_positive(tmp_h.dentry)) { - tmp_h.inode = d_inode(tmp_h.dentry); - tmp_h.mode = tmp_h.inode->i_mode & S_IFMT; - } - } - - inode = NULL; - if (d_really_is_positive(dentry)) - inode = d_inode(dentry); - if (!orig_h.inode) { - AuDbg("nagative originally\n"); - if (inode) { - au_hide(dentry); - goto out; - } - AuDebugOn(inode); - AuDebugOn(dinfo->di_bstart != dinfo->di_bend); - AuDebugOn(dinfo->di_bdiropq != -1); - - if (!tmp_h.inode) { - AuDbg("negative --> negative\n"); - /* should have only one negative lower */ - if (tmp->di_bstart >= 0 - && tmp->di_bstart < dinfo->di_bstart) { - AuDebugOn(tmp->di_bstart != tmp->di_bend); - AuDebugOn(dinfo->di_bstart != dinfo->di_bend); - au_set_h_dptr(dentry, dinfo->di_bstart, NULL); - au_di_cp(dinfo, tmp); - hd = tmp->di_hdentry + tmp->di_bstart; - au_set_h_dptr(dentry, tmp->di_bstart, - dget(hd->hd_dentry)); - } - au_dbg_verify_dinode(dentry); - } else { - AuDbg("negative --> positive\n"); - /* - * similar to the behaviour of creating with bypassing - * aufs. - * unhash it in order to force an error in the - * succeeding create operation. - * we should not set S_DEAD here. - */ - d_drop(dentry); - /* au_di_swap(tmp, dinfo); */ - au_dbg_verify_dinode(dentry); - } - } else { - AuDbg("positive originally\n"); - /* inode may be NULL */ - AuDebugOn(inode && (inode->i_mode & S_IFMT) != orig_h.mode); - if (!tmp_h.inode) { - AuDbg("positive --> negative\n"); - /* or bypassing aufs */ - au_hide(dentry); - if (tmp->di_bwh >= 0 && tmp->di_bwh <= dinfo->di_bstart) - dinfo->di_bwh = tmp->di_bwh; - if (inode) - err = au_refresh_hinode_self(inode); - au_dbg_verify_dinode(dentry); - } else if (orig_h.mode == tmp_h.mode) { - AuDbg("positive --> positive, same type\n"); - if (!S_ISDIR(orig_h.mode) - && dinfo->di_bstart > tmp->di_bstart) { - /* - * similar to the behaviour of removing and - * creating. - */ - au_hide(dentry); - if (inode) - err = au_refresh_hinode_self(inode); - au_dbg_verify_dinode(dentry); - } else { - /* fill empty slots */ - if (dinfo->di_bstart > tmp->di_bstart) - dinfo->di_bstart = tmp->di_bstart; - if (dinfo->di_bend < tmp->di_bend) - dinfo->di_bend = tmp->di_bend; - dinfo->di_bwh = tmp->di_bwh; - dinfo->di_bdiropq = tmp->di_bdiropq; - hd = tmp->di_hdentry; - bend = dinfo->di_bend; - for (bindex = tmp->di_bstart; bindex <= bend; - bindex++) { - if (au_h_dptr(dentry, bindex)) - continue; - h_dentry = hd[bindex].hd_dentry; - if (!h_dentry) - continue; - AuDebugOn(d_is_negative(h_dentry)); - h_inode = d_inode(h_dentry); - AuDebugOn(orig_h.mode - != (h_inode->i_mode - & S_IFMT)); - au_set_h_dptr(dentry, bindex, - dget(h_dentry)); - } - err = au_refresh_hinode(inode, dentry); - au_dbg_verify_dinode(dentry); - } - } else { - AuDbg("positive --> positive, different type\n"); - /* similar to the behaviour of removing and creating */ - au_hide(dentry); - if (inode) - err = au_refresh_hinode_self(inode); - au_dbg_verify_dinode(dentry); - } - } - -out: - return err; -} - -int au_refresh_dentry(struct dentry *dentry, struct dentry *parent) -{ - int err, ebrange; - unsigned int sigen; - struct au_dinfo *dinfo, *tmp; - struct super_block *sb; - struct inode *inode; - - DiMustWriteLock(dentry); - AuDebugOn(IS_ROOT(dentry)); - AuDebugOn(d_really_is_negative(parent)); - - sb = dentry->d_sb; - sigen = au_sigen(sb); - err = au_digen_test(parent, sigen); - if (unlikely(err)) - goto out; - - dinfo = au_di(dentry); - err = au_di_realloc(dinfo, au_sbend(sb) + 1); - if (unlikely(err)) - goto out; - ebrange = au_dbrange_test(dentry); - if (!ebrange) - ebrange = au_do_refresh_hdentry(dentry, parent); - - if (d_unhashed(dentry) || ebrange /* || dinfo->di_tmpfile */) { - AuDebugOn(au_dbstart(dentry) < 0 && au_dbend(dentry) >= 0); - if (d_really_is_positive(dentry)) { - inode = d_inode(dentry); - err = au_refresh_hinode_self(inode); - } - au_dbg_verify_dinode(dentry); - if (!err) - goto out_dgen; /* success */ - goto out; - } - - /* temporary dinfo */ - AuDbgDentry(dentry); - err = -ENOMEM; - tmp = au_di_alloc(sb, AuLsc_DI_TMP); - if (unlikely(!tmp)) - goto out; - au_di_swap(tmp, dinfo); - /* returns the number of positive dentries */ - /* - * if current working dir is removed, it returns an error. - * but the dentry is legal. - */ - err = au_lkup_dentry(dentry, /*bstart*/0, /*type*/0); - AuDbgDentry(dentry); - au_di_swap(tmp, dinfo); - if (err == -ENOENT) - err = 0; - if (err >= 0) { - /* compare/refresh by dinfo */ - AuDbgDentry(dentry); - err = au_refresh_by_dinfo(dentry, dinfo, tmp); - au_dbg_verify_dinode(dentry); - AuTraceErr(err); - } - au_rw_write_unlock(&tmp->di_rwsem); - au_di_free(tmp); - if (unlikely(err)) - goto out; - -out_dgen: - au_update_digen(dentry); -out: - if (unlikely(err && !(dentry->d_flags & DCACHE_NFSFS_RENAMED))) { - AuIOErr("failed refreshing %pd, %d\n", dentry, err); - AuDbgDentry(dentry); - } - AuTraceErr(err); - return err; -} - -static int au_do_h_d_reval(struct dentry *h_dentry, unsigned int flags, - struct dentry *dentry, aufs_bindex_t bindex) -{ - int err, valid; - - err = 0; - if (!(h_dentry->d_flags & DCACHE_OP_REVALIDATE)) - goto out; - - AuDbg("b%d\n", bindex); - /* - * gave up supporting LOOKUP_CREATE/OPEN for lower fs, - * due to whiteout and branch permission. - */ - flags &= ~(/*LOOKUP_PARENT |*/ LOOKUP_OPEN | LOOKUP_CREATE - | LOOKUP_FOLLOW | LOOKUP_EXCL); - /* it may return tri-state */ - valid = h_dentry->d_op->d_revalidate(h_dentry, flags); - - if (unlikely(valid < 0)) - err = valid; - else if (!valid) - err = -EINVAL; - -out: - AuTraceErr(err); - return err; -} - -/* todo: remove this */ -static int h_d_revalidate(struct dentry *dentry, struct inode *inode, - unsigned int flags, int do_udba) -{ - int err; - umode_t mode, h_mode; - aufs_bindex_t bindex, btail, bstart, ibs, ibe; - unsigned char plus, unhashed, is_root, h_plus, h_nfs, tmpfile; - struct inode *h_inode, *h_cached_inode; - struct dentry *h_dentry; - struct qstr *name, *h_name; - - err = 0; - plus = 0; - mode = 0; - ibs = -1; - ibe = -1; - unhashed = !!d_unhashed(dentry); - is_root = !!IS_ROOT(dentry); - name = &dentry->d_name; - tmpfile = au_di(dentry)->di_tmpfile; - - /* - * Theoretically, REVAL test should be unnecessary in case of - * {FS,I}NOTIFY. - * But {fs,i}notify doesn't fire some necessary events, - * IN_ATTRIB for atime/nlink/pageio - * Let's do REVAL test too. - */ - if (do_udba && inode) { - mode = (inode->i_mode & S_IFMT); - plus = (inode->i_nlink > 0); - ibs = au_ibstart(inode); - ibe = au_ibend(inode); - } - - bstart = au_dbstart(dentry); - btail = bstart; - if (inode && S_ISDIR(inode->i_mode)) - btail = au_dbtaildir(dentry); - for (bindex = bstart; bindex <= btail; bindex++) { - h_dentry = au_h_dptr(dentry, bindex); - if (!h_dentry) - continue; - - AuDbg("b%d, %pd\n", bindex, h_dentry); - h_nfs = !!au_test_nfs(h_dentry->d_sb); - spin_lock(&h_dentry->d_lock); - h_name = &h_dentry->d_name; - if (unlikely(do_udba - && !is_root - && ((!h_nfs - && (unhashed != !!d_unhashed(h_dentry) - || (!tmpfile - && !au_qstreq(name, h_name)) - )) - || (h_nfs - && !(flags & LOOKUP_OPEN) - && (h_dentry->d_flags - & DCACHE_NFSFS_RENAMED))) - )) { - int h_unhashed; - - h_unhashed = d_unhashed(h_dentry); - spin_unlock(&h_dentry->d_lock); - AuDbg("unhash 0x%x 0x%x, %pd %pd\n", - unhashed, h_unhashed, dentry, h_dentry); - goto err; - } - spin_unlock(&h_dentry->d_lock); - - err = au_do_h_d_reval(h_dentry, flags, dentry, bindex); - if (unlikely(err)) - /* do not goto err, to keep the errno */ - break; - - /* todo: plink too? */ - if (!do_udba) - continue; - - /* UDBA tests */ - if (unlikely(!!inode != d_is_positive(h_dentry))) - goto err; - - h_inode = NULL; - if (d_is_positive(h_dentry)) - h_inode = d_inode(h_dentry); - h_plus = plus; - h_mode = mode; - h_cached_inode = h_inode; - if (h_inode) { - h_mode = (h_inode->i_mode & S_IFMT); - h_plus = (h_inode->i_nlink > 0); - } - if (inode && ibs <= bindex && bindex <= ibe) - h_cached_inode = au_h_iptr(inode, bindex); - - if (!h_nfs) { - if (unlikely(plus != h_plus && !tmpfile)) - goto err; - } else { - if (unlikely(!(h_dentry->d_flags & DCACHE_NFSFS_RENAMED) - && !is_root - && !IS_ROOT(h_dentry) - && unhashed != d_unhashed(h_dentry))) - goto err; - } - if (unlikely(mode != h_mode - || h_cached_inode != h_inode)) - goto err; - continue; - -err: - err = -EINVAL; - break; - } - - AuTraceErr(err); - return err; -} - -/* todo: consolidate with do_refresh() and au_reval_for_attr() */ -static int simple_reval_dpath(struct dentry *dentry, unsigned int sigen) -{ - int err; - struct dentry *parent; - - if (!au_digen_test(dentry, sigen)) - return 0; - - parent = dget_parent(dentry); - di_read_lock_parent(parent, AuLock_IR); - AuDebugOn(au_digen_test(parent, sigen)); - au_dbg_verify_gen(parent, sigen); - err = au_refresh_dentry(dentry, parent); - di_read_unlock(parent, AuLock_IR); - dput(parent); - AuTraceErr(err); - return err; -} - -int au_reval_dpath(struct dentry *dentry, unsigned int sigen) -{ - int err; - struct dentry *d, *parent; - - if (!au_ftest_si(au_sbi(dentry->d_sb), FAILED_REFRESH_DIR)) - return simple_reval_dpath(dentry, sigen); - - /* slow loop, keep it simple and stupid */ - /* cf: au_cpup_dirs() */ - err = 0; - parent = NULL; - while (au_digen_test(dentry, sigen)) { - d = dentry; - while (1) { - dput(parent); - parent = dget_parent(d); - if (!au_digen_test(parent, sigen)) - break; - d = parent; - } - - if (d != dentry) - di_write_lock_child2(d); - - /* someone might update our dentry while we were sleeping */ - if (au_digen_test(d, sigen)) { - /* - * todo: consolidate with simple_reval_dpath(), - * do_refresh() and au_reval_for_attr(). - */ - di_read_lock_parent(parent, AuLock_IR); - err = au_refresh_dentry(d, parent); - di_read_unlock(parent, AuLock_IR); - } - - if (d != dentry) - di_write_unlock(d); - dput(parent); - if (unlikely(err)) - break; - } - - return err; -} - -/* - * if valid returns 1, otherwise 0. - */ -static int aufs_d_revalidate(struct dentry *dentry, unsigned int flags) -{ - int valid, err; - unsigned int sigen; - unsigned char do_udba; - struct super_block *sb; - struct inode *inode; - - /* todo: support rcu-walk? */ - if (flags & LOOKUP_RCU) - return -ECHILD; - - valid = 0; - if (unlikely(!au_di(dentry))) - goto out; - - valid = 1; - sb = dentry->d_sb; - /* - * todo: very ugly - * i_mutex of parent dir may be held, - * but we should not return 'invalid' due to busy. - */ - err = aufs_read_lock(dentry, AuLock_FLUSH | AuLock_DW | AuLock_NOPLM); - if (unlikely(err)) { - valid = err; - AuTraceErr(err); - goto out; - } - inode = NULL; - if (d_really_is_positive(dentry)) - inode = d_inode(dentry); - if (unlikely(inode && is_bad_inode(inode))) { - err = -EINVAL; - AuTraceErr(err); - goto out_dgrade; - } - if (unlikely(au_dbrange_test(dentry))) { - err = -EINVAL; - AuTraceErr(err); - goto out_dgrade; - } - - sigen = au_sigen(sb); - if (au_digen_test(dentry, sigen)) { - AuDebugOn(IS_ROOT(dentry)); - err = au_reval_dpath(dentry, sigen); - if (unlikely(err)) { - AuTraceErr(err); - goto out_dgrade; - } - } - di_downgrade_lock(dentry, AuLock_IR); - - err = -EINVAL; - if (!(flags & (LOOKUP_OPEN | LOOKUP_EMPTY)) - && inode - && !(inode->i_state && I_LINKABLE) - && (IS_DEADDIR(inode) || !inode->i_nlink)) - goto out_inval; - - do_udba = !au_opt_test(au_mntflags(sb), UDBA_NONE); - if (do_udba && inode) { - aufs_bindex_t bstart = au_ibstart(inode); - struct inode *h_inode; - - if (bstart >= 0) { - h_inode = au_h_iptr(inode, bstart); - if (h_inode && au_test_higen(inode, h_inode)) - goto out_inval; - } - } - - err = h_d_revalidate(dentry, inode, flags, do_udba); - if (unlikely(!err && do_udba && au_dbstart(dentry) < 0)) { - err = -EIO; - AuDbg("both of real entry and whiteout found, %p, err %d\n", - dentry, err); - } - goto out_inval; - -out_dgrade: - di_downgrade_lock(dentry, AuLock_IR); -out_inval: - aufs_read_unlock(dentry, AuLock_IR); - AuTraceErr(err); - valid = !err; -out: - if (!valid) { - AuDbg("%pd invalid, %d\n", dentry, valid); - d_drop(dentry); - } - return valid; -} - -static void aufs_d_release(struct dentry *dentry) -{ - if (au_di(dentry)) { - au_di_fin(dentry); - au_hn_di_reinit(dentry); - } -} - -const struct dentry_operations aufs_dop = { - .d_revalidate = aufs_d_revalidate, - .d_weak_revalidate = aufs_d_revalidate, - .d_release = aufs_d_release -}; diff --git a/fs/aufs/dentry.h b/fs/aufs/dentry.h deleted file mode 100644 index 2ce602e1a..000000000 --- a/fs/aufs/dentry.h +++ /dev/null @@ -1,233 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * lookup and dentry operations - */ - -#ifndef __AUFS_DENTRY_H__ -#define __AUFS_DENTRY_H__ - -#ifdef __KERNEL__ - -#include <linux/dcache.h> -#include "rwsem.h" - -struct au_hdentry { - struct dentry *hd_dentry; - aufs_bindex_t hd_id; -}; - -struct au_dinfo { - atomic_t di_generation; - - struct au_rwsem di_rwsem; - aufs_bindex_t di_bstart, di_bend, di_bwh, di_bdiropq; - unsigned char di_tmpfile; /* to allow the different name */ - struct au_hdentry *di_hdentry; -} ____cacheline_aligned_in_smp; - -/* ---------------------------------------------------------------------- */ - -/* dentry.c */ -extern const struct dentry_operations aufs_dop; -struct au_branch; -struct dentry *au_sio_lkup_one(struct qstr *name, struct dentry *parent); -int au_h_verify(struct dentry *h_dentry, unsigned int udba, struct inode *h_dir, - struct dentry *h_parent, struct au_branch *br); - -int au_lkup_dentry(struct dentry *dentry, aufs_bindex_t bstart, mode_t type); -int au_lkup_neg(struct dentry *dentry, aufs_bindex_t bindex, int wh); -int au_refresh_dentry(struct dentry *dentry, struct dentry *parent); -int au_reval_dpath(struct dentry *dentry, unsigned int sigen); - -/* dinfo.c */ -void au_di_init_once(void *_di); -struct au_dinfo *au_di_alloc(struct super_block *sb, unsigned int lsc); -void au_di_free(struct au_dinfo *dinfo); -void au_di_swap(struct au_dinfo *a, struct au_dinfo *b); -void au_di_cp(struct au_dinfo *dst, struct au_dinfo *src); -int au_di_init(struct dentry *dentry); -void au_di_fin(struct dentry *dentry); -int au_di_realloc(struct au_dinfo *dinfo, int nbr); - -void di_read_lock(struct dentry *d, int flags, unsigned int lsc); -void di_read_unlock(struct dentry *d, int flags); -void di_downgrade_lock(struct dentry *d, int flags); -void di_write_lock(struct dentry *d, unsigned int lsc); -void di_write_unlock(struct dentry *d); -void di_write_lock2_child(struct dentry *d1, struct dentry *d2, int isdir); -void di_write_lock2_parent(struct dentry *d1, struct dentry *d2, int isdir); -void di_write_unlock2(struct dentry *d1, struct dentry *d2); - -struct dentry *au_h_dptr(struct dentry *dentry, aufs_bindex_t bindex); -struct dentry *au_h_d_alias(struct dentry *dentry, aufs_bindex_t bindex); -aufs_bindex_t au_dbtail(struct dentry *dentry); -aufs_bindex_t au_dbtaildir(struct dentry *dentry); - -void au_set_h_dptr(struct dentry *dentry, aufs_bindex_t bindex, - struct dentry *h_dentry); -int au_digen_test(struct dentry *dentry, unsigned int sigen); -int au_dbrange_test(struct dentry *dentry); -void au_update_digen(struct dentry *dentry); -void au_update_dbrange(struct dentry *dentry, int do_put_zero); -void au_update_dbstart(struct dentry *dentry); -void au_update_dbend(struct dentry *dentry); -int au_find_dbindex(struct dentry *dentry, struct dentry *h_dentry); - -/* ---------------------------------------------------------------------- */ - -static inline struct au_dinfo *au_di(struct dentry *dentry) -{ - return dentry->d_fsdata; -} - -/* ---------------------------------------------------------------------- */ - -/* lock subclass for dinfo */ -enum { - AuLsc_DI_CHILD, /* child first */ - AuLsc_DI_CHILD2, /* rename(2), link(2), and cpup at hnotify */ - AuLsc_DI_CHILD3, /* copyup dirs */ - AuLsc_DI_PARENT, - AuLsc_DI_PARENT2, - AuLsc_DI_PARENT3, - AuLsc_DI_TMP /* temp for replacing dinfo */ -}; - -/* - * di_read_lock_child, di_write_lock_child, - * di_read_lock_child2, di_write_lock_child2, - * di_read_lock_child3, di_write_lock_child3, - * di_read_lock_parent, di_write_lock_parent, - * di_read_lock_parent2, di_write_lock_parent2, - * di_read_lock_parent3, di_write_lock_parent3, - */ -#define AuReadLockFunc(name, lsc) \ -static inline void di_read_lock_##name(struct dentry *d, int flags) \ -{ di_read_lock(d, flags, AuLsc_DI_##lsc); } - -#define AuWriteLockFunc(name, lsc) \ -static inline void di_write_lock_##name(struct dentry *d) \ -{ di_write_lock(d, AuLsc_DI_##lsc); } - -#define AuRWLockFuncs(name, lsc) \ - AuReadLockFunc(name, lsc) \ - AuWriteLockFunc(name, lsc) - -AuRWLockFuncs(child, CHILD); -AuRWLockFuncs(child2, CHILD2); -AuRWLockFuncs(child3, CHILD3); -AuRWLockFuncs(parent, PARENT); -AuRWLockFuncs(parent2, PARENT2); -AuRWLockFuncs(parent3, PARENT3); - -#undef AuReadLockFunc -#undef AuWriteLockFunc -#undef AuRWLockFuncs - -#define DiMustNoWaiters(d) AuRwMustNoWaiters(&au_di(d)->di_rwsem) -#define DiMustAnyLock(d) AuRwMustAnyLock(&au_di(d)->di_rwsem) -#define DiMustWriteLock(d) AuRwMustWriteLock(&au_di(d)->di_rwsem) - -/* ---------------------------------------------------------------------- */ - -/* todo: memory barrier? */ -static inline unsigned int au_digen(struct dentry *d) -{ - return atomic_read(&au_di(d)->di_generation); -} - -static inline void au_h_dentry_init(struct au_hdentry *hdentry) -{ - hdentry->hd_dentry = NULL; -} - -static inline void au_hdput(struct au_hdentry *hd) -{ - if (hd) - dput(hd->hd_dentry); -} - -static inline aufs_bindex_t au_dbstart(struct dentry *dentry) -{ - DiMustAnyLock(dentry); - return au_di(dentry)->di_bstart; -} - -static inline aufs_bindex_t au_dbend(struct dentry *dentry) -{ - DiMustAnyLock(dentry); - return au_di(dentry)->di_bend; -} - -static inline aufs_bindex_t au_dbwh(struct dentry *dentry) -{ - DiMustAnyLock(dentry); - return au_di(dentry)->di_bwh; -} - -static inline aufs_bindex_t au_dbdiropq(struct dentry *dentry) -{ - DiMustAnyLock(dentry); - return au_di(dentry)->di_bdiropq; -} - -/* todo: hard/soft set? */ -static inline void au_set_dbstart(struct dentry *dentry, aufs_bindex_t bindex) -{ - DiMustWriteLock(dentry); - au_di(dentry)->di_bstart = bindex; -} - -static inline void au_set_dbend(struct dentry *dentry, aufs_bindex_t bindex) -{ - DiMustWriteLock(dentry); - au_di(dentry)->di_bend = bindex; -} - -static inline void au_set_dbwh(struct dentry *dentry, aufs_bindex_t bindex) -{ - DiMustWriteLock(dentry); - /* dbwh can be outside of bstart - bend range */ - au_di(dentry)->di_bwh = bindex; -} - -static inline void au_set_dbdiropq(struct dentry *dentry, aufs_bindex_t bindex) -{ - DiMustWriteLock(dentry); - au_di(dentry)->di_bdiropq = bindex; -} - -/* ---------------------------------------------------------------------- */ - -#ifdef CONFIG_AUFS_HNOTIFY -static inline void au_digen_dec(struct dentry *d) -{ - atomic_dec(&au_di(d)->di_generation); -} - -static inline void au_hn_di_reinit(struct dentry *dentry) -{ - dentry->d_fsdata = NULL; -} -#else -AuStubVoid(au_hn_di_reinit, struct dentry *dentry __maybe_unused) -#endif /* CONFIG_AUFS_HNOTIFY */ - -#endif /* __KERNEL__ */ -#endif /* __AUFS_DENTRY_H__ */ diff --git a/fs/aufs/dinfo.c b/fs/aufs/dinfo.c deleted file mode 100644 index 7b9ca3b88..000000000 --- a/fs/aufs/dinfo.c +++ /dev/null @@ -1,550 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * dentry private data - */ - -#include "aufs.h" - -void au_di_init_once(void *_dinfo) -{ - struct au_dinfo *dinfo = _dinfo; - static struct lock_class_key aufs_di; - - au_rw_init(&dinfo->di_rwsem); - au_rw_class(&dinfo->di_rwsem, &aufs_di); -} - -struct au_dinfo *au_di_alloc(struct super_block *sb, unsigned int lsc) -{ - struct au_dinfo *dinfo; - int nbr, i; - - dinfo = au_cache_alloc_dinfo(); - if (unlikely(!dinfo)) - goto out; - - nbr = au_sbend(sb) + 1; - if (nbr <= 0) - nbr = 1; - dinfo->di_hdentry = kcalloc(nbr, sizeof(*dinfo->di_hdentry), GFP_NOFS); - if (dinfo->di_hdentry) { - au_rw_write_lock_nested(&dinfo->di_rwsem, lsc); - dinfo->di_bstart = -1; - dinfo->di_bend = -1; - dinfo->di_bwh = -1; - dinfo->di_bdiropq = -1; - dinfo->di_tmpfile = 0; - for (i = 0; i < nbr; i++) - dinfo->di_hdentry[i].hd_id = -1; - goto out; - } - - au_cache_free_dinfo(dinfo); - dinfo = NULL; - -out: - return dinfo; -} - -void au_di_free(struct au_dinfo *dinfo) -{ - struct au_hdentry *p; - aufs_bindex_t bend, bindex; - - /* dentry may not be revalidated */ - bindex = dinfo->di_bstart; - if (bindex >= 0) { - bend = dinfo->di_bend; - p = dinfo->di_hdentry + bindex; - while (bindex++ <= bend) - au_hdput(p++); - } - kfree(dinfo->di_hdentry); - au_cache_free_dinfo(dinfo); -} - -void au_di_swap(struct au_dinfo *a, struct au_dinfo *b) -{ - struct au_hdentry *p; - aufs_bindex_t bi; - - AuRwMustWriteLock(&a->di_rwsem); - AuRwMustWriteLock(&b->di_rwsem); - -#define DiSwap(v, name) \ - do { \ - v = a->di_##name; \ - a->di_##name = b->di_##name; \ - b->di_##name = v; \ - } while (0) - - DiSwap(p, hdentry); - DiSwap(bi, bstart); - DiSwap(bi, bend); - DiSwap(bi, bwh); - DiSwap(bi, bdiropq); - /* smp_mb(); */ - -#undef DiSwap -} - -void au_di_cp(struct au_dinfo *dst, struct au_dinfo *src) -{ - AuRwMustWriteLock(&dst->di_rwsem); - AuRwMustWriteLock(&src->di_rwsem); - - dst->di_bstart = src->di_bstart; - dst->di_bend = src->di_bend; - dst->di_bwh = src->di_bwh; - dst->di_bdiropq = src->di_bdiropq; - /* smp_mb(); */ -} - -int au_di_init(struct dentry *dentry) -{ - int err; - struct super_block *sb; - struct au_dinfo *dinfo; - - err = 0; - sb = dentry->d_sb; - dinfo = au_di_alloc(sb, AuLsc_DI_CHILD); - if (dinfo) { - atomic_set(&dinfo->di_generation, au_sigen(sb)); - /* smp_mb(); */ /* atomic_set */ - dentry->d_fsdata = dinfo; - } else - err = -ENOMEM; - - return err; -} - -void au_di_fin(struct dentry *dentry) -{ - struct au_dinfo *dinfo; - - dinfo = au_di(dentry); - AuRwDestroy(&dinfo->di_rwsem); - au_di_free(dinfo); -} - -int au_di_realloc(struct au_dinfo *dinfo, int nbr) -{ - int err, sz; - struct au_hdentry *hdp; - - AuRwMustWriteLock(&dinfo->di_rwsem); - - err = -ENOMEM; - sz = sizeof(*hdp) * (dinfo->di_bend + 1); - if (!sz) - sz = sizeof(*hdp); - hdp = au_kzrealloc(dinfo->di_hdentry, sz, sizeof(*hdp) * nbr, GFP_NOFS); - if (hdp) { - dinfo->di_hdentry = hdp; - err = 0; - } - - return err; -} - -/* ---------------------------------------------------------------------- */ - -static void do_ii_write_lock(struct inode *inode, unsigned int lsc) -{ - switch (lsc) { - case AuLsc_DI_CHILD: - ii_write_lock_child(inode); - break; - case AuLsc_DI_CHILD2: - ii_write_lock_child2(inode); - break; - case AuLsc_DI_CHILD3: - ii_write_lock_child3(inode); - break; - case AuLsc_DI_PARENT: - ii_write_lock_parent(inode); - break; - case AuLsc_DI_PARENT2: - ii_write_lock_parent2(inode); - break; - case AuLsc_DI_PARENT3: - ii_write_lock_parent3(inode); - break; - default: - BUG(); - } -} - -static void do_ii_read_lock(struct inode *inode, unsigned int lsc) -{ - switch (lsc) { - case AuLsc_DI_CHILD: - ii_read_lock_child(inode); - break; - case AuLsc_DI_CHILD2: - ii_read_lock_child2(inode); - break; - case AuLsc_DI_CHILD3: - ii_read_lock_child3(inode); - break; - case AuLsc_DI_PARENT: - ii_read_lock_parent(inode); - break; - case AuLsc_DI_PARENT2: - ii_read_lock_parent2(inode); - break; - case AuLsc_DI_PARENT3: - ii_read_lock_parent3(inode); - break; - default: - BUG(); - } -} - -void di_read_lock(struct dentry *d, int flags, unsigned int lsc) -{ - struct inode *inode; - - au_rw_read_lock_nested(&au_di(d)->di_rwsem, lsc); - if (d_really_is_positive(d)) { - inode = d_inode(d); - if (au_ftest_lock(flags, IW)) - do_ii_write_lock(inode, lsc); - else if (au_ftest_lock(flags, IR)) - do_ii_read_lock(inode, lsc); - } -} - -void di_read_unlock(struct dentry *d, int flags) -{ - struct inode *inode; - - if (d_really_is_positive(d)) { - inode = d_inode(d); - if (au_ftest_lock(flags, IW)) { - au_dbg_verify_dinode(d); - ii_write_unlock(inode); - } else if (au_ftest_lock(flags, IR)) { - au_dbg_verify_dinode(d); - ii_read_unlock(inode); - } - } - au_rw_read_unlock(&au_di(d)->di_rwsem); -} - -void di_downgrade_lock(struct dentry *d, int flags) -{ - if (d_really_is_positive(d) && au_ftest_lock(flags, IR)) - ii_downgrade_lock(d_inode(d)); - au_rw_dgrade_lock(&au_di(d)->di_rwsem); -} - -void di_write_lock(struct dentry *d, unsigned int lsc) -{ - au_rw_write_lock_nested(&au_di(d)->di_rwsem, lsc); - if (d_really_is_positive(d)) - do_ii_write_lock(d_inode(d), lsc); -} - -void di_write_unlock(struct dentry *d) -{ - au_dbg_verify_dinode(d); - if (d_really_is_positive(d)) - ii_write_unlock(d_inode(d)); - au_rw_write_unlock(&au_di(d)->di_rwsem); -} - -void di_write_lock2_child(struct dentry *d1, struct dentry *d2, int isdir) -{ - AuDebugOn(d1 == d2 - || d_inode(d1) == d_inode(d2) - || d1->d_sb != d2->d_sb); - - if (isdir && au_test_subdir(d1, d2)) { - di_write_lock_child(d1); - di_write_lock_child2(d2); - } else { - /* there should be no races */ - di_write_lock_child(d2); - di_write_lock_child2(d1); - } -} - -void di_write_lock2_parent(struct dentry *d1, struct dentry *d2, int isdir) -{ - AuDebugOn(d1 == d2 - || d_inode(d1) == d_inode(d2) - || d1->d_sb != d2->d_sb); - - if (isdir && au_test_subdir(d1, d2)) { - di_write_lock_parent(d1); - di_write_lock_parent2(d2); - } else { - /* there should be no races */ - di_write_lock_parent(d2); - di_write_lock_parent2(d1); - } -} - -void di_write_unlock2(struct dentry *d1, struct dentry *d2) -{ - di_write_unlock(d1); - if (d_inode(d1) == d_inode(d2)) - au_rw_write_unlock(&au_di(d2)->di_rwsem); - else - di_write_unlock(d2); -} - -/* ---------------------------------------------------------------------- */ - -struct dentry *au_h_dptr(struct dentry *dentry, aufs_bindex_t bindex) -{ - struct dentry *d; - - DiMustAnyLock(dentry); - - if (au_dbstart(dentry) < 0 || bindex < au_dbstart(dentry)) - return NULL; - AuDebugOn(bindex < 0); - d = au_di(dentry)->di_hdentry[0 + bindex].hd_dentry; - AuDebugOn(d && au_dcount(d) <= 0); - return d; -} - -/* - * extended version of au_h_dptr(). - * returns a hashed and positive (or linkable) h_dentry in bindex, NULL, or - * error. - */ -struct dentry *au_h_d_alias(struct dentry *dentry, aufs_bindex_t bindex) -{ - struct dentry *h_dentry; - struct inode *inode, *h_inode; - - AuDebugOn(d_really_is_negative(dentry)); - - h_dentry = NULL; - if (au_dbstart(dentry) <= bindex - && bindex <= au_dbend(dentry)) - h_dentry = au_h_dptr(dentry, bindex); - if (h_dentry && !au_d_linkable(h_dentry)) { - dget(h_dentry); - goto out; /* success */ - } - - inode = d_inode(dentry); - AuDebugOn(bindex < au_ibstart(inode)); - AuDebugOn(au_ibend(inode) < bindex); - h_inode = au_h_iptr(inode, bindex); - h_dentry = d_find_alias(h_inode); - if (h_dentry) { - if (!IS_ERR(h_dentry)) { - if (!au_d_linkable(h_dentry)) - goto out; /* success */ - dput(h_dentry); - } else - goto out; - } - - if (au_opt_test(au_mntflags(dentry->d_sb), PLINK)) { - h_dentry = au_plink_lkup(inode, bindex); - AuDebugOn(!h_dentry); - if (!IS_ERR(h_dentry)) { - if (!au_d_hashed_positive(h_dentry)) - goto out; /* success */ - dput(h_dentry); - h_dentry = NULL; - } - } - -out: - AuDbgDentry(h_dentry); - return h_dentry; -} - -aufs_bindex_t au_dbtail(struct dentry *dentry) -{ - aufs_bindex_t bend, bwh; - - bend = au_dbend(dentry); - if (0 <= bend) { - bwh = au_dbwh(dentry); - if (!bwh) - return bwh; - if (0 < bwh && bwh < bend) - return bwh - 1; - } - return bend; -} - -aufs_bindex_t au_dbtaildir(struct dentry *dentry) -{ - aufs_bindex_t bend, bopq; - - bend = au_dbtail(dentry); - if (0 <= bend) { - bopq = au_dbdiropq(dentry); - if (0 <= bopq && bopq < bend) - bend = bopq; - } - return bend; -} - -/* ---------------------------------------------------------------------- */ - -void au_set_h_dptr(struct dentry *dentry, aufs_bindex_t bindex, - struct dentry *h_dentry) -{ - struct au_hdentry *hd = au_di(dentry)->di_hdentry + bindex; - struct au_branch *br; - - DiMustWriteLock(dentry); - - au_hdput(hd); - hd->hd_dentry = h_dentry; - if (h_dentry) { - br = au_sbr(dentry->d_sb, bindex); - hd->hd_id = br->br_id; - } -} - -int au_dbrange_test(struct dentry *dentry) -{ - int err; - aufs_bindex_t bstart, bend; - - err = 0; - bstart = au_dbstart(dentry); - bend = au_dbend(dentry); - if (bstart >= 0) - AuDebugOn(bend < 0 && bstart > bend); - else { - err = -EIO; - AuDebugOn(bend >= 0); - } - - return err; -} - -int au_digen_test(struct dentry *dentry, unsigned int sigen) -{ - int err; - - err = 0; - if (unlikely(au_digen(dentry) != sigen - || au_iigen_test(d_inode(dentry), sigen))) - err = -EIO; - - return err; -} - -void au_update_digen(struct dentry *dentry) -{ - atomic_set(&au_di(dentry)->di_generation, au_sigen(dentry->d_sb)); - /* smp_mb(); */ /* atomic_set */ -} - -void au_update_dbrange(struct dentry *dentry, int do_put_zero) -{ - struct au_dinfo *dinfo; - struct dentry *h_d; - struct au_hdentry *hdp; - - DiMustWriteLock(dentry); - - dinfo = au_di(dentry); - if (!dinfo || dinfo->di_bstart < 0) - return; - - hdp = dinfo->di_hdentry; - if (do_put_zero) { - aufs_bindex_t bindex, bend; - - bend = dinfo->di_bend; - for (bindex = dinfo->di_bstart; bindex <= bend; bindex++) { - h_d = hdp[0 + bindex].hd_dentry; - if (h_d && d_is_negative(h_d)) - au_set_h_dptr(dentry, bindex, NULL); - } - } - - dinfo->di_bstart = -1; - while (++dinfo->di_bstart <= dinfo->di_bend) - if (hdp[0 + dinfo->di_bstart].hd_dentry) - break; - if (dinfo->di_bstart > dinfo->di_bend) { - dinfo->di_bstart = -1; - dinfo->di_bend = -1; - return; - } - - dinfo->di_bend++; - while (0 <= --dinfo->di_bend) - if (hdp[0 + dinfo->di_bend].hd_dentry) - break; - AuDebugOn(dinfo->di_bstart > dinfo->di_bend || dinfo->di_bend < 0); -} - -void au_update_dbstart(struct dentry *dentry) -{ - aufs_bindex_t bindex, bend; - struct dentry *h_dentry; - - bend = au_dbend(dentry); - for (bindex = au_dbstart(dentry); bindex <= bend; bindex++) { - h_dentry = au_h_dptr(dentry, bindex); - if (!h_dentry) - continue; - if (d_is_positive(h_dentry)) { - au_set_dbstart(dentry, bindex); - return; - } - au_set_h_dptr(dentry, bindex, NULL); - } -} - -void au_update_dbend(struct dentry *dentry) -{ - aufs_bindex_t bindex, bstart; - struct dentry *h_dentry; - - bstart = au_dbstart(dentry); - for (bindex = au_dbend(dentry); bindex >= bstart; bindex--) { - h_dentry = au_h_dptr(dentry, bindex); - if (!h_dentry) - continue; - if (d_is_positive(h_dentry)) { - au_set_dbend(dentry, bindex); - return; - } - au_set_h_dptr(dentry, bindex, NULL); - } -} - -int au_find_dbindex(struct dentry *dentry, struct dentry *h_dentry) -{ - aufs_bindex_t bindex, bend; - - bend = au_dbend(dentry); - for (bindex = au_dbstart(dentry); bindex <= bend; bindex++) - if (au_h_dptr(dentry, bindex) == h_dentry) - return bindex; - return -1; -} diff --git a/fs/aufs/dir.c b/fs/aufs/dir.c deleted file mode 100644 index 7705c2997..000000000 --- a/fs/aufs/dir.c +++ /dev/null @@ -1,753 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * directory operations - */ - -#include <linux/fs_stack.h> -#include "aufs.h" - -void au_add_nlink(struct inode *dir, struct inode *h_dir) -{ - unsigned int nlink; - - AuDebugOn(!S_ISDIR(dir->i_mode) || !S_ISDIR(h_dir->i_mode)); - - nlink = dir->i_nlink; - nlink += h_dir->i_nlink - 2; - if (h_dir->i_nlink < 2) - nlink += 2; - smp_mb(); /* for i_nlink */ - /* 0 can happen in revaliding */ - set_nlink(dir, nlink); -} - -void au_sub_nlink(struct inode *dir, struct inode *h_dir) -{ - unsigned int nlink; - - AuDebugOn(!S_ISDIR(dir->i_mode) || !S_ISDIR(h_dir->i_mode)); - - nlink = dir->i_nlink; - nlink -= h_dir->i_nlink - 2; - if (h_dir->i_nlink < 2) - nlink -= 2; - smp_mb(); /* for i_nlink */ - /* nlink == 0 means the branch-fs is broken */ - set_nlink(dir, nlink); -} - -loff_t au_dir_size(struct file *file, struct dentry *dentry) -{ - loff_t sz; - aufs_bindex_t bindex, bend; - struct file *h_file; - struct dentry *h_dentry; - - sz = 0; - if (file) { - AuDebugOn(!d_is_dir(file->f_path.dentry)); - - bend = au_fbend_dir(file); - for (bindex = au_fbstart(file); - bindex <= bend && sz < KMALLOC_MAX_SIZE; - bindex++) { - h_file = au_hf_dir(file, bindex); - if (h_file && file_inode(h_file)) - sz += vfsub_f_size_read(h_file); - } - } else { - AuDebugOn(!dentry); - AuDebugOn(!d_is_dir(dentry)); - - bend = au_dbtaildir(dentry); - for (bindex = au_dbstart(dentry); - bindex <= bend && sz < KMALLOC_MAX_SIZE; - bindex++) { - h_dentry = au_h_dptr(dentry, bindex); - if (h_dentry && d_is_positive(h_dentry)) - sz += i_size_read(d_inode(h_dentry)); - } - } - if (sz < KMALLOC_MAX_SIZE) - sz = roundup_pow_of_two(sz); - if (sz > KMALLOC_MAX_SIZE) - sz = KMALLOC_MAX_SIZE; - else if (sz < NAME_MAX) { - BUILD_BUG_ON(AUFS_RDBLK_DEF < NAME_MAX); - sz = AUFS_RDBLK_DEF; - } - return sz; -} - -struct au_dir_ts_arg { - struct dentry *dentry; - aufs_bindex_t brid; -}; - -static void au_do_dir_ts(void *arg) -{ - struct au_dir_ts_arg *a = arg; - struct au_dtime dt; - struct path h_path; - struct inode *dir, *h_dir; - struct super_block *sb; - struct au_branch *br; - struct au_hinode *hdir; - int err; - aufs_bindex_t bstart, bindex; - - sb = a->dentry->d_sb; - if (d_really_is_negative(a->dentry)) - goto out; - aufs_read_lock(a->dentry, AuLock_DW | AuLock_DIR); /* noflush */ - - /* no dir->i_mutex lock */ - dir = d_inode(a->dentry); - bstart = au_ibstart(dir); - bindex = au_br_index(sb, a->brid); - if (bindex < bstart) - goto out_unlock; - - br = au_sbr(sb, bindex); - h_path.dentry = au_h_dptr(a->dentry, bindex); - if (!h_path.dentry) - goto out_unlock; - h_path.mnt = au_br_mnt(br); - au_dtime_store(&dt, a->dentry, &h_path); - - br = au_sbr(sb, bstart); - if (!au_br_writable(br->br_perm)) - goto out_unlock; - h_path.dentry = au_h_dptr(a->dentry, bstart); - h_path.mnt = au_br_mnt(br); - err = vfsub_mnt_want_write(h_path.mnt); - if (err) - goto out_unlock; - hdir = au_hi(dir, bstart); - au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT); - h_dir = au_h_iptr(dir, bstart); - if (h_dir->i_nlink - && timespec_compare(&h_dir->i_mtime, &dt.dt_mtime) < 0) { - dt.dt_h_path = h_path; - au_dtime_revert(&dt); - } - au_hn_imtx_unlock(hdir); - vfsub_mnt_drop_write(h_path.mnt); - au_cpup_attr_timesizes(dir); - -out_unlock: - aufs_read_unlock(a->dentry, AuLock_DW); -out: - dput(a->dentry); - au_nwt_done(&au_sbi(sb)->si_nowait); - kfree(arg); -} - -void au_dir_ts(struct inode *dir, aufs_bindex_t bindex) -{ - int perm, wkq_err; - aufs_bindex_t bstart; - struct au_dir_ts_arg *arg; - struct dentry *dentry; - struct super_block *sb; - - IMustLock(dir); - - dentry = d_find_any_alias(dir); - AuDebugOn(!dentry); - sb = dentry->d_sb; - bstart = au_ibstart(dir); - if (bstart == bindex) { - au_cpup_attr_timesizes(dir); - goto out; - } - - perm = au_sbr_perm(sb, bstart); - if (!au_br_writable(perm)) - goto out; - - arg = kmalloc(sizeof(*arg), GFP_NOFS); - if (!arg) - goto out; - - arg->dentry = dget(dentry); /* will be dput-ted by au_do_dir_ts() */ - arg->brid = au_sbr_id(sb, bindex); - wkq_err = au_wkq_nowait(au_do_dir_ts, arg, sb, /*flags*/0); - if (unlikely(wkq_err)) { - pr_err("wkq %d\n", wkq_err); - dput(dentry); - kfree(arg); - } - -out: - dput(dentry); -} - -/* ---------------------------------------------------------------------- */ - -static int reopen_dir(struct file *file) -{ - int err; - unsigned int flags; - aufs_bindex_t bindex, btail, bstart; - struct dentry *dentry, *h_dentry; - struct file *h_file; - - /* open all lower dirs */ - dentry = file->f_path.dentry; - bstart = au_dbstart(dentry); - for (bindex = au_fbstart(file); bindex < bstart; bindex++) - au_set_h_fptr(file, bindex, NULL); - au_set_fbstart(file, bstart); - - btail = au_dbtaildir(dentry); - for (bindex = au_fbend_dir(file); btail < bindex; bindex--) - au_set_h_fptr(file, bindex, NULL); - au_set_fbend_dir(file, btail); - - flags = vfsub_file_flags(file); - for (bindex = bstart; bindex <= btail; bindex++) { - h_dentry = au_h_dptr(dentry, bindex); - if (!h_dentry) - continue; - h_file = au_hf_dir(file, bindex); - if (h_file) - continue; - - h_file = au_h_open(dentry, bindex, flags, file, /*force_wr*/0); - err = PTR_ERR(h_file); - if (IS_ERR(h_file)) - goto out; /* close all? */ - au_set_h_fptr(file, bindex, h_file); - } - au_update_figen(file); - /* todo: necessary? */ - /* file->f_ra = h_file->f_ra; */ - err = 0; - -out: - return err; -} - -static int do_open_dir(struct file *file, int flags, struct file *h_file) -{ - int err; - aufs_bindex_t bindex, btail; - struct dentry *dentry, *h_dentry; - - FiMustWriteLock(file); - AuDebugOn(h_file); - - err = 0; - dentry = file->f_path.dentry; - file->f_version = d_inode(dentry)->i_version; - bindex = au_dbstart(dentry); - au_set_fbstart(file, bindex); - btail = au_dbtaildir(dentry); - au_set_fbend_dir(file, btail); - for (; !err && bindex <= btail; bindex++) { - h_dentry = au_h_dptr(dentry, bindex); - if (!h_dentry) - continue; - - h_file = au_h_open(dentry, bindex, flags, file, /*force_wr*/0); - if (IS_ERR(h_file)) { - err = PTR_ERR(h_file); - break; - } - au_set_h_fptr(file, bindex, h_file); - } - au_update_figen(file); - /* todo: necessary? */ - /* file->f_ra = h_file->f_ra; */ - if (!err) - return 0; /* success */ - - /* close all */ - for (bindex = au_fbstart(file); bindex <= btail; bindex++) - au_set_h_fptr(file, bindex, NULL); - au_set_fbstart(file, -1); - au_set_fbend_dir(file, -1); - - return err; -} - -static int aufs_open_dir(struct inode *inode __maybe_unused, - struct file *file) -{ - int err; - struct super_block *sb; - struct au_fidir *fidir; - - err = -ENOMEM; - sb = file->f_path.dentry->d_sb; - si_read_lock(sb, AuLock_FLUSH); - fidir = au_fidir_alloc(sb); - if (fidir) { - struct au_do_open_args args = { - .open = do_open_dir, - .fidir = fidir - }; - err = au_do_open(file, &args); - if (unlikely(err)) - kfree(fidir); - } - si_read_unlock(sb); - return err; -} - -static int aufs_release_dir(struct inode *inode __maybe_unused, - struct file *file) -{ - struct au_vdir *vdir_cache; - struct au_finfo *finfo; - struct au_fidir *fidir; - aufs_bindex_t bindex, bend; - - finfo = au_fi(file); - fidir = finfo->fi_hdir; - if (fidir) { - au_sphl_del(&finfo->fi_hlist, - &au_sbi(file->f_path.dentry->d_sb)->si_files); - vdir_cache = fidir->fd_vdir_cache; /* lock-free */ - if (vdir_cache) - au_vdir_free(vdir_cache); - - bindex = finfo->fi_btop; - if (bindex >= 0) { - /* - * calls fput() instead of filp_close(), - * since no dnotify or lock for the lower file. - */ - bend = fidir->fd_bbot; - for (; bindex <= bend; bindex++) - au_set_h_fptr(file, bindex, NULL); - } - kfree(fidir); - finfo->fi_hdir = NULL; - } - au_finfo_fin(file); - return 0; -} - -/* ---------------------------------------------------------------------- */ - -static int au_do_flush_dir(struct file *file, fl_owner_t id) -{ - int err; - aufs_bindex_t bindex, bend; - struct file *h_file; - - err = 0; - bend = au_fbend_dir(file); - for (bindex = au_fbstart(file); !err && bindex <= bend; bindex++) { - h_file = au_hf_dir(file, bindex); - if (h_file) - err = vfsub_flush(h_file, id); - } - return err; -} - -static int aufs_flush_dir(struct file *file, fl_owner_t id) -{ - return au_do_flush(file, id, au_do_flush_dir); -} - -/* ---------------------------------------------------------------------- */ - -static int au_do_fsync_dir_no_file(struct dentry *dentry, int datasync) -{ - int err; - aufs_bindex_t bend, bindex; - struct inode *inode; - struct super_block *sb; - - err = 0; - sb = dentry->d_sb; - inode = d_inode(dentry); - IMustLock(inode); - bend = au_dbend(dentry); - for (bindex = au_dbstart(dentry); !err && bindex <= bend; bindex++) { - struct path h_path; - - if (au_test_ro(sb, bindex, inode)) - continue; - h_path.dentry = au_h_dptr(dentry, bindex); - if (!h_path.dentry) - continue; - - h_path.mnt = au_sbr_mnt(sb, bindex); - err = vfsub_fsync(NULL, &h_path, datasync); - } - - return err; -} - -static int au_do_fsync_dir(struct file *file, int datasync) -{ - int err; - aufs_bindex_t bend, bindex; - struct file *h_file; - struct super_block *sb; - struct inode *inode; - - err = au_reval_and_lock_fdi(file, reopen_dir, /*wlock*/1); - if (unlikely(err)) - goto out; - - inode = file_inode(file); - sb = inode->i_sb; - bend = au_fbend_dir(file); - for (bindex = au_fbstart(file); !err && bindex <= bend; bindex++) { - h_file = au_hf_dir(file, bindex); - if (!h_file || au_test_ro(sb, bindex, inode)) - continue; - - err = vfsub_fsync(h_file, &h_file->f_path, datasync); - } - -out: - return err; -} - -/* - * @file may be NULL - */ -static int aufs_fsync_dir(struct file *file, loff_t start, loff_t end, - int datasync) -{ - int err; - struct dentry *dentry; - struct inode *inode; - struct super_block *sb; - struct mutex *mtx; - - err = 0; - dentry = file->f_path.dentry; - inode = d_inode(dentry); - mtx = &inode->i_mutex; - mutex_lock(mtx); - sb = dentry->d_sb; - si_noflush_read_lock(sb); - if (file) - err = au_do_fsync_dir(file, datasync); - else { - di_write_lock_child(dentry); - err = au_do_fsync_dir_no_file(dentry, datasync); - } - au_cpup_attr_timesizes(inode); - di_write_unlock(dentry); - if (file) - fi_write_unlock(file); - - si_read_unlock(sb); - mutex_unlock(mtx); - return err; -} - -/* ---------------------------------------------------------------------- */ - -static int aufs_iterate(struct file *file, struct dir_context *ctx) -{ - int err; - struct dentry *dentry; - struct inode *inode, *h_inode; - struct super_block *sb; - - AuDbg("%pD, ctx{%pf, %llu}\n", file, ctx->actor, ctx->pos); - - dentry = file->f_path.dentry; - inode = d_inode(dentry); - IMustLock(inode); - - sb = dentry->d_sb; - si_read_lock(sb, AuLock_FLUSH); - err = au_reval_and_lock_fdi(file, reopen_dir, /*wlock*/1); - if (unlikely(err)) - goto out; - err = au_alive_dir(dentry); - if (!err) - err = au_vdir_init(file); - di_downgrade_lock(dentry, AuLock_IR); - if (unlikely(err)) - goto out_unlock; - - h_inode = au_h_iptr(inode, au_ibstart(inode)); - if (!au_test_nfsd()) { - err = au_vdir_fill_de(file, ctx); - fsstack_copy_attr_atime(inode, h_inode); - } else { - /* - * nfsd filldir may call lookup_one_len(), vfs_getattr(), - * encode_fh() and others. - */ - atomic_inc(&h_inode->i_count); - di_read_unlock(dentry, AuLock_IR); - si_read_unlock(sb); - err = au_vdir_fill_de(file, ctx); - fsstack_copy_attr_atime(inode, h_inode); - fi_write_unlock(file); - iput(h_inode); - - AuTraceErr(err); - return err; - } - -out_unlock: - di_read_unlock(dentry, AuLock_IR); - fi_write_unlock(file); -out: - si_read_unlock(sb); - return err; -} - -/* ---------------------------------------------------------------------- */ - -#define AuTestEmpty_WHONLY 1 -#define AuTestEmpty_CALLED (1 << 1) -#define AuTestEmpty_SHWH (1 << 2) -#define au_ftest_testempty(flags, name) ((flags) & AuTestEmpty_##name) -#define au_fset_testempty(flags, name) \ - do { (flags) |= AuTestEmpty_##name; } while (0) -#define au_fclr_testempty(flags, name) \ - do { (flags) &= ~AuTestEmpty_##name; } while (0) - -#ifndef CONFIG_AUFS_SHWH -#undef AuTestEmpty_SHWH -#define AuTestEmpty_SHWH 0 -#endif - -struct test_empty_arg { - struct dir_context ctx; - struct au_nhash *whlist; - unsigned int flags; - int err; - aufs_bindex_t bindex; -}; - -static int test_empty_cb(struct dir_context *ctx, const char *__name, - int namelen, loff_t offset __maybe_unused, u64 ino, - unsigned int d_type) -{ - struct test_empty_arg *arg = container_of(ctx, struct test_empty_arg, - ctx); - char *name = (void *)__name; - - arg->err = 0; - au_fset_testempty(arg->flags, CALLED); - /* smp_mb(); */ - if (name[0] == '.' - && (namelen == 1 || (name[1] == '.' && namelen == 2))) - goto out; /* success */ - - if (namelen <= AUFS_WH_PFX_LEN - || memcmp(name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) { - if (au_ftest_testempty(arg->flags, WHONLY) - && !au_nhash_test_known_wh(arg->whlist, name, namelen)) - arg->err = -ENOTEMPTY; - goto out; - } - - name += AUFS_WH_PFX_LEN; - namelen -= AUFS_WH_PFX_LEN; - if (!au_nhash_test_known_wh(arg->whlist, name, namelen)) - arg->err = au_nhash_append_wh - (arg->whlist, name, namelen, ino, d_type, arg->bindex, - au_ftest_testempty(arg->flags, SHWH)); - -out: - /* smp_mb(); */ - AuTraceErr(arg->err); - return arg->err; -} - -static int do_test_empty(struct dentry *dentry, struct test_empty_arg *arg) -{ - int err; - struct file *h_file; - - h_file = au_h_open(dentry, arg->bindex, - O_RDONLY | O_NONBLOCK | O_DIRECTORY | O_LARGEFILE, - /*file*/NULL, /*force_wr*/0); - err = PTR_ERR(h_file); - if (IS_ERR(h_file)) - goto out; - - err = 0; - if (!au_opt_test(au_mntflags(dentry->d_sb), UDBA_NONE) - && !file_inode(h_file)->i_nlink) - goto out_put; - - do { - arg->err = 0; - au_fclr_testempty(arg->flags, CALLED); - /* smp_mb(); */ - err = vfsub_iterate_dir(h_file, &arg->ctx); - if (err >= 0) - err = arg->err; - } while (!err && au_ftest_testempty(arg->flags, CALLED)); - -out_put: - fput(h_file); - au_sbr_put(dentry->d_sb, arg->bindex); -out: - return err; -} - -struct do_test_empty_args { - int *errp; - struct dentry *dentry; - struct test_empty_arg *arg; -}; - -static void call_do_test_empty(void *args) -{ - struct do_test_empty_args *a = args; - *a->errp = do_test_empty(a->dentry, a->arg); -} - -static int sio_test_empty(struct dentry *dentry, struct test_empty_arg *arg) -{ - int err, wkq_err; - struct dentry *h_dentry; - struct inode *h_inode; - - h_dentry = au_h_dptr(dentry, arg->bindex); - h_inode = d_inode(h_dentry); - /* todo: i_mode changes anytime? */ - mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); - err = au_test_h_perm_sio(h_inode, MAY_EXEC | MAY_READ); - mutex_unlock(&h_inode->i_mutex); - if (!err) - err = do_test_empty(dentry, arg); - else { - struct do_test_empty_args args = { - .errp = &err, - .dentry = dentry, - .arg = arg - }; - unsigned int flags = arg->flags; - - wkq_err = au_wkq_wait(call_do_test_empty, &args); - if (unlikely(wkq_err)) - err = wkq_err; - arg->flags = flags; - } - - return err; -} - -int au_test_empty_lower(struct dentry *dentry) -{ - int err; - unsigned int rdhash; - aufs_bindex_t bindex, bstart, btail; - struct au_nhash whlist; - struct test_empty_arg arg = { - .ctx = { - .actor = test_empty_cb - } - }; - int (*test_empty)(struct dentry *dentry, struct test_empty_arg *arg); - - SiMustAnyLock(dentry->d_sb); - - rdhash = au_sbi(dentry->d_sb)->si_rdhash; - if (!rdhash) - rdhash = au_rdhash_est(au_dir_size(/*file*/NULL, dentry)); - err = au_nhash_alloc(&whlist, rdhash, GFP_NOFS); - if (unlikely(err)) - goto out; - - arg.flags = 0; - arg.whlist = &whlist; - bstart = au_dbstart(dentry); - if (au_opt_test(au_mntflags(dentry->d_sb), SHWH)) - au_fset_testempty(arg.flags, SHWH); - test_empty = do_test_empty; - if (au_opt_test(au_mntflags(dentry->d_sb), DIRPERM1)) - test_empty = sio_test_empty; - arg.bindex = bstart; - err = test_empty(dentry, &arg); - if (unlikely(err)) - goto out_whlist; - - au_fset_testempty(arg.flags, WHONLY); - btail = au_dbtaildir(dentry); - for (bindex = bstart + 1; !err && bindex <= btail; bindex++) { - struct dentry *h_dentry; - - h_dentry = au_h_dptr(dentry, bindex); - if (h_dentry && d_is_positive(h_dentry)) { - arg.bindex = bindex; - err = test_empty(dentry, &arg); - } - } - -out_whlist: - au_nhash_wh_free(&whlist); -out: - return err; -} - -int au_test_empty(struct dentry *dentry, struct au_nhash *whlist) -{ - int err; - struct test_empty_arg arg = { - .ctx = { - .actor = test_empty_cb - } - }; - aufs_bindex_t bindex, btail; - - err = 0; - arg.whlist = whlist; - arg.flags = AuTestEmpty_WHONLY; - if (au_opt_test(au_mntflags(dentry->d_sb), SHWH)) - au_fset_testempty(arg.flags, SHWH); - btail = au_dbtaildir(dentry); - for (bindex = au_dbstart(dentry); !err && bindex <= btail; bindex++) { - struct dentry *h_dentry; - - h_dentry = au_h_dptr(dentry, bindex); - if (h_dentry && d_is_positive(h_dentry)) { - arg.bindex = bindex; - err = sio_test_empty(dentry, &arg); - } - } - - return err; -} - -/* ---------------------------------------------------------------------- */ - -const struct file_operations aufs_dir_fop = { - .owner = THIS_MODULE, - .llseek = default_llseek, - .read = generic_read_dir, - .iterate = aufs_iterate, - .unlocked_ioctl = aufs_ioctl_dir, -#ifdef CONFIG_COMPAT - .compat_ioctl = aufs_compat_ioctl_dir, -#endif - .open = aufs_open_dir, - .release = aufs_release_dir, - .flush = aufs_flush_dir, - .fsync = aufs_fsync_dir -}; diff --git a/fs/aufs/dir.h b/fs/aufs/dir.h deleted file mode 100644 index e4acf732c..000000000 --- a/fs/aufs/dir.h +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * directory operations - */ - -#ifndef __AUFS_DIR_H__ -#define __AUFS_DIR_H__ - -#ifdef __KERNEL__ - -#include <linux/fs.h> - -/* ---------------------------------------------------------------------- */ - -/* need to be faster and smaller */ - -struct au_nhash { - unsigned int nh_num; - struct hlist_head *nh_head; -}; - -struct au_vdir_destr { - unsigned char len; - unsigned char name[0]; -} __packed; - -struct au_vdir_dehstr { - struct hlist_node hash; - struct au_vdir_destr *str; -} ____cacheline_aligned_in_smp; - -struct au_vdir_de { - ino_t de_ino; - unsigned char de_type; - /* caution: packed */ - struct au_vdir_destr de_str; -} __packed; - -struct au_vdir_wh { - struct hlist_node wh_hash; -#ifdef CONFIG_AUFS_SHWH - ino_t wh_ino; - aufs_bindex_t wh_bindex; - unsigned char wh_type; -#else - aufs_bindex_t wh_bindex; -#endif - /* caution: packed */ - struct au_vdir_destr wh_str; -} __packed; - -union au_vdir_deblk_p { - unsigned char *deblk; - struct au_vdir_de *de; -}; - -struct au_vdir { - unsigned char **vd_deblk; - unsigned long vd_nblk; - struct { - unsigned long ul; - union au_vdir_deblk_p p; - } vd_last; - - unsigned long vd_version; - unsigned int vd_deblk_sz; - unsigned long vd_jiffy; -} ____cacheline_aligned_in_smp; - -/* ---------------------------------------------------------------------- */ - -/* dir.c */ -extern const struct file_operations aufs_dir_fop; -void au_add_nlink(struct inode *dir, struct inode *h_dir); -void au_sub_nlink(struct inode *dir, struct inode *h_dir); -loff_t au_dir_size(struct file *file, struct dentry *dentry); -void au_dir_ts(struct inode *dir, aufs_bindex_t bsrc); -int au_test_empty_lower(struct dentry *dentry); -int au_test_empty(struct dentry *dentry, struct au_nhash *whlist); - -/* vdir.c */ -unsigned int au_rdhash_est(loff_t sz); -int au_nhash_alloc(struct au_nhash *nhash, unsigned int num_hash, gfp_t gfp); -void au_nhash_wh_free(struct au_nhash *whlist); -int au_nhash_test_longer_wh(struct au_nhash *whlist, aufs_bindex_t btgt, - int limit); -int au_nhash_test_known_wh(struct au_nhash *whlist, char *name, int nlen); -int au_nhash_append_wh(struct au_nhash *whlist, char *name, int nlen, ino_t ino, - unsigned int d_type, aufs_bindex_t bindex, - unsigned char shwh); -void au_vdir_free(struct au_vdir *vdir); -int au_vdir_init(struct file *file); -int au_vdir_fill_de(struct file *file, struct dir_context *ctx); - -/* ioctl.c */ -long aufs_ioctl_dir(struct file *file, unsigned int cmd, unsigned long arg); - -#ifdef CONFIG_AUFS_RDU -/* rdu.c */ -long au_rdu_ioctl(struct file *file, unsigned int cmd, unsigned long arg); -#ifdef CONFIG_COMPAT -long au_rdu_compat_ioctl(struct file *file, unsigned int cmd, - unsigned long arg); -#endif -#else -AuStub(long, au_rdu_ioctl, return -EINVAL, struct file *file, - unsigned int cmd, unsigned long arg) -#ifdef CONFIG_COMPAT -AuStub(long, au_rdu_compat_ioctl, return -EINVAL, struct file *file, - unsigned int cmd, unsigned long arg) -#endif -#endif - -#endif /* __KERNEL__ */ -#endif /* __AUFS_DIR_H__ */ diff --git a/fs/aufs/dynop.c b/fs/aufs/dynop.c deleted file mode 100644 index a04d02f91..000000000 --- a/fs/aufs/dynop.c +++ /dev/null @@ -1,369 +0,0 @@ -/* - * Copyright (C) 2010-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * dynamically customizable operations for regular files - */ - -#include "aufs.h" - -#define DyPrSym(key) AuDbgSym(key->dk_op.dy_hop) - -/* - * How large will these lists be? - * Usually just a few elements, 20-30 at most for each, I guess. - */ -static struct au_splhead dynop[AuDyLast]; - -static struct au_dykey *dy_gfind_get(struct au_splhead *spl, const void *h_op) -{ - struct au_dykey *key, *tmp; - struct list_head *head; - - key = NULL; - head = &spl->head; - rcu_read_lock(); - list_for_each_entry_rcu(tmp, head, dk_list) - if (tmp->dk_op.dy_hop == h_op) { - key = tmp; - kref_get(&key->dk_kref); - break; - } - rcu_read_unlock(); - - return key; -} - -static struct au_dykey *dy_bradd(struct au_branch *br, struct au_dykey *key) -{ - struct au_dykey **k, *found; - const void *h_op = key->dk_op.dy_hop; - int i; - - found = NULL; - k = br->br_dykey; - for (i = 0; i < AuBrDynOp; i++) - if (k[i]) { - if (k[i]->dk_op.dy_hop == h_op) { - found = k[i]; - break; - } - } else - break; - if (!found) { - spin_lock(&br->br_dykey_lock); - for (; i < AuBrDynOp; i++) - if (k[i]) { - if (k[i]->dk_op.dy_hop == h_op) { - found = k[i]; - break; - } - } else { - k[i] = key; - break; - } - spin_unlock(&br->br_dykey_lock); - BUG_ON(i == AuBrDynOp); /* expand the array */ - } - - return found; -} - -/* kref_get() if @key is already added */ -static struct au_dykey *dy_gadd(struct au_splhead *spl, struct au_dykey *key) -{ - struct au_dykey *tmp, *found; - struct list_head *head; - const void *h_op = key->dk_op.dy_hop; - - found = NULL; - head = &spl->head; - spin_lock(&spl->spin); - list_for_each_entry(tmp, head, dk_list) - if (tmp->dk_op.dy_hop == h_op) { - kref_get(&tmp->dk_kref); - found = tmp; - break; - } - if (!found) - list_add_rcu(&key->dk_list, head); - spin_unlock(&spl->spin); - - if (!found) - DyPrSym(key); - return found; -} - -static void dy_free_rcu(struct rcu_head *rcu) -{ - struct au_dykey *key; - - key = container_of(rcu, struct au_dykey, dk_rcu); - DyPrSym(key); - kfree(key); -} - -static void dy_free(struct kref *kref) -{ - struct au_dykey *key; - struct au_splhead *spl; - - key = container_of(kref, struct au_dykey, dk_kref); - spl = dynop + key->dk_op.dy_type; - au_spl_del_rcu(&key->dk_list, spl); - call_rcu(&key->dk_rcu, dy_free_rcu); -} - -void au_dy_put(struct au_dykey *key) -{ - kref_put(&key->dk_kref, dy_free); -} - -/* ---------------------------------------------------------------------- */ - -#define DyDbgSize(cnt, op) AuDebugOn(cnt != sizeof(op)/sizeof(void *)) - -#ifdef CONFIG_AUFS_DEBUG -#define DyDbgDeclare(cnt) unsigned int cnt = 0 -#define DyDbgInc(cnt) do { cnt++; } while (0) -#else -#define DyDbgDeclare(cnt) do {} while (0) -#define DyDbgInc(cnt) do {} while (0) -#endif - -#define DySet(func, dst, src, h_op, h_sb) do { \ - DyDbgInc(cnt); \ - if (h_op->func) { \ - if (src.func) \ - dst.func = src.func; \ - else \ - AuDbg("%s %s\n", au_sbtype(h_sb), #func); \ - } \ -} while (0) - -#define DySetForce(func, dst, src) do { \ - AuDebugOn(!src.func); \ - DyDbgInc(cnt); \ - dst.func = src.func; \ -} while (0) - -#define DySetAop(func) \ - DySet(func, dyaop->da_op, aufs_aop, h_aop, h_sb) -#define DySetAopForce(func) \ - DySetForce(func, dyaop->da_op, aufs_aop) - -static void dy_aop(struct au_dykey *key, const void *h_op, - struct super_block *h_sb __maybe_unused) -{ - struct au_dyaop *dyaop = (void *)key; - const struct address_space_operations *h_aop = h_op; - DyDbgDeclare(cnt); - - AuDbg("%s\n", au_sbtype(h_sb)); - - DySetAop(writepage); - DySetAopForce(readpage); /* force */ - DySetAop(writepages); - DySetAop(set_page_dirty); - DySetAop(readpages); - DySetAop(write_begin); - DySetAop(write_end); - DySetAop(bmap); - DySetAop(invalidatepage); - DySetAop(releasepage); - DySetAop(freepage); - /* this one will be changed according to an aufs mount option */ - DySetAop(direct_IO); - DySetAop(migratepage); - DySetAop(launder_page); - DySetAop(is_partially_uptodate); - DySetAop(is_dirty_writeback); - DySetAop(error_remove_page); - DySetAop(swap_activate); - DySetAop(swap_deactivate); - - DyDbgSize(cnt, *h_aop); -} - -/* ---------------------------------------------------------------------- */ - -static void dy_bug(struct kref *kref) -{ - BUG(); -} - -static struct au_dykey *dy_get(struct au_dynop *op, struct au_branch *br) -{ - struct au_dykey *key, *old; - struct au_splhead *spl; - struct op { - unsigned int sz; - void (*set)(struct au_dykey *key, const void *h_op, - struct super_block *h_sb __maybe_unused); - }; - static const struct op a[] = { - [AuDy_AOP] = { - .sz = sizeof(struct au_dyaop), - .set = dy_aop - } - }; - const struct op *p; - - spl = dynop + op->dy_type; - key = dy_gfind_get(spl, op->dy_hop); - if (key) - goto out_add; /* success */ - - p = a + op->dy_type; - key = kzalloc(p->sz, GFP_NOFS); - if (unlikely(!key)) { - key = ERR_PTR(-ENOMEM); - goto out; - } - - key->dk_op.dy_hop = op->dy_hop; - kref_init(&key->dk_kref); - p->set(key, op->dy_hop, au_br_sb(br)); - old = dy_gadd(spl, key); - if (old) { - kfree(key); - key = old; - } - -out_add: - old = dy_bradd(br, key); - if (old) - /* its ref-count should never be zero here */ - kref_put(&key->dk_kref, dy_bug); -out: - return key; -} - -/* ---------------------------------------------------------------------- */ -/* - * Aufs prohibits O_DIRECT by defaut even if the branch supports it. - * This behaviour is necessary to return an error from open(O_DIRECT) instead - * of the succeeding I/O. The dio mount option enables O_DIRECT and makes - * open(O_DIRECT) always succeed, but the succeeding I/O may return an error. - * See the aufs manual in detail. - */ -static void dy_adx(struct au_dyaop *dyaop, int do_dx) -{ - if (!do_dx) - dyaop->da_op.direct_IO = NULL; - else - dyaop->da_op.direct_IO = aufs_aop.direct_IO; -} - -static struct au_dyaop *dy_aget(struct au_branch *br, - const struct address_space_operations *h_aop, - int do_dx) -{ - struct au_dyaop *dyaop; - struct au_dynop op; - - op.dy_type = AuDy_AOP; - op.dy_haop = h_aop; - dyaop = (void *)dy_get(&op, br); - if (IS_ERR(dyaop)) - goto out; - dy_adx(dyaop, do_dx); - -out: - return dyaop; -} - -int au_dy_iaop(struct inode *inode, aufs_bindex_t bindex, - struct inode *h_inode) -{ - int err, do_dx; - struct super_block *sb; - struct au_branch *br; - struct au_dyaop *dyaop; - - AuDebugOn(!S_ISREG(h_inode->i_mode)); - IiMustWriteLock(inode); - - sb = inode->i_sb; - br = au_sbr(sb, bindex); - do_dx = !!au_opt_test(au_mntflags(sb), DIO); - dyaop = dy_aget(br, h_inode->i_mapping->a_ops, do_dx); - err = PTR_ERR(dyaop); - if (IS_ERR(dyaop)) - /* unnecessary to call dy_fput() */ - goto out; - - err = 0; - inode->i_mapping->a_ops = &dyaop->da_op; - -out: - return err; -} - -/* - * Is it safe to replace a_ops during the inode/file is in operation? - * Yes, I hope so. - */ -int au_dy_irefresh(struct inode *inode) -{ - int err; - aufs_bindex_t bstart; - struct inode *h_inode; - - err = 0; - if (S_ISREG(inode->i_mode)) { - bstart = au_ibstart(inode); - h_inode = au_h_iptr(inode, bstart); - err = au_dy_iaop(inode, bstart, h_inode); - } - return err; -} - -void au_dy_arefresh(int do_dx) -{ - struct au_splhead *spl; - struct list_head *head; - struct au_dykey *key; - - spl = dynop + AuDy_AOP; - head = &spl->head; - spin_lock(&spl->spin); - list_for_each_entry(key, head, dk_list) - dy_adx((void *)key, do_dx); - spin_unlock(&spl->spin); -} - -/* ---------------------------------------------------------------------- */ - -void __init au_dy_init(void) -{ - int i; - - /* make sure that 'struct au_dykey *' can be any type */ - BUILD_BUG_ON(offsetof(struct au_dyaop, da_key)); - - for (i = 0; i < AuDyLast; i++) - au_spl_init(dynop + i); -} - -void au_dy_fin(void) -{ - int i; - - for (i = 0; i < AuDyLast; i++) - WARN_ON(!list_empty(&dynop[i].head)); -} diff --git a/fs/aufs/dynop.h b/fs/aufs/dynop.h deleted file mode 100644 index 724dddab1..000000000 --- a/fs/aufs/dynop.h +++ /dev/null @@ -1,74 +0,0 @@ -/* - * Copyright (C) 2010-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * dynamically customizable operations (for regular files only) - */ - -#ifndef __AUFS_DYNOP_H__ -#define __AUFS_DYNOP_H__ - -#ifdef __KERNEL__ - -#include <linux/fs.h> -#include <linux/kref.h> - -enum {AuDy_AOP, AuDyLast}; - -struct au_dynop { - int dy_type; - union { - const void *dy_hop; - const struct address_space_operations *dy_haop; - }; -}; - -struct au_dykey { - union { - struct list_head dk_list; - struct rcu_head dk_rcu; - }; - struct au_dynop dk_op; - - /* - * during I am in the branch local array, kref is gotten. when the - * branch is removed, kref is put. - */ - struct kref dk_kref; -}; - -/* stop unioning since their sizes are very different from each other */ -struct au_dyaop { - struct au_dykey da_key; - struct address_space_operations da_op; /* not const */ -}; - -/* ---------------------------------------------------------------------- */ - -/* dynop.c */ -struct au_branch; -void au_dy_put(struct au_dykey *key); -int au_dy_iaop(struct inode *inode, aufs_bindex_t bindex, - struct inode *h_inode); -int au_dy_irefresh(struct inode *inode); -void au_dy_arefresh(int do_dio); - -void __init au_dy_init(void); -void au_dy_fin(void); - -#endif /* __KERNEL__ */ -#endif /* __AUFS_DYNOP_H__ */ diff --git a/fs/aufs/export.c b/fs/aufs/export.c deleted file mode 100644 index c85b036b8..000000000 --- a/fs/aufs/export.c +++ /dev/null @@ -1,832 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * export via nfs - */ - -#include <linux/exportfs.h> -#include <linux/fs_struct.h> -#include <linux/namei.h> -#include <linux/nsproxy.h> -#include <linux/random.h> -#include <linux/writeback.h> -#include "../fs/mount.h" -#include "aufs.h" - -union conv { -#ifdef CONFIG_AUFS_INO_T_64 - __u32 a[2]; -#else - __u32 a[1]; -#endif - ino_t ino; -}; - -static ino_t decode_ino(__u32 *a) -{ - union conv u; - - BUILD_BUG_ON(sizeof(u.ino) != sizeof(u.a)); - u.a[0] = a[0]; -#ifdef CONFIG_AUFS_INO_T_64 - u.a[1] = a[1]; -#endif - return u.ino; -} - -static void encode_ino(__u32 *a, ino_t ino) -{ - union conv u; - - u.ino = ino; - a[0] = u.a[0]; -#ifdef CONFIG_AUFS_INO_T_64 - a[1] = u.a[1]; -#endif -} - -/* NFS file handle */ -enum { - Fh_br_id, - Fh_sigen, -#ifdef CONFIG_AUFS_INO_T_64 - /* support 64bit inode number */ - Fh_ino1, - Fh_ino2, - Fh_dir_ino1, - Fh_dir_ino2, -#else - Fh_ino1, - Fh_dir_ino1, -#endif - Fh_igen, - Fh_h_type, - Fh_tail, - - Fh_ino = Fh_ino1, - Fh_dir_ino = Fh_dir_ino1 -}; - -static int au_test_anon(struct dentry *dentry) -{ - /* note: read d_flags without d_lock */ - return !!(dentry->d_flags & DCACHE_DISCONNECTED); -} - -int au_test_nfsd(void) -{ - int ret; - struct task_struct *tsk = current; - char comm[sizeof(tsk->comm)]; - - ret = 0; - if (tsk->flags & PF_KTHREAD) { - get_task_comm(comm, tsk); - ret = !strcmp(comm, "nfsd"); - } - - return ret; -} - -/* ---------------------------------------------------------------------- */ -/* inode generation external table */ - -void au_xigen_inc(struct inode *inode) -{ - loff_t pos; - ssize_t sz; - __u32 igen; - struct super_block *sb; - struct au_sbinfo *sbinfo; - - sb = inode->i_sb; - AuDebugOn(!au_opt_test(au_mntflags(sb), XINO)); - - sbinfo = au_sbi(sb); - pos = inode->i_ino; - pos *= sizeof(igen); - igen = inode->i_generation + 1; - sz = xino_fwrite(sbinfo->si_xwrite, sbinfo->si_xigen, &igen, - sizeof(igen), &pos); - if (sz == sizeof(igen)) - return; /* success */ - - if (unlikely(sz >= 0)) - AuIOErr("xigen error (%zd)\n", sz); -} - -int au_xigen_new(struct inode *inode) -{ - int err; - loff_t pos; - ssize_t sz; - struct super_block *sb; - struct au_sbinfo *sbinfo; - struct file *file; - - err = 0; - /* todo: dirty, at mount time */ - if (inode->i_ino == AUFS_ROOT_INO) - goto out; - sb = inode->i_sb; - SiMustAnyLock(sb); - if (unlikely(!au_opt_test(au_mntflags(sb), XINO))) - goto out; - - err = -EFBIG; - pos = inode->i_ino; - if (unlikely(au_loff_max / sizeof(inode->i_generation) - 1 < pos)) { - AuIOErr1("too large i%lld\n", pos); - goto out; - } - pos *= sizeof(inode->i_generation); - - err = 0; - sbinfo = au_sbi(sb); - file = sbinfo->si_xigen; - BUG_ON(!file); - - if (vfsub_f_size_read(file) - < pos + sizeof(inode->i_generation)) { - inode->i_generation = atomic_inc_return(&sbinfo->si_xigen_next); - sz = xino_fwrite(sbinfo->si_xwrite, file, &inode->i_generation, - sizeof(inode->i_generation), &pos); - } else - sz = xino_fread(sbinfo->si_xread, file, &inode->i_generation, - sizeof(inode->i_generation), &pos); - if (sz == sizeof(inode->i_generation)) - goto out; /* success */ - - err = sz; - if (unlikely(sz >= 0)) { - err = -EIO; - AuIOErr("xigen error (%zd)\n", sz); - } - -out: - return err; -} - -int au_xigen_set(struct super_block *sb, struct file *base) -{ - int err; - struct au_sbinfo *sbinfo; - struct file *file; - - SiMustWriteLock(sb); - - sbinfo = au_sbi(sb); - file = au_xino_create2(base, sbinfo->si_xigen); - err = PTR_ERR(file); - if (IS_ERR(file)) - goto out; - err = 0; - if (sbinfo->si_xigen) - fput(sbinfo->si_xigen); - sbinfo->si_xigen = file; - -out: - return err; -} - -void au_xigen_clr(struct super_block *sb) -{ - struct au_sbinfo *sbinfo; - - SiMustWriteLock(sb); - - sbinfo = au_sbi(sb); - if (sbinfo->si_xigen) { - fput(sbinfo->si_xigen); - sbinfo->si_xigen = NULL; - } -} - -/* ---------------------------------------------------------------------- */ - -static struct dentry *decode_by_ino(struct super_block *sb, ino_t ino, - ino_t dir_ino) -{ - struct dentry *dentry, *d; - struct inode *inode; - unsigned int sigen; - - dentry = NULL; - inode = ilookup(sb, ino); - if (!inode) - goto out; - - dentry = ERR_PTR(-ESTALE); - sigen = au_sigen(sb); - if (unlikely(is_bad_inode(inode) - || IS_DEADDIR(inode) - || sigen != au_iigen(inode, NULL))) - goto out_iput; - - dentry = NULL; - if (!dir_ino || S_ISDIR(inode->i_mode)) - dentry = d_find_alias(inode); - else { - spin_lock(&inode->i_lock); - hlist_for_each_entry(d, &inode->i_dentry, d_u.d_alias) { - spin_lock(&d->d_lock); - if (!au_test_anon(d) - && d_inode(d->d_parent)->i_ino == dir_ino) { - dentry = dget_dlock(d); - spin_unlock(&d->d_lock); - break; - } - spin_unlock(&d->d_lock); - } - spin_unlock(&inode->i_lock); - } - if (unlikely(dentry && au_digen_test(dentry, sigen))) { - /* need to refresh */ - dput(dentry); - dentry = NULL; - } - -out_iput: - iput(inode); -out: - AuTraceErrPtr(dentry); - return dentry; -} - -/* ---------------------------------------------------------------------- */ - -/* todo: dirty? */ -/* if exportfs_decode_fh() passed vfsmount*, we could be happy */ - -struct au_compare_mnt_args { - /* input */ - struct super_block *sb; - - /* output */ - struct vfsmount *mnt; -}; - -static int au_compare_mnt(struct vfsmount *mnt, void *arg) -{ - struct au_compare_mnt_args *a = arg; - - if (mnt->mnt_sb != a->sb) - return 0; - a->mnt = mntget(mnt); - return 1; -} - -static struct vfsmount *au_mnt_get(struct super_block *sb) -{ - int err; - struct path root; - struct au_compare_mnt_args args = { - .sb = sb - }; - - get_fs_root(current->fs, &root); - rcu_read_lock(); - err = iterate_mounts(au_compare_mnt, &args, root.mnt); - rcu_read_unlock(); - path_put(&root); - AuDebugOn(!err); - AuDebugOn(!args.mnt); - return args.mnt; -} - -struct au_nfsd_si_lock { - unsigned int sigen; - aufs_bindex_t bindex, br_id; - unsigned char force_lock; -}; - -static int si_nfsd_read_lock(struct super_block *sb, - struct au_nfsd_si_lock *nsi_lock) -{ - int err; - aufs_bindex_t bindex; - - si_read_lock(sb, AuLock_FLUSH); - - /* branch id may be wrapped around */ - err = 0; - bindex = au_br_index(sb, nsi_lock->br_id); - if (bindex >= 0 && nsi_lock->sigen + AUFS_BRANCH_MAX > au_sigen(sb)) - goto out; /* success */ - - err = -ESTALE; - bindex = -1; - if (!nsi_lock->force_lock) - si_read_unlock(sb); - -out: - nsi_lock->bindex = bindex; - return err; -} - -struct find_name_by_ino { - struct dir_context ctx; - int called, found; - ino_t ino; - char *name; - int namelen; -}; - -static int -find_name_by_ino(struct dir_context *ctx, const char *name, int namelen, - loff_t offset, u64 ino, unsigned int d_type) -{ - struct find_name_by_ino *a = container_of(ctx, struct find_name_by_ino, - ctx); - - a->called++; - if (a->ino != ino) - return 0; - - memcpy(a->name, name, namelen); - a->namelen = namelen; - a->found = 1; - return 1; -} - -static struct dentry *au_lkup_by_ino(struct path *path, ino_t ino, - struct au_nfsd_si_lock *nsi_lock) -{ - struct dentry *dentry, *parent; - struct file *file; - struct inode *dir; - struct find_name_by_ino arg = { - .ctx = { - .actor = find_name_by_ino - } - }; - int err; - - parent = path->dentry; - if (nsi_lock) - si_read_unlock(parent->d_sb); - file = vfsub_dentry_open(path, au_dir_roflags); - dentry = (void *)file; - if (IS_ERR(file)) - goto out; - - dentry = ERR_PTR(-ENOMEM); - arg.name = (void *)__get_free_page(GFP_NOFS); - if (unlikely(!arg.name)) - goto out_file; - arg.ino = ino; - arg.found = 0; - do { - arg.called = 0; - /* smp_mb(); */ - err = vfsub_iterate_dir(file, &arg.ctx); - } while (!err && !arg.found && arg.called); - dentry = ERR_PTR(err); - if (unlikely(err)) - goto out_name; - /* instead of ENOENT */ - dentry = ERR_PTR(-ESTALE); - if (!arg.found) - goto out_name; - - /* do not call vfsub_lkup_one() */ - dir = d_inode(parent); - mutex_lock(&dir->i_mutex); - dentry = vfsub_lookup_one_len(arg.name, parent, arg.namelen); - mutex_unlock(&dir->i_mutex); - AuTraceErrPtr(dentry); - if (IS_ERR(dentry)) - goto out_name; - AuDebugOn(au_test_anon(dentry)); - if (unlikely(d_really_is_negative(dentry))) { - dput(dentry); - dentry = ERR_PTR(-ENOENT); - } - -out_name: - free_page((unsigned long)arg.name); -out_file: - fput(file); -out: - if (unlikely(nsi_lock - && si_nfsd_read_lock(parent->d_sb, nsi_lock) < 0)) - if (!IS_ERR(dentry)) { - dput(dentry); - dentry = ERR_PTR(-ESTALE); - } - AuTraceErrPtr(dentry); - return dentry; -} - -static struct dentry *decode_by_dir_ino(struct super_block *sb, ino_t ino, - ino_t dir_ino, - struct au_nfsd_si_lock *nsi_lock) -{ - struct dentry *dentry; - struct path path; - - if (dir_ino != AUFS_ROOT_INO) { - path.dentry = decode_by_ino(sb, dir_ino, 0); - dentry = path.dentry; - if (!path.dentry || IS_ERR(path.dentry)) - goto out; - AuDebugOn(au_test_anon(path.dentry)); - } else - path.dentry = dget(sb->s_root); - - path.mnt = au_mnt_get(sb); - dentry = au_lkup_by_ino(&path, ino, nsi_lock); - path_put(&path); - -out: - AuTraceErrPtr(dentry); - return dentry; -} - -/* ---------------------------------------------------------------------- */ - -static int h_acceptable(void *expv, struct dentry *dentry) -{ - return 1; -} - -static char *au_build_path(struct dentry *h_parent, struct path *h_rootpath, - char *buf, int len, struct super_block *sb) -{ - char *p; - int n; - struct path path; - - p = d_path(h_rootpath, buf, len); - if (IS_ERR(p)) - goto out; - n = strlen(p); - - path.mnt = h_rootpath->mnt; - path.dentry = h_parent; - p = d_path(&path, buf, len); - if (IS_ERR(p)) - goto out; - if (n != 1) - p += n; - - path.mnt = au_mnt_get(sb); - path.dentry = sb->s_root; - p = d_path(&path, buf, len - strlen(p)); - mntput(path.mnt); - if (IS_ERR(p)) - goto out; - if (n != 1) - p[strlen(p)] = '/'; - -out: - AuTraceErrPtr(p); - return p; -} - -static -struct dentry *decode_by_path(struct super_block *sb, ino_t ino, __u32 *fh, - int fh_len, struct au_nfsd_si_lock *nsi_lock) -{ - struct dentry *dentry, *h_parent, *root; - struct super_block *h_sb; - char *pathname, *p; - struct vfsmount *h_mnt; - struct au_branch *br; - int err; - struct path path; - - br = au_sbr(sb, nsi_lock->bindex); - h_mnt = au_br_mnt(br); - h_sb = h_mnt->mnt_sb; - /* todo: call lower fh_to_dentry()? fh_to_parent()? */ - h_parent = exportfs_decode_fh(h_mnt, (void *)(fh + Fh_tail), - fh_len - Fh_tail, fh[Fh_h_type], - h_acceptable, /*context*/NULL); - dentry = h_parent; - if (unlikely(!h_parent || IS_ERR(h_parent))) { - AuWarn1("%s decode_fh failed, %ld\n", - au_sbtype(h_sb), PTR_ERR(h_parent)); - goto out; - } - dentry = NULL; - if (unlikely(au_test_anon(h_parent))) { - AuWarn1("%s decode_fh returned a disconnected dentry\n", - au_sbtype(h_sb)); - goto out_h_parent; - } - - dentry = ERR_PTR(-ENOMEM); - pathname = (void *)__get_free_page(GFP_NOFS); - if (unlikely(!pathname)) - goto out_h_parent; - - root = sb->s_root; - path.mnt = h_mnt; - di_read_lock_parent(root, !AuLock_IR); - path.dentry = au_h_dptr(root, nsi_lock->bindex); - di_read_unlock(root, !AuLock_IR); - p = au_build_path(h_parent, &path, pathname, PAGE_SIZE, sb); - dentry = (void *)p; - if (IS_ERR(p)) - goto out_pathname; - - si_read_unlock(sb); - err = vfsub_kern_path(p, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path); - dentry = ERR_PTR(err); - if (unlikely(err)) - goto out_relock; - - dentry = ERR_PTR(-ENOENT); - AuDebugOn(au_test_anon(path.dentry)); - if (unlikely(d_really_is_negative(path.dentry))) - goto out_path; - - if (ino != d_inode(path.dentry)->i_ino) - dentry = au_lkup_by_ino(&path, ino, /*nsi_lock*/NULL); - else - dentry = dget(path.dentry); - -out_path: - path_put(&path); -out_relock: - if (unlikely(si_nfsd_read_lock(sb, nsi_lock) < 0)) - if (!IS_ERR(dentry)) { - dput(dentry); - dentry = ERR_PTR(-ESTALE); - } -out_pathname: - free_page((unsigned long)pathname); -out_h_parent: - dput(h_parent); -out: - AuTraceErrPtr(dentry); - return dentry; -} - -/* ---------------------------------------------------------------------- */ - -static struct dentry * -aufs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, - int fh_type) -{ - struct dentry *dentry; - __u32 *fh = fid->raw; - struct au_branch *br; - ino_t ino, dir_ino; - struct au_nfsd_si_lock nsi_lock = { - .force_lock = 0 - }; - - dentry = ERR_PTR(-ESTALE); - /* it should never happen, but the file handle is unreliable */ - if (unlikely(fh_len < Fh_tail)) - goto out; - nsi_lock.sigen = fh[Fh_sigen]; - nsi_lock.br_id = fh[Fh_br_id]; - - /* branch id may be wrapped around */ - br = NULL; - if (unlikely(si_nfsd_read_lock(sb, &nsi_lock))) - goto out; - nsi_lock.force_lock = 1; - - /* is this inode still cached? */ - ino = decode_ino(fh + Fh_ino); - /* it should never happen */ - if (unlikely(ino == AUFS_ROOT_INO)) - goto out; - - dir_ino = decode_ino(fh + Fh_dir_ino); - dentry = decode_by_ino(sb, ino, dir_ino); - if (IS_ERR(dentry)) - goto out_unlock; - if (dentry) - goto accept; - - /* is the parent dir cached? */ - br = au_sbr(sb, nsi_lock.bindex); - atomic_inc(&br->br_count); - dentry = decode_by_dir_ino(sb, ino, dir_ino, &nsi_lock); - if (IS_ERR(dentry)) - goto out_unlock; - if (dentry) - goto accept; - - /* lookup path */ - dentry = decode_by_path(sb, ino, fh, fh_len, &nsi_lock); - if (IS_ERR(dentry)) - goto out_unlock; - if (unlikely(!dentry)) - /* todo?: make it ESTALE */ - goto out_unlock; - -accept: - if (!au_digen_test(dentry, au_sigen(sb)) - && d_inode(dentry)->i_generation == fh[Fh_igen]) - goto out_unlock; /* success */ - - dput(dentry); - dentry = ERR_PTR(-ESTALE); -out_unlock: - if (br) - atomic_dec(&br->br_count); - si_read_unlock(sb); -out: - AuTraceErrPtr(dentry); - return dentry; -} - -#if 0 /* reserved for future use */ -/* support subtreecheck option */ -static struct dentry *aufs_fh_to_parent(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type) -{ - struct dentry *parent; - __u32 *fh = fid->raw; - ino_t dir_ino; - - dir_ino = decode_ino(fh + Fh_dir_ino); - parent = decode_by_ino(sb, dir_ino, 0); - if (IS_ERR(parent)) - goto out; - if (!parent) - parent = decode_by_path(sb, au_br_index(sb, fh[Fh_br_id]), - dir_ino, fh, fh_len); - -out: - AuTraceErrPtr(parent); - return parent; -} -#endif - -/* ---------------------------------------------------------------------- */ - -static int aufs_encode_fh(struct inode *inode, __u32 *fh, int *max_len, - struct inode *dir) -{ - int err; - aufs_bindex_t bindex; - struct super_block *sb, *h_sb; - struct dentry *dentry, *parent, *h_parent; - struct inode *h_dir; - struct au_branch *br; - - err = -ENOSPC; - if (unlikely(*max_len <= Fh_tail)) { - AuWarn1("NFSv2 client (max_len %d)?\n", *max_len); - goto out; - } - - err = FILEID_ROOT; - if (inode->i_ino == AUFS_ROOT_INO) { - AuDebugOn(inode->i_ino != AUFS_ROOT_INO); - goto out; - } - - h_parent = NULL; - sb = inode->i_sb; - err = si_read_lock(sb, AuLock_FLUSH); - if (unlikely(err)) - goto out; - -#ifdef CONFIG_AUFS_DEBUG - if (unlikely(!au_opt_test(au_mntflags(sb), XINO))) - AuWarn1("NFS-exporting requires xino\n"); -#endif - err = -EIO; - parent = NULL; - ii_read_lock_child(inode); - bindex = au_ibstart(inode); - if (!dir) { - dentry = d_find_any_alias(inode); - if (unlikely(!dentry)) - goto out_unlock; - AuDebugOn(au_test_anon(dentry)); - parent = dget_parent(dentry); - dput(dentry); - if (unlikely(!parent)) - goto out_unlock; - if (d_really_is_positive(parent)) - dir = d_inode(parent); - } - - ii_read_lock_parent(dir); - h_dir = au_h_iptr(dir, bindex); - ii_read_unlock(dir); - if (unlikely(!h_dir)) - goto out_parent; - h_parent = d_find_any_alias(h_dir); - if (unlikely(!h_parent)) - goto out_hparent; - - err = -EPERM; - br = au_sbr(sb, bindex); - h_sb = au_br_sb(br); - if (unlikely(!h_sb->s_export_op)) { - AuErr1("%s branch is not exportable\n", au_sbtype(h_sb)); - goto out_hparent; - } - - fh[Fh_br_id] = br->br_id; - fh[Fh_sigen] = au_sigen(sb); - encode_ino(fh + Fh_ino, inode->i_ino); - encode_ino(fh + Fh_dir_ino, dir->i_ino); - fh[Fh_igen] = inode->i_generation; - - *max_len -= Fh_tail; - fh[Fh_h_type] = exportfs_encode_fh(h_parent, (void *)(fh + Fh_tail), - max_len, - /*connectable or subtreecheck*/0); - err = fh[Fh_h_type]; - *max_len += Fh_tail; - /* todo: macros? */ - if (err != FILEID_INVALID) - err = 99; - else - AuWarn1("%s encode_fh failed\n", au_sbtype(h_sb)); - -out_hparent: - dput(h_parent); -out_parent: - dput(parent); -out_unlock: - ii_read_unlock(inode); - si_read_unlock(sb); -out: - if (unlikely(err < 0)) - err = FILEID_INVALID; - return err; -} - -/* ---------------------------------------------------------------------- */ - -static int aufs_commit_metadata(struct inode *inode) -{ - int err; - aufs_bindex_t bindex; - struct super_block *sb; - struct inode *h_inode; - int (*f)(struct inode *inode); - - sb = inode->i_sb; - si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW); - ii_write_lock_child(inode); - bindex = au_ibstart(inode); - AuDebugOn(bindex < 0); - h_inode = au_h_iptr(inode, bindex); - - f = h_inode->i_sb->s_export_op->commit_metadata; - if (f) - err = f(h_inode); - else { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = 0 /* metadata only */ - }; - - err = sync_inode(h_inode, &wbc); - } - - au_cpup_attr_timesizes(inode); - ii_write_unlock(inode); - si_read_unlock(sb); - return err; -} - -/* ---------------------------------------------------------------------- */ - -static struct export_operations aufs_export_op = { - .fh_to_dentry = aufs_fh_to_dentry, - /* .fh_to_parent = aufs_fh_to_parent, */ - .encode_fh = aufs_encode_fh, - .commit_metadata = aufs_commit_metadata -}; - -void au_export_init(struct super_block *sb) -{ - struct au_sbinfo *sbinfo; - __u32 u; - - sb->s_export_op = &aufs_export_op; - sbinfo = au_sbi(sb); - sbinfo->si_xigen = NULL; - get_random_bytes(&u, sizeof(u)); - BUILD_BUG_ON(sizeof(u) != sizeof(int)); - atomic_set(&sbinfo->si_xigen_next, u); -} diff --git a/fs/aufs/f_op.c b/fs/aufs/f_op.c deleted file mode 100644 index 34037c708..000000000 --- a/fs/aufs/f_op.c +++ /dev/null @@ -1,738 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * file and vm operations - */ - -#include <linux/aio.h> -#include <linux/fs_stack.h> -#include <linux/mman.h> -#include <linux/security.h> -#include "aufs.h" - -int au_do_open_nondir(struct file *file, int flags, struct file *h_file) -{ - int err; - aufs_bindex_t bindex; - struct dentry *dentry; - struct au_finfo *finfo; - struct inode *h_inode; - - FiMustWriteLock(file); - - err = 0; - dentry = file->f_path.dentry; - AuDebugOn(IS_ERR_OR_NULL(dentry)); - finfo = au_fi(file); - memset(&finfo->fi_htop, 0, sizeof(finfo->fi_htop)); - atomic_set(&finfo->fi_mmapped, 0); - bindex = au_dbstart(dentry); - if (!h_file) - h_file = au_h_open(dentry, bindex, flags, file, /*force_wr*/0); - else - get_file(h_file); - if (IS_ERR(h_file)) - err = PTR_ERR(h_file); - else { - if ((flags & __O_TMPFILE) - && !(flags & O_EXCL)) { - h_inode = file_inode(h_file); - spin_lock(&h_inode->i_lock); - h_inode->i_state |= I_LINKABLE; - spin_unlock(&h_inode->i_lock); - } - au_set_fbstart(file, bindex); - au_set_h_fptr(file, bindex, h_file); - au_update_figen(file); - /* todo: necessary? */ - /* file->f_ra = h_file->f_ra; */ - } - - return err; -} - -static int aufs_open_nondir(struct inode *inode __maybe_unused, - struct file *file) -{ - int err; - struct super_block *sb; - struct au_do_open_args args = { - .open = au_do_open_nondir - }; - - AuDbg("%pD, f_flags 0x%x, f_mode 0x%x\n", - file, vfsub_file_flags(file), file->f_mode); - - sb = file->f_path.dentry->d_sb; - si_read_lock(sb, AuLock_FLUSH); - err = au_do_open(file, &args); - si_read_unlock(sb); - return err; -} - -int aufs_release_nondir(struct inode *inode __maybe_unused, struct file *file) -{ - struct au_finfo *finfo; - aufs_bindex_t bindex; - - finfo = au_fi(file); - au_sphl_del(&finfo->fi_hlist, - &au_sbi(file->f_path.dentry->d_sb)->si_files); - bindex = finfo->fi_btop; - if (bindex >= 0) - au_set_h_fptr(file, bindex, NULL); - - au_finfo_fin(file); - return 0; -} - -/* ---------------------------------------------------------------------- */ - -static int au_do_flush_nondir(struct file *file, fl_owner_t id) -{ - int err; - struct file *h_file; - - err = 0; - h_file = au_hf_top(file); - if (h_file) - err = vfsub_flush(h_file, id); - return err; -} - -static int aufs_flush_nondir(struct file *file, fl_owner_t id) -{ - return au_do_flush(file, id, au_do_flush_nondir); -} - -/* ---------------------------------------------------------------------- */ -/* - * read and write functions acquire [fdi]_rwsem once, but release before - * mmap_sem. This is because to stop a race condition between mmap(2). - * Releasing these aufs-rwsem should be safe, no branch-mamagement (by keeping - * si_rwsem), no harmful copy-up should happen. Actually copy-up may happen in - * read functions after [fdi]_rwsem are released, but it should be harmless. - */ - -/* Callers should call au_read_post() or fput() in the end */ -struct file *au_read_pre(struct file *file, int keep_fi) -{ - struct file *h_file; - int err; - - err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/0); - if (!err) { - di_read_unlock(file->f_path.dentry, AuLock_IR); - h_file = au_hf_top(file); - get_file(h_file); - if (!keep_fi) - fi_read_unlock(file); - } else - h_file = ERR_PTR(err); - - return h_file; -} - -static void au_read_post(struct inode *inode, struct file *h_file) -{ - /* update without lock, I don't think it a problem */ - fsstack_copy_attr_atime(inode, file_inode(h_file)); - fput(h_file); -} - -struct au_write_pre { - blkcnt_t blks; - aufs_bindex_t bstart; -}; - -/* - * return with iinfo is write-locked - * callers should call au_write_post() or iinfo_write_unlock() + fput() in the - * end - */ -static struct file *au_write_pre(struct file *file, int do_ready, - struct au_write_pre *wpre) -{ - struct file *h_file; - struct dentry *dentry; - int err; - struct au_pin pin; - - err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1); - h_file = ERR_PTR(err); - if (unlikely(err)) - goto out; - - dentry = file->f_path.dentry; - if (do_ready) { - err = au_ready_to_write(file, -1, &pin); - if (unlikely(err)) { - h_file = ERR_PTR(err); - di_write_unlock(dentry); - goto out_fi; - } - } - - di_downgrade_lock(dentry, /*flags*/0); - if (wpre) - wpre->bstart = au_fbstart(file); - h_file = au_hf_top(file); - get_file(h_file); - if (wpre) - wpre->blks = file_inode(h_file)->i_blocks; - if (do_ready) - au_unpin(&pin); - di_read_unlock(dentry, /*flags*/0); - -out_fi: - fi_write_unlock(file); -out: - return h_file; -} - -static void au_write_post(struct inode *inode, struct file *h_file, - struct au_write_pre *wpre, ssize_t written) -{ - struct inode *h_inode; - - au_cpup_attr_timesizes(inode); - AuDebugOn(au_ibstart(inode) != wpre->bstart); - h_inode = file_inode(h_file); - inode->i_mode = h_inode->i_mode; - ii_write_unlock(inode); - fput(h_file); - - /* AuDbg("blks %llu, %llu\n", (u64)blks, (u64)h_inode->i_blocks); */ - if (written > 0) - au_fhsm_wrote(inode->i_sb, wpre->bstart, - /*force*/h_inode->i_blocks > wpre->blks); -} - -static ssize_t aufs_read(struct file *file, char __user *buf, size_t count, - loff_t *ppos) -{ - ssize_t err; - struct inode *inode; - struct file *h_file; - struct super_block *sb; - - inode = file_inode(file); - sb = inode->i_sb; - si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW); - - h_file = au_read_pre(file, /*keep_fi*/0); - err = PTR_ERR(h_file); - if (IS_ERR(h_file)) - goto out; - - /* filedata may be obsoleted by concurrent copyup, but no problem */ - err = vfsub_read_u(h_file, buf, count, ppos); - /* todo: necessary? */ - /* file->f_ra = h_file->f_ra; */ - au_read_post(inode, h_file); - -out: - si_read_unlock(sb); - return err; -} - -/* - * todo: very ugly - * it locks both of i_mutex and si_rwsem for read in safe. - * if the plink maintenance mode continues forever (that is the problem), - * may loop forever. - */ -static void au_mtx_and_read_lock(struct inode *inode) -{ - int err; - struct super_block *sb = inode->i_sb; - - while (1) { - mutex_lock(&inode->i_mutex); - err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM); - if (!err) - break; - mutex_unlock(&inode->i_mutex); - si_read_lock(sb, AuLock_NOPLMW); - si_read_unlock(sb); - } -} - -static ssize_t aufs_write(struct file *file, const char __user *ubuf, - size_t count, loff_t *ppos) -{ - ssize_t err; - struct au_write_pre wpre; - struct inode *inode; - struct file *h_file; - char __user *buf = (char __user *)ubuf; - - inode = file_inode(file); - au_mtx_and_read_lock(inode); - - h_file = au_write_pre(file, /*do_ready*/1, &wpre); - err = PTR_ERR(h_file); - if (IS_ERR(h_file)) - goto out; - - err = vfsub_write_u(h_file, buf, count, ppos); - au_write_post(inode, h_file, &wpre, err); - -out: - si_read_unlock(inode->i_sb); - mutex_unlock(&inode->i_mutex); - return err; -} - -static ssize_t au_do_iter(struct file *h_file, int rw, struct kiocb *kio, - struct iov_iter *iov_iter) -{ - ssize_t err; - struct file *file; - ssize_t (*iter)(struct kiocb *, struct iov_iter *); - - err = security_file_permission(h_file, rw); - if (unlikely(err)) - goto out; - - err = -ENOSYS; - iter = NULL; - if (rw == MAY_READ) - iter = h_file->f_op->read_iter; - else if (rw == MAY_WRITE) - iter = h_file->f_op->write_iter; - - file = kio->ki_filp; - kio->ki_filp = h_file; - if (iter) { - lockdep_off(); - err = iter(kio, iov_iter); - lockdep_on(); - } else - /* currently there is no such fs */ - WARN_ON_ONCE(1); - kio->ki_filp = file; - -out: - return err; -} - -static ssize_t aufs_read_iter(struct kiocb *kio, struct iov_iter *iov_iter) -{ - ssize_t err; - struct file *file, *h_file; - struct inode *inode; - struct super_block *sb; - - file = kio->ki_filp; - inode = file_inode(file); - sb = inode->i_sb; - si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW); - - h_file = au_read_pre(file, /*keep_fi*/0); - err = PTR_ERR(h_file); - if (IS_ERR(h_file)) - goto out; - - err = au_do_iter(h_file, MAY_READ, kio, iov_iter); - /* todo: necessary? */ - /* file->f_ra = h_file->f_ra; */ - au_read_post(inode, h_file); - -out: - si_read_unlock(sb); - return err; -} - -static ssize_t aufs_write_iter(struct kiocb *kio, struct iov_iter *iov_iter) -{ - ssize_t err; - struct au_write_pre wpre; - struct inode *inode; - struct file *file, *h_file; - - file = kio->ki_filp; - inode = file_inode(file); - au_mtx_and_read_lock(inode); - - h_file = au_write_pre(file, /*do_ready*/1, &wpre); - err = PTR_ERR(h_file); - if (IS_ERR(h_file)) - goto out; - - err = au_do_iter(h_file, MAY_WRITE, kio, iov_iter); - au_write_post(inode, h_file, &wpre, err); - -out: - si_read_unlock(inode->i_sb); - mutex_unlock(&inode->i_mutex); - return err; -} - -static ssize_t aufs_splice_read(struct file *file, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) -{ - ssize_t err; - struct file *h_file; - struct inode *inode; - struct super_block *sb; - - inode = file_inode(file); - sb = inode->i_sb; - si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW); - - h_file = au_read_pre(file, /*keep_fi*/1); - err = PTR_ERR(h_file); - if (IS_ERR(h_file)) - goto out; - - if (au_test_loopback_kthread()) { - au_warn_loopback(h_file->f_path.dentry->d_sb); - if (file->f_mapping != h_file->f_mapping) { - file->f_mapping = h_file->f_mapping; - smp_mb(); /* unnecessary? */ - } - } - fi_read_unlock(file); - - err = vfsub_splice_to(h_file, ppos, pipe, len, flags); - /* todo: necessasry? */ - /* file->f_ra = h_file->f_ra; */ - au_read_post(inode, h_file); - -out: - si_read_unlock(sb); - return err; -} - -static ssize_t -aufs_splice_write(struct pipe_inode_info *pipe, struct file *file, loff_t *ppos, - size_t len, unsigned int flags) -{ - ssize_t err; - struct au_write_pre wpre; - struct inode *inode; - struct file *h_file; - - inode = file_inode(file); - au_mtx_and_read_lock(inode); - - h_file = au_write_pre(file, /*do_ready*/1, &wpre); - err = PTR_ERR(h_file); - if (IS_ERR(h_file)) - goto out; - - err = vfsub_splice_from(pipe, h_file, ppos, len, flags); - au_write_post(inode, h_file, &wpre, err); - -out: - si_read_unlock(inode->i_sb); - mutex_unlock(&inode->i_mutex); - return err; -} - -static long aufs_fallocate(struct file *file, int mode, loff_t offset, - loff_t len) -{ - long err; - struct au_write_pre wpre; - struct inode *inode; - struct file *h_file; - - inode = file_inode(file); - au_mtx_and_read_lock(inode); - - h_file = au_write_pre(file, /*do_ready*/1, &wpre); - err = PTR_ERR(h_file); - if (IS_ERR(h_file)) - goto out; - - lockdep_off(); - err = vfs_fallocate(h_file, mode, offset, len); - lockdep_on(); - au_write_post(inode, h_file, &wpre, /*written*/1); - -out: - si_read_unlock(inode->i_sb); - mutex_unlock(&inode->i_mutex); - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* - * The locking order around current->mmap_sem. - * - in most and regular cases - * file I/O syscall -- aufs_read() or something - * -- si_rwsem for read -- mmap_sem - * (Note that [fdi]i_rwsem are released before mmap_sem). - * - in mmap case - * mmap(2) -- mmap_sem -- aufs_mmap() -- si_rwsem for read -- [fdi]i_rwsem - * This AB-BA order is definitly bad, but is not a problem since "si_rwsem for - * read" allows muliple processes to acquire it and [fdi]i_rwsem are not held in - * file I/O. Aufs needs to stop lockdep in aufs_mmap() though. - * It means that when aufs acquires si_rwsem for write, the process should never - * acquire mmap_sem. - * - * Actually aufs_iterate() holds [fdi]i_rwsem before mmap_sem, but this is not a - * problem either since any directory is not able to be mmap-ed. - * The similar scenario is applied to aufs_readlink() too. - */ - -#if 0 /* stop calling security_file_mmap() */ -/* cf. linux/include/linux/mman.h: calc_vm_prot_bits() */ -#define AuConv_VM_PROT(f, b) _calc_vm_trans(f, VM_##b, PROT_##b) - -static unsigned long au_arch_prot_conv(unsigned long flags) -{ - /* currently ppc64 only */ -#ifdef CONFIG_PPC64 - /* cf. linux/arch/powerpc/include/asm/mman.h */ - AuDebugOn(arch_calc_vm_prot_bits(-1) != VM_SAO); - return AuConv_VM_PROT(flags, SAO); -#else - AuDebugOn(arch_calc_vm_prot_bits(-1)); - return 0; -#endif -} - -static unsigned long au_prot_conv(unsigned long flags) -{ - return AuConv_VM_PROT(flags, READ) - | AuConv_VM_PROT(flags, WRITE) - | AuConv_VM_PROT(flags, EXEC) - | au_arch_prot_conv(flags); -} - -/* cf. linux/include/linux/mman.h: calc_vm_flag_bits() */ -#define AuConv_VM_MAP(f, b) _calc_vm_trans(f, VM_##b, MAP_##b) - -static unsigned long au_flag_conv(unsigned long flags) -{ - return AuConv_VM_MAP(flags, GROWSDOWN) - | AuConv_VM_MAP(flags, DENYWRITE) - | AuConv_VM_MAP(flags, LOCKED); -} -#endif - -static int aufs_mmap(struct file *file, struct vm_area_struct *vma) -{ - int err; - const unsigned char wlock - = (file->f_mode & FMODE_WRITE) && (vma->vm_flags & VM_SHARED); - struct super_block *sb; - struct file *h_file; - struct inode *inode; - - AuDbgVmRegion(file, vma); - - inode = file_inode(file); - sb = inode->i_sb; - lockdep_off(); - si_read_lock(sb, AuLock_NOPLMW); - - h_file = au_write_pre(file, wlock, /*wpre*/NULL); - lockdep_on(); - err = PTR_ERR(h_file); - if (IS_ERR(h_file)) - goto out; - - err = 0; - au_set_mmapped(file); - au_vm_file_reset(vma, h_file); - /* - * we cannot call security_mmap_file() here since it may acquire - * mmap_sem or i_mutex. - * - * err = security_mmap_file(h_file, au_prot_conv(vma->vm_flags), - * au_flag_conv(vma->vm_flags)); - */ - if (!err) - err = h_file->f_op->mmap(h_file, vma); - if (!err) { - au_vm_prfile_set(vma, file); - fsstack_copy_attr_atime(inode, file_inode(h_file)); - goto out_fput; /* success */ - } - au_unset_mmapped(file); - au_vm_file_reset(vma, file); - -out_fput: - lockdep_off(); - ii_write_unlock(inode); - lockdep_on(); - fput(h_file); -out: - lockdep_off(); - si_read_unlock(sb); - lockdep_on(); - AuTraceErr(err); - return err; -} - -/* ---------------------------------------------------------------------- */ - -static int aufs_fsync_nondir(struct file *file, loff_t start, loff_t end, - int datasync) -{ - int err; - struct au_write_pre wpre; - struct inode *inode; - struct file *h_file; - - err = 0; /* -EBADF; */ /* posix? */ - if (unlikely(!(file->f_mode & FMODE_WRITE))) - goto out; - - inode = file_inode(file); - au_mtx_and_read_lock(inode); - - h_file = au_write_pre(file, /*do_ready*/1, &wpre); - err = PTR_ERR(h_file); - if (IS_ERR(h_file)) - goto out_unlock; - - err = vfsub_fsync(h_file, &h_file->f_path, datasync); - au_write_post(inode, h_file, &wpre, /*written*/0); - -out_unlock: - si_read_unlock(inode->i_sb); - mutex_unlock(&inode->i_mutex); -out: - return err; -} - -/* no one supports this operation, currently */ -#if 0 -static int aufs_aio_fsync_nondir(struct kiocb *kio, int datasync) -{ - int err; - struct au_write_pre wpre; - struct inode *inode; - struct file *file, *h_file; - - err = 0; /* -EBADF; */ /* posix? */ - if (unlikely(!(file->f_mode & FMODE_WRITE))) - goto out; - - file = kio->ki_filp; - inode = file_inode(file); - au_mtx_and_read_lock(inode); - - h_file = au_write_pre(file, /*do_ready*/1, &wpre); - err = PTR_ERR(h_file); - if (IS_ERR(h_file)) - goto out_unlock; - - err = -ENOSYS; - h_file = au_hf_top(file); - if (h_file->f_op->aio_fsync) { - struct mutex *h_mtx; - - h_mtx = &file_inode(h_file)->i_mutex; - if (!is_sync_kiocb(kio)) { - get_file(h_file); - fput(file); - } - kio->ki_filp = h_file; - err = h_file->f_op->aio_fsync(kio, datasync); - mutex_lock_nested(h_mtx, AuLsc_I_CHILD); - if (!err) - vfsub_update_h_iattr(&h_file->f_path, /*did*/NULL); - /*ignore*/ - mutex_unlock(h_mtx); - } - au_write_post(inode, h_file, &wpre, /*written*/0); - -out_unlock: - si_read_unlock(inode->sb); - mutex_unlock(&inode->i_mutex); -out: - return err; -} -#endif - -static int aufs_fasync(int fd, struct file *file, int flag) -{ - int err; - struct file *h_file; - struct super_block *sb; - - sb = file->f_path.dentry->d_sb; - si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW); - - h_file = au_read_pre(file, /*keep_fi*/0); - err = PTR_ERR(h_file); - if (IS_ERR(h_file)) - goto out; - - if (h_file->f_op->fasync) - err = h_file->f_op->fasync(fd, h_file, flag); - fput(h_file); /* instead of au_read_post() */ - -out: - si_read_unlock(sb); - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* no one supports this operation, currently */ -#if 0 -static ssize_t aufs_sendpage(struct file *file, struct page *page, int offset, - size_t len, loff_t *pos, int more) -{ -} -#endif - -/* ---------------------------------------------------------------------- */ - -const struct file_operations aufs_file_fop = { - .owner = THIS_MODULE, - - .llseek = default_llseek, - - .read = aufs_read, - .write = aufs_write, - .read_iter = aufs_read_iter, - .write_iter = aufs_write_iter, - -#ifdef CONFIG_AUFS_POLL - .poll = aufs_poll, -#endif - .unlocked_ioctl = aufs_ioctl_nondir, -#ifdef CONFIG_COMPAT - .compat_ioctl = aufs_compat_ioctl_nondir, -#endif - .mmap = aufs_mmap, - .open = aufs_open_nondir, - .flush = aufs_flush_nondir, - .release = aufs_release_nondir, - .fsync = aufs_fsync_nondir, - /* .aio_fsync = aufs_aio_fsync_nondir, */ - .fasync = aufs_fasync, - /* .sendpage = aufs_sendpage, */ - .splice_write = aufs_splice_write, - .splice_read = aufs_splice_read, -#if 0 - .aio_splice_write = aufs_aio_splice_write, - .aio_splice_read = aufs_aio_splice_read, -#endif - .fallocate = aufs_fallocate -}; diff --git a/fs/aufs/fhsm.c b/fs/aufs/fhsm.c deleted file mode 100644 index 7ca4d1759..000000000 --- a/fs/aufs/fhsm.c +++ /dev/null @@ -1,426 +0,0 @@ -/* - * Copyright (C) 2011-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ - -/* - * File-based Hierarchy Storage Management - */ - -#include <linux/anon_inodes.h> -#include <linux/poll.h> -#include <linux/seq_file.h> -#include <linux/statfs.h> -#include "aufs.h" - -static aufs_bindex_t au_fhsm_bottom(struct super_block *sb) -{ - struct au_sbinfo *sbinfo; - struct au_fhsm *fhsm; - - SiMustAnyLock(sb); - - sbinfo = au_sbi(sb); - fhsm = &sbinfo->si_fhsm; - AuDebugOn(!fhsm); - return fhsm->fhsm_bottom; -} - -void au_fhsm_set_bottom(struct super_block *sb, aufs_bindex_t bindex) -{ - struct au_sbinfo *sbinfo; - struct au_fhsm *fhsm; - - SiMustWriteLock(sb); - - sbinfo = au_sbi(sb); - fhsm = &sbinfo->si_fhsm; - AuDebugOn(!fhsm); - fhsm->fhsm_bottom = bindex; -} - -/* ---------------------------------------------------------------------- */ - -static int au_fhsm_test_jiffy(struct au_sbinfo *sbinfo, struct au_branch *br) -{ - struct au_br_fhsm *bf; - - bf = br->br_fhsm; - MtxMustLock(&bf->bf_lock); - - return !bf->bf_readable - || time_after(jiffies, - bf->bf_jiffy + sbinfo->si_fhsm.fhsm_expire); -} - -/* ---------------------------------------------------------------------- */ - -static void au_fhsm_notify(struct super_block *sb, int val) -{ - struct au_sbinfo *sbinfo; - struct au_fhsm *fhsm; - - SiMustAnyLock(sb); - - sbinfo = au_sbi(sb); - fhsm = &sbinfo->si_fhsm; - if (au_fhsm_pid(fhsm) - && atomic_read(&fhsm->fhsm_readable) != -1) { - atomic_set(&fhsm->fhsm_readable, val); - if (val) - wake_up(&fhsm->fhsm_wqh); - } -} - -static int au_fhsm_stfs(struct super_block *sb, aufs_bindex_t bindex, - struct aufs_stfs *rstfs, int do_lock, int do_notify) -{ - int err; - struct au_branch *br; - struct au_br_fhsm *bf; - - br = au_sbr(sb, bindex); - AuDebugOn(au_br_rdonly(br)); - bf = br->br_fhsm; - AuDebugOn(!bf); - - if (do_lock) - mutex_lock(&bf->bf_lock); - else - MtxMustLock(&bf->bf_lock); - - /* sb->s_root for NFS is unreliable */ - err = au_br_stfs(br, &bf->bf_stfs); - if (unlikely(err)) { - AuErr1("FHSM failed (%d), b%d, ignored.\n", bindex, err); - goto out; - } - - bf->bf_jiffy = jiffies; - bf->bf_readable = 1; - if (do_notify) - au_fhsm_notify(sb, /*val*/1); - if (rstfs) - *rstfs = bf->bf_stfs; - -out: - if (do_lock) - mutex_unlock(&bf->bf_lock); - au_fhsm_notify(sb, /*val*/1); - - return err; -} - -void au_fhsm_wrote(struct super_block *sb, aufs_bindex_t bindex, int force) -{ - int err; - struct au_sbinfo *sbinfo; - struct au_fhsm *fhsm; - struct au_branch *br; - struct au_br_fhsm *bf; - - AuDbg("b%d, force %d\n", bindex, force); - SiMustAnyLock(sb); - - sbinfo = au_sbi(sb); - fhsm = &sbinfo->si_fhsm; - if (!au_ftest_si(sbinfo, FHSM) - || fhsm->fhsm_bottom == bindex) - return; - - br = au_sbr(sb, bindex); - bf = br->br_fhsm; - AuDebugOn(!bf); - mutex_lock(&bf->bf_lock); - if (force - || au_fhsm_pid(fhsm) - || au_fhsm_test_jiffy(sbinfo, br)) - err = au_fhsm_stfs(sb, bindex, /*rstfs*/NULL, /*do_lock*/0, - /*do_notify*/1); - mutex_unlock(&bf->bf_lock); -} - -void au_fhsm_wrote_all(struct super_block *sb, int force) -{ - aufs_bindex_t bindex, bend; - struct au_branch *br; - - /* exclude the bottom */ - bend = au_fhsm_bottom(sb); - for (bindex = 0; bindex < bend; bindex++) { - br = au_sbr(sb, bindex); - if (au_br_fhsm(br->br_perm)) - au_fhsm_wrote(sb, bindex, force); - } -} - -/* ---------------------------------------------------------------------- */ - -static unsigned int au_fhsm_poll(struct file *file, - struct poll_table_struct *wait) -{ - unsigned int mask; - struct au_sbinfo *sbinfo; - struct au_fhsm *fhsm; - - mask = 0; - sbinfo = file->private_data; - fhsm = &sbinfo->si_fhsm; - poll_wait(file, &fhsm->fhsm_wqh, wait); - if (atomic_read(&fhsm->fhsm_readable)) - mask = POLLIN /* | POLLRDNORM */; - - AuTraceErr((int)mask); - return mask; -} - -static int au_fhsm_do_read_one(struct aufs_stbr __user *stbr, - struct aufs_stfs *stfs, __s16 brid) -{ - int err; - - err = copy_to_user(&stbr->stfs, stfs, sizeof(*stfs)); - if (!err) - err = __put_user(brid, &stbr->brid); - if (unlikely(err)) - err = -EFAULT; - - return err; -} - -static ssize_t au_fhsm_do_read(struct super_block *sb, - struct aufs_stbr __user *stbr, size_t count) -{ - ssize_t err; - int nstbr; - aufs_bindex_t bindex, bend; - struct au_branch *br; - struct au_br_fhsm *bf; - - /* except the bottom branch */ - err = 0; - nstbr = 0; - bend = au_fhsm_bottom(sb); - for (bindex = 0; !err && bindex < bend; bindex++) { - br = au_sbr(sb, bindex); - if (!au_br_fhsm(br->br_perm)) - continue; - - bf = br->br_fhsm; - mutex_lock(&bf->bf_lock); - if (bf->bf_readable) { - err = -EFAULT; - if (count >= sizeof(*stbr)) - err = au_fhsm_do_read_one(stbr++, &bf->bf_stfs, - br->br_id); - if (!err) { - bf->bf_readable = 0; - count -= sizeof(*stbr); - nstbr++; - } - } - mutex_unlock(&bf->bf_lock); - } - if (!err) - err = sizeof(*stbr) * nstbr; - - return err; -} - -static ssize_t au_fhsm_read(struct file *file, char __user *buf, size_t count, - loff_t *pos) -{ - ssize_t err; - int readable; - aufs_bindex_t nfhsm, bindex, bend; - struct au_sbinfo *sbinfo; - struct au_fhsm *fhsm; - struct au_branch *br; - struct super_block *sb; - - err = 0; - sbinfo = file->private_data; - fhsm = &sbinfo->si_fhsm; -need_data: - spin_lock_irq(&fhsm->fhsm_wqh.lock); - if (!atomic_read(&fhsm->fhsm_readable)) { - if (vfsub_file_flags(file) & O_NONBLOCK) - err = -EAGAIN; - else - err = wait_event_interruptible_locked_irq - (fhsm->fhsm_wqh, - atomic_read(&fhsm->fhsm_readable)); - } - spin_unlock_irq(&fhsm->fhsm_wqh.lock); - if (unlikely(err)) - goto out; - - /* sb may already be dead */ - au_rw_read_lock(&sbinfo->si_rwsem); - readable = atomic_read(&fhsm->fhsm_readable); - if (readable > 0) { - sb = sbinfo->si_sb; - AuDebugOn(!sb); - /* exclude the bottom branch */ - nfhsm = 0; - bend = au_fhsm_bottom(sb); - for (bindex = 0; bindex < bend; bindex++) { - br = au_sbr(sb, bindex); - if (au_br_fhsm(br->br_perm)) - nfhsm++; - } - err = -EMSGSIZE; - if (nfhsm * sizeof(struct aufs_stbr) <= count) { - atomic_set(&fhsm->fhsm_readable, 0); - err = au_fhsm_do_read(sbinfo->si_sb, (void __user *)buf, - count); - } - } - au_rw_read_unlock(&sbinfo->si_rwsem); - if (!readable) - goto need_data; - -out: - return err; -} - -static int au_fhsm_release(struct inode *inode, struct file *file) -{ - struct au_sbinfo *sbinfo; - struct au_fhsm *fhsm; - - /* sb may already be dead */ - sbinfo = file->private_data; - fhsm = &sbinfo->si_fhsm; - spin_lock(&fhsm->fhsm_spin); - fhsm->fhsm_pid = 0; - spin_unlock(&fhsm->fhsm_spin); - kobject_put(&sbinfo->si_kobj); - - return 0; -} - -static const struct file_operations au_fhsm_fops = { - .owner = THIS_MODULE, - .llseek = noop_llseek, - .read = au_fhsm_read, - .poll = au_fhsm_poll, - .release = au_fhsm_release -}; - -int au_fhsm_fd(struct super_block *sb, int oflags) -{ - int err, fd; - struct au_sbinfo *sbinfo; - struct au_fhsm *fhsm; - - err = -EPERM; - if (unlikely(!capable(CAP_SYS_ADMIN))) - goto out; - - err = -EINVAL; - if (unlikely(oflags & ~(O_CLOEXEC | O_NONBLOCK))) - goto out; - - err = 0; - sbinfo = au_sbi(sb); - fhsm = &sbinfo->si_fhsm; - spin_lock(&fhsm->fhsm_spin); - if (!fhsm->fhsm_pid) - fhsm->fhsm_pid = current->pid; - else - err = -EBUSY; - spin_unlock(&fhsm->fhsm_spin); - if (unlikely(err)) - goto out; - - oflags |= O_RDONLY; - /* oflags |= FMODE_NONOTIFY; */ - fd = anon_inode_getfd("[aufs_fhsm]", &au_fhsm_fops, sbinfo, oflags); - err = fd; - if (unlikely(fd < 0)) - goto out_pid; - - /* succeed reglardless 'fhsm' status */ - kobject_get(&sbinfo->si_kobj); - si_noflush_read_lock(sb); - if (au_ftest_si(sbinfo, FHSM)) - au_fhsm_wrote_all(sb, /*force*/0); - si_read_unlock(sb); - goto out; /* success */ - -out_pid: - spin_lock(&fhsm->fhsm_spin); - fhsm->fhsm_pid = 0; - spin_unlock(&fhsm->fhsm_spin); -out: - AuTraceErr(err); - return err; -} - -/* ---------------------------------------------------------------------- */ - -int au_fhsm_br_alloc(struct au_branch *br) -{ - int err; - - err = 0; - br->br_fhsm = kmalloc(sizeof(*br->br_fhsm), GFP_NOFS); - if (br->br_fhsm) - au_br_fhsm_init(br->br_fhsm); - else - err = -ENOMEM; - - return err; -} - -/* ---------------------------------------------------------------------- */ - -void au_fhsm_fin(struct super_block *sb) -{ - au_fhsm_notify(sb, /*val*/-1); -} - -void au_fhsm_init(struct au_sbinfo *sbinfo) -{ - struct au_fhsm *fhsm; - - fhsm = &sbinfo->si_fhsm; - spin_lock_init(&fhsm->fhsm_spin); - init_waitqueue_head(&fhsm->fhsm_wqh); - atomic_set(&fhsm->fhsm_readable, 0); - fhsm->fhsm_expire - = msecs_to_jiffies(AUFS_FHSM_CACHE_DEF_SEC * MSEC_PER_SEC); - fhsm->fhsm_bottom = -1; -} - -void au_fhsm_set(struct au_sbinfo *sbinfo, unsigned int sec) -{ - sbinfo->si_fhsm.fhsm_expire - = msecs_to_jiffies(sec * MSEC_PER_SEC); -} - -void au_fhsm_show(struct seq_file *seq, struct au_sbinfo *sbinfo) -{ - unsigned int u; - - if (!au_ftest_si(sbinfo, FHSM)) - return; - - u = jiffies_to_msecs(sbinfo->si_fhsm.fhsm_expire) / MSEC_PER_SEC; - if (u != AUFS_FHSM_CACHE_DEF_SEC) - seq_printf(seq, ",fhsm_sec=%u", u); -} diff --git a/fs/aufs/file.c b/fs/aufs/file.c deleted file mode 100644 index d3a566e4f..000000000 --- a/fs/aufs/file.c +++ /dev/null @@ -1,841 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * handling file/dir, and address_space operation - */ - -#ifdef CONFIG_AUFS_DEBUG -#include <linux/migrate.h> -#endif -#include <linux/pagemap.h> -#include "aufs.h" - -/* drop flags for writing */ -unsigned int au_file_roflags(unsigned int flags) -{ - flags &= ~(O_WRONLY | O_RDWR | O_APPEND | O_CREAT | O_TRUNC); - flags |= O_RDONLY | O_NOATIME; - return flags; -} - -/* common functions to regular file and dir */ -struct file *au_h_open(struct dentry *dentry, aufs_bindex_t bindex, int flags, - struct file *file, int force_wr) -{ - struct file *h_file; - struct dentry *h_dentry; - struct inode *h_inode; - struct super_block *sb; - struct au_branch *br; - struct path h_path; - int err; - - /* a race condition can happen between open and unlink/rmdir */ - h_file = ERR_PTR(-ENOENT); - h_dentry = au_h_dptr(dentry, bindex); - if (au_test_nfsd() && (!h_dentry || d_is_negative(h_dentry))) - goto out; - h_inode = d_inode(h_dentry); - spin_lock(&h_dentry->d_lock); - err = (!d_unhashed(dentry) && d_unlinked(h_dentry)) - /* || !d_inode(dentry)->i_nlink */ - ; - spin_unlock(&h_dentry->d_lock); - if (unlikely(err)) - goto out; - - sb = dentry->d_sb; - br = au_sbr(sb, bindex); - err = au_br_test_oflag(flags, br); - h_file = ERR_PTR(err); - if (unlikely(err)) - goto out; - - /* drop flags for writing */ - if (au_test_ro(sb, bindex, d_inode(dentry))) { - if (force_wr && !(flags & O_WRONLY)) - force_wr = 0; - flags = au_file_roflags(flags); - if (force_wr) { - h_file = ERR_PTR(-EROFS); - flags = au_file_roflags(flags); - if (unlikely(vfsub_native_ro(h_inode) - || IS_APPEND(h_inode))) - goto out; - flags &= ~O_ACCMODE; - flags |= O_WRONLY; - } - } - flags &= ~O_CREAT; - atomic_inc(&br->br_count); - h_path.dentry = h_dentry; - h_path.mnt = au_br_mnt(br); - h_file = vfsub_dentry_open(&h_path, flags); - if (IS_ERR(h_file)) - goto out_br; - - if (flags & __FMODE_EXEC) { - err = deny_write_access(h_file); - if (unlikely(err)) { - fput(h_file); - h_file = ERR_PTR(err); - goto out_br; - } - } - fsnotify_open(h_file); - goto out; /* success */ - -out_br: - atomic_dec(&br->br_count); -out: - return h_file; -} - -static int au_cmoo(struct dentry *dentry) -{ - int err, cmoo; - unsigned int udba; - struct path h_path; - struct au_pin pin; - struct au_cp_generic cpg = { - .dentry = dentry, - .bdst = -1, - .bsrc = -1, - .len = -1, - .pin = &pin, - .flags = AuCpup_DTIME | AuCpup_HOPEN - }; - struct inode *delegated; - struct super_block *sb; - struct au_sbinfo *sbinfo; - struct au_fhsm *fhsm; - pid_t pid; - struct au_branch *br; - struct dentry *parent; - struct au_hinode *hdir; - - DiMustWriteLock(dentry); - IiMustWriteLock(d_inode(dentry)); - - err = 0; - if (IS_ROOT(dentry)) - goto out; - cpg.bsrc = au_dbstart(dentry); - if (!cpg.bsrc) - goto out; - - sb = dentry->d_sb; - sbinfo = au_sbi(sb); - fhsm = &sbinfo->si_fhsm; - pid = au_fhsm_pid(fhsm); - if (pid - && (current->pid == pid - || current->real_parent->pid == pid)) - goto out; - - br = au_sbr(sb, cpg.bsrc); - cmoo = au_br_cmoo(br->br_perm); - if (!cmoo) - goto out; - if (!d_is_reg(dentry)) - cmoo &= AuBrAttr_COO_ALL; - if (!cmoo) - goto out; - - parent = dget_parent(dentry); - di_write_lock_parent(parent); - err = au_wbr_do_copyup_bu(dentry, cpg.bsrc - 1); - cpg.bdst = err; - if (unlikely(err < 0)) { - err = 0; /* there is no upper writable branch */ - goto out_dgrade; - } - AuDbg("bsrc %d, bdst %d\n", cpg.bsrc, cpg.bdst); - - /* do not respect the coo attrib for the target branch */ - err = au_cpup_dirs(dentry, cpg.bdst); - if (unlikely(err)) - goto out_dgrade; - - di_downgrade_lock(parent, AuLock_IR); - udba = au_opt_udba(sb); - err = au_pin(&pin, dentry, cpg.bdst, udba, - AuPin_DI_LOCKED | AuPin_MNT_WRITE); - if (unlikely(err)) - goto out_parent; - - err = au_sio_cpup_simple(&cpg); - au_unpin(&pin); - if (unlikely(err)) - goto out_parent; - if (!(cmoo & AuBrWAttr_MOO)) - goto out_parent; /* success */ - - err = au_pin(&pin, dentry, cpg.bsrc, udba, - AuPin_DI_LOCKED | AuPin_MNT_WRITE); - if (unlikely(err)) - goto out_parent; - - h_path.mnt = au_br_mnt(br); - h_path.dentry = au_h_dptr(dentry, cpg.bsrc); - hdir = au_hi(d_inode(parent), cpg.bsrc); - delegated = NULL; - err = vfsub_unlink(hdir->hi_inode, &h_path, &delegated, /*force*/1); - au_unpin(&pin); - /* todo: keep h_dentry or not? */ - if (unlikely(err == -EWOULDBLOCK)) { - pr_warn("cannot retry for NFSv4 delegation" - " for an internal unlink\n"); - iput(delegated); - } - if (unlikely(err)) { - pr_err("unlink %pd after coo failed (%d), ignored\n", - dentry, err); - err = 0; - } - goto out_parent; /* success */ - -out_dgrade: - di_downgrade_lock(parent, AuLock_IR); -out_parent: - di_read_unlock(parent, AuLock_IR); - dput(parent); -out: - AuTraceErr(err); - return err; -} - -int au_do_open(struct file *file, struct au_do_open_args *args) -{ - int err, no_lock = args->no_lock; - struct dentry *dentry; - struct au_finfo *finfo; - - if (!no_lock) - err = au_finfo_init(file, args->fidir); - else { - lockdep_off(); - err = au_finfo_init(file, args->fidir); - lockdep_on(); - } - if (unlikely(err)) - goto out; - - dentry = file->f_path.dentry; - AuDebugOn(IS_ERR_OR_NULL(dentry)); - if (!no_lock) { - di_write_lock_child(dentry); - err = au_cmoo(dentry); - di_downgrade_lock(dentry, AuLock_IR); - if (!err) - err = args->open(file, vfsub_file_flags(file), NULL); - di_read_unlock(dentry, AuLock_IR); - } else { - err = au_cmoo(dentry); - if (!err) - err = args->open(file, vfsub_file_flags(file), - args->h_file); - if (!err && au_fbstart(file) != au_dbstart(dentry)) - /* - * cmoo happens after h_file was opened. - * need to refresh file later. - */ - atomic_dec(&au_fi(file)->fi_generation); - } - - finfo = au_fi(file); - if (!err) { - finfo->fi_file = file; - au_sphl_add(&finfo->fi_hlist, - &au_sbi(file->f_path.dentry->d_sb)->si_files); - } - if (!no_lock) - fi_write_unlock(file); - else { - lockdep_off(); - fi_write_unlock(file); - lockdep_on(); - } - if (unlikely(err)) { - finfo->fi_hdir = NULL; - au_finfo_fin(file); - } - -out: - return err; -} - -int au_reopen_nondir(struct file *file) -{ - int err; - aufs_bindex_t bstart; - struct dentry *dentry; - struct file *h_file, *h_file_tmp; - - dentry = file->f_path.dentry; - bstart = au_dbstart(dentry); - h_file_tmp = NULL; - if (au_fbstart(file) == bstart) { - h_file = au_hf_top(file); - if (file->f_mode == h_file->f_mode) - return 0; /* success */ - h_file_tmp = h_file; - get_file(h_file_tmp); - au_set_h_fptr(file, bstart, NULL); - } - AuDebugOn(au_fi(file)->fi_hdir); - /* - * it can happen - * file exists on both of rw and ro - * open --> dbstart and fbstart are both 0 - * prepend a branch as rw, "rw" become ro - * remove rw/file - * delete the top branch, "rw" becomes rw again - * --> dbstart is 1, fbstart is still 0 - * write --> fbstart is 0 but dbstart is 1 - */ - /* AuDebugOn(au_fbstart(file) < bstart); */ - - h_file = au_h_open(dentry, bstart, vfsub_file_flags(file) & ~O_TRUNC, - file, /*force_wr*/0); - err = PTR_ERR(h_file); - if (IS_ERR(h_file)) { - if (h_file_tmp) { - atomic_inc(&au_sbr(dentry->d_sb, bstart)->br_count); - au_set_h_fptr(file, bstart, h_file_tmp); - h_file_tmp = NULL; - } - goto out; /* todo: close all? */ - } - - err = 0; - au_set_fbstart(file, bstart); - au_set_h_fptr(file, bstart, h_file); - au_update_figen(file); - /* todo: necessary? */ - /* file->f_ra = h_file->f_ra; */ - -out: - if (h_file_tmp) - fput(h_file_tmp); - return err; -} - -/* ---------------------------------------------------------------------- */ - -static int au_reopen_wh(struct file *file, aufs_bindex_t btgt, - struct dentry *hi_wh) -{ - int err; - aufs_bindex_t bstart; - struct au_dinfo *dinfo; - struct dentry *h_dentry; - struct au_hdentry *hdp; - - dinfo = au_di(file->f_path.dentry); - AuRwMustWriteLock(&dinfo->di_rwsem); - - bstart = dinfo->di_bstart; - dinfo->di_bstart = btgt; - hdp = dinfo->di_hdentry; - h_dentry = hdp[0 + btgt].hd_dentry; - hdp[0 + btgt].hd_dentry = hi_wh; - err = au_reopen_nondir(file); - hdp[0 + btgt].hd_dentry = h_dentry; - dinfo->di_bstart = bstart; - - return err; -} - -static int au_ready_to_write_wh(struct file *file, loff_t len, - aufs_bindex_t bcpup, struct au_pin *pin) -{ - int err; - struct inode *inode, *h_inode; - struct dentry *h_dentry, *hi_wh; - struct au_cp_generic cpg = { - .dentry = file->f_path.dentry, - .bdst = bcpup, - .bsrc = -1, - .len = len, - .pin = pin - }; - - au_update_dbstart(cpg.dentry); - inode = d_inode(cpg.dentry); - h_inode = NULL; - if (au_dbstart(cpg.dentry) <= bcpup - && au_dbend(cpg.dentry) >= bcpup) { - h_dentry = au_h_dptr(cpg.dentry, bcpup); - if (h_dentry && d_is_positive(h_dentry)) - h_inode = d_inode(h_dentry); - } - hi_wh = au_hi_wh(inode, bcpup); - if (!hi_wh && !h_inode) - err = au_sio_cpup_wh(&cpg, file); - else - /* already copied-up after unlink */ - err = au_reopen_wh(file, bcpup, hi_wh); - - if (!err - && (inode->i_nlink > 1 - || (inode->i_state & I_LINKABLE)) - && au_opt_test(au_mntflags(cpg.dentry->d_sb), PLINK)) - au_plink_append(inode, bcpup, au_h_dptr(cpg.dentry, bcpup)); - - return err; -} - -/* - * prepare the @file for writing. - */ -int au_ready_to_write(struct file *file, loff_t len, struct au_pin *pin) -{ - int err; - aufs_bindex_t dbstart; - struct dentry *parent; - struct inode *inode; - struct super_block *sb; - struct file *h_file; - struct au_cp_generic cpg = { - .dentry = file->f_path.dentry, - .bdst = -1, - .bsrc = -1, - .len = len, - .pin = pin, - .flags = AuCpup_DTIME - }; - - sb = cpg.dentry->d_sb; - inode = d_inode(cpg.dentry); - cpg.bsrc = au_fbstart(file); - err = au_test_ro(sb, cpg.bsrc, inode); - if (!err && (au_hf_top(file)->f_mode & FMODE_WRITE)) { - err = au_pin(pin, cpg.dentry, cpg.bsrc, AuOpt_UDBA_NONE, - /*flags*/0); - goto out; - } - - /* need to cpup or reopen */ - parent = dget_parent(cpg.dentry); - di_write_lock_parent(parent); - err = AuWbrCopyup(au_sbi(sb), cpg.dentry); - cpg.bdst = err; - if (unlikely(err < 0)) - goto out_dgrade; - err = 0; - - if (!d_unhashed(cpg.dentry) && !au_h_dptr(parent, cpg.bdst)) { - err = au_cpup_dirs(cpg.dentry, cpg.bdst); - if (unlikely(err)) - goto out_dgrade; - } - - err = au_pin(pin, cpg.dentry, cpg.bdst, AuOpt_UDBA_NONE, - AuPin_DI_LOCKED | AuPin_MNT_WRITE); - if (unlikely(err)) - goto out_dgrade; - - dbstart = au_dbstart(cpg.dentry); - if (dbstart <= cpg.bdst) - cpg.bsrc = cpg.bdst; - - if (dbstart <= cpg.bdst /* just reopen */ - || !d_unhashed(cpg.dentry) /* copyup and reopen */ - ) { - h_file = au_h_open_pre(cpg.dentry, cpg.bsrc, /*force_wr*/0); - if (IS_ERR(h_file)) - err = PTR_ERR(h_file); - else { - di_downgrade_lock(parent, AuLock_IR); - if (dbstart > cpg.bdst) - err = au_sio_cpup_simple(&cpg); - if (!err) - err = au_reopen_nondir(file); - au_h_open_post(cpg.dentry, cpg.bsrc, h_file); - } - } else { /* copyup as wh and reopen */ - /* - * since writable hfsplus branch is not supported, - * h_open_pre/post() are unnecessary. - */ - err = au_ready_to_write_wh(file, len, cpg.bdst, pin); - di_downgrade_lock(parent, AuLock_IR); - } - - if (!err) { - au_pin_set_parent_lflag(pin, /*lflag*/0); - goto out_dput; /* success */ - } - au_unpin(pin); - goto out_unlock; - -out_dgrade: - di_downgrade_lock(parent, AuLock_IR); -out_unlock: - di_read_unlock(parent, AuLock_IR); -out_dput: - dput(parent); -out: - return err; -} - -/* ---------------------------------------------------------------------- */ - -int au_do_flush(struct file *file, fl_owner_t id, - int (*flush)(struct file *file, fl_owner_t id)) -{ - int err; - struct super_block *sb; - struct inode *inode; - - inode = file_inode(file); - sb = inode->i_sb; - si_noflush_read_lock(sb); - fi_read_lock(file); - ii_read_lock_child(inode); - - err = flush(file, id); - au_cpup_attr_timesizes(inode); - - ii_read_unlock(inode); - fi_read_unlock(file); - si_read_unlock(sb); - return err; -} - -/* ---------------------------------------------------------------------- */ - -static int au_file_refresh_by_inode(struct file *file, int *need_reopen) -{ - int err; - struct au_pin pin; - struct au_finfo *finfo; - struct dentry *parent, *hi_wh; - struct inode *inode; - struct super_block *sb; - struct au_cp_generic cpg = { - .dentry = file->f_path.dentry, - .bdst = -1, - .bsrc = -1, - .len = -1, - .pin = &pin, - .flags = AuCpup_DTIME - }; - - FiMustWriteLock(file); - - err = 0; - finfo = au_fi(file); - sb = cpg.dentry->d_sb; - inode = d_inode(cpg.dentry); - cpg.bdst = au_ibstart(inode); - if (cpg.bdst == finfo->fi_btop || IS_ROOT(cpg.dentry)) - goto out; - - parent = dget_parent(cpg.dentry); - if (au_test_ro(sb, cpg.bdst, inode)) { - di_read_lock_parent(parent, !AuLock_IR); - err = AuWbrCopyup(au_sbi(sb), cpg.dentry); - cpg.bdst = err; - di_read_unlock(parent, !AuLock_IR); - if (unlikely(err < 0)) - goto out_parent; - err = 0; - } - - di_read_lock_parent(parent, AuLock_IR); - hi_wh = au_hi_wh(inode, cpg.bdst); - if (!S_ISDIR(inode->i_mode) - && au_opt_test(au_mntflags(sb), PLINK) - && au_plink_test(inode) - && !d_unhashed(cpg.dentry) - && cpg.bdst < au_dbstart(cpg.dentry)) { - err = au_test_and_cpup_dirs(cpg.dentry, cpg.bdst); - if (unlikely(err)) - goto out_unlock; - - /* always superio. */ - err = au_pin(&pin, cpg.dentry, cpg.bdst, AuOpt_UDBA_NONE, - AuPin_DI_LOCKED | AuPin_MNT_WRITE); - if (!err) { - err = au_sio_cpup_simple(&cpg); - au_unpin(&pin); - } - } else if (hi_wh) { - /* already copied-up after unlink */ - err = au_reopen_wh(file, cpg.bdst, hi_wh); - *need_reopen = 0; - } - -out_unlock: - di_read_unlock(parent, AuLock_IR); -out_parent: - dput(parent); -out: - return err; -} - -static void au_do_refresh_dir(struct file *file) -{ - aufs_bindex_t bindex, bend, new_bindex, brid; - struct au_hfile *p, tmp, *q; - struct au_finfo *finfo; - struct super_block *sb; - struct au_fidir *fidir; - - FiMustWriteLock(file); - - sb = file->f_path.dentry->d_sb; - finfo = au_fi(file); - fidir = finfo->fi_hdir; - AuDebugOn(!fidir); - p = fidir->fd_hfile + finfo->fi_btop; - brid = p->hf_br->br_id; - bend = fidir->fd_bbot; - for (bindex = finfo->fi_btop; bindex <= bend; bindex++, p++) { - if (!p->hf_file) - continue; - - new_bindex = au_br_index(sb, p->hf_br->br_id); - if (new_bindex == bindex) - continue; - if (new_bindex < 0) { - au_set_h_fptr(file, bindex, NULL); - continue; - } - - /* swap two lower inode, and loop again */ - q = fidir->fd_hfile + new_bindex; - tmp = *q; - *q = *p; - *p = tmp; - if (tmp.hf_file) { - bindex--; - p--; - } - } - - p = fidir->fd_hfile; - if (!au_test_mmapped(file) && !d_unlinked(file->f_path.dentry)) { - bend = au_sbend(sb); - for (finfo->fi_btop = 0; finfo->fi_btop <= bend; - finfo->fi_btop++, p++) - if (p->hf_file) { - if (file_inode(p->hf_file)) - break; - au_hfput(p, file); - } - } else { - bend = au_br_index(sb, brid); - for (finfo->fi_btop = 0; finfo->fi_btop < bend; - finfo->fi_btop++, p++) - if (p->hf_file) - au_hfput(p, file); - bend = au_sbend(sb); - } - - p = fidir->fd_hfile + bend; - for (fidir->fd_bbot = bend; fidir->fd_bbot >= finfo->fi_btop; - fidir->fd_bbot--, p--) - if (p->hf_file) { - if (file_inode(p->hf_file)) - break; - au_hfput(p, file); - } - AuDebugOn(fidir->fd_bbot < finfo->fi_btop); -} - -/* - * after branch manipulating, refresh the file. - */ -static int refresh_file(struct file *file, int (*reopen)(struct file *file)) -{ - int err, need_reopen; - aufs_bindex_t bend, bindex; - struct dentry *dentry; - struct au_finfo *finfo; - struct au_hfile *hfile; - - dentry = file->f_path.dentry; - finfo = au_fi(file); - if (!finfo->fi_hdir) { - hfile = &finfo->fi_htop; - AuDebugOn(!hfile->hf_file); - bindex = au_br_index(dentry->d_sb, hfile->hf_br->br_id); - AuDebugOn(bindex < 0); - if (bindex != finfo->fi_btop) - au_set_fbstart(file, bindex); - } else { - err = au_fidir_realloc(finfo, au_sbend(dentry->d_sb) + 1); - if (unlikely(err)) - goto out; - au_do_refresh_dir(file); - } - - err = 0; - need_reopen = 1; - if (!au_test_mmapped(file)) - err = au_file_refresh_by_inode(file, &need_reopen); - if (!err && need_reopen && !d_unlinked(dentry)) - err = reopen(file); - if (!err) { - au_update_figen(file); - goto out; /* success */ - } - - /* error, close all lower files */ - if (finfo->fi_hdir) { - bend = au_fbend_dir(file); - for (bindex = au_fbstart(file); bindex <= bend; bindex++) - au_set_h_fptr(file, bindex, NULL); - } - -out: - return err; -} - -/* common function to regular file and dir */ -int au_reval_and_lock_fdi(struct file *file, int (*reopen)(struct file *file), - int wlock) -{ - int err; - unsigned int sigen, figen; - aufs_bindex_t bstart; - unsigned char pseudo_link; - struct dentry *dentry; - struct inode *inode; - - err = 0; - dentry = file->f_path.dentry; - inode = d_inode(dentry); - sigen = au_sigen(dentry->d_sb); - fi_write_lock(file); - figen = au_figen(file); - di_write_lock_child(dentry); - bstart = au_dbstart(dentry); - pseudo_link = (bstart != au_ibstart(inode)); - if (sigen == figen && !pseudo_link && au_fbstart(file) == bstart) { - if (!wlock) { - di_downgrade_lock(dentry, AuLock_IR); - fi_downgrade_lock(file); - } - goto out; /* success */ - } - - AuDbg("sigen %d, figen %d\n", sigen, figen); - if (au_digen_test(dentry, sigen)) { - err = au_reval_dpath(dentry, sigen); - AuDebugOn(!err && au_digen_test(dentry, sigen)); - } - - if (!err) - err = refresh_file(file, reopen); - if (!err) { - if (!wlock) { - di_downgrade_lock(dentry, AuLock_IR); - fi_downgrade_lock(file); - } - } else { - di_write_unlock(dentry); - fi_write_unlock(file); - } - -out: - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* cf. aufs_nopage() */ -/* for madvise(2) */ -static int aufs_readpage(struct file *file __maybe_unused, struct page *page) -{ - unlock_page(page); - return 0; -} - -/* it will never be called, but necessary to support O_DIRECT */ -static ssize_t aufs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) -{ BUG(); return 0; } - -/* they will never be called. */ -#ifdef CONFIG_AUFS_DEBUG -static int aufs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) -{ AuUnsupport(); return 0; } -static int aufs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ AuUnsupport(); return 0; } -static int aufs_writepage(struct page *page, struct writeback_control *wbc) -{ AuUnsupport(); return 0; } - -static int aufs_set_page_dirty(struct page *page) -{ AuUnsupport(); return 0; } -static void aufs_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) -{ AuUnsupport(); } -static int aufs_releasepage(struct page *page, gfp_t gfp) -{ AuUnsupport(); return 0; } -static int aufs_migratepage(struct address_space *mapping, struct page *newpage, - struct page *page, enum migrate_mode mode) -{ AuUnsupport(); return 0; } -static int aufs_launder_page(struct page *page) -{ AuUnsupport(); return 0; } -static int aufs_is_partially_uptodate(struct page *page, - unsigned long from, - unsigned long count) -{ AuUnsupport(); return 0; } -static void aufs_is_dirty_writeback(struct page *page, bool *dirty, - bool *writeback) -{ AuUnsupport(); } -static int aufs_error_remove_page(struct address_space *mapping, - struct page *page) -{ AuUnsupport(); return 0; } -static int aufs_swap_activate(struct swap_info_struct *sis, struct file *file, - sector_t *span) -{ AuUnsupport(); return 0; } -static void aufs_swap_deactivate(struct file *file) -{ AuUnsupport(); } -#endif /* CONFIG_AUFS_DEBUG */ - -const struct address_space_operations aufs_aop = { - .readpage = aufs_readpage, - .direct_IO = aufs_direct_IO, -#ifdef CONFIG_AUFS_DEBUG - .writepage = aufs_writepage, - /* no writepages, because of writepage */ - .set_page_dirty = aufs_set_page_dirty, - /* no readpages, because of readpage */ - .write_begin = aufs_write_begin, - .write_end = aufs_write_end, - /* no bmap, no block device */ - .invalidatepage = aufs_invalidatepage, - .releasepage = aufs_releasepage, - .migratepage = aufs_migratepage, - .launder_page = aufs_launder_page, - .is_partially_uptodate = aufs_is_partially_uptodate, - .is_dirty_writeback = aufs_is_dirty_writeback, - .error_remove_page = aufs_error_remove_page, - .swap_activate = aufs_swap_activate, - .swap_deactivate = aufs_swap_deactivate -#endif /* CONFIG_AUFS_DEBUG */ -}; diff --git a/fs/aufs/file.h b/fs/aufs/file.h deleted file mode 100644 index 10ede169f..000000000 --- a/fs/aufs/file.h +++ /dev/null @@ -1,291 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * file operations - */ - -#ifndef __AUFS_FILE_H__ -#define __AUFS_FILE_H__ - -#ifdef __KERNEL__ - -#include <linux/file.h> -#include <linux/fs.h> -#include <linux/poll.h> -#include "rwsem.h" - -struct au_branch; -struct au_hfile { - struct file *hf_file; - struct au_branch *hf_br; -}; - -struct au_vdir; -struct au_fidir { - aufs_bindex_t fd_bbot; - aufs_bindex_t fd_nent; - struct au_vdir *fd_vdir_cache; - struct au_hfile fd_hfile[]; -}; - -static inline int au_fidir_sz(int nent) -{ - AuDebugOn(nent < 0); - return sizeof(struct au_fidir) + sizeof(struct au_hfile) * nent; -} - -struct au_finfo { - atomic_t fi_generation; - - struct au_rwsem fi_rwsem; - aufs_bindex_t fi_btop; - - /* do not union them */ - struct { /* for non-dir */ - struct au_hfile fi_htop; - atomic_t fi_mmapped; - }; - struct au_fidir *fi_hdir; /* for dir only */ - - struct hlist_node fi_hlist; - struct file *fi_file; /* very ugly */ -} ____cacheline_aligned_in_smp; - -/* ---------------------------------------------------------------------- */ - -/* file.c */ -extern const struct address_space_operations aufs_aop; -unsigned int au_file_roflags(unsigned int flags); -struct file *au_h_open(struct dentry *dentry, aufs_bindex_t bindex, int flags, - struct file *file, int force_wr); -struct au_do_open_args { - int no_lock; - int (*open)(struct file *file, int flags, - struct file *h_file); - struct au_fidir *fidir; - struct file *h_file; -}; -int au_do_open(struct file *file, struct au_do_open_args *args); -int au_reopen_nondir(struct file *file); -struct au_pin; -int au_ready_to_write(struct file *file, loff_t len, struct au_pin *pin); -int au_reval_and_lock_fdi(struct file *file, int (*reopen)(struct file *file), - int wlock); -int au_do_flush(struct file *file, fl_owner_t id, - int (*flush)(struct file *file, fl_owner_t id)); - -/* poll.c */ -#ifdef CONFIG_AUFS_POLL -unsigned int aufs_poll(struct file *file, poll_table *wait); -#endif - -#ifdef CONFIG_AUFS_BR_HFSPLUS -/* hfsplus.c */ -struct file *au_h_open_pre(struct dentry *dentry, aufs_bindex_t bindex, - int force_wr); -void au_h_open_post(struct dentry *dentry, aufs_bindex_t bindex, - struct file *h_file); -#else -AuStub(struct file *, au_h_open_pre, return NULL, struct dentry *dentry, - aufs_bindex_t bindex, int force_wr) -AuStubVoid(au_h_open_post, struct dentry *dentry, aufs_bindex_t bindex, - struct file *h_file); -#endif - -/* f_op.c */ -extern const struct file_operations aufs_file_fop; -int au_do_open_nondir(struct file *file, int flags, struct file *h_file); -int aufs_release_nondir(struct inode *inode __maybe_unused, struct file *file); -struct file *au_read_pre(struct file *file, int keep_fi); - -/* finfo.c */ -void au_hfput(struct au_hfile *hf, struct file *file); -void au_set_h_fptr(struct file *file, aufs_bindex_t bindex, - struct file *h_file); - -void au_update_figen(struct file *file); -struct au_fidir *au_fidir_alloc(struct super_block *sb); -int au_fidir_realloc(struct au_finfo *finfo, int nbr); - -void au_fi_init_once(void *_fi); -void au_finfo_fin(struct file *file); -int au_finfo_init(struct file *file, struct au_fidir *fidir); - -/* ioctl.c */ -long aufs_ioctl_nondir(struct file *file, unsigned int cmd, unsigned long arg); -#ifdef CONFIG_COMPAT -long aufs_compat_ioctl_dir(struct file *file, unsigned int cmd, - unsigned long arg); -long aufs_compat_ioctl_nondir(struct file *file, unsigned int cmd, - unsigned long arg); -#endif - -/* ---------------------------------------------------------------------- */ - -static inline struct au_finfo *au_fi(struct file *file) -{ - return file->private_data; -} - -/* ---------------------------------------------------------------------- */ - -/* - * fi_read_lock, fi_write_lock, - * fi_read_unlock, fi_write_unlock, fi_downgrade_lock - */ -AuSimpleRwsemFuncs(fi, struct file *f, &au_fi(f)->fi_rwsem); - -#define FiMustNoWaiters(f) AuRwMustNoWaiters(&au_fi(f)->fi_rwsem) -#define FiMustAnyLock(f) AuRwMustAnyLock(&au_fi(f)->fi_rwsem) -#define FiMustWriteLock(f) AuRwMustWriteLock(&au_fi(f)->fi_rwsem) - -/* ---------------------------------------------------------------------- */ - -/* todo: hard/soft set? */ -static inline aufs_bindex_t au_fbstart(struct file *file) -{ - FiMustAnyLock(file); - return au_fi(file)->fi_btop; -} - -static inline aufs_bindex_t au_fbend_dir(struct file *file) -{ - FiMustAnyLock(file); - AuDebugOn(!au_fi(file)->fi_hdir); - return au_fi(file)->fi_hdir->fd_bbot; -} - -static inline struct au_vdir *au_fvdir_cache(struct file *file) -{ - FiMustAnyLock(file); - AuDebugOn(!au_fi(file)->fi_hdir); - return au_fi(file)->fi_hdir->fd_vdir_cache; -} - -static inline void au_set_fbstart(struct file *file, aufs_bindex_t bindex) -{ - FiMustWriteLock(file); - au_fi(file)->fi_btop = bindex; -} - -static inline void au_set_fbend_dir(struct file *file, aufs_bindex_t bindex) -{ - FiMustWriteLock(file); - AuDebugOn(!au_fi(file)->fi_hdir); - au_fi(file)->fi_hdir->fd_bbot = bindex; -} - -static inline void au_set_fvdir_cache(struct file *file, - struct au_vdir *vdir_cache) -{ - FiMustWriteLock(file); - AuDebugOn(!au_fi(file)->fi_hdir); - au_fi(file)->fi_hdir->fd_vdir_cache = vdir_cache; -} - -static inline struct file *au_hf_top(struct file *file) -{ - FiMustAnyLock(file); - AuDebugOn(au_fi(file)->fi_hdir); - return au_fi(file)->fi_htop.hf_file; -} - -static inline struct file *au_hf_dir(struct file *file, aufs_bindex_t bindex) -{ - FiMustAnyLock(file); - AuDebugOn(!au_fi(file)->fi_hdir); - return au_fi(file)->fi_hdir->fd_hfile[0 + bindex].hf_file; -} - -/* todo: memory barrier? */ -static inline unsigned int au_figen(struct file *f) -{ - return atomic_read(&au_fi(f)->fi_generation); -} - -static inline void au_set_mmapped(struct file *f) -{ - if (atomic_inc_return(&au_fi(f)->fi_mmapped)) - return; - pr_warn("fi_mmapped wrapped around\n"); - while (!atomic_inc_return(&au_fi(f)->fi_mmapped)) - ; -} - -static inline void au_unset_mmapped(struct file *f) -{ - atomic_dec(&au_fi(f)->fi_mmapped); -} - -static inline int au_test_mmapped(struct file *f) -{ - return atomic_read(&au_fi(f)->fi_mmapped); -} - -/* customize vma->vm_file */ - -static inline void au_do_vm_file_reset(struct vm_area_struct *vma, - struct file *file) -{ - struct file *f; - - f = vma->vm_file; - get_file(file); - vma->vm_file = file; - fput(f); -} - -#ifdef CONFIG_MMU -#define AuDbgVmRegion(file, vma) do {} while (0) - -static inline void au_vm_file_reset(struct vm_area_struct *vma, - struct file *file) -{ - au_do_vm_file_reset(vma, file); -} -#else -#define AuDbgVmRegion(file, vma) \ - AuDebugOn((vma)->vm_region && (vma)->vm_region->vm_file != (file)) - -static inline void au_vm_file_reset(struct vm_area_struct *vma, - struct file *file) -{ - struct file *f; - - au_do_vm_file_reset(vma, file); - f = vma->vm_region->vm_file; - get_file(file); - vma->vm_region->vm_file = file; - fput(f); -} -#endif /* CONFIG_MMU */ - -/* handle vma->vm_prfile */ -static inline void au_vm_prfile_set(struct vm_area_struct *vma, - struct file *file) -{ - get_file(file); - vma->vm_prfile = file; -#ifndef CONFIG_MMU - get_file(file); - vma->vm_region->vm_prfile = file; -#endif -} - -#endif /* __KERNEL__ */ -#endif /* __AUFS_FILE_H__ */ diff --git a/fs/aufs/finfo.c b/fs/aufs/finfo.c deleted file mode 100644 index 7ac467cf7..000000000 --- a/fs/aufs/finfo.c +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * file private data - */ - -#include "aufs.h" - -void au_hfput(struct au_hfile *hf, struct file *file) -{ - /* todo: direct access f_flags */ - if (vfsub_file_flags(file) & __FMODE_EXEC) - allow_write_access(hf->hf_file); - fput(hf->hf_file); - hf->hf_file = NULL; - atomic_dec(&hf->hf_br->br_count); - hf->hf_br = NULL; -} - -void au_set_h_fptr(struct file *file, aufs_bindex_t bindex, struct file *val) -{ - struct au_finfo *finfo = au_fi(file); - struct au_hfile *hf; - struct au_fidir *fidir; - - fidir = finfo->fi_hdir; - if (!fidir) { - AuDebugOn(finfo->fi_btop != bindex); - hf = &finfo->fi_htop; - } else - hf = fidir->fd_hfile + bindex; - - if (hf && hf->hf_file) - au_hfput(hf, file); - if (val) { - FiMustWriteLock(file); - AuDebugOn(IS_ERR_OR_NULL(file->f_path.dentry)); - hf->hf_file = val; - hf->hf_br = au_sbr(file->f_path.dentry->d_sb, bindex); - } -} - -void au_update_figen(struct file *file) -{ - atomic_set(&au_fi(file)->fi_generation, au_digen(file->f_path.dentry)); - /* smp_mb(); */ /* atomic_set */ -} - -/* ---------------------------------------------------------------------- */ - -struct au_fidir *au_fidir_alloc(struct super_block *sb) -{ - struct au_fidir *fidir; - int nbr; - - nbr = au_sbend(sb) + 1; - if (nbr < 2) - nbr = 2; /* initial allocate for 2 branches */ - fidir = kzalloc(au_fidir_sz(nbr), GFP_NOFS); - if (fidir) { - fidir->fd_bbot = -1; - fidir->fd_nent = nbr; - fidir->fd_vdir_cache = NULL; - } - - return fidir; -} - -int au_fidir_realloc(struct au_finfo *finfo, int nbr) -{ - int err; - struct au_fidir *fidir, *p; - - AuRwMustWriteLock(&finfo->fi_rwsem); - fidir = finfo->fi_hdir; - AuDebugOn(!fidir); - - err = -ENOMEM; - p = au_kzrealloc(fidir, au_fidir_sz(fidir->fd_nent), au_fidir_sz(nbr), - GFP_NOFS); - if (p) { - p->fd_nent = nbr; - finfo->fi_hdir = p; - err = 0; - } - - return err; -} - -/* ---------------------------------------------------------------------- */ - -void au_finfo_fin(struct file *file) -{ - struct au_finfo *finfo; - - au_nfiles_dec(file->f_path.dentry->d_sb); - - finfo = au_fi(file); - AuDebugOn(finfo->fi_hdir); - AuRwDestroy(&finfo->fi_rwsem); - au_cache_free_finfo(finfo); -} - -void au_fi_init_once(void *_finfo) -{ - struct au_finfo *finfo = _finfo; - static struct lock_class_key aufs_fi; - - au_rw_init(&finfo->fi_rwsem); - au_rw_class(&finfo->fi_rwsem, &aufs_fi); -} - -int au_finfo_init(struct file *file, struct au_fidir *fidir) -{ - int err; - struct au_finfo *finfo; - struct dentry *dentry; - - err = -ENOMEM; - dentry = file->f_path.dentry; - finfo = au_cache_alloc_finfo(); - if (unlikely(!finfo)) - goto out; - - err = 0; - au_nfiles_inc(dentry->d_sb); - /* verbose coding for lock class name */ - if (!fidir) - au_rw_class(&finfo->fi_rwsem, au_lc_key + AuLcNonDir_FIINFO); - else - au_rw_class(&finfo->fi_rwsem, au_lc_key + AuLcDir_FIINFO); - au_rw_write_lock(&finfo->fi_rwsem); - finfo->fi_btop = -1; - finfo->fi_hdir = fidir; - atomic_set(&finfo->fi_generation, au_digen(dentry)); - /* smp_mb(); */ /* atomic_set */ - - file->private_data = finfo; - -out: - return err; -} diff --git a/fs/aufs/fstype.h b/fs/aufs/fstype.h deleted file mode 100644 index 8b6878b4e..000000000 --- a/fs/aufs/fstype.h +++ /dev/null @@ -1,400 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * judging filesystem type - */ - -#ifndef __AUFS_FSTYPE_H__ -#define __AUFS_FSTYPE_H__ - -#ifdef __KERNEL__ - -#include <linux/fs.h> -#include <linux/magic.h> -#include <linux/romfs_fs.h> -#include <linux/nfs_fs.h> - -static inline int au_test_aufs(struct super_block *sb) -{ - return sb->s_magic == AUFS_SUPER_MAGIC; -} - -static inline const char *au_sbtype(struct super_block *sb) -{ - return sb->s_type->name; -} - -static inline int au_test_iso9660(struct super_block *sb __maybe_unused) -{ -#if defined(CONFIG_ISO9660_FS) || defined(CONFIG_ISO9660_FS_MODULE) - return sb->s_magic == ISOFS_SUPER_MAGIC; -#else - return 0; -#endif -} - -static inline int au_test_romfs(struct super_block *sb __maybe_unused) -{ -#if defined(CONFIG_ROMFS_FS) || defined(CONFIG_ROMFS_FS_MODULE) - return sb->s_magic == ROMFS_MAGIC; -#else - return 0; -#endif -} - -static inline int au_test_cramfs(struct super_block *sb __maybe_unused) -{ -#if defined(CONFIG_CRAMFS) || defined(CONFIG_CRAMFS_MODULE) - return sb->s_magic == CRAMFS_MAGIC; -#endif - return 0; -} - -static inline int au_test_nfs(struct super_block *sb __maybe_unused) -{ -#if defined(CONFIG_NFS_FS) || defined(CONFIG_NFS_FS_MODULE) - return sb->s_magic == NFS_SUPER_MAGIC; -#else - return 0; -#endif -} - -static inline int au_test_fuse(struct super_block *sb __maybe_unused) -{ -#if defined(CONFIG_FUSE_FS) || defined(CONFIG_FUSE_FS_MODULE) - return sb->s_magic == FUSE_SUPER_MAGIC; -#else - return 0; -#endif -} - -static inline int au_test_xfs(struct super_block *sb __maybe_unused) -{ -#if defined(CONFIG_XFS_FS) || defined(CONFIG_XFS_FS_MODULE) - return sb->s_magic == XFS_SB_MAGIC; -#else - return 0; -#endif -} - -static inline int au_test_tmpfs(struct super_block *sb __maybe_unused) -{ -#ifdef CONFIG_TMPFS - return sb->s_magic == TMPFS_MAGIC; -#else - return 0; -#endif -} - -static inline int au_test_ecryptfs(struct super_block *sb __maybe_unused) -{ -#if defined(CONFIG_ECRYPT_FS) || defined(CONFIG_ECRYPT_FS_MODULE) - return !strcmp(au_sbtype(sb), "ecryptfs"); -#else - return 0; -#endif -} - -static inline int au_test_ramfs(struct super_block *sb) -{ - return sb->s_magic == RAMFS_MAGIC; -} - -static inline int au_test_ubifs(struct super_block *sb __maybe_unused) -{ -#if defined(CONFIG_UBIFS_FS) || defined(CONFIG_UBIFS_FS_MODULE) - return sb->s_magic == UBIFS_SUPER_MAGIC; -#else - return 0; -#endif -} - -static inline int au_test_procfs(struct super_block *sb __maybe_unused) -{ -#ifdef CONFIG_PROC_FS - return sb->s_magic == PROC_SUPER_MAGIC; -#else - return 0; -#endif -} - -static inline int au_test_sysfs(struct super_block *sb __maybe_unused) -{ -#ifdef CONFIG_SYSFS - return sb->s_magic == SYSFS_MAGIC; -#else - return 0; -#endif -} - -static inline int au_test_configfs(struct super_block *sb __maybe_unused) -{ -#if defined(CONFIG_CONFIGFS_FS) || defined(CONFIG_CONFIGFS_FS_MODULE) - return sb->s_magic == CONFIGFS_MAGIC; -#else - return 0; -#endif -} - -static inline int au_test_minix(struct super_block *sb __maybe_unused) -{ -#if defined(CONFIG_MINIX_FS) || defined(CONFIG_MINIX_FS_MODULE) - return sb->s_magic == MINIX3_SUPER_MAGIC - || sb->s_magic == MINIX2_SUPER_MAGIC - || sb->s_magic == MINIX2_SUPER_MAGIC2 - || sb->s_magic == MINIX_SUPER_MAGIC - || sb->s_magic == MINIX_SUPER_MAGIC2; -#else - return 0; -#endif -} - -static inline int au_test_fat(struct super_block *sb __maybe_unused) -{ -#if defined(CONFIG_FAT_FS) || defined(CONFIG_FAT_FS_MODULE) - return sb->s_magic == MSDOS_SUPER_MAGIC; -#else - return 0; -#endif -} - -static inline int au_test_msdos(struct super_block *sb) -{ - return au_test_fat(sb); -} - -static inline int au_test_vfat(struct super_block *sb) -{ - return au_test_fat(sb); -} - -static inline int au_test_securityfs(struct super_block *sb __maybe_unused) -{ -#ifdef CONFIG_SECURITYFS - return sb->s_magic == SECURITYFS_MAGIC; -#else - return 0; -#endif -} - -static inline int au_test_squashfs(struct super_block *sb __maybe_unused) -{ -#if defined(CONFIG_SQUASHFS) || defined(CONFIG_SQUASHFS_MODULE) - return sb->s_magic == SQUASHFS_MAGIC; -#else - return 0; -#endif -} - -static inline int au_test_btrfs(struct super_block *sb __maybe_unused) -{ -#if defined(CONFIG_BTRFS_FS) || defined(CONFIG_BTRFS_FS_MODULE) - return sb->s_magic == BTRFS_SUPER_MAGIC; -#else - return 0; -#endif -} - -static inline int au_test_xenfs(struct super_block *sb __maybe_unused) -{ -#if defined(CONFIG_XENFS) || defined(CONFIG_XENFS_MODULE) - return sb->s_magic == XENFS_SUPER_MAGIC; -#else - return 0; -#endif -} - -static inline int au_test_debugfs(struct super_block *sb __maybe_unused) -{ -#ifdef CONFIG_DEBUG_FS - return sb->s_magic == DEBUGFS_MAGIC; -#else - return 0; -#endif -} - -static inline int au_test_nilfs(struct super_block *sb __maybe_unused) -{ -#if defined(CONFIG_NILFS) || defined(CONFIG_NILFS_MODULE) - return sb->s_magic == NILFS_SUPER_MAGIC; -#else - return 0; -#endif -} - -static inline int au_test_hfsplus(struct super_block *sb __maybe_unused) -{ -#if defined(CONFIG_HFSPLUS_FS) || defined(CONFIG_HFSPLUS_FS_MODULE) - return sb->s_magic == HFSPLUS_SUPER_MAGIC; -#else - return 0; -#endif -} - -/* ---------------------------------------------------------------------- */ -/* - * they can't be an aufs branch. - */ -static inline int au_test_fs_unsuppoted(struct super_block *sb) -{ - return -#ifndef CONFIG_AUFS_BR_RAMFS - au_test_ramfs(sb) || -#endif - au_test_procfs(sb) - || au_test_sysfs(sb) - || au_test_configfs(sb) - || au_test_debugfs(sb) - || au_test_securityfs(sb) - || au_test_xenfs(sb) - || au_test_ecryptfs(sb) - /* || !strcmp(au_sbtype(sb), "unionfs") */ - || au_test_aufs(sb); /* will be supported in next version */ -} - -static inline int au_test_fs_remote(struct super_block *sb) -{ - return !au_test_tmpfs(sb) -#ifdef CONFIG_AUFS_BR_RAMFS - && !au_test_ramfs(sb) -#endif - && !(sb->s_type->fs_flags & FS_REQUIRES_DEV); -} - -/* ---------------------------------------------------------------------- */ - -/* - * Note: these functions (below) are created after reading ->getattr() in all - * filesystems under linux/fs. it means we have to do so in every update... - */ - -/* - * some filesystems require getattr to refresh the inode attributes before - * referencing. - * in most cases, we can rely on the inode attribute in NFS (or every remote fs) - * and leave the work for d_revalidate() - */ -static inline int au_test_fs_refresh_iattr(struct super_block *sb) -{ - return au_test_nfs(sb) - || au_test_fuse(sb) - /* || au_test_btrfs(sb) */ /* untested */ - ; -} - -/* - * filesystems which don't maintain i_size or i_blocks. - */ -static inline int au_test_fs_bad_iattr_size(struct super_block *sb) -{ - return au_test_xfs(sb) - || au_test_btrfs(sb) - || au_test_ubifs(sb) - || au_test_hfsplus(sb) /* maintained, but incorrect */ - /* || au_test_minix(sb) */ /* untested */ - ; -} - -/* - * filesystems which don't store the correct value in some of their inode - * attributes. - */ -static inline int au_test_fs_bad_iattr(struct super_block *sb) -{ - return au_test_fs_bad_iattr_size(sb) - || au_test_fat(sb) - || au_test_msdos(sb) - || au_test_vfat(sb); -} - -/* they don't check i_nlink in link(2) */ -static inline int au_test_fs_no_limit_nlink(struct super_block *sb) -{ - return au_test_tmpfs(sb) -#ifdef CONFIG_AUFS_BR_RAMFS - || au_test_ramfs(sb) -#endif - || au_test_ubifs(sb) - || au_test_hfsplus(sb); -} - -/* - * filesystems which sets S_NOATIME and S_NOCMTIME. - */ -static inline int au_test_fs_notime(struct super_block *sb) -{ - return au_test_nfs(sb) - || au_test_fuse(sb) - || au_test_ubifs(sb) - ; -} - -/* temporary support for i#1 in cramfs */ -static inline int au_test_fs_unique_ino(struct inode *inode) -{ - if (au_test_cramfs(inode->i_sb)) - return inode->i_ino != 1; - return 1; -} - -/* ---------------------------------------------------------------------- */ - -/* - * the filesystem where the xino files placed must support i/o after unlink and - * maintain i_size and i_blocks. - */ -static inline int au_test_fs_bad_xino(struct super_block *sb) -{ - return au_test_fs_remote(sb) - || au_test_fs_bad_iattr_size(sb) - /* don't want unnecessary work for xino */ - || au_test_aufs(sb) - || au_test_ecryptfs(sb) - || au_test_nilfs(sb); -} - -static inline int au_test_fs_trunc_xino(struct super_block *sb) -{ - return au_test_tmpfs(sb) - || au_test_ramfs(sb); -} - -/* - * test if the @sb is real-readonly. - */ -static inline int au_test_fs_rr(struct super_block *sb) -{ - return au_test_squashfs(sb) - || au_test_iso9660(sb) - || au_test_cramfs(sb) - || au_test_romfs(sb); -} - -/* - * test if the @inode is nfs with 'noacl' option - * NFS always sets MS_POSIXACL regardless its mount option 'noacl.' - */ -static inline int au_test_nfs_noacl(struct inode *inode) -{ - return au_test_nfs(inode->i_sb) - /* && IS_POSIXACL(inode) */ - && !nfs_server_capable(inode, NFS_CAP_ACLS); -} - -#endif /* __KERNEL__ */ -#endif /* __AUFS_FSTYPE_H__ */ diff --git a/fs/aufs/hfsnotify.c b/fs/aufs/hfsnotify.c deleted file mode 100644 index dc4f6c3c9..000000000 --- a/fs/aufs/hfsnotify.c +++ /dev/null @@ -1,288 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * fsnotify for the lower directories - */ - -#include "aufs.h" - -/* FS_IN_IGNORED is unnecessary */ -static const __u32 AuHfsnMask = (FS_MOVED_TO | FS_MOVED_FROM | FS_DELETE - | FS_CREATE | FS_EVENT_ON_CHILD); -static DECLARE_WAIT_QUEUE_HEAD(au_hfsn_wq); -static __cacheline_aligned_in_smp atomic64_t au_hfsn_ifree = ATOMIC64_INIT(0); - -static void au_hfsn_free_mark(struct fsnotify_mark *mark) -{ - struct au_hnotify *hn = container_of(mark, struct au_hnotify, - hn_mark); - AuDbg("here\n"); - au_cache_free_hnotify(hn); - smp_mb__before_atomic(); - if (atomic64_dec_and_test(&au_hfsn_ifree)) - wake_up(&au_hfsn_wq); -} - -static int au_hfsn_alloc(struct au_hinode *hinode) -{ - int err; - struct au_hnotify *hn; - struct super_block *sb; - struct au_branch *br; - struct fsnotify_mark *mark; - aufs_bindex_t bindex; - - hn = hinode->hi_notify; - sb = hn->hn_aufs_inode->i_sb; - bindex = au_br_index(sb, hinode->hi_id); - br = au_sbr(sb, bindex); - AuDebugOn(!br->br_hfsn); - - mark = &hn->hn_mark; - fsnotify_init_mark(mark, au_hfsn_free_mark); - mark->mask = AuHfsnMask; - /* - * by udba rename or rmdir, aufs assign a new inode to the known - * h_inode, so specify 1 to allow dups. - */ - lockdep_off(); - err = fsnotify_add_mark(mark, br->br_hfsn->hfsn_group, hinode->hi_inode, - /*mnt*/NULL, /*allow_dups*/1); - /* even if err */ - fsnotify_put_mark(mark); - lockdep_on(); - - return err; -} - -static int au_hfsn_free(struct au_hinode *hinode, struct au_hnotify *hn) -{ - struct fsnotify_mark *mark; - unsigned long long ull; - struct fsnotify_group *group; - - ull = atomic64_inc_return(&au_hfsn_ifree); - BUG_ON(!ull); - - mark = &hn->hn_mark; - spin_lock(&mark->lock); - group = mark->group; - fsnotify_get_group(group); - spin_unlock(&mark->lock); - lockdep_off(); - fsnotify_destroy_mark(mark, group); - fsnotify_put_group(group); - lockdep_on(); - - /* free hn by myself */ - return 0; -} - -/* ---------------------------------------------------------------------- */ - -static void au_hfsn_ctl(struct au_hinode *hinode, int do_set) -{ - struct fsnotify_mark *mark; - - mark = &hinode->hi_notify->hn_mark; - spin_lock(&mark->lock); - if (do_set) { - AuDebugOn(mark->mask & AuHfsnMask); - mark->mask |= AuHfsnMask; - } else { - AuDebugOn(!(mark->mask & AuHfsnMask)); - mark->mask &= ~AuHfsnMask; - } - spin_unlock(&mark->lock); - /* fsnotify_recalc_inode_mask(hinode->hi_inode); */ -} - -/* ---------------------------------------------------------------------- */ - -/* #define AuDbgHnotify */ -#ifdef AuDbgHnotify -static char *au_hfsn_name(u32 mask) -{ -#ifdef CONFIG_AUFS_DEBUG -#define test_ret(flag) \ - do { \ - if (mask & flag) \ - return #flag; \ - } while (0) - test_ret(FS_ACCESS); - test_ret(FS_MODIFY); - test_ret(FS_ATTRIB); - test_ret(FS_CLOSE_WRITE); - test_ret(FS_CLOSE_NOWRITE); - test_ret(FS_OPEN); - test_ret(FS_MOVED_FROM); - test_ret(FS_MOVED_TO); - test_ret(FS_CREATE); - test_ret(FS_DELETE); - test_ret(FS_DELETE_SELF); - test_ret(FS_MOVE_SELF); - test_ret(FS_UNMOUNT); - test_ret(FS_Q_OVERFLOW); - test_ret(FS_IN_IGNORED); - test_ret(FS_ISDIR); - test_ret(FS_IN_ONESHOT); - test_ret(FS_EVENT_ON_CHILD); - return ""; -#undef test_ret -#else - return "??"; -#endif -} -#endif - -/* ---------------------------------------------------------------------- */ - -static void au_hfsn_free_group(struct fsnotify_group *group) -{ - struct au_br_hfsnotify *hfsn = group->private; - - AuDbg("here\n"); - kfree(hfsn); -} - -static int au_hfsn_handle_event(struct fsnotify_group *group, - struct inode *inode, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, - u32 mask, void *data, int data_type, - const unsigned char *file_name, u32 cookie) -{ - int err; - struct au_hnotify *hnotify; - struct inode *h_dir, *h_inode; - struct qstr h_child_qstr = QSTR_INIT(file_name, strlen(file_name)); - - AuDebugOn(data_type != FSNOTIFY_EVENT_INODE); - - err = 0; - /* if FS_UNMOUNT happens, there must be another bug */ - AuDebugOn(mask & FS_UNMOUNT); - if (mask & (FS_IN_IGNORED | FS_UNMOUNT)) - goto out; - - h_dir = inode; - h_inode = NULL; -#ifdef AuDbgHnotify - au_debug_on(); - if (1 || h_child_qstr.len != sizeof(AUFS_XINO_FNAME) - 1 - || strncmp(h_child_qstr.name, AUFS_XINO_FNAME, h_child_qstr.len)) { - AuDbg("i%lu, mask 0x%x %s, hcname %.*s, hi%lu\n", - h_dir->i_ino, mask, au_hfsn_name(mask), - AuLNPair(&h_child_qstr), h_inode ? h_inode->i_ino : 0); - /* WARN_ON(1); */ - } - au_debug_off(); -#endif - - AuDebugOn(!inode_mark); - hnotify = container_of(inode_mark, struct au_hnotify, hn_mark); - err = au_hnotify(h_dir, hnotify, mask, &h_child_qstr, h_inode); - -out: - return err; -} - -static struct fsnotify_ops au_hfsn_ops = { - .handle_event = au_hfsn_handle_event, - .free_group_priv = au_hfsn_free_group -}; - -/* ---------------------------------------------------------------------- */ - -static void au_hfsn_fin_br(struct au_branch *br) -{ - struct au_br_hfsnotify *hfsn; - - hfsn = br->br_hfsn; - if (hfsn) { - lockdep_off(); - fsnotify_put_group(hfsn->hfsn_group); - lockdep_on(); - } -} - -static int au_hfsn_init_br(struct au_branch *br, int perm) -{ - int err; - struct fsnotify_group *group; - struct au_br_hfsnotify *hfsn; - - err = 0; - br->br_hfsn = NULL; - if (!au_br_hnotifyable(perm)) - goto out; - - err = -ENOMEM; - hfsn = kmalloc(sizeof(*hfsn), GFP_NOFS); - if (unlikely(!hfsn)) - goto out; - - err = 0; - group = fsnotify_alloc_group(&au_hfsn_ops); - if (IS_ERR(group)) { - err = PTR_ERR(group); - pr_err("fsnotify_alloc_group() failed, %d\n", err); - goto out_hfsn; - } - - group->private = hfsn; - hfsn->hfsn_group = group; - br->br_hfsn = hfsn; - goto out; /* success */ - -out_hfsn: - kfree(hfsn); -out: - return err; -} - -static int au_hfsn_reset_br(unsigned int udba, struct au_branch *br, int perm) -{ - int err; - - err = 0; - if (!br->br_hfsn) - err = au_hfsn_init_br(br, perm); - - return err; -} - -/* ---------------------------------------------------------------------- */ - -static void au_hfsn_fin(void) -{ - AuDbg("au_hfsn_ifree %lld\n", (long long)atomic64_read(&au_hfsn_ifree)); - wait_event(au_hfsn_wq, !atomic64_read(&au_hfsn_ifree)); -} - -const struct au_hnotify_op au_hnotify_op = { - .ctl = au_hfsn_ctl, - .alloc = au_hfsn_alloc, - .free = au_hfsn_free, - - .fin = au_hfsn_fin, - - .reset_br = au_hfsn_reset_br, - .fin_br = au_hfsn_fin_br, - .init_br = au_hfsn_init_br -}; diff --git a/fs/aufs/hfsplus.c b/fs/aufs/hfsplus.c deleted file mode 100644 index ced054639..000000000 --- a/fs/aufs/hfsplus.c +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (C) 2010-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * special support for filesystems which aqucires an inode mutex - * at final closing a file, eg, hfsplus. - * - * This trick is very simple and stupid, just to open the file before really - * neceeary open to tell hfsplus that this is not the final closing. - * The caller should call au_h_open_pre() after acquiring the inode mutex, - * and au_h_open_post() after releasing it. - */ - -#include "aufs.h" - -struct file *au_h_open_pre(struct dentry *dentry, aufs_bindex_t bindex, - int force_wr) -{ - struct file *h_file; - struct dentry *h_dentry; - - h_dentry = au_h_dptr(dentry, bindex); - AuDebugOn(!h_dentry); - AuDebugOn(d_is_negative(h_dentry)); - - h_file = NULL; - if (au_test_hfsplus(h_dentry->d_sb) - && d_is_reg(h_dentry)) - h_file = au_h_open(dentry, bindex, - O_RDONLY | O_NOATIME | O_LARGEFILE, - /*file*/NULL, force_wr); - return h_file; -} - -void au_h_open_post(struct dentry *dentry, aufs_bindex_t bindex, - struct file *h_file) -{ - if (h_file) { - fput(h_file); - au_sbr_put(dentry->d_sb, bindex); - } -} diff --git a/fs/aufs/hnotify.c b/fs/aufs/hnotify.c deleted file mode 100644 index 643bc57c5..000000000 --- a/fs/aufs/hnotify.c +++ /dev/null @@ -1,710 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * abstraction to notify the direct changes on lower directories - */ - -#include "aufs.h" - -int au_hn_alloc(struct au_hinode *hinode, struct inode *inode) -{ - int err; - struct au_hnotify *hn; - - err = -ENOMEM; - hn = au_cache_alloc_hnotify(); - if (hn) { - hn->hn_aufs_inode = inode; - hinode->hi_notify = hn; - err = au_hnotify_op.alloc(hinode); - AuTraceErr(err); - if (unlikely(err)) { - hinode->hi_notify = NULL; - au_cache_free_hnotify(hn); - /* - * The upper dir was removed by udba, but the same named - * dir left. In this case, aufs assignes a new inode - * number and set the monitor again. - * For the lower dir, the old monitnor is still left. - */ - if (err == -EEXIST) - err = 0; - } - } - - AuTraceErr(err); - return err; -} - -void au_hn_free(struct au_hinode *hinode) -{ - struct au_hnotify *hn; - - hn = hinode->hi_notify; - if (hn) { - hinode->hi_notify = NULL; - if (au_hnotify_op.free(hinode, hn)) - au_cache_free_hnotify(hn); - } -} - -/* ---------------------------------------------------------------------- */ - -void au_hn_ctl(struct au_hinode *hinode, int do_set) -{ - if (hinode->hi_notify) - au_hnotify_op.ctl(hinode, do_set); -} - -void au_hn_reset(struct inode *inode, unsigned int flags) -{ - aufs_bindex_t bindex, bend; - struct inode *hi; - struct dentry *iwhdentry; - - bend = au_ibend(inode); - for (bindex = au_ibstart(inode); bindex <= bend; bindex++) { - hi = au_h_iptr(inode, bindex); - if (!hi) - continue; - - /* mutex_lock_nested(&hi->i_mutex, AuLsc_I_CHILD); */ - iwhdentry = au_hi_wh(inode, bindex); - if (iwhdentry) - dget(iwhdentry); - au_igrab(hi); - au_set_h_iptr(inode, bindex, NULL, 0); - au_set_h_iptr(inode, bindex, au_igrab(hi), - flags & ~AuHi_XINO); - iput(hi); - dput(iwhdentry); - /* mutex_unlock(&hi->i_mutex); */ - } -} - -/* ---------------------------------------------------------------------- */ - -static int hn_xino(struct inode *inode, struct inode *h_inode) -{ - int err; - aufs_bindex_t bindex, bend, bfound, bstart; - struct inode *h_i; - - err = 0; - if (unlikely(inode->i_ino == AUFS_ROOT_INO)) { - pr_warn("branch root dir was changed\n"); - goto out; - } - - bfound = -1; - bend = au_ibend(inode); - bstart = au_ibstart(inode); -#if 0 /* reserved for future use */ - if (bindex == bend) { - /* keep this ino in rename case */ - goto out; - } -#endif - for (bindex = bstart; bindex <= bend; bindex++) - if (au_h_iptr(inode, bindex) == h_inode) { - bfound = bindex; - break; - } - if (bfound < 0) - goto out; - - for (bindex = bstart; bindex <= bend; bindex++) { - h_i = au_h_iptr(inode, bindex); - if (!h_i) - continue; - - err = au_xino_write(inode->i_sb, bindex, h_i->i_ino, /*ino*/0); - /* ignore this error */ - /* bad action? */ - } - - /* children inode number will be broken */ - -out: - AuTraceErr(err); - return err; -} - -static int hn_gen_tree(struct dentry *dentry) -{ - int err, i, j, ndentry; - struct au_dcsub_pages dpages; - struct au_dpage *dpage; - struct dentry **dentries; - - err = au_dpages_init(&dpages, GFP_NOFS); - if (unlikely(err)) - goto out; - err = au_dcsub_pages(&dpages, dentry, NULL, NULL); - if (unlikely(err)) - goto out_dpages; - - for (i = 0; i < dpages.ndpage; i++) { - dpage = dpages.dpages + i; - dentries = dpage->dentries; - ndentry = dpage->ndentry; - for (j = 0; j < ndentry; j++) { - struct dentry *d; - - d = dentries[j]; - if (IS_ROOT(d)) - continue; - - au_digen_dec(d); - if (d_really_is_positive(d)) - /* todo: reset children xino? - cached children only? */ - au_iigen_dec(d_inode(d)); - } - } - -out_dpages: - au_dpages_free(&dpages); - -#if 0 - /* discard children */ - dentry_unhash(dentry); - dput(dentry); -#endif -out: - return err; -} - -/* - * return 0 if processed. - */ -static int hn_gen_by_inode(char *name, unsigned int nlen, struct inode *inode, - const unsigned int isdir) -{ - int err; - struct dentry *d; - struct qstr *dname; - - err = 1; - if (unlikely(inode->i_ino == AUFS_ROOT_INO)) { - pr_warn("branch root dir was changed\n"); - err = 0; - goto out; - } - - if (!isdir) { - AuDebugOn(!name); - au_iigen_dec(inode); - spin_lock(&inode->i_lock); - hlist_for_each_entry(d, &inode->i_dentry, d_u.d_alias) { - spin_lock(&d->d_lock); - dname = &d->d_name; - if (dname->len != nlen - && memcmp(dname->name, name, nlen)) { - spin_unlock(&d->d_lock); - continue; - } - err = 0; - au_digen_dec(d); - spin_unlock(&d->d_lock); - break; - } - spin_unlock(&inode->i_lock); - } else { - au_fset_si(au_sbi(inode->i_sb), FAILED_REFRESH_DIR); - d = d_find_any_alias(inode); - if (!d) { - au_iigen_dec(inode); - goto out; - } - - spin_lock(&d->d_lock); - dname = &d->d_name; - if (dname->len == nlen && !memcmp(dname->name, name, nlen)) { - spin_unlock(&d->d_lock); - err = hn_gen_tree(d); - spin_lock(&d->d_lock); - } - spin_unlock(&d->d_lock); - dput(d); - } - -out: - AuTraceErr(err); - return err; -} - -static int hn_gen_by_name(struct dentry *dentry, const unsigned int isdir) -{ - int err; - - if (IS_ROOT(dentry)) { - pr_warn("branch root dir was changed\n"); - return 0; - } - - err = 0; - if (!isdir) { - au_digen_dec(dentry); - if (d_really_is_positive(dentry)) - au_iigen_dec(d_inode(dentry)); - } else { - au_fset_si(au_sbi(dentry->d_sb), FAILED_REFRESH_DIR); - if (d_really_is_positive(dentry)) - err = hn_gen_tree(dentry); - } - - AuTraceErr(err); - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* hnotify job flags */ -#define AuHnJob_XINO0 1 -#define AuHnJob_GEN (1 << 1) -#define AuHnJob_DIRENT (1 << 2) -#define AuHnJob_ISDIR (1 << 3) -#define AuHnJob_TRYXINO0 (1 << 4) -#define AuHnJob_MNTPNT (1 << 5) -#define au_ftest_hnjob(flags, name) ((flags) & AuHnJob_##name) -#define au_fset_hnjob(flags, name) \ - do { (flags) |= AuHnJob_##name; } while (0) -#define au_fclr_hnjob(flags, name) \ - do { (flags) &= ~AuHnJob_##name; } while (0) - -enum { - AuHn_CHILD, - AuHn_PARENT, - AuHnLast -}; - -struct au_hnotify_args { - struct inode *h_dir, *dir, *h_child_inode; - u32 mask; - unsigned int flags[AuHnLast]; - unsigned int h_child_nlen; - char h_child_name[]; -}; - -struct hn_job_args { - unsigned int flags; - struct inode *inode, *h_inode, *dir, *h_dir; - struct dentry *dentry; - char *h_name; - int h_nlen; -}; - -static int hn_job(struct hn_job_args *a) -{ - const unsigned int isdir = au_ftest_hnjob(a->flags, ISDIR); - int e; - - /* reset xino */ - if (au_ftest_hnjob(a->flags, XINO0) && a->inode) - hn_xino(a->inode, a->h_inode); /* ignore this error */ - - if (au_ftest_hnjob(a->flags, TRYXINO0) - && a->inode - && a->h_inode) { - mutex_lock_nested(&a->h_inode->i_mutex, AuLsc_I_CHILD); - if (!a->h_inode->i_nlink - && !(a->h_inode->i_state & I_LINKABLE)) - hn_xino(a->inode, a->h_inode); /* ignore this error */ - mutex_unlock(&a->h_inode->i_mutex); - } - - /* make the generation obsolete */ - if (au_ftest_hnjob(a->flags, GEN)) { - e = -1; - if (a->inode) - e = hn_gen_by_inode(a->h_name, a->h_nlen, a->inode, - isdir); - if (e && a->dentry) - hn_gen_by_name(a->dentry, isdir); - /* ignore this error */ - } - - /* make dir entries obsolete */ - if (au_ftest_hnjob(a->flags, DIRENT) && a->inode) { - struct au_vdir *vdir; - - vdir = au_ivdir(a->inode); - if (vdir) - vdir->vd_jiffy = 0; - /* IMustLock(a->inode); */ - /* a->inode->i_version++; */ - } - - /* can do nothing but warn */ - if (au_ftest_hnjob(a->flags, MNTPNT) - && a->dentry - && d_mountpoint(a->dentry)) - pr_warn("mount-point %pd is removed or renamed\n", a->dentry); - - return 0; -} - -/* ---------------------------------------------------------------------- */ - -static struct dentry *lookup_wlock_by_name(char *name, unsigned int nlen, - struct inode *dir) -{ - struct dentry *dentry, *d, *parent; - struct qstr *dname; - - parent = d_find_any_alias(dir); - if (!parent) - return NULL; - - dentry = NULL; - spin_lock(&parent->d_lock); - list_for_each_entry(d, &parent->d_subdirs, d_child) { - /* AuDbg("%pd\n", d); */ - spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); - dname = &d->d_name; - if (dname->len != nlen || memcmp(dname->name, name, nlen)) - goto cont_unlock; - if (au_di(d)) - au_digen_dec(d); - else - goto cont_unlock; - if (au_dcount(d) > 0) { - dentry = dget_dlock(d); - spin_unlock(&d->d_lock); - break; - } - -cont_unlock: - spin_unlock(&d->d_lock); - } - spin_unlock(&parent->d_lock); - dput(parent); - - if (dentry) - di_write_lock_child(dentry); - - return dentry; -} - -static struct inode *lookup_wlock_by_ino(struct super_block *sb, - aufs_bindex_t bindex, ino_t h_ino) -{ - struct inode *inode; - ino_t ino; - int err; - - inode = NULL; - err = au_xino_read(sb, bindex, h_ino, &ino); - if (!err && ino) - inode = ilookup(sb, ino); - if (!inode) - goto out; - - if (unlikely(inode->i_ino == AUFS_ROOT_INO)) { - pr_warn("wrong root branch\n"); - iput(inode); - inode = NULL; - goto out; - } - - ii_write_lock_child(inode); - -out: - return inode; -} - -static void au_hn_bh(void *_args) -{ - struct au_hnotify_args *a = _args; - struct super_block *sb; - aufs_bindex_t bindex, bend, bfound; - unsigned char xino, try_iput; - int err; - struct inode *inode; - ino_t h_ino; - struct hn_job_args args; - struct dentry *dentry; - struct au_sbinfo *sbinfo; - - AuDebugOn(!_args); - AuDebugOn(!a->h_dir); - AuDebugOn(!a->dir); - AuDebugOn(!a->mask); - AuDbg("mask 0x%x, i%lu, hi%lu, hci%lu\n", - a->mask, a->dir->i_ino, a->h_dir->i_ino, - a->h_child_inode ? a->h_child_inode->i_ino : 0); - - inode = NULL; - dentry = NULL; - /* - * do not lock a->dir->i_mutex here - * because of d_revalidate() may cause a deadlock. - */ - sb = a->dir->i_sb; - AuDebugOn(!sb); - sbinfo = au_sbi(sb); - AuDebugOn(!sbinfo); - si_write_lock(sb, AuLock_NOPLMW); - - ii_read_lock_parent(a->dir); - bfound = -1; - bend = au_ibend(a->dir); - for (bindex = au_ibstart(a->dir); bindex <= bend; bindex++) - if (au_h_iptr(a->dir, bindex) == a->h_dir) { - bfound = bindex; - break; - } - ii_read_unlock(a->dir); - if (unlikely(bfound < 0)) - goto out; - - xino = !!au_opt_test(au_mntflags(sb), XINO); - h_ino = 0; - if (a->h_child_inode) - h_ino = a->h_child_inode->i_ino; - - if (a->h_child_nlen - && (au_ftest_hnjob(a->flags[AuHn_CHILD], GEN) - || au_ftest_hnjob(a->flags[AuHn_CHILD], MNTPNT))) - dentry = lookup_wlock_by_name(a->h_child_name, a->h_child_nlen, - a->dir); - try_iput = 0; - if (dentry && d_really_is_positive(dentry)) - inode = d_inode(dentry); - if (xino && !inode && h_ino - && (au_ftest_hnjob(a->flags[AuHn_CHILD], XINO0) - || au_ftest_hnjob(a->flags[AuHn_CHILD], TRYXINO0) - || au_ftest_hnjob(a->flags[AuHn_CHILD], GEN))) { - inode = lookup_wlock_by_ino(sb, bfound, h_ino); - try_iput = 1; - } - - args.flags = a->flags[AuHn_CHILD]; - args.dentry = dentry; - args.inode = inode; - args.h_inode = a->h_child_inode; - args.dir = a->dir; - args.h_dir = a->h_dir; - args.h_name = a->h_child_name; - args.h_nlen = a->h_child_nlen; - err = hn_job(&args); - if (dentry) { - if (au_di(dentry)) - di_write_unlock(dentry); - dput(dentry); - } - if (inode && try_iput) { - ii_write_unlock(inode); - iput(inode); - } - - ii_write_lock_parent(a->dir); - args.flags = a->flags[AuHn_PARENT]; - args.dentry = NULL; - args.inode = a->dir; - args.h_inode = a->h_dir; - args.dir = NULL; - args.h_dir = NULL; - args.h_name = NULL; - args.h_nlen = 0; - err = hn_job(&args); - ii_write_unlock(a->dir); - -out: - iput(a->h_child_inode); - iput(a->h_dir); - iput(a->dir); - si_write_unlock(sb); - au_nwt_done(&sbinfo->si_nowait); - kfree(a); -} - -/* ---------------------------------------------------------------------- */ - -int au_hnotify(struct inode *h_dir, struct au_hnotify *hnotify, u32 mask, - struct qstr *h_child_qstr, struct inode *h_child_inode) -{ - int err, len; - unsigned int flags[AuHnLast], f; - unsigned char isdir, isroot, wh; - struct inode *dir; - struct au_hnotify_args *args; - char *p, *h_child_name; - - err = 0; - AuDebugOn(!hnotify || !hnotify->hn_aufs_inode); - dir = igrab(hnotify->hn_aufs_inode); - if (!dir) - goto out; - - isroot = (dir->i_ino == AUFS_ROOT_INO); - wh = 0; - h_child_name = (void *)h_child_qstr->name; - len = h_child_qstr->len; - if (h_child_name) { - if (len > AUFS_WH_PFX_LEN - && !memcmp(h_child_name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) { - h_child_name += AUFS_WH_PFX_LEN; - len -= AUFS_WH_PFX_LEN; - wh = 1; - } - } - - isdir = 0; - if (h_child_inode) - isdir = !!S_ISDIR(h_child_inode->i_mode); - flags[AuHn_PARENT] = AuHnJob_ISDIR; - flags[AuHn_CHILD] = 0; - if (isdir) - flags[AuHn_CHILD] = AuHnJob_ISDIR; - au_fset_hnjob(flags[AuHn_PARENT], DIRENT); - au_fset_hnjob(flags[AuHn_CHILD], GEN); - switch (mask & FS_EVENTS_POSS_ON_CHILD) { - case FS_MOVED_FROM: - case FS_MOVED_TO: - au_fset_hnjob(flags[AuHn_CHILD], XINO0); - au_fset_hnjob(flags[AuHn_CHILD], MNTPNT); - /*FALLTHROUGH*/ - case FS_CREATE: - AuDebugOn(!h_child_name); - break; - - case FS_DELETE: - /* - * aufs never be able to get this child inode. - * revalidation should be in d_revalidate() - * by checking i_nlink, i_generation or d_unhashed(). - */ - AuDebugOn(!h_child_name); - au_fset_hnjob(flags[AuHn_CHILD], TRYXINO0); - au_fset_hnjob(flags[AuHn_CHILD], MNTPNT); - break; - - default: - AuDebugOn(1); - } - - if (wh) - h_child_inode = NULL; - - err = -ENOMEM; - /* iput() and kfree() will be called in au_hnotify() */ - args = kmalloc(sizeof(*args) + len + 1, GFP_NOFS); - if (unlikely(!args)) { - AuErr1("no memory\n"); - iput(dir); - goto out; - } - args->flags[AuHn_PARENT] = flags[AuHn_PARENT]; - args->flags[AuHn_CHILD] = flags[AuHn_CHILD]; - args->mask = mask; - args->dir = dir; - args->h_dir = igrab(h_dir); - if (h_child_inode) - h_child_inode = igrab(h_child_inode); /* can be NULL */ - args->h_child_inode = h_child_inode; - args->h_child_nlen = len; - if (len) { - p = (void *)args; - p += sizeof(*args); - memcpy(p, h_child_name, len); - p[len] = 0; - } - - /* NFS fires the event for silly-renamed one from kworker */ - f = 0; - if (!dir->i_nlink - || (au_test_nfs(h_dir->i_sb) && (mask & FS_DELETE))) - f = AuWkq_NEST; - err = au_wkq_nowait(au_hn_bh, args, dir->i_sb, f); - if (unlikely(err)) { - pr_err("wkq %d\n", err); - iput(args->h_child_inode); - iput(args->h_dir); - iput(args->dir); - kfree(args); - } - -out: - return err; -} - -/* ---------------------------------------------------------------------- */ - -int au_hnotify_reset_br(unsigned int udba, struct au_branch *br, int perm) -{ - int err; - - AuDebugOn(!(udba & AuOptMask_UDBA)); - - err = 0; - if (au_hnotify_op.reset_br) - err = au_hnotify_op.reset_br(udba, br, perm); - - return err; -} - -int au_hnotify_init_br(struct au_branch *br, int perm) -{ - int err; - - err = 0; - if (au_hnotify_op.init_br) - err = au_hnotify_op.init_br(br, perm); - - return err; -} - -void au_hnotify_fin_br(struct au_branch *br) -{ - if (au_hnotify_op.fin_br) - au_hnotify_op.fin_br(br); -} - -static void au_hn_destroy_cache(void) -{ - kmem_cache_destroy(au_cachep[AuCache_HNOTIFY]); - au_cachep[AuCache_HNOTIFY] = NULL; -} - -int __init au_hnotify_init(void) -{ - int err; - - err = -ENOMEM; - au_cachep[AuCache_HNOTIFY] = AuCache(au_hnotify); - if (au_cachep[AuCache_HNOTIFY]) { - err = 0; - if (au_hnotify_op.init) - err = au_hnotify_op.init(); - if (unlikely(err)) - au_hn_destroy_cache(); - } - AuTraceErr(err); - return err; -} - -void au_hnotify_fin(void) -{ - if (au_hnotify_op.fin) - au_hnotify_op.fin(); - /* cf. au_cache_fin() */ - if (au_cachep[AuCache_HNOTIFY]) - au_hn_destroy_cache(); -} diff --git a/fs/aufs/i_op.c b/fs/aufs/i_op.c deleted file mode 100644 index 17364e4f1..000000000 --- a/fs/aufs/i_op.c +++ /dev/null @@ -1,1447 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * inode operations (except add/del/rename) - */ - -#include <linux/device_cgroup.h> -#include <linux/fs_stack.h> -#include <linux/mm.h> -#include <linux/namei.h> -#include <linux/security.h> -#include "aufs.h" - -static int h_permission(struct inode *h_inode, int mask, - struct vfsmount *h_mnt, int brperm) -{ - int err; - const unsigned char write_mask = !!(mask & (MAY_WRITE | MAY_APPEND)); - - err = -EACCES; - if ((write_mask && IS_IMMUTABLE(h_inode)) - || ((mask & MAY_EXEC) - && S_ISREG(h_inode->i_mode) - && ((h_mnt->mnt_flags & MNT_NOEXEC) - || !(h_inode->i_mode & S_IXUGO)))) - goto out; - - /* - * - skip the lower fs test in the case of write to ro branch. - * - nfs dir permission write check is optimized, but a policy for - * link/rename requires a real check. - * - nfs always sets MS_POSIXACL regardless its mount option 'noacl.' - * in this case, generic_permission() returns -EOPNOTSUPP. - */ - if ((write_mask && !au_br_writable(brperm)) - || (au_test_nfs(h_inode->i_sb) && S_ISDIR(h_inode->i_mode) - && write_mask && !(mask & MAY_READ)) - || !h_inode->i_op->permission) { - /* AuLabel(generic_permission); */ - /* AuDbg("get_acl %pf\n", h_inode->i_op->get_acl); */ - err = generic_permission(h_inode, mask); - if (err == -EOPNOTSUPP && au_test_nfs_noacl(h_inode)) - err = h_inode->i_op->permission(h_inode, mask); - AuTraceErr(err); - } else { - /* AuLabel(h_inode->permission); */ - err = h_inode->i_op->permission(h_inode, mask); - AuTraceErr(err); - } - - if (!err) - err = devcgroup_inode_permission(h_inode, mask); - if (!err) - err = security_inode_permission(h_inode, mask); - -#if 0 - if (!err) { - /* todo: do we need to call ima_path_check()? */ - struct path h_path = { - .dentry = - .mnt = h_mnt - }; - err = ima_path_check(&h_path, - mask & (MAY_READ | MAY_WRITE | MAY_EXEC), - IMA_COUNT_LEAVE); - } -#endif - -out: - return err; -} - -static int aufs_permission(struct inode *inode, int mask) -{ - int err; - aufs_bindex_t bindex, bend; - const unsigned char isdir = !!S_ISDIR(inode->i_mode), - write_mask = !!(mask & (MAY_WRITE | MAY_APPEND)); - struct inode *h_inode; - struct super_block *sb; - struct au_branch *br; - - /* todo: support rcu-walk? */ - if (mask & MAY_NOT_BLOCK) - return -ECHILD; - - sb = inode->i_sb; - si_read_lock(sb, AuLock_FLUSH); - ii_read_lock_child(inode); -#if 0 - err = au_iigen_test(inode, au_sigen(sb)); - if (unlikely(err)) - goto out; -#endif - - if (!isdir - || write_mask - || au_opt_test(au_mntflags(sb), DIRPERM1)) { - err = au_busy_or_stale(); - h_inode = au_h_iptr(inode, au_ibstart(inode)); - if (unlikely(!h_inode - || (h_inode->i_mode & S_IFMT) - != (inode->i_mode & S_IFMT))) - goto out; - - err = 0; - bindex = au_ibstart(inode); - br = au_sbr(sb, bindex); - err = h_permission(h_inode, mask, au_br_mnt(br), br->br_perm); - if (write_mask - && !err - && !special_file(h_inode->i_mode)) { - /* test whether the upper writable branch exists */ - err = -EROFS; - for (; bindex >= 0; bindex--) - if (!au_br_rdonly(au_sbr(sb, bindex))) { - err = 0; - break; - } - } - goto out; - } - - /* non-write to dir */ - err = 0; - bend = au_ibend(inode); - for (bindex = au_ibstart(inode); !err && bindex <= bend; bindex++) { - h_inode = au_h_iptr(inode, bindex); - if (h_inode) { - err = au_busy_or_stale(); - if (unlikely(!S_ISDIR(h_inode->i_mode))) - break; - - br = au_sbr(sb, bindex); - err = h_permission(h_inode, mask, au_br_mnt(br), - br->br_perm); - } - } - -out: - ii_read_unlock(inode); - si_read_unlock(sb); - return err; -} - -/* ---------------------------------------------------------------------- */ - -static struct dentry *aufs_lookup(struct inode *dir, struct dentry *dentry, - unsigned int flags) -{ - struct dentry *ret, *parent; - struct inode *inode; - struct super_block *sb; - int err, npositive; - - IMustLock(dir); - - /* todo: support rcu-walk? */ - ret = ERR_PTR(-ECHILD); - if (flags & LOOKUP_RCU) - goto out; - - ret = ERR_PTR(-ENAMETOOLONG); - if (unlikely(dentry->d_name.len > AUFS_MAX_NAMELEN)) - goto out; - - sb = dir->i_sb; - err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM); - ret = ERR_PTR(err); - if (unlikely(err)) - goto out; - - err = au_di_init(dentry); - ret = ERR_PTR(err); - if (unlikely(err)) - goto out_si; - - inode = NULL; - npositive = 0; /* suppress a warning */ - parent = dentry->d_parent; /* dir inode is locked */ - di_read_lock_parent(parent, AuLock_IR); - err = au_alive_dir(parent); - if (!err) - err = au_digen_test(parent, au_sigen(sb)); - if (!err) { - npositive = au_lkup_dentry(dentry, au_dbstart(parent), - /*type*/0); - err = npositive; - } - di_read_unlock(parent, AuLock_IR); - ret = ERR_PTR(err); - if (unlikely(err < 0)) - goto out_unlock; - - if (npositive) { - inode = au_new_inode(dentry, /*must_new*/0); - if (IS_ERR(inode)) { - ret = (void *)inode; - inode = NULL; - goto out_unlock; - } - } - - if (inode) - atomic_inc(&inode->i_count); - ret = d_splice_alias(inode, dentry); -#if 0 - if (unlikely(d_need_lookup(dentry))) { - spin_lock(&dentry->d_lock); - dentry->d_flags &= ~DCACHE_NEED_LOOKUP; - spin_unlock(&dentry->d_lock); - } else -#endif - if (inode) { - if (!IS_ERR(ret)) { - iput(inode); - if (ret && ret != dentry) - ii_write_unlock(inode); - } else { - ii_write_unlock(inode); - iput(inode); - inode = NULL; - } - } - -out_unlock: - di_write_unlock(dentry); - if (inode) { - /* verbose coding for lock class name */ - if (unlikely(S_ISLNK(inode->i_mode))) - au_rw_class(&au_di(dentry)->di_rwsem, - au_lc_key + AuLcSymlink_DIINFO); - else if (unlikely(S_ISDIR(inode->i_mode))) - au_rw_class(&au_di(dentry)->di_rwsem, - au_lc_key + AuLcDir_DIINFO); - else /* likely */ - au_rw_class(&au_di(dentry)->di_rwsem, - au_lc_key + AuLcNonDir_DIINFO); - } -out_si: - si_read_unlock(sb); -out: - return ret; -} - -/* ---------------------------------------------------------------------- */ - -struct aopen_node { - struct hlist_node hlist; - struct file *file, *h_file; -}; - -static int au_do_aopen(struct inode *inode, struct file *file) -{ - struct au_sphlhead *aopen; - struct aopen_node *node; - struct au_do_open_args args = { - .no_lock = 1, - .open = au_do_open_nondir - }; - - aopen = &au_sbi(inode->i_sb)->si_aopen; - spin_lock(&aopen->spin); - hlist_for_each_entry(node, &aopen->head, hlist) - if (node->file == file) { - args.h_file = node->h_file; - break; - } - spin_unlock(&aopen->spin); - /* AuDebugOn(!args.h_file); */ - - return au_do_open(file, &args); -} - -static int aufs_atomic_open(struct inode *dir, struct dentry *dentry, - struct file *file, unsigned int open_flag, - umode_t create_mode, int *opened) -{ - int err, h_opened = *opened; - struct dentry *parent; - struct dentry *d; - struct au_sphlhead *aopen; - struct vfsub_aopen_args args = { - .open_flag = open_flag, - .create_mode = create_mode, - .opened = &h_opened - }; - struct aopen_node aopen_node = { - .file = file - }; - - IMustLock(dir); - AuDbg("open_flag 0x%x\n", open_flag); - AuDbgDentry(dentry); - - err = 0; - if (!au_di(dentry)) { - d = aufs_lookup(dir, dentry, /*flags*/0); - if (IS_ERR(d)) { - err = PTR_ERR(d); - goto out; - } else if (d) { - /* - * obsoleted dentry found. - * another error will be returned later. - */ - d_drop(d); - dput(d); - AuDbgDentry(d); - } - AuDbgDentry(dentry); - } - - if (d_is_positive(dentry) - || d_unhashed(dentry) - || d_unlinked(dentry) - || !(open_flag & O_CREAT)) - goto out_no_open; - - err = aufs_read_lock(dentry, AuLock_DW | AuLock_FLUSH | AuLock_GEN); - if (unlikely(err)) - goto out; - - parent = dentry->d_parent; /* dir is locked */ - di_write_lock_parent(parent); - err = au_lkup_dentry(dentry, /*bstart*/0, /*type*/0); - if (unlikely(err)) - goto out_unlock; - - AuDbgDentry(dentry); - if (d_is_positive(dentry)) - goto out_unlock; - - args.file = get_empty_filp(); - err = PTR_ERR(args.file); - if (IS_ERR(args.file)) - goto out_unlock; - - args.file->f_flags = file->f_flags; - err = au_aopen_or_create(dir, dentry, &args); - AuTraceErr(err); - AuDbgFile(args.file); - if (unlikely(err < 0)) { - if (h_opened & FILE_OPENED) - fput(args.file); - else - put_filp(args.file); - goto out_unlock; - } - - /* some filesystems don't set FILE_CREATED while succeeded? */ - *opened |= FILE_CREATED; - if (h_opened & FILE_OPENED) - aopen_node.h_file = args.file; - else { - put_filp(args.file); - args.file = NULL; - } - aopen = &au_sbi(dir->i_sb)->si_aopen; - au_sphl_add(&aopen_node.hlist, aopen); - err = finish_open(file, dentry, au_do_aopen, opened); - au_sphl_del(&aopen_node.hlist, aopen); - AuTraceErr(err); - AuDbgFile(file); - if (aopen_node.h_file) - fput(aopen_node.h_file); - -out_unlock: - di_write_unlock(parent); - aufs_read_unlock(dentry, AuLock_DW); - AuDbgDentry(dentry); - if (unlikely(err)) - goto out; -out_no_open: - if (!err && !(*opened & FILE_CREATED)) { - AuLabel(out_no_open); - dget(dentry); - err = finish_no_open(file, dentry); - } -out: - AuDbg("%pd%s%s\n", dentry, - (*opened & FILE_CREATED) ? " created" : "", - (*opened & FILE_OPENED) ? " opened" : ""); - AuTraceErr(err); - return err; -} - - -/* ---------------------------------------------------------------------- */ - -static int au_wr_dir_cpup(struct dentry *dentry, struct dentry *parent, - const unsigned char add_entry, aufs_bindex_t bcpup, - aufs_bindex_t bstart) -{ - int err; - struct dentry *h_parent; - struct inode *h_dir; - - if (add_entry) - IMustLock(d_inode(parent)); - else - di_write_lock_parent(parent); - - err = 0; - if (!au_h_dptr(parent, bcpup)) { - if (bstart > bcpup) - err = au_cpup_dirs(dentry, bcpup); - else if (bstart < bcpup) - err = au_cpdown_dirs(dentry, bcpup); - else - BUG(); - } - if (!err && add_entry && !au_ftest_wrdir(add_entry, TMPFILE)) { - h_parent = au_h_dptr(parent, bcpup); - h_dir = d_inode(h_parent); - mutex_lock_nested(&h_dir->i_mutex, AuLsc_I_PARENT); - err = au_lkup_neg(dentry, bcpup, /*wh*/0); - /* todo: no unlock here */ - mutex_unlock(&h_dir->i_mutex); - - AuDbg("bcpup %d\n", bcpup); - if (!err) { - if (d_really_is_negative(dentry)) - au_set_h_dptr(dentry, bstart, NULL); - au_update_dbrange(dentry, /*do_put_zero*/0); - } - } - - if (!add_entry) - di_write_unlock(parent); - if (!err) - err = bcpup; /* success */ - - AuTraceErr(err); - return err; -} - -/* - * decide the branch and the parent dir where we will create a new entry. - * returns new bindex or an error. - * copyup the parent dir if needed. - */ -int au_wr_dir(struct dentry *dentry, struct dentry *src_dentry, - struct au_wr_dir_args *args) -{ - int err; - unsigned int flags; - aufs_bindex_t bcpup, bstart, src_bstart; - const unsigned char add_entry - = au_ftest_wrdir(args->flags, ADD_ENTRY) - | au_ftest_wrdir(args->flags, TMPFILE); - struct super_block *sb; - struct dentry *parent; - struct au_sbinfo *sbinfo; - - sb = dentry->d_sb; - sbinfo = au_sbi(sb); - parent = dget_parent(dentry); - bstart = au_dbstart(dentry); - bcpup = bstart; - if (args->force_btgt < 0) { - if (src_dentry) { - src_bstart = au_dbstart(src_dentry); - if (src_bstart < bstart) - bcpup = src_bstart; - } else if (add_entry) { - flags = 0; - if (au_ftest_wrdir(args->flags, ISDIR)) - au_fset_wbr(flags, DIR); - err = AuWbrCreate(sbinfo, dentry, flags); - bcpup = err; - } - - if (bcpup < 0 || au_test_ro(sb, bcpup, d_inode(dentry))) { - if (add_entry) - err = AuWbrCopyup(sbinfo, dentry); - else { - if (!IS_ROOT(dentry)) { - di_read_lock_parent(parent, !AuLock_IR); - err = AuWbrCopyup(sbinfo, dentry); - di_read_unlock(parent, !AuLock_IR); - } else - err = AuWbrCopyup(sbinfo, dentry); - } - bcpup = err; - if (unlikely(err < 0)) - goto out; - } - } else { - bcpup = args->force_btgt; - AuDebugOn(au_test_ro(sb, bcpup, d_inode(dentry))); - } - - AuDbg("bstart %d, bcpup %d\n", bstart, bcpup); - err = bcpup; - if (bcpup == bstart) - goto out; /* success */ - - /* copyup the new parent into the branch we process */ - err = au_wr_dir_cpup(dentry, parent, add_entry, bcpup, bstart); - if (err >= 0) { - if (d_really_is_negative(dentry)) { - au_set_h_dptr(dentry, bstart, NULL); - au_set_dbstart(dentry, bcpup); - au_set_dbend(dentry, bcpup); - } - AuDebugOn(add_entry - && !au_ftest_wrdir(args->flags, TMPFILE) - && !au_h_dptr(dentry, bcpup)); - } - -out: - dput(parent); - return err; -} - -/* ---------------------------------------------------------------------- */ - -void au_pin_hdir_unlock(struct au_pin *p) -{ - if (p->hdir) - au_hn_imtx_unlock(p->hdir); -} - -int au_pin_hdir_lock(struct au_pin *p) -{ - int err; - - err = 0; - if (!p->hdir) - goto out; - - /* even if an error happens later, keep this lock */ - au_hn_imtx_lock_nested(p->hdir, p->lsc_hi); - - err = -EBUSY; - if (unlikely(p->hdir->hi_inode != d_inode(p->h_parent))) - goto out; - - err = 0; - if (p->h_dentry) - err = au_h_verify(p->h_dentry, p->udba, p->hdir->hi_inode, - p->h_parent, p->br); - -out: - return err; -} - -int au_pin_hdir_relock(struct au_pin *p) -{ - int err, i; - struct inode *h_i; - struct dentry *h_d[] = { - p->h_dentry, - p->h_parent - }; - - err = au_pin_hdir_lock(p); - if (unlikely(err)) - goto out; - - for (i = 0; !err && i < sizeof(h_d)/sizeof(*h_d); i++) { - if (!h_d[i]) - continue; - if (d_is_positive(h_d[i])) { - h_i = d_inode(h_d[i]); - err = !h_i->i_nlink; - } - } - -out: - return err; -} - -void au_pin_hdir_set_owner(struct au_pin *p, struct task_struct *task) -{ -#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP) - p->hdir->hi_inode->i_mutex.owner = task; -#endif -} - -void au_pin_hdir_acquire_nest(struct au_pin *p) -{ - if (p->hdir) { - mutex_acquire_nest(&p->hdir->hi_inode->i_mutex.dep_map, - p->lsc_hi, 0, NULL, _RET_IP_); - au_pin_hdir_set_owner(p, current); - } -} - -void au_pin_hdir_release(struct au_pin *p) -{ - if (p->hdir) { - au_pin_hdir_set_owner(p, p->task); - mutex_release(&p->hdir->hi_inode->i_mutex.dep_map, 1, _RET_IP_); - } -} - -struct dentry *au_pinned_h_parent(struct au_pin *pin) -{ - if (pin && pin->parent) - return au_h_dptr(pin->parent, pin->bindex); - return NULL; -} - -void au_unpin(struct au_pin *p) -{ - if (p->hdir) - au_pin_hdir_unlock(p); - if (p->h_mnt && au_ftest_pin(p->flags, MNT_WRITE)) - vfsub_mnt_drop_write(p->h_mnt); - if (!p->hdir) - return; - - if (!au_ftest_pin(p->flags, DI_LOCKED)) - di_read_unlock(p->parent, AuLock_IR); - iput(p->hdir->hi_inode); - dput(p->parent); - p->parent = NULL; - p->hdir = NULL; - p->h_mnt = NULL; - /* do not clear p->task */ -} - -int au_do_pin(struct au_pin *p) -{ - int err; - struct super_block *sb; - struct inode *h_dir; - - err = 0; - sb = p->dentry->d_sb; - p->br = au_sbr(sb, p->bindex); - if (IS_ROOT(p->dentry)) { - if (au_ftest_pin(p->flags, MNT_WRITE)) { - p->h_mnt = au_br_mnt(p->br); - err = vfsub_mnt_want_write(p->h_mnt); - if (unlikely(err)) { - au_fclr_pin(p->flags, MNT_WRITE); - goto out_err; - } - } - goto out; - } - - p->h_dentry = NULL; - if (p->bindex <= au_dbend(p->dentry)) - p->h_dentry = au_h_dptr(p->dentry, p->bindex); - - p->parent = dget_parent(p->dentry); - if (!au_ftest_pin(p->flags, DI_LOCKED)) - di_read_lock(p->parent, AuLock_IR, p->lsc_di); - - h_dir = NULL; - p->h_parent = au_h_dptr(p->parent, p->bindex); - p->hdir = au_hi(d_inode(p->parent), p->bindex); - if (p->hdir) - h_dir = p->hdir->hi_inode; - - /* - * udba case, or - * if DI_LOCKED is not set, then p->parent may be different - * and h_parent can be NULL. - */ - if (unlikely(!p->hdir || !h_dir || !p->h_parent)) { - err = -EBUSY; - if (!au_ftest_pin(p->flags, DI_LOCKED)) - di_read_unlock(p->parent, AuLock_IR); - dput(p->parent); - p->parent = NULL; - goto out_err; - } - - if (au_ftest_pin(p->flags, MNT_WRITE)) { - p->h_mnt = au_br_mnt(p->br); - err = vfsub_mnt_want_write(p->h_mnt); - if (unlikely(err)) { - au_fclr_pin(p->flags, MNT_WRITE); - if (!au_ftest_pin(p->flags, DI_LOCKED)) - di_read_unlock(p->parent, AuLock_IR); - dput(p->parent); - p->parent = NULL; - goto out_err; - } - } - - au_igrab(h_dir); - err = au_pin_hdir_lock(p); - if (!err) - goto out; /* success */ - - au_unpin(p); - -out_err: - pr_err("err %d\n", err); - err = au_busy_or_stale(); -out: - return err; -} - -void au_pin_init(struct au_pin *p, struct dentry *dentry, - aufs_bindex_t bindex, int lsc_di, int lsc_hi, - unsigned int udba, unsigned char flags) -{ - p->dentry = dentry; - p->udba = udba; - p->lsc_di = lsc_di; - p->lsc_hi = lsc_hi; - p->flags = flags; - p->bindex = bindex; - - p->parent = NULL; - p->hdir = NULL; - p->h_mnt = NULL; - - p->h_dentry = NULL; - p->h_parent = NULL; - p->br = NULL; - p->task = current; -} - -int au_pin(struct au_pin *pin, struct dentry *dentry, aufs_bindex_t bindex, - unsigned int udba, unsigned char flags) -{ - au_pin_init(pin, dentry, bindex, AuLsc_DI_PARENT, AuLsc_I_PARENT2, - udba, flags); - return au_do_pin(pin); -} - -/* ---------------------------------------------------------------------- */ - -/* - * ->setattr() and ->getattr() are called in various cases. - * chmod, stat: dentry is revalidated. - * fchmod, fstat: file and dentry are not revalidated, additionally they may be - * unhashed. - * for ->setattr(), ia->ia_file is passed from ftruncate only. - */ -/* todo: consolidate with do_refresh() and simple_reval_dpath() */ -int au_reval_for_attr(struct dentry *dentry, unsigned int sigen) -{ - int err; - struct dentry *parent; - - err = 0; - if (au_digen_test(dentry, sigen)) { - parent = dget_parent(dentry); - di_read_lock_parent(parent, AuLock_IR); - err = au_refresh_dentry(dentry, parent); - di_read_unlock(parent, AuLock_IR); - dput(parent); - } - - AuTraceErr(err); - return err; -} - -int au_pin_and_icpup(struct dentry *dentry, struct iattr *ia, - struct au_icpup_args *a) -{ - int err; - loff_t sz; - aufs_bindex_t bstart, ibstart; - struct dentry *hi_wh, *parent; - struct inode *inode; - struct au_wr_dir_args wr_dir_args = { - .force_btgt = -1, - .flags = 0 - }; - - if (d_is_dir(dentry)) - au_fset_wrdir(wr_dir_args.flags, ISDIR); - /* plink or hi_wh() case */ - bstart = au_dbstart(dentry); - inode = d_inode(dentry); - ibstart = au_ibstart(inode); - if (bstart != ibstart && !au_test_ro(inode->i_sb, ibstart, inode)) - wr_dir_args.force_btgt = ibstart; - err = au_wr_dir(dentry, /*src_dentry*/NULL, &wr_dir_args); - if (unlikely(err < 0)) - goto out; - a->btgt = err; - if (err != bstart) - au_fset_icpup(a->flags, DID_CPUP); - - err = 0; - a->pin_flags = AuPin_MNT_WRITE; - parent = NULL; - if (!IS_ROOT(dentry)) { - au_fset_pin(a->pin_flags, DI_LOCKED); - parent = dget_parent(dentry); - di_write_lock_parent(parent); - } - - err = au_pin(&a->pin, dentry, a->btgt, a->udba, a->pin_flags); - if (unlikely(err)) - goto out_parent; - - a->h_path.dentry = au_h_dptr(dentry, bstart); - sz = -1; - a->h_inode = d_inode(a->h_path.dentry); - if (ia && (ia->ia_valid & ATTR_SIZE)) { - mutex_lock_nested(&a->h_inode->i_mutex, AuLsc_I_CHILD); - if (ia->ia_size < i_size_read(a->h_inode)) - sz = ia->ia_size; - mutex_unlock(&a->h_inode->i_mutex); - } - - hi_wh = NULL; - if (au_ftest_icpup(a->flags, DID_CPUP) && d_unlinked(dentry)) { - hi_wh = au_hi_wh(inode, a->btgt); - if (!hi_wh) { - struct au_cp_generic cpg = { - .dentry = dentry, - .bdst = a->btgt, - .bsrc = -1, - .len = sz, - .pin = &a->pin - }; - err = au_sio_cpup_wh(&cpg, /*file*/NULL); - if (unlikely(err)) - goto out_unlock; - hi_wh = au_hi_wh(inode, a->btgt); - /* todo: revalidate hi_wh? */ - } - } - - if (parent) { - au_pin_set_parent_lflag(&a->pin, /*lflag*/0); - di_downgrade_lock(parent, AuLock_IR); - dput(parent); - parent = NULL; - } - if (!au_ftest_icpup(a->flags, DID_CPUP)) - goto out; /* success */ - - if (!d_unhashed(dentry)) { - struct au_cp_generic cpg = { - .dentry = dentry, - .bdst = a->btgt, - .bsrc = bstart, - .len = sz, - .pin = &a->pin, - .flags = AuCpup_DTIME | AuCpup_HOPEN - }; - err = au_sio_cpup_simple(&cpg); - if (!err) - a->h_path.dentry = au_h_dptr(dentry, a->btgt); - } else if (!hi_wh) - a->h_path.dentry = au_h_dptr(dentry, a->btgt); - else - a->h_path.dentry = hi_wh; /* do not dget here */ - -out_unlock: - a->h_inode = d_inode(a->h_path.dentry); - if (!err) - goto out; /* success */ - au_unpin(&a->pin); -out_parent: - if (parent) { - di_write_unlock(parent); - dput(parent); - } -out: - if (!err) - mutex_lock_nested(&a->h_inode->i_mutex, AuLsc_I_CHILD); - return err; -} - -static int aufs_setattr(struct dentry *dentry, struct iattr *ia) -{ - int err; - struct inode *inode, *delegated; - struct super_block *sb; - struct file *file; - struct au_icpup_args *a; - - inode = d_inode(dentry); - IMustLock(inode); - - err = -ENOMEM; - a = kzalloc(sizeof(*a), GFP_NOFS); - if (unlikely(!a)) - goto out; - - if (ia->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) - ia->ia_valid &= ~ATTR_MODE; - - file = NULL; - sb = dentry->d_sb; - err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM); - if (unlikely(err)) - goto out_kfree; - - if (ia->ia_valid & ATTR_FILE) { - /* currently ftruncate(2) only */ - AuDebugOn(!d_is_reg(dentry)); - file = ia->ia_file; - err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1); - if (unlikely(err)) - goto out_si; - ia->ia_file = au_hf_top(file); - a->udba = AuOpt_UDBA_NONE; - } else { - /* fchmod() doesn't pass ia_file */ - a->udba = au_opt_udba(sb); - di_write_lock_child(dentry); - /* no d_unlinked(), to set UDBA_NONE for root */ - if (d_unhashed(dentry)) - a->udba = AuOpt_UDBA_NONE; - if (a->udba != AuOpt_UDBA_NONE) { - AuDebugOn(IS_ROOT(dentry)); - err = au_reval_for_attr(dentry, au_sigen(sb)); - if (unlikely(err)) - goto out_dentry; - } - } - - err = au_pin_and_icpup(dentry, ia, a); - if (unlikely(err < 0)) - goto out_dentry; - if (au_ftest_icpup(a->flags, DID_CPUP)) { - ia->ia_file = NULL; - ia->ia_valid &= ~ATTR_FILE; - } - - a->h_path.mnt = au_sbr_mnt(sb, a->btgt); - if ((ia->ia_valid & (ATTR_MODE | ATTR_CTIME)) - == (ATTR_MODE | ATTR_CTIME)) { - err = security_path_chmod(&a->h_path, ia->ia_mode); - if (unlikely(err)) - goto out_unlock; - } else if ((ia->ia_valid & (ATTR_UID | ATTR_GID)) - && (ia->ia_valid & ATTR_CTIME)) { - err = security_path_chown(&a->h_path, ia->ia_uid, ia->ia_gid); - if (unlikely(err)) - goto out_unlock; - } - - if (ia->ia_valid & ATTR_SIZE) { - struct file *f; - - if (ia->ia_size < i_size_read(inode)) - /* unmap only */ - truncate_setsize(inode, ia->ia_size); - - f = NULL; - if (ia->ia_valid & ATTR_FILE) - f = ia->ia_file; - mutex_unlock(&a->h_inode->i_mutex); - err = vfsub_trunc(&a->h_path, ia->ia_size, ia->ia_valid, f); - mutex_lock_nested(&a->h_inode->i_mutex, AuLsc_I_CHILD); - } else { - delegated = NULL; - while (1) { - err = vfsub_notify_change(&a->h_path, ia, &delegated); - if (delegated) { - err = break_deleg_wait(&delegated); - if (!err) - continue; - } - break; - } - } - if (!err) - au_cpup_attr_changeable(inode); - -out_unlock: - mutex_unlock(&a->h_inode->i_mutex); - au_unpin(&a->pin); - if (unlikely(err)) - au_update_dbstart(dentry); -out_dentry: - di_write_unlock(dentry); - if (file) { - fi_write_unlock(file); - ia->ia_file = file; - ia->ia_valid |= ATTR_FILE; - } -out_si: - si_read_unlock(sb); -out_kfree: - kfree(a); -out: - AuTraceErr(err); - return err; -} - -#if IS_ENABLED(CONFIG_AUFS_XATTR) || IS_ENABLED(CONFIG_FS_POSIX_ACL) -static int au_h_path_to_set_attr(struct dentry *dentry, - struct au_icpup_args *a, struct path *h_path) -{ - int err; - struct super_block *sb; - - sb = dentry->d_sb; - a->udba = au_opt_udba(sb); - /* no d_unlinked(), to set UDBA_NONE for root */ - if (d_unhashed(dentry)) - a->udba = AuOpt_UDBA_NONE; - if (a->udba != AuOpt_UDBA_NONE) { - AuDebugOn(IS_ROOT(dentry)); - err = au_reval_for_attr(dentry, au_sigen(sb)); - if (unlikely(err)) - goto out; - } - err = au_pin_and_icpup(dentry, /*ia*/NULL, a); - if (unlikely(err < 0)) - goto out; - - h_path->dentry = a->h_path.dentry; - h_path->mnt = au_sbr_mnt(sb, a->btgt); - -out: - return err; -} - -ssize_t au_srxattr(struct dentry *dentry, struct au_srxattr *arg) -{ - int err; - struct path h_path; - struct super_block *sb; - struct au_icpup_args *a; - struct inode *inode, *h_inode; - - inode = d_inode(dentry); - IMustLock(inode); - - err = -ENOMEM; - a = kzalloc(sizeof(*a), GFP_NOFS); - if (unlikely(!a)) - goto out; - - sb = dentry->d_sb; - err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM); - if (unlikely(err)) - goto out_kfree; - - h_path.dentry = NULL; /* silence gcc */ - di_write_lock_child(dentry); - err = au_h_path_to_set_attr(dentry, a, &h_path); - if (unlikely(err)) - goto out_di; - - mutex_unlock(&a->h_inode->i_mutex); - switch (arg->type) { - case AU_XATTR_SET: - err = vfsub_setxattr(h_path.dentry, - arg->u.set.name, arg->u.set.value, - arg->u.set.size, arg->u.set.flags); - break; - case AU_XATTR_REMOVE: - err = vfsub_removexattr(h_path.dentry, arg->u.remove.name); - break; - case AU_ACL_SET: - err = -EOPNOTSUPP; - h_inode = d_inode(h_path.dentry); - if (h_inode->i_op->set_acl) - err = h_inode->i_op->set_acl(h_inode, - arg->u.acl_set.acl, - arg->u.acl_set.type); - break; - } - if (!err) - au_cpup_attr_timesizes(inode); - - au_unpin(&a->pin); - if (unlikely(err)) - au_update_dbstart(dentry); - -out_di: - di_write_unlock(dentry); - si_read_unlock(sb); -out_kfree: - kfree(a); -out: - AuTraceErr(err); - return err; -} -#endif - -static void au_refresh_iattr(struct inode *inode, struct kstat *st, - unsigned int nlink) -{ - unsigned int n; - - inode->i_mode = st->mode; - /* don't i_[ug]id_write() here */ - inode->i_uid = st->uid; - inode->i_gid = st->gid; - inode->i_atime = st->atime; - inode->i_mtime = st->mtime; - inode->i_ctime = st->ctime; - - au_cpup_attr_nlink(inode, /*force*/0); - if (S_ISDIR(inode->i_mode)) { - n = inode->i_nlink; - n -= nlink; - n += st->nlink; - smp_mb(); /* for i_nlink */ - /* 0 can happen */ - set_nlink(inode, n); - } - - spin_lock(&inode->i_lock); - inode->i_blocks = st->blocks; - i_size_write(inode, st->size); - spin_unlock(&inode->i_lock); -} - -/* - * common routine for aufs_getattr() and aufs_getxattr(). - * returns zero or negative (an error). - * @dentry will be read-locked in success. - */ -int au_h_path_getattr(struct dentry *dentry, int force, struct path *h_path) -{ - int err; - unsigned int mnt_flags, sigen; - unsigned char udba_none; - aufs_bindex_t bindex; - struct super_block *sb, *h_sb; - struct inode *inode; - - h_path->mnt = NULL; - h_path->dentry = NULL; - - err = 0; - sb = dentry->d_sb; - mnt_flags = au_mntflags(sb); - udba_none = !!au_opt_test(mnt_flags, UDBA_NONE); - - /* support fstat(2) */ - if (!d_unlinked(dentry) && !udba_none) { - sigen = au_sigen(sb); - err = au_digen_test(dentry, sigen); - if (!err) { - di_read_lock_child(dentry, AuLock_IR); - err = au_dbrange_test(dentry); - if (unlikely(err)) { - di_read_unlock(dentry, AuLock_IR); - goto out; - } - } else { - AuDebugOn(IS_ROOT(dentry)); - di_write_lock_child(dentry); - err = au_dbrange_test(dentry); - if (!err) - err = au_reval_for_attr(dentry, sigen); - if (!err) - di_downgrade_lock(dentry, AuLock_IR); - else { - di_write_unlock(dentry); - goto out; - } - } - } else - di_read_lock_child(dentry, AuLock_IR); - - inode = d_inode(dentry); - bindex = au_ibstart(inode); - h_path->mnt = au_sbr_mnt(sb, bindex); - h_sb = h_path->mnt->mnt_sb; - if (!force - && !au_test_fs_bad_iattr(h_sb) - && udba_none) - goto out; /* success */ - - if (au_dbstart(dentry) == bindex) - h_path->dentry = au_h_dptr(dentry, bindex); - else if (au_opt_test(mnt_flags, PLINK) && au_plink_test(inode)) { - h_path->dentry = au_plink_lkup(inode, bindex); - if (IS_ERR(h_path->dentry)) - /* pretending success */ - h_path->dentry = NULL; - else - dput(h_path->dentry); - } - -out: - return err; -} - -static int aufs_getattr(struct vfsmount *mnt __maybe_unused, - struct dentry *dentry, struct kstat *st) -{ - int err; - unsigned char positive; - struct path h_path; - struct inode *inode; - struct super_block *sb; - - inode = d_inode(dentry); - sb = dentry->d_sb; - err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM); - if (unlikely(err)) - goto out; - err = au_h_path_getattr(dentry, /*force*/0, &h_path); - if (unlikely(err)) - goto out_si; - if (unlikely(!h_path.dentry)) - /* illegally overlapped or something */ - goto out_fill; /* pretending success */ - - positive = d_is_positive(h_path.dentry); - if (positive) - err = vfs_getattr(&h_path, st); - if (!err) { - if (positive) - au_refresh_iattr(inode, st, - d_inode(h_path.dentry)->i_nlink); - goto out_fill; /* success */ - } - AuTraceErr(err); - goto out_di; - -out_fill: - generic_fillattr(inode, st); -out_di: - di_read_unlock(dentry, AuLock_IR); -out_si: - si_read_unlock(sb); -out: - AuTraceErr(err); - return err; -} - -/* ---------------------------------------------------------------------- */ - -static int h_readlink(struct dentry *dentry, int bindex, char __user *buf, - int bufsiz) -{ - int err; - struct super_block *sb; - struct dentry *h_dentry; - struct inode *inode, *h_inode; - - err = -EINVAL; - h_dentry = au_h_dptr(dentry, bindex); - h_inode = d_inode(h_dentry); - if (unlikely(!h_inode->i_op->readlink)) - goto out; - - err = security_inode_readlink(h_dentry); - if (unlikely(err)) - goto out; - - sb = dentry->d_sb; - inode = d_inode(dentry); - if (!au_test_ro(sb, bindex, inode)) { - vfsub_touch_atime(au_sbr_mnt(sb, bindex), h_dentry); - fsstack_copy_attr_atime(inode, h_inode); - } - err = h_inode->i_op->readlink(h_dentry, buf, bufsiz); - -out: - return err; -} - -static int aufs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) -{ - int err; - - err = aufs_read_lock(dentry, AuLock_IR | AuLock_GEN); - if (unlikely(err)) - goto out; - err = au_d_hashed_positive(dentry); - if (!err) - err = h_readlink(dentry, au_dbstart(dentry), buf, bufsiz); - aufs_read_unlock(dentry, AuLock_IR); - -out: - return err; -} - -static void *aufs_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - int err; - mm_segment_t old_fs; - union { - char *k; - char __user *u; - } buf; - - err = -ENOMEM; - buf.k = (void *)__get_free_page(GFP_NOFS); - if (unlikely(!buf.k)) - goto out; - - err = aufs_read_lock(dentry, AuLock_IR | AuLock_GEN); - if (unlikely(err)) - goto out_name; - - err = au_d_hashed_positive(dentry); - if (!err) { - old_fs = get_fs(); - set_fs(KERNEL_DS); - err = h_readlink(dentry, au_dbstart(dentry), buf.u, PATH_MAX); - set_fs(old_fs); - } - aufs_read_unlock(dentry, AuLock_IR); - - if (err >= 0) { - buf.k[err] = 0; - /* will be freed by put_link */ - nd_set_link(nd, buf.k); - return NULL; /* success */ - } - -out_name: - free_page((unsigned long)buf.k); -out: - AuTraceErr(err); - return ERR_PTR(err); -} - -static void aufs_put_link(struct dentry *dentry __maybe_unused, - struct nameidata *nd, void *cookie __maybe_unused) -{ - char *p; - - p = nd_get_link(nd); - if (!IS_ERR_OR_NULL(p)) - free_page((unsigned long)p); -} - -/* ---------------------------------------------------------------------- */ - -static int aufs_update_time(struct inode *inode, struct timespec *ts, int flags) -{ - int err; - struct super_block *sb; - struct inode *h_inode; - - sb = inode->i_sb; - /* mmap_sem might be acquired already, cf. aufs_mmap() */ - lockdep_off(); - si_read_lock(sb, AuLock_FLUSH); - ii_write_lock_child(inode); - lockdep_on(); - h_inode = au_h_iptr(inode, au_ibstart(inode)); - err = vfsub_update_time(h_inode, ts, flags); - lockdep_off(); - if (!err) - au_cpup_attr_timesizes(inode); - ii_write_unlock(inode); - si_read_unlock(sb); - lockdep_on(); - - if (!err && (flags & S_VERSION)) - inode_inc_iversion(inode); - - return err; -} - -/* ---------------------------------------------------------------------- */ - -struct inode_operations aufs_symlink_iop = { - .permission = aufs_permission, -#ifdef CONFIG_FS_POSIX_ACL - .get_acl = aufs_get_acl, - .set_acl = aufs_set_acl, /* unsupport for symlink? */ -#endif - - .setattr = aufs_setattr, - .getattr = aufs_getattr, - -#ifdef CONFIG_AUFS_XATTR - .setxattr = aufs_setxattr, - .getxattr = aufs_getxattr, - .listxattr = aufs_listxattr, - .removexattr = aufs_removexattr, -#endif - - .readlink = aufs_readlink, - .follow_link = aufs_follow_link, - .put_link = aufs_put_link, - - /* .update_time = aufs_update_time */ -}; - -struct inode_operations aufs_dir_iop = { - .create = aufs_create, - .lookup = aufs_lookup, - .link = aufs_link, - .unlink = aufs_unlink, - .symlink = aufs_symlink, - .mkdir = aufs_mkdir, - .rmdir = aufs_rmdir, - .mknod = aufs_mknod, - .rename = aufs_rename, - - .permission = aufs_permission, -#ifdef CONFIG_FS_POSIX_ACL - .get_acl = aufs_get_acl, - .set_acl = aufs_set_acl, -#endif - - .setattr = aufs_setattr, - .getattr = aufs_getattr, - -#ifdef CONFIG_AUFS_XATTR - .setxattr = aufs_setxattr, - .getxattr = aufs_getxattr, - .listxattr = aufs_listxattr, - .removexattr = aufs_removexattr, -#endif - - .update_time = aufs_update_time, - .atomic_open = aufs_atomic_open, - .tmpfile = aufs_tmpfile -}; - -struct inode_operations aufs_iop = { - .permission = aufs_permission, -#ifdef CONFIG_FS_POSIX_ACL - .get_acl = aufs_get_acl, - .set_acl = aufs_set_acl, -#endif - - .setattr = aufs_setattr, - .getattr = aufs_getattr, - -#ifdef CONFIG_AUFS_XATTR - .setxattr = aufs_setxattr, - .getxattr = aufs_getxattr, - .listxattr = aufs_listxattr, - .removexattr = aufs_removexattr, -#endif - - .update_time = aufs_update_time -}; diff --git a/fs/aufs/i_op_add.c b/fs/aufs/i_op_add.c deleted file mode 100644 index 209b26864..000000000 --- a/fs/aufs/i_op_add.c +++ /dev/null @@ -1,932 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * inode operations (add entry) - */ - -#include "aufs.h" - -/* - * final procedure of adding a new entry, except link(2). - * remove whiteout, instantiate, copyup the parent dir's times and size - * and update version. - * if it failed, re-create the removed whiteout. - */ -static int epilog(struct inode *dir, aufs_bindex_t bindex, - struct dentry *wh_dentry, struct dentry *dentry) -{ - int err, rerr; - aufs_bindex_t bwh; - struct path h_path; - struct super_block *sb; - struct inode *inode, *h_dir; - struct dentry *wh; - - bwh = -1; - sb = dir->i_sb; - if (wh_dentry) { - h_dir = d_inode(wh_dentry->d_parent); /* dir inode is locked */ - IMustLock(h_dir); - AuDebugOn(au_h_iptr(dir, bindex) != h_dir); - bwh = au_dbwh(dentry); - h_path.dentry = wh_dentry; - h_path.mnt = au_sbr_mnt(sb, bindex); - err = au_wh_unlink_dentry(au_h_iptr(dir, bindex), &h_path, - dentry); - if (unlikely(err)) - goto out; - } - - inode = au_new_inode(dentry, /*must_new*/1); - if (!IS_ERR(inode)) { - d_instantiate(dentry, inode); - dir = d_inode(dentry->d_parent); /* dir inode is locked */ - IMustLock(dir); - au_dir_ts(dir, bindex); - dir->i_version++; - au_fhsm_wrote(sb, bindex, /*force*/0); - return 0; /* success */ - } - - err = PTR_ERR(inode); - if (!wh_dentry) - goto out; - - /* revert */ - /* dir inode is locked */ - wh = au_wh_create(dentry, bwh, wh_dentry->d_parent); - rerr = PTR_ERR(wh); - if (IS_ERR(wh)) { - AuIOErr("%pd reverting whiteout failed(%d, %d)\n", - dentry, err, rerr); - err = -EIO; - } else - dput(wh); - -out: - return err; -} - -static int au_d_may_add(struct dentry *dentry) -{ - int err; - - err = 0; - if (unlikely(d_unhashed(dentry))) - err = -ENOENT; - if (unlikely(d_really_is_positive(dentry))) - err = -EEXIST; - return err; -} - -/* - * simple tests for the adding inode operations. - * following the checks in vfs, plus the parent-child relationship. - */ -int au_may_add(struct dentry *dentry, aufs_bindex_t bindex, - struct dentry *h_parent, int isdir) -{ - int err; - umode_t h_mode; - struct dentry *h_dentry; - struct inode *h_inode; - - err = -ENAMETOOLONG; - if (unlikely(dentry->d_name.len > AUFS_MAX_NAMELEN)) - goto out; - - h_dentry = au_h_dptr(dentry, bindex); - if (d_really_is_negative(dentry)) { - err = -EEXIST; - if (unlikely(d_is_positive(h_dentry))) - goto out; - } else { - /* rename(2) case */ - err = -EIO; - if (unlikely(d_is_negative(h_dentry))) - goto out; - h_inode = d_inode(h_dentry); - if (unlikely(!h_inode->i_nlink)) - goto out; - - h_mode = h_inode->i_mode; - if (!isdir) { - err = -EISDIR; - if (unlikely(S_ISDIR(h_mode))) - goto out; - } else if (unlikely(!S_ISDIR(h_mode))) { - err = -ENOTDIR; - goto out; - } - } - - err = 0; - /* expected parent dir is locked */ - if (unlikely(h_parent != h_dentry->d_parent)) - err = -EIO; - -out: - AuTraceErr(err); - return err; -} - -/* - * initial procedure of adding a new entry. - * prepare writable branch and the parent dir, lock it, - * and lookup whiteout for the new entry. - */ -static struct dentry* -lock_hdir_lkup_wh(struct dentry *dentry, struct au_dtime *dt, - struct dentry *src_dentry, struct au_pin *pin, - struct au_wr_dir_args *wr_dir_args) -{ - struct dentry *wh_dentry, *h_parent; - struct super_block *sb; - struct au_branch *br; - int err; - unsigned int udba; - aufs_bindex_t bcpup; - - AuDbg("%pd\n", dentry); - - err = au_wr_dir(dentry, src_dentry, wr_dir_args); - bcpup = err; - wh_dentry = ERR_PTR(err); - if (unlikely(err < 0)) - goto out; - - sb = dentry->d_sb; - udba = au_opt_udba(sb); - err = au_pin(pin, dentry, bcpup, udba, - AuPin_DI_LOCKED | AuPin_MNT_WRITE); - wh_dentry = ERR_PTR(err); - if (unlikely(err)) - goto out; - - h_parent = au_pinned_h_parent(pin); - if (udba != AuOpt_UDBA_NONE - && au_dbstart(dentry) == bcpup) - err = au_may_add(dentry, bcpup, h_parent, - au_ftest_wrdir(wr_dir_args->flags, ISDIR)); - else if (unlikely(dentry->d_name.len > AUFS_MAX_NAMELEN)) - err = -ENAMETOOLONG; - wh_dentry = ERR_PTR(err); - if (unlikely(err)) - goto out_unpin; - - br = au_sbr(sb, bcpup); - if (dt) { - struct path tmp = { - .dentry = h_parent, - .mnt = au_br_mnt(br) - }; - au_dtime_store(dt, au_pinned_parent(pin), &tmp); - } - - wh_dentry = NULL; - if (bcpup != au_dbwh(dentry)) - goto out; /* success */ - - /* - * ENAMETOOLONG here means that if we allowed create such name, then it - * would not be able to removed in the future. So we don't allow such - * name here and we don't handle ENAMETOOLONG differently here. - */ - wh_dentry = au_wh_lkup(h_parent, &dentry->d_name, br); - -out_unpin: - if (IS_ERR(wh_dentry)) - au_unpin(pin); -out: - return wh_dentry; -} - -/* ---------------------------------------------------------------------- */ - -enum { Mknod, Symlink, Creat }; -struct simple_arg { - int type; - union { - struct { - umode_t mode; - bool want_excl; - bool try_aopen; - struct vfsub_aopen_args *aopen; - } c; - struct { - const char *symname; - } s; - struct { - umode_t mode; - dev_t dev; - } m; - } u; -}; - -static int add_simple(struct inode *dir, struct dentry *dentry, - struct simple_arg *arg) -{ - int err, rerr; - aufs_bindex_t bstart; - unsigned char created; - const unsigned char try_aopen - = (arg->type == Creat && arg->u.c.try_aopen); - struct dentry *wh_dentry, *parent; - struct inode *h_dir; - struct super_block *sb; - struct au_branch *br; - /* to reuduce stack size */ - struct { - struct au_dtime dt; - struct au_pin pin; - struct path h_path; - struct au_wr_dir_args wr_dir_args; - } *a; - - AuDbg("%pd\n", dentry); - IMustLock(dir); - - err = -ENOMEM; - a = kmalloc(sizeof(*a), GFP_NOFS); - if (unlikely(!a)) - goto out; - a->wr_dir_args.force_btgt = -1; - a->wr_dir_args.flags = AuWrDir_ADD_ENTRY; - - parent = dentry->d_parent; /* dir inode is locked */ - if (!try_aopen) { - err = aufs_read_lock(dentry, AuLock_DW | AuLock_GEN); - if (unlikely(err)) - goto out_free; - } - err = au_d_may_add(dentry); - if (unlikely(err)) - goto out_unlock; - if (!try_aopen) - di_write_lock_parent(parent); - wh_dentry = lock_hdir_lkup_wh(dentry, &a->dt, /*src_dentry*/NULL, - &a->pin, &a->wr_dir_args); - err = PTR_ERR(wh_dentry); - if (IS_ERR(wh_dentry)) - goto out_parent; - - bstart = au_dbstart(dentry); - sb = dentry->d_sb; - br = au_sbr(sb, bstart); - a->h_path.dentry = au_h_dptr(dentry, bstart); - a->h_path.mnt = au_br_mnt(br); - h_dir = au_pinned_h_dir(&a->pin); - switch (arg->type) { - case Creat: - err = 0; - if (!try_aopen || !h_dir->i_op->atomic_open) - err = vfsub_create(h_dir, &a->h_path, arg->u.c.mode, - arg->u.c.want_excl); - else - err = vfsub_atomic_open(h_dir, a->h_path.dentry, - arg->u.c.aopen, br); - break; - case Symlink: - err = vfsub_symlink(h_dir, &a->h_path, arg->u.s.symname); - break; - case Mknod: - err = vfsub_mknod(h_dir, &a->h_path, arg->u.m.mode, - arg->u.m.dev); - break; - default: - BUG(); - } - created = !err; - if (!err) - err = epilog(dir, bstart, wh_dentry, dentry); - - /* revert */ - if (unlikely(created && err && d_is_positive(a->h_path.dentry))) { - /* no delegation since it is just created */ - rerr = vfsub_unlink(h_dir, &a->h_path, /*delegated*/NULL, - /*force*/0); - if (rerr) { - AuIOErr("%pd revert failure(%d, %d)\n", - dentry, err, rerr); - err = -EIO; - } - au_dtime_revert(&a->dt); - } - - if (!err && try_aopen && !h_dir->i_op->atomic_open) - *arg->u.c.aopen->opened |= FILE_CREATED; - - au_unpin(&a->pin); - dput(wh_dentry); - -out_parent: - if (!try_aopen) - di_write_unlock(parent); -out_unlock: - if (unlikely(err)) { - au_update_dbstart(dentry); - d_drop(dentry); - } - if (!try_aopen) - aufs_read_unlock(dentry, AuLock_DW); -out_free: - kfree(a); -out: - return err; -} - -int aufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, - dev_t dev) -{ - struct simple_arg arg = { - .type = Mknod, - .u.m = { - .mode = mode, - .dev = dev - } - }; - return add_simple(dir, dentry, &arg); -} - -int aufs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) -{ - struct simple_arg arg = { - .type = Symlink, - .u.s.symname = symname - }; - return add_simple(dir, dentry, &arg); -} - -int aufs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool want_excl) -{ - struct simple_arg arg = { - .type = Creat, - .u.c = { - .mode = mode, - .want_excl = want_excl - } - }; - return add_simple(dir, dentry, &arg); -} - -int au_aopen_or_create(struct inode *dir, struct dentry *dentry, - struct vfsub_aopen_args *aopen_args) -{ - struct simple_arg arg = { - .type = Creat, - .u.c = { - .mode = aopen_args->create_mode, - .want_excl = aopen_args->open_flag & O_EXCL, - .try_aopen = true, - .aopen = aopen_args - } - }; - return add_simple(dir, dentry, &arg); -} - -int aufs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) -{ - int err; - aufs_bindex_t bindex; - struct super_block *sb; - struct dentry *parent, *h_parent, *h_dentry; - struct inode *h_dir, *inode; - struct vfsmount *h_mnt; - struct au_wr_dir_args wr_dir_args = { - .force_btgt = -1, - .flags = AuWrDir_TMPFILE - }; - - /* copy-up may happen */ - mutex_lock(&dir->i_mutex); - - sb = dir->i_sb; - err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM); - if (unlikely(err)) - goto out; - - err = au_di_init(dentry); - if (unlikely(err)) - goto out_si; - - err = -EBUSY; - parent = d_find_any_alias(dir); - AuDebugOn(!parent); - di_write_lock_parent(parent); - if (unlikely(d_inode(parent) != dir)) - goto out_parent; - - err = au_digen_test(parent, au_sigen(sb)); - if (unlikely(err)) - goto out_parent; - - bindex = au_dbstart(parent); - au_set_dbstart(dentry, bindex); - au_set_dbend(dentry, bindex); - err = au_wr_dir(dentry, /*src_dentry*/NULL, &wr_dir_args); - bindex = err; - if (unlikely(err < 0)) - goto out_parent; - - err = -EOPNOTSUPP; - h_dir = au_h_iptr(dir, bindex); - if (unlikely(!h_dir->i_op->tmpfile)) - goto out_parent; - - h_mnt = au_sbr_mnt(sb, bindex); - err = vfsub_mnt_want_write(h_mnt); - if (unlikely(err)) - goto out_parent; - - h_parent = au_h_dptr(parent, bindex); - err = inode_permission(d_inode(h_parent), MAY_WRITE | MAY_EXEC); - if (unlikely(err)) - goto out_mnt; - - err = -ENOMEM; - h_dentry = d_alloc(h_parent, &dentry->d_name); - if (unlikely(!h_dentry)) - goto out_mnt; - - err = h_dir->i_op->tmpfile(h_dir, h_dentry, mode); - if (unlikely(err)) - goto out_dentry; - - au_set_dbstart(dentry, bindex); - au_set_dbend(dentry, bindex); - au_set_h_dptr(dentry, bindex, dget(h_dentry)); - inode = au_new_inode(dentry, /*must_new*/1); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - au_set_h_dptr(dentry, bindex, NULL); - au_set_dbstart(dentry, -1); - au_set_dbend(dentry, -1); - } else { - if (!inode->i_nlink) - set_nlink(inode, 1); - d_tmpfile(dentry, inode); - au_di(dentry)->di_tmpfile = 1; - - /* update without i_mutex */ - if (au_ibstart(dir) == au_dbstart(dentry)) - au_cpup_attr_timesizes(dir); - } - -out_dentry: - dput(h_dentry); -out_mnt: - vfsub_mnt_drop_write(h_mnt); -out_parent: - di_write_unlock(parent); - dput(parent); - di_write_unlock(dentry); - if (!err) -#if 0 - /* verbose coding for lock class name */ - au_rw_class(&au_di(dentry)->di_rwsem, - au_lc_key + AuLcNonDir_DIINFO); -#else - ; -#endif - else { - au_di_fin(dentry); - dentry->d_fsdata = NULL; - } -out_si: - si_read_unlock(sb); -out: - mutex_unlock(&dir->i_mutex); - return err; -} - -/* ---------------------------------------------------------------------- */ - -struct au_link_args { - aufs_bindex_t bdst, bsrc; - struct au_pin pin; - struct path h_path; - struct dentry *src_parent, *parent; -}; - -static int au_cpup_before_link(struct dentry *src_dentry, - struct au_link_args *a) -{ - int err; - struct dentry *h_src_dentry; - struct au_cp_generic cpg = { - .dentry = src_dentry, - .bdst = a->bdst, - .bsrc = a->bsrc, - .len = -1, - .pin = &a->pin, - .flags = AuCpup_DTIME | AuCpup_HOPEN /* | AuCpup_KEEPLINO */ - }; - - di_read_lock_parent(a->src_parent, AuLock_IR); - err = au_test_and_cpup_dirs(src_dentry, a->bdst); - if (unlikely(err)) - goto out; - - h_src_dentry = au_h_dptr(src_dentry, a->bsrc); - err = au_pin(&a->pin, src_dentry, a->bdst, - au_opt_udba(src_dentry->d_sb), - AuPin_DI_LOCKED | AuPin_MNT_WRITE); - if (unlikely(err)) - goto out; - - err = au_sio_cpup_simple(&cpg); - au_unpin(&a->pin); - -out: - di_read_unlock(a->src_parent, AuLock_IR); - return err; -} - -static int au_cpup_or_link(struct dentry *src_dentry, struct dentry *dentry, - struct au_link_args *a) -{ - int err; - unsigned char plink; - aufs_bindex_t bend; - struct dentry *h_src_dentry; - struct inode *h_inode, *inode, *delegated; - struct super_block *sb; - struct file *h_file; - - plink = 0; - h_inode = NULL; - sb = src_dentry->d_sb; - inode = d_inode(src_dentry); - if (au_ibstart(inode) <= a->bdst) - h_inode = au_h_iptr(inode, a->bdst); - if (!h_inode || !h_inode->i_nlink) { - /* copyup src_dentry as the name of dentry. */ - bend = au_dbend(dentry); - if (bend < a->bsrc) - au_set_dbend(dentry, a->bsrc); - au_set_h_dptr(dentry, a->bsrc, - dget(au_h_dptr(src_dentry, a->bsrc))); - dget(a->h_path.dentry); - au_set_h_dptr(dentry, a->bdst, NULL); - AuDbg("temporary d_inode...\n"); - spin_lock(&dentry->d_lock); - dentry->d_inode = d_inode(src_dentry); /* tmp */ - spin_unlock(&dentry->d_lock); - h_file = au_h_open_pre(dentry, a->bsrc, /*force_wr*/0); - if (IS_ERR(h_file)) - err = PTR_ERR(h_file); - else { - struct au_cp_generic cpg = { - .dentry = dentry, - .bdst = a->bdst, - .bsrc = -1, - .len = -1, - .pin = &a->pin, - .flags = AuCpup_KEEPLINO - }; - err = au_sio_cpup_simple(&cpg); - au_h_open_post(dentry, a->bsrc, h_file); - if (!err) { - dput(a->h_path.dentry); - a->h_path.dentry = au_h_dptr(dentry, a->bdst); - } else - au_set_h_dptr(dentry, a->bdst, - a->h_path.dentry); - } - spin_lock(&dentry->d_lock); - dentry->d_inode = NULL; /* restore */ - spin_unlock(&dentry->d_lock); - AuDbg("temporary d_inode...done\n"); - au_set_h_dptr(dentry, a->bsrc, NULL); - au_set_dbend(dentry, bend); - } else { - /* the inode of src_dentry already exists on a.bdst branch */ - h_src_dentry = d_find_alias(h_inode); - if (!h_src_dentry && au_plink_test(inode)) { - plink = 1; - h_src_dentry = au_plink_lkup(inode, a->bdst); - err = PTR_ERR(h_src_dentry); - if (IS_ERR(h_src_dentry)) - goto out; - - if (unlikely(d_is_negative(h_src_dentry))) { - dput(h_src_dentry); - h_src_dentry = NULL; - } - - } - if (h_src_dentry) { - delegated = NULL; - err = vfsub_link(h_src_dentry, au_pinned_h_dir(&a->pin), - &a->h_path, &delegated); - if (unlikely(err == -EWOULDBLOCK)) { - pr_warn("cannot retry for NFSv4 delegation" - " for an internal link\n"); - iput(delegated); - } - dput(h_src_dentry); - } else { - AuIOErr("no dentry found for hi%lu on b%d\n", - h_inode->i_ino, a->bdst); - err = -EIO; - } - } - - if (!err && !plink) - au_plink_append(inode, a->bdst, a->h_path.dentry); - -out: - AuTraceErr(err); - return err; -} - -int aufs_link(struct dentry *src_dentry, struct inode *dir, - struct dentry *dentry) -{ - int err, rerr; - struct au_dtime dt; - struct au_link_args *a; - struct dentry *wh_dentry, *h_src_dentry; - struct inode *inode, *delegated; - struct super_block *sb; - struct au_wr_dir_args wr_dir_args = { - /* .force_btgt = -1, */ - .flags = AuWrDir_ADD_ENTRY - }; - - IMustLock(dir); - inode = d_inode(src_dentry); - IMustLock(inode); - - err = -ENOMEM; - a = kzalloc(sizeof(*a), GFP_NOFS); - if (unlikely(!a)) - goto out; - - a->parent = dentry->d_parent; /* dir inode is locked */ - err = aufs_read_and_write_lock2(dentry, src_dentry, - AuLock_NOPLM | AuLock_GEN); - if (unlikely(err)) - goto out_kfree; - err = au_d_linkable(src_dentry); - if (unlikely(err)) - goto out_unlock; - err = au_d_may_add(dentry); - if (unlikely(err)) - goto out_unlock; - - a->src_parent = dget_parent(src_dentry); - wr_dir_args.force_btgt = au_ibstart(inode); - - di_write_lock_parent(a->parent); - wr_dir_args.force_btgt = au_wbr(dentry, wr_dir_args.force_btgt); - wh_dentry = lock_hdir_lkup_wh(dentry, &dt, src_dentry, &a->pin, - &wr_dir_args); - err = PTR_ERR(wh_dentry); - if (IS_ERR(wh_dentry)) - goto out_parent; - - err = 0; - sb = dentry->d_sb; - a->bdst = au_dbstart(dentry); - a->h_path.dentry = au_h_dptr(dentry, a->bdst); - a->h_path.mnt = au_sbr_mnt(sb, a->bdst); - a->bsrc = au_ibstart(inode); - h_src_dentry = au_h_d_alias(src_dentry, a->bsrc); - if (!h_src_dentry && au_di(src_dentry)->di_tmpfile) - h_src_dentry = dget(au_hi_wh(inode, a->bsrc)); - if (!h_src_dentry) { - a->bsrc = au_dbstart(src_dentry); - h_src_dentry = au_h_d_alias(src_dentry, a->bsrc); - AuDebugOn(!h_src_dentry); - } else if (IS_ERR(h_src_dentry)) { - err = PTR_ERR(h_src_dentry); - goto out_parent; - } - - if (au_opt_test(au_mntflags(sb), PLINK)) { - if (a->bdst < a->bsrc - /* && h_src_dentry->d_sb != a->h_path.dentry->d_sb */) - err = au_cpup_or_link(src_dentry, dentry, a); - else { - delegated = NULL; - err = vfsub_link(h_src_dentry, au_pinned_h_dir(&a->pin), - &a->h_path, &delegated); - if (unlikely(err == -EWOULDBLOCK)) { - pr_warn("cannot retry for NFSv4 delegation" - " for an internal link\n"); - iput(delegated); - } - } - dput(h_src_dentry); - } else { - /* - * copyup src_dentry to the branch we process, - * and then link(2) to it. - */ - dput(h_src_dentry); - if (a->bdst < a->bsrc - /* && h_src_dentry->d_sb != a->h_path.dentry->d_sb */) { - au_unpin(&a->pin); - di_write_unlock(a->parent); - err = au_cpup_before_link(src_dentry, a); - di_write_lock_parent(a->parent); - if (!err) - err = au_pin(&a->pin, dentry, a->bdst, - au_opt_udba(sb), - AuPin_DI_LOCKED | AuPin_MNT_WRITE); - if (unlikely(err)) - goto out_wh; - } - if (!err) { - h_src_dentry = au_h_dptr(src_dentry, a->bdst); - err = -ENOENT; - if (h_src_dentry && d_is_positive(h_src_dentry)) { - delegated = NULL; - err = vfsub_link(h_src_dentry, - au_pinned_h_dir(&a->pin), - &a->h_path, &delegated); - if (unlikely(err == -EWOULDBLOCK)) { - pr_warn("cannot retry" - " for NFSv4 delegation" - " for an internal link\n"); - iput(delegated); - } - } - } - } - if (unlikely(err)) - goto out_unpin; - - if (wh_dentry) { - a->h_path.dentry = wh_dentry; - err = au_wh_unlink_dentry(au_pinned_h_dir(&a->pin), &a->h_path, - dentry); - if (unlikely(err)) - goto out_revert; - } - - au_dir_ts(dir, a->bdst); - dir->i_version++; - inc_nlink(inode); - inode->i_ctime = dir->i_ctime; - d_instantiate(dentry, au_igrab(inode)); - if (d_unhashed(a->h_path.dentry)) - /* some filesystem calls d_drop() */ - d_drop(dentry); - /* some filesystems consume an inode even hardlink */ - au_fhsm_wrote(sb, a->bdst, /*force*/0); - goto out_unpin; /* success */ - -out_revert: - /* no delegation since it is just created */ - rerr = vfsub_unlink(au_pinned_h_dir(&a->pin), &a->h_path, - /*delegated*/NULL, /*force*/0); - if (unlikely(rerr)) { - AuIOErr("%pd reverting failed(%d, %d)\n", dentry, err, rerr); - err = -EIO; - } - au_dtime_revert(&dt); -out_unpin: - au_unpin(&a->pin); -out_wh: - dput(wh_dentry); -out_parent: - di_write_unlock(a->parent); - dput(a->src_parent); -out_unlock: - if (unlikely(err)) { - au_update_dbstart(dentry); - d_drop(dentry); - } - aufs_read_and_write_unlock2(dentry, src_dentry); -out_kfree: - kfree(a); -out: - AuTraceErr(err); - return err; -} - -int aufs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) -{ - int err, rerr; - aufs_bindex_t bindex; - unsigned char diropq; - struct path h_path; - struct dentry *wh_dentry, *parent, *opq_dentry; - struct mutex *h_mtx; - struct super_block *sb; - struct { - struct au_pin pin; - struct au_dtime dt; - } *a; /* reduce the stack usage */ - struct au_wr_dir_args wr_dir_args = { - .force_btgt = -1, - .flags = AuWrDir_ADD_ENTRY | AuWrDir_ISDIR - }; - - IMustLock(dir); - - err = -ENOMEM; - a = kmalloc(sizeof(*a), GFP_NOFS); - if (unlikely(!a)) - goto out; - - err = aufs_read_lock(dentry, AuLock_DW | AuLock_GEN); - if (unlikely(err)) - goto out_free; - err = au_d_may_add(dentry); - if (unlikely(err)) - goto out_unlock; - - parent = dentry->d_parent; /* dir inode is locked */ - di_write_lock_parent(parent); - wh_dentry = lock_hdir_lkup_wh(dentry, &a->dt, /*src_dentry*/NULL, - &a->pin, &wr_dir_args); - err = PTR_ERR(wh_dentry); - if (IS_ERR(wh_dentry)) - goto out_parent; - - sb = dentry->d_sb; - bindex = au_dbstart(dentry); - h_path.dentry = au_h_dptr(dentry, bindex); - h_path.mnt = au_sbr_mnt(sb, bindex); - err = vfsub_mkdir(au_pinned_h_dir(&a->pin), &h_path, mode); - if (unlikely(err)) - goto out_unpin; - - /* make the dir opaque */ - diropq = 0; - h_mtx = &d_inode(h_path.dentry)->i_mutex; - if (wh_dentry - || au_opt_test(au_mntflags(sb), ALWAYS_DIROPQ)) { - mutex_lock_nested(h_mtx, AuLsc_I_CHILD); - opq_dentry = au_diropq_create(dentry, bindex); - mutex_unlock(h_mtx); - err = PTR_ERR(opq_dentry); - if (IS_ERR(opq_dentry)) - goto out_dir; - dput(opq_dentry); - diropq = 1; - } - - err = epilog(dir, bindex, wh_dentry, dentry); - if (!err) { - inc_nlink(dir); - goto out_unpin; /* success */ - } - - /* revert */ - if (diropq) { - AuLabel(revert opq); - mutex_lock_nested(h_mtx, AuLsc_I_CHILD); - rerr = au_diropq_remove(dentry, bindex); - mutex_unlock(h_mtx); - if (rerr) { - AuIOErr("%pd reverting diropq failed(%d, %d)\n", - dentry, err, rerr); - err = -EIO; - } - } - -out_dir: - AuLabel(revert dir); - rerr = vfsub_rmdir(au_pinned_h_dir(&a->pin), &h_path); - if (rerr) { - AuIOErr("%pd reverting dir failed(%d, %d)\n", - dentry, err, rerr); - err = -EIO; - } - au_dtime_revert(&a->dt); -out_unpin: - au_unpin(&a->pin); - dput(wh_dentry); -out_parent: - di_write_unlock(parent); -out_unlock: - if (unlikely(err)) { - au_update_dbstart(dentry); - d_drop(dentry); - } - aufs_read_unlock(dentry, AuLock_DW); -out_free: - kfree(a); -out: - return err; -} diff --git a/fs/aufs/i_op_del.c b/fs/aufs/i_op_del.c deleted file mode 100644 index 78cf67a52..000000000 --- a/fs/aufs/i_op_del.c +++ /dev/null @@ -1,510 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * inode operations (del entry) - */ - -#include "aufs.h" - -/* - * decide if a new whiteout for @dentry is necessary or not. - * when it is necessary, prepare the parent dir for the upper branch whose - * branch index is @bcpup for creation. the actual creation of the whiteout will - * be done by caller. - * return value: - * 0: wh is unnecessary - * plus: wh is necessary - * minus: error - */ -int au_wr_dir_need_wh(struct dentry *dentry, int isdir, aufs_bindex_t *bcpup) -{ - int need_wh, err; - aufs_bindex_t bstart; - struct super_block *sb; - - sb = dentry->d_sb; - bstart = au_dbstart(dentry); - if (*bcpup < 0) { - *bcpup = bstart; - if (au_test_ro(sb, bstart, d_inode(dentry))) { - err = AuWbrCopyup(au_sbi(sb), dentry); - *bcpup = err; - if (unlikely(err < 0)) - goto out; - } - } else - AuDebugOn(bstart < *bcpup - || au_test_ro(sb, *bcpup, d_inode(dentry))); - AuDbg("bcpup %d, bstart %d\n", *bcpup, bstart); - - if (*bcpup != bstart) { - err = au_cpup_dirs(dentry, *bcpup); - if (unlikely(err)) - goto out; - need_wh = 1; - } else { - struct au_dinfo *dinfo, *tmp; - - need_wh = -ENOMEM; - dinfo = au_di(dentry); - tmp = au_di_alloc(sb, AuLsc_DI_TMP); - if (tmp) { - au_di_cp(tmp, dinfo); - au_di_swap(tmp, dinfo); - /* returns the number of positive dentries */ - need_wh = au_lkup_dentry(dentry, bstart + 1, /*type*/0); - au_di_swap(tmp, dinfo); - au_rw_write_unlock(&tmp->di_rwsem); - au_di_free(tmp); - } - } - AuDbg("need_wh %d\n", need_wh); - err = need_wh; - -out: - return err; -} - -/* - * simple tests for the del-entry operations. - * following the checks in vfs, plus the parent-child relationship. - */ -int au_may_del(struct dentry *dentry, aufs_bindex_t bindex, - struct dentry *h_parent, int isdir) -{ - int err; - umode_t h_mode; - struct dentry *h_dentry, *h_latest; - struct inode *h_inode; - - h_dentry = au_h_dptr(dentry, bindex); - if (d_really_is_positive(dentry)) { - err = -ENOENT; - if (unlikely(d_is_negative(h_dentry))) - goto out; - h_inode = d_inode(h_dentry); - if (unlikely(!h_inode->i_nlink)) - goto out; - - h_mode = h_inode->i_mode; - if (!isdir) { - err = -EISDIR; - if (unlikely(S_ISDIR(h_mode))) - goto out; - } else if (unlikely(!S_ISDIR(h_mode))) { - err = -ENOTDIR; - goto out; - } - } else { - /* rename(2) case */ - err = -EIO; - if (unlikely(d_is_positive(h_dentry))) - goto out; - } - - err = -ENOENT; - /* expected parent dir is locked */ - if (unlikely(h_parent != h_dentry->d_parent)) - goto out; - err = 0; - - /* - * rmdir a dir may break the consistency on some filesystem. - * let's try heavy test. - */ - err = -EACCES; - if (unlikely(!au_opt_test(au_mntflags(dentry->d_sb), DIRPERM1) - && au_test_h_perm(d_inode(h_parent), - MAY_EXEC | MAY_WRITE))) - goto out; - - h_latest = au_sio_lkup_one(&dentry->d_name, h_parent); - err = -EIO; - if (IS_ERR(h_latest)) - goto out; - if (h_latest == h_dentry) - err = 0; - dput(h_latest); - -out: - return err; -} - -/* - * decide the branch where we operate for @dentry. the branch index will be set - * @rbcpup. after diciding it, 'pin' it and store the timestamps of the parent - * dir for reverting. - * when a new whiteout is necessary, create it. - */ -static struct dentry* -lock_hdir_create_wh(struct dentry *dentry, int isdir, aufs_bindex_t *rbcpup, - struct au_dtime *dt, struct au_pin *pin) -{ - struct dentry *wh_dentry; - struct super_block *sb; - struct path h_path; - int err, need_wh; - unsigned int udba; - aufs_bindex_t bcpup; - - need_wh = au_wr_dir_need_wh(dentry, isdir, rbcpup); - wh_dentry = ERR_PTR(need_wh); - if (unlikely(need_wh < 0)) - goto out; - - sb = dentry->d_sb; - udba = au_opt_udba(sb); - bcpup = *rbcpup; - err = au_pin(pin, dentry, bcpup, udba, - AuPin_DI_LOCKED | AuPin_MNT_WRITE); - wh_dentry = ERR_PTR(err); - if (unlikely(err)) - goto out; - - h_path.dentry = au_pinned_h_parent(pin); - if (udba != AuOpt_UDBA_NONE - && au_dbstart(dentry) == bcpup) { - err = au_may_del(dentry, bcpup, h_path.dentry, isdir); - wh_dentry = ERR_PTR(err); - if (unlikely(err)) - goto out_unpin; - } - - h_path.mnt = au_sbr_mnt(sb, bcpup); - au_dtime_store(dt, au_pinned_parent(pin), &h_path); - wh_dentry = NULL; - if (!need_wh) - goto out; /* success, no need to create whiteout */ - - wh_dentry = au_wh_create(dentry, bcpup, h_path.dentry); - if (IS_ERR(wh_dentry)) - goto out_unpin; - - /* returns with the parent is locked and wh_dentry is dget-ed */ - goto out; /* success */ - -out_unpin: - au_unpin(pin); -out: - return wh_dentry; -} - -/* - * when removing a dir, rename it to a unique temporary whiteout-ed name first - * in order to be revertible and save time for removing many child whiteouts - * under the dir. - * returns 1 when there are too many child whiteout and caller should remove - * them asynchronously. returns 0 when the number of children is enough small to - * remove now or the branch fs is a remote fs. - * otherwise return an error. - */ -static int renwh_and_rmdir(struct dentry *dentry, aufs_bindex_t bindex, - struct au_nhash *whlist, struct inode *dir) -{ - int rmdir_later, err, dirwh; - struct dentry *h_dentry; - struct super_block *sb; - struct inode *inode; - - sb = dentry->d_sb; - SiMustAnyLock(sb); - h_dentry = au_h_dptr(dentry, bindex); - err = au_whtmp_ren(h_dentry, au_sbr(sb, bindex)); - if (unlikely(err)) - goto out; - - /* stop monitoring */ - inode = d_inode(dentry); - au_hn_free(au_hi(inode, bindex)); - - if (!au_test_fs_remote(h_dentry->d_sb)) { - dirwh = au_sbi(sb)->si_dirwh; - rmdir_later = (dirwh <= 1); - if (!rmdir_later) - rmdir_later = au_nhash_test_longer_wh(whlist, bindex, - dirwh); - if (rmdir_later) - return rmdir_later; - } - - err = au_whtmp_rmdir(dir, bindex, h_dentry, whlist); - if (unlikely(err)) { - AuIOErr("rmdir %pd, b%d failed, %d. ignored\n", - h_dentry, bindex, err); - err = 0; - } - -out: - AuTraceErr(err); - return err; -} - -/* - * final procedure for deleting a entry. - * maintain dentry and iattr. - */ -static void epilog(struct inode *dir, struct dentry *dentry, - aufs_bindex_t bindex) -{ - struct inode *inode; - - inode = d_inode(dentry); - d_drop(dentry); - inode->i_ctime = dir->i_ctime; - - au_dir_ts(dir, bindex); - dir->i_version++; -} - -/* - * when an error happened, remove the created whiteout and revert everything. - */ -static int do_revert(int err, struct inode *dir, aufs_bindex_t bindex, - aufs_bindex_t bwh, struct dentry *wh_dentry, - struct dentry *dentry, struct au_dtime *dt) -{ - int rerr; - struct path h_path = { - .dentry = wh_dentry, - .mnt = au_sbr_mnt(dir->i_sb, bindex) - }; - - rerr = au_wh_unlink_dentry(au_h_iptr(dir, bindex), &h_path, dentry); - if (!rerr) { - au_set_dbwh(dentry, bwh); - au_dtime_revert(dt); - return 0; - } - - AuIOErr("%pd reverting whiteout failed(%d, %d)\n", dentry, err, rerr); - return -EIO; -} - -/* ---------------------------------------------------------------------- */ - -int aufs_unlink(struct inode *dir, struct dentry *dentry) -{ - int err; - aufs_bindex_t bwh, bindex, bstart; - struct inode *inode, *h_dir, *delegated; - struct dentry *parent, *wh_dentry; - /* to reuduce stack size */ - struct { - struct au_dtime dt; - struct au_pin pin; - struct path h_path; - } *a; - - IMustLock(dir); - - err = -ENOMEM; - a = kmalloc(sizeof(*a), GFP_NOFS); - if (unlikely(!a)) - goto out; - - err = aufs_read_lock(dentry, AuLock_DW | AuLock_GEN); - if (unlikely(err)) - goto out_free; - err = au_d_hashed_positive(dentry); - if (unlikely(err)) - goto out_unlock; - inode = d_inode(dentry); - IMustLock(inode); - err = -EISDIR; - if (unlikely(d_is_dir(dentry))) - goto out_unlock; /* possible? */ - - bstart = au_dbstart(dentry); - bwh = au_dbwh(dentry); - bindex = -1; - parent = dentry->d_parent; /* dir inode is locked */ - di_write_lock_parent(parent); - wh_dentry = lock_hdir_create_wh(dentry, /*isdir*/0, &bindex, &a->dt, - &a->pin); - err = PTR_ERR(wh_dentry); - if (IS_ERR(wh_dentry)) - goto out_parent; - - a->h_path.mnt = au_sbr_mnt(dentry->d_sb, bstart); - a->h_path.dentry = au_h_dptr(dentry, bstart); - dget(a->h_path.dentry); - if (bindex == bstart) { - h_dir = au_pinned_h_dir(&a->pin); - delegated = NULL; - err = vfsub_unlink(h_dir, &a->h_path, &delegated, /*force*/0); - if (unlikely(err == -EWOULDBLOCK)) { - pr_warn("cannot retry for NFSv4 delegation" - " for an internal unlink\n"); - iput(delegated); - } - } else { - /* dir inode is locked */ - h_dir = d_inode(wh_dentry->d_parent); - IMustLock(h_dir); - err = 0; - } - - if (!err) { - vfsub_drop_nlink(inode); - epilog(dir, dentry, bindex); - - /* update target timestamps */ - if (bindex == bstart) { - vfsub_update_h_iattr(&a->h_path, /*did*/NULL); - /*ignore*/ - inode->i_ctime = d_inode(a->h_path.dentry)->i_ctime; - } else - /* todo: this timestamp may be reverted later */ - inode->i_ctime = h_dir->i_ctime; - goto out_unpin; /* success */ - } - - /* revert */ - if (wh_dentry) { - int rerr; - - rerr = do_revert(err, dir, bindex, bwh, wh_dentry, dentry, - &a->dt); - if (rerr) - err = rerr; - } - -out_unpin: - au_unpin(&a->pin); - dput(wh_dentry); - dput(a->h_path.dentry); -out_parent: - di_write_unlock(parent); -out_unlock: - aufs_read_unlock(dentry, AuLock_DW); -out_free: - kfree(a); -out: - return err; -} - -int aufs_rmdir(struct inode *dir, struct dentry *dentry) -{ - int err, rmdir_later; - aufs_bindex_t bwh, bindex, bstart; - struct inode *inode; - struct dentry *parent, *wh_dentry, *h_dentry; - struct au_whtmp_rmdir *args; - /* to reuduce stack size */ - struct { - struct au_dtime dt; - struct au_pin pin; - } *a; - - IMustLock(dir); - - err = -ENOMEM; - a = kmalloc(sizeof(*a), GFP_NOFS); - if (unlikely(!a)) - goto out; - - err = aufs_read_lock(dentry, AuLock_DW | AuLock_FLUSH | AuLock_GEN); - if (unlikely(err)) - goto out_free; - err = au_alive_dir(dentry); - if (unlikely(err)) - goto out_unlock; - inode = d_inode(dentry); - IMustLock(inode); - err = -ENOTDIR; - if (unlikely(!d_is_dir(dentry))) - goto out_unlock; /* possible? */ - - err = -ENOMEM; - args = au_whtmp_rmdir_alloc(dir->i_sb, GFP_NOFS); - if (unlikely(!args)) - goto out_unlock; - - parent = dentry->d_parent; /* dir inode is locked */ - di_write_lock_parent(parent); - err = au_test_empty(dentry, &args->whlist); - if (unlikely(err)) - goto out_parent; - - bstart = au_dbstart(dentry); - bwh = au_dbwh(dentry); - bindex = -1; - wh_dentry = lock_hdir_create_wh(dentry, /*isdir*/1, &bindex, &a->dt, - &a->pin); - err = PTR_ERR(wh_dentry); - if (IS_ERR(wh_dentry)) - goto out_parent; - - h_dentry = au_h_dptr(dentry, bstart); - dget(h_dentry); - rmdir_later = 0; - if (bindex == bstart) { - err = renwh_and_rmdir(dentry, bstart, &args->whlist, dir); - if (err > 0) { - rmdir_later = err; - err = 0; - } - } else { - /* stop monitoring */ - au_hn_free(au_hi(inode, bstart)); - - /* dir inode is locked */ - IMustLock(d_inode(wh_dentry->d_parent)); - err = 0; - } - - if (!err) { - vfsub_dead_dir(inode); - au_set_dbdiropq(dentry, -1); - epilog(dir, dentry, bindex); - - if (rmdir_later) { - au_whtmp_kick_rmdir(dir, bstart, h_dentry, args); - args = NULL; - } - - goto out_unpin; /* success */ - } - - /* revert */ - AuLabel(revert); - if (wh_dentry) { - int rerr; - - rerr = do_revert(err, dir, bindex, bwh, wh_dentry, dentry, - &a->dt); - if (rerr) - err = rerr; - } - -out_unpin: - au_unpin(&a->pin); - dput(wh_dentry); - dput(h_dentry); -out_parent: - di_write_unlock(parent); - if (args) - au_whtmp_rmdir_free(args); -out_unlock: - aufs_read_unlock(dentry, AuLock_DW); -out_free: - kfree(a); -out: - AuTraceErr(err); - return err; -} diff --git a/fs/aufs/i_op_ren.c b/fs/aufs/i_op_ren.c deleted file mode 100644 index 4db778057..000000000 --- a/fs/aufs/i_op_ren.c +++ /dev/null @@ -1,1017 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * inode operation (rename entry) - * todo: this is crazy monster - */ - -#include "aufs.h" - -enum { AuSRC, AuDST, AuSrcDst }; -enum { AuPARENT, AuCHILD, AuParentChild }; - -#define AuRen_ISDIR 1 -#define AuRen_ISSAMEDIR (1 << 1) -#define AuRen_WHSRC (1 << 2) -#define AuRen_WHDST (1 << 3) -#define AuRen_MNT_WRITE (1 << 4) -#define AuRen_DT_DSTDIR (1 << 5) -#define AuRen_DIROPQ (1 << 6) -#define au_ftest_ren(flags, name) ((flags) & AuRen_##name) -#define au_fset_ren(flags, name) \ - do { (flags) |= AuRen_##name; } while (0) -#define au_fclr_ren(flags, name) \ - do { (flags) &= ~AuRen_##name; } while (0) - -struct au_ren_args { - struct { - struct dentry *dentry, *h_dentry, *parent, *h_parent, - *wh_dentry; - struct inode *dir, *inode; - struct au_hinode *hdir; - struct au_dtime dt[AuParentChild]; - aufs_bindex_t bstart; - } sd[AuSrcDst]; - -#define src_dentry sd[AuSRC].dentry -#define src_dir sd[AuSRC].dir -#define src_inode sd[AuSRC].inode -#define src_h_dentry sd[AuSRC].h_dentry -#define src_parent sd[AuSRC].parent -#define src_h_parent sd[AuSRC].h_parent -#define src_wh_dentry sd[AuSRC].wh_dentry -#define src_hdir sd[AuSRC].hdir -#define src_h_dir sd[AuSRC].hdir->hi_inode -#define src_dt sd[AuSRC].dt -#define src_bstart sd[AuSRC].bstart - -#define dst_dentry sd[AuDST].dentry -#define dst_dir sd[AuDST].dir -#define dst_inode sd[AuDST].inode -#define dst_h_dentry sd[AuDST].h_dentry -#define dst_parent sd[AuDST].parent -#define dst_h_parent sd[AuDST].h_parent -#define dst_wh_dentry sd[AuDST].wh_dentry -#define dst_hdir sd[AuDST].hdir -#define dst_h_dir sd[AuDST].hdir->hi_inode -#define dst_dt sd[AuDST].dt -#define dst_bstart sd[AuDST].bstart - - struct dentry *h_trap; - struct au_branch *br; - struct au_hinode *src_hinode; - struct path h_path; - struct au_nhash whlist; - aufs_bindex_t btgt, src_bwh, src_bdiropq; - - unsigned int flags; - - struct au_whtmp_rmdir *thargs; - struct dentry *h_dst; -}; - -/* ---------------------------------------------------------------------- */ - -/* - * functions for reverting. - * when an error happened in a single rename systemcall, we should revert - * everything as if nothing happend. - * we don't need to revert the copied-up/down the parent dir since they are - * harmless. - */ - -#define RevertFailure(fmt, ...) do { \ - AuIOErr("revert failure: " fmt " (%d, %d)\n", \ - ##__VA_ARGS__, err, rerr); \ - err = -EIO; \ -} while (0) - -static void au_ren_rev_diropq(int err, struct au_ren_args *a) -{ - int rerr; - - au_hn_imtx_lock_nested(a->src_hinode, AuLsc_I_CHILD); - rerr = au_diropq_remove(a->src_dentry, a->btgt); - au_hn_imtx_unlock(a->src_hinode); - au_set_dbdiropq(a->src_dentry, a->src_bdiropq); - if (rerr) - RevertFailure("remove diropq %pd", a->src_dentry); -} - -static void au_ren_rev_rename(int err, struct au_ren_args *a) -{ - int rerr; - struct inode *delegated; - - a->h_path.dentry = vfsub_lkup_one(&a->src_dentry->d_name, - a->src_h_parent); - rerr = PTR_ERR(a->h_path.dentry); - if (IS_ERR(a->h_path.dentry)) { - RevertFailure("lkup one %pd", a->src_dentry); - return; - } - - delegated = NULL; - rerr = vfsub_rename(a->dst_h_dir, - au_h_dptr(a->src_dentry, a->btgt), - a->src_h_dir, &a->h_path, &delegated); - if (unlikely(rerr == -EWOULDBLOCK)) { - pr_warn("cannot retry for NFSv4 delegation" - " for an internal rename\n"); - iput(delegated); - } - d_drop(a->h_path.dentry); - dput(a->h_path.dentry); - /* au_set_h_dptr(a->src_dentry, a->btgt, NULL); */ - if (rerr) - RevertFailure("rename %pd", a->src_dentry); -} - -static void au_ren_rev_whtmp(int err, struct au_ren_args *a) -{ - int rerr; - struct inode *delegated; - - a->h_path.dentry = vfsub_lkup_one(&a->dst_dentry->d_name, - a->dst_h_parent); - rerr = PTR_ERR(a->h_path.dentry); - if (IS_ERR(a->h_path.dentry)) { - RevertFailure("lkup one %pd", a->dst_dentry); - return; - } - if (d_is_positive(a->h_path.dentry)) { - d_drop(a->h_path.dentry); - dput(a->h_path.dentry); - return; - } - - delegated = NULL; - rerr = vfsub_rename(a->dst_h_dir, a->h_dst, a->dst_h_dir, &a->h_path, - &delegated); - if (unlikely(rerr == -EWOULDBLOCK)) { - pr_warn("cannot retry for NFSv4 delegation" - " for an internal rename\n"); - iput(delegated); - } - d_drop(a->h_path.dentry); - dput(a->h_path.dentry); - if (!rerr) - au_set_h_dptr(a->dst_dentry, a->btgt, dget(a->h_dst)); - else - RevertFailure("rename %pd", a->h_dst); -} - -static void au_ren_rev_whsrc(int err, struct au_ren_args *a) -{ - int rerr; - - a->h_path.dentry = a->src_wh_dentry; - rerr = au_wh_unlink_dentry(a->src_h_dir, &a->h_path, a->src_dentry); - au_set_dbwh(a->src_dentry, a->src_bwh); - if (rerr) - RevertFailure("unlink %pd", a->src_wh_dentry); -} -#undef RevertFailure - -/* ---------------------------------------------------------------------- */ - -/* - * when we have to copyup the renaming entry, do it with the rename-target name - * in order to minimize the cost (the later actual rename is unnecessary). - * otherwise rename it on the target branch. - */ -static int au_ren_or_cpup(struct au_ren_args *a) -{ - int err; - struct dentry *d; - struct inode *delegated; - - d = a->src_dentry; - if (au_dbstart(d) == a->btgt) { - a->h_path.dentry = a->dst_h_dentry; - if (au_ftest_ren(a->flags, DIROPQ) - && au_dbdiropq(d) == a->btgt) - au_fclr_ren(a->flags, DIROPQ); - AuDebugOn(au_dbstart(d) != a->btgt); - delegated = NULL; - err = vfsub_rename(a->src_h_dir, au_h_dptr(d, a->btgt), - a->dst_h_dir, &a->h_path, &delegated); - if (unlikely(err == -EWOULDBLOCK)) { - pr_warn("cannot retry for NFSv4 delegation" - " for an internal rename\n"); - iput(delegated); - } - } else - BUG(); - - if (!err && a->h_dst) - /* it will be set to dinfo later */ - dget(a->h_dst); - - return err; -} - -/* cf. aufs_rmdir() */ -static int au_ren_del_whtmp(struct au_ren_args *a) -{ - int err; - struct inode *dir; - - dir = a->dst_dir; - SiMustAnyLock(dir->i_sb); - if (!au_nhash_test_longer_wh(&a->whlist, a->btgt, - au_sbi(dir->i_sb)->si_dirwh) - || au_test_fs_remote(a->h_dst->d_sb)) { - err = au_whtmp_rmdir(dir, a->btgt, a->h_dst, &a->whlist); - if (unlikely(err)) - pr_warn("failed removing whtmp dir %pd (%d), " - "ignored.\n", a->h_dst, err); - } else { - au_nhash_wh_free(&a->thargs->whlist); - a->thargs->whlist = a->whlist; - a->whlist.nh_num = 0; - au_whtmp_kick_rmdir(dir, a->btgt, a->h_dst, a->thargs); - dput(a->h_dst); - a->thargs = NULL; - } - - return 0; -} - -/* make it 'opaque' dir. */ -static int au_ren_diropq(struct au_ren_args *a) -{ - int err; - struct dentry *diropq; - - err = 0; - a->src_bdiropq = au_dbdiropq(a->src_dentry); - a->src_hinode = au_hi(a->src_inode, a->btgt); - au_hn_imtx_lock_nested(a->src_hinode, AuLsc_I_CHILD); - diropq = au_diropq_create(a->src_dentry, a->btgt); - au_hn_imtx_unlock(a->src_hinode); - if (IS_ERR(diropq)) - err = PTR_ERR(diropq); - else - dput(diropq); - - return err; -} - -static int do_rename(struct au_ren_args *a) -{ - int err; - struct dentry *d, *h_d; - - /* prepare workqueue args for asynchronous rmdir */ - h_d = a->dst_h_dentry; - if (au_ftest_ren(a->flags, ISDIR) && d_is_positive(h_d)) { - err = -ENOMEM; - a->thargs = au_whtmp_rmdir_alloc(a->src_dentry->d_sb, GFP_NOFS); - if (unlikely(!a->thargs)) - goto out; - a->h_dst = dget(h_d); - } - - /* create whiteout for src_dentry */ - if (au_ftest_ren(a->flags, WHSRC)) { - a->src_bwh = au_dbwh(a->src_dentry); - AuDebugOn(a->src_bwh >= 0); - a->src_wh_dentry - = au_wh_create(a->src_dentry, a->btgt, a->src_h_parent); - err = PTR_ERR(a->src_wh_dentry); - if (IS_ERR(a->src_wh_dentry)) - goto out_thargs; - } - - /* lookup whiteout for dentry */ - if (au_ftest_ren(a->flags, WHDST)) { - h_d = au_wh_lkup(a->dst_h_parent, &a->dst_dentry->d_name, - a->br); - err = PTR_ERR(h_d); - if (IS_ERR(h_d)) - goto out_whsrc; - if (d_is_negative(h_d)) - dput(h_d); - else - a->dst_wh_dentry = h_d; - } - - /* rename dentry to tmpwh */ - if (a->thargs) { - err = au_whtmp_ren(a->dst_h_dentry, a->br); - if (unlikely(err)) - goto out_whdst; - - d = a->dst_dentry; - au_set_h_dptr(d, a->btgt, NULL); - err = au_lkup_neg(d, a->btgt, /*wh*/0); - if (unlikely(err)) - goto out_whtmp; - a->dst_h_dentry = au_h_dptr(d, a->btgt); - } - - BUG_ON(d_is_positive(a->dst_h_dentry) && a->src_bstart != a->btgt); - - /* rename by vfs_rename or cpup */ - d = a->dst_dentry; - if (au_ftest_ren(a->flags, ISDIR) - && (a->dst_wh_dentry - || au_dbdiropq(d) == a->btgt - /* hide the lower to keep xino */ - || a->btgt < au_dbend(d) - || au_opt_test(au_mntflags(d->d_sb), ALWAYS_DIROPQ))) - au_fset_ren(a->flags, DIROPQ); - err = au_ren_or_cpup(a); - if (unlikely(err)) - /* leave the copied-up one */ - goto out_whtmp; - - /* make dir opaque */ - if (au_ftest_ren(a->flags, DIROPQ)) { - err = au_ren_diropq(a); - if (unlikely(err)) - goto out_rename; - } - - /* update target timestamps */ - AuDebugOn(au_dbstart(a->src_dentry) != a->btgt); - a->h_path.dentry = au_h_dptr(a->src_dentry, a->btgt); - vfsub_update_h_iattr(&a->h_path, /*did*/NULL); /*ignore*/ - a->src_inode->i_ctime = d_inode(a->h_path.dentry)->i_ctime; - - /* remove whiteout for dentry */ - if (a->dst_wh_dentry) { - a->h_path.dentry = a->dst_wh_dentry; - err = au_wh_unlink_dentry(a->dst_h_dir, &a->h_path, - a->dst_dentry); - if (unlikely(err)) - goto out_diropq; - } - - /* remove whtmp */ - if (a->thargs) - au_ren_del_whtmp(a); /* ignore this error */ - - au_fhsm_wrote(a->src_dentry->d_sb, a->btgt, /*force*/0); - err = 0; - goto out_success; - -out_diropq: - if (au_ftest_ren(a->flags, DIROPQ)) - au_ren_rev_diropq(err, a); -out_rename: - au_ren_rev_rename(err, a); - dput(a->h_dst); -out_whtmp: - if (a->thargs) - au_ren_rev_whtmp(err, a); -out_whdst: - dput(a->dst_wh_dentry); - a->dst_wh_dentry = NULL; -out_whsrc: - if (a->src_wh_dentry) - au_ren_rev_whsrc(err, a); -out_success: - dput(a->src_wh_dentry); - dput(a->dst_wh_dentry); -out_thargs: - if (a->thargs) { - dput(a->h_dst); - au_whtmp_rmdir_free(a->thargs); - a->thargs = NULL; - } -out: - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* - * test if @dentry dir can be rename destination or not. - * success means, it is a logically empty dir. - */ -static int may_rename_dstdir(struct dentry *dentry, struct au_nhash *whlist) -{ - return au_test_empty(dentry, whlist); -} - -/* - * test if @dentry dir can be rename source or not. - * if it can, return 0 and @children is filled. - * success means, - * - it is a logically empty dir. - * - or, it exists on writable branch and has no children including whiteouts - * on the lower branch. - */ -static int may_rename_srcdir(struct dentry *dentry, aufs_bindex_t btgt) -{ - int err; - unsigned int rdhash; - aufs_bindex_t bstart; - - bstart = au_dbstart(dentry); - if (bstart != btgt) { - struct au_nhash whlist; - - SiMustAnyLock(dentry->d_sb); - rdhash = au_sbi(dentry->d_sb)->si_rdhash; - if (!rdhash) - rdhash = au_rdhash_est(au_dir_size(/*file*/NULL, - dentry)); - err = au_nhash_alloc(&whlist, rdhash, GFP_NOFS); - if (unlikely(err)) - goto out; - err = au_test_empty(dentry, &whlist); - au_nhash_wh_free(&whlist); - goto out; - } - - if (bstart == au_dbtaildir(dentry)) - return 0; /* success */ - - err = au_test_empty_lower(dentry); - -out: - if (err == -ENOTEMPTY) { - AuWarn1("renaming dir who has child(ren) on multiple branches," - " is not supported\n"); - err = -EXDEV; - } - return err; -} - -/* side effect: sets whlist and h_dentry */ -static int au_ren_may_dir(struct au_ren_args *a) -{ - int err; - unsigned int rdhash; - struct dentry *d; - - d = a->dst_dentry; - SiMustAnyLock(d->d_sb); - - err = 0; - if (au_ftest_ren(a->flags, ISDIR) && a->dst_inode) { - rdhash = au_sbi(d->d_sb)->si_rdhash; - if (!rdhash) - rdhash = au_rdhash_est(au_dir_size(/*file*/NULL, d)); - err = au_nhash_alloc(&a->whlist, rdhash, GFP_NOFS); - if (unlikely(err)) - goto out; - - au_set_dbstart(d, a->dst_bstart); - err = may_rename_dstdir(d, &a->whlist); - au_set_dbstart(d, a->btgt); - } - a->dst_h_dentry = au_h_dptr(d, au_dbstart(d)); - if (unlikely(err)) - goto out; - - d = a->src_dentry; - a->src_h_dentry = au_h_dptr(d, au_dbstart(d)); - if (au_ftest_ren(a->flags, ISDIR)) { - err = may_rename_srcdir(d, a->btgt); - if (unlikely(err)) { - au_nhash_wh_free(&a->whlist); - a->whlist.nh_num = 0; - } - } -out: - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* - * simple tests for rename. - * following the checks in vfs, plus the parent-child relationship. - */ -static int au_may_ren(struct au_ren_args *a) -{ - int err, isdir; - struct inode *h_inode; - - if (a->src_bstart == a->btgt) { - err = au_may_del(a->src_dentry, a->btgt, a->src_h_parent, - au_ftest_ren(a->flags, ISDIR)); - if (unlikely(err)) - goto out; - err = -EINVAL; - if (unlikely(a->src_h_dentry == a->h_trap)) - goto out; - } - - err = 0; - if (a->dst_bstart != a->btgt) - goto out; - - err = -ENOTEMPTY; - if (unlikely(a->dst_h_dentry == a->h_trap)) - goto out; - - err = -EIO; - isdir = !!au_ftest_ren(a->flags, ISDIR); - if (d_really_is_negative(a->dst_dentry)) { - if (d_is_negative(a->dst_h_dentry)) - err = au_may_add(a->dst_dentry, a->btgt, - a->dst_h_parent, isdir); - } else { - if (unlikely(d_is_negative(a->dst_h_dentry))) - goto out; - h_inode = d_inode(a->dst_h_dentry); - if (h_inode->i_nlink) - err = au_may_del(a->dst_dentry, a->btgt, - a->dst_h_parent, isdir); - } - -out: - if (unlikely(err == -ENOENT || err == -EEXIST)) - err = -EIO; - AuTraceErr(err); - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* - * locking order - * (VFS) - * - src_dir and dir by lock_rename() - * - inode if exitsts - * (aufs) - * - lock all - * + src_dentry and dentry by aufs_read_and_write_lock2() which calls, - * + si_read_lock - * + di_write_lock2_child() - * + di_write_lock_child() - * + ii_write_lock_child() - * + di_write_lock_child2() - * + ii_write_lock_child2() - * + src_parent and parent - * + di_write_lock_parent() - * + ii_write_lock_parent() - * + di_write_lock_parent2() - * + ii_write_lock_parent2() - * + lower src_dir and dir by vfsub_lock_rename() - * + verify the every relationships between child and parent. if any - * of them failed, unlock all and return -EBUSY. - */ -static void au_ren_unlock(struct au_ren_args *a) -{ - vfsub_unlock_rename(a->src_h_parent, a->src_hdir, - a->dst_h_parent, a->dst_hdir); - if (au_ftest_ren(a->flags, MNT_WRITE)) - vfsub_mnt_drop_write(au_br_mnt(a->br)); -} - -static int au_ren_lock(struct au_ren_args *a) -{ - int err; - unsigned int udba; - - err = 0; - a->src_h_parent = au_h_dptr(a->src_parent, a->btgt); - a->src_hdir = au_hi(a->src_dir, a->btgt); - a->dst_h_parent = au_h_dptr(a->dst_parent, a->btgt); - a->dst_hdir = au_hi(a->dst_dir, a->btgt); - - err = vfsub_mnt_want_write(au_br_mnt(a->br)); - if (unlikely(err)) - goto out; - au_fset_ren(a->flags, MNT_WRITE); - a->h_trap = vfsub_lock_rename(a->src_h_parent, a->src_hdir, - a->dst_h_parent, a->dst_hdir); - udba = au_opt_udba(a->src_dentry->d_sb); - if (unlikely(a->src_hdir->hi_inode != d_inode(a->src_h_parent) - || a->dst_hdir->hi_inode != d_inode(a->dst_h_parent))) - err = au_busy_or_stale(); - if (!err && au_dbstart(a->src_dentry) == a->btgt) - err = au_h_verify(a->src_h_dentry, udba, - d_inode(a->src_h_parent), a->src_h_parent, - a->br); - if (!err && au_dbstart(a->dst_dentry) == a->btgt) - err = au_h_verify(a->dst_h_dentry, udba, - d_inode(a->dst_h_parent), a->dst_h_parent, - a->br); - if (!err) - goto out; /* success */ - - err = au_busy_or_stale(); - au_ren_unlock(a); - -out: - return err; -} - -/* ---------------------------------------------------------------------- */ - -static void au_ren_refresh_dir(struct au_ren_args *a) -{ - struct inode *dir; - - dir = a->dst_dir; - dir->i_version++; - if (au_ftest_ren(a->flags, ISDIR)) { - /* is this updating defined in POSIX? */ - au_cpup_attr_timesizes(a->src_inode); - au_cpup_attr_nlink(dir, /*force*/1); - } - - au_dir_ts(dir, a->btgt); - - if (au_ftest_ren(a->flags, ISSAMEDIR)) - return; - - dir = a->src_dir; - dir->i_version++; - if (au_ftest_ren(a->flags, ISDIR)) - au_cpup_attr_nlink(dir, /*force*/1); - au_dir_ts(dir, a->btgt); -} - -static void au_ren_refresh(struct au_ren_args *a) -{ - aufs_bindex_t bend, bindex; - struct dentry *d, *h_d; - struct inode *i, *h_i; - struct super_block *sb; - - d = a->dst_dentry; - d_drop(d); - if (a->h_dst) - /* already dget-ed by au_ren_or_cpup() */ - au_set_h_dptr(d, a->btgt, a->h_dst); - - i = a->dst_inode; - if (i) { - if (!au_ftest_ren(a->flags, ISDIR)) - vfsub_drop_nlink(i); - else { - vfsub_dead_dir(i); - au_cpup_attr_timesizes(i); - } - au_update_dbrange(d, /*do_put_zero*/1); - } else { - bend = a->btgt; - for (bindex = au_dbstart(d); bindex < bend; bindex++) - au_set_h_dptr(d, bindex, NULL); - bend = au_dbend(d); - for (bindex = a->btgt + 1; bindex <= bend; bindex++) - au_set_h_dptr(d, bindex, NULL); - au_update_dbrange(d, /*do_put_zero*/0); - } - - d = a->src_dentry; - au_set_dbwh(d, -1); - bend = au_dbend(d); - for (bindex = a->btgt + 1; bindex <= bend; bindex++) { - h_d = au_h_dptr(d, bindex); - if (h_d) - au_set_h_dptr(d, bindex, NULL); - } - au_set_dbend(d, a->btgt); - - sb = d->d_sb; - i = a->src_inode; - if (au_opt_test(au_mntflags(sb), PLINK) && au_plink_test(i)) - return; /* success */ - - bend = au_ibend(i); - for (bindex = a->btgt + 1; bindex <= bend; bindex++) { - h_i = au_h_iptr(i, bindex); - if (h_i) { - au_xino_write(sb, bindex, h_i->i_ino, /*ino*/0); - /* ignore this error */ - au_set_h_iptr(i, bindex, NULL, 0); - } - } - au_set_ibend(i, a->btgt); -} - -/* ---------------------------------------------------------------------- */ - -/* mainly for link(2) and rename(2) */ -int au_wbr(struct dentry *dentry, aufs_bindex_t btgt) -{ - aufs_bindex_t bdiropq, bwh; - struct dentry *parent; - struct au_branch *br; - - parent = dentry->d_parent; - IMustLock(d_inode(parent)); /* dir is locked */ - - bdiropq = au_dbdiropq(parent); - bwh = au_dbwh(dentry); - br = au_sbr(dentry->d_sb, btgt); - if (au_br_rdonly(br) - || (0 <= bdiropq && bdiropq < btgt) - || (0 <= bwh && bwh < btgt)) - btgt = -1; - - AuDbg("btgt %d\n", btgt); - return btgt; -} - -/* sets src_bstart, dst_bstart and btgt */ -static int au_ren_wbr(struct au_ren_args *a) -{ - int err; - struct au_wr_dir_args wr_dir_args = { - /* .force_btgt = -1, */ - .flags = AuWrDir_ADD_ENTRY - }; - - a->src_bstart = au_dbstart(a->src_dentry); - a->dst_bstart = au_dbstart(a->dst_dentry); - if (au_ftest_ren(a->flags, ISDIR)) - au_fset_wrdir(wr_dir_args.flags, ISDIR); - wr_dir_args.force_btgt = a->src_bstart; - if (a->dst_inode && a->dst_bstart < a->src_bstart) - wr_dir_args.force_btgt = a->dst_bstart; - wr_dir_args.force_btgt = au_wbr(a->dst_dentry, wr_dir_args.force_btgt); - err = au_wr_dir(a->dst_dentry, a->src_dentry, &wr_dir_args); - a->btgt = err; - - return err; -} - -static void au_ren_dt(struct au_ren_args *a) -{ - a->h_path.dentry = a->src_h_parent; - au_dtime_store(a->src_dt + AuPARENT, a->src_parent, &a->h_path); - if (!au_ftest_ren(a->flags, ISSAMEDIR)) { - a->h_path.dentry = a->dst_h_parent; - au_dtime_store(a->dst_dt + AuPARENT, a->dst_parent, &a->h_path); - } - - au_fclr_ren(a->flags, DT_DSTDIR); - if (!au_ftest_ren(a->flags, ISDIR)) - return; - - a->h_path.dentry = a->src_h_dentry; - au_dtime_store(a->src_dt + AuCHILD, a->src_dentry, &a->h_path); - if (d_is_positive(a->dst_h_dentry)) { - au_fset_ren(a->flags, DT_DSTDIR); - a->h_path.dentry = a->dst_h_dentry; - au_dtime_store(a->dst_dt + AuCHILD, a->dst_dentry, &a->h_path); - } -} - -static void au_ren_rev_dt(int err, struct au_ren_args *a) -{ - struct dentry *h_d; - struct mutex *h_mtx; - - au_dtime_revert(a->src_dt + AuPARENT); - if (!au_ftest_ren(a->flags, ISSAMEDIR)) - au_dtime_revert(a->dst_dt + AuPARENT); - - if (au_ftest_ren(a->flags, ISDIR) && err != -EIO) { - h_d = a->src_dt[AuCHILD].dt_h_path.dentry; - h_mtx = &d_inode(h_d)->i_mutex; - mutex_lock_nested(h_mtx, AuLsc_I_CHILD); - au_dtime_revert(a->src_dt + AuCHILD); - mutex_unlock(h_mtx); - - if (au_ftest_ren(a->flags, DT_DSTDIR)) { - h_d = a->dst_dt[AuCHILD].dt_h_path.dentry; - h_mtx = &d_inode(h_d)->i_mutex; - mutex_lock_nested(h_mtx, AuLsc_I_CHILD); - au_dtime_revert(a->dst_dt + AuCHILD); - mutex_unlock(h_mtx); - } - } -} - -/* ---------------------------------------------------------------------- */ - -int aufs_rename(struct inode *_src_dir, struct dentry *_src_dentry, - struct inode *_dst_dir, struct dentry *_dst_dentry) -{ - int err, flags; - /* reduce stack space */ - struct au_ren_args *a; - - AuDbg("%pd, %pd\n", _src_dentry, _dst_dentry); - IMustLock(_src_dir); - IMustLock(_dst_dir); - - err = -ENOMEM; - BUILD_BUG_ON(sizeof(*a) > PAGE_SIZE); - a = kzalloc(sizeof(*a), GFP_NOFS); - if (unlikely(!a)) - goto out; - - a->src_dir = _src_dir; - a->src_dentry = _src_dentry; - a->src_inode = NULL; - if (d_really_is_positive(a->src_dentry)) - a->src_inode = d_inode(a->src_dentry); - a->src_parent = a->src_dentry->d_parent; /* dir inode is locked */ - a->dst_dir = _dst_dir; - a->dst_dentry = _dst_dentry; - a->dst_inode = NULL; - if (d_really_is_positive(a->dst_dentry)) - a->dst_inode = d_inode(a->dst_dentry); - a->dst_parent = a->dst_dentry->d_parent; /* dir inode is locked */ - if (a->dst_inode) { - IMustLock(a->dst_inode); - au_igrab(a->dst_inode); - } - - err = -ENOTDIR; - flags = AuLock_FLUSH | AuLock_NOPLM | AuLock_GEN; - if (d_is_dir(a->src_dentry)) { - au_fset_ren(a->flags, ISDIR); - if (unlikely(d_really_is_positive(a->dst_dentry) - && !d_is_dir(a->dst_dentry))) - goto out_free; - err = aufs_read_and_write_lock2(a->dst_dentry, a->src_dentry, - AuLock_DIR | flags); - } else - err = aufs_read_and_write_lock2(a->dst_dentry, a->src_dentry, - flags); - if (unlikely(err)) - goto out_free; - - err = au_d_hashed_positive(a->src_dentry); - if (unlikely(err)) - goto out_unlock; - err = -ENOENT; - if (a->dst_inode) { - /* - * If it is a dir, VFS unhash dst_dentry before this - * function. It means we cannot rely upon d_unhashed(). - */ - if (unlikely(!a->dst_inode->i_nlink)) - goto out_unlock; - if (!S_ISDIR(a->dst_inode->i_mode)) { - err = au_d_hashed_positive(a->dst_dentry); - if (unlikely(err)) - goto out_unlock; - } else if (unlikely(IS_DEADDIR(a->dst_inode))) - goto out_unlock; - } else if (unlikely(d_unhashed(a->dst_dentry))) - goto out_unlock; - - /* - * is it possible? - * yes, it happend (in linux-3.3-rcN) but I don't know why. - * there may exist a problem somewhere else. - */ - err = -EINVAL; - if (unlikely(d_inode(a->dst_parent) == d_inode(a->src_dentry))) - goto out_unlock; - - au_fset_ren(a->flags, ISSAMEDIR); /* temporary */ - di_write_lock_parent(a->dst_parent); - - /* which branch we process */ - err = au_ren_wbr(a); - if (unlikely(err < 0)) - goto out_parent; - a->br = au_sbr(a->dst_dentry->d_sb, a->btgt); - a->h_path.mnt = au_br_mnt(a->br); - - /* are they available to be renamed */ - err = au_ren_may_dir(a); - if (unlikely(err)) - goto out_children; - - /* prepare the writable parent dir on the same branch */ - if (a->dst_bstart == a->btgt) { - au_fset_ren(a->flags, WHDST); - } else { - err = au_cpup_dirs(a->dst_dentry, a->btgt); - if (unlikely(err)) - goto out_children; - } - - if (a->src_dir != a->dst_dir) { - /* - * this temporary unlock is safe, - * because both dir->i_mutex are locked. - */ - di_write_unlock(a->dst_parent); - di_write_lock_parent(a->src_parent); - err = au_wr_dir_need_wh(a->src_dentry, - au_ftest_ren(a->flags, ISDIR), - &a->btgt); - di_write_unlock(a->src_parent); - di_write_lock2_parent(a->src_parent, a->dst_parent, /*isdir*/1); - au_fclr_ren(a->flags, ISSAMEDIR); - } else - err = au_wr_dir_need_wh(a->src_dentry, - au_ftest_ren(a->flags, ISDIR), - &a->btgt); - if (unlikely(err < 0)) - goto out_children; - if (err) - au_fset_ren(a->flags, WHSRC); - - /* cpup src */ - if (a->src_bstart != a->btgt) { - struct au_pin pin; - - err = au_pin(&pin, a->src_dentry, a->btgt, - au_opt_udba(a->src_dentry->d_sb), - AuPin_DI_LOCKED | AuPin_MNT_WRITE); - if (!err) { - struct au_cp_generic cpg = { - .dentry = a->src_dentry, - .bdst = a->btgt, - .bsrc = a->src_bstart, - .len = -1, - .pin = &pin, - .flags = AuCpup_DTIME | AuCpup_HOPEN - }; - AuDebugOn(au_dbstart(a->src_dentry) != a->src_bstart); - err = au_sio_cpup_simple(&cpg); - au_unpin(&pin); - } - if (unlikely(err)) - goto out_children; - a->src_bstart = a->btgt; - a->src_h_dentry = au_h_dptr(a->src_dentry, a->btgt); - au_fset_ren(a->flags, WHSRC); - } - - /* lock them all */ - err = au_ren_lock(a); - if (unlikely(err)) - /* leave the copied-up one */ - goto out_children; - - if (!au_opt_test(au_mntflags(a->dst_dir->i_sb), UDBA_NONE)) - err = au_may_ren(a); - else if (unlikely(a->dst_dentry->d_name.len > AUFS_MAX_NAMELEN)) - err = -ENAMETOOLONG; - if (unlikely(err)) - goto out_hdir; - - /* store timestamps to be revertible */ - au_ren_dt(a); - - /* here we go */ - err = do_rename(a); - if (unlikely(err)) - goto out_dt; - - /* update dir attributes */ - au_ren_refresh_dir(a); - - /* dput/iput all lower dentries */ - au_ren_refresh(a); - - goto out_hdir; /* success */ - -out_dt: - au_ren_rev_dt(err, a); -out_hdir: - au_ren_unlock(a); -out_children: - au_nhash_wh_free(&a->whlist); - if (err && a->dst_inode && a->dst_bstart != a->btgt) { - AuDbg("bstart %d, btgt %d\n", a->dst_bstart, a->btgt); - au_set_h_dptr(a->dst_dentry, a->btgt, NULL); - au_set_dbstart(a->dst_dentry, a->dst_bstart); - } -out_parent: - if (!err) - d_move(a->src_dentry, a->dst_dentry); - else { - au_update_dbstart(a->dst_dentry); - if (!a->dst_inode) - d_drop(a->dst_dentry); - } - if (au_ftest_ren(a->flags, ISSAMEDIR)) - di_write_unlock(a->dst_parent); - else - di_write_unlock2(a->src_parent, a->dst_parent); -out_unlock: - aufs_read_and_write_unlock2(a->dst_dentry, a->src_dentry); -out_free: - iput(a->dst_inode); - if (a->thargs) - au_whtmp_rmdir_free(a->thargs); - kfree(a); -out: - AuTraceErr(err); - return err; -} diff --git a/fs/aufs/iinfo.c b/fs/aufs/iinfo.c deleted file mode 100644 index 0eaa7de7e..000000000 --- a/fs/aufs/iinfo.c +++ /dev/null @@ -1,277 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * inode private data - */ - -#include "aufs.h" - -struct inode *au_h_iptr(struct inode *inode, aufs_bindex_t bindex) -{ - struct inode *h_inode; - - IiMustAnyLock(inode); - - h_inode = au_ii(inode)->ii_hinode[0 + bindex].hi_inode; - AuDebugOn(h_inode && atomic_read(&h_inode->i_count) <= 0); - return h_inode; -} - -/* todo: hard/soft set? */ -void au_hiput(struct au_hinode *hinode) -{ - au_hn_free(hinode); - dput(hinode->hi_whdentry); - iput(hinode->hi_inode); -} - -unsigned int au_hi_flags(struct inode *inode, int isdir) -{ - unsigned int flags; - const unsigned int mnt_flags = au_mntflags(inode->i_sb); - - flags = 0; - if (au_opt_test(mnt_flags, XINO)) - au_fset_hi(flags, XINO); - if (isdir && au_opt_test(mnt_flags, UDBA_HNOTIFY)) - au_fset_hi(flags, HNOTIFY); - return flags; -} - -void au_set_h_iptr(struct inode *inode, aufs_bindex_t bindex, - struct inode *h_inode, unsigned int flags) -{ - struct au_hinode *hinode; - struct inode *hi; - struct au_iinfo *iinfo = au_ii(inode); - - IiMustWriteLock(inode); - - hinode = iinfo->ii_hinode + bindex; - hi = hinode->hi_inode; - AuDebugOn(h_inode && atomic_read(&h_inode->i_count) <= 0); - - if (hi) - au_hiput(hinode); - hinode->hi_inode = h_inode; - if (h_inode) { - int err; - struct super_block *sb = inode->i_sb; - struct au_branch *br; - - AuDebugOn(inode->i_mode - && (h_inode->i_mode & S_IFMT) - != (inode->i_mode & S_IFMT)); - if (bindex == iinfo->ii_bstart) - au_cpup_igen(inode, h_inode); - br = au_sbr(sb, bindex); - hinode->hi_id = br->br_id; - if (au_ftest_hi(flags, XINO)) { - err = au_xino_write(sb, bindex, h_inode->i_ino, - inode->i_ino); - if (unlikely(err)) - AuIOErr1("failed au_xino_write() %d\n", err); - } - - if (au_ftest_hi(flags, HNOTIFY) - && au_br_hnotifyable(br->br_perm)) { - err = au_hn_alloc(hinode, inode); - if (unlikely(err)) - AuIOErr1("au_hn_alloc() %d\n", err); - } - } -} - -void au_set_hi_wh(struct inode *inode, aufs_bindex_t bindex, - struct dentry *h_wh) -{ - struct au_hinode *hinode; - - IiMustWriteLock(inode); - - hinode = au_ii(inode)->ii_hinode + bindex; - AuDebugOn(hinode->hi_whdentry); - hinode->hi_whdentry = h_wh; -} - -void au_update_iigen(struct inode *inode, int half) -{ - struct au_iinfo *iinfo; - struct au_iigen *iigen; - unsigned int sigen; - - sigen = au_sigen(inode->i_sb); - iinfo = au_ii(inode); - iigen = &iinfo->ii_generation; - spin_lock(&iinfo->ii_genspin); - iigen->ig_generation = sigen; - if (half) - au_ig_fset(iigen->ig_flags, HALF_REFRESHED); - else - au_ig_fclr(iigen->ig_flags, HALF_REFRESHED); - spin_unlock(&iinfo->ii_genspin); -} - -/* it may be called at remount time, too */ -void au_update_ibrange(struct inode *inode, int do_put_zero) -{ - struct au_iinfo *iinfo; - aufs_bindex_t bindex, bend; - - iinfo = au_ii(inode); - if (!iinfo) - return; - - IiMustWriteLock(inode); - - if (do_put_zero && iinfo->ii_bstart >= 0) { - for (bindex = iinfo->ii_bstart; bindex <= iinfo->ii_bend; - bindex++) { - struct inode *h_i; - - h_i = iinfo->ii_hinode[0 + bindex].hi_inode; - if (h_i - && !h_i->i_nlink - && !(h_i->i_state & I_LINKABLE)) - au_set_h_iptr(inode, bindex, NULL, 0); - } - } - - iinfo->ii_bstart = -1; - iinfo->ii_bend = -1; - bend = au_sbend(inode->i_sb); - for (bindex = 0; bindex <= bend; bindex++) - if (iinfo->ii_hinode[0 + bindex].hi_inode) { - iinfo->ii_bstart = bindex; - break; - } - if (iinfo->ii_bstart >= 0) - for (bindex = bend; bindex >= iinfo->ii_bstart; bindex--) - if (iinfo->ii_hinode[0 + bindex].hi_inode) { - iinfo->ii_bend = bindex; - break; - } - AuDebugOn(iinfo->ii_bstart > iinfo->ii_bend); -} - -/* ---------------------------------------------------------------------- */ - -void au_icntnr_init_once(void *_c) -{ - struct au_icntnr *c = _c; - struct au_iinfo *iinfo = &c->iinfo; - static struct lock_class_key aufs_ii; - - spin_lock_init(&iinfo->ii_genspin); - au_rw_init(&iinfo->ii_rwsem); - au_rw_class(&iinfo->ii_rwsem, &aufs_ii); - inode_init_once(&c->vfs_inode); -} - -int au_iinfo_init(struct inode *inode) -{ - struct au_iinfo *iinfo; - struct super_block *sb; - int nbr, i; - - sb = inode->i_sb; - iinfo = &(container_of(inode, struct au_icntnr, vfs_inode)->iinfo); - nbr = au_sbend(sb) + 1; - if (unlikely(nbr <= 0)) - nbr = 1; - iinfo->ii_hinode = kcalloc(nbr, sizeof(*iinfo->ii_hinode), GFP_NOFS); - if (iinfo->ii_hinode) { - au_ninodes_inc(sb); - for (i = 0; i < nbr; i++) - iinfo->ii_hinode[i].hi_id = -1; - - iinfo->ii_generation.ig_generation = au_sigen(sb); - iinfo->ii_bstart = -1; - iinfo->ii_bend = -1; - iinfo->ii_vdir = NULL; - return 0; - } - return -ENOMEM; -} - -int au_ii_realloc(struct au_iinfo *iinfo, int nbr) -{ - int err, sz; - struct au_hinode *hip; - - AuRwMustWriteLock(&iinfo->ii_rwsem); - - err = -ENOMEM; - sz = sizeof(*hip) * (iinfo->ii_bend + 1); - if (!sz) - sz = sizeof(*hip); - hip = au_kzrealloc(iinfo->ii_hinode, sz, sizeof(*hip) * nbr, GFP_NOFS); - if (hip) { - iinfo->ii_hinode = hip; - err = 0; - } - - return err; -} - -void au_iinfo_fin(struct inode *inode) -{ - struct au_iinfo *iinfo; - struct au_hinode *hi; - struct super_block *sb; - aufs_bindex_t bindex, bend; - const unsigned char unlinked = !inode->i_nlink; - - iinfo = au_ii(inode); - /* bad_inode case */ - if (!iinfo) - return; - - sb = inode->i_sb; - au_ninodes_dec(sb); - if (si_pid_test(sb)) - au_xino_delete_inode(inode, unlinked); - else { - /* - * it is safe to hide the dependency between sbinfo and - * sb->s_umount. - */ - lockdep_off(); - si_noflush_read_lock(sb); - au_xino_delete_inode(inode, unlinked); - si_read_unlock(sb); - lockdep_on(); - } - - if (iinfo->ii_vdir) - au_vdir_free(iinfo->ii_vdir); - - bindex = iinfo->ii_bstart; - if (bindex >= 0) { - hi = iinfo->ii_hinode + bindex; - bend = iinfo->ii_bend; - while (bindex++ <= bend) { - if (hi->hi_inode) - au_hiput(hi); - hi++; - } - } - kfree(iinfo->ii_hinode); - iinfo->ii_hinode = NULL; - AuRwDestroy(&iinfo->ii_rwsem); -} diff --git a/fs/aufs/inode.c b/fs/aufs/inode.c deleted file mode 100644 index 201e112a0..000000000 --- a/fs/aufs/inode.c +++ /dev/null @@ -1,500 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * inode functions - */ - -#include "aufs.h" - -struct inode *au_igrab(struct inode *inode) -{ - if (inode) { - AuDebugOn(!atomic_read(&inode->i_count)); - ihold(inode); - } - return inode; -} - -static void au_refresh_hinode_attr(struct inode *inode, int do_version) -{ - au_cpup_attr_all(inode, /*force*/0); - au_update_iigen(inode, /*half*/1); - if (do_version) - inode->i_version++; -} - -static int au_ii_refresh(struct inode *inode, int *update) -{ - int err, e; - umode_t type; - aufs_bindex_t bindex, new_bindex; - struct super_block *sb; - struct au_iinfo *iinfo; - struct au_hinode *p, *q, tmp; - - IiMustWriteLock(inode); - - *update = 0; - sb = inode->i_sb; - type = inode->i_mode & S_IFMT; - iinfo = au_ii(inode); - err = au_ii_realloc(iinfo, au_sbend(sb) + 1); - if (unlikely(err)) - goto out; - - AuDebugOn(iinfo->ii_bstart < 0); - p = iinfo->ii_hinode + iinfo->ii_bstart; - for (bindex = iinfo->ii_bstart; bindex <= iinfo->ii_bend; - bindex++, p++) { - if (!p->hi_inode) - continue; - - AuDebugOn(type != (p->hi_inode->i_mode & S_IFMT)); - new_bindex = au_br_index(sb, p->hi_id); - if (new_bindex == bindex) - continue; - - if (new_bindex < 0) { - *update = 1; - au_hiput(p); - p->hi_inode = NULL; - continue; - } - - if (new_bindex < iinfo->ii_bstart) - iinfo->ii_bstart = new_bindex; - if (iinfo->ii_bend < new_bindex) - iinfo->ii_bend = new_bindex; - /* swap two lower inode, and loop again */ - q = iinfo->ii_hinode + new_bindex; - tmp = *q; - *q = *p; - *p = tmp; - if (tmp.hi_inode) { - bindex--; - p--; - } - } - au_update_ibrange(inode, /*do_put_zero*/0); - e = au_dy_irefresh(inode); - if (unlikely(e && !err)) - err = e; - -out: - AuTraceErr(err); - return err; -} - -int au_refresh_hinode_self(struct inode *inode) -{ - int err, update; - - err = au_ii_refresh(inode, &update); - if (!err) - au_refresh_hinode_attr(inode, update && S_ISDIR(inode->i_mode)); - - AuTraceErr(err); - return err; -} - -int au_refresh_hinode(struct inode *inode, struct dentry *dentry) -{ - int err, e, update; - unsigned int flags; - umode_t mode; - aufs_bindex_t bindex, bend; - unsigned char isdir; - struct au_hinode *p; - struct au_iinfo *iinfo; - - err = au_ii_refresh(inode, &update); - if (unlikely(err)) - goto out; - - update = 0; - iinfo = au_ii(inode); - p = iinfo->ii_hinode + iinfo->ii_bstart; - mode = (inode->i_mode & S_IFMT); - isdir = S_ISDIR(mode); - flags = au_hi_flags(inode, isdir); - bend = au_dbend(dentry); - for (bindex = au_dbstart(dentry); bindex <= bend; bindex++) { - struct inode *h_i, *h_inode; - struct dentry *h_d; - - h_d = au_h_dptr(dentry, bindex); - if (!h_d || d_is_negative(h_d)) - continue; - - h_inode = d_inode(h_d); - AuDebugOn(mode != (h_inode->i_mode & S_IFMT)); - if (iinfo->ii_bstart <= bindex && bindex <= iinfo->ii_bend) { - h_i = au_h_iptr(inode, bindex); - if (h_i) { - if (h_i == h_inode) - continue; - err = -EIO; - break; - } - } - if (bindex < iinfo->ii_bstart) - iinfo->ii_bstart = bindex; - if (iinfo->ii_bend < bindex) - iinfo->ii_bend = bindex; - au_set_h_iptr(inode, bindex, au_igrab(h_inode), flags); - update = 1; - } - au_update_ibrange(inode, /*do_put_zero*/0); - e = au_dy_irefresh(inode); - if (unlikely(e && !err)) - err = e; - if (!err) - au_refresh_hinode_attr(inode, update && isdir); - -out: - AuTraceErr(err); - return err; -} - -static int set_inode(struct inode *inode, struct dentry *dentry) -{ - int err; - unsigned int flags; - umode_t mode; - aufs_bindex_t bindex, bstart, btail; - unsigned char isdir; - struct dentry *h_dentry; - struct inode *h_inode; - struct au_iinfo *iinfo; - - IiMustWriteLock(inode); - - err = 0; - isdir = 0; - bstart = au_dbstart(dentry); - h_dentry = au_h_dptr(dentry, bstart); - h_inode = d_inode(h_dentry); - mode = h_inode->i_mode; - switch (mode & S_IFMT) { - case S_IFREG: - btail = au_dbtail(dentry); - inode->i_op = &aufs_iop; - inode->i_fop = &aufs_file_fop; - err = au_dy_iaop(inode, bstart, h_inode); - if (unlikely(err)) - goto out; - break; - case S_IFDIR: - isdir = 1; - btail = au_dbtaildir(dentry); - inode->i_op = &aufs_dir_iop; - inode->i_fop = &aufs_dir_fop; - break; - case S_IFLNK: - btail = au_dbtail(dentry); - inode->i_op = &aufs_symlink_iop; - break; - case S_IFBLK: - case S_IFCHR: - case S_IFIFO: - case S_IFSOCK: - btail = au_dbtail(dentry); - inode->i_op = &aufs_iop; - init_special_inode(inode, mode, h_inode->i_rdev); - break; - default: - AuIOErr("Unknown file type 0%o\n", mode); - err = -EIO; - goto out; - } - - /* do not set hnotify for whiteouted dirs (SHWH mode) */ - flags = au_hi_flags(inode, isdir); - if (au_opt_test(au_mntflags(dentry->d_sb), SHWH) - && au_ftest_hi(flags, HNOTIFY) - && dentry->d_name.len > AUFS_WH_PFX_LEN - && !memcmp(dentry->d_name.name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) - au_fclr_hi(flags, HNOTIFY); - iinfo = au_ii(inode); - iinfo->ii_bstart = bstart; - iinfo->ii_bend = btail; - for (bindex = bstart; bindex <= btail; bindex++) { - h_dentry = au_h_dptr(dentry, bindex); - if (h_dentry) - au_set_h_iptr(inode, bindex, - au_igrab(d_inode(h_dentry)), flags); - } - au_cpup_attr_all(inode, /*force*/1); - /* - * to force calling aufs_get_acl() every time, - * do not call cache_no_acl() for aufs inode. - */ - -out: - return err; -} - -/* - * successful returns with iinfo write_locked - * minus: errno - * zero: success, matched - * plus: no error, but unmatched - */ -static int reval_inode(struct inode *inode, struct dentry *dentry) -{ - int err; - unsigned int gen; - struct au_iigen iigen; - aufs_bindex_t bindex, bend; - struct inode *h_inode, *h_dinode; - struct dentry *h_dentry; - - /* - * before this function, if aufs got any iinfo lock, it must be only - * one, the parent dir. - * it can happen by UDBA and the obsoleted inode number. - */ - err = -EIO; - if (unlikely(inode->i_ino == parent_ino(dentry))) - goto out; - - err = 1; - ii_write_lock_new_child(inode); - h_dentry = au_h_dptr(dentry, au_dbstart(dentry)); - h_dinode = d_inode(h_dentry); - bend = au_ibend(inode); - for (bindex = au_ibstart(inode); bindex <= bend; bindex++) { - h_inode = au_h_iptr(inode, bindex); - if (!h_inode || h_inode != h_dinode) - continue; - - err = 0; - gen = au_iigen(inode, &iigen); - if (gen == au_digen(dentry) - && !au_ig_ftest(iigen.ig_flags, HALF_REFRESHED)) - break; - - /* fully refresh inode using dentry */ - err = au_refresh_hinode(inode, dentry); - if (!err) - au_update_iigen(inode, /*half*/0); - break; - } - - if (unlikely(err)) - ii_write_unlock(inode); -out: - return err; -} - -int au_ino(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, - unsigned int d_type, ino_t *ino) -{ - int err; - struct mutex *mtx; - - /* prevent hardlinked inode number from race condition */ - mtx = NULL; - if (d_type != DT_DIR) { - mtx = &au_sbr(sb, bindex)->br_xino.xi_nondir_mtx; - mutex_lock(mtx); - } - err = au_xino_read(sb, bindex, h_ino, ino); - if (unlikely(err)) - goto out; - - if (!*ino) { - err = -EIO; - *ino = au_xino_new_ino(sb); - if (unlikely(!*ino)) - goto out; - err = au_xino_write(sb, bindex, h_ino, *ino); - if (unlikely(err)) - goto out; - } - -out: - if (mtx) - mutex_unlock(mtx); - return err; -} - -/* successful returns with iinfo write_locked */ -/* todo: return with unlocked? */ -struct inode *au_new_inode(struct dentry *dentry, int must_new) -{ - struct inode *inode, *h_inode; - struct dentry *h_dentry; - struct super_block *sb; - struct mutex *mtx; - ino_t h_ino, ino; - int err; - aufs_bindex_t bstart; - - sb = dentry->d_sb; - bstart = au_dbstart(dentry); - h_dentry = au_h_dptr(dentry, bstart); - h_inode = d_inode(h_dentry); - h_ino = h_inode->i_ino; - - /* - * stop 'race'-ing between hardlinks under different - * parents. - */ - mtx = NULL; - if (!d_is_dir(h_dentry)) - mtx = &au_sbr(sb, bstart)->br_xino.xi_nondir_mtx; - -new_ino: - if (mtx) - mutex_lock(mtx); - err = au_xino_read(sb, bstart, h_ino, &ino); - inode = ERR_PTR(err); - if (unlikely(err)) - goto out; - - if (!ino) { - ino = au_xino_new_ino(sb); - if (unlikely(!ino)) { - inode = ERR_PTR(-EIO); - goto out; - } - } - - AuDbg("i%lu\n", (unsigned long)ino); - inode = au_iget_locked(sb, ino); - err = PTR_ERR(inode); - if (IS_ERR(inode)) - goto out; - - AuDbg("%lx, new %d\n", inode->i_state, !!(inode->i_state & I_NEW)); - if (inode->i_state & I_NEW) { - /* verbose coding for lock class name */ - if (unlikely(d_is_symlink(h_dentry))) - au_rw_class(&au_ii(inode)->ii_rwsem, - au_lc_key + AuLcSymlink_IIINFO); - else if (unlikely(d_is_dir(h_dentry))) - au_rw_class(&au_ii(inode)->ii_rwsem, - au_lc_key + AuLcDir_IIINFO); - else /* likely */ - au_rw_class(&au_ii(inode)->ii_rwsem, - au_lc_key + AuLcNonDir_IIINFO); - - ii_write_lock_new_child(inode); - err = set_inode(inode, dentry); - if (!err) { - unlock_new_inode(inode); - goto out; /* success */ - } - - /* - * iget_failed() calls iput(), but we need to call - * ii_write_unlock() after iget_failed(). so dirty hack for - * i_count. - */ - atomic_inc(&inode->i_count); - iget_failed(inode); - ii_write_unlock(inode); - au_xino_write(sb, bstart, h_ino, /*ino*/0); - /* ignore this error */ - goto out_iput; - } else if (!must_new && !IS_DEADDIR(inode) && inode->i_nlink) { - /* - * horrible race condition between lookup, readdir and copyup - * (or something). - */ - if (mtx) - mutex_unlock(mtx); - err = reval_inode(inode, dentry); - if (unlikely(err < 0)) { - mtx = NULL; - goto out_iput; - } - - if (!err) { - mtx = NULL; - goto out; /* success */ - } else if (mtx) - mutex_lock(mtx); - } - - if (unlikely(au_test_fs_unique_ino(h_inode))) - AuWarn1("Warning: Un-notified UDBA or repeatedly renamed dir," - " b%d, %s, %pd, hi%lu, i%lu.\n", - bstart, au_sbtype(h_dentry->d_sb), dentry, - (unsigned long)h_ino, (unsigned long)ino); - ino = 0; - err = au_xino_write(sb, bstart, h_ino, /*ino*/0); - if (!err) { - iput(inode); - if (mtx) - mutex_unlock(mtx); - goto new_ino; - } - -out_iput: - iput(inode); - inode = ERR_PTR(err); -out: - if (mtx) - mutex_unlock(mtx); - return inode; -} - -/* ---------------------------------------------------------------------- */ - -int au_test_ro(struct super_block *sb, aufs_bindex_t bindex, - struct inode *inode) -{ - int err; - struct inode *hi; - - err = au_br_rdonly(au_sbr(sb, bindex)); - - /* pseudo-link after flushed may happen out of bounds */ - if (!err - && inode - && au_ibstart(inode) <= bindex - && bindex <= au_ibend(inode)) { - /* - * permission check is unnecessary since vfsub routine - * will be called later - */ - hi = au_h_iptr(inode, bindex); - if (hi) - err = IS_IMMUTABLE(hi) ? -EROFS : 0; - } - - return err; -} - -int au_test_h_perm(struct inode *h_inode, int mask) -{ - if (uid_eq(current_fsuid(), GLOBAL_ROOT_UID)) - return 0; - return inode_permission(h_inode, mask); -} - -int au_test_h_perm_sio(struct inode *h_inode, int mask) -{ - if (au_test_nfs(h_inode->i_sb) - && (mask & MAY_WRITE) - && S_ISDIR(h_inode->i_mode)) - mask |= MAY_READ; /* force permission check */ - return au_test_h_perm(h_inode, mask); -} diff --git a/fs/aufs/inode.h b/fs/aufs/inode.h deleted file mode 100644 index 1b9be356d..000000000 --- a/fs/aufs/inode.h +++ /dev/null @@ -1,673 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * inode operations - */ - -#ifndef __AUFS_INODE_H__ -#define __AUFS_INODE_H__ - -#ifdef __KERNEL__ - -#include <linux/fsnotify.h> -#include "rwsem.h" - -struct vfsmount; - -struct au_hnotify { -#ifdef CONFIG_AUFS_HNOTIFY -#ifdef CONFIG_AUFS_HFSNOTIFY - /* never use fsnotify_add_vfsmount_mark() */ - struct fsnotify_mark hn_mark; -#endif - struct inode *hn_aufs_inode; /* no get/put */ -#endif -} ____cacheline_aligned_in_smp; - -struct au_hinode { - struct inode *hi_inode; - aufs_bindex_t hi_id; -#ifdef CONFIG_AUFS_HNOTIFY - struct au_hnotify *hi_notify; -#endif - - /* reference to the copied-up whiteout with get/put */ - struct dentry *hi_whdentry; -}; - -/* ig_flags */ -#define AuIG_HALF_REFRESHED 1 -#define au_ig_ftest(flags, name) ((flags) & AuIG_##name) -#define au_ig_fset(flags, name) \ - do { (flags) |= AuIG_##name; } while (0) -#define au_ig_fclr(flags, name) \ - do { (flags) &= ~AuIG_##name; } while (0) - -struct au_iigen { - __u32 ig_generation, ig_flags; -}; - -struct au_vdir; -struct au_iinfo { - spinlock_t ii_genspin; - struct au_iigen ii_generation; - struct super_block *ii_hsb1; /* no get/put */ - - struct au_rwsem ii_rwsem; - aufs_bindex_t ii_bstart, ii_bend; - __u32 ii_higen; - struct au_hinode *ii_hinode; - struct au_vdir *ii_vdir; -}; - -struct au_icntnr { - struct au_iinfo iinfo; - struct inode vfs_inode; -} ____cacheline_aligned_in_smp; - -/* au_pin flags */ -#define AuPin_DI_LOCKED 1 -#define AuPin_MNT_WRITE (1 << 1) -#define au_ftest_pin(flags, name) ((flags) & AuPin_##name) -#define au_fset_pin(flags, name) \ - do { (flags) |= AuPin_##name; } while (0) -#define au_fclr_pin(flags, name) \ - do { (flags) &= ~AuPin_##name; } while (0) - -struct au_pin { - /* input */ - struct dentry *dentry; - unsigned int udba; - unsigned char lsc_di, lsc_hi, flags; - aufs_bindex_t bindex; - - /* output */ - struct dentry *parent; - struct au_hinode *hdir; - struct vfsmount *h_mnt; - - /* temporary unlock/relock for copyup */ - struct dentry *h_dentry, *h_parent; - struct au_branch *br; - struct task_struct *task; -}; - -void au_pin_hdir_unlock(struct au_pin *p); -int au_pin_hdir_lock(struct au_pin *p); -int au_pin_hdir_relock(struct au_pin *p); -void au_pin_hdir_set_owner(struct au_pin *p, struct task_struct *task); -void au_pin_hdir_acquire_nest(struct au_pin *p); -void au_pin_hdir_release(struct au_pin *p); - -/* ---------------------------------------------------------------------- */ - -static inline struct au_iinfo *au_ii(struct inode *inode) -{ - struct au_iinfo *iinfo; - - iinfo = &(container_of(inode, struct au_icntnr, vfs_inode)->iinfo); - if (iinfo->ii_hinode) - return iinfo; - return NULL; /* debugging bad_inode case */ -} - -/* ---------------------------------------------------------------------- */ - -/* inode.c */ -struct inode *au_igrab(struct inode *inode); -int au_refresh_hinode_self(struct inode *inode); -int au_refresh_hinode(struct inode *inode, struct dentry *dentry); -int au_ino(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, - unsigned int d_type, ino_t *ino); -struct inode *au_new_inode(struct dentry *dentry, int must_new); -int au_test_ro(struct super_block *sb, aufs_bindex_t bindex, - struct inode *inode); -int au_test_h_perm(struct inode *h_inode, int mask); -int au_test_h_perm_sio(struct inode *h_inode, int mask); - -static inline int au_wh_ino(struct super_block *sb, aufs_bindex_t bindex, - ino_t h_ino, unsigned int d_type, ino_t *ino) -{ -#ifdef CONFIG_AUFS_SHWH - return au_ino(sb, bindex, h_ino, d_type, ino); -#else - return 0; -#endif -} - -/* i_op.c */ -extern struct inode_operations aufs_iop, aufs_symlink_iop, aufs_dir_iop; - -/* au_wr_dir flags */ -#define AuWrDir_ADD_ENTRY 1 -#define AuWrDir_ISDIR (1 << 1) -#define AuWrDir_TMPFILE (1 << 2) -#define au_ftest_wrdir(flags, name) ((flags) & AuWrDir_##name) -#define au_fset_wrdir(flags, name) \ - do { (flags) |= AuWrDir_##name; } while (0) -#define au_fclr_wrdir(flags, name) \ - do { (flags) &= ~AuWrDir_##name; } while (0) - -struct au_wr_dir_args { - aufs_bindex_t force_btgt; - unsigned char flags; -}; -int au_wr_dir(struct dentry *dentry, struct dentry *src_dentry, - struct au_wr_dir_args *args); - -struct dentry *au_pinned_h_parent(struct au_pin *pin); -void au_pin_init(struct au_pin *pin, struct dentry *dentry, - aufs_bindex_t bindex, int lsc_di, int lsc_hi, - unsigned int udba, unsigned char flags); -int au_pin(struct au_pin *pin, struct dentry *dentry, aufs_bindex_t bindex, - unsigned int udba, unsigned char flags) __must_check; -int au_do_pin(struct au_pin *pin) __must_check; -void au_unpin(struct au_pin *pin); -int au_reval_for_attr(struct dentry *dentry, unsigned int sigen); - -#define AuIcpup_DID_CPUP 1 -#define au_ftest_icpup(flags, name) ((flags) & AuIcpup_##name) -#define au_fset_icpup(flags, name) \ - do { (flags) |= AuIcpup_##name; } while (0) -#define au_fclr_icpup(flags, name) \ - do { (flags) &= ~AuIcpup_##name; } while (0) - -struct au_icpup_args { - unsigned char flags; - unsigned char pin_flags; - aufs_bindex_t btgt; - unsigned int udba; - struct au_pin pin; - struct path h_path; - struct inode *h_inode; -}; - -int au_pin_and_icpup(struct dentry *dentry, struct iattr *ia, - struct au_icpup_args *a); - -int au_h_path_getattr(struct dentry *dentry, int force, struct path *h_path); - -/* i_op_add.c */ -int au_may_add(struct dentry *dentry, aufs_bindex_t bindex, - struct dentry *h_parent, int isdir); -int aufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, - dev_t dev); -int aufs_symlink(struct inode *dir, struct dentry *dentry, const char *symname); -int aufs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool want_excl); -struct vfsub_aopen_args; -int au_aopen_or_create(struct inode *dir, struct dentry *dentry, - struct vfsub_aopen_args *args); -int aufs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode); -int aufs_link(struct dentry *src_dentry, struct inode *dir, - struct dentry *dentry); -int aufs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); - -/* i_op_del.c */ -int au_wr_dir_need_wh(struct dentry *dentry, int isdir, aufs_bindex_t *bcpup); -int au_may_del(struct dentry *dentry, aufs_bindex_t bindex, - struct dentry *h_parent, int isdir); -int aufs_unlink(struct inode *dir, struct dentry *dentry); -int aufs_rmdir(struct inode *dir, struct dentry *dentry); - -/* i_op_ren.c */ -int au_wbr(struct dentry *dentry, aufs_bindex_t btgt); -int aufs_rename(struct inode *src_dir, struct dentry *src_dentry, - struct inode *dir, struct dentry *dentry); - -/* iinfo.c */ -struct inode *au_h_iptr(struct inode *inode, aufs_bindex_t bindex); -void au_hiput(struct au_hinode *hinode); -void au_set_hi_wh(struct inode *inode, aufs_bindex_t bindex, - struct dentry *h_wh); -unsigned int au_hi_flags(struct inode *inode, int isdir); - -/* hinode flags */ -#define AuHi_XINO 1 -#define AuHi_HNOTIFY (1 << 1) -#define au_ftest_hi(flags, name) ((flags) & AuHi_##name) -#define au_fset_hi(flags, name) \ - do { (flags) |= AuHi_##name; } while (0) -#define au_fclr_hi(flags, name) \ - do { (flags) &= ~AuHi_##name; } while (0) - -#ifndef CONFIG_AUFS_HNOTIFY -#undef AuHi_HNOTIFY -#define AuHi_HNOTIFY 0 -#endif - -void au_set_h_iptr(struct inode *inode, aufs_bindex_t bindex, - struct inode *h_inode, unsigned int flags); - -void au_update_iigen(struct inode *inode, int half); -void au_update_ibrange(struct inode *inode, int do_put_zero); - -void au_icntnr_init_once(void *_c); -int au_iinfo_init(struct inode *inode); -void au_iinfo_fin(struct inode *inode); -int au_ii_realloc(struct au_iinfo *iinfo, int nbr); - -#ifdef CONFIG_PROC_FS -/* plink.c */ -int au_plink_maint(struct super_block *sb, int flags); -struct au_sbinfo; -void au_plink_maint_leave(struct au_sbinfo *sbinfo); -int au_plink_maint_enter(struct super_block *sb); -#ifdef CONFIG_AUFS_DEBUG -void au_plink_list(struct super_block *sb); -#else -AuStubVoid(au_plink_list, struct super_block *sb) -#endif -int au_plink_test(struct inode *inode); -struct dentry *au_plink_lkup(struct inode *inode, aufs_bindex_t bindex); -void au_plink_append(struct inode *inode, aufs_bindex_t bindex, - struct dentry *h_dentry); -void au_plink_put(struct super_block *sb, int verbose); -void au_plink_clean(struct super_block *sb, int verbose); -void au_plink_half_refresh(struct super_block *sb, aufs_bindex_t br_id); -#else -AuStubInt0(au_plink_maint, struct super_block *sb, int flags); -AuStubVoid(au_plink_maint_leave, struct au_sbinfo *sbinfo); -AuStubInt0(au_plink_maint_enter, struct super_block *sb); -AuStubVoid(au_plink_list, struct super_block *sb); -AuStubInt0(au_plink_test, struct inode *inode); -AuStub(struct dentry *, au_plink_lkup, return NULL, - struct inode *inode, aufs_bindex_t bindex); -AuStubVoid(au_plink_append, struct inode *inode, aufs_bindex_t bindex, - struct dentry *h_dentry); -AuStubVoid(au_plink_put, struct super_block *sb, int verbose); -AuStubVoid(au_plink_clean, struct super_block *sb, int verbose); -AuStubVoid(au_plink_half_refresh, struct super_block *sb, aufs_bindex_t br_id); -#endif /* CONFIG_PROC_FS */ - -#ifdef CONFIG_AUFS_XATTR -/* xattr.c */ -int au_cpup_xattr(struct dentry *h_dst, struct dentry *h_src, int ignore_flags, - unsigned int verbose); -ssize_t aufs_listxattr(struct dentry *dentry, char *list, size_t size); -ssize_t aufs_getxattr(struct dentry *dentry, const char *name, void *value, - size_t size); -int aufs_setxattr(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags); -int aufs_removexattr(struct dentry *dentry, const char *name); - -/* void au_xattr_init(struct super_block *sb); */ -#else -AuStubInt0(au_cpup_xattr, struct dentry *h_dst, struct dentry *h_src, - int ignore_flags, unsigned int verbose); -/* AuStubVoid(au_xattr_init, struct super_block *sb); */ -#endif - -#ifdef CONFIG_FS_POSIX_ACL -struct posix_acl *aufs_get_acl(struct inode *inode, int type); -int aufs_set_acl(struct inode *inode, struct posix_acl *acl, int type); -#endif - -#if IS_ENABLED(CONFIG_AUFS_XATTR) || IS_ENABLED(CONFIG_FS_POSIX_ACL) -enum { - AU_XATTR_SET, - AU_XATTR_REMOVE, - AU_ACL_SET -}; - -struct au_srxattr { - int type; - union { - struct { - const char *name; - const void *value; - size_t size; - int flags; - } set; - struct { - const char *name; - } remove; - struct { - struct posix_acl *acl; - int type; - } acl_set; - } u; -}; -ssize_t au_srxattr(struct dentry *dentry, struct au_srxattr *arg); -#endif - -/* ---------------------------------------------------------------------- */ - -/* lock subclass for iinfo */ -enum { - AuLsc_II_CHILD, /* child first */ - AuLsc_II_CHILD2, /* rename(2), link(2), and cpup at hnotify */ - AuLsc_II_CHILD3, /* copyup dirs */ - AuLsc_II_PARENT, /* see AuLsc_I_PARENT in vfsub.h */ - AuLsc_II_PARENT2, - AuLsc_II_PARENT3, /* copyup dirs */ - AuLsc_II_NEW_CHILD -}; - -/* - * ii_read_lock_child, ii_write_lock_child, - * ii_read_lock_child2, ii_write_lock_child2, - * ii_read_lock_child3, ii_write_lock_child3, - * ii_read_lock_parent, ii_write_lock_parent, - * ii_read_lock_parent2, ii_write_lock_parent2, - * ii_read_lock_parent3, ii_write_lock_parent3, - * ii_read_lock_new_child, ii_write_lock_new_child, - */ -#define AuReadLockFunc(name, lsc) \ -static inline void ii_read_lock_##name(struct inode *i) \ -{ \ - au_rw_read_lock_nested(&au_ii(i)->ii_rwsem, AuLsc_II_##lsc); \ -} - -#define AuWriteLockFunc(name, lsc) \ -static inline void ii_write_lock_##name(struct inode *i) \ -{ \ - au_rw_write_lock_nested(&au_ii(i)->ii_rwsem, AuLsc_II_##lsc); \ -} - -#define AuRWLockFuncs(name, lsc) \ - AuReadLockFunc(name, lsc) \ - AuWriteLockFunc(name, lsc) - -AuRWLockFuncs(child, CHILD); -AuRWLockFuncs(child2, CHILD2); -AuRWLockFuncs(child3, CHILD3); -AuRWLockFuncs(parent, PARENT); -AuRWLockFuncs(parent2, PARENT2); -AuRWLockFuncs(parent3, PARENT3); -AuRWLockFuncs(new_child, NEW_CHILD); - -#undef AuReadLockFunc -#undef AuWriteLockFunc -#undef AuRWLockFuncs - -/* - * ii_read_unlock, ii_write_unlock, ii_downgrade_lock - */ -AuSimpleUnlockRwsemFuncs(ii, struct inode *i, &au_ii(i)->ii_rwsem); - -#define IiMustNoWaiters(i) AuRwMustNoWaiters(&au_ii(i)->ii_rwsem) -#define IiMustAnyLock(i) AuRwMustAnyLock(&au_ii(i)->ii_rwsem) -#define IiMustWriteLock(i) AuRwMustWriteLock(&au_ii(i)->ii_rwsem) - -/* ---------------------------------------------------------------------- */ - -static inline void au_icntnr_init(struct au_icntnr *c) -{ -#ifdef CONFIG_AUFS_DEBUG - c->vfs_inode.i_mode = 0; -#endif -} - -static inline unsigned int au_iigen(struct inode *inode, struct au_iigen *iigen) -{ - unsigned int gen; - struct au_iinfo *iinfo; - - iinfo = au_ii(inode); - spin_lock(&iinfo->ii_genspin); - if (iigen) - *iigen = iinfo->ii_generation; - gen = iinfo->ii_generation.ig_generation; - spin_unlock(&iinfo->ii_genspin); - - return gen; -} - -/* tiny test for inode number */ -/* tmpfs generation is too rough */ -static inline int au_test_higen(struct inode *inode, struct inode *h_inode) -{ - struct au_iinfo *iinfo; - - iinfo = au_ii(inode); - AuRwMustAnyLock(&iinfo->ii_rwsem); - return !(iinfo->ii_hsb1 == h_inode->i_sb - && iinfo->ii_higen == h_inode->i_generation); -} - -static inline void au_iigen_dec(struct inode *inode) -{ - struct au_iinfo *iinfo; - - iinfo = au_ii(inode); - spin_lock(&iinfo->ii_genspin); - iinfo->ii_generation.ig_generation--; - spin_unlock(&iinfo->ii_genspin); -} - -static inline int au_iigen_test(struct inode *inode, unsigned int sigen) -{ - int err; - - err = 0; - if (unlikely(inode && au_iigen(inode, NULL) != sigen)) - err = -EIO; - - return err; -} - -/* ---------------------------------------------------------------------- */ - -static inline aufs_bindex_t au_ii_br_id(struct inode *inode, - aufs_bindex_t bindex) -{ - IiMustAnyLock(inode); - return au_ii(inode)->ii_hinode[0 + bindex].hi_id; -} - -static inline aufs_bindex_t au_ibstart(struct inode *inode) -{ - IiMustAnyLock(inode); - return au_ii(inode)->ii_bstart; -} - -static inline aufs_bindex_t au_ibend(struct inode *inode) -{ - IiMustAnyLock(inode); - return au_ii(inode)->ii_bend; -} - -static inline struct au_vdir *au_ivdir(struct inode *inode) -{ - IiMustAnyLock(inode); - return au_ii(inode)->ii_vdir; -} - -static inline struct dentry *au_hi_wh(struct inode *inode, aufs_bindex_t bindex) -{ - IiMustAnyLock(inode); - return au_ii(inode)->ii_hinode[0 + bindex].hi_whdentry; -} - -static inline void au_set_ibstart(struct inode *inode, aufs_bindex_t bindex) -{ - IiMustWriteLock(inode); - au_ii(inode)->ii_bstart = bindex; -} - -static inline void au_set_ibend(struct inode *inode, aufs_bindex_t bindex) -{ - IiMustWriteLock(inode); - au_ii(inode)->ii_bend = bindex; -} - -static inline void au_set_ivdir(struct inode *inode, struct au_vdir *vdir) -{ - IiMustWriteLock(inode); - au_ii(inode)->ii_vdir = vdir; -} - -static inline struct au_hinode *au_hi(struct inode *inode, aufs_bindex_t bindex) -{ - IiMustAnyLock(inode); - return au_ii(inode)->ii_hinode + bindex; -} - -/* ---------------------------------------------------------------------- */ - -static inline struct dentry *au_pinned_parent(struct au_pin *pin) -{ - if (pin) - return pin->parent; - return NULL; -} - -static inline struct inode *au_pinned_h_dir(struct au_pin *pin) -{ - if (pin && pin->hdir) - return pin->hdir->hi_inode; - return NULL; -} - -static inline struct au_hinode *au_pinned_hdir(struct au_pin *pin) -{ - if (pin) - return pin->hdir; - return NULL; -} - -static inline void au_pin_set_dentry(struct au_pin *pin, struct dentry *dentry) -{ - if (pin) - pin->dentry = dentry; -} - -static inline void au_pin_set_parent_lflag(struct au_pin *pin, - unsigned char lflag) -{ - if (pin) { - if (lflag) - au_fset_pin(pin->flags, DI_LOCKED); - else - au_fclr_pin(pin->flags, DI_LOCKED); - } -} - -#if 0 /* reserved */ -static inline void au_pin_set_parent(struct au_pin *pin, struct dentry *parent) -{ - if (pin) { - dput(pin->parent); - pin->parent = dget(parent); - } -} -#endif - -/* ---------------------------------------------------------------------- */ - -struct au_branch; -#ifdef CONFIG_AUFS_HNOTIFY -struct au_hnotify_op { - void (*ctl)(struct au_hinode *hinode, int do_set); - int (*alloc)(struct au_hinode *hinode); - - /* - * if it returns true, the the caller should free hinode->hi_notify, - * otherwise ->free() frees it. - */ - int (*free)(struct au_hinode *hinode, - struct au_hnotify *hn) __must_check; - - void (*fin)(void); - int (*init)(void); - - int (*reset_br)(unsigned int udba, struct au_branch *br, int perm); - void (*fin_br)(struct au_branch *br); - int (*init_br)(struct au_branch *br, int perm); -}; - -/* hnotify.c */ -int au_hn_alloc(struct au_hinode *hinode, struct inode *inode); -void au_hn_free(struct au_hinode *hinode); -void au_hn_ctl(struct au_hinode *hinode, int do_set); -void au_hn_reset(struct inode *inode, unsigned int flags); -int au_hnotify(struct inode *h_dir, struct au_hnotify *hnotify, u32 mask, - struct qstr *h_child_qstr, struct inode *h_child_inode); -int au_hnotify_reset_br(unsigned int udba, struct au_branch *br, int perm); -int au_hnotify_init_br(struct au_branch *br, int perm); -void au_hnotify_fin_br(struct au_branch *br); -int __init au_hnotify_init(void); -void au_hnotify_fin(void); - -/* hfsnotify.c */ -extern const struct au_hnotify_op au_hnotify_op; - -static inline -void au_hn_init(struct au_hinode *hinode) -{ - hinode->hi_notify = NULL; -} - -static inline struct au_hnotify *au_hn(struct au_hinode *hinode) -{ - return hinode->hi_notify; -} - -#else -AuStub(int, au_hn_alloc, return -EOPNOTSUPP, - struct au_hinode *hinode __maybe_unused, - struct inode *inode __maybe_unused) -AuStub(struct au_hnotify *, au_hn, return NULL, struct au_hinode *hinode) -AuStubVoid(au_hn_free, struct au_hinode *hinode __maybe_unused) -AuStubVoid(au_hn_ctl, struct au_hinode *hinode __maybe_unused, - int do_set __maybe_unused) -AuStubVoid(au_hn_reset, struct inode *inode __maybe_unused, - unsigned int flags __maybe_unused) -AuStubInt0(au_hnotify_reset_br, unsigned int udba __maybe_unused, - struct au_branch *br __maybe_unused, - int perm __maybe_unused) -AuStubInt0(au_hnotify_init_br, struct au_branch *br __maybe_unused, - int perm __maybe_unused) -AuStubVoid(au_hnotify_fin_br, struct au_branch *br __maybe_unused) -AuStubInt0(__init au_hnotify_init, void) -AuStubVoid(au_hnotify_fin, void) -AuStubVoid(au_hn_init, struct au_hinode *hinode __maybe_unused) -#endif /* CONFIG_AUFS_HNOTIFY */ - -static inline void au_hn_suspend(struct au_hinode *hdir) -{ - au_hn_ctl(hdir, /*do_set*/0); -} - -static inline void au_hn_resume(struct au_hinode *hdir) -{ - au_hn_ctl(hdir, /*do_set*/1); -} - -static inline void au_hn_imtx_lock(struct au_hinode *hdir) -{ - mutex_lock(&hdir->hi_inode->i_mutex); - au_hn_suspend(hdir); -} - -static inline void au_hn_imtx_lock_nested(struct au_hinode *hdir, - unsigned int sc __maybe_unused) -{ - mutex_lock_nested(&hdir->hi_inode->i_mutex, sc); - au_hn_suspend(hdir); -} - -static inline void au_hn_imtx_unlock(struct au_hinode *hdir) -{ - au_hn_resume(hdir); - mutex_unlock(&hdir->hi_inode->i_mutex); -} - -#endif /* __KERNEL__ */ -#endif /* __AUFS_INODE_H__ */ diff --git a/fs/aufs/ioctl.c b/fs/aufs/ioctl.c deleted file mode 100644 index 18d7c6aea..000000000 --- a/fs/aufs/ioctl.c +++ /dev/null @@ -1,219 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * ioctl - * plink-management and readdir in userspace. - * assist the pathconf(3) wrapper library. - * move-down - * File-based Hierarchical Storage Management. - */ - -#include <linux/compat.h> -#include <linux/file.h> -#include "aufs.h" - -static int au_wbr_fd(struct path *path, struct aufs_wbr_fd __user *arg) -{ - int err, fd; - aufs_bindex_t wbi, bindex, bend; - struct file *h_file; - struct super_block *sb; - struct dentry *root; - struct au_branch *br; - struct aufs_wbr_fd wbrfd = { - .oflags = au_dir_roflags, - .brid = -1 - }; - const int valid = O_RDONLY | O_NONBLOCK | O_LARGEFILE | O_DIRECTORY - | O_NOATIME | O_CLOEXEC; - - AuDebugOn(wbrfd.oflags & ~valid); - - if (arg) { - err = copy_from_user(&wbrfd, arg, sizeof(wbrfd)); - if (unlikely(err)) { - err = -EFAULT; - goto out; - } - - err = -EINVAL; - AuDbg("wbrfd{0%o, %d}\n", wbrfd.oflags, wbrfd.brid); - wbrfd.oflags |= au_dir_roflags; - AuDbg("0%o\n", wbrfd.oflags); - if (unlikely(wbrfd.oflags & ~valid)) - goto out; - } - - fd = get_unused_fd_flags(0); - err = fd; - if (unlikely(fd < 0)) - goto out; - - h_file = ERR_PTR(-EINVAL); - wbi = 0; - br = NULL; - sb = path->dentry->d_sb; - root = sb->s_root; - aufs_read_lock(root, AuLock_IR); - bend = au_sbend(sb); - if (wbrfd.brid >= 0) { - wbi = au_br_index(sb, wbrfd.brid); - if (unlikely(wbi < 0 || wbi > bend)) - goto out_unlock; - } - - h_file = ERR_PTR(-ENOENT); - br = au_sbr(sb, wbi); - if (!au_br_writable(br->br_perm)) { - if (arg) - goto out_unlock; - - bindex = wbi + 1; - wbi = -1; - for (; bindex <= bend; bindex++) { - br = au_sbr(sb, bindex); - if (au_br_writable(br->br_perm)) { - wbi = bindex; - br = au_sbr(sb, wbi); - break; - } - } - } - AuDbg("wbi %d\n", wbi); - if (wbi >= 0) - h_file = au_h_open(root, wbi, wbrfd.oflags, NULL, - /*force_wr*/0); - -out_unlock: - aufs_read_unlock(root, AuLock_IR); - err = PTR_ERR(h_file); - if (IS_ERR(h_file)) - goto out_fd; - - atomic_dec(&br->br_count); /* cf. au_h_open() */ - fd_install(fd, h_file); - err = fd; - goto out; /* success */ - -out_fd: - put_unused_fd(fd); -out: - AuTraceErr(err); - return err; -} - -/* ---------------------------------------------------------------------- */ - -long aufs_ioctl_dir(struct file *file, unsigned int cmd, unsigned long arg) -{ - long err; - struct dentry *dentry; - - switch (cmd) { - case AUFS_CTL_RDU: - case AUFS_CTL_RDU_INO: - err = au_rdu_ioctl(file, cmd, arg); - break; - - case AUFS_CTL_WBR_FD: - err = au_wbr_fd(&file->f_path, (void __user *)arg); - break; - - case AUFS_CTL_IBUSY: - err = au_ibusy_ioctl(file, arg); - break; - - case AUFS_CTL_BRINFO: - err = au_brinfo_ioctl(file, arg); - break; - - case AUFS_CTL_FHSM_FD: - dentry = file->f_path.dentry; - if (IS_ROOT(dentry)) - err = au_fhsm_fd(dentry->d_sb, arg); - else - err = -ENOTTY; - break; - - default: - /* do not call the lower */ - AuDbg("0x%x\n", cmd); - err = -ENOTTY; - } - - AuTraceErr(err); - return err; -} - -long aufs_ioctl_nondir(struct file *file, unsigned int cmd, unsigned long arg) -{ - long err; - - switch (cmd) { - case AUFS_CTL_MVDOWN: - err = au_mvdown(file->f_path.dentry, (void __user *)arg); - break; - - case AUFS_CTL_WBR_FD: - err = au_wbr_fd(&file->f_path, (void __user *)arg); - break; - - default: - /* do not call the lower */ - AuDbg("0x%x\n", cmd); - err = -ENOTTY; - } - - AuTraceErr(err); - return err; -} - -#ifdef CONFIG_COMPAT -long aufs_compat_ioctl_dir(struct file *file, unsigned int cmd, - unsigned long arg) -{ - long err; - - switch (cmd) { - case AUFS_CTL_RDU: - case AUFS_CTL_RDU_INO: - err = au_rdu_compat_ioctl(file, cmd, arg); - break; - - case AUFS_CTL_IBUSY: - err = au_ibusy_compat_ioctl(file, arg); - break; - - case AUFS_CTL_BRINFO: - err = au_brinfo_compat_ioctl(file, arg); - break; - - default: - err = aufs_ioctl_dir(file, cmd, arg); - } - - AuTraceErr(err); - return err; -} - -long aufs_compat_ioctl_nondir(struct file *file, unsigned int cmd, - unsigned long arg) -{ - return aufs_ioctl_nondir(file, cmd, (unsigned long)compat_ptr(arg)); -} -#endif diff --git a/fs/aufs/loop.c b/fs/aufs/loop.c deleted file mode 100644 index 677568445..000000000 --- a/fs/aufs/loop.c +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * support for loopback block device as a branch - */ - -#include "aufs.h" - -/* added into drivers/block/loop.c */ -static struct file *(*backing_file_func)(struct super_block *sb); - -/* - * test if two lower dentries have overlapping branches. - */ -int au_test_loopback_overlap(struct super_block *sb, struct dentry *h_adding) -{ - struct super_block *h_sb; - struct file *backing_file; - - if (unlikely(!backing_file_func)) { - /* don't load "loop" module here */ - backing_file_func = symbol_get(loop_backing_file); - if (unlikely(!backing_file_func)) - /* "loop" module is not loaded */ - return 0; - } - - h_sb = h_adding->d_sb; - backing_file = backing_file_func(h_sb); - if (!backing_file) - return 0; - - h_adding = backing_file->f_path.dentry; - /* - * h_adding can be local NFS. - * in this case aufs cannot detect the loop. - */ - if (unlikely(h_adding->d_sb == sb)) - return 1; - return !!au_test_subdir(h_adding, sb->s_root); -} - -/* true if a kernel thread named 'loop[0-9].*' accesses a file */ -int au_test_loopback_kthread(void) -{ - int ret; - struct task_struct *tsk = current; - char c, comm[sizeof(tsk->comm)]; - - ret = 0; - if (tsk->flags & PF_KTHREAD) { - get_task_comm(comm, tsk); - c = comm[4]; - ret = ('0' <= c && c <= '9' - && !strncmp(comm, "loop", 4)); - } - - return ret; -} - -/* ---------------------------------------------------------------------- */ - -#define au_warn_loopback_step 16 -static int au_warn_loopback_nelem = au_warn_loopback_step; -static unsigned long *au_warn_loopback_array; - -void au_warn_loopback(struct super_block *h_sb) -{ - int i, new_nelem; - unsigned long *a, magic; - static DEFINE_SPINLOCK(spin); - - magic = h_sb->s_magic; - spin_lock(&spin); - a = au_warn_loopback_array; - for (i = 0; i < au_warn_loopback_nelem && *a; i++) - if (a[i] == magic) { - spin_unlock(&spin); - return; - } - - /* h_sb is new to us, print it */ - if (i < au_warn_loopback_nelem) { - a[i] = magic; - goto pr; - } - - /* expand the array */ - new_nelem = au_warn_loopback_nelem + au_warn_loopback_step; - a = au_kzrealloc(au_warn_loopback_array, - au_warn_loopback_nelem * sizeof(unsigned long), - new_nelem * sizeof(unsigned long), GFP_ATOMIC); - if (a) { - au_warn_loopback_nelem = new_nelem; - au_warn_loopback_array = a; - a[i] = magic; - goto pr; - } - - spin_unlock(&spin); - AuWarn1("realloc failed, ignored\n"); - return; - -pr: - spin_unlock(&spin); - pr_warn("you may want to try another patch for loopback file " - "on %s(0x%lx) branch\n", au_sbtype(h_sb), magic); -} - -int au_loopback_init(void) -{ - int err; - struct super_block *sb __maybe_unused; - - AuDebugOn(sizeof(sb->s_magic) != sizeof(unsigned long)); - - err = 0; - au_warn_loopback_array = kcalloc(au_warn_loopback_step, - sizeof(unsigned long), GFP_NOFS); - if (unlikely(!au_warn_loopback_array)) - err = -ENOMEM; - - return err; -} - -void au_loopback_fin(void) -{ - symbol_put(loop_backing_file); - kfree(au_warn_loopback_array); -} diff --git a/fs/aufs/loop.h b/fs/aufs/loop.h deleted file mode 100644 index 0c28c2366..000000000 --- a/fs/aufs/loop.h +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * support for loopback mount as a branch - */ - -#ifndef __AUFS_LOOP_H__ -#define __AUFS_LOOP_H__ - -#ifdef __KERNEL__ - -struct dentry; -struct super_block; - -#ifdef CONFIG_AUFS_BDEV_LOOP -/* drivers/block/loop.c */ -struct file *loop_backing_file(struct super_block *sb); - -/* loop.c */ -int au_test_loopback_overlap(struct super_block *sb, struct dentry *h_adding); -int au_test_loopback_kthread(void); -void au_warn_loopback(struct super_block *h_sb); - -int au_loopback_init(void); -void au_loopback_fin(void); -#else -AuStubInt0(au_test_loopback_overlap, struct super_block *sb, - struct dentry *h_adding) -AuStubInt0(au_test_loopback_kthread, void) -AuStubVoid(au_warn_loopback, struct super_block *h_sb) - -AuStubInt0(au_loopback_init, void) -AuStubVoid(au_loopback_fin, void) -#endif /* BLK_DEV_LOOP */ - -#endif /* __KERNEL__ */ -#endif /* __AUFS_LOOP_H__ */ diff --git a/fs/aufs/magic.mk b/fs/aufs/magic.mk deleted file mode 100644 index 4f83bdf1d..000000000 --- a/fs/aufs/magic.mk +++ /dev/null @@ -1,30 +0,0 @@ - -# defined in ${srctree}/fs/fuse/inode.c -# tristate -ifdef CONFIG_FUSE_FS -ccflags-y += -DFUSE_SUPER_MAGIC=0x65735546 -endif - -# defined in ${srctree}/fs/xfs/xfs_sb.h -# tristate -ifdef CONFIG_XFS_FS -ccflags-y += -DXFS_SB_MAGIC=0x58465342 -endif - -# defined in ${srctree}/fs/configfs/mount.c -# tristate -ifdef CONFIG_CONFIGFS_FS -ccflags-y += -DCONFIGFS_MAGIC=0x62656570 -endif - -# defined in ${srctree}/fs/ubifs/ubifs.h -# tristate -ifdef CONFIG_UBIFS_FS -ccflags-y += -DUBIFS_SUPER_MAGIC=0x24051905 -endif - -# defined in ${srctree}/fs/hfsplus/hfsplus_raw.h -# tristate -ifdef CONFIG_HFSPLUS_FS -ccflags-y += -DHFSPLUS_SUPER_MAGIC=0x482b -endif diff --git a/fs/aufs/module.c b/fs/aufs/module.c deleted file mode 100644 index 2c70dfffc..000000000 --- a/fs/aufs/module.c +++ /dev/null @@ -1,210 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * module global variables and operations - */ - -#include <linux/module.h> -#include <linux/seq_file.h> -#include "aufs.h" - -void *au_kzrealloc(void *p, unsigned int nused, unsigned int new_sz, gfp_t gfp) -{ - if (new_sz <= nused) - return p; - - p = krealloc(p, new_sz, gfp); - if (p) - memset(p + nused, 0, new_sz - nused); - return p; -} - -/* ---------------------------------------------------------------------- */ - -/* - * aufs caches - */ -struct kmem_cache *au_cachep[AuCache_Last]; -static int __init au_cache_init(void) -{ - au_cachep[AuCache_DINFO] = AuCacheCtor(au_dinfo, au_di_init_once); - if (au_cachep[AuCache_DINFO]) - /* SLAB_DESTROY_BY_RCU */ - au_cachep[AuCache_ICNTNR] = AuCacheCtor(au_icntnr, - au_icntnr_init_once); - if (au_cachep[AuCache_ICNTNR]) - au_cachep[AuCache_FINFO] = AuCacheCtor(au_finfo, - au_fi_init_once); - if (au_cachep[AuCache_FINFO]) - au_cachep[AuCache_VDIR] = AuCache(au_vdir); - if (au_cachep[AuCache_VDIR]) - au_cachep[AuCache_DEHSTR] = AuCache(au_vdir_dehstr); - if (au_cachep[AuCache_DEHSTR]) - return 0; - - return -ENOMEM; -} - -static void au_cache_fin(void) -{ - int i; - - /* - * Make sure all delayed rcu free inodes are flushed before we - * destroy cache. - */ - rcu_barrier(); - - /* excluding AuCache_HNOTIFY */ - BUILD_BUG_ON(AuCache_HNOTIFY + 1 != AuCache_Last); - for (i = 0; i < AuCache_HNOTIFY; i++) - if (au_cachep[i]) { - kmem_cache_destroy(au_cachep[i]); - au_cachep[i] = NULL; - } -} - -/* ---------------------------------------------------------------------- */ - -int au_dir_roflags; - -#ifdef CONFIG_AUFS_SBILIST -/* - * iterate_supers_type() doesn't protect us from - * remounting (branch management) - */ -struct au_splhead au_sbilist; -#endif - -struct lock_class_key au_lc_key[AuLcKey_Last]; - -/* - * functions for module interface. - */ -MODULE_LICENSE("GPL"); -/* MODULE_LICENSE("GPL v2"); */ -MODULE_AUTHOR("Junjiro R. Okajima <aufs-users@lists.sourceforge.net>"); -MODULE_DESCRIPTION(AUFS_NAME - " -- Advanced multi layered unification filesystem"); -MODULE_VERSION(AUFS_VERSION); -MODULE_ALIAS_FS(AUFS_NAME); - -/* this module parameter has no meaning when SYSFS is disabled */ -int sysaufs_brs = 1; -MODULE_PARM_DESC(brs, "use <sysfs>/fs/aufs/si_*/brN"); -module_param_named(brs, sysaufs_brs, int, S_IRUGO); - -/* this module parameter has no meaning when USER_NS is disabled */ -static bool au_userns; -MODULE_PARM_DESC(allow_userns, "allow unprivileged to mount under userns"); -module_param_named(allow_userns, au_userns, bool, S_IRUGO); - -/* ---------------------------------------------------------------------- */ - -static char au_esc_chars[0x20 + 3]; /* 0x01-0x20, backslash, del, and NULL */ - -int au_seq_path(struct seq_file *seq, struct path *path) -{ - return seq_path(seq, path, au_esc_chars); -} - -/* ---------------------------------------------------------------------- */ - -static int __init aufs_init(void) -{ - int err, i; - char *p; - - p = au_esc_chars; - for (i = 1; i <= ' '; i++) - *p++ = i; - *p++ = '\\'; - *p++ = '\x7f'; - *p = 0; - - au_dir_roflags = au_file_roflags(O_DIRECTORY | O_LARGEFILE); - - au_sbilist_init(); - sysaufs_brs_init(); - au_debug_init(); - au_dy_init(); - err = sysaufs_init(); - if (unlikely(err)) - goto out; - err = au_procfs_init(); - if (unlikely(err)) - goto out_sysaufs; - err = au_wkq_init(); - if (unlikely(err)) - goto out_procfs; - err = au_loopback_init(); - if (unlikely(err)) - goto out_wkq; - err = au_hnotify_init(); - if (unlikely(err)) - goto out_loopback; - err = au_sysrq_init(); - if (unlikely(err)) - goto out_hin; - err = au_cache_init(); - if (unlikely(err)) - goto out_sysrq; - - aufs_fs_type.fs_flags |= au_userns ? FS_USERNS_MOUNT : 0; - err = register_filesystem(&aufs_fs_type); - if (unlikely(err)) - goto out_cache; - - /* since we define pr_fmt, call printk directly */ - printk(KERN_INFO AUFS_NAME " " AUFS_VERSION "\n"); - goto out; /* success */ - -out_cache: - au_cache_fin(); -out_sysrq: - au_sysrq_fin(); -out_hin: - au_hnotify_fin(); -out_loopback: - au_loopback_fin(); -out_wkq: - au_wkq_fin(); -out_procfs: - au_procfs_fin(); -out_sysaufs: - sysaufs_fin(); - au_dy_fin(); -out: - return err; -} - -static void __exit aufs_exit(void) -{ - unregister_filesystem(&aufs_fs_type); - au_cache_fin(); - au_sysrq_fin(); - au_hnotify_fin(); - au_loopback_fin(); - au_wkq_fin(); - au_procfs_fin(); - sysaufs_fin(); - au_dy_fin(); -} - -module_init(aufs_init); -module_exit(aufs_exit); diff --git a/fs/aufs/module.h b/fs/aufs/module.h deleted file mode 100644 index c0a29d49d..000000000 --- a/fs/aufs/module.h +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * module initialization and module-global - */ - -#ifndef __AUFS_MODULE_H__ -#define __AUFS_MODULE_H__ - -#ifdef __KERNEL__ - -#include <linux/slab.h> - -struct path; -struct seq_file; - -/* module parameters */ -extern int sysaufs_brs; - -/* ---------------------------------------------------------------------- */ - -extern int au_dir_roflags; - -enum { - AuLcNonDir_FIINFO, - AuLcNonDir_DIINFO, - AuLcNonDir_IIINFO, - - AuLcDir_FIINFO, - AuLcDir_DIINFO, - AuLcDir_IIINFO, - - AuLcSymlink_DIINFO, - AuLcSymlink_IIINFO, - - AuLcKey_Last -}; -extern struct lock_class_key au_lc_key[AuLcKey_Last]; - -void *au_kzrealloc(void *p, unsigned int nused, unsigned int new_sz, gfp_t gfp); -int au_seq_path(struct seq_file *seq, struct path *path); - -#ifdef CONFIG_PROC_FS -/* procfs.c */ -int __init au_procfs_init(void); -void au_procfs_fin(void); -#else -AuStubInt0(au_procfs_init, void); -AuStubVoid(au_procfs_fin, void); -#endif - -/* ---------------------------------------------------------------------- */ - -/* kmem cache */ -enum { - AuCache_DINFO, - AuCache_ICNTNR, - AuCache_FINFO, - AuCache_VDIR, - AuCache_DEHSTR, - AuCache_HNOTIFY, /* must be last */ - AuCache_Last -}; - -#define AuCacheFlags (SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD) -#define AuCache(type) KMEM_CACHE(type, AuCacheFlags) -#define AuCacheCtor(type, ctor) \ - kmem_cache_create(#type, sizeof(struct type), \ - __alignof__(struct type), AuCacheFlags, ctor) - -extern struct kmem_cache *au_cachep[]; - -#define AuCacheFuncs(name, index) \ -static inline struct au_##name *au_cache_alloc_##name(void) \ -{ return kmem_cache_alloc(au_cachep[AuCache_##index], GFP_NOFS); } \ -static inline void au_cache_free_##name(struct au_##name *p) \ -{ kmem_cache_free(au_cachep[AuCache_##index], p); } - -AuCacheFuncs(dinfo, DINFO); -AuCacheFuncs(icntnr, ICNTNR); -AuCacheFuncs(finfo, FINFO); -AuCacheFuncs(vdir, VDIR); -AuCacheFuncs(vdir_dehstr, DEHSTR); -#ifdef CONFIG_AUFS_HNOTIFY -AuCacheFuncs(hnotify, HNOTIFY); -#endif - -#endif /* __KERNEL__ */ -#endif /* __AUFS_MODULE_H__ */ diff --git a/fs/aufs/mvdown.c b/fs/aufs/mvdown.c deleted file mode 100644 index 25e3a80b9..000000000 --- a/fs/aufs/mvdown.c +++ /dev/null @@ -1,694 +0,0 @@ -/* - * Copyright (C) 2011-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * move-down, opposite of copy-up - */ - -#include "aufs.h" - -struct au_mvd_args { - struct { - struct super_block *h_sb; - struct dentry *h_parent; - struct au_hinode *hdir; - struct inode *h_dir, *h_inode; - struct au_pin pin; - } info[AUFS_MVDOWN_NARRAY]; - - struct aufs_mvdown mvdown; - struct dentry *dentry, *parent; - struct inode *inode, *dir; - struct super_block *sb; - aufs_bindex_t bopq, bwh, bfound; - unsigned char rename_lock; -}; - -#define mvd_errno mvdown.au_errno -#define mvd_bsrc mvdown.stbr[AUFS_MVDOWN_UPPER].bindex -#define mvd_src_brid mvdown.stbr[AUFS_MVDOWN_UPPER].brid -#define mvd_bdst mvdown.stbr[AUFS_MVDOWN_LOWER].bindex -#define mvd_dst_brid mvdown.stbr[AUFS_MVDOWN_LOWER].brid - -#define mvd_h_src_sb info[AUFS_MVDOWN_UPPER].h_sb -#define mvd_h_src_parent info[AUFS_MVDOWN_UPPER].h_parent -#define mvd_hdir_src info[AUFS_MVDOWN_UPPER].hdir -#define mvd_h_src_dir info[AUFS_MVDOWN_UPPER].h_dir -#define mvd_h_src_inode info[AUFS_MVDOWN_UPPER].h_inode -#define mvd_pin_src info[AUFS_MVDOWN_UPPER].pin - -#define mvd_h_dst_sb info[AUFS_MVDOWN_LOWER].h_sb -#define mvd_h_dst_parent info[AUFS_MVDOWN_LOWER].h_parent -#define mvd_hdir_dst info[AUFS_MVDOWN_LOWER].hdir -#define mvd_h_dst_dir info[AUFS_MVDOWN_LOWER].h_dir -#define mvd_h_dst_inode info[AUFS_MVDOWN_LOWER].h_inode -#define mvd_pin_dst info[AUFS_MVDOWN_LOWER].pin - -#define AU_MVD_PR(flag, ...) do { \ - if (flag) \ - pr_err(__VA_ARGS__); \ - } while (0) - -static int find_lower_writable(struct au_mvd_args *a) -{ - struct super_block *sb; - aufs_bindex_t bindex, bend; - struct au_branch *br; - - sb = a->sb; - bindex = a->mvd_bsrc; - bend = au_sbend(sb); - if (a->mvdown.flags & AUFS_MVDOWN_FHSM_LOWER) - for (bindex++; bindex <= bend; bindex++) { - br = au_sbr(sb, bindex); - if (au_br_fhsm(br->br_perm) - && (!(au_br_sb(br)->s_flags & MS_RDONLY))) - return bindex; - } - else if (!(a->mvdown.flags & AUFS_MVDOWN_ROLOWER)) - for (bindex++; bindex <= bend; bindex++) { - br = au_sbr(sb, bindex); - if (!au_br_rdonly(br)) - return bindex; - } - else - for (bindex++; bindex <= bend; bindex++) { - br = au_sbr(sb, bindex); - if (!(au_br_sb(br)->s_flags & MS_RDONLY)) { - if (au_br_rdonly(br)) - a->mvdown.flags - |= AUFS_MVDOWN_ROLOWER_R; - return bindex; - } - } - - return -1; -} - -/* make the parent dir on bdst */ -static int au_do_mkdir(const unsigned char dmsg, struct au_mvd_args *a) -{ - int err; - - err = 0; - a->mvd_hdir_src = au_hi(a->dir, a->mvd_bsrc); - a->mvd_hdir_dst = au_hi(a->dir, a->mvd_bdst); - a->mvd_h_src_parent = au_h_dptr(a->parent, a->mvd_bsrc); - a->mvd_h_dst_parent = NULL; - if (au_dbend(a->parent) >= a->mvd_bdst) - a->mvd_h_dst_parent = au_h_dptr(a->parent, a->mvd_bdst); - if (!a->mvd_h_dst_parent) { - err = au_cpdown_dirs(a->dentry, a->mvd_bdst); - if (unlikely(err)) { - AU_MVD_PR(dmsg, "cpdown_dirs failed\n"); - goto out; - } - a->mvd_h_dst_parent = au_h_dptr(a->parent, a->mvd_bdst); - } - -out: - AuTraceErr(err); - return err; -} - -/* lock them all */ -static int au_do_lock(const unsigned char dmsg, struct au_mvd_args *a) -{ - int err; - struct dentry *h_trap; - - a->mvd_h_src_sb = au_sbr_sb(a->sb, a->mvd_bsrc); - a->mvd_h_dst_sb = au_sbr_sb(a->sb, a->mvd_bdst); - err = au_pin(&a->mvd_pin_dst, a->dentry, a->mvd_bdst, - au_opt_udba(a->sb), - AuPin_MNT_WRITE | AuPin_DI_LOCKED); - AuTraceErr(err); - if (unlikely(err)) { - AU_MVD_PR(dmsg, "pin_dst failed\n"); - goto out; - } - - if (a->mvd_h_src_sb != a->mvd_h_dst_sb) { - a->rename_lock = 0; - au_pin_init(&a->mvd_pin_src, a->dentry, a->mvd_bsrc, - AuLsc_DI_PARENT, AuLsc_I_PARENT3, - au_opt_udba(a->sb), - AuPin_MNT_WRITE | AuPin_DI_LOCKED); - err = au_do_pin(&a->mvd_pin_src); - AuTraceErr(err); - a->mvd_h_src_dir = d_inode(a->mvd_h_src_parent); - if (unlikely(err)) { - AU_MVD_PR(dmsg, "pin_src failed\n"); - goto out_dst; - } - goto out; /* success */ - } - - a->rename_lock = 1; - au_pin_hdir_unlock(&a->mvd_pin_dst); - err = au_pin(&a->mvd_pin_src, a->dentry, a->mvd_bsrc, - au_opt_udba(a->sb), - AuPin_MNT_WRITE | AuPin_DI_LOCKED); - AuTraceErr(err); - a->mvd_h_src_dir = d_inode(a->mvd_h_src_parent); - if (unlikely(err)) { - AU_MVD_PR(dmsg, "pin_src failed\n"); - au_pin_hdir_lock(&a->mvd_pin_dst); - goto out_dst; - } - au_pin_hdir_unlock(&a->mvd_pin_src); - h_trap = vfsub_lock_rename(a->mvd_h_src_parent, a->mvd_hdir_src, - a->mvd_h_dst_parent, a->mvd_hdir_dst); - if (h_trap) { - err = (h_trap != a->mvd_h_src_parent); - if (err) - err = (h_trap != a->mvd_h_dst_parent); - } - BUG_ON(err); /* it should never happen */ - if (unlikely(a->mvd_h_src_dir != au_pinned_h_dir(&a->mvd_pin_src))) { - err = -EBUSY; - AuTraceErr(err); - vfsub_unlock_rename(a->mvd_h_src_parent, a->mvd_hdir_src, - a->mvd_h_dst_parent, a->mvd_hdir_dst); - au_pin_hdir_lock(&a->mvd_pin_src); - au_unpin(&a->mvd_pin_src); - au_pin_hdir_lock(&a->mvd_pin_dst); - goto out_dst; - } - goto out; /* success */ - -out_dst: - au_unpin(&a->mvd_pin_dst); -out: - AuTraceErr(err); - return err; -} - -static void au_do_unlock(const unsigned char dmsg, struct au_mvd_args *a) -{ - if (!a->rename_lock) - au_unpin(&a->mvd_pin_src); - else { - vfsub_unlock_rename(a->mvd_h_src_parent, a->mvd_hdir_src, - a->mvd_h_dst_parent, a->mvd_hdir_dst); - au_pin_hdir_lock(&a->mvd_pin_src); - au_unpin(&a->mvd_pin_src); - au_pin_hdir_lock(&a->mvd_pin_dst); - } - au_unpin(&a->mvd_pin_dst); -} - -/* copy-down the file */ -static int au_do_cpdown(const unsigned char dmsg, struct au_mvd_args *a) -{ - int err; - struct au_cp_generic cpg = { - .dentry = a->dentry, - .bdst = a->mvd_bdst, - .bsrc = a->mvd_bsrc, - .len = -1, - .pin = &a->mvd_pin_dst, - .flags = AuCpup_DTIME | AuCpup_HOPEN - }; - - AuDbg("b%d, b%d\n", cpg.bsrc, cpg.bdst); - if (a->mvdown.flags & AUFS_MVDOWN_OWLOWER) - au_fset_cpup(cpg.flags, OVERWRITE); - if (a->mvdown.flags & AUFS_MVDOWN_ROLOWER) - au_fset_cpup(cpg.flags, RWDST); - err = au_sio_cpdown_simple(&cpg); - if (unlikely(err)) - AU_MVD_PR(dmsg, "cpdown failed\n"); - - AuTraceErr(err); - return err; -} - -/* - * unlink the whiteout on bdst if exist which may be created by UDBA while we - * were sleeping - */ -static int au_do_unlink_wh(const unsigned char dmsg, struct au_mvd_args *a) -{ - int err; - struct path h_path; - struct au_branch *br; - struct inode *delegated; - - br = au_sbr(a->sb, a->mvd_bdst); - h_path.dentry = au_wh_lkup(a->mvd_h_dst_parent, &a->dentry->d_name, br); - err = PTR_ERR(h_path.dentry); - if (IS_ERR(h_path.dentry)) { - AU_MVD_PR(dmsg, "wh_lkup failed\n"); - goto out; - } - - err = 0; - if (d_is_positive(h_path.dentry)) { - h_path.mnt = au_br_mnt(br); - delegated = NULL; - err = vfsub_unlink(d_inode(a->mvd_h_dst_parent), &h_path, - &delegated, /*force*/0); - if (unlikely(err == -EWOULDBLOCK)) { - pr_warn("cannot retry for NFSv4 delegation" - " for an internal unlink\n"); - iput(delegated); - } - if (unlikely(err)) - AU_MVD_PR(dmsg, "wh_unlink failed\n"); - } - dput(h_path.dentry); - -out: - AuTraceErr(err); - return err; -} - -/* - * unlink the topmost h_dentry - */ -static int au_do_unlink(const unsigned char dmsg, struct au_mvd_args *a) -{ - int err; - struct path h_path; - struct inode *delegated; - - h_path.mnt = au_sbr_mnt(a->sb, a->mvd_bsrc); - h_path.dentry = au_h_dptr(a->dentry, a->mvd_bsrc); - delegated = NULL; - err = vfsub_unlink(a->mvd_h_src_dir, &h_path, &delegated, /*force*/0); - if (unlikely(err == -EWOULDBLOCK)) { - pr_warn("cannot retry for NFSv4 delegation" - " for an internal unlink\n"); - iput(delegated); - } - if (unlikely(err)) - AU_MVD_PR(dmsg, "unlink failed\n"); - - AuTraceErr(err); - return err; -} - -/* Since mvdown succeeded, we ignore an error of this function */ -static void au_do_stfs(const unsigned char dmsg, struct au_mvd_args *a) -{ - int err; - struct au_branch *br; - - a->mvdown.flags |= AUFS_MVDOWN_STFS_FAILED; - br = au_sbr(a->sb, a->mvd_bsrc); - err = au_br_stfs(br, &a->mvdown.stbr[AUFS_MVDOWN_UPPER].stfs); - if (!err) { - br = au_sbr(a->sb, a->mvd_bdst); - a->mvdown.stbr[AUFS_MVDOWN_LOWER].brid = br->br_id; - err = au_br_stfs(br, &a->mvdown.stbr[AUFS_MVDOWN_LOWER].stfs); - } - if (!err) - a->mvdown.flags &= ~AUFS_MVDOWN_STFS_FAILED; - else - AU_MVD_PR(dmsg, "statfs failed (%d), ignored\n", err); -} - -/* - * copy-down the file and unlink the bsrc file. - * - unlink the bdst whout if exist - * - copy-down the file (with whtmp name and rename) - * - unlink the bsrc file - */ -static int au_do_mvdown(const unsigned char dmsg, struct au_mvd_args *a) -{ - int err; - - err = au_do_mkdir(dmsg, a); - if (!err) - err = au_do_lock(dmsg, a); - if (unlikely(err)) - goto out; - - /* - * do not revert the activities we made on bdst since they should be - * harmless in aufs. - */ - - err = au_do_cpdown(dmsg, a); - if (!err) - err = au_do_unlink_wh(dmsg, a); - if (!err && !(a->mvdown.flags & AUFS_MVDOWN_KUPPER)) - err = au_do_unlink(dmsg, a); - if (unlikely(err)) - goto out_unlock; - - AuDbg("%pd2, 0x%x, %d --> %d\n", - a->dentry, a->mvdown.flags, a->mvd_bsrc, a->mvd_bdst); - if (find_lower_writable(a) < 0) - a->mvdown.flags |= AUFS_MVDOWN_BOTTOM; - - if (a->mvdown.flags & AUFS_MVDOWN_STFS) - au_do_stfs(dmsg, a); - - /* maintain internal array */ - if (!(a->mvdown.flags & AUFS_MVDOWN_KUPPER)) { - au_set_h_dptr(a->dentry, a->mvd_bsrc, NULL); - au_set_dbstart(a->dentry, a->mvd_bdst); - au_set_h_iptr(a->inode, a->mvd_bsrc, NULL, /*flags*/0); - au_set_ibstart(a->inode, a->mvd_bdst); - } - if (au_dbend(a->dentry) < a->mvd_bdst) - au_set_dbend(a->dentry, a->mvd_bdst); - if (au_ibend(a->inode) < a->mvd_bdst) - au_set_ibend(a->inode, a->mvd_bdst); - -out_unlock: - au_do_unlock(dmsg, a); -out: - AuTraceErr(err); - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* make sure the file is idle */ -static int au_mvd_args_busy(const unsigned char dmsg, struct au_mvd_args *a) -{ - int err, plinked; - - err = 0; - plinked = !!au_opt_test(au_mntflags(a->sb), PLINK); - if (au_dbstart(a->dentry) == a->mvd_bsrc - && au_dcount(a->dentry) == 1 - && atomic_read(&a->inode->i_count) == 1 - /* && a->mvd_h_src_inode->i_nlink == 1 */ - && (!plinked || !au_plink_test(a->inode)) - && a->inode->i_nlink == 1) - goto out; - - err = -EBUSY; - AU_MVD_PR(dmsg, - "b%d, d{b%d, c%d?}, i{c%d?, l%u}, hi{l%u}, p{%d, %d}\n", - a->mvd_bsrc, au_dbstart(a->dentry), au_dcount(a->dentry), - atomic_read(&a->inode->i_count), a->inode->i_nlink, - a->mvd_h_src_inode->i_nlink, - plinked, plinked ? au_plink_test(a->inode) : 0); - -out: - AuTraceErr(err); - return err; -} - -/* make sure the parent dir is fine */ -static int au_mvd_args_parent(const unsigned char dmsg, - struct au_mvd_args *a) -{ - int err; - aufs_bindex_t bindex; - - err = 0; - if (unlikely(au_alive_dir(a->parent))) { - err = -ENOENT; - AU_MVD_PR(dmsg, "parent dir is dead\n"); - goto out; - } - - a->bopq = au_dbdiropq(a->parent); - bindex = au_wbr_nonopq(a->dentry, a->mvd_bdst); - AuDbg("b%d\n", bindex); - if (unlikely((bindex >= 0 && bindex < a->mvd_bdst) - || (a->bopq != -1 && a->bopq < a->mvd_bdst))) { - err = -EINVAL; - a->mvd_errno = EAU_MVDOWN_OPAQUE; - AU_MVD_PR(dmsg, "ancestor is opaque b%d, b%d\n", - a->bopq, a->mvd_bdst); - } - -out: - AuTraceErr(err); - return err; -} - -static int au_mvd_args_intermediate(const unsigned char dmsg, - struct au_mvd_args *a) -{ - int err; - struct au_dinfo *dinfo, *tmp; - - /* lookup the next lower positive entry */ - err = -ENOMEM; - tmp = au_di_alloc(a->sb, AuLsc_DI_TMP); - if (unlikely(!tmp)) - goto out; - - a->bfound = -1; - a->bwh = -1; - dinfo = au_di(a->dentry); - au_di_cp(tmp, dinfo); - au_di_swap(tmp, dinfo); - - /* returns the number of positive dentries */ - err = au_lkup_dentry(a->dentry, a->mvd_bsrc + 1, /*type*/0); - if (!err) - a->bwh = au_dbwh(a->dentry); - else if (err > 0) - a->bfound = au_dbstart(a->dentry); - - au_di_swap(tmp, dinfo); - au_rw_write_unlock(&tmp->di_rwsem); - au_di_free(tmp); - if (unlikely(err < 0)) - AU_MVD_PR(dmsg, "failed look-up lower\n"); - - /* - * here, we have these cases. - * bfound == -1 - * no positive dentry under bsrc. there are more sub-cases. - * bwh < 0 - * there no whiteout, we can safely move-down. - * bwh <= bsrc - * impossible - * bsrc < bwh && bwh < bdst - * there is a whiteout on RO branch. cannot proceed. - * bwh == bdst - * there is a whiteout on the RW target branch. it should - * be removed. - * bdst < bwh - * there is a whiteout somewhere unrelated branch. - * -1 < bfound && bfound <= bsrc - * impossible. - * bfound < bdst - * found, but it is on RO branch between bsrc and bdst. cannot - * proceed. - * bfound == bdst - * found, replace it if AUFS_MVDOWN_FORCE is set. otherwise return - * error. - * bdst < bfound - * found, after we create the file on bdst, it will be hidden. - */ - - AuDebugOn(a->bfound == -1 - && a->bwh != -1 - && a->bwh <= a->mvd_bsrc); - AuDebugOn(-1 < a->bfound - && a->bfound <= a->mvd_bsrc); - - err = -EINVAL; - if (a->bfound == -1 - && a->mvd_bsrc < a->bwh - && a->bwh != -1 - && a->bwh < a->mvd_bdst) { - a->mvd_errno = EAU_MVDOWN_WHITEOUT; - AU_MVD_PR(dmsg, "bsrc %d, bdst %d, bfound %d, bwh %d\n", - a->mvd_bsrc, a->mvd_bdst, a->bfound, a->bwh); - goto out; - } else if (a->bfound != -1 && a->bfound < a->mvd_bdst) { - a->mvd_errno = EAU_MVDOWN_UPPER; - AU_MVD_PR(dmsg, "bdst %d, bfound %d\n", - a->mvd_bdst, a->bfound); - goto out; - } - - err = 0; /* success */ - -out: - AuTraceErr(err); - return err; -} - -static int au_mvd_args_exist(const unsigned char dmsg, struct au_mvd_args *a) -{ - int err; - - err = 0; - if (!(a->mvdown.flags & AUFS_MVDOWN_OWLOWER) - && a->bfound == a->mvd_bdst) - err = -EEXIST; - AuTraceErr(err); - return err; -} - -static int au_mvd_args(const unsigned char dmsg, struct au_mvd_args *a) -{ - int err; - struct au_branch *br; - - err = -EISDIR; - if (unlikely(S_ISDIR(a->inode->i_mode))) - goto out; - - err = -EINVAL; - if (!(a->mvdown.flags & AUFS_MVDOWN_BRID_UPPER)) - a->mvd_bsrc = au_ibstart(a->inode); - else { - a->mvd_bsrc = au_br_index(a->sb, a->mvd_src_brid); - if (unlikely(a->mvd_bsrc < 0 - || (a->mvd_bsrc < au_dbstart(a->dentry) - || au_dbend(a->dentry) < a->mvd_bsrc - || !au_h_dptr(a->dentry, a->mvd_bsrc)) - || (a->mvd_bsrc < au_ibstart(a->inode) - || au_ibend(a->inode) < a->mvd_bsrc - || !au_h_iptr(a->inode, a->mvd_bsrc)))) { - a->mvd_errno = EAU_MVDOWN_NOUPPER; - AU_MVD_PR(dmsg, "no upper\n"); - goto out; - } - } - if (unlikely(a->mvd_bsrc == au_sbend(a->sb))) { - a->mvd_errno = EAU_MVDOWN_BOTTOM; - AU_MVD_PR(dmsg, "on the bottom\n"); - goto out; - } - a->mvd_h_src_inode = au_h_iptr(a->inode, a->mvd_bsrc); - br = au_sbr(a->sb, a->mvd_bsrc); - err = au_br_rdonly(br); - if (!(a->mvdown.flags & AUFS_MVDOWN_ROUPPER)) { - if (unlikely(err)) - goto out; - } else if (!(vfsub_native_ro(a->mvd_h_src_inode) - || IS_APPEND(a->mvd_h_src_inode))) { - if (err) - a->mvdown.flags |= AUFS_MVDOWN_ROUPPER_R; - /* go on */ - } else - goto out; - - err = -EINVAL; - if (!(a->mvdown.flags & AUFS_MVDOWN_BRID_LOWER)) { - a->mvd_bdst = find_lower_writable(a); - if (unlikely(a->mvd_bdst < 0)) { - a->mvd_errno = EAU_MVDOWN_BOTTOM; - AU_MVD_PR(dmsg, "no writable lower branch\n"); - goto out; - } - } else { - a->mvd_bdst = au_br_index(a->sb, a->mvd_dst_brid); - if (unlikely(a->mvd_bdst < 0 - || au_sbend(a->sb) < a->mvd_bdst)) { - a->mvd_errno = EAU_MVDOWN_NOLOWERBR; - AU_MVD_PR(dmsg, "no lower brid\n"); - goto out; - } - } - - err = au_mvd_args_busy(dmsg, a); - if (!err) - err = au_mvd_args_parent(dmsg, a); - if (!err) - err = au_mvd_args_intermediate(dmsg, a); - if (!err) - err = au_mvd_args_exist(dmsg, a); - if (!err) - AuDbg("b%d, b%d\n", a->mvd_bsrc, a->mvd_bdst); - -out: - AuTraceErr(err); - return err; -} - -int au_mvdown(struct dentry *dentry, struct aufs_mvdown __user *uarg) -{ - int err, e; - unsigned char dmsg; - struct au_mvd_args *args; - - err = -EPERM; - if (unlikely(!capable(CAP_SYS_ADMIN))) - goto out; - - err = -ENOMEM; - args = kmalloc(sizeof(*args), GFP_NOFS); - if (unlikely(!args)) - goto out; - - err = copy_from_user(&args->mvdown, uarg, sizeof(args->mvdown)); - if (!err) - err = !access_ok(VERIFY_WRITE, uarg, sizeof(*uarg)); - if (unlikely(err)) { - err = -EFAULT; - AuTraceErr(err); - goto out_free; - } - AuDbg("flags 0x%x\n", args->mvdown.flags); - args->mvdown.flags &= ~(AUFS_MVDOWN_ROLOWER_R | AUFS_MVDOWN_ROUPPER_R); - args->mvdown.au_errno = 0; - args->dentry = dentry; - args->inode = d_inode(dentry); - args->sb = dentry->d_sb; - - err = -ENOENT; - dmsg = !!(args->mvdown.flags & AUFS_MVDOWN_DMSG); - args->parent = dget_parent(dentry); - args->dir = d_inode(args->parent); - mutex_lock_nested(&args->dir->i_mutex, I_MUTEX_PARENT); - dput(args->parent); - if (unlikely(args->parent != dentry->d_parent)) { - AU_MVD_PR(dmsg, "parent dir is moved\n"); - goto out_dir; - } - - mutex_lock_nested(&args->inode->i_mutex, I_MUTEX_CHILD); - err = aufs_read_lock(dentry, AuLock_DW | AuLock_FLUSH); - if (unlikely(err)) - goto out_inode; - - di_write_lock_parent(args->parent); - err = au_mvd_args(dmsg, args); - if (unlikely(err)) - goto out_parent; - - err = au_do_mvdown(dmsg, args); - if (unlikely(err)) - goto out_parent; - - au_cpup_attr_timesizes(args->dir); - au_cpup_attr_timesizes(args->inode); - au_cpup_igen(args->inode, au_h_iptr(args->inode, args->mvd_bdst)); - /* au_digen_dec(dentry); */ - -out_parent: - di_write_unlock(args->parent); - aufs_read_unlock(dentry, AuLock_DW); -out_inode: - mutex_unlock(&args->inode->i_mutex); -out_dir: - mutex_unlock(&args->dir->i_mutex); -out_free: - e = copy_to_user(uarg, &args->mvdown, sizeof(args->mvdown)); - if (unlikely(e)) - err = -EFAULT; - kfree(args); -out: - AuTraceErr(err); - return err; -} diff --git a/fs/aufs/opts.c b/fs/aufs/opts.c deleted file mode 100644 index 4e8e2e1ff..000000000 --- a/fs/aufs/opts.c +++ /dev/null @@ -1,1835 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * mount options/flags - */ - -#include <linux/namei.h> -#include <linux/types.h> /* a distribution requires */ -#include <linux/parser.h> -#include "aufs.h" - -/* ---------------------------------------------------------------------- */ - -enum { - Opt_br, - Opt_add, Opt_del, Opt_mod, Opt_append, Opt_prepend, - Opt_idel, Opt_imod, - Opt_dirwh, Opt_rdcache, Opt_rdblk, Opt_rdhash, - Opt_rdblk_def, Opt_rdhash_def, - Opt_xino, Opt_noxino, - Opt_trunc_xino, Opt_trunc_xino_v, Opt_notrunc_xino, - Opt_trunc_xino_path, Opt_itrunc_xino, - Opt_trunc_xib, Opt_notrunc_xib, - Opt_shwh, Opt_noshwh, - Opt_plink, Opt_noplink, Opt_list_plink, - Opt_udba, - Opt_dio, Opt_nodio, - Opt_diropq_a, Opt_diropq_w, - Opt_warn_perm, Opt_nowarn_perm, - Opt_wbr_copyup, Opt_wbr_create, - Opt_fhsm_sec, - Opt_verbose, Opt_noverbose, - Opt_sum, Opt_nosum, Opt_wsum, - Opt_dirperm1, Opt_nodirperm1, - Opt_acl, Opt_noacl, - Opt_tail, Opt_ignore, Opt_ignore_silent, Opt_err -}; - -static match_table_t options = { - {Opt_br, "br=%s"}, - {Opt_br, "br:%s"}, - - {Opt_add, "add=%d:%s"}, - {Opt_add, "add:%d:%s"}, - {Opt_add, "ins=%d:%s"}, - {Opt_add, "ins:%d:%s"}, - {Opt_append, "append=%s"}, - {Opt_append, "append:%s"}, - {Opt_prepend, "prepend=%s"}, - {Opt_prepend, "prepend:%s"}, - - {Opt_del, "del=%s"}, - {Opt_del, "del:%s"}, - /* {Opt_idel, "idel:%d"}, */ - {Opt_mod, "mod=%s"}, - {Opt_mod, "mod:%s"}, - /* {Opt_imod, "imod:%d:%s"}, */ - - {Opt_dirwh, "dirwh=%d"}, - - {Opt_xino, "xino=%s"}, - {Opt_noxino, "noxino"}, - {Opt_trunc_xino, "trunc_xino"}, - {Opt_trunc_xino_v, "trunc_xino_v=%d:%d"}, - {Opt_notrunc_xino, "notrunc_xino"}, - {Opt_trunc_xino_path, "trunc_xino=%s"}, - {Opt_itrunc_xino, "itrunc_xino=%d"}, - /* {Opt_zxino, "zxino=%s"}, */ - {Opt_trunc_xib, "trunc_xib"}, - {Opt_notrunc_xib, "notrunc_xib"}, - -#ifdef CONFIG_PROC_FS - {Opt_plink, "plink"}, -#else - {Opt_ignore_silent, "plink"}, -#endif - - {Opt_noplink, "noplink"}, - -#ifdef CONFIG_AUFS_DEBUG - {Opt_list_plink, "list_plink"}, -#endif - - {Opt_udba, "udba=%s"}, - - {Opt_dio, "dio"}, - {Opt_nodio, "nodio"}, - -#ifdef CONFIG_AUFS_FHSM - {Opt_fhsm_sec, "fhsm_sec=%d"}, -#else - {Opt_ignore_silent, "fhsm_sec=%d"}, -#endif - - {Opt_diropq_a, "diropq=always"}, - {Opt_diropq_a, "diropq=a"}, - {Opt_diropq_w, "diropq=whiteouted"}, - {Opt_diropq_w, "diropq=w"}, - - {Opt_warn_perm, "warn_perm"}, - {Opt_nowarn_perm, "nowarn_perm"}, - - /* keep them temporary */ - {Opt_ignore_silent, "nodlgt"}, - {Opt_ignore_silent, "clean_plink"}, - -#ifdef CONFIG_AUFS_SHWH - {Opt_shwh, "shwh"}, -#endif - {Opt_noshwh, "noshwh"}, - - {Opt_dirperm1, "dirperm1"}, - {Opt_nodirperm1, "nodirperm1"}, - - {Opt_verbose, "verbose"}, - {Opt_verbose, "v"}, - {Opt_noverbose, "noverbose"}, - {Opt_noverbose, "quiet"}, - {Opt_noverbose, "q"}, - {Opt_noverbose, "silent"}, - - {Opt_sum, "sum"}, - {Opt_nosum, "nosum"}, - {Opt_wsum, "wsum"}, - - {Opt_rdcache, "rdcache=%d"}, - {Opt_rdblk, "rdblk=%d"}, - {Opt_rdblk_def, "rdblk=def"}, - {Opt_rdhash, "rdhash=%d"}, - {Opt_rdhash_def, "rdhash=def"}, - - {Opt_wbr_create, "create=%s"}, - {Opt_wbr_create, "create_policy=%s"}, - {Opt_wbr_copyup, "cpup=%s"}, - {Opt_wbr_copyup, "copyup=%s"}, - {Opt_wbr_copyup, "copyup_policy=%s"}, - - /* generic VFS flag */ -#ifdef CONFIG_FS_POSIX_ACL - {Opt_acl, "acl"}, - {Opt_noacl, "noacl"}, -#else - {Opt_ignore_silent, "acl"}, - {Opt_ignore_silent, "noacl"}, -#endif - - /* internal use for the scripts */ - {Opt_ignore_silent, "si=%s"}, - - {Opt_br, "dirs=%s"}, - {Opt_ignore, "debug=%d"}, - {Opt_ignore, "delete=whiteout"}, - {Opt_ignore, "delete=all"}, - {Opt_ignore, "imap=%s"}, - - /* temporary workaround, due to old mount(8)? */ - {Opt_ignore_silent, "relatime"}, - - {Opt_err, NULL} -}; - -/* ---------------------------------------------------------------------- */ - -static const char *au_parser_pattern(int val, match_table_t tbl) -{ - struct match_token *p; - - p = tbl; - while (p->pattern) { - if (p->token == val) - return p->pattern; - p++; - } - BUG(); - return "??"; -} - -static const char *au_optstr(int *val, match_table_t tbl) -{ - struct match_token *p; - int v; - - v = *val; - if (!v) - goto out; - p = tbl; - while (p->pattern) { - if (p->token - && (v & p->token) == p->token) { - *val &= ~p->token; - return p->pattern; - } - p++; - } - -out: - return NULL; -} - -/* ---------------------------------------------------------------------- */ - -static match_table_t brperm = { - {AuBrPerm_RO, AUFS_BRPERM_RO}, - {AuBrPerm_RR, AUFS_BRPERM_RR}, - {AuBrPerm_RW, AUFS_BRPERM_RW}, - {0, NULL} -}; - -static match_table_t brattr = { - /* general */ - {AuBrAttr_COO_REG, AUFS_BRATTR_COO_REG}, - {AuBrAttr_COO_ALL, AUFS_BRATTR_COO_ALL}, - /* 'unpin' attrib is meaningless since linux-3.18-rc1 */ - {AuBrAttr_UNPIN, AUFS_BRATTR_UNPIN}, -#ifdef CONFIG_AUFS_FHSM - {AuBrAttr_FHSM, AUFS_BRATTR_FHSM}, -#endif -#ifdef CONFIG_AUFS_XATTR - {AuBrAttr_ICEX, AUFS_BRATTR_ICEX}, - {AuBrAttr_ICEX_SEC, AUFS_BRATTR_ICEX_SEC}, - {AuBrAttr_ICEX_SYS, AUFS_BRATTR_ICEX_SYS}, - {AuBrAttr_ICEX_TR, AUFS_BRATTR_ICEX_TR}, - {AuBrAttr_ICEX_USR, AUFS_BRATTR_ICEX_USR}, - {AuBrAttr_ICEX_OTH, AUFS_BRATTR_ICEX_OTH}, -#endif - - /* ro/rr branch */ - {AuBrRAttr_WH, AUFS_BRRATTR_WH}, - - /* rw branch */ - {AuBrWAttr_MOO, AUFS_BRWATTR_MOO}, - {AuBrWAttr_NoLinkWH, AUFS_BRWATTR_NLWH}, - - {0, NULL} -}; - -static int br_attr_val(char *str, match_table_t table, substring_t args[]) -{ - int attr, v; - char *p; - - attr = 0; - do { - p = strchr(str, '+'); - if (p) - *p = 0; - v = match_token(str, table, args); - if (v) { - if (v & AuBrAttr_CMOO_Mask) - attr &= ~AuBrAttr_CMOO_Mask; - attr |= v; - } else { - if (p) - *p = '+'; - pr_warn("ignored branch attribute %s\n", str); - break; - } - if (p) - str = p + 1; - } while (p); - - return attr; -} - -static int au_do_optstr_br_attr(au_br_perm_str_t *str, int perm) -{ - int sz; - const char *p; - char *q; - - q = str->a; - *q = 0; - p = au_optstr(&perm, brattr); - if (p) { - sz = strlen(p); - memcpy(q, p, sz + 1); - q += sz; - } else - goto out; - - do { - p = au_optstr(&perm, brattr); - if (p) { - *q++ = '+'; - sz = strlen(p); - memcpy(q, p, sz + 1); - q += sz; - } - } while (p); - -out: - return q - str->a; -} - -static int noinline_for_stack br_perm_val(char *perm) -{ - int val, bad, sz; - char *p; - substring_t args[MAX_OPT_ARGS]; - au_br_perm_str_t attr; - - p = strchr(perm, '+'); - if (p) - *p = 0; - val = match_token(perm, brperm, args); - if (!val) { - if (p) - *p = '+'; - pr_warn("ignored branch permission %s\n", perm); - val = AuBrPerm_RO; - goto out; - } - if (!p) - goto out; - - val |= br_attr_val(p + 1, brattr, args); - - bad = 0; - switch (val & AuBrPerm_Mask) { - case AuBrPerm_RO: - case AuBrPerm_RR: - bad = val & AuBrWAttr_Mask; - val &= ~AuBrWAttr_Mask; - break; - case AuBrPerm_RW: - bad = val & AuBrRAttr_Mask; - val &= ~AuBrRAttr_Mask; - break; - } - - /* - * 'unpin' attrib becomes meaningless since linux-3.18-rc1, but aufs - * does not treat it as an error, just warning. - * this is a tiny guard for the user operation. - */ - if (val & AuBrAttr_UNPIN) { - bad |= AuBrAttr_UNPIN; - val &= ~AuBrAttr_UNPIN; - } - - if (unlikely(bad)) { - sz = au_do_optstr_br_attr(&attr, bad); - AuDebugOn(!sz); - pr_warn("ignored branch attribute %s\n", attr.a); - } - -out: - return val; -} - -void au_optstr_br_perm(au_br_perm_str_t *str, int perm) -{ - au_br_perm_str_t attr; - const char *p; - char *q; - int sz; - - q = str->a; - p = au_optstr(&perm, brperm); - AuDebugOn(!p || !*p); - sz = strlen(p); - memcpy(q, p, sz + 1); - q += sz; - - sz = au_do_optstr_br_attr(&attr, perm); - if (sz) { - *q++ = '+'; - memcpy(q, attr.a, sz + 1); - } - - AuDebugOn(strlen(str->a) >= sizeof(str->a)); -} - -/* ---------------------------------------------------------------------- */ - -static match_table_t udbalevel = { - {AuOpt_UDBA_REVAL, "reval"}, - {AuOpt_UDBA_NONE, "none"}, -#ifdef CONFIG_AUFS_HNOTIFY - {AuOpt_UDBA_HNOTIFY, "notify"}, /* abstraction */ -#ifdef CONFIG_AUFS_HFSNOTIFY - {AuOpt_UDBA_HNOTIFY, "fsnotify"}, -#endif -#endif - {-1, NULL} -}; - -static int noinline_for_stack udba_val(char *str) -{ - substring_t args[MAX_OPT_ARGS]; - - return match_token(str, udbalevel, args); -} - -const char *au_optstr_udba(int udba) -{ - return au_parser_pattern(udba, udbalevel); -} - -/* ---------------------------------------------------------------------- */ - -static match_table_t au_wbr_create_policy = { - {AuWbrCreate_TDP, "tdp"}, - {AuWbrCreate_TDP, "top-down-parent"}, - {AuWbrCreate_RR, "rr"}, - {AuWbrCreate_RR, "round-robin"}, - {AuWbrCreate_MFS, "mfs"}, - {AuWbrCreate_MFS, "most-free-space"}, - {AuWbrCreate_MFSV, "mfs:%d"}, - {AuWbrCreate_MFSV, "most-free-space:%d"}, - - {AuWbrCreate_MFSRR, "mfsrr:%d"}, - {AuWbrCreate_MFSRRV, "mfsrr:%d:%d"}, - {AuWbrCreate_PMFS, "pmfs"}, - {AuWbrCreate_PMFSV, "pmfs:%d"}, - {AuWbrCreate_PMFSRR, "pmfsrr:%d"}, - {AuWbrCreate_PMFSRRV, "pmfsrr:%d:%d"}, - - {-1, NULL} -}; - -/* - * cf. linux/lib/parser.c and cmdline.c - * gave up calling memparse() since it uses simple_strtoull() instead of - * kstrto...(). - */ -static int noinline_for_stack -au_match_ull(substring_t *s, unsigned long long *result) -{ - int err; - unsigned int len; - char a[32]; - - err = -ERANGE; - len = s->to - s->from; - if (len + 1 <= sizeof(a)) { - memcpy(a, s->from, len); - a[len] = '\0'; - err = kstrtoull(a, 0, result); - } - return err; -} - -static int au_wbr_mfs_wmark(substring_t *arg, char *str, - struct au_opt_wbr_create *create) -{ - int err; - unsigned long long ull; - - err = 0; - if (!au_match_ull(arg, &ull)) - create->mfsrr_watermark = ull; - else { - pr_err("bad integer in %s\n", str); - err = -EINVAL; - } - - return err; -} - -static int au_wbr_mfs_sec(substring_t *arg, char *str, - struct au_opt_wbr_create *create) -{ - int n, err; - - err = 0; - if (!match_int(arg, &n) && 0 <= n && n <= AUFS_MFS_MAX_SEC) - create->mfs_second = n; - else { - pr_err("bad integer in %s\n", str); - err = -EINVAL; - } - - return err; -} - -static int noinline_for_stack -au_wbr_create_val(char *str, struct au_opt_wbr_create *create) -{ - int err, e; - substring_t args[MAX_OPT_ARGS]; - - err = match_token(str, au_wbr_create_policy, args); - create->wbr_create = err; - switch (err) { - case AuWbrCreate_MFSRRV: - case AuWbrCreate_PMFSRRV: - e = au_wbr_mfs_wmark(&args[0], str, create); - if (!e) - e = au_wbr_mfs_sec(&args[1], str, create); - if (unlikely(e)) - err = e; - break; - case AuWbrCreate_MFSRR: - case AuWbrCreate_PMFSRR: - e = au_wbr_mfs_wmark(&args[0], str, create); - if (unlikely(e)) { - err = e; - break; - } - /*FALLTHROUGH*/ - case AuWbrCreate_MFS: - case AuWbrCreate_PMFS: - create->mfs_second = AUFS_MFS_DEF_SEC; - break; - case AuWbrCreate_MFSV: - case AuWbrCreate_PMFSV: - e = au_wbr_mfs_sec(&args[0], str, create); - if (unlikely(e)) - err = e; - break; - } - - return err; -} - -const char *au_optstr_wbr_create(int wbr_create) -{ - return au_parser_pattern(wbr_create, au_wbr_create_policy); -} - -static match_table_t au_wbr_copyup_policy = { - {AuWbrCopyup_TDP, "tdp"}, - {AuWbrCopyup_TDP, "top-down-parent"}, - {AuWbrCopyup_BUP, "bup"}, - {AuWbrCopyup_BUP, "bottom-up-parent"}, - {AuWbrCopyup_BU, "bu"}, - {AuWbrCopyup_BU, "bottom-up"}, - {-1, NULL} -}; - -static int noinline_for_stack au_wbr_copyup_val(char *str) -{ - substring_t args[MAX_OPT_ARGS]; - - return match_token(str, au_wbr_copyup_policy, args); -} - -const char *au_optstr_wbr_copyup(int wbr_copyup) -{ - return au_parser_pattern(wbr_copyup, au_wbr_copyup_policy); -} - -/* ---------------------------------------------------------------------- */ - -static const int lkup_dirflags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY; - -static void dump_opts(struct au_opts *opts) -{ -#ifdef CONFIG_AUFS_DEBUG - /* reduce stack space */ - union { - struct au_opt_add *add; - struct au_opt_del *del; - struct au_opt_mod *mod; - struct au_opt_xino *xino; - struct au_opt_xino_itrunc *xino_itrunc; - struct au_opt_wbr_create *create; - } u; - struct au_opt *opt; - - opt = opts->opt; - while (opt->type != Opt_tail) { - switch (opt->type) { - case Opt_add: - u.add = &opt->add; - AuDbg("add {b%d, %s, 0x%x, %p}\n", - u.add->bindex, u.add->pathname, u.add->perm, - u.add->path.dentry); - break; - case Opt_del: - case Opt_idel: - u.del = &opt->del; - AuDbg("del {%s, %p}\n", - u.del->pathname, u.del->h_path.dentry); - break; - case Opt_mod: - case Opt_imod: - u.mod = &opt->mod; - AuDbg("mod {%s, 0x%x, %p}\n", - u.mod->path, u.mod->perm, u.mod->h_root); - break; - case Opt_append: - u.add = &opt->add; - AuDbg("append {b%d, %s, 0x%x, %p}\n", - u.add->bindex, u.add->pathname, u.add->perm, - u.add->path.dentry); - break; - case Opt_prepend: - u.add = &opt->add; - AuDbg("prepend {b%d, %s, 0x%x, %p}\n", - u.add->bindex, u.add->pathname, u.add->perm, - u.add->path.dentry); - break; - case Opt_dirwh: - AuDbg("dirwh %d\n", opt->dirwh); - break; - case Opt_rdcache: - AuDbg("rdcache %d\n", opt->rdcache); - break; - case Opt_rdblk: - AuDbg("rdblk %u\n", opt->rdblk); - break; - case Opt_rdblk_def: - AuDbg("rdblk_def\n"); - break; - case Opt_rdhash: - AuDbg("rdhash %u\n", opt->rdhash); - break; - case Opt_rdhash_def: - AuDbg("rdhash_def\n"); - break; - case Opt_xino: - u.xino = &opt->xino; - AuDbg("xino {%s %pD}\n", u.xino->path, u.xino->file); - break; - case Opt_trunc_xino: - AuLabel(trunc_xino); - break; - case Opt_notrunc_xino: - AuLabel(notrunc_xino); - break; - case Opt_trunc_xino_path: - case Opt_itrunc_xino: - u.xino_itrunc = &opt->xino_itrunc; - AuDbg("trunc_xino %d\n", u.xino_itrunc->bindex); - break; - case Opt_noxino: - AuLabel(noxino); - break; - case Opt_trunc_xib: - AuLabel(trunc_xib); - break; - case Opt_notrunc_xib: - AuLabel(notrunc_xib); - break; - case Opt_shwh: - AuLabel(shwh); - break; - case Opt_noshwh: - AuLabel(noshwh); - break; - case Opt_dirperm1: - AuLabel(dirperm1); - break; - case Opt_nodirperm1: - AuLabel(nodirperm1); - break; - case Opt_plink: - AuLabel(plink); - break; - case Opt_noplink: - AuLabel(noplink); - break; - case Opt_list_plink: - AuLabel(list_plink); - break; - case Opt_udba: - AuDbg("udba %d, %s\n", - opt->udba, au_optstr_udba(opt->udba)); - break; - case Opt_dio: - AuLabel(dio); - break; - case Opt_nodio: - AuLabel(nodio); - break; - case Opt_diropq_a: - AuLabel(diropq_a); - break; - case Opt_diropq_w: - AuLabel(diropq_w); - break; - case Opt_warn_perm: - AuLabel(warn_perm); - break; - case Opt_nowarn_perm: - AuLabel(nowarn_perm); - break; - case Opt_verbose: - AuLabel(verbose); - break; - case Opt_noverbose: - AuLabel(noverbose); - break; - case Opt_sum: - AuLabel(sum); - break; - case Opt_nosum: - AuLabel(nosum); - break; - case Opt_wsum: - AuLabel(wsum); - break; - case Opt_wbr_create: - u.create = &opt->wbr_create; - AuDbg("create %d, %s\n", u.create->wbr_create, - au_optstr_wbr_create(u.create->wbr_create)); - switch (u.create->wbr_create) { - case AuWbrCreate_MFSV: - case AuWbrCreate_PMFSV: - AuDbg("%d sec\n", u.create->mfs_second); - break; - case AuWbrCreate_MFSRR: - AuDbg("%llu watermark\n", - u.create->mfsrr_watermark); - break; - case AuWbrCreate_MFSRRV: - case AuWbrCreate_PMFSRRV: - AuDbg("%llu watermark, %d sec\n", - u.create->mfsrr_watermark, - u.create->mfs_second); - break; - } - break; - case Opt_wbr_copyup: - AuDbg("copyup %d, %s\n", opt->wbr_copyup, - au_optstr_wbr_copyup(opt->wbr_copyup)); - break; - case Opt_fhsm_sec: - AuDbg("fhsm_sec %u\n", opt->fhsm_second); - break; - case Opt_acl: - AuLabel(acl); - break; - case Opt_noacl: - AuLabel(noacl); - break; - default: - BUG(); - } - opt++; - } -#endif -} - -void au_opts_free(struct au_opts *opts) -{ - struct au_opt *opt; - - opt = opts->opt; - while (opt->type != Opt_tail) { - switch (opt->type) { - case Opt_add: - case Opt_append: - case Opt_prepend: - path_put(&opt->add.path); - break; - case Opt_del: - case Opt_idel: - path_put(&opt->del.h_path); - break; - case Opt_mod: - case Opt_imod: - dput(opt->mod.h_root); - break; - case Opt_xino: - fput(opt->xino.file); - break; - } - opt++; - } -} - -static int opt_add(struct au_opt *opt, char *opt_str, unsigned long sb_flags, - aufs_bindex_t bindex) -{ - int err; - struct au_opt_add *add = &opt->add; - char *p; - - add->bindex = bindex; - add->perm = AuBrPerm_RO; - add->pathname = opt_str; - p = strchr(opt_str, '='); - if (p) { - *p++ = 0; - if (*p) - add->perm = br_perm_val(p); - } - - err = vfsub_kern_path(add->pathname, lkup_dirflags, &add->path); - if (!err) { - if (!p) { - add->perm = AuBrPerm_RO; - if (au_test_fs_rr(add->path.dentry->d_sb)) - add->perm = AuBrPerm_RR; - else if (!bindex && !(sb_flags & MS_RDONLY)) - add->perm = AuBrPerm_RW; - } - opt->type = Opt_add; - goto out; - } - pr_err("lookup failed %s (%d)\n", add->pathname, err); - err = -EINVAL; - -out: - return err; -} - -static int au_opts_parse_del(struct au_opt_del *del, substring_t args[]) -{ - int err; - - del->pathname = args[0].from; - AuDbg("del path %s\n", del->pathname); - - err = vfsub_kern_path(del->pathname, lkup_dirflags, &del->h_path); - if (unlikely(err)) - pr_err("lookup failed %s (%d)\n", del->pathname, err); - - return err; -} - -#if 0 /* reserved for future use */ -static int au_opts_parse_idel(struct super_block *sb, aufs_bindex_t bindex, - struct au_opt_del *del, substring_t args[]) -{ - int err; - struct dentry *root; - - err = -EINVAL; - root = sb->s_root; - aufs_read_lock(root, AuLock_FLUSH); - if (bindex < 0 || au_sbend(sb) < bindex) { - pr_err("out of bounds, %d\n", bindex); - goto out; - } - - err = 0; - del->h_path.dentry = dget(au_h_dptr(root, bindex)); - del->h_path.mnt = mntget(au_sbr_mnt(sb, bindex)); - -out: - aufs_read_unlock(root, !AuLock_IR); - return err; -} -#endif - -static int noinline_for_stack -au_opts_parse_mod(struct au_opt_mod *mod, substring_t args[]) -{ - int err; - struct path path; - char *p; - - err = -EINVAL; - mod->path = args[0].from; - p = strchr(mod->path, '='); - if (unlikely(!p)) { - pr_err("no permssion %s\n", args[0].from); - goto out; - } - - *p++ = 0; - err = vfsub_kern_path(mod->path, lkup_dirflags, &path); - if (unlikely(err)) { - pr_err("lookup failed %s (%d)\n", mod->path, err); - goto out; - } - - mod->perm = br_perm_val(p); - AuDbg("mod path %s, perm 0x%x, %s\n", mod->path, mod->perm, p); - mod->h_root = dget(path.dentry); - path_put(&path); - -out: - return err; -} - -#if 0 /* reserved for future use */ -static int au_opts_parse_imod(struct super_block *sb, aufs_bindex_t bindex, - struct au_opt_mod *mod, substring_t args[]) -{ - int err; - struct dentry *root; - - err = -EINVAL; - root = sb->s_root; - aufs_read_lock(root, AuLock_FLUSH); - if (bindex < 0 || au_sbend(sb) < bindex) { - pr_err("out of bounds, %d\n", bindex); - goto out; - } - - err = 0; - mod->perm = br_perm_val(args[1].from); - AuDbg("mod path %s, perm 0x%x, %s\n", - mod->path, mod->perm, args[1].from); - mod->h_root = dget(au_h_dptr(root, bindex)); - -out: - aufs_read_unlock(root, !AuLock_IR); - return err; -} -#endif - -static int au_opts_parse_xino(struct super_block *sb, struct au_opt_xino *xino, - substring_t args[]) -{ - int err; - struct file *file; - - file = au_xino_create(sb, args[0].from, /*silent*/0); - err = PTR_ERR(file); - if (IS_ERR(file)) - goto out; - - err = -EINVAL; - if (unlikely(file->f_path.dentry->d_sb == sb)) { - fput(file); - pr_err("%s must be outside\n", args[0].from); - goto out; - } - - err = 0; - xino->file = file; - xino->path = args[0].from; - -out: - return err; -} - -static int noinline_for_stack -au_opts_parse_xino_itrunc_path(struct super_block *sb, - struct au_opt_xino_itrunc *xino_itrunc, - substring_t args[]) -{ - int err; - aufs_bindex_t bend, bindex; - struct path path; - struct dentry *root; - - err = vfsub_kern_path(args[0].from, lkup_dirflags, &path); - if (unlikely(err)) { - pr_err("lookup failed %s (%d)\n", args[0].from, err); - goto out; - } - - xino_itrunc->bindex = -1; - root = sb->s_root; - aufs_read_lock(root, AuLock_FLUSH); - bend = au_sbend(sb); - for (bindex = 0; bindex <= bend; bindex++) { - if (au_h_dptr(root, bindex) == path.dentry) { - xino_itrunc->bindex = bindex; - break; - } - } - aufs_read_unlock(root, !AuLock_IR); - path_put(&path); - - if (unlikely(xino_itrunc->bindex < 0)) { - pr_err("no such branch %s\n", args[0].from); - err = -EINVAL; - } - -out: - return err; -} - -/* called without aufs lock */ -int au_opts_parse(struct super_block *sb, char *str, struct au_opts *opts) -{ - int err, n, token; - aufs_bindex_t bindex; - unsigned char skipped; - struct dentry *root; - struct au_opt *opt, *opt_tail; - char *opt_str; - /* reduce the stack space */ - union { - struct au_opt_xino_itrunc *xino_itrunc; - struct au_opt_wbr_create *create; - } u; - struct { - substring_t args[MAX_OPT_ARGS]; - } *a; - - err = -ENOMEM; - a = kmalloc(sizeof(*a), GFP_NOFS); - if (unlikely(!a)) - goto out; - - root = sb->s_root; - err = 0; - bindex = 0; - opt = opts->opt; - opt_tail = opt + opts->max_opt - 1; - opt->type = Opt_tail; - while (!err && (opt_str = strsep(&str, ",")) && *opt_str) { - err = -EINVAL; - skipped = 0; - token = match_token(opt_str, options, a->args); - switch (token) { - case Opt_br: - err = 0; - while (!err && (opt_str = strsep(&a->args[0].from, ":")) - && *opt_str) { - err = opt_add(opt, opt_str, opts->sb_flags, - bindex++); - if (unlikely(!err && ++opt > opt_tail)) { - err = -E2BIG; - break; - } - opt->type = Opt_tail; - skipped = 1; - } - break; - case Opt_add: - if (unlikely(match_int(&a->args[0], &n))) { - pr_err("bad integer in %s\n", opt_str); - break; - } - bindex = n; - err = opt_add(opt, a->args[1].from, opts->sb_flags, - bindex); - if (!err) - opt->type = token; - break; - case Opt_append: - err = opt_add(opt, a->args[0].from, opts->sb_flags, - /*dummy bindex*/1); - if (!err) - opt->type = token; - break; - case Opt_prepend: - err = opt_add(opt, a->args[0].from, opts->sb_flags, - /*bindex*/0); - if (!err) - opt->type = token; - break; - case Opt_del: - err = au_opts_parse_del(&opt->del, a->args); - if (!err) - opt->type = token; - break; -#if 0 /* reserved for future use */ - case Opt_idel: - del->pathname = "(indexed)"; - if (unlikely(match_int(&args[0], &n))) { - pr_err("bad integer in %s\n", opt_str); - break; - } - err = au_opts_parse_idel(sb, n, &opt->del, a->args); - if (!err) - opt->type = token; - break; -#endif - case Opt_mod: - err = au_opts_parse_mod(&opt->mod, a->args); - if (!err) - opt->type = token; - break; -#ifdef IMOD /* reserved for future use */ - case Opt_imod: - u.mod->path = "(indexed)"; - if (unlikely(match_int(&a->args[0], &n))) { - pr_err("bad integer in %s\n", opt_str); - break; - } - err = au_opts_parse_imod(sb, n, &opt->mod, a->args); - if (!err) - opt->type = token; - break; -#endif - case Opt_xino: - err = au_opts_parse_xino(sb, &opt->xino, a->args); - if (!err) - opt->type = token; - break; - - case Opt_trunc_xino_path: - err = au_opts_parse_xino_itrunc_path - (sb, &opt->xino_itrunc, a->args); - if (!err) - opt->type = token; - break; - - case Opt_itrunc_xino: - u.xino_itrunc = &opt->xino_itrunc; - if (unlikely(match_int(&a->args[0], &n))) { - pr_err("bad integer in %s\n", opt_str); - break; - } - u.xino_itrunc->bindex = n; - aufs_read_lock(root, AuLock_FLUSH); - if (n < 0 || au_sbend(sb) < n) { - pr_err("out of bounds, %d\n", n); - aufs_read_unlock(root, !AuLock_IR); - break; - } - aufs_read_unlock(root, !AuLock_IR); - err = 0; - opt->type = token; - break; - - case Opt_dirwh: - if (unlikely(match_int(&a->args[0], &opt->dirwh))) - break; - err = 0; - opt->type = token; - break; - - case Opt_rdcache: - if (unlikely(match_int(&a->args[0], &n))) { - pr_err("bad integer in %s\n", opt_str); - break; - } - if (unlikely(n > AUFS_RDCACHE_MAX)) { - pr_err("rdcache must be smaller than %d\n", - AUFS_RDCACHE_MAX); - break; - } - opt->rdcache = n; - err = 0; - opt->type = token; - break; - case Opt_rdblk: - if (unlikely(match_int(&a->args[0], &n) - || n < 0 - || n > KMALLOC_MAX_SIZE)) { - pr_err("bad integer in %s\n", opt_str); - break; - } - if (unlikely(n && n < NAME_MAX)) { - pr_err("rdblk must be larger than %d\n", - NAME_MAX); - break; - } - opt->rdblk = n; - err = 0; - opt->type = token; - break; - case Opt_rdhash: - if (unlikely(match_int(&a->args[0], &n) - || n < 0 - || n * sizeof(struct hlist_head) - > KMALLOC_MAX_SIZE)) { - pr_err("bad integer in %s\n", opt_str); - break; - } - opt->rdhash = n; - err = 0; - opt->type = token; - break; - - case Opt_trunc_xino: - case Opt_notrunc_xino: - case Opt_noxino: - case Opt_trunc_xib: - case Opt_notrunc_xib: - case Opt_shwh: - case Opt_noshwh: - case Opt_dirperm1: - case Opt_nodirperm1: - case Opt_plink: - case Opt_noplink: - case Opt_list_plink: - case Opt_dio: - case Opt_nodio: - case Opt_diropq_a: - case Opt_diropq_w: - case Opt_warn_perm: - case Opt_nowarn_perm: - case Opt_verbose: - case Opt_noverbose: - case Opt_sum: - case Opt_nosum: - case Opt_wsum: - case Opt_rdblk_def: - case Opt_rdhash_def: - case Opt_acl: - case Opt_noacl: - err = 0; - opt->type = token; - break; - - case Opt_udba: - opt->udba = udba_val(a->args[0].from); - if (opt->udba >= 0) { - err = 0; - opt->type = token; - } else - pr_err("wrong value, %s\n", opt_str); - break; - - case Opt_wbr_create: - u.create = &opt->wbr_create; - u.create->wbr_create - = au_wbr_create_val(a->args[0].from, u.create); - if (u.create->wbr_create >= 0) { - err = 0; - opt->type = token; - } else - pr_err("wrong value, %s\n", opt_str); - break; - case Opt_wbr_copyup: - opt->wbr_copyup = au_wbr_copyup_val(a->args[0].from); - if (opt->wbr_copyup >= 0) { - err = 0; - opt->type = token; - } else - pr_err("wrong value, %s\n", opt_str); - break; - - case Opt_fhsm_sec: - if (unlikely(match_int(&a->args[0], &n) - || n < 0)) { - pr_err("bad integer in %s\n", opt_str); - break; - } - if (sysaufs_brs) { - opt->fhsm_second = n; - opt->type = token; - } else - pr_warn("ignored %s\n", opt_str); - err = 0; - break; - - case Opt_ignore: - pr_warn("ignored %s\n", opt_str); - /*FALLTHROUGH*/ - case Opt_ignore_silent: - skipped = 1; - err = 0; - break; - case Opt_err: - pr_err("unknown option %s\n", opt_str); - break; - } - - if (!err && !skipped) { - if (unlikely(++opt > opt_tail)) { - err = -E2BIG; - opt--; - opt->type = Opt_tail; - break; - } - opt->type = Opt_tail; - } - } - - kfree(a); - dump_opts(opts); - if (unlikely(err)) - au_opts_free(opts); - -out: - return err; -} - -static int au_opt_wbr_create(struct super_block *sb, - struct au_opt_wbr_create *create) -{ - int err; - struct au_sbinfo *sbinfo; - - SiMustWriteLock(sb); - - err = 1; /* handled */ - sbinfo = au_sbi(sb); - if (sbinfo->si_wbr_create_ops->fin) { - err = sbinfo->si_wbr_create_ops->fin(sb); - if (!err) - err = 1; - } - - sbinfo->si_wbr_create = create->wbr_create; - sbinfo->si_wbr_create_ops = au_wbr_create_ops + create->wbr_create; - switch (create->wbr_create) { - case AuWbrCreate_MFSRRV: - case AuWbrCreate_MFSRR: - case AuWbrCreate_PMFSRR: - case AuWbrCreate_PMFSRRV: - sbinfo->si_wbr_mfs.mfsrr_watermark = create->mfsrr_watermark; - /*FALLTHROUGH*/ - case AuWbrCreate_MFS: - case AuWbrCreate_MFSV: - case AuWbrCreate_PMFS: - case AuWbrCreate_PMFSV: - sbinfo->si_wbr_mfs.mfs_expire - = msecs_to_jiffies(create->mfs_second * MSEC_PER_SEC); - break; - } - - if (sbinfo->si_wbr_create_ops->init) - sbinfo->si_wbr_create_ops->init(sb); /* ignore */ - - return err; -} - -/* - * returns, - * plus: processed without an error - * zero: unprocessed - */ -static int au_opt_simple(struct super_block *sb, struct au_opt *opt, - struct au_opts *opts) -{ - int err; - struct au_sbinfo *sbinfo; - - SiMustWriteLock(sb); - - err = 1; /* handled */ - sbinfo = au_sbi(sb); - switch (opt->type) { - case Opt_udba: - sbinfo->si_mntflags &= ~AuOptMask_UDBA; - sbinfo->si_mntflags |= opt->udba; - opts->given_udba |= opt->udba; - break; - - case Opt_plink: - au_opt_set(sbinfo->si_mntflags, PLINK); - break; - case Opt_noplink: - if (au_opt_test(sbinfo->si_mntflags, PLINK)) - au_plink_put(sb, /*verbose*/1); - au_opt_clr(sbinfo->si_mntflags, PLINK); - break; - case Opt_list_plink: - if (au_opt_test(sbinfo->si_mntflags, PLINK)) - au_plink_list(sb); - break; - - case Opt_dio: - au_opt_set(sbinfo->si_mntflags, DIO); - au_fset_opts(opts->flags, REFRESH_DYAOP); - break; - case Opt_nodio: - au_opt_clr(sbinfo->si_mntflags, DIO); - au_fset_opts(opts->flags, REFRESH_DYAOP); - break; - - case Opt_fhsm_sec: - au_fhsm_set(sbinfo, opt->fhsm_second); - break; - - case Opt_diropq_a: - au_opt_set(sbinfo->si_mntflags, ALWAYS_DIROPQ); - break; - case Opt_diropq_w: - au_opt_clr(sbinfo->si_mntflags, ALWAYS_DIROPQ); - break; - - case Opt_warn_perm: - au_opt_set(sbinfo->si_mntflags, WARN_PERM); - break; - case Opt_nowarn_perm: - au_opt_clr(sbinfo->si_mntflags, WARN_PERM); - break; - - case Opt_verbose: - au_opt_set(sbinfo->si_mntflags, VERBOSE); - break; - case Opt_noverbose: - au_opt_clr(sbinfo->si_mntflags, VERBOSE); - break; - - case Opt_sum: - au_opt_set(sbinfo->si_mntflags, SUM); - break; - case Opt_wsum: - au_opt_clr(sbinfo->si_mntflags, SUM); - au_opt_set(sbinfo->si_mntflags, SUM_W); - case Opt_nosum: - au_opt_clr(sbinfo->si_mntflags, SUM); - au_opt_clr(sbinfo->si_mntflags, SUM_W); - break; - - case Opt_wbr_create: - err = au_opt_wbr_create(sb, &opt->wbr_create); - break; - case Opt_wbr_copyup: - sbinfo->si_wbr_copyup = opt->wbr_copyup; - sbinfo->si_wbr_copyup_ops = au_wbr_copyup_ops + opt->wbr_copyup; - break; - - case Opt_dirwh: - sbinfo->si_dirwh = opt->dirwh; - break; - - case Opt_rdcache: - sbinfo->si_rdcache - = msecs_to_jiffies(opt->rdcache * MSEC_PER_SEC); - break; - case Opt_rdblk: - sbinfo->si_rdblk = opt->rdblk; - break; - case Opt_rdblk_def: - sbinfo->si_rdblk = AUFS_RDBLK_DEF; - break; - case Opt_rdhash: - sbinfo->si_rdhash = opt->rdhash; - break; - case Opt_rdhash_def: - sbinfo->si_rdhash = AUFS_RDHASH_DEF; - break; - - case Opt_shwh: - au_opt_set(sbinfo->si_mntflags, SHWH); - break; - case Opt_noshwh: - au_opt_clr(sbinfo->si_mntflags, SHWH); - break; - - case Opt_dirperm1: - au_opt_set(sbinfo->si_mntflags, DIRPERM1); - break; - case Opt_nodirperm1: - au_opt_clr(sbinfo->si_mntflags, DIRPERM1); - break; - - case Opt_trunc_xino: - au_opt_set(sbinfo->si_mntflags, TRUNC_XINO); - break; - case Opt_notrunc_xino: - au_opt_clr(sbinfo->si_mntflags, TRUNC_XINO); - break; - - case Opt_trunc_xino_path: - case Opt_itrunc_xino: - err = au_xino_trunc(sb, opt->xino_itrunc.bindex); - if (!err) - err = 1; - break; - - case Opt_trunc_xib: - au_fset_opts(opts->flags, TRUNC_XIB); - break; - case Opt_notrunc_xib: - au_fclr_opts(opts->flags, TRUNC_XIB); - break; - - case Opt_acl: - sb->s_flags |= MS_POSIXACL; - break; - case Opt_noacl: - sb->s_flags &= ~MS_POSIXACL; - break; - - default: - err = 0; - break; - } - - return err; -} - -/* - * returns tri-state. - * plus: processed without an error - * zero: unprocessed - * minus: error - */ -static int au_opt_br(struct super_block *sb, struct au_opt *opt, - struct au_opts *opts) -{ - int err, do_refresh; - - err = 0; - switch (opt->type) { - case Opt_append: - opt->add.bindex = au_sbend(sb) + 1; - if (opt->add.bindex < 0) - opt->add.bindex = 0; - goto add; - case Opt_prepend: - opt->add.bindex = 0; - add: /* indented label */ - case Opt_add: - err = au_br_add(sb, &opt->add, - au_ftest_opts(opts->flags, REMOUNT)); - if (!err) { - err = 1; - au_fset_opts(opts->flags, REFRESH); - } - break; - - case Opt_del: - case Opt_idel: - err = au_br_del(sb, &opt->del, - au_ftest_opts(opts->flags, REMOUNT)); - if (!err) { - err = 1; - au_fset_opts(opts->flags, TRUNC_XIB); - au_fset_opts(opts->flags, REFRESH); - } - break; - - case Opt_mod: - case Opt_imod: - err = au_br_mod(sb, &opt->mod, - au_ftest_opts(opts->flags, REMOUNT), - &do_refresh); - if (!err) { - err = 1; - if (do_refresh) - au_fset_opts(opts->flags, REFRESH); - } - break; - } - - return err; -} - -static int au_opt_xino(struct super_block *sb, struct au_opt *opt, - struct au_opt_xino **opt_xino, - struct au_opts *opts) -{ - int err; - aufs_bindex_t bend, bindex; - struct dentry *root, *parent, *h_root; - - err = 0; - switch (opt->type) { - case Opt_xino: - err = au_xino_set(sb, &opt->xino, - !!au_ftest_opts(opts->flags, REMOUNT)); - if (unlikely(err)) - break; - - *opt_xino = &opt->xino; - au_xino_brid_set(sb, -1); - - /* safe d_parent access */ - parent = opt->xino.file->f_path.dentry->d_parent; - root = sb->s_root; - bend = au_sbend(sb); - for (bindex = 0; bindex <= bend; bindex++) { - h_root = au_h_dptr(root, bindex); - if (h_root == parent) { - au_xino_brid_set(sb, au_sbr_id(sb, bindex)); - break; - } - } - break; - - case Opt_noxino: - au_xino_clr(sb); - au_xino_brid_set(sb, -1); - *opt_xino = (void *)-1; - break; - } - - return err; -} - -int au_opts_verify(struct super_block *sb, unsigned long sb_flags, - unsigned int pending) -{ - int err, fhsm; - aufs_bindex_t bindex, bend; - unsigned char do_plink, skip, do_free; - struct au_branch *br; - struct au_wbr *wbr; - struct dentry *root; - struct inode *dir, *h_dir; - struct au_sbinfo *sbinfo; - struct au_hinode *hdir; - - SiMustAnyLock(sb); - - sbinfo = au_sbi(sb); - AuDebugOn(!(sbinfo->si_mntflags & AuOptMask_UDBA)); - - if (!(sb_flags & MS_RDONLY)) { - if (unlikely(!au_br_writable(au_sbr_perm(sb, 0)))) - pr_warn("first branch should be rw\n"); - if (unlikely(au_opt_test(sbinfo->si_mntflags, SHWH))) - pr_warn("shwh should be used with ro\n"); - } - - if (au_opt_test((sbinfo->si_mntflags | pending), UDBA_HNOTIFY) - && !au_opt_test(sbinfo->si_mntflags, XINO)) - pr_warn("udba=*notify requires xino\n"); - - if (au_opt_test(sbinfo->si_mntflags, DIRPERM1)) - pr_warn("dirperm1 breaks the protection" - " by the permission bits on the lower branch\n"); - - err = 0; - fhsm = 0; - root = sb->s_root; - dir = d_inode(root); - do_plink = !!au_opt_test(sbinfo->si_mntflags, PLINK); - bend = au_sbend(sb); - for (bindex = 0; !err && bindex <= bend; bindex++) { - skip = 0; - h_dir = au_h_iptr(dir, bindex); - br = au_sbr(sb, bindex); - - if ((br->br_perm & AuBrAttr_ICEX) - && !h_dir->i_op->listxattr) - br->br_perm &= ~AuBrAttr_ICEX; -#if 0 - if ((br->br_perm & AuBrAttr_ICEX_SEC) - && (au_br_sb(br)->s_flags & MS_NOSEC)) - br->br_perm &= ~AuBrAttr_ICEX_SEC; -#endif - - do_free = 0; - wbr = br->br_wbr; - if (wbr) - wbr_wh_read_lock(wbr); - - if (!au_br_writable(br->br_perm)) { - do_free = !!wbr; - skip = (!wbr - || (!wbr->wbr_whbase - && !wbr->wbr_plink - && !wbr->wbr_orph)); - } else if (!au_br_wh_linkable(br->br_perm)) { - /* skip = (!br->br_whbase && !br->br_orph); */ - skip = (!wbr || !wbr->wbr_whbase); - if (skip && wbr) { - if (do_plink) - skip = !!wbr->wbr_plink; - else - skip = !wbr->wbr_plink; - } - } else { - /* skip = (br->br_whbase && br->br_ohph); */ - skip = (wbr && wbr->wbr_whbase); - if (skip) { - if (do_plink) - skip = !!wbr->wbr_plink; - else - skip = !wbr->wbr_plink; - } - } - if (wbr) - wbr_wh_read_unlock(wbr); - - if (au_br_fhsm(br->br_perm)) { - fhsm++; - AuDebugOn(!br->br_fhsm); - } - - if (skip) - continue; - - hdir = au_hi(dir, bindex); - au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT); - if (wbr) - wbr_wh_write_lock(wbr); - err = au_wh_init(br, sb); - if (wbr) - wbr_wh_write_unlock(wbr); - au_hn_imtx_unlock(hdir); - - if (!err && do_free) { - kfree(wbr); - br->br_wbr = NULL; - } - } - - if (fhsm >= 2) { - au_fset_si(sbinfo, FHSM); - for (bindex = bend; bindex >= 0; bindex--) { - br = au_sbr(sb, bindex); - if (au_br_fhsm(br->br_perm)) { - au_fhsm_set_bottom(sb, bindex); - break; - } - } - } else { - au_fclr_si(sbinfo, FHSM); - au_fhsm_set_bottom(sb, -1); - } - - return err; -} - -int au_opts_mount(struct super_block *sb, struct au_opts *opts) -{ - int err; - unsigned int tmp; - aufs_bindex_t bindex, bend; - struct au_opt *opt; - struct au_opt_xino *opt_xino, xino; - struct au_sbinfo *sbinfo; - struct au_branch *br; - struct inode *dir; - - SiMustWriteLock(sb); - - err = 0; - opt_xino = NULL; - opt = opts->opt; - while (err >= 0 && opt->type != Opt_tail) - err = au_opt_simple(sb, opt++, opts); - if (err > 0) - err = 0; - else if (unlikely(err < 0)) - goto out; - - /* disable xino and udba temporary */ - sbinfo = au_sbi(sb); - tmp = sbinfo->si_mntflags; - au_opt_clr(sbinfo->si_mntflags, XINO); - au_opt_set_udba(sbinfo->si_mntflags, UDBA_REVAL); - - opt = opts->opt; - while (err >= 0 && opt->type != Opt_tail) - err = au_opt_br(sb, opt++, opts); - if (err > 0) - err = 0; - else if (unlikely(err < 0)) - goto out; - - bend = au_sbend(sb); - if (unlikely(bend < 0)) { - err = -EINVAL; - pr_err("no branches\n"); - goto out; - } - - if (au_opt_test(tmp, XINO)) - au_opt_set(sbinfo->si_mntflags, XINO); - opt = opts->opt; - while (!err && opt->type != Opt_tail) - err = au_opt_xino(sb, opt++, &opt_xino, opts); - if (unlikely(err)) - goto out; - - err = au_opts_verify(sb, sb->s_flags, tmp); - if (unlikely(err)) - goto out; - - /* restore xino */ - if (au_opt_test(tmp, XINO) && !opt_xino) { - xino.file = au_xino_def(sb); - err = PTR_ERR(xino.file); - if (IS_ERR(xino.file)) - goto out; - - err = au_xino_set(sb, &xino, /*remount*/0); - fput(xino.file); - if (unlikely(err)) - goto out; - } - - /* restore udba */ - tmp &= AuOptMask_UDBA; - sbinfo->si_mntflags &= ~AuOptMask_UDBA; - sbinfo->si_mntflags |= tmp; - bend = au_sbend(sb); - for (bindex = 0; bindex <= bend; bindex++) { - br = au_sbr(sb, bindex); - err = au_hnotify_reset_br(tmp, br, br->br_perm); - if (unlikely(err)) - AuIOErr("hnotify failed on br %d, %d, ignored\n", - bindex, err); - /* go on even if err */ - } - if (au_opt_test(tmp, UDBA_HNOTIFY)) { - dir = d_inode(sb->s_root); - au_hn_reset(dir, au_hi_flags(dir, /*isdir*/1) & ~AuHi_XINO); - } - -out: - return err; -} - -int au_opts_remount(struct super_block *sb, struct au_opts *opts) -{ - int err, rerr; - struct inode *dir; - struct au_opt_xino *opt_xino; - struct au_opt *opt; - struct au_sbinfo *sbinfo; - - SiMustWriteLock(sb); - - dir = d_inode(sb->s_root); - sbinfo = au_sbi(sb); - err = 0; - opt_xino = NULL; - opt = opts->opt; - while (err >= 0 && opt->type != Opt_tail) { - err = au_opt_simple(sb, opt, opts); - if (!err) - err = au_opt_br(sb, opt, opts); - if (!err) - err = au_opt_xino(sb, opt, &opt_xino, opts); - opt++; - } - if (err > 0) - err = 0; - AuTraceErr(err); - /* go on even err */ - - rerr = au_opts_verify(sb, opts->sb_flags, /*pending*/0); - if (unlikely(rerr && !err)) - err = rerr; - - if (au_ftest_opts(opts->flags, TRUNC_XIB)) { - rerr = au_xib_trunc(sb); - if (unlikely(rerr && !err)) - err = rerr; - } - - /* will be handled by the caller */ - if (!au_ftest_opts(opts->flags, REFRESH) - && (opts->given_udba || au_opt_test(sbinfo->si_mntflags, XINO))) - au_fset_opts(opts->flags, REFRESH); - - AuDbg("status 0x%x\n", opts->flags); - return err; -} - -/* ---------------------------------------------------------------------- */ - -unsigned int au_opt_udba(struct super_block *sb) -{ - return au_mntflags(sb) & AuOptMask_UDBA; -} diff --git a/fs/aufs/opts.h b/fs/aufs/opts.h deleted file mode 100644 index b18a4aa00..000000000 --- a/fs/aufs/opts.h +++ /dev/null @@ -1,210 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * mount options/flags - */ - -#ifndef __AUFS_OPTS_H__ -#define __AUFS_OPTS_H__ - -#ifdef __KERNEL__ - -#include <linux/path.h> - -struct file; -struct super_block; - -/* ---------------------------------------------------------------------- */ - -/* mount flags */ -#define AuOpt_XINO 1 /* external inode number bitmap - and translation table */ -#define AuOpt_TRUNC_XINO (1 << 1) /* truncate xino files */ -#define AuOpt_UDBA_NONE (1 << 2) /* users direct branch access */ -#define AuOpt_UDBA_REVAL (1 << 3) -#define AuOpt_UDBA_HNOTIFY (1 << 4) -#define AuOpt_SHWH (1 << 5) /* show whiteout */ -#define AuOpt_PLINK (1 << 6) /* pseudo-link */ -#define AuOpt_DIRPERM1 (1 << 7) /* ignore the lower dir's perm - bits */ -#define AuOpt_ALWAYS_DIROPQ (1 << 9) /* policy to creating diropq */ -#define AuOpt_SUM (1 << 10) /* summation for statfs(2) */ -#define AuOpt_SUM_W (1 << 11) /* unimplemented */ -#define AuOpt_WARN_PERM (1 << 12) /* warn when add-branch */ -#define AuOpt_VERBOSE (1 << 13) /* busy inode when del-branch */ -#define AuOpt_DIO (1 << 14) /* direct io */ - -#ifndef CONFIG_AUFS_HNOTIFY -#undef AuOpt_UDBA_HNOTIFY -#define AuOpt_UDBA_HNOTIFY 0 -#endif -#ifndef CONFIG_AUFS_SHWH -#undef AuOpt_SHWH -#define AuOpt_SHWH 0 -#endif - -#define AuOpt_Def (AuOpt_XINO \ - | AuOpt_UDBA_REVAL \ - | AuOpt_PLINK \ - /* | AuOpt_DIRPERM1 */ \ - | AuOpt_WARN_PERM) -#define AuOptMask_UDBA (AuOpt_UDBA_NONE \ - | AuOpt_UDBA_REVAL \ - | AuOpt_UDBA_HNOTIFY) - -#define au_opt_test(flags, name) (flags & AuOpt_##name) -#define au_opt_set(flags, name) do { \ - BUILD_BUG_ON(AuOpt_##name & AuOptMask_UDBA); \ - ((flags) |= AuOpt_##name); \ -} while (0) -#define au_opt_set_udba(flags, name) do { \ - (flags) &= ~AuOptMask_UDBA; \ - ((flags) |= AuOpt_##name); \ -} while (0) -#define au_opt_clr(flags, name) do { \ - ((flags) &= ~AuOpt_##name); \ -} while (0) - -static inline unsigned int au_opts_plink(unsigned int mntflags) -{ -#ifdef CONFIG_PROC_FS - return mntflags; -#else - return mntflags & ~AuOpt_PLINK; -#endif -} - -/* ---------------------------------------------------------------------- */ - -/* policies to select one among multiple writable branches */ -enum { - AuWbrCreate_TDP, /* top down parent */ - AuWbrCreate_RR, /* round robin */ - AuWbrCreate_MFS, /* most free space */ - AuWbrCreate_MFSV, /* mfs with seconds */ - AuWbrCreate_MFSRR, /* mfs then rr */ - AuWbrCreate_MFSRRV, /* mfs then rr with seconds */ - AuWbrCreate_PMFS, /* parent and mfs */ - AuWbrCreate_PMFSV, /* parent and mfs with seconds */ - AuWbrCreate_PMFSRR, /* parent, mfs and round-robin */ - AuWbrCreate_PMFSRRV, /* plus seconds */ - - AuWbrCreate_Def = AuWbrCreate_TDP -}; - -enum { - AuWbrCopyup_TDP, /* top down parent */ - AuWbrCopyup_BUP, /* bottom up parent */ - AuWbrCopyup_BU, /* bottom up */ - - AuWbrCopyup_Def = AuWbrCopyup_TDP -}; - -/* ---------------------------------------------------------------------- */ - -struct au_opt_add { - aufs_bindex_t bindex; - char *pathname; - int perm; - struct path path; -}; - -struct au_opt_del { - char *pathname; - struct path h_path; -}; - -struct au_opt_mod { - char *path; - int perm; - struct dentry *h_root; -}; - -struct au_opt_xino { - char *path; - struct file *file; -}; - -struct au_opt_xino_itrunc { - aufs_bindex_t bindex; -}; - -struct au_opt_wbr_create { - int wbr_create; - int mfs_second; - unsigned long long mfsrr_watermark; -}; - -struct au_opt { - int type; - union { - struct au_opt_xino xino; - struct au_opt_xino_itrunc xino_itrunc; - struct au_opt_add add; - struct au_opt_del del; - struct au_opt_mod mod; - int dirwh; - int rdcache; - unsigned int rdblk; - unsigned int rdhash; - int udba; - struct au_opt_wbr_create wbr_create; - int wbr_copyup; - unsigned int fhsm_second; - }; -}; - -/* opts flags */ -#define AuOpts_REMOUNT 1 -#define AuOpts_REFRESH (1 << 1) -#define AuOpts_TRUNC_XIB (1 << 2) -#define AuOpts_REFRESH_DYAOP (1 << 3) -#define au_ftest_opts(flags, name) ((flags) & AuOpts_##name) -#define au_fset_opts(flags, name) \ - do { (flags) |= AuOpts_##name; } while (0) -#define au_fclr_opts(flags, name) \ - do { (flags) &= ~AuOpts_##name; } while (0) - -struct au_opts { - struct au_opt *opt; - int max_opt; - - unsigned int given_udba; - unsigned int flags; - unsigned long sb_flags; -}; - -/* ---------------------------------------------------------------------- */ - -/* opts.c */ -void au_optstr_br_perm(au_br_perm_str_t *str, int perm); -const char *au_optstr_udba(int udba); -const char *au_optstr_wbr_copyup(int wbr_copyup); -const char *au_optstr_wbr_create(int wbr_create); - -void au_opts_free(struct au_opts *opts); -int au_opts_parse(struct super_block *sb, char *str, struct au_opts *opts); -int au_opts_verify(struct super_block *sb, unsigned long sb_flags, - unsigned int pending); -int au_opts_mount(struct super_block *sb, struct au_opts *opts); -int au_opts_remount(struct super_block *sb, struct au_opts *opts); - -unsigned int au_opt_udba(struct super_block *sb); - -#endif /* __KERNEL__ */ -#endif /* __AUFS_OPTS_H__ */ diff --git a/fs/aufs/plink.c b/fs/aufs/plink.c deleted file mode 100644 index 988fba940..000000000 --- a/fs/aufs/plink.c +++ /dev/null @@ -1,528 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * pseudo-link - */ - -#include "aufs.h" - -/* - * the pseudo-link maintenance mode. - * during a user process maintains the pseudo-links, - * prohibit adding a new plink and branch manipulation. - * - * Flags - * NOPLM: - * For entry functions which will handle plink, and i_mutex is already held - * in VFS. - * They cannot wait and should return an error at once. - * Callers has to check the error. - * NOPLMW: - * For entry functions which will handle plink, but i_mutex is not held - * in VFS. - * They can wait the plink maintenance mode to finish. - * - * They behave like F_SETLK and F_SETLKW. - * If the caller never handle plink, then both flags are unnecessary. - */ - -int au_plink_maint(struct super_block *sb, int flags) -{ - int err; - pid_t pid, ppid; - struct au_sbinfo *sbi; - - SiMustAnyLock(sb); - - err = 0; - if (!au_opt_test(au_mntflags(sb), PLINK)) - goto out; - - sbi = au_sbi(sb); - pid = sbi->si_plink_maint_pid; - if (!pid || pid == current->pid) - goto out; - - /* todo: it highly depends upon /sbin/mount.aufs */ - rcu_read_lock(); - ppid = task_pid_vnr(rcu_dereference(current->real_parent)); - rcu_read_unlock(); - if (pid == ppid) - goto out; - - if (au_ftest_lock(flags, NOPLMW)) { - /* if there is no i_mutex lock in VFS, we don't need to wait */ - /* AuDebugOn(!lockdep_depth(current)); */ - while (sbi->si_plink_maint_pid) { - si_read_unlock(sb); - /* gave up wake_up_bit() */ - wait_event(sbi->si_plink_wq, !sbi->si_plink_maint_pid); - - if (au_ftest_lock(flags, FLUSH)) - au_nwt_flush(&sbi->si_nowait); - si_noflush_read_lock(sb); - } - } else if (au_ftest_lock(flags, NOPLM)) { - AuDbg("ppid %d, pid %d\n", ppid, pid); - err = -EAGAIN; - } - -out: - return err; -} - -void au_plink_maint_leave(struct au_sbinfo *sbinfo) -{ - spin_lock(&sbinfo->si_plink_maint_lock); - sbinfo->si_plink_maint_pid = 0; - spin_unlock(&sbinfo->si_plink_maint_lock); - wake_up_all(&sbinfo->si_plink_wq); -} - -int au_plink_maint_enter(struct super_block *sb) -{ - int err; - struct au_sbinfo *sbinfo; - - err = 0; - sbinfo = au_sbi(sb); - /* make sure i am the only one in this fs */ - si_write_lock(sb, AuLock_FLUSH); - if (au_opt_test(au_mntflags(sb), PLINK)) { - spin_lock(&sbinfo->si_plink_maint_lock); - if (!sbinfo->si_plink_maint_pid) - sbinfo->si_plink_maint_pid = current->pid; - else - err = -EBUSY; - spin_unlock(&sbinfo->si_plink_maint_lock); - } - si_write_unlock(sb); - - return err; -} - -/* ---------------------------------------------------------------------- */ - -#ifdef CONFIG_AUFS_DEBUG -void au_plink_list(struct super_block *sb) -{ - int i; - struct au_sbinfo *sbinfo; - struct hlist_head *plink_hlist; - struct pseudo_link *plink; - - SiMustAnyLock(sb); - - sbinfo = au_sbi(sb); - AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK)); - AuDebugOn(au_plink_maint(sb, AuLock_NOPLM)); - - for (i = 0; i < AuPlink_NHASH; i++) { - plink_hlist = &sbinfo->si_plink[i].head; - rcu_read_lock(); - hlist_for_each_entry_rcu(plink, plink_hlist, hlist) - AuDbg("%lu\n", plink->inode->i_ino); - rcu_read_unlock(); - } -} -#endif - -/* is the inode pseudo-linked? */ -int au_plink_test(struct inode *inode) -{ - int found, i; - struct au_sbinfo *sbinfo; - struct hlist_head *plink_hlist; - struct pseudo_link *plink; - - sbinfo = au_sbi(inode->i_sb); - AuRwMustAnyLock(&sbinfo->si_rwsem); - AuDebugOn(!au_opt_test(au_mntflags(inode->i_sb), PLINK)); - AuDebugOn(au_plink_maint(inode->i_sb, AuLock_NOPLM)); - - found = 0; - i = au_plink_hash(inode->i_ino); - plink_hlist = &sbinfo->si_plink[i].head; - rcu_read_lock(); - hlist_for_each_entry_rcu(plink, plink_hlist, hlist) - if (plink->inode == inode) { - found = 1; - break; - } - rcu_read_unlock(); - return found; -} - -/* ---------------------------------------------------------------------- */ - -/* - * generate a name for plink. - * the file will be stored under AUFS_WH_PLINKDIR. - */ -/* 20 is max digits length of ulong 64 */ -#define PLINK_NAME_LEN ((20 + 1) * 2) - -static int plink_name(char *name, int len, struct inode *inode, - aufs_bindex_t bindex) -{ - int rlen; - struct inode *h_inode; - - h_inode = au_h_iptr(inode, bindex); - rlen = snprintf(name, len, "%lu.%lu", inode->i_ino, h_inode->i_ino); - return rlen; -} - -struct au_do_plink_lkup_args { - struct dentry **errp; - struct qstr *tgtname; - struct dentry *h_parent; - struct au_branch *br; -}; - -static struct dentry *au_do_plink_lkup(struct qstr *tgtname, - struct dentry *h_parent, - struct au_branch *br) -{ - struct dentry *h_dentry; - struct mutex *h_mtx; - - h_mtx = &d_inode(h_parent)->i_mutex; - mutex_lock_nested(h_mtx, AuLsc_I_CHILD2); - h_dentry = vfsub_lkup_one(tgtname, h_parent); - mutex_unlock(h_mtx); - return h_dentry; -} - -static void au_call_do_plink_lkup(void *args) -{ - struct au_do_plink_lkup_args *a = args; - *a->errp = au_do_plink_lkup(a->tgtname, a->h_parent, a->br); -} - -/* lookup the plink-ed @inode under the branch at @bindex */ -struct dentry *au_plink_lkup(struct inode *inode, aufs_bindex_t bindex) -{ - struct dentry *h_dentry, *h_parent; - struct au_branch *br; - int wkq_err; - char a[PLINK_NAME_LEN]; - struct qstr tgtname = QSTR_INIT(a, 0); - - AuDebugOn(au_plink_maint(inode->i_sb, AuLock_NOPLM)); - - br = au_sbr(inode->i_sb, bindex); - h_parent = br->br_wbr->wbr_plink; - tgtname.len = plink_name(a, sizeof(a), inode, bindex); - - if (!uid_eq(current_fsuid(), GLOBAL_ROOT_UID)) { - struct au_do_plink_lkup_args args = { - .errp = &h_dentry, - .tgtname = &tgtname, - .h_parent = h_parent, - .br = br - }; - - wkq_err = au_wkq_wait(au_call_do_plink_lkup, &args); - if (unlikely(wkq_err)) - h_dentry = ERR_PTR(wkq_err); - } else - h_dentry = au_do_plink_lkup(&tgtname, h_parent, br); - - return h_dentry; -} - -/* create a pseudo-link */ -static int do_whplink(struct qstr *tgt, struct dentry *h_parent, - struct dentry *h_dentry, struct au_branch *br) -{ - int err; - struct path h_path = { - .mnt = au_br_mnt(br) - }; - struct inode *h_dir, *delegated; - - h_dir = d_inode(h_parent); - mutex_lock_nested(&h_dir->i_mutex, AuLsc_I_CHILD2); -again: - h_path.dentry = vfsub_lkup_one(tgt, h_parent); - err = PTR_ERR(h_path.dentry); - if (IS_ERR(h_path.dentry)) - goto out; - - err = 0; - /* wh.plink dir is not monitored */ - /* todo: is it really safe? */ - if (d_is_positive(h_path.dentry) - && d_inode(h_path.dentry) != d_inode(h_dentry)) { - delegated = NULL; - err = vfsub_unlink(h_dir, &h_path, &delegated, /*force*/0); - if (unlikely(err == -EWOULDBLOCK)) { - pr_warn("cannot retry for NFSv4 delegation" - " for an internal unlink\n"); - iput(delegated); - } - dput(h_path.dentry); - h_path.dentry = NULL; - if (!err) - goto again; - } - if (!err && d_is_negative(h_path.dentry)) { - delegated = NULL; - err = vfsub_link(h_dentry, h_dir, &h_path, &delegated); - if (unlikely(err == -EWOULDBLOCK)) { - pr_warn("cannot retry for NFSv4 delegation" - " for an internal link\n"); - iput(delegated); - } - } - dput(h_path.dentry); - -out: - mutex_unlock(&h_dir->i_mutex); - return err; -} - -struct do_whplink_args { - int *errp; - struct qstr *tgt; - struct dentry *h_parent; - struct dentry *h_dentry; - struct au_branch *br; -}; - -static void call_do_whplink(void *args) -{ - struct do_whplink_args *a = args; - *a->errp = do_whplink(a->tgt, a->h_parent, a->h_dentry, a->br); -} - -static int whplink(struct dentry *h_dentry, struct inode *inode, - aufs_bindex_t bindex, struct au_branch *br) -{ - int err, wkq_err; - struct au_wbr *wbr; - struct dentry *h_parent; - char a[PLINK_NAME_LEN]; - struct qstr tgtname = QSTR_INIT(a, 0); - - wbr = au_sbr(inode->i_sb, bindex)->br_wbr; - h_parent = wbr->wbr_plink; - tgtname.len = plink_name(a, sizeof(a), inode, bindex); - - /* always superio. */ - if (!uid_eq(current_fsuid(), GLOBAL_ROOT_UID)) { - struct do_whplink_args args = { - .errp = &err, - .tgt = &tgtname, - .h_parent = h_parent, - .h_dentry = h_dentry, - .br = br - }; - wkq_err = au_wkq_wait(call_do_whplink, &args); - if (unlikely(wkq_err)) - err = wkq_err; - } else - err = do_whplink(&tgtname, h_parent, h_dentry, br); - - return err; -} - -/* free a single plink */ -static void do_put_plink(struct pseudo_link *plink, int do_del) -{ - if (do_del) - hlist_del(&plink->hlist); - iput(plink->inode); - kfree(plink); -} - -static void do_put_plink_rcu(struct rcu_head *rcu) -{ - struct pseudo_link *plink; - - plink = container_of(rcu, struct pseudo_link, rcu); - iput(plink->inode); - kfree(plink); -} - -/* - * create a new pseudo-link for @h_dentry on @bindex. - * the linked inode is held in aufs @inode. - */ -void au_plink_append(struct inode *inode, aufs_bindex_t bindex, - struct dentry *h_dentry) -{ - struct super_block *sb; - struct au_sbinfo *sbinfo; - struct hlist_head *plink_hlist; - struct pseudo_link *plink, *tmp; - struct au_sphlhead *sphl; - int found, err, cnt, i; - - sb = inode->i_sb; - sbinfo = au_sbi(sb); - AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK)); - AuDebugOn(au_plink_maint(sb, AuLock_NOPLM)); - - found = au_plink_test(inode); - if (found) - return; - - i = au_plink_hash(inode->i_ino); - sphl = sbinfo->si_plink + i; - plink_hlist = &sphl->head; - tmp = kmalloc(sizeof(*plink), GFP_NOFS); - if (tmp) - tmp->inode = au_igrab(inode); - else { - err = -ENOMEM; - goto out; - } - - spin_lock(&sphl->spin); - hlist_for_each_entry(plink, plink_hlist, hlist) { - if (plink->inode == inode) { - found = 1; - break; - } - } - if (!found) - hlist_add_head_rcu(&tmp->hlist, plink_hlist); - spin_unlock(&sphl->spin); - if (!found) { - cnt = au_sphl_count(sphl); -#define msg "unexpectedly unblanced or too many pseudo-links" - if (cnt > AUFS_PLINK_WARN) - AuWarn1(msg ", %d\n", cnt); -#undef msg - err = whplink(h_dentry, inode, bindex, au_sbr(sb, bindex)); - } else { - do_put_plink(tmp, 0); - return; - } - -out: - if (unlikely(err)) { - pr_warn("err %d, damaged pseudo link.\n", err); - if (tmp) { - au_sphl_del_rcu(&tmp->hlist, sphl); - call_rcu(&tmp->rcu, do_put_plink_rcu); - } - } -} - -/* free all plinks */ -void au_plink_put(struct super_block *sb, int verbose) -{ - int i, warned; - struct au_sbinfo *sbinfo; - struct hlist_head *plink_hlist; - struct hlist_node *tmp; - struct pseudo_link *plink; - - SiMustWriteLock(sb); - - sbinfo = au_sbi(sb); - AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK)); - AuDebugOn(au_plink_maint(sb, AuLock_NOPLM)); - - /* no spin_lock since sbinfo is write-locked */ - warned = 0; - for (i = 0; i < AuPlink_NHASH; i++) { - plink_hlist = &sbinfo->si_plink[i].head; - if (!warned && verbose && !hlist_empty(plink_hlist)) { - pr_warn("pseudo-link is not flushed"); - warned = 1; - } - hlist_for_each_entry_safe(plink, tmp, plink_hlist, hlist) - do_put_plink(plink, 0); - INIT_HLIST_HEAD(plink_hlist); - } -} - -void au_plink_clean(struct super_block *sb, int verbose) -{ - struct dentry *root; - - root = sb->s_root; - aufs_write_lock(root); - if (au_opt_test(au_mntflags(sb), PLINK)) - au_plink_put(sb, verbose); - aufs_write_unlock(root); -} - -static int au_plink_do_half_refresh(struct inode *inode, aufs_bindex_t br_id) -{ - int do_put; - aufs_bindex_t bstart, bend, bindex; - - do_put = 0; - bstart = au_ibstart(inode); - bend = au_ibend(inode); - if (bstart >= 0) { - for (bindex = bstart; bindex <= bend; bindex++) { - if (!au_h_iptr(inode, bindex) - || au_ii_br_id(inode, bindex) != br_id) - continue; - au_set_h_iptr(inode, bindex, NULL, 0); - do_put = 1; - break; - } - if (do_put) - for (bindex = bstart; bindex <= bend; bindex++) - if (au_h_iptr(inode, bindex)) { - do_put = 0; - break; - } - } else - do_put = 1; - - return do_put; -} - -/* free the plinks on a branch specified by @br_id */ -void au_plink_half_refresh(struct super_block *sb, aufs_bindex_t br_id) -{ - struct au_sbinfo *sbinfo; - struct hlist_head *plink_hlist; - struct hlist_node *tmp; - struct pseudo_link *plink; - struct inode *inode; - int i, do_put; - - SiMustWriteLock(sb); - - sbinfo = au_sbi(sb); - AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK)); - AuDebugOn(au_plink_maint(sb, AuLock_NOPLM)); - - /* no spin_lock since sbinfo is write-locked */ - for (i = 0; i < AuPlink_NHASH; i++) { - plink_hlist = &sbinfo->si_plink[i].head; - hlist_for_each_entry_safe(plink, tmp, plink_hlist, hlist) { - inode = au_igrab(plink->inode); - ii_write_lock_child(inode); - do_put = au_plink_do_half_refresh(inode, br_id); - if (do_put) - do_put_plink(plink, 1); - ii_write_unlock(inode); - iput(inode); - } - } -} diff --git a/fs/aufs/poll.c b/fs/aufs/poll.c deleted file mode 100644 index 262a792fd..000000000 --- a/fs/aufs/poll.c +++ /dev/null @@ -1,52 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * poll operation - * There is only one filesystem which implements ->poll operation, currently. - */ - -#include "aufs.h" - -unsigned int aufs_poll(struct file *file, poll_table *wait) -{ - unsigned int mask; - int err; - struct file *h_file; - struct super_block *sb; - - /* We should pretend an error happened. */ - mask = POLLERR /* | POLLIN | POLLOUT */; - sb = file->f_path.dentry->d_sb; - si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW); - - h_file = au_read_pre(file, /*keep_fi*/0); - err = PTR_ERR(h_file); - if (IS_ERR(h_file)) - goto out; - - /* it is not an error if h_file has no operation */ - mask = DEFAULT_POLLMASK; - if (h_file->f_op->poll) - mask = h_file->f_op->poll(h_file, wait); - fput(h_file); /* instead of au_read_post() */ - -out: - si_read_unlock(sb); - AuTraceErr((int)mask); - return mask; -} diff --git a/fs/aufs/posix_acl.c b/fs/aufs/posix_acl.c deleted file mode 100644 index d5266ef76..000000000 --- a/fs/aufs/posix_acl.c +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright (C) 2014-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * posix acl operations - */ - -#include <linux/fs.h> -#include <linux/posix_acl.h> -#include "aufs.h" - -struct posix_acl *aufs_get_acl(struct inode *inode, int type) -{ - struct posix_acl *acl; - int err; - aufs_bindex_t bindex; - struct inode *h_inode; - struct super_block *sb; - - acl = NULL; - sb = inode->i_sb; - si_read_lock(sb, AuLock_FLUSH); - ii_read_lock_child(inode); - if (!(sb->s_flags & MS_POSIXACL)) - goto out; - - bindex = au_ibstart(inode); - h_inode = au_h_iptr(inode, bindex); - if (unlikely(!h_inode - || ((h_inode->i_mode & S_IFMT) - != (inode->i_mode & S_IFMT)))) { - err = au_busy_or_stale(); - acl = ERR_PTR(err); - goto out; - } - - /* always topmost only */ - acl = get_acl(h_inode, type); - -out: - ii_read_unlock(inode); - si_read_unlock(sb); - - AuTraceErrPtr(acl); - return acl; -} - -int aufs_set_acl(struct inode *inode, struct posix_acl *acl, int type) -{ - int err; - ssize_t ssz; - struct dentry *dentry; - struct au_srxattr arg = { - .type = AU_ACL_SET, - .u.acl_set = { - .acl = acl, - .type = type - }, - }; - - mutex_lock(&inode->i_mutex); - if (inode->i_ino == AUFS_ROOT_INO) - dentry = dget(inode->i_sb->s_root); - else { - dentry = d_find_alias(inode); - if (!dentry) - dentry = d_find_any_alias(inode); - if (!dentry) { - pr_warn("cannot handle this inode, " - "please report to aufs-users ML\n"); - err = -ENOENT; - goto out; - } - } - - ssz = au_srxattr(dentry, &arg); - dput(dentry); - err = ssz; - if (ssz >= 0) - err = 0; - -out: - mutex_unlock(&inode->i_mutex); - return err; -} diff --git a/fs/aufs/procfs.c b/fs/aufs/procfs.c deleted file mode 100644 index 772be91a9..000000000 --- a/fs/aufs/procfs.c +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Copyright (C) 2010-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * procfs interfaces - */ - -#include <linux/proc_fs.h> -#include "aufs.h" - -static int au_procfs_plm_release(struct inode *inode, struct file *file) -{ - struct au_sbinfo *sbinfo; - - sbinfo = file->private_data; - if (sbinfo) { - au_plink_maint_leave(sbinfo); - kobject_put(&sbinfo->si_kobj); - } - - return 0; -} - -static void au_procfs_plm_write_clean(struct file *file) -{ - struct au_sbinfo *sbinfo; - - sbinfo = file->private_data; - if (sbinfo) - au_plink_clean(sbinfo->si_sb, /*verbose*/0); -} - -static int au_procfs_plm_write_si(struct file *file, unsigned long id) -{ - int err; - struct super_block *sb; - struct au_sbinfo *sbinfo; - - err = -EBUSY; - if (unlikely(file->private_data)) - goto out; - - sb = NULL; - /* don't use au_sbilist_lock() here */ - spin_lock(&au_sbilist.spin); - list_for_each_entry(sbinfo, &au_sbilist.head, si_list) - if (id == sysaufs_si_id(sbinfo)) { - kobject_get(&sbinfo->si_kobj); - sb = sbinfo->si_sb; - break; - } - spin_unlock(&au_sbilist.spin); - - err = -EINVAL; - if (unlikely(!sb)) - goto out; - - err = au_plink_maint_enter(sb); - if (!err) - /* keep kobject_get() */ - file->private_data = sbinfo; - else - kobject_put(&sbinfo->si_kobj); -out: - return err; -} - -/* - * Accept a valid "si=xxxx" only. - * Once it is accepted successfully, accept "clean" too. - */ -static ssize_t au_procfs_plm_write(struct file *file, const char __user *ubuf, - size_t count, loff_t *ppos) -{ - ssize_t err; - unsigned long id; - /* last newline is allowed */ - char buf[3 + sizeof(unsigned long) * 2 + 1]; - - err = -EACCES; - if (unlikely(!capable(CAP_SYS_ADMIN))) - goto out; - - err = -EINVAL; - if (unlikely(count > sizeof(buf))) - goto out; - - err = copy_from_user(buf, ubuf, count); - if (unlikely(err)) { - err = -EFAULT; - goto out; - } - buf[count] = 0; - - err = -EINVAL; - if (!strcmp("clean", buf)) { - au_procfs_plm_write_clean(file); - goto out_success; - } else if (unlikely(strncmp("si=", buf, 3))) - goto out; - - err = kstrtoul(buf + 3, 16, &id); - if (unlikely(err)) - goto out; - - err = au_procfs_plm_write_si(file, id); - if (unlikely(err)) - goto out; - -out_success: - err = count; /* success */ -out: - return err; -} - -static const struct file_operations au_procfs_plm_fop = { - .write = au_procfs_plm_write, - .release = au_procfs_plm_release, - .owner = THIS_MODULE -}; - -/* ---------------------------------------------------------------------- */ - -static struct proc_dir_entry *au_procfs_dir; - -void au_procfs_fin(void) -{ - remove_proc_entry(AUFS_PLINK_MAINT_NAME, au_procfs_dir); - remove_proc_entry(AUFS_PLINK_MAINT_DIR, NULL); -} - -int __init au_procfs_init(void) -{ - int err; - struct proc_dir_entry *entry; - - err = -ENOMEM; - au_procfs_dir = proc_mkdir(AUFS_PLINK_MAINT_DIR, NULL); - if (unlikely(!au_procfs_dir)) - goto out; - - entry = proc_create(AUFS_PLINK_MAINT_NAME, S_IFREG | S_IWUSR, - au_procfs_dir, &au_procfs_plm_fop); - if (unlikely(!entry)) - goto out_dir; - - err = 0; - goto out; /* success */ - - -out_dir: - remove_proc_entry(AUFS_PLINK_MAINT_DIR, NULL); -out: - return err; -} diff --git a/fs/aufs/rdu.c b/fs/aufs/rdu.c deleted file mode 100644 index 3beba1806..000000000 --- a/fs/aufs/rdu.c +++ /dev/null @@ -1,388 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * readdir in userspace. - */ - -#include <linux/compat.h> -#include <linux/fs_stack.h> -#include <linux/security.h> -#include "aufs.h" - -/* bits for struct aufs_rdu.flags */ -#define AuRdu_CALLED 1 -#define AuRdu_CONT (1 << 1) -#define AuRdu_FULL (1 << 2) -#define au_ftest_rdu(flags, name) ((flags) & AuRdu_##name) -#define au_fset_rdu(flags, name) \ - do { (flags) |= AuRdu_##name; } while (0) -#define au_fclr_rdu(flags, name) \ - do { (flags) &= ~AuRdu_##name; } while (0) - -struct au_rdu_arg { - struct dir_context ctx; - struct aufs_rdu *rdu; - union au_rdu_ent_ul ent; - unsigned long end; - - struct super_block *sb; - int err; -}; - -static int au_rdu_fill(struct dir_context *ctx, const char *name, int nlen, - loff_t offset, u64 h_ino, unsigned int d_type) -{ - int err, len; - struct au_rdu_arg *arg = container_of(ctx, struct au_rdu_arg, ctx); - struct aufs_rdu *rdu = arg->rdu; - struct au_rdu_ent ent; - - err = 0; - arg->err = 0; - au_fset_rdu(rdu->cookie.flags, CALLED); - len = au_rdu_len(nlen); - if (arg->ent.ul + len < arg->end) { - ent.ino = h_ino; - ent.bindex = rdu->cookie.bindex; - ent.type = d_type; - ent.nlen = nlen; - if (unlikely(nlen > AUFS_MAX_NAMELEN)) - ent.type = DT_UNKNOWN; - - /* unnecessary to support mmap_sem since this is a dir */ - err = -EFAULT; - if (copy_to_user(arg->ent.e, &ent, sizeof(ent))) - goto out; - if (copy_to_user(arg->ent.e->name, name, nlen)) - goto out; - /* the terminating NULL */ - if (__put_user(0, arg->ent.e->name + nlen)) - goto out; - err = 0; - /* AuDbg("%p, %.*s\n", arg->ent.p, nlen, name); */ - arg->ent.ul += len; - rdu->rent++; - } else { - err = -EFAULT; - au_fset_rdu(rdu->cookie.flags, FULL); - rdu->full = 1; - rdu->tail = arg->ent; - } - -out: - /* AuTraceErr(err); */ - return err; -} - -static int au_rdu_do(struct file *h_file, struct au_rdu_arg *arg) -{ - int err; - loff_t offset; - struct au_rdu_cookie *cookie = &arg->rdu->cookie; - - /* we don't have to care (FMODE_32BITHASH | FMODE_64BITHASH) for ext4 */ - offset = vfsub_llseek(h_file, cookie->h_pos, SEEK_SET); - err = offset; - if (unlikely(offset != cookie->h_pos)) - goto out; - - err = 0; - do { - arg->err = 0; - au_fclr_rdu(cookie->flags, CALLED); - /* smp_mb(); */ - err = vfsub_iterate_dir(h_file, &arg->ctx); - if (err >= 0) - err = arg->err; - } while (!err - && au_ftest_rdu(cookie->flags, CALLED) - && !au_ftest_rdu(cookie->flags, FULL)); - cookie->h_pos = h_file->f_pos; - -out: - AuTraceErr(err); - return err; -} - -static int au_rdu(struct file *file, struct aufs_rdu *rdu) -{ - int err; - aufs_bindex_t bend; - struct au_rdu_arg arg = { - .ctx = { - .actor = au_rdu_fill - } - }; - struct dentry *dentry; - struct inode *inode; - struct file *h_file; - struct au_rdu_cookie *cookie = &rdu->cookie; - - err = !access_ok(VERIFY_WRITE, rdu->ent.e, rdu->sz); - if (unlikely(err)) { - err = -EFAULT; - AuTraceErr(err); - goto out; - } - rdu->rent = 0; - rdu->tail = rdu->ent; - rdu->full = 0; - arg.rdu = rdu; - arg.ent = rdu->ent; - arg.end = arg.ent.ul; - arg.end += rdu->sz; - - err = -ENOTDIR; - if (unlikely(!file->f_op->iterate)) - goto out; - - err = security_file_permission(file, MAY_READ); - AuTraceErr(err); - if (unlikely(err)) - goto out; - - dentry = file->f_path.dentry; - inode = d_inode(dentry); -#if 1 - mutex_lock(&inode->i_mutex); -#else - err = mutex_lock_killable(&inode->i_mutex); - AuTraceErr(err); - if (unlikely(err)) - goto out; -#endif - - arg.sb = inode->i_sb; - err = si_read_lock(arg.sb, AuLock_FLUSH | AuLock_NOPLM); - if (unlikely(err)) - goto out_mtx; - err = au_alive_dir(dentry); - if (unlikely(err)) - goto out_si; - /* todo: reval? */ - fi_read_lock(file); - - err = -EAGAIN; - if (unlikely(au_ftest_rdu(cookie->flags, CONT) - && cookie->generation != au_figen(file))) - goto out_unlock; - - err = 0; - if (!rdu->blk) { - rdu->blk = au_sbi(arg.sb)->si_rdblk; - if (!rdu->blk) - rdu->blk = au_dir_size(file, /*dentry*/NULL); - } - bend = au_fbstart(file); - if (cookie->bindex < bend) - cookie->bindex = bend; - bend = au_fbend_dir(file); - /* AuDbg("b%d, b%d\n", cookie->bindex, bend); */ - for (; !err && cookie->bindex <= bend; - cookie->bindex++, cookie->h_pos = 0) { - h_file = au_hf_dir(file, cookie->bindex); - if (!h_file) - continue; - - au_fclr_rdu(cookie->flags, FULL); - err = au_rdu_do(h_file, &arg); - AuTraceErr(err); - if (unlikely(au_ftest_rdu(cookie->flags, FULL) || err)) - break; - } - AuDbg("rent %llu\n", rdu->rent); - - if (!err && !au_ftest_rdu(cookie->flags, CONT)) { - rdu->shwh = !!au_opt_test(au_sbi(arg.sb)->si_mntflags, SHWH); - au_fset_rdu(cookie->flags, CONT); - cookie->generation = au_figen(file); - } - - ii_read_lock_child(inode); - fsstack_copy_attr_atime(inode, au_h_iptr(inode, au_ibstart(inode))); - ii_read_unlock(inode); - -out_unlock: - fi_read_unlock(file); -out_si: - si_read_unlock(arg.sb); -out_mtx: - mutex_unlock(&inode->i_mutex); -out: - AuTraceErr(err); - return err; -} - -static int au_rdu_ino(struct file *file, struct aufs_rdu *rdu) -{ - int err; - ino_t ino; - unsigned long long nent; - union au_rdu_ent_ul *u; - struct au_rdu_ent ent; - struct super_block *sb; - - err = 0; - nent = rdu->nent; - u = &rdu->ent; - sb = file->f_path.dentry->d_sb; - si_read_lock(sb, AuLock_FLUSH); - while (nent-- > 0) { - /* unnecessary to support mmap_sem since this is a dir */ - err = copy_from_user(&ent, u->e, sizeof(ent)); - if (!err) - err = !access_ok(VERIFY_WRITE, &u->e->ino, sizeof(ino)); - if (unlikely(err)) { - err = -EFAULT; - AuTraceErr(err); - break; - } - - /* AuDbg("b%d, i%llu\n", ent.bindex, ent.ino); */ - if (!ent.wh) - err = au_ino(sb, ent.bindex, ent.ino, ent.type, &ino); - else - err = au_wh_ino(sb, ent.bindex, ent.ino, ent.type, - &ino); - if (unlikely(err)) { - AuTraceErr(err); - break; - } - - err = __put_user(ino, &u->e->ino); - if (unlikely(err)) { - err = -EFAULT; - AuTraceErr(err); - break; - } - u->ul += au_rdu_len(ent.nlen); - } - si_read_unlock(sb); - - return err; -} - -/* ---------------------------------------------------------------------- */ - -static int au_rdu_verify(struct aufs_rdu *rdu) -{ - AuDbg("rdu{%llu, %p, %u | %u | %llu, %u, %u | " - "%llu, b%d, 0x%x, g%u}\n", - rdu->sz, rdu->ent.e, rdu->verify[AufsCtlRduV_SZ], - rdu->blk, - rdu->rent, rdu->shwh, rdu->full, - rdu->cookie.h_pos, rdu->cookie.bindex, rdu->cookie.flags, - rdu->cookie.generation); - - if (rdu->verify[AufsCtlRduV_SZ] == sizeof(*rdu)) - return 0; - - AuDbg("%u:%u\n", - rdu->verify[AufsCtlRduV_SZ], (unsigned int)sizeof(*rdu)); - return -EINVAL; -} - -long au_rdu_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - long err, e; - struct aufs_rdu rdu; - void __user *p = (void __user *)arg; - - err = copy_from_user(&rdu, p, sizeof(rdu)); - if (unlikely(err)) { - err = -EFAULT; - AuTraceErr(err); - goto out; - } - err = au_rdu_verify(&rdu); - if (unlikely(err)) - goto out; - - switch (cmd) { - case AUFS_CTL_RDU: - err = au_rdu(file, &rdu); - if (unlikely(err)) - break; - - e = copy_to_user(p, &rdu, sizeof(rdu)); - if (unlikely(e)) { - err = -EFAULT; - AuTraceErr(err); - } - break; - case AUFS_CTL_RDU_INO: - err = au_rdu_ino(file, &rdu); - break; - - default: - /* err = -ENOTTY; */ - err = -EINVAL; - } - -out: - AuTraceErr(err); - return err; -} - -#ifdef CONFIG_COMPAT -long au_rdu_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - long err, e; - struct aufs_rdu rdu; - void __user *p = compat_ptr(arg); - - /* todo: get_user()? */ - err = copy_from_user(&rdu, p, sizeof(rdu)); - if (unlikely(err)) { - err = -EFAULT; - AuTraceErr(err); - goto out; - } - rdu.ent.e = compat_ptr(rdu.ent.ul); - err = au_rdu_verify(&rdu); - if (unlikely(err)) - goto out; - - switch (cmd) { - case AUFS_CTL_RDU: - err = au_rdu(file, &rdu); - if (unlikely(err)) - break; - - rdu.ent.ul = ptr_to_compat(rdu.ent.e); - rdu.tail.ul = ptr_to_compat(rdu.tail.e); - e = copy_to_user(p, &rdu, sizeof(rdu)); - if (unlikely(e)) { - err = -EFAULT; - AuTraceErr(err); - } - break; - case AUFS_CTL_RDU_INO: - err = au_rdu_ino(file, &rdu); - break; - - default: - /* err = -ENOTTY; */ - err = -EINVAL; - } - -out: - AuTraceErr(err); - return err; -} -#endif diff --git a/fs/aufs/rwsem.h b/fs/aufs/rwsem.h deleted file mode 100644 index deb813f3e..000000000 --- a/fs/aufs/rwsem.h +++ /dev/null @@ -1,191 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * simple read-write semaphore wrappers - */ - -#ifndef __AUFS_RWSEM_H__ -#define __AUFS_RWSEM_H__ - -#ifdef __KERNEL__ - -#include "debug.h" - -struct au_rwsem { - struct rw_semaphore rwsem; -#ifdef CONFIG_AUFS_DEBUG - /* just for debugging, not almighty counter */ - atomic_t rcnt, wcnt; -#endif -}; - -#ifdef CONFIG_AUFS_DEBUG -#define AuDbgCntInit(rw) do { \ - atomic_set(&(rw)->rcnt, 0); \ - atomic_set(&(rw)->wcnt, 0); \ - smp_mb(); /* atomic set */ \ -} while (0) - -#define AuDbgRcntInc(rw) atomic_inc(&(rw)->rcnt) -#define AuDbgRcntDec(rw) WARN_ON(atomic_dec_return(&(rw)->rcnt) < 0) -#define AuDbgWcntInc(rw) atomic_inc(&(rw)->wcnt) -#define AuDbgWcntDec(rw) WARN_ON(atomic_dec_return(&(rw)->wcnt) < 0) -#else -#define AuDbgCntInit(rw) do {} while (0) -#define AuDbgRcntInc(rw) do {} while (0) -#define AuDbgRcntDec(rw) do {} while (0) -#define AuDbgWcntInc(rw) do {} while (0) -#define AuDbgWcntDec(rw) do {} while (0) -#endif /* CONFIG_AUFS_DEBUG */ - -/* to debug easier, do not make them inlined functions */ -#define AuRwMustNoWaiters(rw) AuDebugOn(!list_empty(&(rw)->rwsem.wait_list)) -/* rwsem_is_locked() is unusable */ -#define AuRwMustReadLock(rw) AuDebugOn(atomic_read(&(rw)->rcnt) <= 0) -#define AuRwMustWriteLock(rw) AuDebugOn(atomic_read(&(rw)->wcnt) <= 0) -#define AuRwMustAnyLock(rw) AuDebugOn(atomic_read(&(rw)->rcnt) <= 0 \ - && atomic_read(&(rw)->wcnt) <= 0) -#define AuRwDestroy(rw) AuDebugOn(atomic_read(&(rw)->rcnt) \ - || atomic_read(&(rw)->wcnt)) - -#define au_rw_class(rw, key) lockdep_set_class(&(rw)->rwsem, key) - -static inline void au_rw_init(struct au_rwsem *rw) -{ - AuDbgCntInit(rw); - init_rwsem(&rw->rwsem); -} - -static inline void au_rw_init_wlock(struct au_rwsem *rw) -{ - au_rw_init(rw); - down_write(&rw->rwsem); - AuDbgWcntInc(rw); -} - -static inline void au_rw_init_wlock_nested(struct au_rwsem *rw, - unsigned int lsc) -{ - au_rw_init(rw); - down_write_nested(&rw->rwsem, lsc); - AuDbgWcntInc(rw); -} - -static inline void au_rw_read_lock(struct au_rwsem *rw) -{ - down_read(&rw->rwsem); - AuDbgRcntInc(rw); -} - -static inline void au_rw_read_lock_nested(struct au_rwsem *rw, unsigned int lsc) -{ - down_read_nested(&rw->rwsem, lsc); - AuDbgRcntInc(rw); -} - -static inline void au_rw_read_unlock(struct au_rwsem *rw) -{ - AuRwMustReadLock(rw); - AuDbgRcntDec(rw); - up_read(&rw->rwsem); -} - -static inline void au_rw_dgrade_lock(struct au_rwsem *rw) -{ - AuRwMustWriteLock(rw); - AuDbgRcntInc(rw); - AuDbgWcntDec(rw); - downgrade_write(&rw->rwsem); -} - -static inline void au_rw_write_lock(struct au_rwsem *rw) -{ - down_write(&rw->rwsem); - AuDbgWcntInc(rw); -} - -static inline void au_rw_write_lock_nested(struct au_rwsem *rw, - unsigned int lsc) -{ - down_write_nested(&rw->rwsem, lsc); - AuDbgWcntInc(rw); -} - -static inline void au_rw_write_unlock(struct au_rwsem *rw) -{ - AuRwMustWriteLock(rw); - AuDbgWcntDec(rw); - up_write(&rw->rwsem); -} - -/* why is not _nested version defined */ -static inline int au_rw_read_trylock(struct au_rwsem *rw) -{ - int ret; - - ret = down_read_trylock(&rw->rwsem); - if (ret) - AuDbgRcntInc(rw); - return ret; -} - -static inline int au_rw_write_trylock(struct au_rwsem *rw) -{ - int ret; - - ret = down_write_trylock(&rw->rwsem); - if (ret) - AuDbgWcntInc(rw); - return ret; -} - -#undef AuDbgCntInit -#undef AuDbgRcntInc -#undef AuDbgRcntDec -#undef AuDbgWcntInc -#undef AuDbgWcntDec - -#define AuSimpleLockRwsemFuncs(prefix, param, rwsem) \ -static inline void prefix##_read_lock(param) \ -{ au_rw_read_lock(rwsem); } \ -static inline void prefix##_write_lock(param) \ -{ au_rw_write_lock(rwsem); } \ -static inline int prefix##_read_trylock(param) \ -{ return au_rw_read_trylock(rwsem); } \ -static inline int prefix##_write_trylock(param) \ -{ return au_rw_write_trylock(rwsem); } -/* why is not _nested version defined */ -/* static inline void prefix##_read_trylock_nested(param, lsc) -{ au_rw_read_trylock_nested(rwsem, lsc)); } -static inline void prefix##_write_trylock_nestd(param, lsc) -{ au_rw_write_trylock_nested(rwsem, lsc); } */ - -#define AuSimpleUnlockRwsemFuncs(prefix, param, rwsem) \ -static inline void prefix##_read_unlock(param) \ -{ au_rw_read_unlock(rwsem); } \ -static inline void prefix##_write_unlock(param) \ -{ au_rw_write_unlock(rwsem); } \ -static inline void prefix##_downgrade_lock(param) \ -{ au_rw_dgrade_lock(rwsem); } - -#define AuSimpleRwsemFuncs(prefix, param, rwsem) \ - AuSimpleLockRwsemFuncs(prefix, param, rwsem) \ - AuSimpleUnlockRwsemFuncs(prefix, param, rwsem) - -#endif /* __KERNEL__ */ -#endif /* __AUFS_RWSEM_H__ */ diff --git a/fs/aufs/sbinfo.c b/fs/aufs/sbinfo.c deleted file mode 100644 index 4e59efc08..000000000 --- a/fs/aufs/sbinfo.c +++ /dev/null @@ -1,356 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * superblock private data - */ - -#include "aufs.h" - -/* - * they are necessary regardless sysfs is disabled. - */ -void au_si_free(struct kobject *kobj) -{ - int i; - struct au_sbinfo *sbinfo; - char *locked __maybe_unused; /* debug only */ - - sbinfo = container_of(kobj, struct au_sbinfo, si_kobj); - for (i = 0; i < AuPlink_NHASH; i++) - AuDebugOn(!hlist_empty(&sbinfo->si_plink[i].head)); - AuDebugOn(atomic_read(&sbinfo->si_nowait.nw_len)); - - au_rw_write_lock(&sbinfo->si_rwsem); - au_br_free(sbinfo); - au_rw_write_unlock(&sbinfo->si_rwsem); - - AuDebugOn(radix_tree_gang_lookup - (&sbinfo->au_si_pid.tree, (void **)&locked, - /*first_index*/PID_MAX_DEFAULT - 1, - /*max_items*/sizeof(locked)/sizeof(*locked))); - - kfree(sbinfo->si_branch); - kfree(sbinfo->au_si_pid.bitmap); - mutex_destroy(&sbinfo->si_xib_mtx); - AuRwDestroy(&sbinfo->si_rwsem); - - kfree(sbinfo); -} - -int au_si_alloc(struct super_block *sb) -{ - int err, i; - struct au_sbinfo *sbinfo; - static struct lock_class_key aufs_si; - - err = -ENOMEM; - sbinfo = kzalloc(sizeof(*sbinfo), GFP_NOFS); - if (unlikely(!sbinfo)) - goto out; - - BUILD_BUG_ON(sizeof(unsigned long) != - sizeof(*sbinfo->au_si_pid.bitmap)); - sbinfo->au_si_pid.bitmap = kcalloc(BITS_TO_LONGS(PID_MAX_DEFAULT), - sizeof(*sbinfo->au_si_pid.bitmap), - GFP_NOFS); - if (unlikely(!sbinfo->au_si_pid.bitmap)) - goto out_sbinfo; - - /* will be reallocated separately */ - sbinfo->si_branch = kzalloc(sizeof(*sbinfo->si_branch), GFP_NOFS); - if (unlikely(!sbinfo->si_branch)) - goto out_pidmap; - - err = sysaufs_si_init(sbinfo); - if (unlikely(err)) - goto out_br; - - au_nwt_init(&sbinfo->si_nowait); - au_rw_init_wlock(&sbinfo->si_rwsem); - au_rw_class(&sbinfo->si_rwsem, &aufs_si); - spin_lock_init(&sbinfo->au_si_pid.tree_lock); - INIT_RADIX_TREE(&sbinfo->au_si_pid.tree, GFP_ATOMIC | __GFP_NOFAIL); - - atomic_long_set(&sbinfo->si_ninodes, 0); - atomic_long_set(&sbinfo->si_nfiles, 0); - - sbinfo->si_bend = -1; - sbinfo->si_last_br_id = AUFS_BRANCH_MAX / 2; - - sbinfo->si_wbr_copyup = AuWbrCopyup_Def; - sbinfo->si_wbr_create = AuWbrCreate_Def; - sbinfo->si_wbr_copyup_ops = au_wbr_copyup_ops + sbinfo->si_wbr_copyup; - sbinfo->si_wbr_create_ops = au_wbr_create_ops + sbinfo->si_wbr_create; - - au_fhsm_init(sbinfo); - - sbinfo->si_mntflags = au_opts_plink(AuOpt_Def); - - sbinfo->si_xino_jiffy = jiffies; - sbinfo->si_xino_expire - = msecs_to_jiffies(AUFS_XINO_DEF_SEC * MSEC_PER_SEC); - mutex_init(&sbinfo->si_xib_mtx); - sbinfo->si_xino_brid = -1; - /* leave si_xib_last_pindex and si_xib_next_bit */ - - au_sphl_init(&sbinfo->si_aopen); - - sbinfo->si_rdcache = msecs_to_jiffies(AUFS_RDCACHE_DEF * MSEC_PER_SEC); - sbinfo->si_rdblk = AUFS_RDBLK_DEF; - sbinfo->si_rdhash = AUFS_RDHASH_DEF; - sbinfo->si_dirwh = AUFS_DIRWH_DEF; - - for (i = 0; i < AuPlink_NHASH; i++) - au_sphl_init(sbinfo->si_plink + i); - init_waitqueue_head(&sbinfo->si_plink_wq); - spin_lock_init(&sbinfo->si_plink_maint_lock); - - au_sphl_init(&sbinfo->si_files); - - /* leave other members for sysaufs and si_mnt. */ - sbinfo->si_sb = sb; - sb->s_fs_info = sbinfo; - si_pid_set(sb); - return 0; /* success */ - -out_br: - kfree(sbinfo->si_branch); -out_pidmap: - kfree(sbinfo->au_si_pid.bitmap); -out_sbinfo: - kfree(sbinfo); -out: - return err; -} - -int au_sbr_realloc(struct au_sbinfo *sbinfo, int nbr) -{ - int err, sz; - struct au_branch **brp; - - AuRwMustWriteLock(&sbinfo->si_rwsem); - - err = -ENOMEM; - sz = sizeof(*brp) * (sbinfo->si_bend + 1); - if (unlikely(!sz)) - sz = sizeof(*brp); - brp = au_kzrealloc(sbinfo->si_branch, sz, sizeof(*brp) * nbr, GFP_NOFS); - if (brp) { - sbinfo->si_branch = brp; - err = 0; - } - - return err; -} - -/* ---------------------------------------------------------------------- */ - -unsigned int au_sigen_inc(struct super_block *sb) -{ - unsigned int gen; - struct inode *inode; - - SiMustWriteLock(sb); - - gen = ++au_sbi(sb)->si_generation; - au_update_digen(sb->s_root); - inode = d_inode(sb->s_root); - au_update_iigen(inode, /*half*/0); - inode->i_version++; - return gen; -} - -aufs_bindex_t au_new_br_id(struct super_block *sb) -{ - aufs_bindex_t br_id; - int i; - struct au_sbinfo *sbinfo; - - SiMustWriteLock(sb); - - sbinfo = au_sbi(sb); - for (i = 0; i <= AUFS_BRANCH_MAX; i++) { - br_id = ++sbinfo->si_last_br_id; - AuDebugOn(br_id < 0); - if (br_id && au_br_index(sb, br_id) < 0) - return br_id; - } - - return -1; -} - -/* ---------------------------------------------------------------------- */ - -/* it is ok that new 'nwt' tasks are appended while we are sleeping */ -int si_read_lock(struct super_block *sb, int flags) -{ - int err; - - err = 0; - if (au_ftest_lock(flags, FLUSH)) - au_nwt_flush(&au_sbi(sb)->si_nowait); - - si_noflush_read_lock(sb); - err = au_plink_maint(sb, flags); - if (unlikely(err)) - si_read_unlock(sb); - - return err; -} - -int si_write_lock(struct super_block *sb, int flags) -{ - int err; - - if (au_ftest_lock(flags, FLUSH)) - au_nwt_flush(&au_sbi(sb)->si_nowait); - - si_noflush_write_lock(sb); - err = au_plink_maint(sb, flags); - if (unlikely(err)) - si_write_unlock(sb); - - return err; -} - -/* dentry and super_block lock. call at entry point */ -int aufs_read_lock(struct dentry *dentry, int flags) -{ - int err; - struct super_block *sb; - - sb = dentry->d_sb; - err = si_read_lock(sb, flags); - if (unlikely(err)) - goto out; - - if (au_ftest_lock(flags, DW)) - di_write_lock_child(dentry); - else - di_read_lock_child(dentry, flags); - - if (au_ftest_lock(flags, GEN)) { - err = au_digen_test(dentry, au_sigen(sb)); - AuDebugOn(!err && au_dbrange_test(dentry)); - if (unlikely(err)) - aufs_read_unlock(dentry, flags); - } - -out: - return err; -} - -void aufs_read_unlock(struct dentry *dentry, int flags) -{ - if (au_ftest_lock(flags, DW)) - di_write_unlock(dentry); - else - di_read_unlock(dentry, flags); - si_read_unlock(dentry->d_sb); -} - -void aufs_write_lock(struct dentry *dentry) -{ - si_write_lock(dentry->d_sb, AuLock_FLUSH | AuLock_NOPLMW); - di_write_lock_child(dentry); -} - -void aufs_write_unlock(struct dentry *dentry) -{ - di_write_unlock(dentry); - si_write_unlock(dentry->d_sb); -} - -int aufs_read_and_write_lock2(struct dentry *d1, struct dentry *d2, int flags) -{ - int err; - unsigned int sigen; - struct super_block *sb; - - sb = d1->d_sb; - err = si_read_lock(sb, flags); - if (unlikely(err)) - goto out; - - di_write_lock2_child(d1, d2, au_ftest_lock(flags, DIR)); - - if (au_ftest_lock(flags, GEN)) { - sigen = au_sigen(sb); - err = au_digen_test(d1, sigen); - AuDebugOn(!err && au_dbrange_test(d1)); - if (!err) { - err = au_digen_test(d2, sigen); - AuDebugOn(!err && au_dbrange_test(d2)); - } - if (unlikely(err)) - aufs_read_and_write_unlock2(d1, d2); - } - -out: - return err; -} - -void aufs_read_and_write_unlock2(struct dentry *d1, struct dentry *d2) -{ - di_write_unlock2(d1, d2); - si_read_unlock(d1->d_sb); -} - -/* ---------------------------------------------------------------------- */ - -int si_pid_test_slow(struct super_block *sb) -{ - void *p; - - rcu_read_lock(); - p = radix_tree_lookup(&au_sbi(sb)->au_si_pid.tree, current->pid); - rcu_read_unlock(); - - return (long)!!p; -} - -void si_pid_set_slow(struct super_block *sb) -{ - int err; - struct au_sbinfo *sbinfo; - - AuDebugOn(si_pid_test_slow(sb)); - - sbinfo = au_sbi(sb); - err = radix_tree_preload(GFP_NOFS | __GFP_NOFAIL); - AuDebugOn(err); - spin_lock(&sbinfo->au_si_pid.tree_lock); - err = radix_tree_insert(&sbinfo->au_si_pid.tree, current->pid, - /*any valid ptr*/sb); - spin_unlock(&sbinfo->au_si_pid.tree_lock); - AuDebugOn(err); - radix_tree_preload_end(); -} - -void si_pid_clr_slow(struct super_block *sb) -{ - void *p; - struct au_sbinfo *sbinfo; - - AuDebugOn(!si_pid_test_slow(sb)); - - sbinfo = au_sbi(sb); - spin_lock(&sbinfo->au_si_pid.tree_lock); - p = radix_tree_delete(&sbinfo->au_si_pid.tree, current->pid); - spin_unlock(&sbinfo->au_si_pid.tree_lock); -} diff --git a/fs/aufs/spl.h b/fs/aufs/spl.h deleted file mode 100644 index 94f09f298..000000000 --- a/fs/aufs/spl.h +++ /dev/null @@ -1,111 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * simple list protected by a spinlock - */ - -#ifndef __AUFS_SPL_H__ -#define __AUFS_SPL_H__ - -#ifdef __KERNEL__ - -struct au_splhead { - spinlock_t spin; - struct list_head head; -}; - -static inline void au_spl_init(struct au_splhead *spl) -{ - spin_lock_init(&spl->spin); - INIT_LIST_HEAD(&spl->head); -} - -static inline void au_spl_add(struct list_head *list, struct au_splhead *spl) -{ - spin_lock(&spl->spin); - list_add(list, &spl->head); - spin_unlock(&spl->spin); -} - -static inline void au_spl_del(struct list_head *list, struct au_splhead *spl) -{ - spin_lock(&spl->spin); - list_del(list); - spin_unlock(&spl->spin); -} - -static inline void au_spl_del_rcu(struct list_head *list, - struct au_splhead *spl) -{ - spin_lock(&spl->spin); - list_del_rcu(list); - spin_unlock(&spl->spin); -} - -/* ---------------------------------------------------------------------- */ - -struct au_sphlhead { - spinlock_t spin; - struct hlist_head head; -}; - -static inline void au_sphl_init(struct au_sphlhead *sphl) -{ - spin_lock_init(&sphl->spin); - INIT_HLIST_HEAD(&sphl->head); -} - -static inline void au_sphl_add(struct hlist_node *hlist, - struct au_sphlhead *sphl) -{ - spin_lock(&sphl->spin); - hlist_add_head(hlist, &sphl->head); - spin_unlock(&sphl->spin); -} - -static inline void au_sphl_del(struct hlist_node *hlist, - struct au_sphlhead *sphl) -{ - spin_lock(&sphl->spin); - hlist_del(hlist); - spin_unlock(&sphl->spin); -} - -static inline void au_sphl_del_rcu(struct hlist_node *hlist, - struct au_sphlhead *sphl) -{ - spin_lock(&sphl->spin); - hlist_del_rcu(hlist); - spin_unlock(&sphl->spin); -} - -static inline unsigned long au_sphl_count(struct au_sphlhead *sphl) -{ - unsigned long cnt; - struct hlist_node *pos; - - cnt = 0; - spin_lock(&sphl->spin); - hlist_for_each(pos, &sphl->head) - cnt++; - spin_unlock(&sphl->spin); - return cnt; -} - -#endif /* __KERNEL__ */ -#endif /* __AUFS_SPL_H__ */ diff --git a/fs/aufs/super.c b/fs/aufs/super.c deleted file mode 100644 index 6b7f38e48..000000000 --- a/fs/aufs/super.c +++ /dev/null @@ -1,1004 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * mount and super_block operations - */ - -#include <linux/mm.h> -#include <linux/seq_file.h> -#include <linux/statfs.h> -#include <linux/vmalloc.h> -#include "aufs.h" - -/* - * super_operations - */ -static struct inode *aufs_alloc_inode(struct super_block *sb __maybe_unused) -{ - struct au_icntnr *c; - - c = au_cache_alloc_icntnr(); - if (c) { - au_icntnr_init(c); - c->vfs_inode.i_version = 1; /* sigen(sb); */ - c->iinfo.ii_hinode = NULL; - return &c->vfs_inode; - } - return NULL; -} - -static void aufs_destroy_inode_cb(struct rcu_head *head) -{ - struct inode *inode = container_of(head, struct inode, i_rcu); - - INIT_HLIST_HEAD(&inode->i_dentry); - au_cache_free_icntnr(container_of(inode, struct au_icntnr, vfs_inode)); -} - -static void aufs_destroy_inode(struct inode *inode) -{ - au_iinfo_fin(inode); - call_rcu(&inode->i_rcu, aufs_destroy_inode_cb); -} - -struct inode *au_iget_locked(struct super_block *sb, ino_t ino) -{ - struct inode *inode; - int err; - - inode = iget_locked(sb, ino); - if (unlikely(!inode)) { - inode = ERR_PTR(-ENOMEM); - goto out; - } - if (!(inode->i_state & I_NEW)) - goto out; - - err = au_xigen_new(inode); - if (!err) - err = au_iinfo_init(inode); - if (!err) - inode->i_version++; - else { - iget_failed(inode); - inode = ERR_PTR(err); - } - -out: - /* never return NULL */ - AuDebugOn(!inode); - AuTraceErrPtr(inode); - return inode; -} - -/* lock free root dinfo */ -static int au_show_brs(struct seq_file *seq, struct super_block *sb) -{ - int err; - aufs_bindex_t bindex, bend; - struct path path; - struct au_hdentry *hdp; - struct au_branch *br; - au_br_perm_str_t perm; - - err = 0; - bend = au_sbend(sb); - hdp = au_di(sb->s_root)->di_hdentry; - for (bindex = 0; !err && bindex <= bend; bindex++) { - br = au_sbr(sb, bindex); - path.mnt = au_br_mnt(br); - path.dentry = hdp[bindex].hd_dentry; - err = au_seq_path(seq, &path); - if (err > 0) { - au_optstr_br_perm(&perm, br->br_perm); - err = seq_printf(seq, "=%s", perm.a); - if (err == -1) - err = -E2BIG; - } - if (!err && bindex != bend) - err = seq_putc(seq, ':'); - } - - return err; -} - -static void au_show_wbr_create(struct seq_file *m, int v, - struct au_sbinfo *sbinfo) -{ - const char *pat; - - AuRwMustAnyLock(&sbinfo->si_rwsem); - - seq_puts(m, ",create="); - pat = au_optstr_wbr_create(v); - switch (v) { - case AuWbrCreate_TDP: - case AuWbrCreate_RR: - case AuWbrCreate_MFS: - case AuWbrCreate_PMFS: - seq_puts(m, pat); - break; - case AuWbrCreate_MFSV: - seq_printf(m, /*pat*/"mfs:%lu", - jiffies_to_msecs(sbinfo->si_wbr_mfs.mfs_expire) - / MSEC_PER_SEC); - break; - case AuWbrCreate_PMFSV: - seq_printf(m, /*pat*/"pmfs:%lu", - jiffies_to_msecs(sbinfo->si_wbr_mfs.mfs_expire) - / MSEC_PER_SEC); - break; - case AuWbrCreate_MFSRR: - seq_printf(m, /*pat*/"mfsrr:%llu", - sbinfo->si_wbr_mfs.mfsrr_watermark); - break; - case AuWbrCreate_MFSRRV: - seq_printf(m, /*pat*/"mfsrr:%llu:%lu", - sbinfo->si_wbr_mfs.mfsrr_watermark, - jiffies_to_msecs(sbinfo->si_wbr_mfs.mfs_expire) - / MSEC_PER_SEC); - break; - case AuWbrCreate_PMFSRR: - seq_printf(m, /*pat*/"pmfsrr:%llu", - sbinfo->si_wbr_mfs.mfsrr_watermark); - break; - case AuWbrCreate_PMFSRRV: - seq_printf(m, /*pat*/"pmfsrr:%llu:%lu", - sbinfo->si_wbr_mfs.mfsrr_watermark, - jiffies_to_msecs(sbinfo->si_wbr_mfs.mfs_expire) - / MSEC_PER_SEC); - break; - } -} - -static int au_show_xino(struct seq_file *seq, struct super_block *sb) -{ -#ifdef CONFIG_SYSFS - return 0; -#else - int err; - const int len = sizeof(AUFS_XINO_FNAME) - 1; - aufs_bindex_t bindex, brid; - struct qstr *name; - struct file *f; - struct dentry *d, *h_root; - struct au_hdentry *hdp; - - AuRwMustAnyLock(&sbinfo->si_rwsem); - - err = 0; - f = au_sbi(sb)->si_xib; - if (!f) - goto out; - - /* stop printing the default xino path on the first writable branch */ - h_root = NULL; - brid = au_xino_brid(sb); - if (brid >= 0) { - bindex = au_br_index(sb, brid); - hdp = au_di(sb->s_root)->di_hdentry; - h_root = hdp[0 + bindex].hd_dentry; - } - d = f->f_path.dentry; - name = &d->d_name; - /* safe ->d_parent because the file is unlinked */ - if (d->d_parent == h_root - && name->len == len - && !memcmp(name->name, AUFS_XINO_FNAME, len)) - goto out; - - seq_puts(seq, ",xino="); - err = au_xino_path(seq, f); - -out: - return err; -#endif -} - -/* seq_file will re-call me in case of too long string */ -static int aufs_show_options(struct seq_file *m, struct dentry *dentry) -{ - int err; - unsigned int mnt_flags, v; - struct super_block *sb; - struct au_sbinfo *sbinfo; - -#define AuBool(name, str) do { \ - v = au_opt_test(mnt_flags, name); \ - if (v != au_opt_test(AuOpt_Def, name)) \ - seq_printf(m, ",%s" #str, v ? "" : "no"); \ -} while (0) - -#define AuStr(name, str) do { \ - v = mnt_flags & AuOptMask_##name; \ - if (v != (AuOpt_Def & AuOptMask_##name)) \ - seq_printf(m, "," #str "=%s", au_optstr_##str(v)); \ -} while (0) - -#define AuUInt(name, str, val) do { \ - if (val != AUFS_##name##_DEF) \ - seq_printf(m, "," #str "=%u", val); \ -} while (0) - - sb = dentry->d_sb; - if (sb->s_flags & MS_POSIXACL) - seq_puts(m, ",acl"); - - /* lock free root dinfo */ - si_noflush_read_lock(sb); - sbinfo = au_sbi(sb); - seq_printf(m, ",si=%lx", sysaufs_si_id(sbinfo)); - - mnt_flags = au_mntflags(sb); - if (au_opt_test(mnt_flags, XINO)) { - err = au_show_xino(m, sb); - if (unlikely(err)) - goto out; - } else - seq_puts(m, ",noxino"); - - AuBool(TRUNC_XINO, trunc_xino); - AuStr(UDBA, udba); - AuBool(SHWH, shwh); - AuBool(PLINK, plink); - AuBool(DIO, dio); - AuBool(DIRPERM1, dirperm1); - - v = sbinfo->si_wbr_create; - if (v != AuWbrCreate_Def) - au_show_wbr_create(m, v, sbinfo); - - v = sbinfo->si_wbr_copyup; - if (v != AuWbrCopyup_Def) - seq_printf(m, ",cpup=%s", au_optstr_wbr_copyup(v)); - - v = au_opt_test(mnt_flags, ALWAYS_DIROPQ); - if (v != au_opt_test(AuOpt_Def, ALWAYS_DIROPQ)) - seq_printf(m, ",diropq=%c", v ? 'a' : 'w'); - - AuUInt(DIRWH, dirwh, sbinfo->si_dirwh); - - v = jiffies_to_msecs(sbinfo->si_rdcache) / MSEC_PER_SEC; - AuUInt(RDCACHE, rdcache, v); - - AuUInt(RDBLK, rdblk, sbinfo->si_rdblk); - AuUInt(RDHASH, rdhash, sbinfo->si_rdhash); - - au_fhsm_show(m, sbinfo); - - AuBool(SUM, sum); - /* AuBool(SUM_W, wsum); */ - AuBool(WARN_PERM, warn_perm); - AuBool(VERBOSE, verbose); - -out: - /* be sure to print "br:" last */ - if (!sysaufs_brs) { - seq_puts(m, ",br:"); - au_show_brs(m, sb); - } - si_read_unlock(sb); - return 0; - -#undef AuBool -#undef AuStr -#undef AuUInt -} - -/* ---------------------------------------------------------------------- */ - -/* sum mode which returns the summation for statfs(2) */ - -static u64 au_add_till_max(u64 a, u64 b) -{ - u64 old; - - old = a; - a += b; - if (old <= a) - return a; - return ULLONG_MAX; -} - -static u64 au_mul_till_max(u64 a, long mul) -{ - u64 old; - - old = a; - a *= mul; - if (old <= a) - return a; - return ULLONG_MAX; -} - -static int au_statfs_sum(struct super_block *sb, struct kstatfs *buf) -{ - int err; - long bsize, factor; - u64 blocks, bfree, bavail, files, ffree; - aufs_bindex_t bend, bindex, i; - unsigned char shared; - struct path h_path; - struct super_block *h_sb; - - err = 0; - bsize = LONG_MAX; - files = 0; - ffree = 0; - blocks = 0; - bfree = 0; - bavail = 0; - bend = au_sbend(sb); - for (bindex = 0; bindex <= bend; bindex++) { - h_path.mnt = au_sbr_mnt(sb, bindex); - h_sb = h_path.mnt->mnt_sb; - shared = 0; - for (i = 0; !shared && i < bindex; i++) - shared = (au_sbr_sb(sb, i) == h_sb); - if (shared) - continue; - - /* sb->s_root for NFS is unreliable */ - h_path.dentry = h_path.mnt->mnt_root; - err = vfs_statfs(&h_path, buf); - if (unlikely(err)) - goto out; - - if (bsize > buf->f_bsize) { - /* - * we will reduce bsize, so we have to expand blocks - * etc. to match them again - */ - factor = (bsize / buf->f_bsize); - blocks = au_mul_till_max(blocks, factor); - bfree = au_mul_till_max(bfree, factor); - bavail = au_mul_till_max(bavail, factor); - bsize = buf->f_bsize; - } - - factor = (buf->f_bsize / bsize); - blocks = au_add_till_max(blocks, - au_mul_till_max(buf->f_blocks, factor)); - bfree = au_add_till_max(bfree, - au_mul_till_max(buf->f_bfree, factor)); - bavail = au_add_till_max(bavail, - au_mul_till_max(buf->f_bavail, factor)); - files = au_add_till_max(files, buf->f_files); - ffree = au_add_till_max(ffree, buf->f_ffree); - } - - buf->f_bsize = bsize; - buf->f_blocks = blocks; - buf->f_bfree = bfree; - buf->f_bavail = bavail; - buf->f_files = files; - buf->f_ffree = ffree; - buf->f_frsize = 0; - -out: - return err; -} - -static int aufs_statfs(struct dentry *dentry, struct kstatfs *buf) -{ - int err; - struct path h_path; - struct super_block *sb; - - /* lock free root dinfo */ - sb = dentry->d_sb; - si_noflush_read_lock(sb); - if (!au_opt_test(au_mntflags(sb), SUM)) { - /* sb->s_root for NFS is unreliable */ - h_path.mnt = au_sbr_mnt(sb, 0); - h_path.dentry = h_path.mnt->mnt_root; - err = vfs_statfs(&h_path, buf); - } else - err = au_statfs_sum(sb, buf); - si_read_unlock(sb); - - if (!err) { - buf->f_type = AUFS_SUPER_MAGIC; - buf->f_namelen = AUFS_MAX_NAMELEN; - memset(&buf->f_fsid, 0, sizeof(buf->f_fsid)); - } - /* buf->f_bsize = buf->f_blocks = buf->f_bfree = buf->f_bavail = -1; */ - - return err; -} - -/* ---------------------------------------------------------------------- */ - -static int aufs_sync_fs(struct super_block *sb, int wait) -{ - int err, e; - aufs_bindex_t bend, bindex; - struct au_branch *br; - struct super_block *h_sb; - - err = 0; - si_noflush_read_lock(sb); - bend = au_sbend(sb); - for (bindex = 0; bindex <= bend; bindex++) { - br = au_sbr(sb, bindex); - if (!au_br_writable(br->br_perm)) - continue; - - h_sb = au_sbr_sb(sb, bindex); - if (h_sb->s_op->sync_fs) { - e = h_sb->s_op->sync_fs(h_sb, wait); - if (unlikely(e && !err)) - err = e; - /* go on even if an error happens */ - } - } - si_read_unlock(sb); - - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* final actions when unmounting a file system */ -static void aufs_put_super(struct super_block *sb) -{ - struct au_sbinfo *sbinfo; - - sbinfo = au_sbi(sb); - if (!sbinfo) - return; - - dbgaufs_si_fin(sbinfo); - kobject_put(&sbinfo->si_kobj); -} - -/* ---------------------------------------------------------------------- */ - -void au_array_free(void *array) -{ - if (array) { - if (!is_vmalloc_addr(array)) - kfree(array); - else - vfree(array); - } -} - -void *au_array_alloc(unsigned long long *hint, au_arraycb_t cb, void *arg) -{ - void *array; - unsigned long long n, sz; - - array = NULL; - n = 0; - if (!*hint) - goto out; - - if (*hint > ULLONG_MAX / sizeof(array)) { - array = ERR_PTR(-EMFILE); - pr_err("hint %llu\n", *hint); - goto out; - } - - sz = sizeof(array) * *hint; - array = kzalloc(sz, GFP_NOFS); - if (unlikely(!array)) - array = vzalloc(sz); - if (unlikely(!array)) { - array = ERR_PTR(-ENOMEM); - goto out; - } - - n = cb(array, *hint, arg); - AuDebugOn(n > *hint); - -out: - *hint = n; - return array; -} - -static unsigned long long au_iarray_cb(void *a, - unsigned long long max __maybe_unused, - void *arg) -{ - unsigned long long n; - struct inode **p, *inode; - struct list_head *head; - - n = 0; - p = a; - head = arg; - spin_lock(&inode_sb_list_lock); - list_for_each_entry(inode, head, i_sb_list) { - if (!is_bad_inode(inode) - && au_ii(inode)->ii_bstart >= 0) { - spin_lock(&inode->i_lock); - if (atomic_read(&inode->i_count)) { - au_igrab(inode); - *p++ = inode; - n++; - AuDebugOn(n > max); - } - spin_unlock(&inode->i_lock); - } - } - spin_unlock(&inode_sb_list_lock); - - return n; -} - -struct inode **au_iarray_alloc(struct super_block *sb, unsigned long long *max) -{ - *max = atomic_long_read(&au_sbi(sb)->si_ninodes); - return au_array_alloc(max, au_iarray_cb, &sb->s_inodes); -} - -void au_iarray_free(struct inode **a, unsigned long long max) -{ - unsigned long long ull; - - for (ull = 0; ull < max; ull++) - iput(a[ull]); - au_array_free(a); -} - -/* ---------------------------------------------------------------------- */ - -/* - * refresh dentry and inode at remount time. - */ -/* todo: consolidate with simple_reval_dpath() and au_reval_for_attr() */ -static int au_do_refresh(struct dentry *dentry, unsigned int dir_flags, - struct dentry *parent) -{ - int err; - - di_write_lock_child(dentry); - di_read_lock_parent(parent, AuLock_IR); - err = au_refresh_dentry(dentry, parent); - if (!err && dir_flags) - au_hn_reset(d_inode(dentry), dir_flags); - di_read_unlock(parent, AuLock_IR); - di_write_unlock(dentry); - - return err; -} - -static int au_do_refresh_d(struct dentry *dentry, unsigned int sigen, - struct au_sbinfo *sbinfo, - const unsigned int dir_flags) -{ - int err; - struct dentry *parent; - - err = 0; - parent = dget_parent(dentry); - if (!au_digen_test(parent, sigen) && au_digen_test(dentry, sigen)) { - if (d_really_is_positive(dentry)) { - if (!d_is_dir(dentry)) - err = au_do_refresh(dentry, /*dir_flags*/0, - parent); - else { - err = au_do_refresh(dentry, dir_flags, parent); - if (unlikely(err)) - au_fset_si(sbinfo, FAILED_REFRESH_DIR); - } - } else - err = au_do_refresh(dentry, /*dir_flags*/0, parent); - AuDbgDentry(dentry); - } - dput(parent); - - AuTraceErr(err); - return err; -} - -static int au_refresh_d(struct super_block *sb) -{ - int err, i, j, ndentry, e; - unsigned int sigen; - struct au_dcsub_pages dpages; - struct au_dpage *dpage; - struct dentry **dentries, *d; - struct au_sbinfo *sbinfo; - struct dentry *root = sb->s_root; - const unsigned int dir_flags = au_hi_flags(d_inode(root), /*isdir*/1); - - err = au_dpages_init(&dpages, GFP_NOFS); - if (unlikely(err)) - goto out; - err = au_dcsub_pages(&dpages, root, NULL, NULL); - if (unlikely(err)) - goto out_dpages; - - sigen = au_sigen(sb); - sbinfo = au_sbi(sb); - for (i = 0; i < dpages.ndpage; i++) { - dpage = dpages.dpages + i; - dentries = dpage->dentries; - ndentry = dpage->ndentry; - for (j = 0; j < ndentry; j++) { - d = dentries[j]; - e = au_do_refresh_d(d, sigen, sbinfo, dir_flags); - if (unlikely(e && !err)) - err = e; - /* go on even err */ - } - } - -out_dpages: - au_dpages_free(&dpages); -out: - return err; -} - -static int au_refresh_i(struct super_block *sb) -{ - int err, e; - unsigned int sigen; - unsigned long long max, ull; - struct inode *inode, **array; - - array = au_iarray_alloc(sb, &max); - err = PTR_ERR(array); - if (IS_ERR(array)) - goto out; - - err = 0; - sigen = au_sigen(sb); - for (ull = 0; ull < max; ull++) { - inode = array[ull]; - if (unlikely(!inode)) - break; - if (au_iigen(inode, NULL) != sigen) { - ii_write_lock_child(inode); - e = au_refresh_hinode_self(inode); - ii_write_unlock(inode); - if (unlikely(e)) { - pr_err("error %d, i%lu\n", e, inode->i_ino); - if (!err) - err = e; - /* go on even if err */ - } - } - } - - au_iarray_free(array, max); - -out: - return err; -} - -static void au_remount_refresh(struct super_block *sb) -{ - int err, e; - unsigned int udba; - aufs_bindex_t bindex, bend; - struct dentry *root; - struct inode *inode; - struct au_branch *br; - - au_sigen_inc(sb); - au_fclr_si(au_sbi(sb), FAILED_REFRESH_DIR); - - root = sb->s_root; - DiMustNoWaiters(root); - inode = d_inode(root); - IiMustNoWaiters(inode); - - udba = au_opt_udba(sb); - bend = au_sbend(sb); - for (bindex = 0; bindex <= bend; bindex++) { - br = au_sbr(sb, bindex); - err = au_hnotify_reset_br(udba, br, br->br_perm); - if (unlikely(err)) - AuIOErr("hnotify failed on br %d, %d, ignored\n", - bindex, err); - /* go on even if err */ - } - au_hn_reset(inode, au_hi_flags(inode, /*isdir*/1)); - - di_write_unlock(root); - err = au_refresh_d(sb); - e = au_refresh_i(sb); - if (unlikely(e && !err)) - err = e; - /* aufs_write_lock() calls ..._child() */ - di_write_lock_child(root); - - au_cpup_attr_all(inode, /*force*/1); - - if (unlikely(err)) - AuIOErr("refresh failed, ignored, %d\n", err); -} - -/* stop extra interpretation of errno in mount(8), and strange error messages */ -static int cvt_err(int err) -{ - AuTraceErr(err); - - switch (err) { - case -ENOENT: - case -ENOTDIR: - case -EEXIST: - case -EIO: - err = -EINVAL; - } - return err; -} - -static int aufs_remount_fs(struct super_block *sb, int *flags, char *data) -{ - int err, do_dx; - unsigned int mntflags; - struct au_opts opts; - struct dentry *root; - struct inode *inode; - struct au_sbinfo *sbinfo; - - err = 0; - root = sb->s_root; - if (!data || !*data) { - err = si_write_lock(sb, AuLock_FLUSH | AuLock_NOPLM); - if (!err) { - di_write_lock_child(root); - err = au_opts_verify(sb, *flags, /*pending*/0); - aufs_write_unlock(root); - } - goto out; - } - - err = -ENOMEM; - memset(&opts, 0, sizeof(opts)); - opts.opt = (void *)__get_free_page(GFP_NOFS); - if (unlikely(!opts.opt)) - goto out; - opts.max_opt = PAGE_SIZE / sizeof(*opts.opt); - opts.flags = AuOpts_REMOUNT; - opts.sb_flags = *flags; - - /* parse it before aufs lock */ - err = au_opts_parse(sb, data, &opts); - if (unlikely(err)) - goto out_opts; - - sbinfo = au_sbi(sb); - inode = d_inode(root); - mutex_lock(&inode->i_mutex); - err = si_write_lock(sb, AuLock_FLUSH | AuLock_NOPLM); - if (unlikely(err)) - goto out_mtx; - di_write_lock_child(root); - - /* au_opts_remount() may return an error */ - err = au_opts_remount(sb, &opts); - au_opts_free(&opts); - - if (au_ftest_opts(opts.flags, REFRESH)) - au_remount_refresh(sb); - - if (au_ftest_opts(opts.flags, REFRESH_DYAOP)) { - mntflags = au_mntflags(sb); - do_dx = !!au_opt_test(mntflags, DIO); - au_dy_arefresh(do_dx); - } - - au_fhsm_wrote_all(sb, /*force*/1); /* ?? */ - aufs_write_unlock(root); - -out_mtx: - mutex_unlock(&inode->i_mutex); -out_opts: - free_page((unsigned long)opts.opt); -out: - err = cvt_err(err); - AuTraceErr(err); - return err; -} - -static const struct super_operations aufs_sop = { - .alloc_inode = aufs_alloc_inode, - .destroy_inode = aufs_destroy_inode, - /* always deleting, no clearing */ - .drop_inode = generic_delete_inode, - .show_options = aufs_show_options, - .statfs = aufs_statfs, - .put_super = aufs_put_super, - .sync_fs = aufs_sync_fs, - .remount_fs = aufs_remount_fs -}; - -/* ---------------------------------------------------------------------- */ - -static int alloc_root(struct super_block *sb) -{ - int err; - struct inode *inode; - struct dentry *root; - - err = -ENOMEM; - inode = au_iget_locked(sb, AUFS_ROOT_INO); - err = PTR_ERR(inode); - if (IS_ERR(inode)) - goto out; - - inode->i_op = &aufs_dir_iop; - inode->i_fop = &aufs_dir_fop; - inode->i_mode = S_IFDIR; - set_nlink(inode, 2); - unlock_new_inode(inode); - - root = d_make_root(inode); - if (unlikely(!root)) - goto out; - err = PTR_ERR(root); - if (IS_ERR(root)) - goto out; - - err = au_di_init(root); - if (!err) { - sb->s_root = root; - return 0; /* success */ - } - dput(root); - -out: - return err; -} - -static int aufs_fill_super(struct super_block *sb, void *raw_data, - int silent __maybe_unused) -{ - int err; - struct au_opts opts; - struct dentry *root; - struct inode *inode; - char *arg = raw_data; - - if (unlikely(!arg || !*arg)) { - err = -EINVAL; - pr_err("no arg\n"); - goto out; - } - - err = -ENOMEM; - memset(&opts, 0, sizeof(opts)); - opts.opt = (void *)__get_free_page(GFP_NOFS); - if (unlikely(!opts.opt)) - goto out; - opts.max_opt = PAGE_SIZE / sizeof(*opts.opt); - opts.sb_flags = sb->s_flags; - - err = au_si_alloc(sb); - if (unlikely(err)) - goto out_opts; - - /* all timestamps always follow the ones on the branch */ - sb->s_flags |= MS_NOATIME | MS_NODIRATIME; - sb->s_op = &aufs_sop; - sb->s_d_op = &aufs_dop; - sb->s_magic = AUFS_SUPER_MAGIC; - sb->s_maxbytes = 0; - sb->s_stack_depth = 1; - au_export_init(sb); - /* au_xattr_init(sb); */ - - err = alloc_root(sb); - if (unlikely(err)) { - si_write_unlock(sb); - goto out_info; - } - root = sb->s_root; - inode = d_inode(root); - - /* - * actually we can parse options regardless aufs lock here. - * but at remount time, parsing must be done before aufs lock. - * so we follow the same rule. - */ - ii_write_lock_parent(inode); - aufs_write_unlock(root); - err = au_opts_parse(sb, arg, &opts); - if (unlikely(err)) - goto out_root; - - /* lock vfs_inode first, then aufs. */ - mutex_lock(&inode->i_mutex); - aufs_write_lock(root); - err = au_opts_mount(sb, &opts); - au_opts_free(&opts); - aufs_write_unlock(root); - mutex_unlock(&inode->i_mutex); - if (!err) - goto out_opts; /* success */ - -out_root: - dput(root); - sb->s_root = NULL; -out_info: - dbgaufs_si_fin(au_sbi(sb)); - kobject_put(&au_sbi(sb)->si_kobj); - sb->s_fs_info = NULL; -out_opts: - free_page((unsigned long)opts.opt); -out: - AuTraceErr(err); - err = cvt_err(err); - AuTraceErr(err); - return err; -} - -/* ---------------------------------------------------------------------- */ - -static struct dentry *aufs_mount(struct file_system_type *fs_type, int flags, - const char *dev_name __maybe_unused, - void *raw_data) -{ - struct dentry *root; - struct super_block *sb; - - /* all timestamps always follow the ones on the branch */ - /* mnt->mnt_flags |= MNT_NOATIME | MNT_NODIRATIME; */ - root = mount_nodev(fs_type, flags, raw_data, aufs_fill_super); - if (IS_ERR(root)) - goto out; - - sb = root->d_sb; - si_write_lock(sb, !AuLock_FLUSH); - sysaufs_brs_add(sb, 0); - si_write_unlock(sb); - au_sbilist_add(sb); - -out: - return root; -} - -static void aufs_kill_sb(struct super_block *sb) -{ - struct au_sbinfo *sbinfo; - - sbinfo = au_sbi(sb); - if (sbinfo) { - au_sbilist_del(sb); - aufs_write_lock(sb->s_root); - au_fhsm_fin(sb); - if (sbinfo->si_wbr_create_ops->fin) - sbinfo->si_wbr_create_ops->fin(sb); - if (au_opt_test(sbinfo->si_mntflags, UDBA_HNOTIFY)) { - au_opt_set_udba(sbinfo->si_mntflags, UDBA_NONE); - au_remount_refresh(sb); - } - if (au_opt_test(sbinfo->si_mntflags, PLINK)) - au_plink_put(sb, /*verbose*/1); - au_xino_clr(sb); - sbinfo->si_sb = NULL; - aufs_write_unlock(sb->s_root); - au_nwt_flush(&sbinfo->si_nowait); - } - kill_anon_super(sb); -} - -struct file_system_type aufs_fs_type = { - .name = AUFS_FSTYPE, - /* a race between rename and others */ - .fs_flags = FS_RENAME_DOES_D_MOVE, - .mount = aufs_mount, - .kill_sb = aufs_kill_sb, - /* no need to __module_get() and module_put(). */ - .owner = THIS_MODULE, -}; diff --git a/fs/aufs/super.h b/fs/aufs/super.h deleted file mode 100644 index 95ffbbb29..000000000 --- a/fs/aufs/super.h +++ /dev/null @@ -1,635 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * super_block operations - */ - -#ifndef __AUFS_SUPER_H__ -#define __AUFS_SUPER_H__ - -#ifdef __KERNEL__ - -#include <linux/fs.h> -#include <linux/kobject.h> -#include "rwsem.h" -#include "spl.h" -#include "wkq.h" - -/* policies to select one among multiple writable branches */ -struct au_wbr_copyup_operations { - int (*copyup)(struct dentry *dentry); -}; - -#define AuWbr_DIR 1 /* target is a dir */ -#define AuWbr_PARENT (1 << 1) /* always require a parent */ - -#define au_ftest_wbr(flags, name) ((flags) & AuWbr_##name) -#define au_fset_wbr(flags, name) { (flags) |= AuWbr_##name; } -#define au_fclr_wbr(flags, name) { (flags) &= ~AuWbr_##name; } - -struct au_wbr_create_operations { - int (*create)(struct dentry *dentry, unsigned int flags); - int (*init)(struct super_block *sb); - int (*fin)(struct super_block *sb); -}; - -struct au_wbr_mfs { - struct mutex mfs_lock; /* protect this structure */ - unsigned long mfs_jiffy; - unsigned long mfs_expire; - aufs_bindex_t mfs_bindex; - - unsigned long long mfsrr_bytes; - unsigned long long mfsrr_watermark; -}; - -struct pseudo_link { - union { - struct hlist_node hlist; - struct rcu_head rcu; - }; - struct inode *inode; -}; - -#define AuPlink_NHASH 100 -static inline int au_plink_hash(ino_t ino) -{ - return ino % AuPlink_NHASH; -} - -/* File-based Hierarchical Storage Management */ -struct au_fhsm { -#ifdef CONFIG_AUFS_FHSM - /* allow only one process who can receive the notification */ - spinlock_t fhsm_spin; - pid_t fhsm_pid; - wait_queue_head_t fhsm_wqh; - atomic_t fhsm_readable; - - /* these are protected by si_rwsem */ - unsigned long fhsm_expire; - aufs_bindex_t fhsm_bottom; -#endif -}; - -struct au_branch; -struct au_sbinfo { - /* nowait tasks in the system-wide workqueue */ - struct au_nowait_tasks si_nowait; - - /* - * tried sb->s_umount, but failed due to the dependecy between i_mutex. - * rwsem for au_sbinfo is necessary. - */ - struct au_rwsem si_rwsem; - - /* prevent recursive locking in deleting inode */ - struct { - unsigned long *bitmap; - spinlock_t tree_lock; - struct radix_tree_root tree; - } au_si_pid; - - /* - * dirty approach to protect sb->sb_inodes and ->s_files (gone) from - * remount. - */ - atomic_long_t si_ninodes, si_nfiles; - - /* branch management */ - unsigned int si_generation; - - /* see AuSi_ flags */ - unsigned char au_si_status; - - aufs_bindex_t si_bend; - - /* dirty trick to keep br_id plus */ - unsigned int si_last_br_id : - sizeof(aufs_bindex_t) * BITS_PER_BYTE - 1; - struct au_branch **si_branch; - - /* policy to select a writable branch */ - unsigned char si_wbr_copyup; - unsigned char si_wbr_create; - struct au_wbr_copyup_operations *si_wbr_copyup_ops; - struct au_wbr_create_operations *si_wbr_create_ops; - - /* round robin */ - atomic_t si_wbr_rr_next; - - /* most free space */ - struct au_wbr_mfs si_wbr_mfs; - - /* File-based Hierarchical Storage Management */ - struct au_fhsm si_fhsm; - - /* mount flags */ - /* include/asm-ia64/siginfo.h defines a macro named si_flags */ - unsigned int si_mntflags; - - /* external inode number (bitmap and translation table) */ - vfs_readf_t si_xread; - vfs_writef_t si_xwrite; - struct file *si_xib; - struct mutex si_xib_mtx; /* protect xib members */ - unsigned long *si_xib_buf; - unsigned long si_xib_last_pindex; - int si_xib_next_bit; - aufs_bindex_t si_xino_brid; - unsigned long si_xino_jiffy; - unsigned long si_xino_expire; - /* reserved for future use */ - /* unsigned long long si_xib_limit; */ /* Max xib file size */ - -#ifdef CONFIG_AUFS_EXPORT - /* i_generation */ - struct file *si_xigen; - atomic_t si_xigen_next; -#endif - - /* dirty trick to suppoer atomic_open */ - struct au_sphlhead si_aopen; - - /* vdir parameters */ - unsigned long si_rdcache; /* max cache time in jiffies */ - unsigned int si_rdblk; /* deblk size */ - unsigned int si_rdhash; /* hash size */ - - /* - * If the number of whiteouts are larger than si_dirwh, leave all of - * them after au_whtmp_ren to reduce the cost of rmdir(2). - * future fsck.aufs or kernel thread will remove them later. - * Otherwise, remove all whiteouts and the dir in rmdir(2). - */ - unsigned int si_dirwh; - - /* pseudo_link list */ - struct au_sphlhead si_plink[AuPlink_NHASH]; - wait_queue_head_t si_plink_wq; - spinlock_t si_plink_maint_lock; - pid_t si_plink_maint_pid; - - /* file list */ - struct au_sphlhead si_files; - - /* - * sysfs and lifetime management. - * this is not a small structure and it may be a waste of memory in case - * of sysfs is disabled, particulary when many aufs-es are mounted. - * but using sysfs is majority. - */ - struct kobject si_kobj; -#ifdef CONFIG_DEBUG_FS - struct dentry *si_dbgaufs; - struct dentry *si_dbgaufs_plink; - struct dentry *si_dbgaufs_xib; -#ifdef CONFIG_AUFS_EXPORT - struct dentry *si_dbgaufs_xigen; -#endif -#endif - -#ifdef CONFIG_AUFS_SBILIST - struct list_head si_list; -#endif - - /* dirty, necessary for unmounting, sysfs and sysrq */ - struct super_block *si_sb; -}; - -/* sbinfo status flags */ -/* - * set true when refresh_dirs() failed at remount time. - * then try refreshing dirs at access time again. - * if it is false, refreshing dirs at access time is unnecesary - */ -#define AuSi_FAILED_REFRESH_DIR 1 - -#define AuSi_FHSM (1 << 1) /* fhsm is active now */ - -#ifndef CONFIG_AUFS_FHSM -#undef AuSi_FHSM -#define AuSi_FHSM 0 -#endif - -static inline unsigned char au_do_ftest_si(struct au_sbinfo *sbi, - unsigned int flag) -{ - AuRwMustAnyLock(&sbi->si_rwsem); - return sbi->au_si_status & flag; -} -#define au_ftest_si(sbinfo, name) au_do_ftest_si(sbinfo, AuSi_##name) -#define au_fset_si(sbinfo, name) do { \ - AuRwMustWriteLock(&(sbinfo)->si_rwsem); \ - (sbinfo)->au_si_status |= AuSi_##name; \ -} while (0) -#define au_fclr_si(sbinfo, name) do { \ - AuRwMustWriteLock(&(sbinfo)->si_rwsem); \ - (sbinfo)->au_si_status &= ~AuSi_##name; \ -} while (0) - -/* ---------------------------------------------------------------------- */ - -/* policy to select one among writable branches */ -#define AuWbrCopyup(sbinfo, ...) \ - ((sbinfo)->si_wbr_copyup_ops->copyup(__VA_ARGS__)) -#define AuWbrCreate(sbinfo, ...) \ - ((sbinfo)->si_wbr_create_ops->create(__VA_ARGS__)) - -/* flags for si_read_lock()/aufs_read_lock()/di_read_lock() */ -#define AuLock_DW 1 /* write-lock dentry */ -#define AuLock_IR (1 << 1) /* read-lock inode */ -#define AuLock_IW (1 << 2) /* write-lock inode */ -#define AuLock_FLUSH (1 << 3) /* wait for 'nowait' tasks */ -#define AuLock_DIR (1 << 4) /* target is a dir */ -#define AuLock_NOPLM (1 << 5) /* return err in plm mode */ -#define AuLock_NOPLMW (1 << 6) /* wait for plm mode ends */ -#define AuLock_GEN (1 << 7) /* test digen/iigen */ -#define au_ftest_lock(flags, name) ((flags) & AuLock_##name) -#define au_fset_lock(flags, name) \ - do { (flags) |= AuLock_##name; } while (0) -#define au_fclr_lock(flags, name) \ - do { (flags) &= ~AuLock_##name; } while (0) - -/* ---------------------------------------------------------------------- */ - -/* super.c */ -extern struct file_system_type aufs_fs_type; -struct inode *au_iget_locked(struct super_block *sb, ino_t ino); -typedef unsigned long long (*au_arraycb_t)(void *array, unsigned long long max, - void *arg); -void au_array_free(void *array); -void *au_array_alloc(unsigned long long *hint, au_arraycb_t cb, void *arg); -struct inode **au_iarray_alloc(struct super_block *sb, unsigned long long *max); -void au_iarray_free(struct inode **a, unsigned long long max); - -/* sbinfo.c */ -void au_si_free(struct kobject *kobj); -int au_si_alloc(struct super_block *sb); -int au_sbr_realloc(struct au_sbinfo *sbinfo, int nbr); - -unsigned int au_sigen_inc(struct super_block *sb); -aufs_bindex_t au_new_br_id(struct super_block *sb); - -int si_read_lock(struct super_block *sb, int flags); -int si_write_lock(struct super_block *sb, int flags); -int aufs_read_lock(struct dentry *dentry, int flags); -void aufs_read_unlock(struct dentry *dentry, int flags); -void aufs_write_lock(struct dentry *dentry); -void aufs_write_unlock(struct dentry *dentry); -int aufs_read_and_write_lock2(struct dentry *d1, struct dentry *d2, int flags); -void aufs_read_and_write_unlock2(struct dentry *d1, struct dentry *d2); - -int si_pid_test_slow(struct super_block *sb); -void si_pid_set_slow(struct super_block *sb); -void si_pid_clr_slow(struct super_block *sb); - -/* wbr_policy.c */ -extern struct au_wbr_copyup_operations au_wbr_copyup_ops[]; -extern struct au_wbr_create_operations au_wbr_create_ops[]; -int au_cpdown_dirs(struct dentry *dentry, aufs_bindex_t bdst); -int au_wbr_nonopq(struct dentry *dentry, aufs_bindex_t bindex); -int au_wbr_do_copyup_bu(struct dentry *dentry, aufs_bindex_t bstart); - -/* mvdown.c */ -int au_mvdown(struct dentry *dentry, struct aufs_mvdown __user *arg); - -#ifdef CONFIG_AUFS_FHSM -/* fhsm.c */ - -static inline pid_t au_fhsm_pid(struct au_fhsm *fhsm) -{ - pid_t pid; - - spin_lock(&fhsm->fhsm_spin); - pid = fhsm->fhsm_pid; - spin_unlock(&fhsm->fhsm_spin); - - return pid; -} - -void au_fhsm_wrote(struct super_block *sb, aufs_bindex_t bindex, int force); -void au_fhsm_wrote_all(struct super_block *sb, int force); -int au_fhsm_fd(struct super_block *sb, int oflags); -int au_fhsm_br_alloc(struct au_branch *br); -void au_fhsm_set_bottom(struct super_block *sb, aufs_bindex_t bindex); -void au_fhsm_fin(struct super_block *sb); -void au_fhsm_init(struct au_sbinfo *sbinfo); -void au_fhsm_set(struct au_sbinfo *sbinfo, unsigned int sec); -void au_fhsm_show(struct seq_file *seq, struct au_sbinfo *sbinfo); -#else -AuStubVoid(au_fhsm_wrote, struct super_block *sb, aufs_bindex_t bindex, - int force) -AuStubVoid(au_fhsm_wrote_all, struct super_block *sb, int force) -AuStub(int, au_fhsm_fd, return -EOPNOTSUPP, struct super_block *sb, int oflags) -AuStub(pid_t, au_fhsm_pid, return 0, struct au_fhsm *fhsm) -AuStubInt0(au_fhsm_br_alloc, struct au_branch *br) -AuStubVoid(au_fhsm_set_bottom, struct super_block *sb, aufs_bindex_t bindex) -AuStubVoid(au_fhsm_fin, struct super_block *sb) -AuStubVoid(au_fhsm_init, struct au_sbinfo *sbinfo) -AuStubVoid(au_fhsm_set, struct au_sbinfo *sbinfo, unsigned int sec) -AuStubVoid(au_fhsm_show, struct seq_file *seq, struct au_sbinfo *sbinfo) -#endif - -/* ---------------------------------------------------------------------- */ - -static inline struct au_sbinfo *au_sbi(struct super_block *sb) -{ - return sb->s_fs_info; -} - -/* ---------------------------------------------------------------------- */ - -#ifdef CONFIG_AUFS_EXPORT -int au_test_nfsd(void); -void au_export_init(struct super_block *sb); -void au_xigen_inc(struct inode *inode); -int au_xigen_new(struct inode *inode); -int au_xigen_set(struct super_block *sb, struct file *base); -void au_xigen_clr(struct super_block *sb); - -static inline int au_busy_or_stale(void) -{ - if (!au_test_nfsd()) - return -EBUSY; - return -ESTALE; -} -#else -AuStubInt0(au_test_nfsd, void) -AuStubVoid(au_export_init, struct super_block *sb) -AuStubVoid(au_xigen_inc, struct inode *inode) -AuStubInt0(au_xigen_new, struct inode *inode) -AuStubInt0(au_xigen_set, struct super_block *sb, struct file *base) -AuStubVoid(au_xigen_clr, struct super_block *sb) -AuStub(int, au_busy_or_stale, return -EBUSY, void) -#endif /* CONFIG_AUFS_EXPORT */ - -/* ---------------------------------------------------------------------- */ - -#ifdef CONFIG_AUFS_SBILIST -/* module.c */ -extern struct au_splhead au_sbilist; - -static inline void au_sbilist_init(void) -{ - au_spl_init(&au_sbilist); -} - -static inline void au_sbilist_add(struct super_block *sb) -{ - au_spl_add(&au_sbi(sb)->si_list, &au_sbilist); -} - -static inline void au_sbilist_del(struct super_block *sb) -{ - au_spl_del(&au_sbi(sb)->si_list, &au_sbilist); -} - -#ifdef CONFIG_AUFS_MAGIC_SYSRQ -static inline void au_sbilist_lock(void) -{ - spin_lock(&au_sbilist.spin); -} - -static inline void au_sbilist_unlock(void) -{ - spin_unlock(&au_sbilist.spin); -} -#define AuGFP_SBILIST GFP_ATOMIC -#else -AuStubVoid(au_sbilist_lock, void) -AuStubVoid(au_sbilist_unlock, void) -#define AuGFP_SBILIST GFP_NOFS -#endif /* CONFIG_AUFS_MAGIC_SYSRQ */ -#else -AuStubVoid(au_sbilist_init, void) -AuStubVoid(au_sbilist_add, struct super_block *sb) -AuStubVoid(au_sbilist_del, struct super_block *sb) -AuStubVoid(au_sbilist_lock, void) -AuStubVoid(au_sbilist_unlock, void) -#define AuGFP_SBILIST GFP_NOFS -#endif - -/* ---------------------------------------------------------------------- */ - -static inline void dbgaufs_si_null(struct au_sbinfo *sbinfo) -{ - /* - * This function is a dynamic '__init' function actually, - * so the tiny check for si_rwsem is unnecessary. - */ - /* AuRwMustWriteLock(&sbinfo->si_rwsem); */ -#ifdef CONFIG_DEBUG_FS - sbinfo->si_dbgaufs = NULL; - sbinfo->si_dbgaufs_plink = NULL; - sbinfo->si_dbgaufs_xib = NULL; -#ifdef CONFIG_AUFS_EXPORT - sbinfo->si_dbgaufs_xigen = NULL; -#endif -#endif -} - -/* ---------------------------------------------------------------------- */ - -static inline pid_t si_pid_bit(void) -{ - /* the origin of pid is 1, but the bitmap's is 0 */ - return current->pid - 1; -} - -static inline int si_pid_test(struct super_block *sb) -{ - pid_t bit; - - bit = si_pid_bit(); - if (bit < PID_MAX_DEFAULT) - return test_bit(bit, au_sbi(sb)->au_si_pid.bitmap); - return si_pid_test_slow(sb); -} - -static inline void si_pid_set(struct super_block *sb) -{ - pid_t bit; - - bit = si_pid_bit(); - if (bit < PID_MAX_DEFAULT) { - AuDebugOn(test_bit(bit, au_sbi(sb)->au_si_pid.bitmap)); - set_bit(bit, au_sbi(sb)->au_si_pid.bitmap); - /* smp_mb(); */ - } else - si_pid_set_slow(sb); -} - -static inline void si_pid_clr(struct super_block *sb) -{ - pid_t bit; - - bit = si_pid_bit(); - if (bit < PID_MAX_DEFAULT) { - AuDebugOn(!test_bit(bit, au_sbi(sb)->au_si_pid.bitmap)); - clear_bit(bit, au_sbi(sb)->au_si_pid.bitmap); - /* smp_mb(); */ - } else - si_pid_clr_slow(sb); -} - -/* ---------------------------------------------------------------------- */ - -/* lock superblock. mainly for entry point functions */ -/* - * __si_read_lock, __si_write_lock, - * __si_read_unlock, __si_write_unlock, __si_downgrade_lock - */ -AuSimpleRwsemFuncs(__si, struct super_block *sb, &au_sbi(sb)->si_rwsem); - -#define SiMustNoWaiters(sb) AuRwMustNoWaiters(&au_sbi(sb)->si_rwsem) -#define SiMustAnyLock(sb) AuRwMustAnyLock(&au_sbi(sb)->si_rwsem) -#define SiMustWriteLock(sb) AuRwMustWriteLock(&au_sbi(sb)->si_rwsem) - -static inline void si_noflush_read_lock(struct super_block *sb) -{ - __si_read_lock(sb); - si_pid_set(sb); -} - -static inline int si_noflush_read_trylock(struct super_block *sb) -{ - int locked; - - locked = __si_read_trylock(sb); - if (locked) - si_pid_set(sb); - return locked; -} - -static inline void si_noflush_write_lock(struct super_block *sb) -{ - __si_write_lock(sb); - si_pid_set(sb); -} - -static inline int si_noflush_write_trylock(struct super_block *sb) -{ - int locked; - - locked = __si_write_trylock(sb); - if (locked) - si_pid_set(sb); - return locked; -} - -#if 0 /* reserved */ -static inline int si_read_trylock(struct super_block *sb, int flags) -{ - if (au_ftest_lock(flags, FLUSH)) - au_nwt_flush(&au_sbi(sb)->si_nowait); - return si_noflush_read_trylock(sb); -} -#endif - -static inline void si_read_unlock(struct super_block *sb) -{ - si_pid_clr(sb); - __si_read_unlock(sb); -} - -#if 0 /* reserved */ -static inline int si_write_trylock(struct super_block *sb, int flags) -{ - if (au_ftest_lock(flags, FLUSH)) - au_nwt_flush(&au_sbi(sb)->si_nowait); - return si_noflush_write_trylock(sb); -} -#endif - -static inline void si_write_unlock(struct super_block *sb) -{ - si_pid_clr(sb); - __si_write_unlock(sb); -} - -#if 0 /* reserved */ -static inline void si_downgrade_lock(struct super_block *sb) -{ - __si_downgrade_lock(sb); -} -#endif - -/* ---------------------------------------------------------------------- */ - -static inline aufs_bindex_t au_sbend(struct super_block *sb) -{ - SiMustAnyLock(sb); - return au_sbi(sb)->si_bend; -} - -static inline unsigned int au_mntflags(struct super_block *sb) -{ - SiMustAnyLock(sb); - return au_sbi(sb)->si_mntflags; -} - -static inline unsigned int au_sigen(struct super_block *sb) -{ - SiMustAnyLock(sb); - return au_sbi(sb)->si_generation; -} - -static inline void au_ninodes_inc(struct super_block *sb) -{ - atomic_long_inc(&au_sbi(sb)->si_ninodes); -} - -static inline void au_ninodes_dec(struct super_block *sb) -{ - AuDebugOn(!atomic_long_read(&au_sbi(sb)->si_ninodes)); - atomic_long_dec(&au_sbi(sb)->si_ninodes); -} - -static inline void au_nfiles_inc(struct super_block *sb) -{ - atomic_long_inc(&au_sbi(sb)->si_nfiles); -} - -static inline void au_nfiles_dec(struct super_block *sb) -{ - AuDebugOn(!atomic_long_read(&au_sbi(sb)->si_nfiles)); - atomic_long_dec(&au_sbi(sb)->si_nfiles); -} - -static inline struct au_branch *au_sbr(struct super_block *sb, - aufs_bindex_t bindex) -{ - SiMustAnyLock(sb); - return au_sbi(sb)->si_branch[0 + bindex]; -} - -static inline void au_xino_brid_set(struct super_block *sb, aufs_bindex_t brid) -{ - SiMustWriteLock(sb); - au_sbi(sb)->si_xino_brid = brid; -} - -static inline aufs_bindex_t au_xino_brid(struct super_block *sb) -{ - SiMustAnyLock(sb); - return au_sbi(sb)->si_xino_brid; -} - -#endif /* __KERNEL__ */ -#endif /* __AUFS_SUPER_H__ */ diff --git a/fs/aufs/sysaufs.c b/fs/aufs/sysaufs.c deleted file mode 100644 index 42dbc2825..000000000 --- a/fs/aufs/sysaufs.c +++ /dev/null @@ -1,104 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * sysfs interface and lifetime management - * they are necessary regardless sysfs is disabled. - */ - -#include <linux/random.h> -#include "aufs.h" - -unsigned long sysaufs_si_mask; -struct kset *sysaufs_kset; - -#define AuSiAttr(_name) { \ - .attr = { .name = __stringify(_name), .mode = 0444 }, \ - .show = sysaufs_si_##_name, \ -} - -static struct sysaufs_si_attr sysaufs_si_attr_xi_path = AuSiAttr(xi_path); -struct attribute *sysaufs_si_attrs[] = { - &sysaufs_si_attr_xi_path.attr, - NULL, -}; - -static const struct sysfs_ops au_sbi_ops = { - .show = sysaufs_si_show -}; - -static struct kobj_type au_sbi_ktype = { - .release = au_si_free, - .sysfs_ops = &au_sbi_ops, - .default_attrs = sysaufs_si_attrs -}; - -/* ---------------------------------------------------------------------- */ - -int sysaufs_si_init(struct au_sbinfo *sbinfo) -{ - int err; - - sbinfo->si_kobj.kset = sysaufs_kset; - /* cf. sysaufs_name() */ - err = kobject_init_and_add - (&sbinfo->si_kobj, &au_sbi_ktype, /*&sysaufs_kset->kobj*/NULL, - SysaufsSiNamePrefix "%lx", sysaufs_si_id(sbinfo)); - - dbgaufs_si_null(sbinfo); - if (!err) { - err = dbgaufs_si_init(sbinfo); - if (unlikely(err)) - kobject_put(&sbinfo->si_kobj); - } - return err; -} - -void sysaufs_fin(void) -{ - dbgaufs_fin(); - sysfs_remove_group(&sysaufs_kset->kobj, sysaufs_attr_group); - kset_unregister(sysaufs_kset); -} - -int __init sysaufs_init(void) -{ - int err; - - do { - get_random_bytes(&sysaufs_si_mask, sizeof(sysaufs_si_mask)); - } while (!sysaufs_si_mask); - - err = -EINVAL; - sysaufs_kset = kset_create_and_add(AUFS_NAME, NULL, fs_kobj); - if (unlikely(!sysaufs_kset)) - goto out; - err = PTR_ERR(sysaufs_kset); - if (IS_ERR(sysaufs_kset)) - goto out; - err = sysfs_create_group(&sysaufs_kset->kobj, sysaufs_attr_group); - if (unlikely(err)) { - kset_unregister(sysaufs_kset); - goto out; - } - - err = dbgaufs_init(); - if (unlikely(err)) - sysaufs_fin(); -out: - return err; -} diff --git a/fs/aufs/sysaufs.h b/fs/aufs/sysaufs.h deleted file mode 100644 index 8e0fd02aa..000000000 --- a/fs/aufs/sysaufs.h +++ /dev/null @@ -1,101 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * sysfs interface and mount lifetime management - */ - -#ifndef __SYSAUFS_H__ -#define __SYSAUFS_H__ - -#ifdef __KERNEL__ - -#include <linux/sysfs.h> -#include "module.h" - -struct super_block; -struct au_sbinfo; - -struct sysaufs_si_attr { - struct attribute attr; - int (*show)(struct seq_file *seq, struct super_block *sb); -}; - -/* ---------------------------------------------------------------------- */ - -/* sysaufs.c */ -extern unsigned long sysaufs_si_mask; -extern struct kset *sysaufs_kset; -extern struct attribute *sysaufs_si_attrs[]; -int sysaufs_si_init(struct au_sbinfo *sbinfo); -int __init sysaufs_init(void); -void sysaufs_fin(void); - -/* ---------------------------------------------------------------------- */ - -/* some people doesn't like to show a pointer in kernel */ -static inline unsigned long sysaufs_si_id(struct au_sbinfo *sbinfo) -{ - return sysaufs_si_mask ^ (unsigned long)sbinfo; -} - -#define SysaufsSiNamePrefix "si_" -#define SysaufsSiNameLen (sizeof(SysaufsSiNamePrefix) + 16) -static inline void sysaufs_name(struct au_sbinfo *sbinfo, char *name) -{ - snprintf(name, SysaufsSiNameLen, SysaufsSiNamePrefix "%lx", - sysaufs_si_id(sbinfo)); -} - -struct au_branch; -#ifdef CONFIG_SYSFS -/* sysfs.c */ -extern struct attribute_group *sysaufs_attr_group; - -int sysaufs_si_xi_path(struct seq_file *seq, struct super_block *sb); -ssize_t sysaufs_si_show(struct kobject *kobj, struct attribute *attr, - char *buf); -long au_brinfo_ioctl(struct file *file, unsigned long arg); -#ifdef CONFIG_COMPAT -long au_brinfo_compat_ioctl(struct file *file, unsigned long arg); -#endif - -void sysaufs_br_init(struct au_branch *br); -void sysaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex); -void sysaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex); - -#define sysaufs_brs_init() do {} while (0) - -#else -#define sysaufs_attr_group NULL - -AuStubInt0(sysaufs_si_xi_path, struct seq_file *seq, struct super_block *sb) -AuStub(ssize_t, sysaufs_si_show, return 0, struct kobject *kobj, - struct attribute *attr, char *buf) -AuStubVoid(sysaufs_br_init, struct au_branch *br) -AuStubVoid(sysaufs_brs_add, struct super_block *sb, aufs_bindex_t bindex) -AuStubVoid(sysaufs_brs_del, struct super_block *sb, aufs_bindex_t bindex) - -static inline void sysaufs_brs_init(void) -{ - sysaufs_brs = 0; -} - -#endif /* CONFIG_SYSFS */ - -#endif /* __KERNEL__ */ -#endif /* __SYSAUFS_H__ */ diff --git a/fs/aufs/sysfs.c b/fs/aufs/sysfs.c deleted file mode 100644 index 549f71d1a..000000000 --- a/fs/aufs/sysfs.c +++ /dev/null @@ -1,372 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * sysfs interface - */ - -#include <linux/compat.h> -#include <linux/seq_file.h> -#include "aufs.h" - -#ifdef CONFIG_AUFS_FS_MODULE -/* this entry violates the "one line per file" policy of sysfs */ -static ssize_t config_show(struct kobject *kobj, struct kobj_attribute *attr, - char *buf) -{ - ssize_t err; - static char *conf = -/* this file is generated at compiling */ -#include "conf.str" - ; - - err = snprintf(buf, PAGE_SIZE, conf); - if (unlikely(err >= PAGE_SIZE)) - err = -EFBIG; - return err; -} - -static struct kobj_attribute au_config_attr = __ATTR_RO(config); -#endif - -static struct attribute *au_attr[] = { -#ifdef CONFIG_AUFS_FS_MODULE - &au_config_attr.attr, -#endif - NULL, /* need to NULL terminate the list of attributes */ -}; - -static struct attribute_group sysaufs_attr_group_body = { - .attrs = au_attr -}; - -struct attribute_group *sysaufs_attr_group = &sysaufs_attr_group_body; - -/* ---------------------------------------------------------------------- */ - -int sysaufs_si_xi_path(struct seq_file *seq, struct super_block *sb) -{ - int err; - - SiMustAnyLock(sb); - - err = 0; - if (au_opt_test(au_mntflags(sb), XINO)) { - err = au_xino_path(seq, au_sbi(sb)->si_xib); - seq_putc(seq, '\n'); - } - return err; -} - -/* - * the lifetime of branch is independent from the entry under sysfs. - * sysfs handles the lifetime of the entry, and never call ->show() after it is - * unlinked. - */ -static int sysaufs_si_br(struct seq_file *seq, struct super_block *sb, - aufs_bindex_t bindex, int idx) -{ - int err; - struct path path; - struct dentry *root; - struct au_branch *br; - au_br_perm_str_t perm; - - AuDbg("b%d\n", bindex); - - err = 0; - root = sb->s_root; - di_read_lock_parent(root, !AuLock_IR); - br = au_sbr(sb, bindex); - - switch (idx) { - case AuBrSysfs_BR: - path.mnt = au_br_mnt(br); - path.dentry = au_h_dptr(root, bindex); - au_seq_path(seq, &path); - au_optstr_br_perm(&perm, br->br_perm); - err = seq_printf(seq, "=%s\n", perm.a); - break; - case AuBrSysfs_BRID: - err = seq_printf(seq, "%d\n", br->br_id); - break; - } - di_read_unlock(root, !AuLock_IR); - if (err == -1) - err = -E2BIG; - - return err; -} - -/* ---------------------------------------------------------------------- */ - -static struct seq_file *au_seq(char *p, ssize_t len) -{ - struct seq_file *seq; - - seq = kzalloc(sizeof(*seq), GFP_NOFS); - if (seq) { - /* mutex_init(&seq.lock); */ - seq->buf = p; - seq->size = len; - return seq; /* success */ - } - - seq = ERR_PTR(-ENOMEM); - return seq; -} - -#define SysaufsBr_PREFIX "br" -#define SysaufsBrid_PREFIX "brid" - -/* todo: file size may exceed PAGE_SIZE */ -ssize_t sysaufs_si_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - ssize_t err; - int idx; - long l; - aufs_bindex_t bend; - struct au_sbinfo *sbinfo; - struct super_block *sb; - struct seq_file *seq; - char *name; - struct attribute **cattr; - - sbinfo = container_of(kobj, struct au_sbinfo, si_kobj); - sb = sbinfo->si_sb; - - /* - * prevent a race condition between sysfs and aufs. - * for instance, sysfs_file_read() calls sysfs_get_active_two() which - * prohibits maintaining the sysfs entries. - * hew we acquire read lock after sysfs_get_active_two(). - * on the other hand, the remount process may maintain the sysfs/aufs - * entries after acquiring write lock. - * it can cause a deadlock. - * simply we gave up processing read here. - */ - err = -EBUSY; - if (unlikely(!si_noflush_read_trylock(sb))) - goto out; - - seq = au_seq(buf, PAGE_SIZE); - err = PTR_ERR(seq); - if (IS_ERR(seq)) - goto out_unlock; - - name = (void *)attr->name; - cattr = sysaufs_si_attrs; - while (*cattr) { - if (!strcmp(name, (*cattr)->name)) { - err = container_of(*cattr, struct sysaufs_si_attr, attr) - ->show(seq, sb); - goto out_seq; - } - cattr++; - } - - if (!strncmp(name, SysaufsBrid_PREFIX, - sizeof(SysaufsBrid_PREFIX) - 1)) { - idx = AuBrSysfs_BRID; - name += sizeof(SysaufsBrid_PREFIX) - 1; - } else if (!strncmp(name, SysaufsBr_PREFIX, - sizeof(SysaufsBr_PREFIX) - 1)) { - idx = AuBrSysfs_BR; - name += sizeof(SysaufsBr_PREFIX) - 1; - } else - BUG(); - - err = kstrtol(name, 10, &l); - if (!err) { - bend = au_sbend(sb); - if (l <= bend) - err = sysaufs_si_br(seq, sb, (aufs_bindex_t)l, idx); - else - err = -ENOENT; - } - -out_seq: - if (!err) { - err = seq->count; - /* sysfs limit */ - if (unlikely(err == PAGE_SIZE)) - err = -EFBIG; - } - kfree(seq); -out_unlock: - si_read_unlock(sb); -out: - return err; -} - -/* ---------------------------------------------------------------------- */ - -static int au_brinfo(struct super_block *sb, union aufs_brinfo __user *arg) -{ - int err; - int16_t brid; - aufs_bindex_t bindex, bend; - size_t sz; - char *buf; - struct seq_file *seq; - struct au_branch *br; - - si_read_lock(sb, AuLock_FLUSH); - bend = au_sbend(sb); - err = bend + 1; - if (!arg) - goto out; - - err = -ENOMEM; - buf = (void *)__get_free_page(GFP_NOFS); - if (unlikely(!buf)) - goto out; - - seq = au_seq(buf, PAGE_SIZE); - err = PTR_ERR(seq); - if (IS_ERR(seq)) - goto out_buf; - - sz = sizeof(*arg) - offsetof(union aufs_brinfo, path); - for (bindex = 0; bindex <= bend; bindex++, arg++) { - err = !access_ok(VERIFY_WRITE, arg, sizeof(*arg)); - if (unlikely(err)) - break; - - br = au_sbr(sb, bindex); - brid = br->br_id; - BUILD_BUG_ON(sizeof(brid) != sizeof(arg->id)); - err = __put_user(brid, &arg->id); - if (unlikely(err)) - break; - - BUILD_BUG_ON(sizeof(br->br_perm) != sizeof(arg->perm)); - err = __put_user(br->br_perm, &arg->perm); - if (unlikely(err)) - break; - - au_seq_path(seq, &br->br_path); - err = seq_putc(seq, '\0'); - if (!err && seq->count <= sz) { - err = copy_to_user(arg->path, seq->buf, seq->count); - seq->count = 0; - if (unlikely(err)) - break; - } else { - err = -E2BIG; - goto out_seq; - } - } - if (unlikely(err)) - err = -EFAULT; - -out_seq: - kfree(seq); -out_buf: - free_page((unsigned long)buf); -out: - si_read_unlock(sb); - return err; -} - -long au_brinfo_ioctl(struct file *file, unsigned long arg) -{ - return au_brinfo(file->f_path.dentry->d_sb, (void __user *)arg); -} - -#ifdef CONFIG_COMPAT -long au_brinfo_compat_ioctl(struct file *file, unsigned long arg) -{ - return au_brinfo(file->f_path.dentry->d_sb, compat_ptr(arg)); -} -#endif - -/* ---------------------------------------------------------------------- */ - -void sysaufs_br_init(struct au_branch *br) -{ - int i; - struct au_brsysfs *br_sysfs; - struct attribute *attr; - - br_sysfs = br->br_sysfs; - for (i = 0; i < ARRAY_SIZE(br->br_sysfs); i++) { - attr = &br_sysfs->attr; - sysfs_attr_init(attr); - attr->name = br_sysfs->name; - attr->mode = S_IRUGO; - br_sysfs++; - } -} - -void sysaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex) -{ - struct au_branch *br; - struct kobject *kobj; - struct au_brsysfs *br_sysfs; - int i; - aufs_bindex_t bend; - - dbgaufs_brs_del(sb, bindex); - - if (!sysaufs_brs) - return; - - kobj = &au_sbi(sb)->si_kobj; - bend = au_sbend(sb); - for (; bindex <= bend; bindex++) { - br = au_sbr(sb, bindex); - br_sysfs = br->br_sysfs; - for (i = 0; i < ARRAY_SIZE(br->br_sysfs); i++) { - sysfs_remove_file(kobj, &br_sysfs->attr); - br_sysfs++; - } - } -} - -void sysaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex) -{ - int err, i; - aufs_bindex_t bend; - struct kobject *kobj; - struct au_branch *br; - struct au_brsysfs *br_sysfs; - - dbgaufs_brs_add(sb, bindex); - - if (!sysaufs_brs) - return; - - kobj = &au_sbi(sb)->si_kobj; - bend = au_sbend(sb); - for (; bindex <= bend; bindex++) { - br = au_sbr(sb, bindex); - br_sysfs = br->br_sysfs; - snprintf(br_sysfs[AuBrSysfs_BR].name, sizeof(br_sysfs->name), - SysaufsBr_PREFIX "%d", bindex); - snprintf(br_sysfs[AuBrSysfs_BRID].name, sizeof(br_sysfs->name), - SysaufsBrid_PREFIX "%d", bindex); - for (i = 0; i < ARRAY_SIZE(br->br_sysfs); i++) { - err = sysfs_create_file(kobj, &br_sysfs->attr); - if (unlikely(err)) - pr_warn("failed %s under sysfs(%d)\n", - br_sysfs->name, err); - br_sysfs++; - } - } -} diff --git a/fs/aufs/sysrq.c b/fs/aufs/sysrq.c deleted file mode 100644 index a3e95bce6..000000000 --- a/fs/aufs/sysrq.c +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * magic sysrq hanlder - */ - -/* #include <linux/sysrq.h> */ -#include <linux/writeback.h> -#include "aufs.h" - -/* ---------------------------------------------------------------------- */ - -static void sysrq_sb(struct super_block *sb) -{ - char *plevel; - struct au_sbinfo *sbinfo; - struct file *file; - struct au_sphlhead *files; - struct au_finfo *finfo; - - plevel = au_plevel; - au_plevel = KERN_WARNING; - - /* since we define pr_fmt, call printk directly */ -#define pr(str) printk(KERN_WARNING AUFS_NAME ": " str) - - sbinfo = au_sbi(sb); - printk(KERN_WARNING "si=%lx\n", sysaufs_si_id(sbinfo)); - pr("superblock\n"); - au_dpri_sb(sb); - -#if 0 - pr("root dentry\n"); - au_dpri_dentry(sb->s_root); - pr("root inode\n"); - au_dpri_inode(d_inode(sb->s_root)); -#endif - -#if 0 - do { - int err, i, j, ndentry; - struct au_dcsub_pages dpages; - struct au_dpage *dpage; - - err = au_dpages_init(&dpages, GFP_ATOMIC); - if (unlikely(err)) - break; - err = au_dcsub_pages(&dpages, sb->s_root, NULL, NULL); - if (!err) - for (i = 0; i < dpages.ndpage; i++) { - dpage = dpages.dpages + i; - ndentry = dpage->ndentry; - for (j = 0; j < ndentry; j++) - au_dpri_dentry(dpage->dentries[j]); - } - au_dpages_free(&dpages); - } while (0); -#endif - -#if 1 - { - struct inode *i; - - pr("isolated inode\n"); - spin_lock(&inode_sb_list_lock); - list_for_each_entry(i, &sb->s_inodes, i_sb_list) { - spin_lock(&i->i_lock); - if (1 || hlist_empty(&i->i_dentry)) - au_dpri_inode(i); - spin_unlock(&i->i_lock); - } - spin_unlock(&inode_sb_list_lock); - } -#endif - pr("files\n"); - files = &au_sbi(sb)->si_files; - spin_lock(&files->spin); - hlist_for_each_entry(finfo, &files->head, fi_hlist) { - umode_t mode; - - file = finfo->fi_file; - mode = file_inode(file)->i_mode; - if (!special_file(mode)) - au_dpri_file(file); - } - spin_unlock(&files->spin); - pr("done\n"); - -#undef pr - au_plevel = plevel; -} - -/* ---------------------------------------------------------------------- */ - -/* module parameter */ -static char *aufs_sysrq_key = "a"; -module_param_named(sysrq, aufs_sysrq_key, charp, S_IRUGO); -MODULE_PARM_DESC(sysrq, "MagicSysRq key for " AUFS_NAME); - -static void au_sysrq(int key __maybe_unused) -{ - struct au_sbinfo *sbinfo; - - lockdep_off(); - au_sbilist_lock(); - list_for_each_entry(sbinfo, &au_sbilist.head, si_list) - sysrq_sb(sbinfo->si_sb); - au_sbilist_unlock(); - lockdep_on(); -} - -static struct sysrq_key_op au_sysrq_op = { - .handler = au_sysrq, - .help_msg = "Aufs", - .action_msg = "Aufs", - .enable_mask = SYSRQ_ENABLE_DUMP -}; - -/* ---------------------------------------------------------------------- */ - -int __init au_sysrq_init(void) -{ - int err; - char key; - - err = -1; - key = *aufs_sysrq_key; - if ('a' <= key && key <= 'z') - err = register_sysrq_key(key, &au_sysrq_op); - if (unlikely(err)) - pr_err("err %d, sysrq=%c\n", err, key); - return err; -} - -void au_sysrq_fin(void) -{ - int err; - - err = unregister_sysrq_key(*aufs_sysrq_key, &au_sysrq_op); - if (unlikely(err)) - pr_err("err %d (ignored)\n", err); -} diff --git a/fs/aufs/vdir.c b/fs/aufs/vdir.c deleted file mode 100644 index 0929d393a..000000000 --- a/fs/aufs/vdir.c +++ /dev/null @@ -1,888 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * virtual or vertical directory - */ - -#include "aufs.h" - -static unsigned int calc_size(int nlen) -{ - return ALIGN(sizeof(struct au_vdir_de) + nlen, sizeof(ino_t)); -} - -static int set_deblk_end(union au_vdir_deblk_p *p, - union au_vdir_deblk_p *deblk_end) -{ - if (calc_size(0) <= deblk_end->deblk - p->deblk) { - p->de->de_str.len = 0; - /* smp_mb(); */ - return 0; - } - return -1; /* error */ -} - -/* returns true or false */ -static int is_deblk_end(union au_vdir_deblk_p *p, - union au_vdir_deblk_p *deblk_end) -{ - if (calc_size(0) <= deblk_end->deblk - p->deblk) - return !p->de->de_str.len; - return 1; -} - -static unsigned char *last_deblk(struct au_vdir *vdir) -{ - return vdir->vd_deblk[vdir->vd_nblk - 1]; -} - -/* ---------------------------------------------------------------------- */ - -/* estimate the apropriate size for name hash table */ -unsigned int au_rdhash_est(loff_t sz) -{ - unsigned int n; - - n = UINT_MAX; - sz >>= 10; - if (sz < n) - n = sz; - if (sz < AUFS_RDHASH_DEF) - n = AUFS_RDHASH_DEF; - /* pr_info("n %u\n", n); */ - return n; -} - -/* - * the allocated memory has to be freed by - * au_nhash_wh_free() or au_nhash_de_free(). - */ -int au_nhash_alloc(struct au_nhash *nhash, unsigned int num_hash, gfp_t gfp) -{ - struct hlist_head *head; - unsigned int u; - size_t sz; - - sz = sizeof(*nhash->nh_head) * num_hash; - head = kmalloc(sz, gfp); - if (head) { - nhash->nh_num = num_hash; - nhash->nh_head = head; - for (u = 0; u < num_hash; u++) - INIT_HLIST_HEAD(head++); - return 0; /* success */ - } - - return -ENOMEM; -} - -static void nhash_count(struct hlist_head *head) -{ -#if 0 - unsigned long n; - struct hlist_node *pos; - - n = 0; - hlist_for_each(pos, head) - n++; - pr_info("%lu\n", n); -#endif -} - -static void au_nhash_wh_do_free(struct hlist_head *head) -{ - struct au_vdir_wh *pos; - struct hlist_node *node; - - hlist_for_each_entry_safe(pos, node, head, wh_hash) - kfree(pos); -} - -static void au_nhash_de_do_free(struct hlist_head *head) -{ - struct au_vdir_dehstr *pos; - struct hlist_node *node; - - hlist_for_each_entry_safe(pos, node, head, hash) - au_cache_free_vdir_dehstr(pos); -} - -static void au_nhash_do_free(struct au_nhash *nhash, - void (*free)(struct hlist_head *head)) -{ - unsigned int n; - struct hlist_head *head; - - n = nhash->nh_num; - if (!n) - return; - - head = nhash->nh_head; - while (n-- > 0) { - nhash_count(head); - free(head++); - } - kfree(nhash->nh_head); -} - -void au_nhash_wh_free(struct au_nhash *whlist) -{ - au_nhash_do_free(whlist, au_nhash_wh_do_free); -} - -static void au_nhash_de_free(struct au_nhash *delist) -{ - au_nhash_do_free(delist, au_nhash_de_do_free); -} - -/* ---------------------------------------------------------------------- */ - -int au_nhash_test_longer_wh(struct au_nhash *whlist, aufs_bindex_t btgt, - int limit) -{ - int num; - unsigned int u, n; - struct hlist_head *head; - struct au_vdir_wh *pos; - - num = 0; - n = whlist->nh_num; - head = whlist->nh_head; - for (u = 0; u < n; u++, head++) - hlist_for_each_entry(pos, head, wh_hash) - if (pos->wh_bindex == btgt && ++num > limit) - return 1; - return 0; -} - -static struct hlist_head *au_name_hash(struct au_nhash *nhash, - unsigned char *name, - unsigned int len) -{ - unsigned int v; - /* const unsigned int magic_bit = 12; */ - - AuDebugOn(!nhash->nh_num || !nhash->nh_head); - - v = 0; - while (len--) - v += *name++; - /* v = hash_long(v, magic_bit); */ - v %= nhash->nh_num; - return nhash->nh_head + v; -} - -static int au_nhash_test_name(struct au_vdir_destr *str, const char *name, - int nlen) -{ - return str->len == nlen && !memcmp(str->name, name, nlen); -} - -/* returns found or not */ -int au_nhash_test_known_wh(struct au_nhash *whlist, char *name, int nlen) -{ - struct hlist_head *head; - struct au_vdir_wh *pos; - struct au_vdir_destr *str; - - head = au_name_hash(whlist, name, nlen); - hlist_for_each_entry(pos, head, wh_hash) { - str = &pos->wh_str; - AuDbg("%.*s\n", str->len, str->name); - if (au_nhash_test_name(str, name, nlen)) - return 1; - } - return 0; -} - -/* returns found(true) or not */ -static int test_known(struct au_nhash *delist, char *name, int nlen) -{ - struct hlist_head *head; - struct au_vdir_dehstr *pos; - struct au_vdir_destr *str; - - head = au_name_hash(delist, name, nlen); - hlist_for_each_entry(pos, head, hash) { - str = pos->str; - AuDbg("%.*s\n", str->len, str->name); - if (au_nhash_test_name(str, name, nlen)) - return 1; - } - return 0; -} - -static void au_shwh_init_wh(struct au_vdir_wh *wh, ino_t ino, - unsigned char d_type) -{ -#ifdef CONFIG_AUFS_SHWH - wh->wh_ino = ino; - wh->wh_type = d_type; -#endif -} - -/* ---------------------------------------------------------------------- */ - -int au_nhash_append_wh(struct au_nhash *whlist, char *name, int nlen, ino_t ino, - unsigned int d_type, aufs_bindex_t bindex, - unsigned char shwh) -{ - int err; - struct au_vdir_destr *str; - struct au_vdir_wh *wh; - - AuDbg("%.*s\n", nlen, name); - AuDebugOn(!whlist->nh_num || !whlist->nh_head); - - err = -ENOMEM; - wh = kmalloc(sizeof(*wh) + nlen, GFP_NOFS); - if (unlikely(!wh)) - goto out; - - err = 0; - wh->wh_bindex = bindex; - if (shwh) - au_shwh_init_wh(wh, ino, d_type); - str = &wh->wh_str; - str->len = nlen; - memcpy(str->name, name, nlen); - hlist_add_head(&wh->wh_hash, au_name_hash(whlist, name, nlen)); - /* smp_mb(); */ - -out: - return err; -} - -static int append_deblk(struct au_vdir *vdir) -{ - int err; - unsigned long ul; - const unsigned int deblk_sz = vdir->vd_deblk_sz; - union au_vdir_deblk_p p, deblk_end; - unsigned char **o; - - err = -ENOMEM; - o = krealloc(vdir->vd_deblk, sizeof(*o) * (vdir->vd_nblk + 1), - GFP_NOFS); - if (unlikely(!o)) - goto out; - - vdir->vd_deblk = o; - p.deblk = kmalloc(deblk_sz, GFP_NOFS); - if (p.deblk) { - ul = vdir->vd_nblk++; - vdir->vd_deblk[ul] = p.deblk; - vdir->vd_last.ul = ul; - vdir->vd_last.p.deblk = p.deblk; - deblk_end.deblk = p.deblk + deblk_sz; - err = set_deblk_end(&p, &deblk_end); - } - -out: - return err; -} - -static int append_de(struct au_vdir *vdir, char *name, int nlen, ino_t ino, - unsigned int d_type, struct au_nhash *delist) -{ - int err; - unsigned int sz; - const unsigned int deblk_sz = vdir->vd_deblk_sz; - union au_vdir_deblk_p p, *room, deblk_end; - struct au_vdir_dehstr *dehstr; - - p.deblk = last_deblk(vdir); - deblk_end.deblk = p.deblk + deblk_sz; - room = &vdir->vd_last.p; - AuDebugOn(room->deblk < p.deblk || deblk_end.deblk <= room->deblk - || !is_deblk_end(room, &deblk_end)); - - sz = calc_size(nlen); - if (unlikely(sz > deblk_end.deblk - room->deblk)) { - err = append_deblk(vdir); - if (unlikely(err)) - goto out; - - p.deblk = last_deblk(vdir); - deblk_end.deblk = p.deblk + deblk_sz; - /* smp_mb(); */ - AuDebugOn(room->deblk != p.deblk); - } - - err = -ENOMEM; - dehstr = au_cache_alloc_vdir_dehstr(); - if (unlikely(!dehstr)) - goto out; - - dehstr->str = &room->de->de_str; - hlist_add_head(&dehstr->hash, au_name_hash(delist, name, nlen)); - room->de->de_ino = ino; - room->de->de_type = d_type; - room->de->de_str.len = nlen; - memcpy(room->de->de_str.name, name, nlen); - - err = 0; - room->deblk += sz; - if (unlikely(set_deblk_end(room, &deblk_end))) - err = append_deblk(vdir); - /* smp_mb(); */ - -out: - return err; -} - -/* ---------------------------------------------------------------------- */ - -void au_vdir_free(struct au_vdir *vdir) -{ - unsigned char **deblk; - - deblk = vdir->vd_deblk; - while (vdir->vd_nblk--) - kfree(*deblk++); - kfree(vdir->vd_deblk); - au_cache_free_vdir(vdir); -} - -static struct au_vdir *alloc_vdir(struct file *file) -{ - struct au_vdir *vdir; - struct super_block *sb; - int err; - - sb = file->f_path.dentry->d_sb; - SiMustAnyLock(sb); - - err = -ENOMEM; - vdir = au_cache_alloc_vdir(); - if (unlikely(!vdir)) - goto out; - - vdir->vd_deblk = kzalloc(sizeof(*vdir->vd_deblk), GFP_NOFS); - if (unlikely(!vdir->vd_deblk)) - goto out_free; - - vdir->vd_deblk_sz = au_sbi(sb)->si_rdblk; - if (!vdir->vd_deblk_sz) { - /* estimate the apropriate size for deblk */ - vdir->vd_deblk_sz = au_dir_size(file, /*dentry*/NULL); - /* pr_info("vd_deblk_sz %u\n", vdir->vd_deblk_sz); */ - } - vdir->vd_nblk = 0; - vdir->vd_version = 0; - vdir->vd_jiffy = 0; - err = append_deblk(vdir); - if (!err) - return vdir; /* success */ - - kfree(vdir->vd_deblk); - -out_free: - au_cache_free_vdir(vdir); -out: - vdir = ERR_PTR(err); - return vdir; -} - -static int reinit_vdir(struct au_vdir *vdir) -{ - int err; - union au_vdir_deblk_p p, deblk_end; - - while (vdir->vd_nblk > 1) { - kfree(vdir->vd_deblk[vdir->vd_nblk - 1]); - /* vdir->vd_deblk[vdir->vd_nblk - 1] = NULL; */ - vdir->vd_nblk--; - } - p.deblk = vdir->vd_deblk[0]; - deblk_end.deblk = p.deblk + vdir->vd_deblk_sz; - err = set_deblk_end(&p, &deblk_end); - /* keep vd_dblk_sz */ - vdir->vd_last.ul = 0; - vdir->vd_last.p.deblk = vdir->vd_deblk[0]; - vdir->vd_version = 0; - vdir->vd_jiffy = 0; - /* smp_mb(); */ - return err; -} - -/* ---------------------------------------------------------------------- */ - -#define AuFillVdir_CALLED 1 -#define AuFillVdir_WHABLE (1 << 1) -#define AuFillVdir_SHWH (1 << 2) -#define au_ftest_fillvdir(flags, name) ((flags) & AuFillVdir_##name) -#define au_fset_fillvdir(flags, name) \ - do { (flags) |= AuFillVdir_##name; } while (0) -#define au_fclr_fillvdir(flags, name) \ - do { (flags) &= ~AuFillVdir_##name; } while (0) - -#ifndef CONFIG_AUFS_SHWH -#undef AuFillVdir_SHWH -#define AuFillVdir_SHWH 0 -#endif - -struct fillvdir_arg { - struct dir_context ctx; - struct file *file; - struct au_vdir *vdir; - struct au_nhash delist; - struct au_nhash whlist; - aufs_bindex_t bindex; - unsigned int flags; - int err; -}; - -static int fillvdir(struct dir_context *ctx, const char *__name, int nlen, - loff_t offset __maybe_unused, u64 h_ino, - unsigned int d_type) -{ - struct fillvdir_arg *arg = container_of(ctx, struct fillvdir_arg, ctx); - char *name = (void *)__name; - struct super_block *sb; - ino_t ino; - const unsigned char shwh = !!au_ftest_fillvdir(arg->flags, SHWH); - - arg->err = 0; - sb = arg->file->f_path.dentry->d_sb; - au_fset_fillvdir(arg->flags, CALLED); - /* smp_mb(); */ - if (nlen <= AUFS_WH_PFX_LEN - || memcmp(name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) { - if (test_known(&arg->delist, name, nlen) - || au_nhash_test_known_wh(&arg->whlist, name, nlen)) - goto out; /* already exists or whiteouted */ - - arg->err = au_ino(sb, arg->bindex, h_ino, d_type, &ino); - if (!arg->err) { - if (unlikely(nlen > AUFS_MAX_NAMELEN)) - d_type = DT_UNKNOWN; - arg->err = append_de(arg->vdir, name, nlen, ino, - d_type, &arg->delist); - } - } else if (au_ftest_fillvdir(arg->flags, WHABLE)) { - name += AUFS_WH_PFX_LEN; - nlen -= AUFS_WH_PFX_LEN; - if (au_nhash_test_known_wh(&arg->whlist, name, nlen)) - goto out; /* already whiteouted */ - - if (shwh) - arg->err = au_wh_ino(sb, arg->bindex, h_ino, d_type, - &ino); - if (!arg->err) { - if (nlen <= AUFS_MAX_NAMELEN + AUFS_WH_PFX_LEN) - d_type = DT_UNKNOWN; - arg->err = au_nhash_append_wh - (&arg->whlist, name, nlen, ino, d_type, - arg->bindex, shwh); - } - } - -out: - if (!arg->err) - arg->vdir->vd_jiffy = jiffies; - /* smp_mb(); */ - AuTraceErr(arg->err); - return arg->err; -} - -static int au_handle_shwh(struct super_block *sb, struct au_vdir *vdir, - struct au_nhash *whlist, struct au_nhash *delist) -{ -#ifdef CONFIG_AUFS_SHWH - int err; - unsigned int nh, u; - struct hlist_head *head; - struct au_vdir_wh *pos; - struct hlist_node *n; - char *p, *o; - struct au_vdir_destr *destr; - - AuDebugOn(!au_opt_test(au_mntflags(sb), SHWH)); - - err = -ENOMEM; - o = p = (void *)__get_free_page(GFP_NOFS); - if (unlikely(!p)) - goto out; - - err = 0; - nh = whlist->nh_num; - memcpy(p, AUFS_WH_PFX, AUFS_WH_PFX_LEN); - p += AUFS_WH_PFX_LEN; - for (u = 0; u < nh; u++) { - head = whlist->nh_head + u; - hlist_for_each_entry_safe(pos, n, head, wh_hash) { - destr = &pos->wh_str; - memcpy(p, destr->name, destr->len); - err = append_de(vdir, o, destr->len + AUFS_WH_PFX_LEN, - pos->wh_ino, pos->wh_type, delist); - if (unlikely(err)) - break; - } - } - - free_page((unsigned long)o); - -out: - AuTraceErr(err); - return err; -#else - return 0; -#endif -} - -static int au_do_read_vdir(struct fillvdir_arg *arg) -{ - int err; - unsigned int rdhash; - loff_t offset; - aufs_bindex_t bend, bindex, bstart; - unsigned char shwh; - struct file *hf, *file; - struct super_block *sb; - - file = arg->file; - sb = file->f_path.dentry->d_sb; - SiMustAnyLock(sb); - - rdhash = au_sbi(sb)->si_rdhash; - if (!rdhash) - rdhash = au_rdhash_est(au_dir_size(file, /*dentry*/NULL)); - err = au_nhash_alloc(&arg->delist, rdhash, GFP_NOFS); - if (unlikely(err)) - goto out; - err = au_nhash_alloc(&arg->whlist, rdhash, GFP_NOFS); - if (unlikely(err)) - goto out_delist; - - err = 0; - arg->flags = 0; - shwh = 0; - if (au_opt_test(au_mntflags(sb), SHWH)) { - shwh = 1; - au_fset_fillvdir(arg->flags, SHWH); - } - bstart = au_fbstart(file); - bend = au_fbend_dir(file); - for (bindex = bstart; !err && bindex <= bend; bindex++) { - hf = au_hf_dir(file, bindex); - if (!hf) - continue; - - offset = vfsub_llseek(hf, 0, SEEK_SET); - err = offset; - if (unlikely(offset)) - break; - - arg->bindex = bindex; - au_fclr_fillvdir(arg->flags, WHABLE); - if (shwh - || (bindex != bend - && au_br_whable(au_sbr_perm(sb, bindex)))) - au_fset_fillvdir(arg->flags, WHABLE); - do { - arg->err = 0; - au_fclr_fillvdir(arg->flags, CALLED); - /* smp_mb(); */ - err = vfsub_iterate_dir(hf, &arg->ctx); - if (err >= 0) - err = arg->err; - } while (!err && au_ftest_fillvdir(arg->flags, CALLED)); - - /* - * dir_relax() may be good for concurrency, but aufs should not - * use it since it will cause a lockdep problem. - */ - } - - if (!err && shwh) - err = au_handle_shwh(sb, arg->vdir, &arg->whlist, &arg->delist); - - au_nhash_wh_free(&arg->whlist); - -out_delist: - au_nhash_de_free(&arg->delist); -out: - return err; -} - -static int read_vdir(struct file *file, int may_read) -{ - int err; - unsigned long expire; - unsigned char do_read; - struct fillvdir_arg arg = { - .ctx = { - .actor = fillvdir - } - }; - struct inode *inode; - struct au_vdir *vdir, *allocated; - - err = 0; - inode = file_inode(file); - IMustLock(inode); - SiMustAnyLock(inode->i_sb); - - allocated = NULL; - do_read = 0; - expire = au_sbi(inode->i_sb)->si_rdcache; - vdir = au_ivdir(inode); - if (!vdir) { - do_read = 1; - vdir = alloc_vdir(file); - err = PTR_ERR(vdir); - if (IS_ERR(vdir)) - goto out; - err = 0; - allocated = vdir; - } else if (may_read - && (inode->i_version != vdir->vd_version - || time_after(jiffies, vdir->vd_jiffy + expire))) { - do_read = 1; - err = reinit_vdir(vdir); - if (unlikely(err)) - goto out; - } - - if (!do_read) - return 0; /* success */ - - arg.file = file; - arg.vdir = vdir; - err = au_do_read_vdir(&arg); - if (!err) { - /* file->f_pos = 0; */ /* todo: ctx->pos? */ - vdir->vd_version = inode->i_version; - vdir->vd_last.ul = 0; - vdir->vd_last.p.deblk = vdir->vd_deblk[0]; - if (allocated) - au_set_ivdir(inode, allocated); - } else if (allocated) - au_vdir_free(allocated); - -out: - return err; -} - -static int copy_vdir(struct au_vdir *tgt, struct au_vdir *src) -{ - int err, rerr; - unsigned long ul, n; - const unsigned int deblk_sz = src->vd_deblk_sz; - - AuDebugOn(tgt->vd_nblk != 1); - - err = -ENOMEM; - if (tgt->vd_nblk < src->vd_nblk) { - unsigned char **p; - - p = krealloc(tgt->vd_deblk, sizeof(*p) * src->vd_nblk, - GFP_NOFS); - if (unlikely(!p)) - goto out; - tgt->vd_deblk = p; - } - - if (tgt->vd_deblk_sz != deblk_sz) { - unsigned char *p; - - tgt->vd_deblk_sz = deblk_sz; - p = krealloc(tgt->vd_deblk[0], deblk_sz, GFP_NOFS); - if (unlikely(!p)) - goto out; - tgt->vd_deblk[0] = p; - } - memcpy(tgt->vd_deblk[0], src->vd_deblk[0], deblk_sz); - tgt->vd_version = src->vd_version; - tgt->vd_jiffy = src->vd_jiffy; - - n = src->vd_nblk; - for (ul = 1; ul < n; ul++) { - tgt->vd_deblk[ul] = kmemdup(src->vd_deblk[ul], deblk_sz, - GFP_NOFS); - if (unlikely(!tgt->vd_deblk[ul])) - goto out; - tgt->vd_nblk++; - } - tgt->vd_nblk = n; - tgt->vd_last.ul = tgt->vd_last.ul; - tgt->vd_last.p.deblk = tgt->vd_deblk[tgt->vd_last.ul]; - tgt->vd_last.p.deblk += src->vd_last.p.deblk - - src->vd_deblk[src->vd_last.ul]; - /* smp_mb(); */ - return 0; /* success */ - -out: - rerr = reinit_vdir(tgt); - BUG_ON(rerr); - return err; -} - -int au_vdir_init(struct file *file) -{ - int err; - struct inode *inode; - struct au_vdir *vdir_cache, *allocated; - - /* test file->f_pos here instead of ctx->pos */ - err = read_vdir(file, !file->f_pos); - if (unlikely(err)) - goto out; - - allocated = NULL; - vdir_cache = au_fvdir_cache(file); - if (!vdir_cache) { - vdir_cache = alloc_vdir(file); - err = PTR_ERR(vdir_cache); - if (IS_ERR(vdir_cache)) - goto out; - allocated = vdir_cache; - } else if (!file->f_pos && vdir_cache->vd_version != file->f_version) { - /* test file->f_pos here instead of ctx->pos */ - err = reinit_vdir(vdir_cache); - if (unlikely(err)) - goto out; - } else - return 0; /* success */ - - inode = file_inode(file); - err = copy_vdir(vdir_cache, au_ivdir(inode)); - if (!err) { - file->f_version = inode->i_version; - if (allocated) - au_set_fvdir_cache(file, allocated); - } else if (allocated) - au_vdir_free(allocated); - -out: - return err; -} - -static loff_t calc_offset(struct au_vdir *vdir) -{ - loff_t offset; - union au_vdir_deblk_p p; - - p.deblk = vdir->vd_deblk[vdir->vd_last.ul]; - offset = vdir->vd_last.p.deblk - p.deblk; - offset += vdir->vd_deblk_sz * vdir->vd_last.ul; - return offset; -} - -/* returns true or false */ -static int seek_vdir(struct file *file, struct dir_context *ctx) -{ - int valid; - unsigned int deblk_sz; - unsigned long ul, n; - loff_t offset; - union au_vdir_deblk_p p, deblk_end; - struct au_vdir *vdir_cache; - - valid = 1; - vdir_cache = au_fvdir_cache(file); - offset = calc_offset(vdir_cache); - AuDbg("offset %lld\n", offset); - if (ctx->pos == offset) - goto out; - - vdir_cache->vd_last.ul = 0; - vdir_cache->vd_last.p.deblk = vdir_cache->vd_deblk[0]; - if (!ctx->pos) - goto out; - - valid = 0; - deblk_sz = vdir_cache->vd_deblk_sz; - ul = div64_u64(ctx->pos, deblk_sz); - AuDbg("ul %lu\n", ul); - if (ul >= vdir_cache->vd_nblk) - goto out; - - n = vdir_cache->vd_nblk; - for (; ul < n; ul++) { - p.deblk = vdir_cache->vd_deblk[ul]; - deblk_end.deblk = p.deblk + deblk_sz; - offset = ul; - offset *= deblk_sz; - while (!is_deblk_end(&p, &deblk_end) && offset < ctx->pos) { - unsigned int l; - - l = calc_size(p.de->de_str.len); - offset += l; - p.deblk += l; - } - if (!is_deblk_end(&p, &deblk_end)) { - valid = 1; - vdir_cache->vd_last.ul = ul; - vdir_cache->vd_last.p = p; - break; - } - } - -out: - /* smp_mb(); */ - AuTraceErr(!valid); - return valid; -} - -int au_vdir_fill_de(struct file *file, struct dir_context *ctx) -{ - unsigned int l, deblk_sz; - union au_vdir_deblk_p deblk_end; - struct au_vdir *vdir_cache; - struct au_vdir_de *de; - - vdir_cache = au_fvdir_cache(file); - if (!seek_vdir(file, ctx)) - return 0; - - deblk_sz = vdir_cache->vd_deblk_sz; - while (1) { - deblk_end.deblk = vdir_cache->vd_deblk[vdir_cache->vd_last.ul]; - deblk_end.deblk += deblk_sz; - while (!is_deblk_end(&vdir_cache->vd_last.p, &deblk_end)) { - de = vdir_cache->vd_last.p.de; - AuDbg("%.*s, off%lld, i%lu, dt%d\n", - de->de_str.len, de->de_str.name, ctx->pos, - (unsigned long)de->de_ino, de->de_type); - if (unlikely(!dir_emit(ctx, de->de_str.name, - de->de_str.len, de->de_ino, - de->de_type))) { - /* todo: ignore the error caused by udba? */ - /* return err; */ - return 0; - } - - l = calc_size(de->de_str.len); - vdir_cache->vd_last.p.deblk += l; - ctx->pos += l; - } - if (vdir_cache->vd_last.ul < vdir_cache->vd_nblk - 1) { - vdir_cache->vd_last.ul++; - vdir_cache->vd_last.p.deblk - = vdir_cache->vd_deblk[vdir_cache->vd_last.ul]; - ctx->pos = deblk_sz * vdir_cache->vd_last.ul; - continue; - } - break; - } - - /* smp_mb(); */ - return 0; -} diff --git a/fs/aufs/vfsub.c b/fs/aufs/vfsub.c deleted file mode 100644 index d2af7ce8d..000000000 --- a/fs/aufs/vfsub.c +++ /dev/null @@ -1,848 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * sub-routines for VFS - */ - -#include <linux/namei.h> -#include <linux/security.h> -#include <linux/splice.h> -#include "aufs.h" - -int vfsub_update_h_iattr(struct path *h_path, int *did) -{ - int err; - struct kstat st; - struct super_block *h_sb; - - /* for remote fs, leave work for its getattr or d_revalidate */ - /* for bad i_attr fs, handle them in aufs_getattr() */ - /* still some fs may acquire i_mutex. we need to skip them */ - err = 0; - if (!did) - did = &err; - h_sb = h_path->dentry->d_sb; - *did = (!au_test_fs_remote(h_sb) && au_test_fs_refresh_iattr(h_sb)); - if (*did) - err = vfs_getattr(h_path, &st); - - return err; -} - -/* ---------------------------------------------------------------------- */ - -struct file *vfsub_dentry_open(struct path *path, int flags) -{ - struct file *file; - - file = dentry_open(path, flags /* | __FMODE_NONOTIFY */, - current_cred()); - if (!IS_ERR_OR_NULL(file) - && (file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) - i_readcount_inc(d_inode(path->dentry)); - - return file; -} - -struct file *vfsub_filp_open(const char *path, int oflags, int mode) -{ - struct file *file; - - lockdep_off(); - file = filp_open(path, - oflags /* | __FMODE_NONOTIFY */, - mode); - lockdep_on(); - if (IS_ERR(file)) - goto out; - vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/ - -out: - return file; -} - -/* - * Ideally this function should call VFS:do_last() in order to keep all its - * checkings. But it is very hard for aufs to regenerate several VFS internal - * structure such as nameidata. This is a second (or third) best approach. - * cf. linux/fs/namei.c:do_last(), lookup_open() and atomic_open(). - */ -int vfsub_atomic_open(struct inode *dir, struct dentry *dentry, - struct vfsub_aopen_args *args, struct au_branch *br) -{ - int err; - struct file *file = args->file; - /* copied from linux/fs/namei.c:atomic_open() */ - struct dentry *const DENTRY_NOT_SET = (void *)-1UL; - - IMustLock(dir); - AuDebugOn(!dir->i_op->atomic_open); - - err = au_br_test_oflag(args->open_flag, br); - if (unlikely(err)) - goto out; - - args->file->f_path.dentry = DENTRY_NOT_SET; - args->file->f_path.mnt = au_br_mnt(br); - err = dir->i_op->atomic_open(dir, dentry, file, args->open_flag, - args->create_mode, args->opened); - if (err >= 0) { - /* some filesystems don't set FILE_CREATED while succeeded? */ - if (*args->opened & FILE_CREATED) - fsnotify_create(dir, dentry); - } else - goto out; - - - if (!err) { - /* todo: call VFS:may_open() here */ - err = open_check_o_direct(file); - /* todo: ima_file_check() too? */ - if (!err && (args->open_flag & __FMODE_EXEC)) - err = deny_write_access(file); - if (unlikely(err)) - /* note that the file is created and still opened */ - goto out; - } - - atomic_inc(&br->br_count); - fsnotify_open(file); - -out: - return err; -} - -int vfsub_kern_path(const char *name, unsigned int flags, struct path *path) -{ - int err; - - err = kern_path(name, flags, path); - if (!err && d_is_positive(path->dentry)) - vfsub_update_h_iattr(path, /*did*/NULL); /*ignore*/ - return err; -} - -struct dentry *vfsub_lookup_one_len(const char *name, struct dentry *parent, - int len) -{ - struct path path = { - .mnt = NULL - }; - - /* VFS checks it too, but by WARN_ON_ONCE() */ - IMustLock(d_inode(parent)); - - path.dentry = lookup_one_len(name, parent, len); - if (IS_ERR(path.dentry)) - goto out; - if (d_is_positive(path.dentry)) - vfsub_update_h_iattr(&path, /*did*/NULL); /*ignore*/ - -out: - AuTraceErrPtr(path.dentry); - return path.dentry; -} - -void vfsub_call_lkup_one(void *args) -{ - struct vfsub_lkup_one_args *a = args; - *a->errp = vfsub_lkup_one(a->name, a->parent); -} - -/* ---------------------------------------------------------------------- */ - -struct dentry *vfsub_lock_rename(struct dentry *d1, struct au_hinode *hdir1, - struct dentry *d2, struct au_hinode *hdir2) -{ - struct dentry *d; - - lockdep_off(); - d = lock_rename(d1, d2); - lockdep_on(); - au_hn_suspend(hdir1); - if (hdir1 != hdir2) - au_hn_suspend(hdir2); - - return d; -} - -void vfsub_unlock_rename(struct dentry *d1, struct au_hinode *hdir1, - struct dentry *d2, struct au_hinode *hdir2) -{ - au_hn_resume(hdir1); - if (hdir1 != hdir2) - au_hn_resume(hdir2); - lockdep_off(); - unlock_rename(d1, d2); - lockdep_on(); -} - -/* ---------------------------------------------------------------------- */ - -int vfsub_create(struct inode *dir, struct path *path, int mode, bool want_excl) -{ - int err; - struct dentry *d; - - IMustLock(dir); - - d = path->dentry; - path->dentry = d->d_parent; - err = security_path_mknod(path, d, mode, 0); - path->dentry = d; - if (unlikely(err)) - goto out; - - lockdep_off(); - err = vfs_create(dir, path->dentry, mode, want_excl); - lockdep_on(); - if (!err) { - struct path tmp = *path; - int did; - - vfsub_update_h_iattr(&tmp, &did); - if (did) { - tmp.dentry = path->dentry->d_parent; - vfsub_update_h_iattr(&tmp, /*did*/NULL); - } - /*ignore*/ - } - -out: - return err; -} - -int vfsub_symlink(struct inode *dir, struct path *path, const char *symname) -{ - int err; - struct dentry *d; - - IMustLock(dir); - - d = path->dentry; - path->dentry = d->d_parent; - err = security_path_symlink(path, d, symname); - path->dentry = d; - if (unlikely(err)) - goto out; - - lockdep_off(); - err = vfs_symlink(dir, path->dentry, symname); - lockdep_on(); - if (!err) { - struct path tmp = *path; - int did; - - vfsub_update_h_iattr(&tmp, &did); - if (did) { - tmp.dentry = path->dentry->d_parent; - vfsub_update_h_iattr(&tmp, /*did*/NULL); - } - /*ignore*/ - } - -out: - return err; -} - -int vfsub_mknod(struct inode *dir, struct path *path, int mode, dev_t dev) -{ - int err; - struct dentry *d; - - IMustLock(dir); - - d = path->dentry; - path->dentry = d->d_parent; - err = security_path_mknod(path, d, mode, new_encode_dev(dev)); - path->dentry = d; - if (unlikely(err)) - goto out; - - lockdep_off(); - err = vfs_mknod(dir, path->dentry, mode, dev); - lockdep_on(); - if (!err) { - struct path tmp = *path; - int did; - - vfsub_update_h_iattr(&tmp, &did); - if (did) { - tmp.dentry = path->dentry->d_parent; - vfsub_update_h_iattr(&tmp, /*did*/NULL); - } - /*ignore*/ - } - -out: - return err; -} - -static int au_test_nlink(struct inode *inode) -{ - const unsigned int link_max = UINT_MAX >> 1; /* rough margin */ - - if (!au_test_fs_no_limit_nlink(inode->i_sb) - || inode->i_nlink < link_max) - return 0; - return -EMLINK; -} - -int vfsub_link(struct dentry *src_dentry, struct inode *dir, struct path *path, - struct inode **delegated_inode) -{ - int err; - struct dentry *d; - - IMustLock(dir); - - err = au_test_nlink(d_inode(src_dentry)); - if (unlikely(err)) - return err; - - /* we don't call may_linkat() */ - d = path->dentry; - path->dentry = d->d_parent; - err = security_path_link(src_dentry, path, d); - path->dentry = d; - if (unlikely(err)) - goto out; - - lockdep_off(); - err = vfs_link(src_dentry, dir, path->dentry, delegated_inode); - lockdep_on(); - if (!err) { - struct path tmp = *path; - int did; - - /* fuse has different memory inode for the same inumber */ - vfsub_update_h_iattr(&tmp, &did); - if (did) { - tmp.dentry = path->dentry->d_parent; - vfsub_update_h_iattr(&tmp, /*did*/NULL); - tmp.dentry = src_dentry; - vfsub_update_h_iattr(&tmp, /*did*/NULL); - } - /*ignore*/ - } - -out: - return err; -} - -int vfsub_rename(struct inode *src_dir, struct dentry *src_dentry, - struct inode *dir, struct path *path, - struct inode **delegated_inode) -{ - int err; - struct path tmp = { - .mnt = path->mnt - }; - struct dentry *d; - - IMustLock(dir); - IMustLock(src_dir); - - d = path->dentry; - path->dentry = d->d_parent; - tmp.dentry = src_dentry->d_parent; - err = security_path_rename(&tmp, src_dentry, path, d, /*flags*/0); - path->dentry = d; - if (unlikely(err)) - goto out; - - lockdep_off(); - err = vfs_rename(src_dir, src_dentry, dir, path->dentry, - delegated_inode, /*flags*/0); - lockdep_on(); - if (!err) { - int did; - - tmp.dentry = d->d_parent; - vfsub_update_h_iattr(&tmp, &did); - if (did) { - tmp.dentry = src_dentry; - vfsub_update_h_iattr(&tmp, /*did*/NULL); - tmp.dentry = src_dentry->d_parent; - vfsub_update_h_iattr(&tmp, /*did*/NULL); - } - /*ignore*/ - } - -out: - return err; -} - -int vfsub_mkdir(struct inode *dir, struct path *path, int mode) -{ - int err; - struct dentry *d; - - IMustLock(dir); - - d = path->dentry; - path->dentry = d->d_parent; - err = security_path_mkdir(path, d, mode); - path->dentry = d; - if (unlikely(err)) - goto out; - - lockdep_off(); - err = vfs_mkdir(dir, path->dentry, mode); - lockdep_on(); - if (!err) { - struct path tmp = *path; - int did; - - vfsub_update_h_iattr(&tmp, &did); - if (did) { - tmp.dentry = path->dentry->d_parent; - vfsub_update_h_iattr(&tmp, /*did*/NULL); - } - /*ignore*/ - } - -out: - return err; -} - -int vfsub_rmdir(struct inode *dir, struct path *path) -{ - int err; - struct dentry *d; - - IMustLock(dir); - - d = path->dentry; - path->dentry = d->d_parent; - err = security_path_rmdir(path, d); - path->dentry = d; - if (unlikely(err)) - goto out; - - lockdep_off(); - err = vfs_rmdir(dir, path->dentry); - lockdep_on(); - if (!err) { - struct path tmp = { - .dentry = path->dentry->d_parent, - .mnt = path->mnt - }; - - vfsub_update_h_iattr(&tmp, /*did*/NULL); /*ignore*/ - } - -out: - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* todo: support mmap_sem? */ -ssize_t vfsub_read_u(struct file *file, char __user *ubuf, size_t count, - loff_t *ppos) -{ - ssize_t err; - - lockdep_off(); - err = vfs_read(file, ubuf, count, ppos); - lockdep_on(); - if (err >= 0) - vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/ - return err; -} - -/* todo: kernel_read()? */ -ssize_t vfsub_read_k(struct file *file, void *kbuf, size_t count, - loff_t *ppos) -{ - ssize_t err; - mm_segment_t oldfs; - union { - void *k; - char __user *u; - } buf; - - buf.k = kbuf; - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = vfsub_read_u(file, buf.u, count, ppos); - set_fs(oldfs); - return err; -} - -ssize_t vfsub_write_u(struct file *file, const char __user *ubuf, size_t count, - loff_t *ppos) -{ - ssize_t err; - - lockdep_off(); - err = vfs_write(file, ubuf, count, ppos); - lockdep_on(); - if (err >= 0) - vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/ - return err; -} - -ssize_t vfsub_write_k(struct file *file, void *kbuf, size_t count, loff_t *ppos) -{ - ssize_t err; - mm_segment_t oldfs; - union { - void *k; - const char __user *u; - } buf; - - buf.k = kbuf; - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = vfsub_write_u(file, buf.u, count, ppos); - set_fs(oldfs); - return err; -} - -int vfsub_flush(struct file *file, fl_owner_t id) -{ - int err; - - err = 0; - if (file->f_op->flush) { - if (!au_test_nfs(file->f_path.dentry->d_sb)) - err = file->f_op->flush(file, id); - else { - lockdep_off(); - err = file->f_op->flush(file, id); - lockdep_on(); - } - if (!err) - vfsub_update_h_iattr(&file->f_path, /*did*/NULL); - /*ignore*/ - } - return err; -} - -int vfsub_iterate_dir(struct file *file, struct dir_context *ctx) -{ - int err; - - AuDbg("%pD, ctx{%pf, %llu}\n", file, ctx->actor, ctx->pos); - - lockdep_off(); - err = iterate_dir(file, ctx); - lockdep_on(); - if (err >= 0) - vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/ - return err; -} - -long vfsub_splice_to(struct file *in, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) -{ - long err; - - lockdep_off(); - err = do_splice_to(in, ppos, pipe, len, flags); - lockdep_on(); - file_accessed(in); - if (err >= 0) - vfsub_update_h_iattr(&in->f_path, /*did*/NULL); /*ignore*/ - return err; -} - -long vfsub_splice_from(struct pipe_inode_info *pipe, struct file *out, - loff_t *ppos, size_t len, unsigned int flags) -{ - long err; - - lockdep_off(); - err = do_splice_from(pipe, out, ppos, len, flags); - lockdep_on(); - if (err >= 0) - vfsub_update_h_iattr(&out->f_path, /*did*/NULL); /*ignore*/ - return err; -} - -int vfsub_fsync(struct file *file, struct path *path, int datasync) -{ - int err; - - /* file can be NULL */ - lockdep_off(); - err = vfs_fsync(file, datasync); - lockdep_on(); - if (!err) { - if (!path) { - AuDebugOn(!file); - path = &file->f_path; - } - vfsub_update_h_iattr(path, /*did*/NULL); /*ignore*/ - } - return err; -} - -/* cf. open.c:do_sys_truncate() and do_sys_ftruncate() */ -int vfsub_trunc(struct path *h_path, loff_t length, unsigned int attr, - struct file *h_file) -{ - int err; - struct inode *h_inode; - struct super_block *h_sb; - - if (!h_file) { - err = vfsub_truncate(h_path, length); - goto out; - } - - h_inode = d_inode(h_path->dentry); - h_sb = h_inode->i_sb; - lockdep_off(); - sb_start_write(h_sb); - lockdep_on(); - err = locks_verify_truncate(h_inode, h_file, length); - if (!err) - err = security_path_truncate(h_path); - if (!err) { - lockdep_off(); - err = do_truncate(h_path->dentry, length, attr, h_file); - lockdep_on(); - } - lockdep_off(); - sb_end_write(h_sb); - lockdep_on(); - -out: - return err; -} - -/* ---------------------------------------------------------------------- */ - -struct au_vfsub_mkdir_args { - int *errp; - struct inode *dir; - struct path *path; - int mode; -}; - -static void au_call_vfsub_mkdir(void *args) -{ - struct au_vfsub_mkdir_args *a = args; - *a->errp = vfsub_mkdir(a->dir, a->path, a->mode); -} - -int vfsub_sio_mkdir(struct inode *dir, struct path *path, int mode) -{ - int err, do_sio, wkq_err; - - do_sio = au_test_h_perm_sio(dir, MAY_EXEC | MAY_WRITE); - if (!do_sio) { - lockdep_off(); - err = vfsub_mkdir(dir, path, mode); - lockdep_on(); - } else { - struct au_vfsub_mkdir_args args = { - .errp = &err, - .dir = dir, - .path = path, - .mode = mode - }; - wkq_err = au_wkq_wait(au_call_vfsub_mkdir, &args); - if (unlikely(wkq_err)) - err = wkq_err; - } - - return err; -} - -struct au_vfsub_rmdir_args { - int *errp; - struct inode *dir; - struct path *path; -}; - -static void au_call_vfsub_rmdir(void *args) -{ - struct au_vfsub_rmdir_args *a = args; - *a->errp = vfsub_rmdir(a->dir, a->path); -} - -int vfsub_sio_rmdir(struct inode *dir, struct path *path) -{ - int err, do_sio, wkq_err; - - do_sio = au_test_h_perm_sio(dir, MAY_EXEC | MAY_WRITE); - if (!do_sio) { - lockdep_off(); - err = vfsub_rmdir(dir, path); - lockdep_on(); - } else { - struct au_vfsub_rmdir_args args = { - .errp = &err, - .dir = dir, - .path = path - }; - wkq_err = au_wkq_wait(au_call_vfsub_rmdir, &args); - if (unlikely(wkq_err)) - err = wkq_err; - } - - return err; -} - -/* ---------------------------------------------------------------------- */ - -struct notify_change_args { - int *errp; - struct path *path; - struct iattr *ia; - struct inode **delegated_inode; -}; - -static void call_notify_change(void *args) -{ - struct notify_change_args *a = args; - struct inode *h_inode; - - h_inode = d_inode(a->path->dentry); - IMustLock(h_inode); - - *a->errp = -EPERM; - if (!IS_IMMUTABLE(h_inode) && !IS_APPEND(h_inode)) { - lockdep_off(); - *a->errp = notify_change(a->path->dentry, a->ia, - a->delegated_inode); - lockdep_on(); - if (!*a->errp) - vfsub_update_h_iattr(a->path, /*did*/NULL); /*ignore*/ - } - AuTraceErr(*a->errp); -} - -int vfsub_notify_change(struct path *path, struct iattr *ia, - struct inode **delegated_inode) -{ - int err; - struct notify_change_args args = { - .errp = &err, - .path = path, - .ia = ia, - .delegated_inode = delegated_inode - }; - - call_notify_change(&args); - - return err; -} - -int vfsub_sio_notify_change(struct path *path, struct iattr *ia, - struct inode **delegated_inode) -{ - int err, wkq_err; - struct notify_change_args args = { - .errp = &err, - .path = path, - .ia = ia, - .delegated_inode = delegated_inode - }; - - wkq_err = au_wkq_wait(call_notify_change, &args); - if (unlikely(wkq_err)) - err = wkq_err; - - return err; -} - -/* ---------------------------------------------------------------------- */ - -struct unlink_args { - int *errp; - struct inode *dir; - struct path *path; - struct inode **delegated_inode; -}; - -static void call_unlink(void *args) -{ - struct unlink_args *a = args; - struct dentry *d = a->path->dentry; - struct inode *h_inode; - const int stop_sillyrename = (au_test_nfs(d->d_sb) - && au_dcount(d) == 1); - - IMustLock(a->dir); - - a->path->dentry = d->d_parent; - *a->errp = security_path_unlink(a->path, d); - a->path->dentry = d; - if (unlikely(*a->errp)) - return; - - if (!stop_sillyrename) - dget(d); - h_inode = NULL; - if (d_is_positive(d)) { - h_inode = d_inode(d); - ihold(h_inode); - } - - lockdep_off(); - *a->errp = vfs_unlink(a->dir, d, a->delegated_inode); - lockdep_on(); - if (!*a->errp) { - struct path tmp = { - .dentry = d->d_parent, - .mnt = a->path->mnt - }; - vfsub_update_h_iattr(&tmp, /*did*/NULL); /*ignore*/ - } - - if (!stop_sillyrename) - dput(d); - if (h_inode) - iput(h_inode); - - AuTraceErr(*a->errp); -} - -/* - * @dir: must be locked. - * @dentry: target dentry. - */ -int vfsub_unlink(struct inode *dir, struct path *path, - struct inode **delegated_inode, int force) -{ - int err; - struct unlink_args args = { - .errp = &err, - .dir = dir, - .path = path, - .delegated_inode = delegated_inode - }; - - if (!force) - call_unlink(&args); - else { - int wkq_err; - - wkq_err = au_wkq_wait(call_unlink, &args); - if (unlikely(wkq_err)) - err = wkq_err; - } - - return err; -} diff --git a/fs/aufs/vfsub.h b/fs/aufs/vfsub.h deleted file mode 100644 index 5c89f8e56..000000000 --- a/fs/aufs/vfsub.h +++ /dev/null @@ -1,286 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * sub-routines for VFS - */ - -#ifndef __AUFS_VFSUB_H__ -#define __AUFS_VFSUB_H__ - -#ifdef __KERNEL__ - -#include <linux/fs.h> -#include <linux/mount.h> -#include <linux/xattr.h> -#include "debug.h" - -/* copied from linux/fs/internal.h */ -/* todo: BAD approach!! */ -extern void __mnt_drop_write(struct vfsmount *); -extern spinlock_t inode_sb_list_lock; -extern int open_check_o_direct(struct file *f); - -/* ---------------------------------------------------------------------- */ - -/* lock subclass for lower inode */ -/* default MAX_LOCKDEP_SUBCLASSES(8) is not enough */ -/* reduce? gave up. */ -enum { - AuLsc_I_Begin = I_MUTEX_PARENT2, /* 5 */ - AuLsc_I_PARENT, /* lower inode, parent first */ - AuLsc_I_PARENT2, /* copyup dirs */ - AuLsc_I_PARENT3, /* copyup wh */ - AuLsc_I_CHILD, - AuLsc_I_CHILD2, - AuLsc_I_End -}; - -/* to debug easier, do not make them inlined functions */ -#define MtxMustLock(mtx) AuDebugOn(!mutex_is_locked(mtx)) -#define IMustLock(i) MtxMustLock(&(i)->i_mutex) - -/* ---------------------------------------------------------------------- */ - -static inline void vfsub_drop_nlink(struct inode *inode) -{ - AuDebugOn(!inode->i_nlink); - drop_nlink(inode); -} - -static inline void vfsub_dead_dir(struct inode *inode) -{ - AuDebugOn(!S_ISDIR(inode->i_mode)); - inode->i_flags |= S_DEAD; - clear_nlink(inode); -} - -static inline int vfsub_native_ro(struct inode *inode) -{ - return (inode->i_sb->s_flags & MS_RDONLY) - || IS_RDONLY(inode) - /* || IS_APPEND(inode) */ - || IS_IMMUTABLE(inode); -} - -/* ---------------------------------------------------------------------- */ - -int vfsub_update_h_iattr(struct path *h_path, int *did); -struct file *vfsub_dentry_open(struct path *path, int flags); -struct file *vfsub_filp_open(const char *path, int oflags, int mode); -struct vfsub_aopen_args { - struct file *file; - unsigned int open_flag; - umode_t create_mode; - int *opened; -}; -struct au_branch; -int vfsub_atomic_open(struct inode *dir, struct dentry *dentry, - struct vfsub_aopen_args *args, struct au_branch *br); -int vfsub_kern_path(const char *name, unsigned int flags, struct path *path); - -struct dentry *vfsub_lookup_one_len(const char *name, struct dentry *parent, - int len); - -struct vfsub_lkup_one_args { - struct dentry **errp; - struct qstr *name; - struct dentry *parent; -}; - -static inline struct dentry *vfsub_lkup_one(struct qstr *name, - struct dentry *parent) -{ - return vfsub_lookup_one_len(name->name, parent, name->len); -} - -void vfsub_call_lkup_one(void *args); - -/* ---------------------------------------------------------------------- */ - -static inline int vfsub_mnt_want_write(struct vfsmount *mnt) -{ - int err; - - lockdep_off(); - err = mnt_want_write(mnt); - lockdep_on(); - return err; -} - -static inline void vfsub_mnt_drop_write(struct vfsmount *mnt) -{ - lockdep_off(); - mnt_drop_write(mnt); - lockdep_on(); -} - -#if 0 /* reserved */ -static inline void vfsub_mnt_drop_write_file(struct file *file) -{ - lockdep_off(); - mnt_drop_write_file(file); - lockdep_on(); -} -#endif - -/* ---------------------------------------------------------------------- */ - -struct au_hinode; -struct dentry *vfsub_lock_rename(struct dentry *d1, struct au_hinode *hdir1, - struct dentry *d2, struct au_hinode *hdir2); -void vfsub_unlock_rename(struct dentry *d1, struct au_hinode *hdir1, - struct dentry *d2, struct au_hinode *hdir2); - -int vfsub_create(struct inode *dir, struct path *path, int mode, - bool want_excl); -int vfsub_symlink(struct inode *dir, struct path *path, - const char *symname); -int vfsub_mknod(struct inode *dir, struct path *path, int mode, dev_t dev); -int vfsub_link(struct dentry *src_dentry, struct inode *dir, - struct path *path, struct inode **delegated_inode); -int vfsub_rename(struct inode *src_hdir, struct dentry *src_dentry, - struct inode *hdir, struct path *path, - struct inode **delegated_inode); -int vfsub_mkdir(struct inode *dir, struct path *path, int mode); -int vfsub_rmdir(struct inode *dir, struct path *path); - -/* ---------------------------------------------------------------------- */ - -ssize_t vfsub_read_u(struct file *file, char __user *ubuf, size_t count, - loff_t *ppos); -ssize_t vfsub_read_k(struct file *file, void *kbuf, size_t count, - loff_t *ppos); -ssize_t vfsub_write_u(struct file *file, const char __user *ubuf, size_t count, - loff_t *ppos); -ssize_t vfsub_write_k(struct file *file, void *kbuf, size_t count, - loff_t *ppos); -int vfsub_flush(struct file *file, fl_owner_t id); -int vfsub_iterate_dir(struct file *file, struct dir_context *ctx); - -static inline loff_t vfsub_f_size_read(struct file *file) -{ - return i_size_read(file_inode(file)); -} - -static inline unsigned int vfsub_file_flags(struct file *file) -{ - unsigned int flags; - - spin_lock(&file->f_lock); - flags = file->f_flags; - spin_unlock(&file->f_lock); - - return flags; -} - -#if 0 /* reserved */ -static inline void vfsub_file_accessed(struct file *h_file) -{ - file_accessed(h_file); - vfsub_update_h_iattr(&h_file->f_path, /*did*/NULL); /*ignore*/ -} -#endif - -static inline void vfsub_touch_atime(struct vfsmount *h_mnt, - struct dentry *h_dentry) -{ - struct path h_path = { - .dentry = h_dentry, - .mnt = h_mnt - }; - touch_atime(&h_path); - vfsub_update_h_iattr(&h_path, /*did*/NULL); /*ignore*/ -} - -static inline int vfsub_update_time(struct inode *h_inode, struct timespec *ts, - int flags) -{ - return generic_update_time(h_inode, ts, flags); - /* no vfsub_update_h_iattr() since we don't have struct path */ -} - -long vfsub_splice_to(struct file *in, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags); -long vfsub_splice_from(struct pipe_inode_info *pipe, struct file *out, - loff_t *ppos, size_t len, unsigned int flags); - -static inline long vfsub_truncate(struct path *path, loff_t length) -{ - long err; - - lockdep_off(); - err = vfs_truncate(path, length); - lockdep_on(); - return err; -} - -int vfsub_trunc(struct path *h_path, loff_t length, unsigned int attr, - struct file *h_file); -int vfsub_fsync(struct file *file, struct path *path, int datasync); - -/* ---------------------------------------------------------------------- */ - -static inline loff_t vfsub_llseek(struct file *file, loff_t offset, int origin) -{ - loff_t err; - - lockdep_off(); - err = vfs_llseek(file, offset, origin); - lockdep_on(); - return err; -} - -/* ---------------------------------------------------------------------- */ - -int vfsub_sio_mkdir(struct inode *dir, struct path *path, int mode); -int vfsub_sio_rmdir(struct inode *dir, struct path *path); -int vfsub_sio_notify_change(struct path *path, struct iattr *ia, - struct inode **delegated_inode); -int vfsub_notify_change(struct path *path, struct iattr *ia, - struct inode **delegated_inode); -int vfsub_unlink(struct inode *dir, struct path *path, - struct inode **delegated_inode, int force); - -/* ---------------------------------------------------------------------- */ - -static inline int vfsub_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) -{ - int err; - - lockdep_off(); - err = vfs_setxattr(dentry, name, value, size, flags); - lockdep_on(); - - return err; -} - -static inline int vfsub_removexattr(struct dentry *dentry, const char *name) -{ - int err; - - lockdep_off(); - err = vfs_removexattr(dentry, name); - lockdep_on(); - - return err; -} - -#endif /* __KERNEL__ */ -#endif /* __AUFS_VFSUB_H__ */ diff --git a/fs/aufs/wbr_policy.c b/fs/aufs/wbr_policy.c deleted file mode 100644 index 3949efb50..000000000 --- a/fs/aufs/wbr_policy.c +++ /dev/null @@ -1,765 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * policies for selecting one among multiple writable branches - */ - -#include <linux/statfs.h> -#include "aufs.h" - -/* subset of cpup_attr() */ -static noinline_for_stack -int au_cpdown_attr(struct path *h_path, struct dentry *h_src) -{ - int err, sbits; - struct iattr ia; - struct inode *h_isrc; - - h_isrc = d_inode(h_src); - ia.ia_valid = ATTR_FORCE | ATTR_MODE | ATTR_UID | ATTR_GID; - ia.ia_mode = h_isrc->i_mode; - ia.ia_uid = h_isrc->i_uid; - ia.ia_gid = h_isrc->i_gid; - sbits = !!(ia.ia_mode & (S_ISUID | S_ISGID)); - au_cpup_attr_flags(d_inode(h_path->dentry), h_isrc->i_flags); - /* no delegation since it is just created */ - err = vfsub_sio_notify_change(h_path, &ia, /*delegated*/NULL); - - /* is this nfs only? */ - if (!err && sbits && au_test_nfs(h_path->dentry->d_sb)) { - ia.ia_valid = ATTR_FORCE | ATTR_MODE; - ia.ia_mode = h_isrc->i_mode; - err = vfsub_sio_notify_change(h_path, &ia, /*delegated*/NULL); - } - - return err; -} - -#define AuCpdown_PARENT_OPQ 1 -#define AuCpdown_WHED (1 << 1) -#define AuCpdown_MADE_DIR (1 << 2) -#define AuCpdown_DIROPQ (1 << 3) -#define au_ftest_cpdown(flags, name) ((flags) & AuCpdown_##name) -#define au_fset_cpdown(flags, name) \ - do { (flags) |= AuCpdown_##name; } while (0) -#define au_fclr_cpdown(flags, name) \ - do { (flags) &= ~AuCpdown_##name; } while (0) - -static int au_cpdown_dir_opq(struct dentry *dentry, aufs_bindex_t bdst, - unsigned int *flags) -{ - int err; - struct dentry *opq_dentry; - - opq_dentry = au_diropq_create(dentry, bdst); - err = PTR_ERR(opq_dentry); - if (IS_ERR(opq_dentry)) - goto out; - dput(opq_dentry); - au_fset_cpdown(*flags, DIROPQ); - -out: - return err; -} - -static int au_cpdown_dir_wh(struct dentry *dentry, struct dentry *h_parent, - struct inode *dir, aufs_bindex_t bdst) -{ - int err; - struct path h_path; - struct au_branch *br; - - br = au_sbr(dentry->d_sb, bdst); - h_path.dentry = au_wh_lkup(h_parent, &dentry->d_name, br); - err = PTR_ERR(h_path.dentry); - if (IS_ERR(h_path.dentry)) - goto out; - - err = 0; - if (d_is_positive(h_path.dentry)) { - h_path.mnt = au_br_mnt(br); - err = au_wh_unlink_dentry(au_h_iptr(dir, bdst), &h_path, - dentry); - } - dput(h_path.dentry); - -out: - return err; -} - -static int au_cpdown_dir(struct dentry *dentry, aufs_bindex_t bdst, - struct au_pin *pin, - struct dentry *h_parent, void *arg) -{ - int err, rerr; - aufs_bindex_t bopq, bstart; - struct path h_path; - struct dentry *parent; - struct inode *h_dir, *h_inode, *inode, *dir; - unsigned int *flags = arg; - - bstart = au_dbstart(dentry); - /* dentry is di-locked */ - parent = dget_parent(dentry); - dir = d_inode(parent); - h_dir = d_inode(h_parent); - AuDebugOn(h_dir != au_h_iptr(dir, bdst)); - IMustLock(h_dir); - - err = au_lkup_neg(dentry, bdst, /*wh*/0); - if (unlikely(err < 0)) - goto out; - h_path.dentry = au_h_dptr(dentry, bdst); - h_path.mnt = au_sbr_mnt(dentry->d_sb, bdst); - err = vfsub_sio_mkdir(au_h_iptr(dir, bdst), &h_path, - S_IRWXU | S_IRUGO | S_IXUGO); - if (unlikely(err)) - goto out_put; - au_fset_cpdown(*flags, MADE_DIR); - - bopq = au_dbdiropq(dentry); - au_fclr_cpdown(*flags, WHED); - au_fclr_cpdown(*flags, DIROPQ); - if (au_dbwh(dentry) == bdst) - au_fset_cpdown(*flags, WHED); - if (!au_ftest_cpdown(*flags, PARENT_OPQ) && bopq <= bdst) - au_fset_cpdown(*flags, PARENT_OPQ); - h_inode = d_inode(h_path.dentry); - mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); - if (au_ftest_cpdown(*flags, WHED)) { - err = au_cpdown_dir_opq(dentry, bdst, flags); - if (unlikely(err)) { - mutex_unlock(&h_inode->i_mutex); - goto out_dir; - } - } - - err = au_cpdown_attr(&h_path, au_h_dptr(dentry, bstart)); - mutex_unlock(&h_inode->i_mutex); - if (unlikely(err)) - goto out_opq; - - if (au_ftest_cpdown(*flags, WHED)) { - err = au_cpdown_dir_wh(dentry, h_parent, dir, bdst); - if (unlikely(err)) - goto out_opq; - } - - inode = d_inode(dentry); - if (au_ibend(inode) < bdst) - au_set_ibend(inode, bdst); - au_set_h_iptr(inode, bdst, au_igrab(h_inode), - au_hi_flags(inode, /*isdir*/1)); - au_fhsm_wrote(dentry->d_sb, bdst, /*force*/0); - goto out; /* success */ - - /* revert */ -out_opq: - if (au_ftest_cpdown(*flags, DIROPQ)) { - mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); - rerr = au_diropq_remove(dentry, bdst); - mutex_unlock(&h_inode->i_mutex); - if (unlikely(rerr)) { - AuIOErr("failed removing diropq for %pd b%d (%d)\n", - dentry, bdst, rerr); - err = -EIO; - goto out; - } - } -out_dir: - if (au_ftest_cpdown(*flags, MADE_DIR)) { - rerr = vfsub_sio_rmdir(au_h_iptr(dir, bdst), &h_path); - if (unlikely(rerr)) { - AuIOErr("failed removing %pd b%d (%d)\n", - dentry, bdst, rerr); - err = -EIO; - } - } -out_put: - au_set_h_dptr(dentry, bdst, NULL); - if (au_dbend(dentry) == bdst) - au_update_dbend(dentry); -out: - dput(parent); - return err; -} - -int au_cpdown_dirs(struct dentry *dentry, aufs_bindex_t bdst) -{ - int err; - unsigned int flags; - - flags = 0; - err = au_cp_dirs(dentry, bdst, au_cpdown_dir, &flags); - - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* policies for create */ - -int au_wbr_nonopq(struct dentry *dentry, aufs_bindex_t bindex) -{ - int err, i, j, ndentry; - aufs_bindex_t bopq; - struct au_dcsub_pages dpages; - struct au_dpage *dpage; - struct dentry **dentries, *parent, *d; - - err = au_dpages_init(&dpages, GFP_NOFS); - if (unlikely(err)) - goto out; - parent = dget_parent(dentry); - err = au_dcsub_pages_rev_aufs(&dpages, parent, /*do_include*/0); - if (unlikely(err)) - goto out_free; - - err = bindex; - for (i = 0; i < dpages.ndpage; i++) { - dpage = dpages.dpages + i; - dentries = dpage->dentries; - ndentry = dpage->ndentry; - for (j = 0; j < ndentry; j++) { - d = dentries[j]; - di_read_lock_parent2(d, !AuLock_IR); - bopq = au_dbdiropq(d); - di_read_unlock(d, !AuLock_IR); - if (bopq >= 0 && bopq < err) - err = bopq; - } - } - -out_free: - dput(parent); - au_dpages_free(&dpages); -out: - return err; -} - -static int au_wbr_bu(struct super_block *sb, aufs_bindex_t bindex) -{ - for (; bindex >= 0; bindex--) - if (!au_br_rdonly(au_sbr(sb, bindex))) - return bindex; - return -EROFS; -} - -/* top down parent */ -static int au_wbr_create_tdp(struct dentry *dentry, - unsigned int flags __maybe_unused) -{ - int err; - aufs_bindex_t bstart, bindex; - struct super_block *sb; - struct dentry *parent, *h_parent; - - sb = dentry->d_sb; - bstart = au_dbstart(dentry); - err = bstart; - if (!au_br_rdonly(au_sbr(sb, bstart))) - goto out; - - err = -EROFS; - parent = dget_parent(dentry); - for (bindex = au_dbstart(parent); bindex < bstart; bindex++) { - h_parent = au_h_dptr(parent, bindex); - if (!h_parent || d_is_negative(h_parent)) - continue; - - if (!au_br_rdonly(au_sbr(sb, bindex))) { - err = bindex; - break; - } - } - dput(parent); - - /* bottom up here */ - if (unlikely(err < 0)) { - err = au_wbr_bu(sb, bstart - 1); - if (err >= 0) - err = au_wbr_nonopq(dentry, err); - } - -out: - AuDbg("b%d\n", err); - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* an exception for the policy other than tdp */ -static int au_wbr_create_exp(struct dentry *dentry) -{ - int err; - aufs_bindex_t bwh, bdiropq; - struct dentry *parent; - - err = -1; - bwh = au_dbwh(dentry); - parent = dget_parent(dentry); - bdiropq = au_dbdiropq(parent); - if (bwh >= 0) { - if (bdiropq >= 0) - err = min(bdiropq, bwh); - else - err = bwh; - AuDbg("%d\n", err); - } else if (bdiropq >= 0) { - err = bdiropq; - AuDbg("%d\n", err); - } - dput(parent); - - if (err >= 0) - err = au_wbr_nonopq(dentry, err); - - if (err >= 0 && au_br_rdonly(au_sbr(dentry->d_sb, err))) - err = -1; - - AuDbg("%d\n", err); - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* round robin */ -static int au_wbr_create_init_rr(struct super_block *sb) -{ - int err; - - err = au_wbr_bu(sb, au_sbend(sb)); - atomic_set(&au_sbi(sb)->si_wbr_rr_next, -err); /* less important */ - /* smp_mb(); */ - - AuDbg("b%d\n", err); - return err; -} - -static int au_wbr_create_rr(struct dentry *dentry, unsigned int flags) -{ - int err, nbr; - unsigned int u; - aufs_bindex_t bindex, bend; - struct super_block *sb; - atomic_t *next; - - err = au_wbr_create_exp(dentry); - if (err >= 0) - goto out; - - sb = dentry->d_sb; - next = &au_sbi(sb)->si_wbr_rr_next; - bend = au_sbend(sb); - nbr = bend + 1; - for (bindex = 0; bindex <= bend; bindex++) { - if (!au_ftest_wbr(flags, DIR)) { - err = atomic_dec_return(next) + 1; - /* modulo for 0 is meaningless */ - if (unlikely(!err)) - err = atomic_dec_return(next) + 1; - } else - err = atomic_read(next); - AuDbg("%d\n", err); - u = err; - err = u % nbr; - AuDbg("%d\n", err); - if (!au_br_rdonly(au_sbr(sb, err))) - break; - err = -EROFS; - } - - if (err >= 0) - err = au_wbr_nonopq(dentry, err); - -out: - AuDbg("%d\n", err); - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* most free space */ -static void au_mfs(struct dentry *dentry, struct dentry *parent) -{ - struct super_block *sb; - struct au_branch *br; - struct au_wbr_mfs *mfs; - struct dentry *h_parent; - aufs_bindex_t bindex, bend; - int err; - unsigned long long b, bavail; - struct path h_path; - /* reduce the stack usage */ - struct kstatfs *st; - - st = kmalloc(sizeof(*st), GFP_NOFS); - if (unlikely(!st)) { - AuWarn1("failed updating mfs(%d), ignored\n", -ENOMEM); - return; - } - - bavail = 0; - sb = dentry->d_sb; - mfs = &au_sbi(sb)->si_wbr_mfs; - MtxMustLock(&mfs->mfs_lock); - mfs->mfs_bindex = -EROFS; - mfs->mfsrr_bytes = 0; - if (!parent) { - bindex = 0; - bend = au_sbend(sb); - } else { - bindex = au_dbstart(parent); - bend = au_dbtaildir(parent); - } - - for (; bindex <= bend; bindex++) { - if (parent) { - h_parent = au_h_dptr(parent, bindex); - if (!h_parent || d_is_negative(h_parent)) - continue; - } - br = au_sbr(sb, bindex); - if (au_br_rdonly(br)) - continue; - - /* sb->s_root for NFS is unreliable */ - h_path.mnt = au_br_mnt(br); - h_path.dentry = h_path.mnt->mnt_root; - err = vfs_statfs(&h_path, st); - if (unlikely(err)) { - AuWarn1("failed statfs, b%d, %d\n", bindex, err); - continue; - } - - /* when the available size is equal, select the lower one */ - BUILD_BUG_ON(sizeof(b) < sizeof(st->f_bavail) - || sizeof(b) < sizeof(st->f_bsize)); - b = st->f_bavail * st->f_bsize; - br->br_wbr->wbr_bytes = b; - if (b >= bavail) { - bavail = b; - mfs->mfs_bindex = bindex; - mfs->mfs_jiffy = jiffies; - } - } - - mfs->mfsrr_bytes = bavail; - AuDbg("b%d\n", mfs->mfs_bindex); - kfree(st); -} - -static int au_wbr_create_mfs(struct dentry *dentry, unsigned int flags) -{ - int err; - struct dentry *parent; - struct super_block *sb; - struct au_wbr_mfs *mfs; - - err = au_wbr_create_exp(dentry); - if (err >= 0) - goto out; - - sb = dentry->d_sb; - parent = NULL; - if (au_ftest_wbr(flags, PARENT)) - parent = dget_parent(dentry); - mfs = &au_sbi(sb)->si_wbr_mfs; - mutex_lock(&mfs->mfs_lock); - if (time_after(jiffies, mfs->mfs_jiffy + mfs->mfs_expire) - || mfs->mfs_bindex < 0 - || au_br_rdonly(au_sbr(sb, mfs->mfs_bindex))) - au_mfs(dentry, parent); - mutex_unlock(&mfs->mfs_lock); - err = mfs->mfs_bindex; - dput(parent); - - if (err >= 0) - err = au_wbr_nonopq(dentry, err); - -out: - AuDbg("b%d\n", err); - return err; -} - -static int au_wbr_create_init_mfs(struct super_block *sb) -{ - struct au_wbr_mfs *mfs; - - mfs = &au_sbi(sb)->si_wbr_mfs; - mutex_init(&mfs->mfs_lock); - mfs->mfs_jiffy = 0; - mfs->mfs_bindex = -EROFS; - - return 0; -} - -static int au_wbr_create_fin_mfs(struct super_block *sb __maybe_unused) -{ - mutex_destroy(&au_sbi(sb)->si_wbr_mfs.mfs_lock); - return 0; -} - -/* ---------------------------------------------------------------------- */ - -/* most free space and then round robin */ -static int au_wbr_create_mfsrr(struct dentry *dentry, unsigned int flags) -{ - int err; - struct au_wbr_mfs *mfs; - - err = au_wbr_create_mfs(dentry, flags); - if (err >= 0) { - mfs = &au_sbi(dentry->d_sb)->si_wbr_mfs; - mutex_lock(&mfs->mfs_lock); - if (mfs->mfsrr_bytes < mfs->mfsrr_watermark) - err = au_wbr_create_rr(dentry, flags); - mutex_unlock(&mfs->mfs_lock); - } - - AuDbg("b%d\n", err); - return err; -} - -static int au_wbr_create_init_mfsrr(struct super_block *sb) -{ - int err; - - au_wbr_create_init_mfs(sb); /* ignore */ - err = au_wbr_create_init_rr(sb); - - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* top down parent and most free space */ -static int au_wbr_create_pmfs(struct dentry *dentry, unsigned int flags) -{ - int err, e2; - unsigned long long b; - aufs_bindex_t bindex, bstart, bend; - struct super_block *sb; - struct dentry *parent, *h_parent; - struct au_branch *br; - - err = au_wbr_create_tdp(dentry, flags); - if (unlikely(err < 0)) - goto out; - parent = dget_parent(dentry); - bstart = au_dbstart(parent); - bend = au_dbtaildir(parent); - if (bstart == bend) - goto out_parent; /* success */ - - e2 = au_wbr_create_mfs(dentry, flags); - if (e2 < 0) - goto out_parent; /* success */ - - /* when the available size is equal, select upper one */ - sb = dentry->d_sb; - br = au_sbr(sb, err); - b = br->br_wbr->wbr_bytes; - AuDbg("b%d, %llu\n", err, b); - - for (bindex = bstart; bindex <= bend; bindex++) { - h_parent = au_h_dptr(parent, bindex); - if (!h_parent || d_is_negative(h_parent)) - continue; - - br = au_sbr(sb, bindex); - if (!au_br_rdonly(br) && br->br_wbr->wbr_bytes > b) { - b = br->br_wbr->wbr_bytes; - err = bindex; - AuDbg("b%d, %llu\n", err, b); - } - } - - if (err >= 0) - err = au_wbr_nonopq(dentry, err); - -out_parent: - dput(parent); -out: - AuDbg("b%d\n", err); - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* - * - top down parent - * - most free space with parent - * - most free space round-robin regardless parent - */ -static int au_wbr_create_pmfsrr(struct dentry *dentry, unsigned int flags) -{ - int err; - unsigned long long watermark; - struct super_block *sb; - struct au_branch *br; - struct au_wbr_mfs *mfs; - - err = au_wbr_create_pmfs(dentry, flags | AuWbr_PARENT); - if (unlikely(err < 0)) - goto out; - - sb = dentry->d_sb; - br = au_sbr(sb, err); - mfs = &au_sbi(sb)->si_wbr_mfs; - mutex_lock(&mfs->mfs_lock); - watermark = mfs->mfsrr_watermark; - mutex_unlock(&mfs->mfs_lock); - if (br->br_wbr->wbr_bytes < watermark) - /* regardless the parent dir */ - err = au_wbr_create_mfsrr(dentry, flags); - -out: - AuDbg("b%d\n", err); - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* policies for copyup */ - -/* top down parent */ -static int au_wbr_copyup_tdp(struct dentry *dentry) -{ - return au_wbr_create_tdp(dentry, /*flags, anything is ok*/0); -} - -/* bottom up parent */ -static int au_wbr_copyup_bup(struct dentry *dentry) -{ - int err; - aufs_bindex_t bindex, bstart; - struct dentry *parent, *h_parent; - struct super_block *sb; - - err = -EROFS; - sb = dentry->d_sb; - parent = dget_parent(dentry); - bstart = au_dbstart(parent); - for (bindex = au_dbstart(dentry); bindex >= bstart; bindex--) { - h_parent = au_h_dptr(parent, bindex); - if (!h_parent || d_is_negative(h_parent)) - continue; - - if (!au_br_rdonly(au_sbr(sb, bindex))) { - err = bindex; - break; - } - } - dput(parent); - - /* bottom up here */ - if (unlikely(err < 0)) - err = au_wbr_bu(sb, bstart - 1); - - AuDbg("b%d\n", err); - return err; -} - -/* bottom up */ -int au_wbr_do_copyup_bu(struct dentry *dentry, aufs_bindex_t bstart) -{ - int err; - - err = au_wbr_bu(dentry->d_sb, bstart); - AuDbg("b%d\n", err); - if (err > bstart) - err = au_wbr_nonopq(dentry, err); - - AuDbg("b%d\n", err); - return err; -} - -static int au_wbr_copyup_bu(struct dentry *dentry) -{ - int err; - aufs_bindex_t bstart; - - bstart = au_dbstart(dentry); - err = au_wbr_do_copyup_bu(dentry, bstart); - return err; -} - -/* ---------------------------------------------------------------------- */ - -struct au_wbr_copyup_operations au_wbr_copyup_ops[] = { - [AuWbrCopyup_TDP] = { - .copyup = au_wbr_copyup_tdp - }, - [AuWbrCopyup_BUP] = { - .copyup = au_wbr_copyup_bup - }, - [AuWbrCopyup_BU] = { - .copyup = au_wbr_copyup_bu - } -}; - -struct au_wbr_create_operations au_wbr_create_ops[] = { - [AuWbrCreate_TDP] = { - .create = au_wbr_create_tdp - }, - [AuWbrCreate_RR] = { - .create = au_wbr_create_rr, - .init = au_wbr_create_init_rr - }, - [AuWbrCreate_MFS] = { - .create = au_wbr_create_mfs, - .init = au_wbr_create_init_mfs, - .fin = au_wbr_create_fin_mfs - }, - [AuWbrCreate_MFSV] = { - .create = au_wbr_create_mfs, - .init = au_wbr_create_init_mfs, - .fin = au_wbr_create_fin_mfs - }, - [AuWbrCreate_MFSRR] = { - .create = au_wbr_create_mfsrr, - .init = au_wbr_create_init_mfsrr, - .fin = au_wbr_create_fin_mfs - }, - [AuWbrCreate_MFSRRV] = { - .create = au_wbr_create_mfsrr, - .init = au_wbr_create_init_mfsrr, - .fin = au_wbr_create_fin_mfs - }, - [AuWbrCreate_PMFS] = { - .create = au_wbr_create_pmfs, - .init = au_wbr_create_init_mfs, - .fin = au_wbr_create_fin_mfs - }, - [AuWbrCreate_PMFSV] = { - .create = au_wbr_create_pmfs, - .init = au_wbr_create_init_mfs, - .fin = au_wbr_create_fin_mfs - }, - [AuWbrCreate_PMFSRR] = { - .create = au_wbr_create_pmfsrr, - .init = au_wbr_create_init_mfsrr, - .fin = au_wbr_create_fin_mfs - }, - [AuWbrCreate_PMFSRRV] = { - .create = au_wbr_create_pmfsrr, - .init = au_wbr_create_init_mfsrr, - .fin = au_wbr_create_fin_mfs - } -}; diff --git a/fs/aufs/whout.c b/fs/aufs/whout.c deleted file mode 100644 index c3fc2d4bd..000000000 --- a/fs/aufs/whout.c +++ /dev/null @@ -1,1063 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * whiteout for logical deletion and opaque directory - */ - -#include "aufs.h" - -#define WH_MASK S_IRUGO - -/* - * If a directory contains this file, then it is opaque. We start with the - * .wh. flag so that it is blocked by lookup. - */ -static struct qstr diropq_name = QSTR_INIT(AUFS_WH_DIROPQ, - sizeof(AUFS_WH_DIROPQ) - 1); - -/* - * generate whiteout name, which is NOT terminated by NULL. - * @name: original d_name.name - * @len: original d_name.len - * @wh: whiteout qstr - * returns zero when succeeds, otherwise error. - * succeeded value as wh->name should be freed by kfree(). - */ -int au_wh_name_alloc(struct qstr *wh, const struct qstr *name) -{ - char *p; - - if (unlikely(name->len > PATH_MAX - AUFS_WH_PFX_LEN)) - return -ENAMETOOLONG; - - wh->len = name->len + AUFS_WH_PFX_LEN; - p = kmalloc(wh->len, GFP_NOFS); - wh->name = p; - if (p) { - memcpy(p, AUFS_WH_PFX, AUFS_WH_PFX_LEN); - memcpy(p + AUFS_WH_PFX_LEN, name->name, name->len); - /* smp_mb(); */ - return 0; - } - return -ENOMEM; -} - -/* ---------------------------------------------------------------------- */ - -/* - * test if the @wh_name exists under @h_parent. - * @try_sio specifies the necessary of super-io. - */ -int au_wh_test(struct dentry *h_parent, struct qstr *wh_name, int try_sio) -{ - int err; - struct dentry *wh_dentry; - - if (!try_sio) - wh_dentry = vfsub_lkup_one(wh_name, h_parent); - else - wh_dentry = au_sio_lkup_one(wh_name, h_parent); - err = PTR_ERR(wh_dentry); - if (IS_ERR(wh_dentry)) { - if (err == -ENAMETOOLONG) - err = 0; - goto out; - } - - err = 0; - if (d_is_negative(wh_dentry)) - goto out_wh; /* success */ - - err = 1; - if (d_is_reg(wh_dentry)) - goto out_wh; /* success */ - - err = -EIO; - AuIOErr("%pd Invalid whiteout entry type 0%o.\n", - wh_dentry, d_inode(wh_dentry)->i_mode); - -out_wh: - dput(wh_dentry); -out: - return err; -} - -/* - * test if the @h_dentry sets opaque or not. - */ -int au_diropq_test(struct dentry *h_dentry) -{ - int err; - struct inode *h_dir; - - h_dir = d_inode(h_dentry); - err = au_wh_test(h_dentry, &diropq_name, - au_test_h_perm_sio(h_dir, MAY_EXEC)); - return err; -} - -/* - * returns a negative dentry whose name is unique and temporary. - */ -struct dentry *au_whtmp_lkup(struct dentry *h_parent, struct au_branch *br, - struct qstr *prefix) -{ - struct dentry *dentry; - int i; - char defname[NAME_MAX - AUFS_MAX_NAMELEN + DNAME_INLINE_LEN + 1], - *name, *p; - /* strict atomic_t is unnecessary here */ - static unsigned short cnt; - struct qstr qs; - - BUILD_BUG_ON(sizeof(cnt) * 2 > AUFS_WH_TMP_LEN); - - name = defname; - qs.len = sizeof(defname) - DNAME_INLINE_LEN + prefix->len - 1; - if (unlikely(prefix->len > DNAME_INLINE_LEN)) { - dentry = ERR_PTR(-ENAMETOOLONG); - if (unlikely(qs.len > NAME_MAX)) - goto out; - dentry = ERR_PTR(-ENOMEM); - name = kmalloc(qs.len + 1, GFP_NOFS); - if (unlikely(!name)) - goto out; - } - - /* doubly whiteout-ed */ - memcpy(name, AUFS_WH_PFX AUFS_WH_PFX, AUFS_WH_PFX_LEN * 2); - p = name + AUFS_WH_PFX_LEN * 2; - memcpy(p, prefix->name, prefix->len); - p += prefix->len; - *p++ = '.'; - AuDebugOn(name + qs.len + 1 - p <= AUFS_WH_TMP_LEN); - - qs.name = name; - for (i = 0; i < 3; i++) { - sprintf(p, "%.*x", AUFS_WH_TMP_LEN, cnt++); - dentry = au_sio_lkup_one(&qs, h_parent); - if (IS_ERR(dentry) || d_is_negative(dentry)) - goto out_name; - dput(dentry); - } - /* pr_warn("could not get random name\n"); */ - dentry = ERR_PTR(-EEXIST); - AuDbg("%.*s\n", AuLNPair(&qs)); - BUG(); - -out_name: - if (name != defname) - kfree(name); -out: - AuTraceErrPtr(dentry); - return dentry; -} - -/* - * rename the @h_dentry on @br to the whiteouted temporary name. - */ -int au_whtmp_ren(struct dentry *h_dentry, struct au_branch *br) -{ - int err; - struct path h_path = { - .mnt = au_br_mnt(br) - }; - struct inode *h_dir, *delegated; - struct dentry *h_parent; - - h_parent = h_dentry->d_parent; /* dir inode is locked */ - h_dir = d_inode(h_parent); - IMustLock(h_dir); - - h_path.dentry = au_whtmp_lkup(h_parent, br, &h_dentry->d_name); - err = PTR_ERR(h_path.dentry); - if (IS_ERR(h_path.dentry)) - goto out; - - /* under the same dir, no need to lock_rename() */ - delegated = NULL; - err = vfsub_rename(h_dir, h_dentry, h_dir, &h_path, &delegated); - AuTraceErr(err); - if (unlikely(err == -EWOULDBLOCK)) { - pr_warn("cannot retry for NFSv4 delegation" - " for an internal rename\n"); - iput(delegated); - } - dput(h_path.dentry); - -out: - AuTraceErr(err); - return err; -} - -/* ---------------------------------------------------------------------- */ -/* - * functions for removing a whiteout - */ - -static int do_unlink_wh(struct inode *h_dir, struct path *h_path) -{ - int err, force; - struct inode *delegated; - - /* - * forces superio when the dir has a sticky bit. - * this may be a violation of unix fs semantics. - */ - force = (h_dir->i_mode & S_ISVTX) - && !uid_eq(current_fsuid(), d_inode(h_path->dentry)->i_uid); - delegated = NULL; - err = vfsub_unlink(h_dir, h_path, &delegated, force); - if (unlikely(err == -EWOULDBLOCK)) { - pr_warn("cannot retry for NFSv4 delegation" - " for an internal unlink\n"); - iput(delegated); - } - return err; -} - -int au_wh_unlink_dentry(struct inode *h_dir, struct path *h_path, - struct dentry *dentry) -{ - int err; - - err = do_unlink_wh(h_dir, h_path); - if (!err && dentry) - au_set_dbwh(dentry, -1); - - return err; -} - -static int unlink_wh_name(struct dentry *h_parent, struct qstr *wh, - struct au_branch *br) -{ - int err; - struct path h_path = { - .mnt = au_br_mnt(br) - }; - - err = 0; - h_path.dentry = vfsub_lkup_one(wh, h_parent); - if (IS_ERR(h_path.dentry)) - err = PTR_ERR(h_path.dentry); - else { - if (d_is_reg(h_path.dentry)) - err = do_unlink_wh(d_inode(h_parent), &h_path); - dput(h_path.dentry); - } - - return err; -} - -/* ---------------------------------------------------------------------- */ -/* - * initialize/clean whiteout for a branch - */ - -static void au_wh_clean(struct inode *h_dir, struct path *whpath, - const int isdir) -{ - int err; - struct inode *delegated; - - if (d_is_negative(whpath->dentry)) - return; - - if (isdir) - err = vfsub_rmdir(h_dir, whpath); - else { - delegated = NULL; - err = vfsub_unlink(h_dir, whpath, &delegated, /*force*/0); - if (unlikely(err == -EWOULDBLOCK)) { - pr_warn("cannot retry for NFSv4 delegation" - " for an internal unlink\n"); - iput(delegated); - } - } - if (unlikely(err)) - pr_warn("failed removing %pd (%d), ignored.\n", - whpath->dentry, err); -} - -static int test_linkable(struct dentry *h_root) -{ - struct inode *h_dir = d_inode(h_root); - - if (h_dir->i_op->link) - return 0; - - pr_err("%pd (%s) doesn't support link(2), use noplink and rw+nolwh\n", - h_root, au_sbtype(h_root->d_sb)); - return -ENOSYS; -} - -/* todo: should this mkdir be done in /sbin/mount.aufs helper? */ -static int au_whdir(struct inode *h_dir, struct path *path) -{ - int err; - - err = -EEXIST; - if (d_is_negative(path->dentry)) { - int mode = S_IRWXU; - - if (au_test_nfs(path->dentry->d_sb)) - mode |= S_IXUGO; - err = vfsub_mkdir(h_dir, path, mode); - } else if (d_is_dir(path->dentry)) - err = 0; - else - pr_err("unknown %pd exists\n", path->dentry); - - return err; -} - -struct au_wh_base { - const struct qstr *name; - struct dentry *dentry; -}; - -static void au_wh_init_ro(struct inode *h_dir, struct au_wh_base base[], - struct path *h_path) -{ - h_path->dentry = base[AuBrWh_BASE].dentry; - au_wh_clean(h_dir, h_path, /*isdir*/0); - h_path->dentry = base[AuBrWh_PLINK].dentry; - au_wh_clean(h_dir, h_path, /*isdir*/1); - h_path->dentry = base[AuBrWh_ORPH].dentry; - au_wh_clean(h_dir, h_path, /*isdir*/1); -} - -/* - * returns tri-state, - * minus: error, caller should print the message - * zero: succuess - * plus: error, caller should NOT print the message - */ -static int au_wh_init_rw_nolink(struct dentry *h_root, struct au_wbr *wbr, - int do_plink, struct au_wh_base base[], - struct path *h_path) -{ - int err; - struct inode *h_dir; - - h_dir = d_inode(h_root); - h_path->dentry = base[AuBrWh_BASE].dentry; - au_wh_clean(h_dir, h_path, /*isdir*/0); - h_path->dentry = base[AuBrWh_PLINK].dentry; - if (do_plink) { - err = test_linkable(h_root); - if (unlikely(err)) { - err = 1; - goto out; - } - - err = au_whdir(h_dir, h_path); - if (unlikely(err)) - goto out; - wbr->wbr_plink = dget(base[AuBrWh_PLINK].dentry); - } else - au_wh_clean(h_dir, h_path, /*isdir*/1); - h_path->dentry = base[AuBrWh_ORPH].dentry; - err = au_whdir(h_dir, h_path); - if (unlikely(err)) - goto out; - wbr->wbr_orph = dget(base[AuBrWh_ORPH].dentry); - -out: - return err; -} - -/* - * for the moment, aufs supports the branch filesystem which does not support - * link(2). testing on FAT which does not support i_op->setattr() fully either, - * copyup failed. finally, such filesystem will not be used as the writable - * branch. - * - * returns tri-state, see above. - */ -static int au_wh_init_rw(struct dentry *h_root, struct au_wbr *wbr, - int do_plink, struct au_wh_base base[], - struct path *h_path) -{ - int err; - struct inode *h_dir; - - WbrWhMustWriteLock(wbr); - - err = test_linkable(h_root); - if (unlikely(err)) { - err = 1; - goto out; - } - - /* - * todo: should this create be done in /sbin/mount.aufs helper? - */ - err = -EEXIST; - h_dir = d_inode(h_root); - if (d_is_negative(base[AuBrWh_BASE].dentry)) { - h_path->dentry = base[AuBrWh_BASE].dentry; - err = vfsub_create(h_dir, h_path, WH_MASK, /*want_excl*/true); - } else if (d_is_reg(base[AuBrWh_BASE].dentry)) - err = 0; - else - pr_err("unknown %pd2 exists\n", base[AuBrWh_BASE].dentry); - if (unlikely(err)) - goto out; - - h_path->dentry = base[AuBrWh_PLINK].dentry; - if (do_plink) { - err = au_whdir(h_dir, h_path); - if (unlikely(err)) - goto out; - wbr->wbr_plink = dget(base[AuBrWh_PLINK].dentry); - } else - au_wh_clean(h_dir, h_path, /*isdir*/1); - wbr->wbr_whbase = dget(base[AuBrWh_BASE].dentry); - - h_path->dentry = base[AuBrWh_ORPH].dentry; - err = au_whdir(h_dir, h_path); - if (unlikely(err)) - goto out; - wbr->wbr_orph = dget(base[AuBrWh_ORPH].dentry); - -out: - return err; -} - -/* - * initialize the whiteout base file/dir for @br. - */ -int au_wh_init(struct au_branch *br, struct super_block *sb) -{ - int err, i; - const unsigned char do_plink - = !!au_opt_test(au_mntflags(sb), PLINK); - struct inode *h_dir; - struct path path = br->br_path; - struct dentry *h_root = path.dentry; - struct au_wbr *wbr = br->br_wbr; - static const struct qstr base_name[] = { - [AuBrWh_BASE] = QSTR_INIT(AUFS_BASE_NAME, - sizeof(AUFS_BASE_NAME) - 1), - [AuBrWh_PLINK] = QSTR_INIT(AUFS_PLINKDIR_NAME, - sizeof(AUFS_PLINKDIR_NAME) - 1), - [AuBrWh_ORPH] = QSTR_INIT(AUFS_ORPHDIR_NAME, - sizeof(AUFS_ORPHDIR_NAME) - 1) - }; - struct au_wh_base base[] = { - [AuBrWh_BASE] = { - .name = base_name + AuBrWh_BASE, - .dentry = NULL - }, - [AuBrWh_PLINK] = { - .name = base_name + AuBrWh_PLINK, - .dentry = NULL - }, - [AuBrWh_ORPH] = { - .name = base_name + AuBrWh_ORPH, - .dentry = NULL - } - }; - - if (wbr) - WbrWhMustWriteLock(wbr); - - for (i = 0; i < AuBrWh_Last; i++) { - /* doubly whiteouted */ - struct dentry *d; - - d = au_wh_lkup(h_root, (void *)base[i].name, br); - err = PTR_ERR(d); - if (IS_ERR(d)) - goto out; - - base[i].dentry = d; - AuDebugOn(wbr - && wbr->wbr_wh[i] - && wbr->wbr_wh[i] != base[i].dentry); - } - - if (wbr) - for (i = 0; i < AuBrWh_Last; i++) { - dput(wbr->wbr_wh[i]); - wbr->wbr_wh[i] = NULL; - } - - err = 0; - if (!au_br_writable(br->br_perm)) { - h_dir = d_inode(h_root); - au_wh_init_ro(h_dir, base, &path); - } else if (!au_br_wh_linkable(br->br_perm)) { - err = au_wh_init_rw_nolink(h_root, wbr, do_plink, base, &path); - if (err > 0) - goto out; - else if (err) - goto out_err; - } else { - err = au_wh_init_rw(h_root, wbr, do_plink, base, &path); - if (err > 0) - goto out; - else if (err) - goto out_err; - } - goto out; /* success */ - -out_err: - pr_err("an error(%d) on the writable branch %pd(%s)\n", - err, h_root, au_sbtype(h_root->d_sb)); -out: - for (i = 0; i < AuBrWh_Last; i++) - dput(base[i].dentry); - return err; -} - -/* ---------------------------------------------------------------------- */ -/* - * whiteouts are all hard-linked usually. - * when its link count reaches a ceiling, we create a new whiteout base - * asynchronously. - */ - -struct reinit_br_wh { - struct super_block *sb; - struct au_branch *br; -}; - -static void reinit_br_wh(void *arg) -{ - int err; - aufs_bindex_t bindex; - struct path h_path; - struct reinit_br_wh *a = arg; - struct au_wbr *wbr; - struct inode *dir, *delegated; - struct dentry *h_root; - struct au_hinode *hdir; - - err = 0; - wbr = a->br->br_wbr; - /* big aufs lock */ - si_noflush_write_lock(a->sb); - if (!au_br_writable(a->br->br_perm)) - goto out; - bindex = au_br_index(a->sb, a->br->br_id); - if (unlikely(bindex < 0)) - goto out; - - di_read_lock_parent(a->sb->s_root, AuLock_IR); - dir = d_inode(a->sb->s_root); - hdir = au_hi(dir, bindex); - h_root = au_h_dptr(a->sb->s_root, bindex); - AuDebugOn(h_root != au_br_dentry(a->br)); - - au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT); - wbr_wh_write_lock(wbr); - err = au_h_verify(wbr->wbr_whbase, au_opt_udba(a->sb), hdir->hi_inode, - h_root, a->br); - if (!err) { - h_path.dentry = wbr->wbr_whbase; - h_path.mnt = au_br_mnt(a->br); - delegated = NULL; - err = vfsub_unlink(hdir->hi_inode, &h_path, &delegated, - /*force*/0); - if (unlikely(err == -EWOULDBLOCK)) { - pr_warn("cannot retry for NFSv4 delegation" - " for an internal unlink\n"); - iput(delegated); - } - } else { - pr_warn("%pd is moved, ignored\n", wbr->wbr_whbase); - err = 0; - } - dput(wbr->wbr_whbase); - wbr->wbr_whbase = NULL; - if (!err) - err = au_wh_init(a->br, a->sb); - wbr_wh_write_unlock(wbr); - au_hn_imtx_unlock(hdir); - di_read_unlock(a->sb->s_root, AuLock_IR); - if (!err) - au_fhsm_wrote(a->sb, bindex, /*force*/0); - -out: - if (wbr) - atomic_dec(&wbr->wbr_wh_running); - atomic_dec(&a->br->br_count); - si_write_unlock(a->sb); - au_nwt_done(&au_sbi(a->sb)->si_nowait); - kfree(arg); - if (unlikely(err)) - AuIOErr("err %d\n", err); -} - -static void kick_reinit_br_wh(struct super_block *sb, struct au_branch *br) -{ - int do_dec, wkq_err; - struct reinit_br_wh *arg; - - do_dec = 1; - if (atomic_inc_return(&br->br_wbr->wbr_wh_running) != 1) - goto out; - - /* ignore ENOMEM */ - arg = kmalloc(sizeof(*arg), GFP_NOFS); - if (arg) { - /* - * dec(wh_running), kfree(arg) and dec(br_count) - * in reinit function - */ - arg->sb = sb; - arg->br = br; - atomic_inc(&br->br_count); - wkq_err = au_wkq_nowait(reinit_br_wh, arg, sb, /*flags*/0); - if (unlikely(wkq_err)) { - atomic_dec(&br->br_wbr->wbr_wh_running); - atomic_dec(&br->br_count); - kfree(arg); - } - do_dec = 0; - } - -out: - if (do_dec) - atomic_dec(&br->br_wbr->wbr_wh_running); -} - -/* ---------------------------------------------------------------------- */ - -/* - * create the whiteout @wh. - */ -static int link_or_create_wh(struct super_block *sb, aufs_bindex_t bindex, - struct dentry *wh) -{ - int err; - struct path h_path = { - .dentry = wh - }; - struct au_branch *br; - struct au_wbr *wbr; - struct dentry *h_parent; - struct inode *h_dir, *delegated; - - h_parent = wh->d_parent; /* dir inode is locked */ - h_dir = d_inode(h_parent); - IMustLock(h_dir); - - br = au_sbr(sb, bindex); - h_path.mnt = au_br_mnt(br); - wbr = br->br_wbr; - wbr_wh_read_lock(wbr); - if (wbr->wbr_whbase) { - delegated = NULL; - err = vfsub_link(wbr->wbr_whbase, h_dir, &h_path, &delegated); - if (unlikely(err == -EWOULDBLOCK)) { - pr_warn("cannot retry for NFSv4 delegation" - " for an internal link\n"); - iput(delegated); - } - if (!err || err != -EMLINK) - goto out; - - /* link count full. re-initialize br_whbase. */ - kick_reinit_br_wh(sb, br); - } - - /* return this error in this context */ - err = vfsub_create(h_dir, &h_path, WH_MASK, /*want_excl*/true); - if (!err) - au_fhsm_wrote(sb, bindex, /*force*/0); - -out: - wbr_wh_read_unlock(wbr); - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* - * create or remove the diropq. - */ -static struct dentry *do_diropq(struct dentry *dentry, aufs_bindex_t bindex, - unsigned int flags) -{ - struct dentry *opq_dentry, *h_dentry; - struct super_block *sb; - struct au_branch *br; - int err; - - sb = dentry->d_sb; - br = au_sbr(sb, bindex); - h_dentry = au_h_dptr(dentry, bindex); - opq_dentry = vfsub_lkup_one(&diropq_name, h_dentry); - if (IS_ERR(opq_dentry)) - goto out; - - if (au_ftest_diropq(flags, CREATE)) { - err = link_or_create_wh(sb, bindex, opq_dentry); - if (!err) { - au_set_dbdiropq(dentry, bindex); - goto out; /* success */ - } - } else { - struct path tmp = { - .dentry = opq_dentry, - .mnt = au_br_mnt(br) - }; - err = do_unlink_wh(au_h_iptr(d_inode(dentry), bindex), &tmp); - if (!err) - au_set_dbdiropq(dentry, -1); - } - dput(opq_dentry); - opq_dentry = ERR_PTR(err); - -out: - return opq_dentry; -} - -struct do_diropq_args { - struct dentry **errp; - struct dentry *dentry; - aufs_bindex_t bindex; - unsigned int flags; -}; - -static void call_do_diropq(void *args) -{ - struct do_diropq_args *a = args; - *a->errp = do_diropq(a->dentry, a->bindex, a->flags); -} - -struct dentry *au_diropq_sio(struct dentry *dentry, aufs_bindex_t bindex, - unsigned int flags) -{ - struct dentry *diropq, *h_dentry; - - h_dentry = au_h_dptr(dentry, bindex); - if (!au_test_h_perm_sio(d_inode(h_dentry), MAY_EXEC | MAY_WRITE)) - diropq = do_diropq(dentry, bindex, flags); - else { - int wkq_err; - struct do_diropq_args args = { - .errp = &diropq, - .dentry = dentry, - .bindex = bindex, - .flags = flags - }; - - wkq_err = au_wkq_wait(call_do_diropq, &args); - if (unlikely(wkq_err)) - diropq = ERR_PTR(wkq_err); - } - - return diropq; -} - -/* ---------------------------------------------------------------------- */ - -/* - * lookup whiteout dentry. - * @h_parent: lower parent dentry which must exist and be locked - * @base_name: name of dentry which will be whiteouted - * returns dentry for whiteout. - */ -struct dentry *au_wh_lkup(struct dentry *h_parent, struct qstr *base_name, - struct au_branch *br) -{ - int err; - struct qstr wh_name; - struct dentry *wh_dentry; - - err = au_wh_name_alloc(&wh_name, base_name); - wh_dentry = ERR_PTR(err); - if (!err) { - wh_dentry = vfsub_lkup_one(&wh_name, h_parent); - kfree(wh_name.name); - } - return wh_dentry; -} - -/* - * link/create a whiteout for @dentry on @bindex. - */ -struct dentry *au_wh_create(struct dentry *dentry, aufs_bindex_t bindex, - struct dentry *h_parent) -{ - struct dentry *wh_dentry; - struct super_block *sb; - int err; - - sb = dentry->d_sb; - wh_dentry = au_wh_lkup(h_parent, &dentry->d_name, au_sbr(sb, bindex)); - if (!IS_ERR(wh_dentry) && d_is_negative(wh_dentry)) { - err = link_or_create_wh(sb, bindex, wh_dentry); - if (!err) { - au_set_dbwh(dentry, bindex); - au_fhsm_wrote(sb, bindex, /*force*/0); - } else { - dput(wh_dentry); - wh_dentry = ERR_PTR(err); - } - } - - return wh_dentry; -} - -/* ---------------------------------------------------------------------- */ - -/* Delete all whiteouts in this directory on branch bindex. */ -static int del_wh_children(struct dentry *h_dentry, struct au_nhash *whlist, - aufs_bindex_t bindex, struct au_branch *br) -{ - int err; - unsigned long ul, n; - struct qstr wh_name; - char *p; - struct hlist_head *head; - struct au_vdir_wh *pos; - struct au_vdir_destr *str; - - err = -ENOMEM; - p = (void *)__get_free_page(GFP_NOFS); - wh_name.name = p; - if (unlikely(!wh_name.name)) - goto out; - - err = 0; - memcpy(p, AUFS_WH_PFX, AUFS_WH_PFX_LEN); - p += AUFS_WH_PFX_LEN; - n = whlist->nh_num; - head = whlist->nh_head; - for (ul = 0; !err && ul < n; ul++, head++) { - hlist_for_each_entry(pos, head, wh_hash) { - if (pos->wh_bindex != bindex) - continue; - - str = &pos->wh_str; - if (str->len + AUFS_WH_PFX_LEN <= PATH_MAX) { - memcpy(p, str->name, str->len); - wh_name.len = AUFS_WH_PFX_LEN + str->len; - err = unlink_wh_name(h_dentry, &wh_name, br); - if (!err) - continue; - break; - } - AuIOErr("whiteout name too long %.*s\n", - str->len, str->name); - err = -EIO; - break; - } - } - free_page((unsigned long)wh_name.name); - -out: - return err; -} - -struct del_wh_children_args { - int *errp; - struct dentry *h_dentry; - struct au_nhash *whlist; - aufs_bindex_t bindex; - struct au_branch *br; -}; - -static void call_del_wh_children(void *args) -{ - struct del_wh_children_args *a = args; - *a->errp = del_wh_children(a->h_dentry, a->whlist, a->bindex, a->br); -} - -/* ---------------------------------------------------------------------- */ - -struct au_whtmp_rmdir *au_whtmp_rmdir_alloc(struct super_block *sb, gfp_t gfp) -{ - struct au_whtmp_rmdir *whtmp; - int err; - unsigned int rdhash; - - SiMustAnyLock(sb); - - whtmp = kmalloc(sizeof(*whtmp), gfp); - if (unlikely(!whtmp)) { - whtmp = ERR_PTR(-ENOMEM); - goto out; - } - - whtmp->dir = NULL; - whtmp->br = NULL; - whtmp->wh_dentry = NULL; - /* no estimation for dir size */ - rdhash = au_sbi(sb)->si_rdhash; - if (!rdhash) - rdhash = AUFS_RDHASH_DEF; - err = au_nhash_alloc(&whtmp->whlist, rdhash, gfp); - if (unlikely(err)) { - kfree(whtmp); - whtmp = ERR_PTR(err); - } - -out: - return whtmp; -} - -void au_whtmp_rmdir_free(struct au_whtmp_rmdir *whtmp) -{ - if (whtmp->br) - atomic_dec(&whtmp->br->br_count); - dput(whtmp->wh_dentry); - iput(whtmp->dir); - au_nhash_wh_free(&whtmp->whlist); - kfree(whtmp); -} - -/* - * rmdir the whiteouted temporary named dir @h_dentry. - * @whlist: whiteouted children. - */ -int au_whtmp_rmdir(struct inode *dir, aufs_bindex_t bindex, - struct dentry *wh_dentry, struct au_nhash *whlist) -{ - int err; - unsigned int h_nlink; - struct path h_tmp; - struct inode *wh_inode, *h_dir; - struct au_branch *br; - - h_dir = d_inode(wh_dentry->d_parent); /* dir inode is locked */ - IMustLock(h_dir); - - br = au_sbr(dir->i_sb, bindex); - wh_inode = d_inode(wh_dentry); - mutex_lock_nested(&wh_inode->i_mutex, AuLsc_I_CHILD); - - /* - * someone else might change some whiteouts while we were sleeping. - * it means this whlist may have an obsoleted entry. - */ - if (!au_test_h_perm_sio(wh_inode, MAY_EXEC | MAY_WRITE)) - err = del_wh_children(wh_dentry, whlist, bindex, br); - else { - int wkq_err; - struct del_wh_children_args args = { - .errp = &err, - .h_dentry = wh_dentry, - .whlist = whlist, - .bindex = bindex, - .br = br - }; - - wkq_err = au_wkq_wait(call_del_wh_children, &args); - if (unlikely(wkq_err)) - err = wkq_err; - } - mutex_unlock(&wh_inode->i_mutex); - - if (!err) { - h_tmp.dentry = wh_dentry; - h_tmp.mnt = au_br_mnt(br); - h_nlink = h_dir->i_nlink; - err = vfsub_rmdir(h_dir, &h_tmp); - /* some fs doesn't change the parent nlink in some cases */ - h_nlink -= h_dir->i_nlink; - } - - if (!err) { - if (au_ibstart(dir) == bindex) { - /* todo: dir->i_mutex is necessary */ - au_cpup_attr_timesizes(dir); - if (h_nlink) - vfsub_drop_nlink(dir); - } - return 0; /* success */ - } - - pr_warn("failed removing %pd(%d), ignored\n", wh_dentry, err); - return err; -} - -static void call_rmdir_whtmp(void *args) -{ - int err; - aufs_bindex_t bindex; - struct au_whtmp_rmdir *a = args; - struct super_block *sb; - struct dentry *h_parent; - struct inode *h_dir; - struct au_hinode *hdir; - - /* rmdir by nfsd may cause deadlock with this i_mutex */ - /* mutex_lock(&a->dir->i_mutex); */ - err = -EROFS; - sb = a->dir->i_sb; - si_read_lock(sb, !AuLock_FLUSH); - if (!au_br_writable(a->br->br_perm)) - goto out; - bindex = au_br_index(sb, a->br->br_id); - if (unlikely(bindex < 0)) - goto out; - - err = -EIO; - ii_write_lock_parent(a->dir); - h_parent = dget_parent(a->wh_dentry); - h_dir = d_inode(h_parent); - hdir = au_hi(a->dir, bindex); - err = vfsub_mnt_want_write(au_br_mnt(a->br)); - if (unlikely(err)) - goto out_mnt; - au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT); - err = au_h_verify(a->wh_dentry, au_opt_udba(sb), h_dir, h_parent, - a->br); - if (!err) - err = au_whtmp_rmdir(a->dir, bindex, a->wh_dentry, &a->whlist); - au_hn_imtx_unlock(hdir); - vfsub_mnt_drop_write(au_br_mnt(a->br)); - -out_mnt: - dput(h_parent); - ii_write_unlock(a->dir); -out: - /* mutex_unlock(&a->dir->i_mutex); */ - au_whtmp_rmdir_free(a); - si_read_unlock(sb); - au_nwt_done(&au_sbi(sb)->si_nowait); - if (unlikely(err)) - AuIOErr("err %d\n", err); -} - -void au_whtmp_kick_rmdir(struct inode *dir, aufs_bindex_t bindex, - struct dentry *wh_dentry, struct au_whtmp_rmdir *args) -{ - int wkq_err; - struct super_block *sb; - - IMustLock(dir); - - /* all post-process will be done in do_rmdir_whtmp(). */ - sb = dir->i_sb; - args->dir = au_igrab(dir); - args->br = au_sbr(sb, bindex); - atomic_inc(&args->br->br_count); - args->wh_dentry = dget(wh_dentry); - wkq_err = au_wkq_nowait(call_rmdir_whtmp, args, sb, /*flags*/0); - if (unlikely(wkq_err)) { - pr_warn("rmdir error %pd (%d), ignored\n", wh_dentry, wkq_err); - au_whtmp_rmdir_free(args); - } -} diff --git a/fs/aufs/whout.h b/fs/aufs/whout.h deleted file mode 100644 index 81652ce2f..000000000 --- a/fs/aufs/whout.h +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * whiteout for logical deletion and opaque directory - */ - -#ifndef __AUFS_WHOUT_H__ -#define __AUFS_WHOUT_H__ - -#ifdef __KERNEL__ - -#include "dir.h" - -/* whout.c */ -int au_wh_name_alloc(struct qstr *wh, const struct qstr *name); -int au_wh_test(struct dentry *h_parent, struct qstr *wh_name, int try_sio); -int au_diropq_test(struct dentry *h_dentry); -struct au_branch; -struct dentry *au_whtmp_lkup(struct dentry *h_parent, struct au_branch *br, - struct qstr *prefix); -int au_whtmp_ren(struct dentry *h_dentry, struct au_branch *br); -int au_wh_unlink_dentry(struct inode *h_dir, struct path *h_path, - struct dentry *dentry); -int au_wh_init(struct au_branch *br, struct super_block *sb); - -/* diropq flags */ -#define AuDiropq_CREATE 1 -#define au_ftest_diropq(flags, name) ((flags) & AuDiropq_##name) -#define au_fset_diropq(flags, name) \ - do { (flags) |= AuDiropq_##name; } while (0) -#define au_fclr_diropq(flags, name) \ - do { (flags) &= ~AuDiropq_##name; } while (0) - -struct dentry *au_diropq_sio(struct dentry *dentry, aufs_bindex_t bindex, - unsigned int flags); -struct dentry *au_wh_lkup(struct dentry *h_parent, struct qstr *base_name, - struct au_branch *br); -struct dentry *au_wh_create(struct dentry *dentry, aufs_bindex_t bindex, - struct dentry *h_parent); - -/* real rmdir for the whiteout-ed dir */ -struct au_whtmp_rmdir { - struct inode *dir; - struct au_branch *br; - struct dentry *wh_dentry; - struct au_nhash whlist; -}; - -struct au_whtmp_rmdir *au_whtmp_rmdir_alloc(struct super_block *sb, gfp_t gfp); -void au_whtmp_rmdir_free(struct au_whtmp_rmdir *whtmp); -int au_whtmp_rmdir(struct inode *dir, aufs_bindex_t bindex, - struct dentry *wh_dentry, struct au_nhash *whlist); -void au_whtmp_kick_rmdir(struct inode *dir, aufs_bindex_t bindex, - struct dentry *wh_dentry, struct au_whtmp_rmdir *args); - -/* ---------------------------------------------------------------------- */ - -static inline struct dentry *au_diropq_create(struct dentry *dentry, - aufs_bindex_t bindex) -{ - return au_diropq_sio(dentry, bindex, AuDiropq_CREATE); -} - -static inline int au_diropq_remove(struct dentry *dentry, aufs_bindex_t bindex) -{ - return PTR_ERR(au_diropq_sio(dentry, bindex, !AuDiropq_CREATE)); -} - -#endif /* __KERNEL__ */ -#endif /* __AUFS_WHOUT_H__ */ diff --git a/fs/aufs/wkq.c b/fs/aufs/wkq.c deleted file mode 100644 index 3ffedae93..000000000 --- a/fs/aufs/wkq.c +++ /dev/null @@ -1,213 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * workqueue for asynchronous/super-io operations - * todo: try new dredential scheme - */ - -#include <linux/module.h> -#include "aufs.h" - -/* internal workqueue named AUFS_WKQ_NAME */ - -static struct workqueue_struct *au_wkq; - -struct au_wkinfo { - struct work_struct wk; - struct kobject *kobj; - - unsigned int flags; /* see wkq.h */ - - au_wkq_func_t func; - void *args; - - struct completion *comp; -}; - -/* ---------------------------------------------------------------------- */ - -static void wkq_func(struct work_struct *wk) -{ - struct au_wkinfo *wkinfo = container_of(wk, struct au_wkinfo, wk); - - AuDebugOn(!uid_eq(current_fsuid(), GLOBAL_ROOT_UID)); - AuDebugOn(rlimit(RLIMIT_FSIZE) != RLIM_INFINITY); - - wkinfo->func(wkinfo->args); - if (au_ftest_wkq(wkinfo->flags, WAIT)) - complete(wkinfo->comp); - else { - kobject_put(wkinfo->kobj); - module_put(THIS_MODULE); /* todo: ?? */ - kfree(wkinfo); - } -} - -/* - * Since struct completion is large, try allocating it dynamically. - */ -#if 1 /* defined(CONFIG_4KSTACKS) || defined(AuTest4KSTACKS) */ -#define AuWkqCompDeclare(name) struct completion *comp = NULL - -static int au_wkq_comp_alloc(struct au_wkinfo *wkinfo, struct completion **comp) -{ - *comp = kmalloc(sizeof(**comp), GFP_NOFS); - if (*comp) { - init_completion(*comp); - wkinfo->comp = *comp; - return 0; - } - return -ENOMEM; -} - -static void au_wkq_comp_free(struct completion *comp) -{ - kfree(comp); -} - -#else - -/* no braces */ -#define AuWkqCompDeclare(name) \ - DECLARE_COMPLETION_ONSTACK(_ ## name); \ - struct completion *comp = &_ ## name - -static int au_wkq_comp_alloc(struct au_wkinfo *wkinfo, struct completion **comp) -{ - wkinfo->comp = *comp; - return 0; -} - -static void au_wkq_comp_free(struct completion *comp __maybe_unused) -{ - /* empty */ -} -#endif /* 4KSTACKS */ - -static void au_wkq_run(struct au_wkinfo *wkinfo) -{ - if (au_ftest_wkq(wkinfo->flags, NEST)) { - if (au_wkq_test()) { - AuWarn1("wkq from wkq, unless silly-rename on NFS," - " due to a dead dir by UDBA?\n"); - AuDebugOn(au_ftest_wkq(wkinfo->flags, WAIT)); - } - } else - au_dbg_verify_kthread(); - - if (au_ftest_wkq(wkinfo->flags, WAIT)) { - INIT_WORK_ONSTACK(&wkinfo->wk, wkq_func); - queue_work(au_wkq, &wkinfo->wk); - } else { - INIT_WORK(&wkinfo->wk, wkq_func); - schedule_work(&wkinfo->wk); - } -} - -/* - * Be careful. It is easy to make deadlock happen. - * processA: lock, wkq and wait - * processB: wkq and wait, lock in wkq - * --> deadlock - */ -int au_wkq_do_wait(unsigned int flags, au_wkq_func_t func, void *args) -{ - int err; - AuWkqCompDeclare(comp); - struct au_wkinfo wkinfo = { - .flags = flags, - .func = func, - .args = args - }; - - err = au_wkq_comp_alloc(&wkinfo, &comp); - if (!err) { - au_wkq_run(&wkinfo); - /* no timeout, no interrupt */ - wait_for_completion(wkinfo.comp); - au_wkq_comp_free(comp); - destroy_work_on_stack(&wkinfo.wk); - } - - return err; - -} - -/* - * Note: dget/dput() in func for aufs dentries are not supported. It will be a - * problem in a concurrent umounting. - */ -int au_wkq_nowait(au_wkq_func_t func, void *args, struct super_block *sb, - unsigned int flags) -{ - int err; - struct au_wkinfo *wkinfo; - - atomic_inc(&au_sbi(sb)->si_nowait.nw_len); - - /* - * wkq_func() must free this wkinfo. - * it highly depends upon the implementation of workqueue. - */ - err = 0; - wkinfo = kmalloc(sizeof(*wkinfo), GFP_NOFS); - if (wkinfo) { - wkinfo->kobj = &au_sbi(sb)->si_kobj; - wkinfo->flags = flags & ~AuWkq_WAIT; - wkinfo->func = func; - wkinfo->args = args; - wkinfo->comp = NULL; - kobject_get(wkinfo->kobj); - __module_get(THIS_MODULE); /* todo: ?? */ - - au_wkq_run(wkinfo); - } else { - err = -ENOMEM; - au_nwt_done(&au_sbi(sb)->si_nowait); - } - - return err; -} - -/* ---------------------------------------------------------------------- */ - -void au_nwt_init(struct au_nowait_tasks *nwt) -{ - atomic_set(&nwt->nw_len, 0); - /* smp_mb(); */ /* atomic_set */ - init_waitqueue_head(&nwt->nw_wq); -} - -void au_wkq_fin(void) -{ - destroy_workqueue(au_wkq); -} - -int __init au_wkq_init(void) -{ - int err; - - err = 0; - au_wkq = alloc_workqueue(AUFS_WKQ_NAME, 0, WQ_DFL_ACTIVE); - if (IS_ERR(au_wkq)) - err = PTR_ERR(au_wkq); - else if (!au_wkq) - err = -ENOMEM; - - return err; -} diff --git a/fs/aufs/wkq.h b/fs/aufs/wkq.h deleted file mode 100644 index e57bfb3d9..000000000 --- a/fs/aufs/wkq.h +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * workqueue for asynchronous/super-io operations - * todo: try new credentials management scheme - */ - -#ifndef __AUFS_WKQ_H__ -#define __AUFS_WKQ_H__ - -#ifdef __KERNEL__ - -struct super_block; - -/* ---------------------------------------------------------------------- */ - -/* - * in the next operation, wait for the 'nowait' tasks in system-wide workqueue - */ -struct au_nowait_tasks { - atomic_t nw_len; - wait_queue_head_t nw_wq; -}; - -/* ---------------------------------------------------------------------- */ - -typedef void (*au_wkq_func_t)(void *args); - -/* wkq flags */ -#define AuWkq_WAIT 1 -#define AuWkq_NEST (1 << 1) -#define au_ftest_wkq(flags, name) ((flags) & AuWkq_##name) -#define au_fset_wkq(flags, name) \ - do { (flags) |= AuWkq_##name; } while (0) -#define au_fclr_wkq(flags, name) \ - do { (flags) &= ~AuWkq_##name; } while (0) - -#ifndef CONFIG_AUFS_HNOTIFY -#undef AuWkq_NEST -#define AuWkq_NEST 0 -#endif - -/* wkq.c */ -int au_wkq_do_wait(unsigned int flags, au_wkq_func_t func, void *args); -int au_wkq_nowait(au_wkq_func_t func, void *args, struct super_block *sb, - unsigned int flags); -void au_nwt_init(struct au_nowait_tasks *nwt); -int __init au_wkq_init(void); -void au_wkq_fin(void); - -/* ---------------------------------------------------------------------- */ - -static inline int au_wkq_test(void) -{ - return current->flags & PF_WQ_WORKER; -} - -static inline int au_wkq_wait(au_wkq_func_t func, void *args) -{ - return au_wkq_do_wait(AuWkq_WAIT, func, args); -} - -static inline void au_nwt_done(struct au_nowait_tasks *nwt) -{ - if (atomic_dec_and_test(&nwt->nw_len)) - wake_up_all(&nwt->nw_wq); -} - -static inline int au_nwt_flush(struct au_nowait_tasks *nwt) -{ - wait_event(nwt->nw_wq, !atomic_read(&nwt->nw_len)); - return 0; -} - -#endif /* __KERNEL__ */ -#endif /* __AUFS_WKQ_H__ */ diff --git a/fs/aufs/xattr.c b/fs/aufs/xattr.c deleted file mode 100644 index b7e13ca3e..000000000 --- a/fs/aufs/xattr.c +++ /dev/null @@ -1,344 +0,0 @@ -/* - * Copyright (C) 2014-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * handling xattr functions - */ - -#include <linux/xattr.h> -#include "aufs.h" - -static int au_xattr_ignore(int err, char *name, unsigned int ignore_flags) -{ - if (!ignore_flags) - goto out; - switch (err) { - case -ENOMEM: - case -EDQUOT: - goto out; - } - - if ((ignore_flags & AuBrAttr_ICEX) == AuBrAttr_ICEX) { - err = 0; - goto out; - } - -#define cmp(brattr, prefix) do { \ - if (!strncmp(name, XATTR_##prefix##_PREFIX, \ - XATTR_##prefix##_PREFIX_LEN)) { \ - if (ignore_flags & AuBrAttr_ICEX_##brattr) \ - err = 0; \ - goto out; \ - } \ - } while (0) - - cmp(SEC, SECURITY); - cmp(SYS, SYSTEM); - cmp(TR, TRUSTED); - cmp(USR, USER); -#undef cmp - - if (ignore_flags & AuBrAttr_ICEX_OTH) - err = 0; - -out: - return err; -} - -static const int au_xattr_out_of_list = AuBrAttr_ICEX_OTH << 1; - -static int au_do_cpup_xattr(struct dentry *h_dst, struct dentry *h_src, - char *name, char **buf, unsigned int ignore_flags, - unsigned int verbose) -{ - int err; - ssize_t ssz; - struct inode *h_idst; - - ssz = vfs_getxattr_alloc(h_src, name, buf, 0, GFP_NOFS); - err = ssz; - if (unlikely(err <= 0)) { - if (err == -ENODATA - || (err == -EOPNOTSUPP - && ((ignore_flags & au_xattr_out_of_list) - || (au_test_nfs_noacl(d_inode(h_src)) - && (!strcmp(name, XATTR_NAME_POSIX_ACL_ACCESS) - || !strcmp(name, - XATTR_NAME_POSIX_ACL_DEFAULT)))) - )) - err = 0; - if (err && (verbose || au_debug_test())) - pr_err("%s, err %d\n", name, err); - goto out; - } - - /* unlock it temporary */ - h_idst = d_inode(h_dst); - mutex_unlock(&h_idst->i_mutex); - err = vfsub_setxattr(h_dst, name, *buf, ssz, /*flags*/0); - mutex_lock_nested(&h_idst->i_mutex, AuLsc_I_CHILD2); - if (unlikely(err)) { - if (verbose || au_debug_test()) - pr_err("%s, err %d\n", name, err); - err = au_xattr_ignore(err, name, ignore_flags); - } - -out: - return err; -} - -int au_cpup_xattr(struct dentry *h_dst, struct dentry *h_src, int ignore_flags, - unsigned int verbose) -{ - int err, unlocked, acl_access, acl_default; - ssize_t ssz; - struct inode *h_isrc, *h_idst; - char *value, *p, *o, *e; - - /* try stopping to update the source inode while we are referencing */ - /* there should not be the parent-child relationship between them */ - h_isrc = d_inode(h_src); - h_idst = d_inode(h_dst); - mutex_unlock(&h_idst->i_mutex); - mutex_lock_nested(&h_isrc->i_mutex, AuLsc_I_CHILD); - mutex_lock_nested(&h_idst->i_mutex, AuLsc_I_CHILD2); - unlocked = 0; - - /* some filesystems don't list POSIX ACL, for example tmpfs */ - ssz = vfs_listxattr(h_src, NULL, 0); - err = ssz; - if (unlikely(err < 0)) { - AuTraceErr(err); - if (err == -ENODATA - || err == -EOPNOTSUPP) - err = 0; /* ignore */ - goto out; - } - - err = 0; - p = NULL; - o = NULL; - if (ssz) { - err = -ENOMEM; - p = kmalloc(ssz, GFP_NOFS); - o = p; - if (unlikely(!p)) - goto out; - err = vfs_listxattr(h_src, p, ssz); - } - mutex_unlock(&h_isrc->i_mutex); - unlocked = 1; - AuDbg("err %d, ssz %zd\n", err, ssz); - if (unlikely(err < 0)) - goto out_free; - - err = 0; - e = p + ssz; - value = NULL; - acl_access = 0; - acl_default = 0; - while (!err && p < e) { - acl_access |= !strncmp(p, XATTR_NAME_POSIX_ACL_ACCESS, - sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1); - acl_default |= !strncmp(p, XATTR_NAME_POSIX_ACL_DEFAULT, - sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - - 1); - err = au_do_cpup_xattr(h_dst, h_src, p, &value, ignore_flags, - verbose); - p += strlen(p) + 1; - } - AuTraceErr(err); - ignore_flags |= au_xattr_out_of_list; - if (!err && !acl_access) { - err = au_do_cpup_xattr(h_dst, h_src, - XATTR_NAME_POSIX_ACL_ACCESS, &value, - ignore_flags, verbose); - AuTraceErr(err); - } - if (!err && !acl_default) { - err = au_do_cpup_xattr(h_dst, h_src, - XATTR_NAME_POSIX_ACL_DEFAULT, &value, - ignore_flags, verbose); - AuTraceErr(err); - } - - kfree(value); - -out_free: - kfree(o); -out: - if (!unlocked) - mutex_unlock(&h_isrc->i_mutex); - AuTraceErr(err); - return err; -} - -/* ---------------------------------------------------------------------- */ - -enum { - AU_XATTR_LIST, - AU_XATTR_GET -}; - -struct au_lgxattr { - int type; - union { - struct { - char *list; - size_t size; - } list; - struct { - const char *name; - void *value; - size_t size; - } get; - } u; -}; - -static ssize_t au_lgxattr(struct dentry *dentry, struct au_lgxattr *arg) -{ - ssize_t err; - struct path h_path; - struct super_block *sb; - - sb = dentry->d_sb; - err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM); - if (unlikely(err)) - goto out; - err = au_h_path_getattr(dentry, /*force*/1, &h_path); - if (unlikely(err)) - goto out_si; - if (unlikely(!h_path.dentry)) - /* illegally overlapped or something */ - goto out_di; /* pretending success */ - - /* always topmost entry only */ - switch (arg->type) { - case AU_XATTR_LIST: - err = vfs_listxattr(h_path.dentry, - arg->u.list.list, arg->u.list.size); - break; - case AU_XATTR_GET: - err = vfs_getxattr(h_path.dentry, - arg->u.get.name, arg->u.get.value, - arg->u.get.size); - break; - } - -out_di: - di_read_unlock(dentry, AuLock_IR); -out_si: - si_read_unlock(sb); -out: - AuTraceErr(err); - return err; -} - -ssize_t aufs_listxattr(struct dentry *dentry, char *list, size_t size) -{ - struct au_lgxattr arg = { - .type = AU_XATTR_LIST, - .u.list = { - .list = list, - .size = size - }, - }; - - return au_lgxattr(dentry, &arg); -} - -ssize_t aufs_getxattr(struct dentry *dentry, const char *name, void *value, - size_t size) -{ - struct au_lgxattr arg = { - .type = AU_XATTR_GET, - .u.get = { - .name = name, - .value = value, - .size = size - }, - }; - - return au_lgxattr(dentry, &arg); -} - -int aufs_setxattr(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags) -{ - struct au_srxattr arg = { - .type = AU_XATTR_SET, - .u.set = { - .name = name, - .value = value, - .size = size, - .flags = flags - }, - }; - - return au_srxattr(dentry, &arg); -} - -int aufs_removexattr(struct dentry *dentry, const char *name) -{ - struct au_srxattr arg = { - .type = AU_XATTR_REMOVE, - .u.remove = { - .name = name - }, - }; - - return au_srxattr(dentry, &arg); -} - -/* ---------------------------------------------------------------------- */ - -#if 0 -static size_t au_xattr_list(struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len, int type) -{ - return aufs_listxattr(dentry, list, list_size); -} - -static int au_xattr_get(struct dentry *dentry, const char *name, void *buffer, - size_t size, int type) -{ - return aufs_getxattr(dentry, name, buffer, size); -} - -static int au_xattr_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) -{ - return aufs_setxattr(dentry, name, value, size, flags); -} - -static const struct xattr_handler au_xattr_handler = { - /* no prefix, no flags */ - .list = au_xattr_list, - .get = au_xattr_get, - .set = au_xattr_set - /* why no remove? */ -}; - -static const struct xattr_handler *au_xattr_handlers[] = { - &au_xattr_handler -}; - -void au_xattr_init(struct super_block *sb) -{ - /* sb->s_xattr = au_xattr_handlers; */ -} -#endif diff --git a/fs/aufs/xino.c b/fs/aufs/xino.c deleted file mode 100644 index 053594410..000000000 --- a/fs/aufs/xino.c +++ /dev/null @@ -1,1297 +0,0 @@ -/* - * Copyright (C) 2005-2015 Junjiro R. Okajima - * - * This program, aufs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program. If not, see <http://www.gnu.org/licenses/>. - */ - -/* - * external inode number translation table and bitmap - */ - -#include <linux/seq_file.h> -#include <linux/statfs.h> -#include "aufs.h" - -/* todo: unnecessary to support mmap_sem since kernel-space? */ -ssize_t xino_fread(vfs_readf_t func, struct file *file, void *kbuf, size_t size, - loff_t *pos) -{ - ssize_t err; - mm_segment_t oldfs; - union { - void *k; - char __user *u; - } buf; - - buf.k = kbuf; - oldfs = get_fs(); - set_fs(KERNEL_DS); - do { - /* todo: signal_pending? */ - err = func(file, buf.u, size, pos); - } while (err == -EAGAIN || err == -EINTR); - set_fs(oldfs); - -#if 0 /* reserved for future use */ - if (err > 0) - fsnotify_access(file->f_path.dentry); -#endif - - return err; -} - -/* ---------------------------------------------------------------------- */ - -static ssize_t do_xino_fwrite(vfs_writef_t func, struct file *file, void *kbuf, - size_t size, loff_t *pos) -{ - ssize_t err; - mm_segment_t oldfs; - union { - void *k; - const char __user *u; - } buf; - - buf.k = kbuf; - oldfs = get_fs(); - set_fs(KERNEL_DS); - do { - /* todo: signal_pending? */ - err = func(file, buf.u, size, pos); - } while (err == -EAGAIN || err == -EINTR); - set_fs(oldfs); - -#if 0 /* reserved for future use */ - if (err > 0) - fsnotify_modify(file->f_path.dentry); -#endif - - return err; -} - -struct do_xino_fwrite_args { - ssize_t *errp; - vfs_writef_t func; - struct file *file; - void *buf; - size_t size; - loff_t *pos; -}; - -static void call_do_xino_fwrite(void *args) -{ - struct do_xino_fwrite_args *a = args; - *a->errp = do_xino_fwrite(a->func, a->file, a->buf, a->size, a->pos); -} - -ssize_t xino_fwrite(vfs_writef_t func, struct file *file, void *buf, - size_t size, loff_t *pos) -{ - ssize_t err; - - /* todo: signal block and no wkq? */ - if (rlimit(RLIMIT_FSIZE) == RLIM_INFINITY) { - lockdep_off(); - err = do_xino_fwrite(func, file, buf, size, pos); - lockdep_on(); - } else { - /* - * it breaks RLIMIT_FSIZE and normal user's limit, - * users should care about quota and real 'filesystem full.' - */ - int wkq_err; - struct do_xino_fwrite_args args = { - .errp = &err, - .func = func, - .file = file, - .buf = buf, - .size = size, - .pos = pos - }; - - wkq_err = au_wkq_wait(call_do_xino_fwrite, &args); - if (unlikely(wkq_err)) - err = wkq_err; - } - - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* - * create a new xinofile at the same place/path as @base_file. - */ -struct file *au_xino_create2(struct file *base_file, struct file *copy_src) -{ - struct file *file; - struct dentry *base, *parent; - struct inode *dir, *delegated; - struct qstr *name; - struct path path; - int err; - - base = base_file->f_path.dentry; - parent = base->d_parent; /* dir inode is locked */ - dir = d_inode(parent); - IMustLock(dir); - - file = ERR_PTR(-EINVAL); - name = &base->d_name; - path.dentry = vfsub_lookup_one_len(name->name, parent, name->len); - if (IS_ERR(path.dentry)) { - file = (void *)path.dentry; - pr_err("%pd lookup err %ld\n", - base, PTR_ERR(path.dentry)); - goto out; - } - - /* no need to mnt_want_write() since we call dentry_open() later */ - err = vfs_create(dir, path.dentry, S_IRUGO | S_IWUGO, NULL); - if (unlikely(err)) { - file = ERR_PTR(err); - pr_err("%pd create err %d\n", base, err); - goto out_dput; - } - - path.mnt = base_file->f_path.mnt; - file = vfsub_dentry_open(&path, - O_RDWR | O_CREAT | O_EXCL | O_LARGEFILE - /* | __FMODE_NONOTIFY */); - if (IS_ERR(file)) { - pr_err("%pd open err %ld\n", base, PTR_ERR(file)); - goto out_dput; - } - - delegated = NULL; - err = vfsub_unlink(dir, &file->f_path, &delegated, /*force*/0); - if (unlikely(err == -EWOULDBLOCK)) { - pr_warn("cannot retry for NFSv4 delegation" - " for an internal unlink\n"); - iput(delegated); - } - if (unlikely(err)) { - pr_err("%pd unlink err %d\n", base, err); - goto out_fput; - } - - if (copy_src) { - /* no one can touch copy_src xino */ - err = au_copy_file(file, copy_src, vfsub_f_size_read(copy_src)); - if (unlikely(err)) { - pr_err("%pd copy err %d\n", base, err); - goto out_fput; - } - } - goto out_dput; /* success */ - -out_fput: - fput(file); - file = ERR_PTR(err); -out_dput: - dput(path.dentry); -out: - return file; -} - -struct au_xino_lock_dir { - struct au_hinode *hdir; - struct dentry *parent; - struct mutex *mtx; -}; - -static void au_xino_lock_dir(struct super_block *sb, struct file *xino, - struct au_xino_lock_dir *ldir) -{ - aufs_bindex_t brid, bindex; - - ldir->hdir = NULL; - bindex = -1; - brid = au_xino_brid(sb); - if (brid >= 0) - bindex = au_br_index(sb, brid); - if (bindex >= 0) { - ldir->hdir = au_hi(d_inode(sb->s_root), bindex); - au_hn_imtx_lock_nested(ldir->hdir, AuLsc_I_PARENT); - } else { - ldir->parent = dget_parent(xino->f_path.dentry); - ldir->mtx = &d_inode(ldir->parent)->i_mutex; - mutex_lock_nested(ldir->mtx, AuLsc_I_PARENT); - } -} - -static void au_xino_unlock_dir(struct au_xino_lock_dir *ldir) -{ - if (ldir->hdir) - au_hn_imtx_unlock(ldir->hdir); - else { - mutex_unlock(ldir->mtx); - dput(ldir->parent); - } -} - -/* ---------------------------------------------------------------------- */ - -/* trucate xino files asynchronously */ - -int au_xino_trunc(struct super_block *sb, aufs_bindex_t bindex) -{ - int err; - unsigned long jiffy; - blkcnt_t blocks; - aufs_bindex_t bi, bend; - struct kstatfs *st; - struct au_branch *br; - struct file *new_xino, *file; - struct super_block *h_sb; - struct au_xino_lock_dir ldir; - - err = -ENOMEM; - st = kzalloc(sizeof(*st), GFP_NOFS); - if (unlikely(!st)) - goto out; - - err = -EINVAL; - bend = au_sbend(sb); - if (unlikely(bindex < 0 || bend < bindex)) - goto out_st; - br = au_sbr(sb, bindex); - file = br->br_xino.xi_file; - if (!file) - goto out_st; - - err = vfs_statfs(&file->f_path, st); - if (unlikely(err)) - AuErr1("statfs err %d, ignored\n", err); - jiffy = jiffies; - blocks = file_inode(file)->i_blocks; - pr_info("begin truncating xino(b%d), ib%llu, %llu/%llu free blks\n", - bindex, (u64)blocks, st->f_bfree, st->f_blocks); - - au_xino_lock_dir(sb, file, &ldir); - /* mnt_want_write() is unnecessary here */ - new_xino = au_xino_create2(file, file); - au_xino_unlock_dir(&ldir); - err = PTR_ERR(new_xino); - if (IS_ERR(new_xino)) { - pr_err("err %d, ignored\n", err); - goto out_st; - } - err = 0; - fput(file); - br->br_xino.xi_file = new_xino; - - h_sb = au_br_sb(br); - for (bi = 0; bi <= bend; bi++) { - if (unlikely(bi == bindex)) - continue; - br = au_sbr(sb, bi); - if (au_br_sb(br) != h_sb) - continue; - - fput(br->br_xino.xi_file); - br->br_xino.xi_file = new_xino; - get_file(new_xino); - } - - err = vfs_statfs(&new_xino->f_path, st); - if (!err) { - pr_info("end truncating xino(b%d), ib%llu, %llu/%llu free blks\n", - bindex, (u64)file_inode(new_xino)->i_blocks, - st->f_bfree, st->f_blocks); - if (file_inode(new_xino)->i_blocks < blocks) - au_sbi(sb)->si_xino_jiffy = jiffy; - } else - AuErr1("statfs err %d, ignored\n", err); - -out_st: - kfree(st); -out: - return err; -} - -struct xino_do_trunc_args { - struct super_block *sb; - struct au_branch *br; -}; - -static void xino_do_trunc(void *_args) -{ - struct xino_do_trunc_args *args = _args; - struct super_block *sb; - struct au_branch *br; - struct inode *dir; - int err; - aufs_bindex_t bindex; - - err = 0; - sb = args->sb; - dir = d_inode(sb->s_root); - br = args->br; - - si_noflush_write_lock(sb); - ii_read_lock_parent(dir); - bindex = au_br_index(sb, br->br_id); - err = au_xino_trunc(sb, bindex); - ii_read_unlock(dir); - if (unlikely(err)) - pr_warn("err b%d, (%d)\n", bindex, err); - atomic_dec(&br->br_xino_running); - atomic_dec(&br->br_count); - si_write_unlock(sb); - au_nwt_done(&au_sbi(sb)->si_nowait); - kfree(args); -} - -static int xino_trunc_test(struct super_block *sb, struct au_branch *br) -{ - int err; - struct kstatfs st; - struct au_sbinfo *sbinfo; - - /* todo: si_xino_expire and the ratio should be customizable */ - sbinfo = au_sbi(sb); - if (time_before(jiffies, - sbinfo->si_xino_jiffy + sbinfo->si_xino_expire)) - return 0; - - /* truncation border */ - err = vfs_statfs(&br->br_xino.xi_file->f_path, &st); - if (unlikely(err)) { - AuErr1("statfs err %d, ignored\n", err); - return 0; - } - if (div64_u64(st.f_bfree * 100, st.f_blocks) >= AUFS_XINO_DEF_TRUNC) - return 0; - - return 1; -} - -static void xino_try_trunc(struct super_block *sb, struct au_branch *br) -{ - struct xino_do_trunc_args *args; - int wkq_err; - - if (!xino_trunc_test(sb, br)) - return; - - if (atomic_inc_return(&br->br_xino_running) > 1) - goto out; - - /* lock and kfree() will be called in trunc_xino() */ - args = kmalloc(sizeof(*args), GFP_NOFS); - if (unlikely(!args)) { - AuErr1("no memory\n"); - goto out_args; - } - - atomic_inc(&br->br_count); - args->sb = sb; - args->br = br; - wkq_err = au_wkq_nowait(xino_do_trunc, args, sb, /*flags*/0); - if (!wkq_err) - return; /* success */ - - pr_err("wkq %d\n", wkq_err); - atomic_dec(&br->br_count); - -out_args: - kfree(args); -out: - atomic_dec(&br->br_xino_running); -} - -/* ---------------------------------------------------------------------- */ - -static int au_xino_do_write(vfs_writef_t write, struct file *file, - ino_t h_ino, ino_t ino) -{ - loff_t pos; - ssize_t sz; - - pos = h_ino; - if (unlikely(au_loff_max / sizeof(ino) - 1 < pos)) { - AuIOErr1("too large hi%lu\n", (unsigned long)h_ino); - return -EFBIG; - } - pos *= sizeof(ino); - sz = xino_fwrite(write, file, &ino, sizeof(ino), &pos); - if (sz == sizeof(ino)) - return 0; /* success */ - - AuIOErr("write failed (%zd)\n", sz); - return -EIO; -} - -/* - * write @ino to the xinofile for the specified branch{@sb, @bindex} - * at the position of @h_ino. - * even if @ino is zero, it is written to the xinofile and means no entry. - * if the size of the xino file on a specific filesystem exceeds the watermark, - * try truncating it. - */ -int au_xino_write(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, - ino_t ino) -{ - int err; - unsigned int mnt_flags; - struct au_branch *br; - - BUILD_BUG_ON(sizeof(long long) != sizeof(au_loff_max) - || ((loff_t)-1) > 0); - SiMustAnyLock(sb); - - mnt_flags = au_mntflags(sb); - if (!au_opt_test(mnt_flags, XINO)) - return 0; - - br = au_sbr(sb, bindex); - err = au_xino_do_write(au_sbi(sb)->si_xwrite, br->br_xino.xi_file, - h_ino, ino); - if (!err) { - if (au_opt_test(mnt_flags, TRUNC_XINO) - && au_test_fs_trunc_xino(au_br_sb(br))) - xino_try_trunc(sb, br); - return 0; /* success */ - } - - AuIOErr("write failed (%d)\n", err); - return -EIO; -} - -/* ---------------------------------------------------------------------- */ - -/* aufs inode number bitmap */ - -static const int page_bits = (int)PAGE_SIZE * BITS_PER_BYTE; -static ino_t xib_calc_ino(unsigned long pindex, int bit) -{ - ino_t ino; - - AuDebugOn(bit < 0 || page_bits <= bit); - ino = AUFS_FIRST_INO + pindex * page_bits + bit; - return ino; -} - -static void xib_calc_bit(ino_t ino, unsigned long *pindex, int *bit) -{ - AuDebugOn(ino < AUFS_FIRST_INO); - ino -= AUFS_FIRST_INO; - *pindex = ino / page_bits; - *bit = ino % page_bits; -} - -static int xib_pindex(struct super_block *sb, unsigned long pindex) -{ - int err; - loff_t pos; - ssize_t sz; - struct au_sbinfo *sbinfo; - struct file *xib; - unsigned long *p; - - sbinfo = au_sbi(sb); - MtxMustLock(&sbinfo->si_xib_mtx); - AuDebugOn(pindex > ULONG_MAX / PAGE_SIZE - || !au_opt_test(sbinfo->si_mntflags, XINO)); - - if (pindex == sbinfo->si_xib_last_pindex) - return 0; - - xib = sbinfo->si_xib; - p = sbinfo->si_xib_buf; - pos = sbinfo->si_xib_last_pindex; - pos *= PAGE_SIZE; - sz = xino_fwrite(sbinfo->si_xwrite, xib, p, PAGE_SIZE, &pos); - if (unlikely(sz != PAGE_SIZE)) - goto out; - - pos = pindex; - pos *= PAGE_SIZE; - if (vfsub_f_size_read(xib) >= pos + PAGE_SIZE) - sz = xino_fread(sbinfo->si_xread, xib, p, PAGE_SIZE, &pos); - else { - memset(p, 0, PAGE_SIZE); - sz = xino_fwrite(sbinfo->si_xwrite, xib, p, PAGE_SIZE, &pos); - } - if (sz == PAGE_SIZE) { - sbinfo->si_xib_last_pindex = pindex; - return 0; /* success */ - } - -out: - AuIOErr1("write failed (%zd)\n", sz); - err = sz; - if (sz >= 0) - err = -EIO; - return err; -} - -/* ---------------------------------------------------------------------- */ - -static void au_xib_clear_bit(struct inode *inode) -{ - int err, bit; - unsigned long pindex; - struct super_block *sb; - struct au_sbinfo *sbinfo; - - AuDebugOn(inode->i_nlink); - - sb = inode->i_sb; - xib_calc_bit(inode->i_ino, &pindex, &bit); - AuDebugOn(page_bits <= bit); - sbinfo = au_sbi(sb); - mutex_lock(&sbinfo->si_xib_mtx); - err = xib_pindex(sb, pindex); - if (!err) { - clear_bit(bit, sbinfo->si_xib_buf); - sbinfo->si_xib_next_bit = bit; - } - mutex_unlock(&sbinfo->si_xib_mtx); -} - -/* for s_op->delete_inode() */ -void au_xino_delete_inode(struct inode *inode, const int unlinked) -{ - int err; - unsigned int mnt_flags; - aufs_bindex_t bindex, bend, bi; - unsigned char try_trunc; - struct au_iinfo *iinfo; - struct super_block *sb; - struct au_hinode *hi; - struct inode *h_inode; - struct au_branch *br; - vfs_writef_t xwrite; - - sb = inode->i_sb; - mnt_flags = au_mntflags(sb); - if (!au_opt_test(mnt_flags, XINO) - || inode->i_ino == AUFS_ROOT_INO) - return; - - if (unlinked) { - au_xigen_inc(inode); - au_xib_clear_bit(inode); - } - - iinfo = au_ii(inode); - if (!iinfo) - return; - - bindex = iinfo->ii_bstart; - if (bindex < 0) - return; - - xwrite = au_sbi(sb)->si_xwrite; - try_trunc = !!au_opt_test(mnt_flags, TRUNC_XINO); - hi = iinfo->ii_hinode + bindex; - bend = iinfo->ii_bend; - for (; bindex <= bend; bindex++, hi++) { - h_inode = hi->hi_inode; - if (!h_inode - || (!unlinked && h_inode->i_nlink)) - continue; - - /* inode may not be revalidated */ - bi = au_br_index(sb, hi->hi_id); - if (bi < 0) - continue; - - br = au_sbr(sb, bi); - err = au_xino_do_write(xwrite, br->br_xino.xi_file, - h_inode->i_ino, /*ino*/0); - if (!err && try_trunc - && au_test_fs_trunc_xino(au_br_sb(br))) - xino_try_trunc(sb, br); - } -} - -/* get an unused inode number from bitmap */ -ino_t au_xino_new_ino(struct super_block *sb) -{ - ino_t ino; - unsigned long *p, pindex, ul, pend; - struct au_sbinfo *sbinfo; - struct file *file; - int free_bit, err; - - if (!au_opt_test(au_mntflags(sb), XINO)) - return iunique(sb, AUFS_FIRST_INO); - - sbinfo = au_sbi(sb); - mutex_lock(&sbinfo->si_xib_mtx); - p = sbinfo->si_xib_buf; - free_bit = sbinfo->si_xib_next_bit; - if (free_bit < page_bits && !test_bit(free_bit, p)) - goto out; /* success */ - free_bit = find_first_zero_bit(p, page_bits); - if (free_bit < page_bits) - goto out; /* success */ - - pindex = sbinfo->si_xib_last_pindex; - for (ul = pindex - 1; ul < ULONG_MAX; ul--) { - err = xib_pindex(sb, ul); - if (unlikely(err)) - goto out_err; - free_bit = find_first_zero_bit(p, page_bits); - if (free_bit < page_bits) - goto out; /* success */ - } - - file = sbinfo->si_xib; - pend = vfsub_f_size_read(file) / PAGE_SIZE; - for (ul = pindex + 1; ul <= pend; ul++) { - err = xib_pindex(sb, ul); - if (unlikely(err)) - goto out_err; - free_bit = find_first_zero_bit(p, page_bits); - if (free_bit < page_bits) - goto out; /* success */ - } - BUG(); - -out: - set_bit(free_bit, p); - sbinfo->si_xib_next_bit = free_bit + 1; - pindex = sbinfo->si_xib_last_pindex; - mutex_unlock(&sbinfo->si_xib_mtx); - ino = xib_calc_ino(pindex, free_bit); - AuDbg("i%lu\n", (unsigned long)ino); - return ino; -out_err: - mutex_unlock(&sbinfo->si_xib_mtx); - AuDbg("i0\n"); - return 0; -} - -/* - * read @ino from xinofile for the specified branch{@sb, @bindex} - * at the position of @h_ino. - * if @ino does not exist and @do_new is true, get new one. - */ -int au_xino_read(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, - ino_t *ino) -{ - int err; - ssize_t sz; - loff_t pos; - struct file *file; - struct au_sbinfo *sbinfo; - - *ino = 0; - if (!au_opt_test(au_mntflags(sb), XINO)) - return 0; /* no xino */ - - err = 0; - sbinfo = au_sbi(sb); - pos = h_ino; - if (unlikely(au_loff_max / sizeof(*ino) - 1 < pos)) { - AuIOErr1("too large hi%lu\n", (unsigned long)h_ino); - return -EFBIG; - } - pos *= sizeof(*ino); - - file = au_sbr(sb, bindex)->br_xino.xi_file; - if (vfsub_f_size_read(file) < pos + sizeof(*ino)) - return 0; /* no ino */ - - sz = xino_fread(sbinfo->si_xread, file, ino, sizeof(*ino), &pos); - if (sz == sizeof(*ino)) - return 0; /* success */ - - err = sz; - if (unlikely(sz >= 0)) { - err = -EIO; - AuIOErr("xino read error (%zd)\n", sz); - } - - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* create and set a new xino file */ - -struct file *au_xino_create(struct super_block *sb, char *fname, int silent) -{ - struct file *file; - struct dentry *h_parent, *d; - struct inode *h_dir, *inode; - int err; - - /* - * at mount-time, and the xino file is the default path, - * hnotify is disabled so we have no notify events to ignore. - * when a user specified the xino, we cannot get au_hdir to be ignored. - */ - file = vfsub_filp_open(fname, O_RDWR | O_CREAT | O_EXCL | O_LARGEFILE - /* | __FMODE_NONOTIFY */, - S_IRUGO | S_IWUGO); - if (IS_ERR(file)) { - if (!silent) - pr_err("open %s(%ld)\n", fname, PTR_ERR(file)); - return file; - } - - /* keep file count */ - err = 0; - inode = file_inode(file); - h_parent = dget_parent(file->f_path.dentry); - h_dir = d_inode(h_parent); - mutex_lock_nested(&h_dir->i_mutex, AuLsc_I_PARENT); - /* mnt_want_write() is unnecessary here */ - /* no delegation since it is just created */ - if (inode->i_nlink) - err = vfsub_unlink(h_dir, &file->f_path, /*delegated*/NULL, - /*force*/0); - mutex_unlock(&h_dir->i_mutex); - dput(h_parent); - if (unlikely(err)) { - if (!silent) - pr_err("unlink %s(%d)\n", fname, err); - goto out; - } - - err = -EINVAL; - d = file->f_path.dentry; - if (unlikely(sb == d->d_sb)) { - if (!silent) - pr_err("%s must be outside\n", fname); - goto out; - } - if (unlikely(au_test_fs_bad_xino(d->d_sb))) { - if (!silent) - pr_err("xino doesn't support %s(%s)\n", - fname, au_sbtype(d->d_sb)); - goto out; - } - return file; /* success */ - -out: - fput(file); - file = ERR_PTR(err); - return file; -} - -/* - * find another branch who is on the same filesystem of the specified - * branch{@btgt}. search until @bend. - */ -static int is_sb_shared(struct super_block *sb, aufs_bindex_t btgt, - aufs_bindex_t bend) -{ - aufs_bindex_t bindex; - struct super_block *tgt_sb = au_sbr_sb(sb, btgt); - - for (bindex = 0; bindex < btgt; bindex++) - if (unlikely(tgt_sb == au_sbr_sb(sb, bindex))) - return bindex; - for (bindex++; bindex <= bend; bindex++) - if (unlikely(tgt_sb == au_sbr_sb(sb, bindex))) - return bindex; - return -1; -} - -/* ---------------------------------------------------------------------- */ - -/* - * initialize the xinofile for the specified branch @br - * at the place/path where @base_file indicates. - * test whether another branch is on the same filesystem or not, - * if @do_test is true. - */ -int au_xino_br(struct super_block *sb, struct au_branch *br, ino_t h_ino, - struct file *base_file, int do_test) -{ - int err; - ino_t ino; - aufs_bindex_t bend, bindex; - struct au_branch *shared_br, *b; - struct file *file; - struct super_block *tgt_sb; - - shared_br = NULL; - bend = au_sbend(sb); - if (do_test) { - tgt_sb = au_br_sb(br); - for (bindex = 0; bindex <= bend; bindex++) { - b = au_sbr(sb, bindex); - if (tgt_sb == au_br_sb(b)) { - shared_br = b; - break; - } - } - } - - if (!shared_br || !shared_br->br_xino.xi_file) { - struct au_xino_lock_dir ldir; - - au_xino_lock_dir(sb, base_file, &ldir); - /* mnt_want_write() is unnecessary here */ - file = au_xino_create2(base_file, NULL); - au_xino_unlock_dir(&ldir); - err = PTR_ERR(file); - if (IS_ERR(file)) - goto out; - br->br_xino.xi_file = file; - } else { - br->br_xino.xi_file = shared_br->br_xino.xi_file; - get_file(br->br_xino.xi_file); - } - - ino = AUFS_ROOT_INO; - err = au_xino_do_write(au_sbi(sb)->si_xwrite, br->br_xino.xi_file, - h_ino, ino); - if (unlikely(err)) { - fput(br->br_xino.xi_file); - br->br_xino.xi_file = NULL; - } - -out: - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* trucate a xino bitmap file */ - -/* todo: slow */ -static int do_xib_restore(struct super_block *sb, struct file *file, void *page) -{ - int err, bit; - ssize_t sz; - unsigned long pindex; - loff_t pos, pend; - struct au_sbinfo *sbinfo; - vfs_readf_t func; - ino_t *ino; - unsigned long *p; - - err = 0; - sbinfo = au_sbi(sb); - MtxMustLock(&sbinfo->si_xib_mtx); - p = sbinfo->si_xib_buf; - func = sbinfo->si_xread; - pend = vfsub_f_size_read(file); - pos = 0; - while (pos < pend) { - sz = xino_fread(func, file, page, PAGE_SIZE, &pos); - err = sz; - if (unlikely(sz <= 0)) - goto out; - - err = 0; - for (ino = page; sz > 0; ino++, sz -= sizeof(ino)) { - if (unlikely(*ino < AUFS_FIRST_INO)) - continue; - - xib_calc_bit(*ino, &pindex, &bit); - AuDebugOn(page_bits <= bit); - err = xib_pindex(sb, pindex); - if (!err) - set_bit(bit, p); - else - goto out; - } - } - -out: - return err; -} - -static int xib_restore(struct super_block *sb) -{ - int err; - aufs_bindex_t bindex, bend; - void *page; - - err = -ENOMEM; - page = (void *)__get_free_page(GFP_NOFS); - if (unlikely(!page)) - goto out; - - err = 0; - bend = au_sbend(sb); - for (bindex = 0; !err && bindex <= bend; bindex++) - if (!bindex || is_sb_shared(sb, bindex, bindex - 1) < 0) - err = do_xib_restore - (sb, au_sbr(sb, bindex)->br_xino.xi_file, page); - else - AuDbg("b%d\n", bindex); - free_page((unsigned long)page); - -out: - return err; -} - -int au_xib_trunc(struct super_block *sb) -{ - int err; - ssize_t sz; - loff_t pos; - struct au_xino_lock_dir ldir; - struct au_sbinfo *sbinfo; - unsigned long *p; - struct file *file; - - SiMustWriteLock(sb); - - err = 0; - sbinfo = au_sbi(sb); - if (!au_opt_test(sbinfo->si_mntflags, XINO)) - goto out; - - file = sbinfo->si_xib; - if (vfsub_f_size_read(file) <= PAGE_SIZE) - goto out; - - au_xino_lock_dir(sb, file, &ldir); - /* mnt_want_write() is unnecessary here */ - file = au_xino_create2(sbinfo->si_xib, NULL); - au_xino_unlock_dir(&ldir); - err = PTR_ERR(file); - if (IS_ERR(file)) - goto out; - fput(sbinfo->si_xib); - sbinfo->si_xib = file; - - p = sbinfo->si_xib_buf; - memset(p, 0, PAGE_SIZE); - pos = 0; - sz = xino_fwrite(sbinfo->si_xwrite, sbinfo->si_xib, p, PAGE_SIZE, &pos); - if (unlikely(sz != PAGE_SIZE)) { - err = sz; - AuIOErr("err %d\n", err); - if (sz >= 0) - err = -EIO; - goto out; - } - - mutex_lock(&sbinfo->si_xib_mtx); - /* mnt_want_write() is unnecessary here */ - err = xib_restore(sb); - mutex_unlock(&sbinfo->si_xib_mtx); - -out: - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* - * xino mount option handlers - */ - -/* xino bitmap */ -static void xino_clear_xib(struct super_block *sb) -{ - struct au_sbinfo *sbinfo; - - SiMustWriteLock(sb); - - sbinfo = au_sbi(sb); - sbinfo->si_xread = NULL; - sbinfo->si_xwrite = NULL; - if (sbinfo->si_xib) - fput(sbinfo->si_xib); - sbinfo->si_xib = NULL; - free_page((unsigned long)sbinfo->si_xib_buf); - sbinfo->si_xib_buf = NULL; -} - -static int au_xino_set_xib(struct super_block *sb, struct file *base) -{ - int err; - loff_t pos; - struct au_sbinfo *sbinfo; - struct file *file; - - SiMustWriteLock(sb); - - sbinfo = au_sbi(sb); - file = au_xino_create2(base, sbinfo->si_xib); - err = PTR_ERR(file); - if (IS_ERR(file)) - goto out; - if (sbinfo->si_xib) - fput(sbinfo->si_xib); - sbinfo->si_xib = file; - sbinfo->si_xread = vfs_readf(file); - sbinfo->si_xwrite = vfs_writef(file); - - err = -ENOMEM; - if (!sbinfo->si_xib_buf) - sbinfo->si_xib_buf = (void *)get_zeroed_page(GFP_NOFS); - if (unlikely(!sbinfo->si_xib_buf)) - goto out_unset; - - sbinfo->si_xib_last_pindex = 0; - sbinfo->si_xib_next_bit = 0; - if (vfsub_f_size_read(file) < PAGE_SIZE) { - pos = 0; - err = xino_fwrite(sbinfo->si_xwrite, file, sbinfo->si_xib_buf, - PAGE_SIZE, &pos); - if (unlikely(err != PAGE_SIZE)) - goto out_free; - } - err = 0; - goto out; /* success */ - -out_free: - free_page((unsigned long)sbinfo->si_xib_buf); - sbinfo->si_xib_buf = NULL; - if (err >= 0) - err = -EIO; -out_unset: - fput(sbinfo->si_xib); - sbinfo->si_xib = NULL; - sbinfo->si_xread = NULL; - sbinfo->si_xwrite = NULL; -out: - return err; -} - -/* xino for each branch */ -static void xino_clear_br(struct super_block *sb) -{ - aufs_bindex_t bindex, bend; - struct au_branch *br; - - bend = au_sbend(sb); - for (bindex = 0; bindex <= bend; bindex++) { - br = au_sbr(sb, bindex); - if (!br || !br->br_xino.xi_file) - continue; - - fput(br->br_xino.xi_file); - br->br_xino.xi_file = NULL; - } -} - -static int au_xino_set_br(struct super_block *sb, struct file *base) -{ - int err; - ino_t ino; - aufs_bindex_t bindex, bend, bshared; - struct { - struct file *old, *new; - } *fpair, *p; - struct au_branch *br; - struct inode *inode; - vfs_writef_t writef; - - SiMustWriteLock(sb); - - err = -ENOMEM; - bend = au_sbend(sb); - fpair = kcalloc(bend + 1, sizeof(*fpair), GFP_NOFS); - if (unlikely(!fpair)) - goto out; - - inode = d_inode(sb->s_root); - ino = AUFS_ROOT_INO; - writef = au_sbi(sb)->si_xwrite; - for (bindex = 0, p = fpair; bindex <= bend; bindex++, p++) { - br = au_sbr(sb, bindex); - bshared = is_sb_shared(sb, bindex, bindex - 1); - if (bshared >= 0) { - /* shared xino */ - *p = fpair[bshared]; - get_file(p->new); - } - - if (!p->new) { - /* new xino */ - p->old = br->br_xino.xi_file; - p->new = au_xino_create2(base, br->br_xino.xi_file); - err = PTR_ERR(p->new); - if (IS_ERR(p->new)) { - p->new = NULL; - goto out_pair; - } - } - - err = au_xino_do_write(writef, p->new, - au_h_iptr(inode, bindex)->i_ino, ino); - if (unlikely(err)) - goto out_pair; - } - - for (bindex = 0, p = fpair; bindex <= bend; bindex++, p++) { - br = au_sbr(sb, bindex); - if (br->br_xino.xi_file) - fput(br->br_xino.xi_file); - get_file(p->new); - br->br_xino.xi_file = p->new; - } - -out_pair: - for (bindex = 0, p = fpair; bindex <= bend; bindex++, p++) - if (p->new) - fput(p->new); - else - break; - kfree(fpair); -out: - return err; -} - -void au_xino_clr(struct super_block *sb) -{ - struct au_sbinfo *sbinfo; - - au_xigen_clr(sb); - xino_clear_xib(sb); - xino_clear_br(sb); - sbinfo = au_sbi(sb); - /* lvalue, do not call au_mntflags() */ - au_opt_clr(sbinfo->si_mntflags, XINO); -} - -int au_xino_set(struct super_block *sb, struct au_opt_xino *xino, int remount) -{ - int err, skip; - struct dentry *parent, *cur_parent; - struct qstr *dname, *cur_name; - struct file *cur_xino; - struct inode *dir; - struct au_sbinfo *sbinfo; - - SiMustWriteLock(sb); - - err = 0; - sbinfo = au_sbi(sb); - parent = dget_parent(xino->file->f_path.dentry); - if (remount) { - skip = 0; - dname = &xino->file->f_path.dentry->d_name; - cur_xino = sbinfo->si_xib; - if (cur_xino) { - cur_parent = dget_parent(cur_xino->f_path.dentry); - cur_name = &cur_xino->f_path.dentry->d_name; - skip = (cur_parent == parent - && au_qstreq(dname, cur_name)); - dput(cur_parent); - } - if (skip) - goto out; - } - - au_opt_set(sbinfo->si_mntflags, XINO); - dir = d_inode(parent); - mutex_lock_nested(&dir->i_mutex, AuLsc_I_PARENT); - /* mnt_want_write() is unnecessary here */ - err = au_xino_set_xib(sb, xino->file); - if (!err) - err = au_xigen_set(sb, xino->file); - if (!err) - err = au_xino_set_br(sb, xino->file); - mutex_unlock(&dir->i_mutex); - if (!err) - goto out; /* success */ - - /* reset all */ - AuIOErr("failed creating xino(%d).\n", err); - au_xigen_clr(sb); - xino_clear_xib(sb); - -out: - dput(parent); - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* - * create a xinofile at the default place/path. - */ -struct file *au_xino_def(struct super_block *sb) -{ - struct file *file; - char *page, *p; - struct au_branch *br; - struct super_block *h_sb; - struct path path; - aufs_bindex_t bend, bindex, bwr; - - br = NULL; - bend = au_sbend(sb); - bwr = -1; - for (bindex = 0; bindex <= bend; bindex++) { - br = au_sbr(sb, bindex); - if (au_br_writable(br->br_perm) - && !au_test_fs_bad_xino(au_br_sb(br))) { - bwr = bindex; - break; - } - } - - if (bwr >= 0) { - file = ERR_PTR(-ENOMEM); - page = (void *)__get_free_page(GFP_NOFS); - if (unlikely(!page)) - goto out; - path.mnt = au_br_mnt(br); - path.dentry = au_h_dptr(sb->s_root, bwr); - p = d_path(&path, page, PATH_MAX - sizeof(AUFS_XINO_FNAME)); - file = (void *)p; - if (!IS_ERR(p)) { - strcat(p, "/" AUFS_XINO_FNAME); - AuDbg("%s\n", p); - file = au_xino_create(sb, p, /*silent*/0); - if (!IS_ERR(file)) - au_xino_brid_set(sb, br->br_id); - } - free_page((unsigned long)page); - } else { - file = au_xino_create(sb, AUFS_XINO_DEFPATH, /*silent*/0); - if (IS_ERR(file)) - goto out; - h_sb = file->f_path.dentry->d_sb; - if (unlikely(au_test_fs_bad_xino(h_sb))) { - pr_err("xino doesn't support %s(%s)\n", - AUFS_XINO_DEFPATH, au_sbtype(h_sb)); - fput(file); - file = ERR_PTR(-EINVAL); - } - if (!IS_ERR(file)) - au_xino_brid_set(sb, -1); - } - -out: - return file; -} - -/* ---------------------------------------------------------------------- */ - -int au_xino_path(struct seq_file *seq, struct file *file) -{ - int err; - - err = au_seq_path(seq, &file->f_path); - if (unlikely(err < 0)) - goto out; - - err = 0; -#define Deleted "\\040(deleted)" - seq->count -= sizeof(Deleted) - 1; - AuDebugOn(memcmp(seq->buf + seq->count, Deleted, - sizeof(Deleted) - 1)); -#undef Deleted - -out: - return err; -} diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 5b700ef1e..c37149b92 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -238,11 +238,6 @@ static inline u64 autofs4_get_ino(struct autofs_sb_info *sbi) return d_inode(sbi->sb->s_root)->i_ino; } -static inline int simple_positive(struct dentry *dentry) -{ - return d_really_is_positive(dentry) && !d_unhashed(dentry); -} - static inline void __autofs4_add_expiring(struct dentry *dentry) { struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c index de58cc7b8..da0c33481 100644 --- a/fs/autofs4/symlink.c +++ b/fs/autofs4/symlink.c @@ -12,14 +12,13 @@ #include "autofs_i.h" -static void *autofs4_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *autofs4_follow_link(struct dentry *dentry, void **cookie) { struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); if (ino && !autofs4_oz_mode(sbi)) ino->last_used = jiffies; - nd_set_link(nd, d_inode(dentry)->i_private); - return NULL; + return d_inode(dentry)->i_private; } const struct inode_operations autofs4_symlink_inode_operations = { diff --git a/fs/befs/befs.h b/fs/befs/befs.h index 1fead8d56..35d19e873 100644 --- a/fs/befs/befs.h +++ b/fs/befs/befs.h @@ -112,7 +112,7 @@ BEFS_SB(const struct super_block *super) static inline struct befs_inode_info * BEFS_I(const struct inode *inode) { - return list_entry(inode, struct befs_inode_info, vfs_inode); + return container_of(inode, struct befs_inode_info, vfs_inode); } static inline befs_blocknr_t diff --git a/fs/befs/btree.c b/fs/befs/btree.c index 0826e91da..22c166280 100644 --- a/fs/befs/btree.c +++ b/fs/befs/btree.c @@ -137,8 +137,8 @@ static int befs_bt_read_super(struct super_block *sb, befs_data_stream * ds, befs_btree_super * sup) { - struct buffer_head *bh = NULL; - befs_disk_btree_super *od_sup = NULL; + struct buffer_head *bh; + befs_disk_btree_super *od_sup; befs_debug(sb, "---> %s", __func__); @@ -250,7 +250,7 @@ int befs_btree_find(struct super_block *sb, befs_data_stream * ds, const char *key, befs_off_t * value) { - struct befs_btree_node *this_node = NULL; + struct befs_btree_node *this_node; befs_btree_super bt_super; befs_off_t node_off; int res; diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 7943533c3..46aedacfa 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -42,8 +42,7 @@ static struct inode *befs_iget(struct super_block *, unsigned long); static struct inode *befs_alloc_inode(struct super_block *sb); static void befs_destroy_inode(struct inode *inode); static void befs_destroy_inodecache(void); -static void *befs_follow_link(struct dentry *, struct nameidata *); -static void *befs_fast_follow_link(struct dentry *, struct nameidata *); +static const char *befs_follow_link(struct dentry *, void **); static int befs_utf2nls(struct super_block *sb, const char *in, int in_len, char **out, int *out_len); static int befs_nls2utf(struct super_block *sb, const char *in, int in_len, @@ -80,11 +79,6 @@ static const struct address_space_operations befs_aops = { .bmap = befs_bmap, }; -static const struct inode_operations befs_fast_symlink_inode_operations = { - .readlink = generic_readlink, - .follow_link = befs_fast_follow_link, -}; - static const struct inode_operations befs_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = befs_follow_link, @@ -403,10 +397,12 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) inode->i_op = &befs_dir_inode_operations; inode->i_fop = &befs_dir_operations; } else if (S_ISLNK(inode->i_mode)) { - if (befs_ino->i_flags & BEFS_LONG_SYMLINK) + if (befs_ino->i_flags & BEFS_LONG_SYMLINK) { inode->i_op = &befs_symlink_inode_operations; - else - inode->i_op = &befs_fast_symlink_inode_operations; + } else { + inode->i_link = befs_ino->i_data.symlink; + inode->i_op = &simple_symlink_inode_operations; + } } else { befs_error(sb, "Inode %lu is not a regular file, " "directory or symlink. THAT IS WRONG! BeFS has no " @@ -467,8 +463,8 @@ befs_destroy_inodecache(void) * The data stream become link name. Unless the LONG_SYMLINK * flag is set. */ -static void * -befs_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char * +befs_follow_link(struct dentry *dentry, void **cookie) { struct super_block *sb = dentry->d_sb; struct befs_inode_info *befs_ino = BEFS_I(d_inode(dentry)); @@ -478,33 +474,20 @@ befs_follow_link(struct dentry *dentry, struct nameidata *nd) if (len == 0) { befs_error(sb, "Long symlink with illegal length"); - link = ERR_PTR(-EIO); - } else { - befs_debug(sb, "Follow long symlink"); - - link = kmalloc(len, GFP_NOFS); - if (!link) { - link = ERR_PTR(-ENOMEM); - } else if (befs_read_lsymlink(sb, data, link, len) != len) { - kfree(link); - befs_error(sb, "Failed to read entire long symlink"); - link = ERR_PTR(-EIO); - } else { - link[len - 1] = '\0'; - } + return ERR_PTR(-EIO); } - nd_set_link(nd, link); - return NULL; -} - - -static void * -befs_fast_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - struct befs_inode_info *befs_ino = BEFS_I(d_inode(dentry)); + befs_debug(sb, "Follow long symlink"); - nd_set_link(nd, befs_ino->i_data.symlink); - return NULL; + link = kmalloc(len, GFP_NOFS); + if (!link) + return ERR_PTR(-ENOMEM); + if (befs_read_lsymlink(sb, data, link, len) != len) { + kfree(link); + befs_error(sb, "Failed to read entire long symlink"); + return ERR_PTR(-EIO); + } + link[len - 1] = '\0'; + return *cookie = link; } /* diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index cd46e4158..6b6599678 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1530,7 +1530,7 @@ static int fill_files_note(struct memelfnote *note) file = vma->vm_file; if (!file) continue; - filename = d_path(&file->f_path, name_curpos, remaining); + filename = file_path(file, name_curpos, remaining); if (IS_ERR(filename)) { if (PTR_ERR(filename) == -ENAMETOOLONG) { vfree(data); @@ -1540,7 +1540,7 @@ static int fill_files_note(struct memelfnote *note) continue; } - /* d_path() fills at the end, move name down */ + /* file_path() fills at the end, move name down */ /* n = strlen(filename) + 1: */ n = (name_curpos + remaining) - filename; remaining = filename - name_curpos; diff --git a/fs/block_dev.c b/fs/block_dev.c index c7e4163ed..198243717 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -14,6 +14,7 @@ #include <linux/device_cgroup.h> #include <linux/highmem.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/module.h> #include <linux/blkpg.h> #include <linux/magic.h> @@ -42,7 +43,7 @@ static inline struct bdev_inode *BDEV_I(struct inode *inode) return container_of(inode, struct bdev_inode, vfs_inode); } -inline struct block_device *I_BDEV(struct inode *inode) +struct block_device *I_BDEV(struct inode *inode) { return &BDEV_I(inode)->bdev; } @@ -151,6 +152,9 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + if (IS_DAX(inode)) + return dax_do_io(iocb, inode, iter, offset, blkdev_get_block, + NULL, DIO_SKIP_DIO_COUNT); return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter, offset, blkdev_get_block, NULL, NULL, DIO_SKIP_DIO_COUNT); @@ -376,7 +380,7 @@ int bdev_read_page(struct block_device *bdev, sector_t sector, struct page *page) { const struct block_device_operations *ops = bdev->bd_disk->fops; - if (!ops->rw_page) + if (!ops->rw_page || bdev_get_integrity(bdev)) return -EOPNOTSUPP; return ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ); } @@ -407,7 +411,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, int result; int rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE; const struct block_device_operations *ops = bdev->bd_disk->fops; - if (!ops->rw_page) + if (!ops->rw_page || bdev_get_integrity(bdev)) return -EOPNOTSUPP; set_page_writeback(page); result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, rw); @@ -442,6 +446,12 @@ long bdev_direct_access(struct block_device *bdev, sector_t sector, long avail; const struct block_device_operations *ops = bdev->bd_disk->fops; + /* + * The device driver is allowed to sleep, in order to make the + * memory directly accessible. + */ + might_sleep(); + if (size < 0) return size; if (!ops->direct_access) @@ -546,7 +556,8 @@ static struct file_system_type bd_type = { .kill_sb = kill_anon_super, }; -static struct super_block *blockdev_superblock __read_mostly; +struct super_block *blockdev_superblock __read_mostly; +EXPORT_SYMBOL_GPL(blockdev_superblock); void __init bdev_cache_init(void) { @@ -687,11 +698,6 @@ static struct block_device *bd_acquire(struct inode *inode) return bdev; } -int sb_is_blkdev_sb(struct super_block *sb) -{ - return sb == blockdev_superblock; -} - /* Call when you free inode */ void bd_forget(struct inode *inode) @@ -1173,6 +1179,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = disk; bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; + bdev->bd_inode->i_flags = disk->fops->direct_access ? S_DAX : 0; if (!partno) { ret = -ENXIO; bdev->bd_part = disk_get_part(disk, partno); diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index df9932b00..1ce06c849 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -85,6 +85,7 @@ BTRFS_WORK_HELPER(extent_refs_helper); BTRFS_WORK_HELPER(scrub_helper); BTRFS_WORK_HELPER(scrubwrc_helper); BTRFS_WORK_HELPER(scrubnc_helper); +BTRFS_WORK_HELPER(scrubparity_helper); static struct __btrfs_workqueue * __btrfs_alloc_workqueue(const char *name, unsigned int flags, int max_active, diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index ec2ee477f..b0b093b6a 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -64,6 +64,8 @@ BTRFS_WORK_HELPER_PROTO(extent_refs_helper); BTRFS_WORK_HELPER_PROTO(scrub_helper); BTRFS_WORK_HELPER_PROTO(scrubwrc_helper); BTRFS_WORK_HELPER_PROTO(scrubnc_helper); +BTRFS_WORK_HELPER_PROTO(scrubparity_helper); + struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, unsigned int flags, diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 614aaa196..802fabb30 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -250,8 +250,12 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, * the first item to check. But sometimes, we may enter it with * slot==nritems. In that case, go to the next leaf before we continue. */ - if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) - ret = btrfs_next_old_leaf(root, path, time_seq); + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + if (time_seq == (u64)-1) + ret = btrfs_next_leaf(root, path); + else + ret = btrfs_next_old_leaf(root, path, time_seq); + } while (!ret && count < total_refs) { eb = path->nodes[0]; @@ -291,7 +295,10 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, eie = NULL; } next: - ret = btrfs_next_old_item(root, path, time_seq); + if (time_seq == (u64)-1) + ret = btrfs_next_item(root, path); + else + ret = btrfs_next_old_item(root, path, time_seq); } if (ret > 0) @@ -334,6 +341,8 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, if (path->search_commit_root) root_level = btrfs_header_level(root->commit_root); + else if (time_seq == (u64)-1) + root_level = btrfs_header_level(root->node); else root_level = btrfs_old_root_level(root, time_seq); @@ -343,7 +352,12 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info, } path->lowest_level = level; - ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq); + if (time_seq == (u64)-1) + ret = btrfs_search_slot(NULL, root, &ref->key_for_search, path, + 0, 0); + else + ret = btrfs_search_old_slot(root, &ref->key_for_search, path, + time_seq); /* root node has been locked, we can release @subvol_srcu safely here */ srcu_read_unlock(&fs_info->subvol_srcu, index); @@ -491,7 +505,9 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info, BUG_ON(!ref->wanted_disk_byte); eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte, 0); - if (!eb || !extent_buffer_uptodate(eb)) { + if (IS_ERR(eb)) { + return PTR_ERR(eb); + } else if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); return -EIO; } @@ -507,7 +523,7 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info, } /* - * merge two lists of backrefs and adjust counts accordingly + * merge backrefs and adjust counts accordingly * * mode = 1: merge identical keys, if key is set * FIXME: if we add more keys in __add_prelim_ref, we can merge more here. @@ -535,9 +551,9 @@ static void __merge_refs(struct list_head *head, int mode) ref2 = list_entry(pos2, struct __prelim_ref, list); + if (!ref_for_same_block(ref1, ref2)) + continue; if (mode == 1) { - if (!ref_for_same_block(ref1, ref2)) - continue; if (!ref1->parent && ref2->parent) { xchg = ref1; ref1 = ref2; @@ -572,8 +588,8 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, struct list_head *prefs, u64 *total_refs, u64 inum) { + struct btrfs_delayed_ref_node *node; struct btrfs_delayed_extent_op *extent_op = head->extent_op; - struct rb_node *n = &head->node.rb_node; struct btrfs_key key; struct btrfs_key op_key = {0}; int sgn; @@ -583,12 +599,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, btrfs_disk_key_to_cpu(&op_key, &extent_op->key); spin_lock(&head->lock); - n = rb_first(&head->ref_root); - while (n) { - struct btrfs_delayed_ref_node *node; - node = rb_entry(n, struct btrfs_delayed_ref_node, - rb_node); - n = rb_next(n); + list_for_each_entry(node, &head->ref_list, list) { if (node->seq > seq) continue; @@ -882,6 +893,11 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, * * NOTE: This can return values > 0 * + * If time_seq is set to (u64)-1, it will not search delayed_refs, and behave + * much like trans == NULL case, the difference only lies in it will not + * commit root. + * The special case is for qgroup to search roots in commit_transaction(). + * * FIXME some caching might speed things up */ static int find_parent_nodes(struct btrfs_trans_handle *trans, @@ -920,6 +936,9 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, path->skip_locking = 1; } + if (time_seq == (u64)-1) + path->skip_locking = 1; + /* * grab both a lock on the path and a lock on the delayed ref head. * We need both to get a consistent picture of how the refs look @@ -934,9 +953,10 @@ again: BUG_ON(ret == 0); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (trans && likely(trans->type != __TRANS_DUMMY)) { + if (trans && likely(trans->type != __TRANS_DUMMY) && + time_seq != (u64)-1) { #else - if (trans) { + if (trans && time_seq != (u64)-1) { #endif /* * look if there are updates for this ref queued and lock the @@ -1034,7 +1054,10 @@ again: eb = read_tree_block(fs_info->extent_root, ref->parent, 0); - if (!eb || !extent_buffer_uptodate(eb)) { + if (IS_ERR(eb)) { + ret = PTR_ERR(eb); + goto out; + } else if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); ret = -EIO; goto out; diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 0ef5cc13f..81220b220 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -44,6 +44,8 @@ #define BTRFS_INODE_IN_DELALLOC_LIST 9 #define BTRFS_INODE_READDIO_NEED_LOCK 10 #define BTRFS_INODE_HAS_PROPS 11 +/* DIO is ready to submit */ +#define BTRFS_INODE_DIO_READY 12 /* * The following 3 bits are meant only for the btree inode. * When any of them is set, it means an error happened while writing an diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 0f11ebc92..54114b488 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1439,8 +1439,9 @@ get_old_root(struct btrfs_root *root, u64 time_seq) btrfs_tree_read_unlock(eb_root); free_extent_buffer(eb_root); old = read_tree_block(root, logical, 0); - if (WARN_ON(!old || !extent_buffer_uptodate(old))) { - free_extent_buffer(old); + if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) { + if (!IS_ERR(old)) + free_extent_buffer(old); btrfs_warn(root->fs_info, "failed to read tree block %llu from get_old_root", logical); } else { @@ -1685,7 +1686,9 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, if (!cur || !uptodate) { if (!cur) { cur = read_tree_block(root, blocknr, gen); - if (!cur || !extent_buffer_uptodate(cur)) { + if (IS_ERR(cur)) { + return PTR_ERR(cur); + } else if (!extent_buffer_uptodate(cur)) { free_extent_buffer(cur); return -EIO; } @@ -1864,8 +1867,9 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, eb = read_tree_block(root, btrfs_node_blockptr(parent, slot), btrfs_node_ptr_generation(parent, slot)); - if (eb && !extent_buffer_uptodate(eb)) { - free_extent_buffer(eb); + if (IS_ERR(eb) || !extent_buffer_uptodate(eb)) { + if (!IS_ERR(eb)) + free_extent_buffer(eb); eb = NULL; } @@ -2494,7 +2498,7 @@ read_block_for_search(struct btrfs_trans_handle *trans, ret = -EAGAIN; tmp = read_tree_block(root, blocknr, 0); - if (tmp) { + if (!IS_ERR(tmp)) { /* * If the read above didn't mark this buffer up to date, * it will never end up being up to date. Set ret to EIO now diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6f364e1d8..aac314e14 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -174,7 +174,7 @@ struct btrfs_ordered_sum; /* csum types */ #define BTRFS_CSUM_TYPE_CRC32 0 -static int btrfs_csum_sizes[] = { 4, 0 }; +static int btrfs_csum_sizes[] = { 4 }; /* four bytes for CRC32 */ #define BTRFS_EMPTY_DIR_SIZE 0 @@ -1619,10 +1619,7 @@ struct btrfs_fs_info { struct task_struct *cleaner_kthread; int thread_pool_size; - struct kobject super_kobj; struct kobject *space_info_kobj; - struct kobject *device_dir_kobj; - struct completion kobj_unregister; int do_barriers; int closing; int log_root_recovering; @@ -1698,6 +1695,7 @@ struct btrfs_fs_info { struct btrfs_workqueue *scrub_workers; struct btrfs_workqueue *scrub_wr_completion_workers; struct btrfs_workqueue *scrub_nocow_workers; + struct btrfs_workqueue *scrub_parity_workers; #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY u32 check_integrity_print_mask; @@ -1735,7 +1733,7 @@ struct btrfs_fs_info { /* list of dirty qgroups to be written at next commit */ struct list_head dirty_qgroups; - /* used by btrfs_qgroup_record_ref for an efficient tree traversal */ + /* used by qgroup for an efficient tree traversal */ u64 qgroup_seq; /* qgroup rescan items */ @@ -1780,6 +1778,7 @@ struct btrfs_fs_info { spinlock_t unused_bgs_lock; struct list_head unused_bgs; struct mutex unused_bg_unpin_mutex; + struct mutex delete_unused_bgs_mutex; /* For btrfs to record security options */ struct security_mnt_opts security_opts; @@ -3458,6 +3457,7 @@ int btrfs_check_data_free_space(struct inode *inode, u64 bytes, u64 write_bytes) void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes); void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, struct btrfs_root *root); +void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans); int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, struct inode *inode); void btrfs_orphan_release_metadata(struct inode *inode); @@ -3515,6 +3515,9 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, int __get_raid_index(u64 flags); int btrfs_start_write_no_snapshoting(struct btrfs_root *root); void btrfs_end_write_no_snapshoting(struct btrfs_root *root); +void check_system_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const u64 type); /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); @@ -4050,6 +4053,7 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) #ifdef CONFIG_BTRFS_ASSERT +__cold static inline void assfail(char *expr, char *file, int line) { pr_err("BTRFS: assertion failed: %s, file: %s, line: %d", @@ -4065,10 +4069,12 @@ static inline void assfail(char *expr, char *file, int line) #define btrfs_assert() __printf(5, 6) +__cold void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...); +__cold void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *function, unsigned int line, int errno); @@ -4111,11 +4117,17 @@ static inline int __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag) * Call btrfs_abort_transaction as early as possible when an error condition is * detected, that way the exact line number is reported. */ - #define btrfs_abort_transaction(trans, root, errno) \ do { \ - __btrfs_abort_transaction(trans, root, __func__, \ - __LINE__, errno); \ + /* Report first abort since mount */ \ + if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \ + &((root)->fs_info->fs_state))) { \ + WARN(1, KERN_DEBUG \ + "BTRFS: Transaction aborted (error %d)\n", \ + (errno)); \ + } \ + __btrfs_abort_transaction((trans), (root), __func__, \ + __LINE__, (errno)); \ } while (0) #define btrfs_std_error(fs_info, errno) \ @@ -4132,6 +4144,7 @@ do { \ } while (0) __printf(5, 6) +__cold void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 8f8ed7d20..ac3e81da6 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -22,6 +22,7 @@ #include "ctree.h" #include "delayed-ref.h" #include "transaction.h" +#include "qgroup.h" struct kmem_cache *btrfs_delayed_ref_head_cachep; struct kmem_cache *btrfs_delayed_tree_ref_cachep; @@ -84,87 +85,6 @@ static int comp_data_refs(struct btrfs_delayed_data_ref *ref2, return 0; } -/* - * entries in the rb tree are ordered by the byte number of the extent, - * type of the delayed backrefs and content of delayed backrefs. - */ -static int comp_entry(struct btrfs_delayed_ref_node *ref2, - struct btrfs_delayed_ref_node *ref1, - bool compare_seq) -{ - if (ref1->bytenr < ref2->bytenr) - return -1; - if (ref1->bytenr > ref2->bytenr) - return 1; - if (ref1->is_head && ref2->is_head) - return 0; - if (ref2->is_head) - return -1; - if (ref1->is_head) - return 1; - if (ref1->type < ref2->type) - return -1; - if (ref1->type > ref2->type) - return 1; - if (ref1->no_quota > ref2->no_quota) - return 1; - if (ref1->no_quota < ref2->no_quota) - return -1; - /* merging of sequenced refs is not allowed */ - if (compare_seq) { - if (ref1->seq < ref2->seq) - return -1; - if (ref1->seq > ref2->seq) - return 1; - } - if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY || - ref1->type == BTRFS_SHARED_BLOCK_REF_KEY) { - return comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref2), - btrfs_delayed_node_to_tree_ref(ref1), - ref1->type); - } else if (ref1->type == BTRFS_EXTENT_DATA_REF_KEY || - ref1->type == BTRFS_SHARED_DATA_REF_KEY) { - return comp_data_refs(btrfs_delayed_node_to_data_ref(ref2), - btrfs_delayed_node_to_data_ref(ref1)); - } - BUG(); - return 0; -} - -/* - * insert a new ref into the rbtree. This returns any existing refs - * for the same (bytenr,parent) tuple, or NULL if the new node was properly - * inserted. - */ -static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root, - struct rb_node *node) -{ - struct rb_node **p = &root->rb_node; - struct rb_node *parent_node = NULL; - struct btrfs_delayed_ref_node *entry; - struct btrfs_delayed_ref_node *ins; - int cmp; - - ins = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - while (*p) { - parent_node = *p; - entry = rb_entry(parent_node, struct btrfs_delayed_ref_node, - rb_node); - - cmp = comp_entry(entry, ins, 1); - if (cmp < 0) - p = &(*p)->rb_left; - else if (cmp > 0) - p = &(*p)->rb_right; - else - return entry; - } - - rb_link_node(node, parent_node, p); - rb_insert_color(node, root); - return NULL; -} - /* insert a new ref to head ref rbtree */ static struct btrfs_delayed_ref_head *htree_insert(struct rb_root *root, struct rb_node *node) @@ -268,7 +188,7 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, rb_erase(&head->href_node, &delayed_refs->href_root); } else { assert_spin_locked(&head->lock); - rb_erase(&ref->rb_node, &head->ref_root); + list_del(&ref->list); } ref->in_tree = 0; btrfs_put_delayed_ref(ref); @@ -277,99 +197,6 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans, trans->delayed_ref_updates--; } -static int merge_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head, - struct btrfs_delayed_ref_node *ref, u64 seq) -{ - struct rb_node *node; - int mod = 0; - int done = 0; - - node = rb_next(&ref->rb_node); - while (!done && node) { - struct btrfs_delayed_ref_node *next; - - next = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - node = rb_next(node); - if (seq && next->seq >= seq) - break; - if (comp_entry(ref, next, 0)) - continue; - - if (ref->action == next->action) { - mod = next->ref_mod; - } else { - if (ref->ref_mod < next->ref_mod) { - struct btrfs_delayed_ref_node *tmp; - - tmp = ref; - ref = next; - next = tmp; - done = 1; - } - mod = -next->ref_mod; - } - - drop_delayed_ref(trans, delayed_refs, head, next); - ref->ref_mod += mod; - if (ref->ref_mod == 0) { - drop_delayed_ref(trans, delayed_refs, head, ref); - done = 1; - } else { - /* - * You can't have multiples of the same ref on a tree - * block. - */ - WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY || - ref->type == BTRFS_SHARED_BLOCK_REF_KEY); - } - } - return done; -} - -void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head) -{ - struct rb_node *node; - u64 seq = 0; - - assert_spin_locked(&head->lock); - /* - * We don't have too much refs to merge in the case of delayed data - * refs. - */ - if (head->is_data) - return; - - spin_lock(&fs_info->tree_mod_seq_lock); - if (!list_empty(&fs_info->tree_mod_seq_list)) { - struct seq_list *elem; - - elem = list_first_entry(&fs_info->tree_mod_seq_list, - struct seq_list, list); - seq = elem->seq; - } - spin_unlock(&fs_info->tree_mod_seq_lock); - - node = rb_first(&head->ref_root); - while (node) { - struct btrfs_delayed_ref_node *ref; - - ref = rb_entry(node, struct btrfs_delayed_ref_node, - rb_node); - /* We can't merge refs that are outside of our seq count */ - if (seq && ref->seq >= seq) - break; - if (merge_ref(trans, delayed_refs, head, ref, seq)) - node = rb_first(&head->ref_root); - else - node = rb_next(&ref->rb_node); - } -} - int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, u64 seq) @@ -443,45 +270,71 @@ again: } /* - * helper function to update an extent delayed ref in the - * rbtree. existing and update must both have the same - * bytenr and parent + * Helper to insert the ref_node to the tail or merge with tail. * - * This may free existing if the update cancels out whatever - * operation it was doing. + * Return 0 for insert. + * Return >0 for merge. */ -static noinline void -update_existing_ref(struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_root *delayed_refs, - struct btrfs_delayed_ref_head *head, - struct btrfs_delayed_ref_node *existing, - struct btrfs_delayed_ref_node *update) +static int +add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans, + struct btrfs_delayed_ref_root *root, + struct btrfs_delayed_ref_head *href, + struct btrfs_delayed_ref_node *ref) { - if (update->action != existing->action) { - /* - * this is effectively undoing either an add or a - * drop. We decrement the ref_mod, and if it goes - * down to zero we just delete the entry without - * every changing the extent allocation tree. - */ - existing->ref_mod--; - if (existing->ref_mod == 0) - drop_delayed_ref(trans, delayed_refs, head, existing); - else - WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || - existing->type == BTRFS_SHARED_BLOCK_REF_KEY); + struct btrfs_delayed_ref_node *exist; + int mod; + int ret = 0; + + spin_lock(&href->lock); + /* Check whether we can merge the tail node with ref */ + if (list_empty(&href->ref_list)) + goto add_tail; + exist = list_entry(href->ref_list.prev, struct btrfs_delayed_ref_node, + list); + /* No need to compare bytenr nor is_head */ + if (exist->type != ref->type || exist->no_quota != ref->no_quota || + exist->seq != ref->seq) + goto add_tail; + + if ((exist->type == BTRFS_TREE_BLOCK_REF_KEY || + exist->type == BTRFS_SHARED_BLOCK_REF_KEY) && + comp_tree_refs(btrfs_delayed_node_to_tree_ref(exist), + btrfs_delayed_node_to_tree_ref(ref), + ref->type)) + goto add_tail; + if ((exist->type == BTRFS_EXTENT_DATA_REF_KEY || + exist->type == BTRFS_SHARED_DATA_REF_KEY) && + comp_data_refs(btrfs_delayed_node_to_data_ref(exist), + btrfs_delayed_node_to_data_ref(ref))) + goto add_tail; + + /* Now we are sure we can merge */ + ret = 1; + if (exist->action == ref->action) { + mod = ref->ref_mod; } else { - WARN_ON(existing->type == BTRFS_TREE_BLOCK_REF_KEY || - existing->type == BTRFS_SHARED_BLOCK_REF_KEY); - /* - * the action on the existing ref matches - * the action on the ref we're trying to add. - * Bump the ref_mod by one so the backref that - * is eventually added/removed has the correct - * reference count - */ - existing->ref_mod += update->ref_mod; + /* Need to change action */ + if (exist->ref_mod < ref->ref_mod) { + exist->action = ref->action; + mod = -exist->ref_mod; + exist->ref_mod = ref->ref_mod; + } else + mod = -ref->ref_mod; } + exist->ref_mod += mod; + + /* remove existing tail if its ref_mod is zero */ + if (exist->ref_mod == 0) + drop_delayed_ref(trans, root, href, exist); + spin_unlock(&href->lock); + return ret; + +add_tail: + list_add_tail(&ref->list, &href->ref_list); + atomic_inc(&root->num_entries); + trans->delayed_ref_updates++; + spin_unlock(&href->lock); + return ret; } /* @@ -568,12 +421,14 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, static noinline struct btrfs_delayed_ref_head * add_delayed_ref_head(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans, - struct btrfs_delayed_ref_node *ref, u64 bytenr, - u64 num_bytes, int action, int is_data) + struct btrfs_delayed_ref_node *ref, + struct btrfs_qgroup_extent_record *qrecord, + u64 bytenr, u64 num_bytes, int action, int is_data) { struct btrfs_delayed_ref_head *existing; struct btrfs_delayed_ref_head *head_ref = NULL; struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_qgroup_extent_record *qexisting; int count_mod = 1; int must_insert_reserved = 0; @@ -618,10 +473,22 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info, head_ref = btrfs_delayed_node_to_head(ref); head_ref->must_insert_reserved = must_insert_reserved; head_ref->is_data = is_data; - head_ref->ref_root = RB_ROOT; + INIT_LIST_HEAD(&head_ref->ref_list); head_ref->processing = 0; head_ref->total_ref_mod = count_mod; + /* Record qgroup extent info if provided */ + if (qrecord) { + qrecord->bytenr = bytenr; + qrecord->num_bytes = num_bytes; + qrecord->old_roots = NULL; + + qexisting = btrfs_qgroup_insert_dirty_extent(delayed_refs, + qrecord); + if (qexisting) + kfree(qrecord); + } + spin_lock_init(&head_ref->lock); mutex_init(&head_ref->mutex); @@ -659,10 +526,10 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, u64 num_bytes, u64 parent, u64 ref_root, int level, int action, int no_quota) { - struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_tree_ref *full_ref; struct btrfs_delayed_ref_root *delayed_refs; u64 seq = 0; + int ret; if (action == BTRFS_ADD_DELAYED_EXTENT) action = BTRFS_ADD_DELAYED_REF; @@ -693,21 +560,14 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info, trace_add_delayed_tree_ref(ref, full_ref, action); - spin_lock(&head_ref->lock); - existing = tree_insert(&head_ref->ref_root, &ref->rb_node); - if (existing) { - update_existing_ref(trans, delayed_refs, head_ref, existing, - ref); - /* - * we've updated the existing ref, free the newly - * allocated ref - */ + ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref); + + /* + * XXX: memory should be freed at the same level allocated. + * But bad practice is anywhere... Follow it now. Need cleanup. + */ + if (ret > 0) kmem_cache_free(btrfs_delayed_tree_ref_cachep, full_ref); - } else { - atomic_inc(&delayed_refs->num_entries); - trans->delayed_ref_updates++; - } - spin_unlock(&head_ref->lock); } /* @@ -721,10 +581,10 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, u64 num_bytes, u64 parent, u64 ref_root, u64 owner, u64 offset, int action, int no_quota) { - struct btrfs_delayed_ref_node *existing; struct btrfs_delayed_data_ref *full_ref; struct btrfs_delayed_ref_root *delayed_refs; u64 seq = 0; + int ret; if (action == BTRFS_ADD_DELAYED_EXTENT) action = BTRFS_ADD_DELAYED_REF; @@ -758,21 +618,10 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info, trace_add_delayed_data_ref(ref, full_ref, action); - spin_lock(&head_ref->lock); - existing = tree_insert(&head_ref->ref_root, &ref->rb_node); - if (existing) { - update_existing_ref(trans, delayed_refs, head_ref, existing, - ref); - /* - * we've updated the existing ref, free the newly - * allocated ref - */ + ret = add_delayed_ref_tail_merge(trans, delayed_refs, head_ref, ref); + + if (ret > 0) kmem_cache_free(btrfs_delayed_data_ref_cachep, full_ref); - } else { - atomic_inc(&delayed_refs->num_entries); - trans->delayed_ref_updates++; - } - spin_unlock(&head_ref->lock); } /* @@ -790,6 +639,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_tree_ref *ref; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_qgroup_extent_record *record = NULL; if (!is_fstree(ref_root) || !fs_info->quota_enabled) no_quota = 0; @@ -800,9 +650,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, return -ENOMEM; head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS); - if (!head_ref) { - kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); - return -ENOMEM; + if (!head_ref) + goto free_ref; + + if (fs_info->quota_enabled && is_fstree(ref_root)) { + record = kmalloc(sizeof(*record), GFP_NOFS); + if (!record) + goto free_head_ref; } head_ref->extent_op = extent_op; @@ -814,7 +668,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, + head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, bytenr, num_bytes, action, 0); add_delayed_tree_ref(fs_info, trans, head_ref, &ref->node, bytenr, @@ -823,6 +677,13 @@ int btrfs_add_delayed_tree_ref(struct btrfs_fs_info *fs_info, spin_unlock(&delayed_refs->lock); return 0; + +free_head_ref: + kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref); +free_ref: + kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); + + return -ENOMEM; } /* @@ -839,6 +700,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, struct btrfs_delayed_data_ref *ref; struct btrfs_delayed_ref_head *head_ref; struct btrfs_delayed_ref_root *delayed_refs; + struct btrfs_qgroup_extent_record *record = NULL; if (!is_fstree(ref_root) || !fs_info->quota_enabled) no_quota = 0; @@ -854,6 +716,16 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, return -ENOMEM; } + if (fs_info->quota_enabled && is_fstree(ref_root)) { + record = kmalloc(sizeof(*record), GFP_NOFS); + if (!record) { + kmem_cache_free(btrfs_delayed_data_ref_cachep, ref); + kmem_cache_free(btrfs_delayed_ref_head_cachep, + head_ref); + return -ENOMEM; + } + } + head_ref->extent_op = extent_op; delayed_refs = &trans->transaction->delayed_refs; @@ -863,7 +735,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_fs_info *fs_info, * insert both the head node and the new ref without dropping * the spin lock */ - head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, + head_ref = add_delayed_ref_head(fs_info, trans, &head_ref->node, record, bytenr, num_bytes, action, 1); add_delayed_data_ref(fs_info, trans, head_ref, &ref->node, bytenr, @@ -891,9 +763,9 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info, delayed_refs = &trans->transaction->delayed_refs; spin_lock(&delayed_refs->lock); - add_delayed_ref_head(fs_info, trans, &head_ref->node, bytenr, - num_bytes, BTRFS_UPDATE_DELAYED_HEAD, - extent_op->is_data); + add_delayed_ref_head(fs_info, trans, &head_ref->node, NULL, bytenr, + num_bytes, BTRFS_UPDATE_DELAYED_HEAD, + extent_op->is_data); spin_unlock(&delayed_refs->lock); return 0; diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 5eb089239..13fb5e609 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -24,9 +24,25 @@ #define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */ #define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */ +/* + * XXX: Qu: I really hate the design that ref_head and tree/data ref shares the + * same ref_node structure. + * Ref_head is in a higher logic level than tree/data ref, and duplicated + * bytenr/num_bytes in ref_node is really a waste or memory, they should be + * referred from ref_head. + * This gets more disgusting after we use list to store tree/data ref in + * ref_head. Must clean this mess up later. + */ struct btrfs_delayed_ref_node { + /* + * ref_head use rb tree, stored in ref_root->href. + * indexed by bytenr + */ struct rb_node rb_node; + /*data/tree ref use list, stored in ref_head->ref_list. */ + struct list_head list; + /* the starting bytenr of the extent */ u64 bytenr; @@ -83,7 +99,7 @@ struct btrfs_delayed_ref_head { struct mutex mutex; spinlock_t lock; - struct rb_root ref_root; + struct list_head ref_list; struct rb_node href_node; @@ -132,6 +148,9 @@ struct btrfs_delayed_ref_root { /* head ref rbtree */ struct rb_root href_root; + /* dirty extent records */ + struct rb_root dirty_extent_root; + /* this spin lock protects the rbtree and the entries inside */ spinlock_t lock; @@ -156,6 +175,14 @@ struct btrfs_delayed_ref_root { int flushing; u64 run_delayed_start; + + /* + * To make qgroup to skip given root. + * This is for snapshot, as btrfs_qgroup_inherit() will manully + * modify counters for snapshot and its source, so we should skip + * the snapshot in new_root/old_roots or it will get calculated twice + */ + u64 qgroup_to_skip; }; extern struct kmem_cache *btrfs_delayed_ref_head_cachep; diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 0573848c7..564a7de17 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -376,6 +376,10 @@ int btrfs_dev_replace_start(struct btrfs_root *root, WARN_ON(!tgt_device); dev_replace->tgtdev = tgt_device; + ret = btrfs_kobj_add_device(tgt_device->fs_devices, tgt_device); + if (ret) + btrfs_err(root->fs_info, "kobj add dev failed %d\n", ret); + printk_in_rcu(KERN_INFO "BTRFS: dev_replace from %s (devid %llu) to %s started\n", src_device->missing ? "<missing disk>" : @@ -583,8 +587,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&uuid_mutex); /* replace the sysfs entry */ - btrfs_kobj_rm_device(fs_info, src_device); - btrfs_kobj_add_device(fs_info, tgt_device); + btrfs_kobj_rm_device(fs_info->fs_devices, src_device); btrfs_rm_dev_replace_free_srcdev(fs_info, src_device); /* write back the superblocks */ diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 2ef9a4b72..f556c3732 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1149,12 +1149,12 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, buf = btrfs_find_create_tree_block(root, bytenr); if (!buf) - return NULL; + return ERR_PTR(-ENOMEM); ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); if (ret) { free_extent_buffer(buf); - return NULL; + return ERR_PTR(ret); } return buf; @@ -1509,20 +1509,19 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, generation = btrfs_root_generation(&root->root_item); root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), generation); - if (!root->node) { - ret = -ENOMEM; + if (IS_ERR(root->node)) { + ret = PTR_ERR(root->node); goto find_fail; } else if (!btrfs_buffer_uptodate(root->node, generation, 0)) { ret = -EIO; - goto read_fail; + free_extent_buffer(root->node); + goto find_fail; } root->commit_root = btrfs_root_node(root); out: btrfs_free_path(path); return root; -read_fail: - free_extent_buffer(root->node); find_fail: kfree(root); alloc_fail: @@ -1745,13 +1744,14 @@ static void end_workqueue_fn(struct btrfs_work *work) bio->bi_private = end_io_wq->private; bio->bi_end_io = end_io_wq->end_io; kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq); - bio_endio_nodec(bio, error); + bio_endio(bio, error); } static int cleaner_kthread(void *arg) { struct btrfs_root *root = arg; int again; + struct btrfs_trans_handle *trans; do { again = 0; @@ -1773,7 +1773,6 @@ static int cleaner_kthread(void *arg) } btrfs_run_delayed_iputs(root); - btrfs_delete_unused_bgs(root->fs_info); again = btrfs_clean_one_deleted_snapshot(root); mutex_unlock(&root->fs_info->cleaner_mutex); @@ -1782,6 +1781,16 @@ static int cleaner_kthread(void *arg) * needn't do anything special here. */ btrfs_run_defrag_inodes(root->fs_info); + + /* + * Acquires fs_info->delete_unused_bgs_mutex to avoid racing + * with relocation (btrfs_relocate_chunk) and relocation + * acquires fs_info->cleaner_mutex (btrfs_relocate_block_group) + * after acquiring fs_info->delete_unused_bgs_mutex. So we + * can't hold, nor need to, fs_info->cleaner_mutex when deleting + * unused block groups. + */ + btrfs_delete_unused_bgs(root->fs_info); sleep: if (!try_to_freeze() && !again) { set_current_state(TASK_INTERRUPTIBLE); @@ -1790,6 +1799,34 @@ sleep: __set_current_state(TASK_RUNNING); } } while (!kthread_should_stop()); + + /* + * Transaction kthread is stopped before us and wakes us up. + * However we might have started a new transaction and COWed some + * tree blocks when deleting unused block groups for example. So + * make sure we commit the transaction we started to have a clean + * shutdown when evicting the btree inode - if it has dirty pages + * when we do the final iput() on it, eviction will trigger a + * writeback for it which will fail with null pointer dereferences + * since work queues and other resources were already released and + * destroyed by the time the iput/eviction/writeback is made. + */ + trans = btrfs_attach_transaction(root); + if (IS_ERR(trans)) { + if (PTR_ERR(trans) != -ENOENT) + btrfs_err(root->fs_info, + "cleaner transaction attach returned %ld", + PTR_ERR(trans)); + } else { + int ret; + + ret = btrfs_commit_transaction(trans, root); + if (ret) + btrfs_err(root->fs_info, + "cleaner open transaction commit returned %d", + ret); + } + return 0; } @@ -2320,8 +2357,12 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info, log_tree_root->node = read_tree_block(tree_root, bytenr, fs_info->generation + 1); - if (!log_tree_root->node || - !extent_buffer_uptodate(log_tree_root->node)) { + if (IS_ERR(log_tree_root->node)) { + printk(KERN_ERR "BTRFS: failed to read log tree\n"); + ret = PTR_ERR(log_tree_root->node); + kfree(log_tree_root); + return ret; + } else if (!extent_buffer_uptodate(log_tree_root->node)) { printk(KERN_ERR "BTRFS: failed to read log tree\n"); free_extent_buffer(log_tree_root->node); kfree(log_tree_root); @@ -2489,12 +2530,12 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->unused_bgs_lock); rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->unused_bg_unpin_mutex); + mutex_init(&fs_info->delete_unused_bgs_mutex); mutex_init(&fs_info->reloc_mutex); mutex_init(&fs_info->delalloc_root_mutex); seqlock_init(&fs_info->profiles_lock); init_rwsem(&fs_info->delayed_iput_sem); - init_completion(&fs_info->kobj_unregister); INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); INIT_LIST_HEAD(&fs_info->space_info); INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); @@ -2797,10 +2838,11 @@ int open_ctree(struct super_block *sb, chunk_root->node = read_tree_block(chunk_root, btrfs_super_chunk_root(disk_super), generation); - if (!chunk_root->node || - !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { + if (IS_ERR(chunk_root->node) || + !extent_buffer_uptodate(chunk_root->node)) { printk(KERN_ERR "BTRFS: failed to read chunk root on %s\n", sb->s_id); + chunk_root->node = NULL; goto fail_tree_roots; } btrfs_set_root_node(&chunk_root->root_item, chunk_root->node); @@ -2834,11 +2876,11 @@ retry_root_backup: tree_root->node = read_tree_block(tree_root, btrfs_super_root(disk_super), generation); - if (!tree_root->node || - !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) { + if (IS_ERR(tree_root->node) || + !extent_buffer_uptodate(tree_root->node)) { printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n", sb->s_id); - + tree_root->node = NULL; goto recovery_tree_root; } @@ -2874,10 +2916,22 @@ retry_root_backup: btrfs_close_extra_devices(fs_devices, 1); + ret = btrfs_sysfs_add_fsid(fs_devices, NULL); + if (ret) { + pr_err("BTRFS: failed to init sysfs fsid interface: %d\n", ret); + goto fail_block_groups; + } + + ret = btrfs_sysfs_add_device(fs_devices); + if (ret) { + pr_err("BTRFS: failed to init sysfs device interface: %d\n", ret); + goto fail_fsdev_sysfs; + } + ret = btrfs_sysfs_add_one(fs_info); if (ret) { pr_err("BTRFS: failed to init sysfs interface: %d\n", ret); - goto fail_block_groups; + goto fail_fsdev_sysfs; } ret = btrfs_init_space_info(fs_info); @@ -3055,6 +3109,9 @@ fail_cleaner: fail_sysfs: btrfs_sysfs_remove_one(fs_info); +fail_fsdev_sysfs: + btrfs_sysfs_remove_fsid(fs_info->fs_devices); + fail_block_groups: btrfs_put_block_group_cache(fs_info); btrfs_free_block_groups(fs_info); @@ -3269,11 +3326,8 @@ static int write_dev_supers(struct btrfs_device *device, */ static void btrfs_end_empty_barrier(struct bio *bio, int err) { - if (err) { - if (err == -EOPNOTSUPP) - set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); + if (err) clear_bit(BIO_UPTODATE, &bio->bi_flags); - } if (bio->bi_private) complete(bio->bi_private); bio_put(bio); @@ -3301,11 +3355,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait) wait_for_completion(&device->flush_wait); - if (bio_flagged(bio, BIO_EOPNOTSUPP)) { - printk_in_rcu("BTRFS: disabling barriers on dev %s\n", - rcu_str_deref(device->name)); - device->nobarriers = 1; - } else if (!bio_flagged(bio, BIO_UPTODATE)) { + if (!bio_flagged(bio, BIO_UPTODATE)) { ret = -EIO; btrfs_dev_stat_inc_and_print(device, BTRFS_DEV_STAT_FLUSH_ERRS); @@ -3732,6 +3782,7 @@ void close_ctree(struct btrfs_root *root) } btrfs_sysfs_remove_one(fs_info); + btrfs_sysfs_remove_fsid(fs_info->fs_devices); btrfs_free_fs_roots(fs_info); @@ -4060,6 +4111,7 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, while ((node = rb_first(&delayed_refs->href_root)) != NULL) { struct btrfs_delayed_ref_head *head; + struct btrfs_delayed_ref_node *tmp; bool pin_bytes = false; head = rb_entry(node, struct btrfs_delayed_ref_head, @@ -4075,11 +4127,10 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, continue; } spin_lock(&head->lock); - while ((node = rb_first(&head->ref_root)) != NULL) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, - rb_node); + list_for_each_entry_safe_reverse(ref, tmp, &head->ref_list, + list) { ref->in_tree = 0; - rb_erase(&ref->rb_node, &head->ref_root); + list_del(&ref->list); atomic_dec(&delayed_refs->num_entries); btrfs_put_delayed_ref(ref); } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0ec3acd14..07204bf60 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -79,11 +79,10 @@ static int update_block_group(struct btrfs_trans_handle *trans, u64 num_bytes, int alloc); static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, + struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, u64 owner_objectid, u64 owner_offset, int refs_to_drop, - struct btrfs_delayed_extent_op *extra_op, - int no_quota); + struct btrfs_delayed_extent_op *extra_op); static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op, struct extent_buffer *leaf, struct btrfs_extent_item *ei); @@ -1967,10 +1966,9 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u64 num_bytes, + struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, u64 owner, u64 offset, int refs_to_add, - int no_quota, struct btrfs_delayed_extent_op *extent_op) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -1978,9 +1976,11 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_extent_item *item; struct btrfs_key key; + u64 bytenr = node->bytenr; + u64 num_bytes = node->num_bytes; u64 refs; int ret; - enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_ADD_EXCL; + int no_quota = node->no_quota; path = btrfs_alloc_path(); if (!path) @@ -1996,26 +1996,8 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, bytenr, num_bytes, parent, root_objectid, owner, offset, refs_to_add, extent_op); - if ((ret < 0 && ret != -EAGAIN) || (!ret && no_quota)) + if ((ret < 0 && ret != -EAGAIN) || !ret) goto out; - /* - * Ok we were able to insert an inline extent and it appears to be a new - * reference, deal with the qgroup accounting. - */ - if (!ret && !no_quota) { - ASSERT(root->fs_info->quota_enabled); - leaf = path->nodes[0]; - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - item = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_extent_item); - if (btrfs_extent_refs(leaf, item) > (u64)refs_to_add) - type = BTRFS_QGROUP_OPER_ADD_SHARED; - btrfs_release_path(path); - - ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, - bytenr, num_bytes, type, 0); - goto out; - } /* * Ok we had -EAGAIN which means we didn't have space to insert and @@ -2026,8 +2008,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); refs = btrfs_extent_refs(leaf, item); - if (refs) - type = BTRFS_QGROUP_OPER_ADD_SHARED; btrfs_set_extent_refs(leaf, item, refs + refs_to_add); if (extent_op) __run_delayed_extent_op(extent_op, leaf, item); @@ -2035,13 +2015,6 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - if (!no_quota) { - ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, - bytenr, num_bytes, type, 0); - if (ret) - goto out; - } - path->reada = 1; path->leave_spinning = 1; /* now insert the actual backref */ @@ -2087,17 +2060,15 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, ref->objectid, ref->offset, &ins, node->ref_mod); } else if (node->action == BTRFS_ADD_DELAYED_REF) { - ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, - node->num_bytes, parent, + ret = __btrfs_inc_extent_ref(trans, root, node, parent, ref_root, ref->objectid, ref->offset, node->ref_mod, - node->no_quota, extent_op); + extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { - ret = __btrfs_free_extent(trans, root, node->bytenr, - node->num_bytes, parent, + ret = __btrfs_free_extent(trans, root, node, parent, ref_root, ref->objectid, ref->offset, node->ref_mod, - extent_op, node->no_quota); + extent_op); } else { BUG(); } @@ -2255,15 +2226,14 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, ref->level, &ins, node->no_quota); } else if (node->action == BTRFS_ADD_DELAYED_REF) { - ret = __btrfs_inc_extent_ref(trans, root, node->bytenr, - node->num_bytes, parent, ref_root, - ref->level, 0, 1, node->no_quota, + ret = __btrfs_inc_extent_ref(trans, root, node, + parent, ref_root, + ref->level, 0, 1, extent_op); } else if (node->action == BTRFS_DROP_DELAYED_REF) { - ret = __btrfs_free_extent(trans, root, node->bytenr, - node->num_bytes, parent, ref_root, - ref->level, 0, 1, extent_op, - node->no_quota); + ret = __btrfs_free_extent(trans, root, node, + parent, ref_root, + ref->level, 0, 1, extent_op); } else { BUG(); } @@ -2323,28 +2293,27 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, return ret; } -static noinline struct btrfs_delayed_ref_node * +static inline struct btrfs_delayed_ref_node * select_delayed_ref(struct btrfs_delayed_ref_head *head) { - struct rb_node *node; - struct btrfs_delayed_ref_node *ref, *last = NULL;; + struct btrfs_delayed_ref_node *ref; + + if (list_empty(&head->ref_list)) + return NULL; /* - * select delayed ref of type BTRFS_ADD_DELAYED_REF first. - * this prevents ref count from going down to zero when - * there still are pending delayed ref. + * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first. + * This is to prevent a ref count from going down to zero, which deletes + * the extent item from the extent tree, when there still are references + * to add, which would fail because they would not find the extent item. */ - node = rb_first(&head->ref_root); - while (node) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, - rb_node); + list_for_each_entry(ref, &head->ref_list, list) { if (ref->action == BTRFS_ADD_DELAYED_REF) return ref; - else if (last == NULL) - last = ref; - node = rb_next(node); } - return last; + + return list_entry(head->ref_list.next, struct btrfs_delayed_ref_node, + list); } /* @@ -2396,16 +2365,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, } } - /* - * We need to try and merge add/drops of the same ref since we - * can run into issues with relocate dropping the implicit ref - * and then it being added back again before the drop can - * finish. If we merged anything we need to re-loop so we can - * get a good ref. - */ spin_lock(&locked_ref->lock); - btrfs_merge_delayed_refs(trans, fs_info, delayed_refs, - locked_ref); /* * locked_ref is the head node, so we have to go one @@ -2482,7 +2442,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, spin_unlock(&locked_ref->lock); spin_lock(&delayed_refs->lock); spin_lock(&locked_ref->lock); - if (rb_first(&locked_ref->ref_root) || + if (!list_empty(&locked_ref->ref_list) || locked_ref->extent_op) { spin_unlock(&locked_ref->lock); spin_unlock(&delayed_refs->lock); @@ -2496,7 +2456,7 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, } else { actual_count++; ref->in_tree = 0; - rb_erase(&ref->rb_node, &locked_ref->ref_root); + list_del(&ref->list); } atomic_dec(&delayed_refs->num_entries); @@ -2864,9 +2824,6 @@ again: goto again; } out: - ret = btrfs_delayed_qgroup_accounting(trans, root->fs_info); - if (ret) - return ret; assert_qgroups_uptodate(trans); return 0; } @@ -2905,7 +2862,6 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *ref; struct btrfs_delayed_data_ref *data_ref; struct btrfs_delayed_ref_root *delayed_refs; - struct rb_node *node; int ret = 0; delayed_refs = &trans->transaction->delayed_refs; @@ -2934,11 +2890,7 @@ static noinline int check_delayed_ref(struct btrfs_trans_handle *trans, spin_unlock(&delayed_refs->lock); spin_lock(&head->lock); - node = rb_first(&head->ref_root); - while (node) { - ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node); - node = rb_next(node); - + list_for_each_entry(ref, &head->ref_list, list) { /* If it's a shared ref we know a cross reference exists */ if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) { ret = 1; @@ -3693,7 +3645,8 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->disk_total += total_bytes * factor; found->bytes_used += bytes_used; found->disk_used += bytes_used * factor; - found->full = 0; + if (total_bytes > 0) + found->full = 0; spin_unlock(&found->lock); *space_info = found; return 0; @@ -3721,7 +3674,10 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->bytes_reserved = 0; found->bytes_readonly = 0; found->bytes_may_use = 0; - found->full = 0; + if (total_bytes > 0) + found->full = 0; + else + found->full = 1; found->force_alloc = CHUNK_ALLOC_NO_FORCE; found->chunk_alloc = 0; found->flush = 0; @@ -3975,6 +3931,9 @@ commit_trans: !atomic_read(&root->fs_info->open_ioctl_trans)) { need_commit--; + if (need_commit > 0) + btrfs_wait_ordered_roots(fs_info, -1); + trans = btrfs_join_transaction(root); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -4088,7 +4047,7 @@ static int should_alloc_chunk(struct btrfs_root *root, return 1; } -static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type) +static u64 get_profile_num_devs(struct btrfs_root *root, u64 type) { u64 num_dev; @@ -4102,24 +4061,43 @@ static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type) else num_dev = 1; /* DUP or single */ - /* metadata for updaing devices and chunk tree */ - return btrfs_calc_trans_metadata_size(root, num_dev + 1); + return num_dev; } -static void check_system_chunk(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u64 type) +/* + * If @is_allocation is true, reserve space in the system space info necessary + * for allocating a chunk, otherwise if it's false, reserve space necessary for + * removing a chunk. + */ +void check_system_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 type) { struct btrfs_space_info *info; u64 left; u64 thresh; + int ret = 0; + u64 num_devs; + + /* + * Needed because we can end up allocating a system chunk and for an + * atomic and race free space reservation in the chunk block reserve. + */ + ASSERT(mutex_is_locked(&root->fs_info->chunk_mutex)); info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM); spin_lock(&info->lock); left = info->total_bytes - info->bytes_used - info->bytes_pinned - - info->bytes_reserved - info->bytes_readonly; + info->bytes_reserved - info->bytes_readonly - + info->bytes_may_use; spin_unlock(&info->lock); - thresh = get_system_chunk_thresh(root, type); + num_devs = get_profile_num_devs(root, type); + + /* num_devs device items to update and 1 chunk item to add or remove */ + thresh = btrfs_calc_trunc_metadata_size(root, num_devs) + + btrfs_calc_trans_metadata_size(root, 1); + if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) { btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu", left, thresh, type); @@ -4130,7 +4108,21 @@ static void check_system_chunk(struct btrfs_trans_handle *trans, u64 flags; flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0); - btrfs_alloc_chunk(trans, root, flags); + /* + * Ignore failure to create system chunk. We might end up not + * needing it, as we might not need to COW all nodes/leafs from + * the paths we visit in the chunk tree (they were already COWed + * or created in the current transaction for example). + */ + ret = btrfs_alloc_chunk(trans, root, flags); + } + + if (!ret) { + ret = btrfs_block_rsv_add(root->fs_info->chunk_root, + &root->fs_info->chunk_block_rsv, + thresh, BTRFS_RESERVE_NO_FLUSH); + if (!ret) + trans->chunk_bytes_reserved += thresh; } } @@ -4235,6 +4227,24 @@ out: space_info->chunk_alloc = 0; spin_unlock(&space_info->lock); mutex_unlock(&fs_info->chunk_mutex); + /* + * When we allocate a new chunk we reserve space in the chunk block + * reserve to make sure we can COW nodes/leafs in the chunk tree or + * add new nodes/leafs to it if we end up needing to do it when + * inserting the chunk item and updating device items as part of the + * second phase of chunk allocation, performed by + * btrfs_finish_chunk_alloc(). So make sure we don't accumulate a + * large number of new block groups to create in our transaction + * handle's new_bgs list to avoid exhausting the chunk block reserve + * in extreme cases - like having a single transaction create many new + * block groups when starting to write out the free space caches of all + * the block groups that were made dirty during the lifetime of the + * transaction. + */ + if (trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) { + btrfs_create_pending_block_groups(trans, trans->root); + btrfs_trans_release_chunk_metadata(trans); + } return ret; } @@ -5188,6 +5198,24 @@ void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans, trans->bytes_reserved = 0; } +/* + * To be called after all the new block groups attached to the transaction + * handle have been created (btrfs_create_pending_block_groups()). + */ +void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) +{ + struct btrfs_fs_info *fs_info = trans->root->fs_info; + + if (!trans->chunk_bytes_reserved) + return; + + WARN_ON_ONCE(!list_empty(&trans->new_bgs)); + + block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL, + trans->chunk_bytes_reserved); + trans->chunk_bytes_reserved = 0; +} + /* Can only return 0 or -ENOSPC */ int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans, struct inode *inode) @@ -6092,11 +6120,10 @@ static void add_pinned_bytes(struct btrfs_fs_info *fs_info, u64 num_bytes, static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u64 num_bytes, u64 parent, + struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, u64 owner_objectid, u64 owner_offset, int refs_to_drop, - struct btrfs_delayed_extent_op *extent_op, - int no_quota) + struct btrfs_delayed_extent_op *extent_op) { struct btrfs_key key; struct btrfs_path *path; @@ -6110,10 +6137,12 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, int extent_slot = 0; int found_extent = 0; int num_to_del = 1; + int no_quota = node->no_quota; u32 item_size; u64 refs; + u64 bytenr = node->bytenr; + u64 num_bytes = node->num_bytes; int last_ref = 0; - enum btrfs_qgroup_operation_type type = BTRFS_QGROUP_OPER_SUB_EXCL; bool skinny_metadata = btrfs_fs_incompat(root->fs_info, SKINNY_METADATA); @@ -6294,7 +6323,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, refs -= refs_to_drop; if (refs > 0) { - type = BTRFS_QGROUP_OPER_SUB_SHARED; if (extent_op) __run_delayed_extent_op(extent_op, leaf, ei); /* @@ -6356,18 +6384,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } btrfs_release_path(path); - /* Deal with the quota accounting */ - if (!ret && last_ref && !no_quota) { - int mod_seq = 0; - - if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID && - type == BTRFS_QGROUP_OPER_SUB_SHARED) - mod_seq = 1; - - ret = btrfs_qgroup_record_ref(trans, info, root_objectid, - bytenr, num_bytes, type, - mod_seq); - } out: btrfs_free_path(path); return ret; @@ -6393,7 +6409,7 @@ static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans, goto out_delayed_unlock; spin_lock(&head->lock); - if (rb_first(&head->ref_root)) + if (!list_empty(&head->ref_list)) goto out; if (head->extent_op) { @@ -7303,13 +7319,6 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); - /* Always set parent to 0 here since its exclusive anyway. */ - ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, - ins->objectid, ins->offset, - BTRFS_QGROUP_OPER_ADD_EXCL, 0); - if (ret) - return ret; - ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", @@ -7391,14 +7400,6 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); - if (!no_quota) { - ret = btrfs_qgroup_record_ref(trans, fs_info, root_objectid, - ins->objectid, num_bytes, - BTRFS_QGROUP_OPER_ADD_EXCL, 0); - if (ret) - return ret; - } - ret = update_block_group(trans, root, ins->objectid, root->nodesize, 1); if (ret) { /* -ENOENT, logic error */ @@ -7755,12 +7756,18 @@ reada: wc->reada_slot = slot; } +/* + * TODO: Modify related function to add related node/leaf to dirty_extent_root, + * for later qgroup accounting. + * + * Current, this function does nothing. + */ static int account_leaf_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *eb) { int nr = btrfs_header_nritems(eb); - int i, extent_type, ret; + int i, extent_type; struct btrfs_key key; struct btrfs_file_extent_item *fi; u64 bytenr, num_bytes; @@ -7783,13 +7790,6 @@ static int account_leaf_items(struct btrfs_trans_handle *trans, continue; num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi); - - ret = btrfs_qgroup_record_ref(trans, root->fs_info, - root->objectid, - bytenr, num_bytes, - BTRFS_QGROUP_OPER_SUB_SUBTREE, 0); - if (ret) - return ret; } return 0; } @@ -7858,6 +7858,8 @@ static int adjust_slots_upwards(struct btrfs_root *root, /* * root_eb is the subtree root and is locked before this function is called. + * TODO: Modify this function to mark all (including complete shared node) + * to dirty_extent_root to allow it get accounted in qgroup. */ static int account_shared_subtree(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -7920,7 +7922,11 @@ walk_down: child_gen = btrfs_node_ptr_generation(eb, parent_slot); eb = read_tree_block(root, child_bytenr, child_gen); - if (!eb || !extent_buffer_uptodate(eb)) { + if (IS_ERR(eb)) { + ret = PTR_ERR(eb); + goto out; + } else if (!extent_buffer_uptodate(eb)) { + free_extent_buffer(eb); ret = -EIO; goto out; } @@ -7931,16 +7937,6 @@ walk_down: btrfs_tree_read_lock(eb); btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK); path->locks[level] = BTRFS_READ_LOCK_BLOCKING; - - ret = btrfs_qgroup_record_ref(trans, root->fs_info, - root->objectid, - child_bytenr, - root->nodesize, - BTRFS_QGROUP_OPER_SUB_SUBTREE, - 0); - if (ret) - goto out; - } if (level == 0) { @@ -8151,7 +8147,9 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, if (reada && level == 1) reada_walk_down(trans, root, wc, path); next = read_tree_block(root, bytenr, generation); - if (!next || !extent_buffer_uptodate(next)) { + if (IS_ERR(next)) { + return PTR_ERR(next); + } else if (!extent_buffer_uptodate(next)) { free_extent_buffer(next); return -EIO; } @@ -8533,24 +8531,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root, goto out_end_trans; } - /* - * Qgroup update accounting is run from - * delayed ref handling. This usually works - * out because delayed refs are normally the - * only way qgroup updates are added. However, - * we may have added updates during our tree - * walk so run qgroups here to make sure we - * don't lose any updates. - */ - ret = btrfs_delayed_qgroup_accounting(trans, - root->fs_info); - if (ret) - printk_ratelimited(KERN_ERR "BTRFS: Failure %d " - "running qgroup updates " - "during snapshot delete. " - "Quota is out of sync, " - "rescan required.\n", ret); - btrfs_end_transaction_throttle(trans, tree_root); if (!for_reloc && btrfs_need_cleaner_sleep(root)) { pr_debug("BTRFS: drop snapshot early exit\n"); @@ -8604,14 +8584,6 @@ int btrfs_drop_snapshot(struct btrfs_root *root, } root_dropped = true; out_end_trans: - ret = btrfs_delayed_qgroup_accounting(trans, tree_root->fs_info); - if (ret) - printk_ratelimited(KERN_ERR "BTRFS: Failure %d " - "running qgroup updates " - "during snapshot delete. " - "Quota is out of sync, " - "rescan required.\n", ret); - btrfs_end_transaction_throttle(trans, tree_root); out_free: kfree(wc); @@ -9562,6 +9534,19 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, free_excluded_extents(root, cache); + /* + * Call to ensure the corresponding space_info object is created and + * assigned to our block group, but don't update its counters just yet. + * We want our bg to be added to the rbtree with its ->space_info set. + */ + ret = update_space_info(root->fs_info, cache->flags, 0, 0, + &cache->space_info); + if (ret) { + btrfs_remove_free_space_cache(cache); + btrfs_put_block_group(cache); + return ret; + } + ret = btrfs_add_block_group_cache(root->fs_info, cache); if (ret) { btrfs_remove_free_space_cache(cache); @@ -9569,6 +9554,10 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, return ret; } + /* + * Now that our block group has its ->space_info set and is inserted in + * the rbtree, update the space info's counters. + */ ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, &cache->space_info); if (ret) { @@ -9931,6 +9920,8 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) } spin_unlock(&fs_info->unused_bgs_lock); + mutex_lock(&root->fs_info->delete_unused_bgs_mutex); + /* Don't want to race with allocators so take the groups_sem */ down_write(&space_info->groups_sem); spin_lock(&block_group->lock); @@ -10025,6 +10016,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) end_trans: btrfs_end_transaction(trans, root); next: + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); btrfs_put_block_group(block_group); spin_lock(&fs_info->unused_bgs_lock); } diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h new file mode 100644 index 000000000..e69de29bb --- /dev/null +++ b/fs/btrfs/extent-tree.h diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c32d226bf..02d05817c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1277,7 +1277,12 @@ int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, gfp_t mask) { - return clear_extent_bit(tree, start, end, bits, 0, 0, NULL, mask); + int wake = 0; + + if (bits & EXTENT_LOCKED) + wake = 1; + + return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask); } int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, @@ -2767,8 +2772,6 @@ static int __must_check submit_one_bio(int rw, struct bio *bio, else btrfsic_submit_bio(rw, bio); - if (bio_flagged(bio, BIO_EOPNOTSUPP)) - ret = -EOPNOTSUPP; bio_put(bio); return ret; } @@ -4492,6 +4495,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) flags |= FIEMAP_EXTENT_ENCODED; + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + flags |= FIEMAP_EXTENT_UNWRITTEN; free_extent_map(em); em = NULL; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index b072e1747..b823fac91 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1748,7 +1748,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, } current->backing_dev_info = inode_to_bdi(inode); - err = file_remove_suid(file); + err = file_remove_privs(file); if (err) { mutex_unlock(&inode->i_mutex); goto out; @@ -1868,6 +1868,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) struct btrfs_log_ctx ctx; int ret = 0; bool full_sync = 0; + const u64 len = end - start + 1; trace_btrfs_sync_file(file, datasync); @@ -1896,7 +1897,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * all extents are persisted and the respective file extent * items are in the fs/subvol btree. */ - ret = btrfs_wait_ordered_range(inode, start, end - start + 1); + ret = btrfs_wait_ordered_range(inode, start, len); } else { /* * Start any new ordered operations before starting to log the @@ -1968,8 +1969,10 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ smp_mb(); if (btrfs_inode_in_log(inode, root->fs_info->generation) || - (full_sync && BTRFS_I(inode)->last_trans <= - root->fs_info->last_trans_committed)) { + (BTRFS_I(inode)->last_trans <= + root->fs_info->last_trans_committed && + (full_sync || + !btrfs_have_ordered_extents_in_range(inode, start, len)))) { /* * We'v had everything committed since the last time we were * modified so clear this flag in case it was set for whatever diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 9dbe5b548..fb5a6b1c6 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -231,6 +231,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, { int ret = 0; struct btrfs_path *path = btrfs_alloc_path(); + bool locked = false; if (!path) { ret = -ENOMEM; @@ -238,6 +239,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, } if (block_group) { + locked = true; mutex_lock(&trans->transaction->cache_write_mutex); if (!list_empty(&block_group->io_list)) { list_del_init(&block_group->io_list); @@ -269,18 +271,14 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root, */ ret = btrfs_truncate_inode_items(trans, root, inode, 0, BTRFS_EXTENT_DATA_KEY); - if (ret) { - mutex_unlock(&trans->transaction->cache_write_mutex); - btrfs_abort_transaction(trans, root, ret); - return ret; - } + if (ret) + goto fail; ret = btrfs_update_inode(trans, root, inode); - if (block_group) - mutex_unlock(&trans->transaction->cache_write_mutex); - fail: + if (locked) + mutex_unlock(&trans->transaction->cache_write_mutex); if (ret) btrfs_abort_transaction(trans, root, ret); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8bb013672..e33dff356 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4209,7 +4209,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u64 extent_num_bytes = 0; u64 extent_offset = 0; u64 item_end = 0; - u64 last_size = (u64)-1; + u64 last_size = new_size; u32 found_type = (u8)-1; int found_extent; int del_item; @@ -4493,8 +4493,7 @@ out: btrfs_abort_transaction(trans, root, ret); } error: - if (last_size != (u64)-1 && - root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) btrfs_ordered_update_i_size(inode, last_size, NULL); btrfs_free_path(path); @@ -4986,24 +4985,41 @@ static void evict_inode_truncate_pages(struct inode *inode) } write_unlock(&map_tree->lock); + /* + * Keep looping until we have no more ranges in the io tree. + * We can have ongoing bios started by readpages (called from readahead) + * that have their endio callback (extent_io.c:end_bio_extent_readpage) + * still in progress (unlocked the pages in the bio but did not yet + * unlocked the ranges in the io tree). Therefore this means some + * ranges can still be locked and eviction started because before + * submitting those bios, which are executed by a separate task (work + * queue kthread), inode references (inode->i_count) were not taken + * (which would be dropped in the end io callback of each bio). + * Therefore here we effectively end up waiting for those bios and + * anyone else holding locked ranges without having bumped the inode's + * reference count - if we don't do it, when they access the inode's + * io_tree to unlock a range it may be too late, leading to an + * use-after-free issue. + */ spin_lock(&io_tree->lock); while (!RB_EMPTY_ROOT(&io_tree->state)) { struct extent_state *state; struct extent_state *cached_state = NULL; + u64 start; + u64 end; node = rb_first(&io_tree->state); state = rb_entry(node, struct extent_state, rb_node); - atomic_inc(&state->refs); + start = state->start; + end = state->end; spin_unlock(&io_tree->lock); - lock_extent_bits(io_tree, state->start, state->end, - 0, &cached_state); - clear_extent_bit(io_tree, state->start, state->end, + lock_extent_bits(io_tree, start, end, 0, &cached_state); + clear_extent_bit(io_tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 1, 1, &cached_state, GFP_NOFS); - free_extent_state(state); cond_resched(); spin_lock(&io_tree->lock); @@ -7530,6 +7546,7 @@ unlock: current->journal_info = outstanding_extents; btrfs_free_reserved_data_space(inode, len); + set_bit(BTRFS_INODE_DIO_READY, &BTRFS_I(inode)->runtime_flags); } /* @@ -7855,8 +7872,6 @@ static void btrfs_endio_direct_write(struct bio *bio, int err) struct bio *dio_bio; int ret; - if (err) - goto out_done; again: ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, &ordered_offset, @@ -7879,7 +7894,6 @@ out_test: ordered = NULL; goto again; } -out_done: dio_bio = dip->dio_bio; kfree(dip); @@ -8147,9 +8161,8 @@ out_err: static void btrfs_submit_direct(int rw, struct bio *dio_bio, struct inode *inode, loff_t file_offset) { - struct btrfs_root *root = BTRFS_I(inode)->root; - struct btrfs_dio_private *dip; - struct bio *io_bio; + struct btrfs_dio_private *dip = NULL; + struct bio *io_bio = NULL; struct btrfs_io_bio *btrfs_bio; int skip_sum; int write = rw & REQ_WRITE; @@ -8166,7 +8179,7 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, dip = kzalloc(sizeof(*dip), GFP_NOFS); if (!dip) { ret = -ENOMEM; - goto free_io_bio; + goto free_ordered; } dip->private = dio_bio->bi_private; @@ -8194,25 +8207,55 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, if (btrfs_bio->end_io) btrfs_bio->end_io(btrfs_bio, ret); -free_io_bio: - bio_put(io_bio); free_ordered: /* - * If this is a write, we need to clean up the reserved space and kill - * the ordered extent. + * If we arrived here it means either we failed to submit the dip + * or we either failed to clone the dio_bio or failed to allocate the + * dip. If we cloned the dio_bio and allocated the dip, we can just + * call bio_endio against our io_bio so that we get proper resource + * cleanup if we fail to submit the dip, otherwise, we must do the + * same as btrfs_endio_direct_[write|read] because we can't call these + * callbacks - they require an allocated dip and a clone of dio_bio. */ - if (write) { - struct btrfs_ordered_extent *ordered; - ordered = btrfs_lookup_ordered_extent(inode, file_offset); - if (!test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags) && - !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) - btrfs_free_reserved_extent(root, ordered->start, - ordered->disk_len, 1); - btrfs_put_ordered_extent(ordered); - btrfs_put_ordered_extent(ordered); + if (io_bio && dip) { + bio_endio(io_bio, ret); + /* + * The end io callbacks free our dip, do the final put on io_bio + * and all the cleanup and final put for dio_bio (through + * dio_end_io()). + */ + dip = NULL; + io_bio = NULL; + } else { + if (write) { + struct btrfs_ordered_extent *ordered; + + ordered = btrfs_lookup_ordered_extent(inode, + file_offset); + set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); + /* + * Decrements our ref on the ordered extent and removes + * the ordered extent from the inode's ordered tree, + * doing all the proper resource cleanup such as for the + * reserved space and waking up any waiters for this + * ordered extent (through btrfs_remove_ordered_extent). + */ + btrfs_finish_ordered_io(ordered); + } else { + unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, + file_offset + dio_bio->bi_iter.bi_size - 1); + } + clear_bit(BIO_UPTODATE, &dio_bio->bi_flags); + /* + * Releases and cleans up our dio_bio, no need to bio_put() + * nor bio_endio()/bio_io_error() against dio_bio. + */ + dio_end_io(dio_bio, ret); } - bio_endio(dio_bio, ret); + if (io_bio) + bio_put(io_bio); + kfree(dip); } static ssize_t check_direct_IO(struct btrfs_root *root, struct kiocb *iocb, @@ -8314,9 +8357,18 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, btrfs_submit_direct, flags); if (iov_iter_rw(iter) == WRITE) { current->journal_info = NULL; - if (ret < 0 && ret != -EIOCBQUEUED) - btrfs_delalloc_release_space(inode, count); - else if (ret >= 0 && (size_t)ret < count) + if (ret < 0 && ret != -EIOCBQUEUED) { + /* + * If the error comes from submitting stage, + * btrfs_get_blocsk_direct() has free'd data space, + * and metadata space will be handled by + * finish_ordered_fn, don't do that again to make + * sure bytes_may_use is correct. + */ + if (!test_and_clear_bit(BTRFS_INODE_DIO_READY, + &BTRFS_I(inode)->runtime_flags)) + btrfs_delalloc_release_space(inode, count); + } else if (ret >= 0 && (size_t)ret < count) btrfs_delalloc_release_space(inode, count - (size_t)ret); } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 37d456a9a..0770c9158 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -87,7 +87,8 @@ struct btrfs_ioctl_received_subvol_args_32 { static int btrfs_clone(struct inode *src, struct inode *inode, - u64 off, u64 olen, u64 olen_aligned, u64 destoff); + u64 off, u64 olen, u64 olen_aligned, u64 destoff, + int no_time_update); /* Mask out flags that are inappropriate for the given type of inode. */ static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags) @@ -553,8 +554,8 @@ static noinline int create_subvol(struct inode *dir, key.offset = (u64)-1; new_root = btrfs_read_fs_root_no_name(root->fs_info, &key); if (IS_ERR(new_root)) { - btrfs_abort_transaction(trans, root, PTR_ERR(new_root)); ret = PTR_ERR(new_root); + btrfs_abort_transaction(trans, root, ret); goto fail; } @@ -1318,7 +1319,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, i = range->start >> PAGE_CACHE_SHIFT; } if (!max_to_defrag) - max_to_defrag = last_index + 1; + max_to_defrag = last_index - i + 1; /* * make writeback starts from i, so the defrag range can be @@ -1368,7 +1369,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, ra_index = max(i, ra_index); btrfs_force_ra(inode->i_mapping, ra, file, ra_index, cluster); - ra_index += max_cluster; + ra_index += cluster; } mutex_lock(&inode->i_mutex); @@ -2271,10 +2272,7 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file, { struct btrfs_ioctl_ino_lookup_args *args; struct inode *inode; - int ret; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; + int ret = 0; args = memdup_user(argp, sizeof(*args)); if (IS_ERR(args)) @@ -2282,13 +2280,28 @@ static noinline int btrfs_ioctl_ino_lookup(struct file *file, inode = file_inode(file); + /* + * Unprivileged query to obtain the containing subvolume root id. The + * path is reset so it's consistent with btrfs_search_path_in_tree. + */ if (args->treeid == 0) args->treeid = BTRFS_I(inode)->root->root_key.objectid; + if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) { + args->name[0] = 0; + goto out; + } + + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out; + } + ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info, args->treeid, args->objectid, args->name); +out: if (ret == 0 && copy_to_user(argp, args, sizeof(*args))) ret = -EFAULT; @@ -2753,14 +2766,11 @@ out: return ret; } -static struct page *extent_same_get_page(struct inode *inode, u64 off) +static struct page *extent_same_get_page(struct inode *inode, pgoff_t index) { struct page *page; - pgoff_t index; struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; - index = off >> PAGE_CACHE_SHIFT; - page = grab_cache_page(inode->i_mapping, index); if (!page) return NULL; @@ -2781,6 +2791,20 @@ static struct page *extent_same_get_page(struct inode *inode, u64 off) return page; } +static int gather_extent_pages(struct inode *inode, struct page **pages, + int num_pages, u64 off) +{ + int i; + pgoff_t index = off >> PAGE_CACHE_SHIFT; + + for (i = 0; i < num_pages; i++) { + pages[i] = extent_same_get_page(inode, index + i); + if (!pages[i]) + return -ENOMEM; + } + return 0; +} + static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) { /* do any pending delalloc/csum calc on src, one way or @@ -2806,52 +2830,120 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) } } -static void btrfs_double_unlock(struct inode *inode1, u64 loff1, - struct inode *inode2, u64 loff2, u64 len) +static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2) { - unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); - unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); - mutex_unlock(&inode1->i_mutex); mutex_unlock(&inode2->i_mutex); } -static void btrfs_double_lock(struct inode *inode1, u64 loff1, - struct inode *inode2, u64 loff2, u64 len) +static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2) +{ + if (inode1 < inode2) + swap(inode1, inode2); + + mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); + if (inode1 != inode2) + mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); +} + +static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, + struct inode *inode2, u64 loff2, u64 len) +{ + unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1); + unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); +} + +static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1, + struct inode *inode2, u64 loff2, u64 len) { if (inode1 < inode2) { swap(inode1, inode2); swap(loff1, loff2); } - - mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); lock_extent_range(inode1, loff1, len); - if (inode1 != inode2) { - mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); + if (inode1 != inode2) lock_extent_range(inode2, loff2, len); +} + +struct cmp_pages { + int num_pages; + struct page **src_pages; + struct page **dst_pages; +}; + +static void btrfs_cmp_data_free(struct cmp_pages *cmp) +{ + int i; + struct page *pg; + + for (i = 0; i < cmp->num_pages; i++) { + pg = cmp->src_pages[i]; + if (pg) + page_cache_release(pg); + pg = cmp->dst_pages[i]; + if (pg) + page_cache_release(pg); + } + kfree(cmp->src_pages); + kfree(cmp->dst_pages); +} + +static int btrfs_cmp_data_prepare(struct inode *src, u64 loff, + struct inode *dst, u64 dst_loff, + u64 len, struct cmp_pages *cmp) +{ + int ret; + int num_pages = PAGE_CACHE_ALIGN(len) >> PAGE_CACHE_SHIFT; + struct page **src_pgarr, **dst_pgarr; + + /* + * We must gather up all the pages before we initiate our + * extent locking. We use an array for the page pointers. Size + * of the array is bounded by len, which is in turn bounded by + * BTRFS_MAX_DEDUPE_LEN. + */ + src_pgarr = kzalloc(num_pages * sizeof(struct page *), GFP_NOFS); + dst_pgarr = kzalloc(num_pages * sizeof(struct page *), GFP_NOFS); + if (!src_pgarr || !dst_pgarr) { + kfree(src_pgarr); + kfree(dst_pgarr); + return -ENOMEM; } + cmp->num_pages = num_pages; + cmp->src_pages = src_pgarr; + cmp->dst_pages = dst_pgarr; + + ret = gather_extent_pages(src, cmp->src_pages, cmp->num_pages, loff); + if (ret) + goto out; + + ret = gather_extent_pages(dst, cmp->dst_pages, cmp->num_pages, dst_loff); + +out: + if (ret) + btrfs_cmp_data_free(cmp); + return 0; } static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst, - u64 dst_loff, u64 len) + u64 dst_loff, u64 len, struct cmp_pages *cmp) { int ret = 0; + int i; struct page *src_page, *dst_page; unsigned int cmp_len = PAGE_CACHE_SIZE; void *addr, *dst_addr; + i = 0; while (len) { if (len < PAGE_CACHE_SIZE) cmp_len = len; - src_page = extent_same_get_page(src, loff); - if (!src_page) - return -EINVAL; - dst_page = extent_same_get_page(dst, dst_loff); - if (!dst_page) { - page_cache_release(src_page); - return -EINVAL; - } + BUG_ON(i >= cmp->num_pages); + + src_page = cmp->src_pages[i]; + dst_page = cmp->dst_pages[i]; + addr = kmap_atomic(src_page); dst_addr = kmap_atomic(dst_page); @@ -2863,26 +2955,30 @@ static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst, kunmap_atomic(addr); kunmap_atomic(dst_addr); - page_cache_release(src_page); - page_cache_release(dst_page); if (ret) break; - loff += cmp_len; - dst_loff += cmp_len; len -= cmp_len; + i++; } return ret; } -static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len) +static int extent_same_check_offsets(struct inode *inode, u64 off, u64 *plen, + u64 olen) { + u64 len = *plen; u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize; - if (off + len > inode->i_size || off + len < off) + if (off + olen > inode->i_size || off + olen < off) return -EINVAL; + + /* if we extend to eof, continue to block boundary */ + if (off + len == inode->i_size) + *plen = len = ALIGN(inode->i_size, bs) - off; + /* Check that we are block aligned - btrfs_clone() requires this */ if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs)) return -EINVAL; @@ -2890,31 +2986,67 @@ static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len) return 0; } -static int btrfs_extent_same(struct inode *src, u64 loff, u64 len, +static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, struct inode *dst, u64 dst_loff) { int ret; + u64 len = olen; + struct cmp_pages cmp; + int same_inode = 0; + u64 same_lock_start = 0; + u64 same_lock_len = 0; - /* - * btrfs_clone() can't handle extents in the same file - * yet. Once that works, we can drop this check and replace it - * with a check for the same inode, but overlapping extents. - */ if (src == dst) - return -EINVAL; + same_inode = 1; if (len == 0) return 0; - btrfs_double_lock(src, loff, dst, dst_loff, len); + if (same_inode) { + mutex_lock(&src->i_mutex); - ret = extent_same_check_offsets(src, loff, len); - if (ret) - goto out_unlock; + ret = extent_same_check_offsets(src, loff, &len, olen); + if (ret) + goto out_unlock; - ret = extent_same_check_offsets(dst, dst_loff, len); - if (ret) - goto out_unlock; + /* + * Single inode case wants the same checks, except we + * don't want our length pushed out past i_size as + * comparing that data range makes no sense. + * + * extent_same_check_offsets() will do this for an + * unaligned length at i_size, so catch it here and + * reject the request. + * + * This effectively means we require aligned extents + * for the single-inode case, whereas the other cases + * allow an unaligned length so long as it ends at + * i_size. + */ + if (len != olen) { + ret = -EINVAL; + goto out_unlock; + } + + /* Check for overlapping ranges */ + if (dst_loff + len > loff && dst_loff < loff + len) { + ret = -EINVAL; + goto out_unlock; + } + + same_lock_start = min_t(u64, loff, dst_loff); + same_lock_len = max_t(u64, loff, dst_loff) + len - same_lock_start; + } else { + btrfs_double_inode_lock(src, dst); + + ret = extent_same_check_offsets(src, loff, &len, olen); + if (ret) + goto out_unlock; + + ret = extent_same_check_offsets(dst, dst_loff, &len, olen); + if (ret) + goto out_unlock; + } /* don't make the dst file partly checksummed */ if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != @@ -2923,12 +3055,32 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 len, goto out_unlock; } - ret = btrfs_cmp_data(src, loff, dst, dst_loff, len); + ret = btrfs_cmp_data_prepare(src, loff, dst, dst_loff, olen, &cmp); + if (ret) + goto out_unlock; + + if (same_inode) + lock_extent_range(src, same_lock_start, same_lock_len); + else + btrfs_double_extent_lock(src, loff, dst, dst_loff, len); + + /* pass original length for comparison so we stay within i_size */ + ret = btrfs_cmp_data(src, loff, dst, dst_loff, olen, &cmp); if (ret == 0) - ret = btrfs_clone(src, dst, loff, len, len, dst_loff); + ret = btrfs_clone(src, dst, loff, olen, len, dst_loff, 1); + if (same_inode) + unlock_extent(&BTRFS_I(src)->io_tree, same_lock_start, + same_lock_start + same_lock_len - 1); + else + btrfs_double_extent_unlock(src, loff, dst, dst_loff, len); + + btrfs_cmp_data_free(&cmp); out_unlock: - btrfs_double_unlock(src, loff, dst, dst_loff, len); + if (same_inode) + mutex_unlock(&src->i_mutex); + else + btrfs_double_inode_unlock(src, dst); return ret; } @@ -3082,13 +3234,15 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, struct inode *inode, u64 endoff, const u64 destoff, - const u64 olen) + const u64 olen, + int no_time_update) { struct btrfs_root *root = BTRFS_I(inode)->root; int ret; inode_inc_iversion(inode); - inode->i_mtime = inode->i_ctime = CURRENT_TIME; + if (!no_time_update) + inode->i_mtime = inode->i_ctime = CURRENT_TIME; /* * We round up to the block size at eof when determining which * extents to clone above, but shouldn't round up the file size. @@ -3173,13 +3327,13 @@ static void clone_update_extent_map(struct inode *inode, * @inode: Inode to clone to * @off: Offset within source to start clone from * @olen: Original length, passed by user, of range to clone - * @olen_aligned: Block-aligned value of olen, extent_same uses - * identical values here + * @olen_aligned: Block-aligned value of olen * @destoff: Offset within @inode to start clone + * @no_time_update: Whether to update mtime/ctime on the target inode */ static int btrfs_clone(struct inode *src, struct inode *inode, const u64 off, const u64 olen, const u64 olen_aligned, - const u64 destoff) + const u64 destoff, int no_time_update) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_path *path = NULL; @@ -3517,7 +3671,8 @@ process_slot: root->sectorsize); ret = clone_finish_inode_update(trans, inode, last_dest_end, - destoff, olen); + destoff, olen, + no_time_update); if (ret) goto out; if (new_key.offset + datal >= destoff + len) @@ -3555,7 +3710,7 @@ process_slot: clone_update_extent_map(inode, trans, NULL, last_dest_end, destoff + len - last_dest_end); ret = clone_finish_inode_update(trans, inode, destoff + len, - destoff, olen); + destoff, olen, no_time_update); } out: @@ -3692,7 +3847,7 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, lock_extent_range(inode, destoff, len); } - ret = btrfs_clone(src, inode, off, olen, len, destoff); + ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); if (same_inode) { u64 lock_start = min_t(u64, off, destoff); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 760c4a5e0..52170cf17 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -198,9 +198,6 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, entry->file_offset = file_offset; entry->start = start; entry->len = len; - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) && - !(type == BTRFS_ORDERED_NOCOW)) - entry->csum_bytes_left = disk_len; entry->disk_len = disk_len; entry->bytes_left = len; entry->inode = igrab(inode); @@ -286,10 +283,6 @@ void btrfs_add_ordered_sum(struct inode *inode, tree = &BTRFS_I(inode)->ordered_tree; spin_lock_irq(&tree->lock); list_add_tail(&sum->list, &entry->list); - WARN_ON(entry->csum_bytes_left < sum->len); - entry->csum_bytes_left -= sum->len; - if (entry->csum_bytes_left == 0) - wake_up(&entry->wait); spin_unlock_irq(&tree->lock); } @@ -509,7 +502,21 @@ void btrfs_wait_logged_extents(struct btrfs_trans_handle *trans, wait_event(ordered->wait, test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)); - list_add_tail(&ordered->trans_list, &trans->ordered); + /* + * If our ordered extent completed it means it updated the + * fs/subvol and csum trees already, so no need to make the + * current transaction's commit wait for it, as we end up + * holding memory unnecessarily and delaying the inode's iput + * until the transaction commit (we schedule an iput for the + * inode when the ordered extent's refcount drops to 0), which + * prevents it from being evictable until the transaction + * commits. + */ + if (test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) + btrfs_put_ordered_extent(ordered); + else + list_add_tail(&ordered->trans_list, &trans->ordered); + spin_lock_irq(&log->log_extents_lock[index]); } spin_unlock_irq(&log->log_extents_lock[index]); @@ -545,6 +552,10 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) trace_btrfs_ordered_extent_put(entry->inode, entry); if (atomic_dec_and_test(&entry->refs)) { + ASSERT(list_empty(&entry->log_list)); + ASSERT(list_empty(&entry->trans_list)); + ASSERT(list_empty(&entry->root_extent_list)); + ASSERT(RB_EMPTY_NODE(&entry->rb_node)); if (entry->inode) btrfs_add_delayed_iput(entry->inode); while (!list_empty(&entry->list)) { @@ -572,6 +583,7 @@ void btrfs_remove_ordered_extent(struct inode *inode, spin_lock_irq(&tree->lock); node = &entry->rb_node; rb_erase(node, &tree->tree); + RB_CLEAR_NODE(node); if (tree->last == node) tree->last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); @@ -844,6 +856,20 @@ out: return entry; } +bool btrfs_have_ordered_extents_in_range(struct inode *inode, + u64 file_offset, + u64 len) +{ + struct btrfs_ordered_extent *oe; + + oe = btrfs_lookup_ordered_range(inode, file_offset, len); + if (oe) { + btrfs_put_ordered_extent(oe); + return true; + } + return false; +} + /* * lookup and return any extent before 'file_offset'. NULL is returned * if none is found diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index e96cd4ccd..7176cc0fe 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -89,9 +89,6 @@ struct btrfs_ordered_extent { /* number of bytes that still need writing */ u64 bytes_left; - /* number of bytes that still need csumming */ - u64 csum_bytes_left; - /* * the end of the ordered extent which is behind it but * didn't update disk_i_size. Please see the comment of @@ -191,6 +188,9 @@ btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); struct btrfs_ordered_extent *btrfs_lookup_ordered_range(struct inode *inode, u64 file_offset, u64 len); +bool btrfs_have_ordered_extents_in_range(struct inode *inode, + u64 file_offset, + u64 len); int btrfs_ordered_update_i_size(struct inode *inode, u64 offset, struct btrfs_ordered_extent *ordered); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 3d6546581..8a8202956 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -34,6 +34,7 @@ #include "extent_io.h" #include "qgroup.h" + /* TODO XXX FIXME * - subvol delete -> delete when ref goes to 0? delete limits also? * - reorganize keys @@ -84,11 +85,42 @@ struct btrfs_qgroup { /* * temp variables for accounting operations + * Refer to qgroup_shared_accouting() for details. */ u64 old_refcnt; u64 new_refcnt; }; +static void btrfs_qgroup_update_old_refcnt(struct btrfs_qgroup *qg, u64 seq, + int mod) +{ + if (qg->old_refcnt < seq) + qg->old_refcnt = seq; + qg->old_refcnt += mod; +} + +static void btrfs_qgroup_update_new_refcnt(struct btrfs_qgroup *qg, u64 seq, + int mod) +{ + if (qg->new_refcnt < seq) + qg->new_refcnt = seq; + qg->new_refcnt += mod; +} + +static inline u64 btrfs_qgroup_get_old_refcnt(struct btrfs_qgroup *qg, u64 seq) +{ + if (qg->old_refcnt < seq) + return 0; + return qg->old_refcnt - seq; +} + +static inline u64 btrfs_qgroup_get_new_refcnt(struct btrfs_qgroup *qg, u64 seq) +{ + if (qg->new_refcnt < seq) + return 0; + return qg->new_refcnt - seq; +} + /* * glue structure to represent the relations between qgroups. */ @@ -1115,14 +1147,14 @@ int btrfs_add_qgroup_relation(struct btrfs_trans_handle *trans, struct ulist *tmp; int ret = 0; - tmp = ulist_alloc(GFP_NOFS); - if (!tmp) - return -ENOMEM; - /* Check the level of src and dst first */ if (btrfs_qgroup_level(src) >= btrfs_qgroup_level(dst)) return -EINVAL; + tmp = ulist_alloc(GFP_NOFS); + if (!tmp) + return -ENOMEM; + mutex_lock(&fs_info->qgroup_ioctl_lock); quota_root = fs_info->quota_root; if (!quota_root) { @@ -1317,6 +1349,11 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, struct btrfs_root *quota_root; struct btrfs_qgroup *qgroup; int ret = 0; + /* Sometimes we would want to clear the limit on this qgroup. + * To meet this requirement, we treat the -1 as a special value + * which tell kernel to clear the limit on this qgroup. + */ + const u64 CLEAR_VALUE = -1; mutex_lock(&fs_info->qgroup_ioctl_lock); quota_root = fs_info->quota_root; @@ -1332,14 +1369,42 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, } spin_lock(&fs_info->qgroup_lock); - if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) - qgroup->max_rfer = limit->max_rfer; - if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) - qgroup->max_excl = limit->max_excl; - if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) - qgroup->rsv_rfer = limit->rsv_rfer; - if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) - qgroup->rsv_excl = limit->rsv_excl; + if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_RFER) { + if (limit->max_rfer == CLEAR_VALUE) { + qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER; + limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_RFER; + qgroup->max_rfer = 0; + } else { + qgroup->max_rfer = limit->max_rfer; + } + } + if (limit->flags & BTRFS_QGROUP_LIMIT_MAX_EXCL) { + if (limit->max_excl == CLEAR_VALUE) { + qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL; + limit->flags &= ~BTRFS_QGROUP_LIMIT_MAX_EXCL; + qgroup->max_excl = 0; + } else { + qgroup->max_excl = limit->max_excl; + } + } + if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_RFER) { + if (limit->rsv_rfer == CLEAR_VALUE) { + qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER; + limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_RFER; + qgroup->rsv_rfer = 0; + } else { + qgroup->rsv_rfer = limit->rsv_rfer; + } + } + if (limit->flags & BTRFS_QGROUP_LIMIT_RSV_EXCL) { + if (limit->rsv_excl == CLEAR_VALUE) { + qgroup->lim_flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL; + limit->flags &= ~BTRFS_QGROUP_LIMIT_RSV_EXCL; + qgroup->rsv_excl = 0; + } else { + qgroup->rsv_excl = limit->rsv_excl; + } + } qgroup->lim_flags |= limit->flags; spin_unlock(&fs_info->qgroup_lock); @@ -1356,239 +1421,86 @@ out: return ret; } -static int comp_oper_exist(struct btrfs_qgroup_operation *oper1, - struct btrfs_qgroup_operation *oper2) +int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) { - /* - * Ignore seq and type here, we're looking for any operation - * at all related to this extent on that root. - */ - if (oper1->bytenr < oper2->bytenr) - return -1; - if (oper1->bytenr > oper2->bytenr) - return 1; - if (oper1->ref_root < oper2->ref_root) - return -1; - if (oper1->ref_root > oper2->ref_root) - return 1; - return 0; -} + struct btrfs_qgroup_extent_record *record; + struct btrfs_delayed_ref_root *delayed_refs; + struct rb_node *node; + u64 qgroup_to_skip; + int ret = 0; -static int qgroup_oper_exists(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper) -{ - struct rb_node *n; - struct btrfs_qgroup_operation *cur; - int cmp; + delayed_refs = &trans->transaction->delayed_refs; + qgroup_to_skip = delayed_refs->qgroup_to_skip; - spin_lock(&fs_info->qgroup_op_lock); - n = fs_info->qgroup_op_tree.rb_node; - while (n) { - cur = rb_entry(n, struct btrfs_qgroup_operation, n); - cmp = comp_oper_exist(cur, oper); - if (cmp < 0) { - n = n->rb_right; - } else if (cmp) { - n = n->rb_left; - } else { - spin_unlock(&fs_info->qgroup_op_lock); - return -EEXIST; - } + /* + * No need to do lock, since this function will only be called in + * btrfs_commmit_transaction(). + */ + node = rb_first(&delayed_refs->dirty_extent_root); + while (node) { + record = rb_entry(node, struct btrfs_qgroup_extent_record, + node); + ret = btrfs_find_all_roots(NULL, fs_info, record->bytenr, 0, + &record->old_roots); + if (ret < 0) + break; + if (qgroup_to_skip) + ulist_del(record->old_roots, qgroup_to_skip, 0); + node = rb_next(node); } - spin_unlock(&fs_info->qgroup_op_lock); - return 0; -} - -static int comp_oper(struct btrfs_qgroup_operation *oper1, - struct btrfs_qgroup_operation *oper2) -{ - if (oper1->bytenr < oper2->bytenr) - return -1; - if (oper1->bytenr > oper2->bytenr) - return 1; - if (oper1->ref_root < oper2->ref_root) - return -1; - if (oper1->ref_root > oper2->ref_root) - return 1; - if (oper1->seq < oper2->seq) - return -1; - if (oper1->seq > oper2->seq) - return 1; - if (oper1->type < oper2->type) - return -1; - if (oper1->type > oper2->type) - return 1; - return 0; + return ret; } -static int insert_qgroup_oper(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper) +struct btrfs_qgroup_extent_record +*btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_qgroup_extent_record *record) { - struct rb_node **p; - struct rb_node *parent = NULL; - struct btrfs_qgroup_operation *cur; - int cmp; + struct rb_node **p = &delayed_refs->dirty_extent_root.rb_node; + struct rb_node *parent_node = NULL; + struct btrfs_qgroup_extent_record *entry; + u64 bytenr = record->bytenr; - spin_lock(&fs_info->qgroup_op_lock); - p = &fs_info->qgroup_op_tree.rb_node; while (*p) { - parent = *p; - cur = rb_entry(parent, struct btrfs_qgroup_operation, n); - cmp = comp_oper(cur, oper); - if (cmp < 0) { - p = &(*p)->rb_right; - } else if (cmp) { + parent_node = *p; + entry = rb_entry(parent_node, struct btrfs_qgroup_extent_record, + node); + if (bytenr < entry->bytenr) p = &(*p)->rb_left; - } else { - spin_unlock(&fs_info->qgroup_op_lock); - return -EEXIST; - } - } - rb_link_node(&oper->n, parent, p); - rb_insert_color(&oper->n, &fs_info->qgroup_op_tree); - spin_unlock(&fs_info->qgroup_op_lock); - return 0; -} - -/* - * Record a quota operation for processing later on. - * @trans: the transaction we are adding the delayed op to. - * @fs_info: the fs_info for this fs. - * @ref_root: the root of the reference we are acting on, - * @bytenr: the bytenr we are acting on. - * @num_bytes: the number of bytes in the reference. - * @type: the type of operation this is. - * @mod_seq: do we need to get a sequence number for looking up roots. - * - * We just add it to our trans qgroup_ref_list and carry on and process these - * operations in order at some later point. If the reference root isn't a fs - * root then we don't bother with doing anything. - * - * MUST BE HOLDING THE REF LOCK. - */ -int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 ref_root, - u64 bytenr, u64 num_bytes, - enum btrfs_qgroup_operation_type type, int mod_seq) -{ - struct btrfs_qgroup_operation *oper; - int ret; - - if (!is_fstree(ref_root) || !fs_info->quota_enabled) - return 0; - - oper = kmalloc(sizeof(*oper), GFP_NOFS); - if (!oper) - return -ENOMEM; - - oper->ref_root = ref_root; - oper->bytenr = bytenr; - oper->num_bytes = num_bytes; - oper->type = type; - oper->seq = atomic_inc_return(&fs_info->qgroup_op_seq); - INIT_LIST_HEAD(&oper->elem.list); - oper->elem.seq = 0; - - trace_btrfs_qgroup_record_ref(oper); - - if (type == BTRFS_QGROUP_OPER_SUB_SUBTREE) { - /* - * If any operation for this bytenr/ref_root combo - * exists, then we know it's not exclusively owned and - * shouldn't be queued up. - * - * This also catches the case where we have a cloned - * extent that gets queued up multiple times during - * drop snapshot. - */ - if (qgroup_oper_exists(fs_info, oper)) { - kfree(oper); - return 0; - } - } - - ret = insert_qgroup_oper(fs_info, oper); - if (ret) { - /* Shouldn't happen so have an assert for developers */ - ASSERT(0); - kfree(oper); - return ret; + else if (bytenr > entry->bytenr) + p = &(*p)->rb_right; + else + return entry; } - list_add_tail(&oper->list, &trans->qgroup_ref_list); - - if (mod_seq) - btrfs_get_tree_mod_seq(fs_info, &oper->elem); - - return 0; -} - -static int qgroup_excl_accounting(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper) -{ - struct ulist *tmp; - int sign = 0; - int ret = 0; - tmp = ulist_alloc(GFP_NOFS); - if (!tmp) - return -ENOMEM; - - spin_lock(&fs_info->qgroup_lock); - if (!fs_info->quota_root) - goto out; - - switch (oper->type) { - case BTRFS_QGROUP_OPER_ADD_EXCL: - sign = 1; - break; - case BTRFS_QGROUP_OPER_SUB_EXCL: - sign = -1; - break; - default: - ASSERT(0); - } - ret = __qgroup_excl_accounting(fs_info, tmp, oper->ref_root, - oper->num_bytes, sign); -out: - spin_unlock(&fs_info->qgroup_lock); - ulist_free(tmp); - return ret; + rb_link_node(&record->node, parent_node, p); + rb_insert_color(&record->node, &delayed_refs->dirty_extent_root); + return NULL; } +#define UPDATE_NEW 0 +#define UPDATE_OLD 1 /* - * Walk all of the roots that pointed to our bytenr and adjust their refcnts as - * properly. + * Walk all of the roots that points to the bytenr and adjust their refcnts. */ -static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info, - u64 root_to_skip, struct ulist *tmp, - struct ulist *roots, struct ulist *qgroups, - u64 seq, int *old_roots, int rescan) +static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info, + struct ulist *roots, struct ulist *tmp, + struct ulist *qgroups, u64 seq, int update_old) { struct ulist_node *unode; struct ulist_iterator uiter; struct ulist_node *tmp_unode; struct ulist_iterator tmp_uiter; struct btrfs_qgroup *qg; - int ret; + int ret = 0; + if (!roots) + return 0; ULIST_ITER_INIT(&uiter); while ((unode = ulist_next(roots, &uiter))) { - /* We don't count our current root here */ - if (unode->val == root_to_skip) - continue; qg = find_qgroup_rb(fs_info, unode->val); if (!qg) continue; - /* - * We could have a pending removal of this same ref so we may - * not have actually found our ref root when doing - * btrfs_find_all_roots, so we need to keep track of how many - * old roots we find in case we removed ours and added a - * different one at the same time. I don't think this could - * happen in practice but that sort of thinking leads to pain - * and suffering and to the dark side. - */ - (*old_roots)++; ulist_reinit(tmp); ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg), @@ -1603,29 +1515,10 @@ static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info, struct btrfs_qgroup_list *glist; qg = u64_to_ptr(tmp_unode->aux); - /* - * We use this sequence number to keep from having to - * run the whole list and 0 out the refcnt every time. - * We basically use sequnce as the known 0 count and - * then add 1 everytime we see a qgroup. This is how we - * get how many of the roots actually point up to the - * upper level qgroups in order to determine exclusive - * counts. - * - * For rescan we want to set old_refcnt to seq so our - * exclusive calculations end up correct. - */ - if (rescan) - qg->old_refcnt = seq; - else if (qg->old_refcnt < seq) - qg->old_refcnt = seq + 1; + if (update_old) + btrfs_qgroup_update_old_refcnt(qg, seq, 1); else - qg->old_refcnt++; - - if (qg->new_refcnt < seq) - qg->new_refcnt = seq + 1; - else - qg->new_refcnt++; + btrfs_qgroup_update_new_refcnt(qg, seq, 1); list_for_each_entry(glist, &qg->groups, next_group) { ret = ulist_add(qgroups, glist->group->qgroupid, ptr_to_u64(glist->group), @@ -1644,161 +1537,46 @@ static int qgroup_calc_old_refcnt(struct btrfs_fs_info *fs_info, } /* - * We need to walk forward in our operation tree and account for any roots that - * were deleted after we made this operation. - */ -static int qgroup_account_deleted_refs(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper, - struct ulist *tmp, - struct ulist *qgroups, u64 seq, - int *old_roots) -{ - struct ulist_node *unode; - struct ulist_iterator uiter; - struct btrfs_qgroup *qg; - struct btrfs_qgroup_operation *tmp_oper; - struct rb_node *n; - int ret; - - ulist_reinit(tmp); - - /* - * We only walk forward in the tree since we're only interested in - * removals that happened _after_ our operation. - */ - spin_lock(&fs_info->qgroup_op_lock); - n = rb_next(&oper->n); - spin_unlock(&fs_info->qgroup_op_lock); - if (!n) - return 0; - tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n); - while (tmp_oper->bytenr == oper->bytenr) { - /* - * If it's not a removal we don't care, additions work out - * properly with our refcnt tracking. - */ - if (tmp_oper->type != BTRFS_QGROUP_OPER_SUB_SHARED && - tmp_oper->type != BTRFS_QGROUP_OPER_SUB_EXCL) - goto next; - qg = find_qgroup_rb(fs_info, tmp_oper->ref_root); - if (!qg) - goto next; - ret = ulist_add(qgroups, qg->qgroupid, ptr_to_u64(qg), - GFP_ATOMIC); - if (ret) { - if (ret < 0) - return ret; - /* - * We only want to increase old_roots if this qgroup is - * not already in the list of qgroups. If it is already - * there then that means it must have been re-added or - * the delete will be discarded because we had an - * existing ref that we haven't looked up yet. In this - * case we don't want to increase old_roots. So if ret - * == 1 then we know that this is the first time we've - * seen this qgroup and we can bump the old_roots. - */ - (*old_roots)++; - ret = ulist_add(tmp, qg->qgroupid, ptr_to_u64(qg), - GFP_ATOMIC); - if (ret < 0) - return ret; - } -next: - spin_lock(&fs_info->qgroup_op_lock); - n = rb_next(&tmp_oper->n); - spin_unlock(&fs_info->qgroup_op_lock); - if (!n) - break; - tmp_oper = rb_entry(n, struct btrfs_qgroup_operation, n); - } - - /* Ok now process the qgroups we found */ - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(tmp, &uiter))) { - struct btrfs_qgroup_list *glist; - - qg = u64_to_ptr(unode->aux); - if (qg->old_refcnt < seq) - qg->old_refcnt = seq + 1; - else - qg->old_refcnt++; - if (qg->new_refcnt < seq) - qg->new_refcnt = seq + 1; - else - qg->new_refcnt++; - list_for_each_entry(glist, &qg->groups, next_group) { - ret = ulist_add(qgroups, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); - if (ret < 0) - return ret; - ret = ulist_add(tmp, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); - if (ret < 0) - return ret; - } - } - return 0; -} - -/* Add refcnt for the newly added reference. */ -static int qgroup_calc_new_refcnt(struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper, - struct btrfs_qgroup *qgroup, - struct ulist *tmp, struct ulist *qgroups, - u64 seq) -{ - struct ulist_node *unode; - struct ulist_iterator uiter; - struct btrfs_qgroup *qg; - int ret; - - ulist_reinit(tmp); - ret = ulist_add(qgroups, qgroup->qgroupid, ptr_to_u64(qgroup), - GFP_ATOMIC); - if (ret < 0) - return ret; - ret = ulist_add(tmp, qgroup->qgroupid, ptr_to_u64(qgroup), - GFP_ATOMIC); - if (ret < 0) - return ret; - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(tmp, &uiter))) { - struct btrfs_qgroup_list *glist; - - qg = u64_to_ptr(unode->aux); - if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) { - if (qg->new_refcnt < seq) - qg->new_refcnt = seq + 1; - else - qg->new_refcnt++; - } else { - if (qg->old_refcnt < seq) - qg->old_refcnt = seq + 1; - else - qg->old_refcnt++; - } - list_for_each_entry(glist, &qg->groups, next_group) { - ret = ulist_add(tmp, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); - if (ret < 0) - return ret; - ret = ulist_add(qgroups, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); - if (ret < 0) - return ret; - } - } - return 0; -} - -/* - * This adjusts the counters for all referenced qgroups if need be. + * Update qgroup rfer/excl counters. + * Rfer update is easy, codes can explain themselves. + * + * Excl update is tricky, the update is split into 2 part. + * Part 1: Possible exclusive <-> sharing detect: + * | A | !A | + * ------------------------------------- + * B | * | - | + * ------------------------------------- + * !B | + | ** | + * ------------------------------------- + * + * Conditions: + * A: cur_old_roots < nr_old_roots (not exclusive before) + * !A: cur_old_roots == nr_old_roots (possible exclusive before) + * B: cur_new_roots < nr_new_roots (not exclusive now) + * !B: cur_new_roots == nr_new_roots (possible exclsuive now) + * + * Results: + * +: Possible sharing -> exclusive -: Possible exclusive -> sharing + * *: Definitely not changed. **: Possible unchanged. + * + * For !A and !B condition, the exception is cur_old/new_roots == 0 case. + * + * To make the logic clear, we first use condition A and B to split + * combination into 4 results. + * + * Then, for result "+" and "-", check old/new_roots == 0 case, as in them + * only on variant maybe 0. + * + * Lastly, check result **, since there are 2 variants maybe 0, split them + * again(2x2). + * But this time we don't need to consider other things, the codes and logic + * is easy to understand now. */ -static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info, - u64 root_to_skip, u64 num_bytes, - struct ulist *qgroups, u64 seq, - int old_roots, int new_roots, int rescan) +static int qgroup_update_counters(struct btrfs_fs_info *fs_info, + struct ulist *qgroups, + u64 nr_old_roots, + u64 nr_new_roots, + u64 num_bytes, u64 seq) { struct ulist_node *unode; struct ulist_iterator uiter; @@ -1810,423 +1588,196 @@ static int qgroup_adjust_counters(struct btrfs_fs_info *fs_info, bool dirty = false; qg = u64_to_ptr(unode->aux); - /* - * Wasn't referenced before but is now, add to the reference - * counters. - */ - if (qg->old_refcnt <= seq && qg->new_refcnt > seq) { + cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq); + cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq); + + /* Rfer update part */ + if (cur_old_count == 0 && cur_new_count > 0) { qg->rfer += num_bytes; qg->rfer_cmpr += num_bytes; dirty = true; } - - /* - * Was referenced before but isn't now, subtract from the - * reference counters. - */ - if (qg->old_refcnt > seq && qg->new_refcnt <= seq) { + if (cur_old_count > 0 && cur_new_count == 0) { qg->rfer -= num_bytes; qg->rfer_cmpr -= num_bytes; dirty = true; } - if (qg->old_refcnt < seq) - cur_old_count = 0; - else - cur_old_count = qg->old_refcnt - seq; - if (qg->new_refcnt < seq) - cur_new_count = 0; - else - cur_new_count = qg->new_refcnt - seq; + /* Excl update part */ + /* Exclusive/none -> shared case */ + if (cur_old_count == nr_old_roots && + cur_new_count < nr_new_roots) { + /* Exclusive -> shared */ + if (cur_old_count != 0) { + qg->excl -= num_bytes; + qg->excl_cmpr -= num_bytes; + dirty = true; + } + } - /* - * If our refcount was the same as the roots previously but our - * new count isn't the same as the number of roots now then we - * went from having a exclusive reference on this range to not. - */ - if (old_roots && cur_old_count == old_roots && - (cur_new_count != new_roots || new_roots == 0)) { - WARN_ON(cur_new_count != new_roots && new_roots == 0); - qg->excl -= num_bytes; - qg->excl_cmpr -= num_bytes; - dirty = true; + /* Shared -> exclusive/none case */ + if (cur_old_count < nr_old_roots && + cur_new_count == nr_new_roots) { + /* Shared->exclusive */ + if (cur_new_count != 0) { + qg->excl += num_bytes; + qg->excl_cmpr += num_bytes; + dirty = true; + } } - /* - * If we didn't reference all the roots before but now we do we - * have an exclusive reference to this range. - */ - if ((!old_roots || (old_roots && cur_old_count != old_roots)) - && cur_new_count == new_roots) { - qg->excl += num_bytes; - qg->excl_cmpr += num_bytes; - dirty = true; + /* Exclusive/none -> exclusive/none case */ + if (cur_old_count == nr_old_roots && + cur_new_count == nr_new_roots) { + if (cur_old_count == 0) { + /* None -> exclusive/none */ + + if (cur_new_count != 0) { + /* None -> exclusive */ + qg->excl += num_bytes; + qg->excl_cmpr += num_bytes; + dirty = true; + } + /* None -> none, nothing changed */ + } else { + /* Exclusive -> exclusive/none */ + + if (cur_new_count == 0) { + /* Exclusive -> none */ + qg->excl -= num_bytes; + qg->excl_cmpr -= num_bytes; + dirty = true; + } + /* Exclusive -> exclusive, nothing changed */ + } } + /* For exclusive extent, free its reserved bytes too */ + if (nr_old_roots == 0 && nr_new_roots == 1 && + cur_new_count == nr_new_roots) + qg->reserved -= num_bytes; if (dirty) qgroup_dirty(fs_info, qg); } return 0; } -/* - * If we removed a data extent and there were other references for that bytenr - * then we need to lookup all referenced roots to make sure we still don't - * reference this bytenr. If we do then we can just discard this operation. - */ -static int check_existing_refs(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper) -{ - struct ulist *roots = NULL; - struct ulist_node *unode; - struct ulist_iterator uiter; - int ret = 0; - - ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, - oper->elem.seq, &roots); - if (ret < 0) - return ret; - ret = 0; - - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(roots, &uiter))) { - if (unode->val == oper->ref_root) { - ret = 1; - break; - } - } - ulist_free(roots); - btrfs_put_tree_mod_seq(fs_info, &oper->elem); - - return ret; -} - -/* - * If we share a reference across multiple roots then we may need to adjust - * various qgroups referenced and exclusive counters. The basic premise is this - * - * 1) We have seq to represent a 0 count. Instead of looping through all of the - * qgroups and resetting their refcount to 0 we just constantly bump this - * sequence number to act as the base reference count. This means that if - * anybody is equal to or below this sequence they were never referenced. We - * jack this sequence up by the number of roots we found each time in order to - * make sure we don't have any overlap. - * - * 2) We first search all the roots that reference the area _except_ the root - * we're acting on currently. This makes up the old_refcnt of all the qgroups - * before. - * - * 3) We walk all of the qgroups referenced by the root we are currently acting - * on, and will either adjust old_refcnt in the case of a removal or the - * new_refcnt in the case of an addition. - * - * 4) Finally we walk all the qgroups that are referenced by this range - * including the root we are acting on currently. We will adjust the counters - * based on the number of roots we had and will have after this operation. - * - * Take this example as an illustration - * - * [qgroup 1/0] - * / | \ - * [qg 0/0] [qg 0/1] [qg 0/2] - * \ | / - * [ extent ] - * - * Say we are adding a reference that is covered by qg 0/0. The first step - * would give a refcnt of 1 to qg 0/1 and 0/2 and a refcnt of 2 to qg 1/0 with - * old_roots being 2. Because it is adding new_roots will be 1. We then go - * through qg 0/0 which will get the new_refcnt set to 1 and add 1 to qg 1/0's - * new_refcnt, bringing it to 3. We then walk through all of the qgroups, we - * notice that the old refcnt for qg 0/0 < the new refcnt, so we added a - * reference and thus must add the size to the referenced bytes. Everything - * else is the same so nothing else changes. - */ -static int qgroup_shared_accounting(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper) +int +btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + u64 bytenr, u64 num_bytes, + struct ulist *old_roots, struct ulist *new_roots) { - struct ulist *roots = NULL; - struct ulist *qgroups, *tmp; - struct btrfs_qgroup *qgroup; - struct seq_list elem = SEQ_LIST_INIT(elem); + struct ulist *qgroups = NULL; + struct ulist *tmp = NULL; u64 seq; - int old_roots = 0; - int new_roots = 0; + u64 nr_new_roots = 0; + u64 nr_old_roots = 0; int ret = 0; - if (oper->elem.seq) { - ret = check_existing_refs(trans, fs_info, oper); - if (ret < 0) - return ret; - if (ret) - return 0; - } + if (new_roots) + nr_new_roots = new_roots->nnodes; + if (old_roots) + nr_old_roots = old_roots->nnodes; - qgroups = ulist_alloc(GFP_NOFS); - if (!qgroups) - return -ENOMEM; + if (!fs_info->quota_enabled) + goto out_free; + BUG_ON(!fs_info->quota_root); + qgroups = ulist_alloc(GFP_NOFS); + if (!qgroups) { + ret = -ENOMEM; + goto out_free; + } tmp = ulist_alloc(GFP_NOFS); if (!tmp) { - ulist_free(qgroups); - return -ENOMEM; + ret = -ENOMEM; + goto out_free; } - btrfs_get_tree_mod_seq(fs_info, &elem); - ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, elem.seq, - &roots); - btrfs_put_tree_mod_seq(fs_info, &elem); - if (ret < 0) { - ulist_free(qgroups); - ulist_free(tmp); - return ret; + mutex_lock(&fs_info->qgroup_rescan_lock); + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { + if (fs_info->qgroup_rescan_progress.objectid <= bytenr) { + mutex_unlock(&fs_info->qgroup_rescan_lock); + ret = 0; + goto out_free; + } } + mutex_unlock(&fs_info->qgroup_rescan_lock); + spin_lock(&fs_info->qgroup_lock); - qgroup = find_qgroup_rb(fs_info, oper->ref_root); - if (!qgroup) - goto out; seq = fs_info->qgroup_seq; - /* - * So roots is the list of all the roots currently pointing at the - * bytenr, including the ref we are adding if we are adding, or not if - * we are removing a ref. So we pass in the ref_root to skip that root - * in our calculations. We set old_refnct and new_refcnt cause who the - * hell knows what everything looked like before, and it doesn't matter - * except... - */ - ret = qgroup_calc_old_refcnt(fs_info, oper->ref_root, tmp, roots, qgroups, - seq, &old_roots, 0); - if (ret < 0) - goto out; - - /* - * Now adjust the refcounts of the qgroups that care about this - * reference, either the old_count in the case of removal or new_count - * in the case of an addition. - */ - ret = qgroup_calc_new_refcnt(fs_info, oper, qgroup, tmp, qgroups, - seq); + /* Update old refcnts using old_roots */ + ret = qgroup_update_refcnt(fs_info, old_roots, tmp, qgroups, seq, + UPDATE_OLD); if (ret < 0) goto out; - /* - * ...in the case of removals. If we had a removal before we got around - * to processing this operation then we need to find that guy and count - * his references as if they really existed so we don't end up screwing - * up the exclusive counts. Then whenever we go to process the delete - * everything will be grand and we can account for whatever exclusive - * changes need to be made there. We also have to pass in old_roots so - * we have an accurate count of the roots as it pertains to this - * operations view of the world. - */ - ret = qgroup_account_deleted_refs(fs_info, oper, tmp, qgroups, seq, - &old_roots); + /* Update new refcnts using new_roots */ + ret = qgroup_update_refcnt(fs_info, new_roots, tmp, qgroups, seq, + UPDATE_NEW); if (ret < 0) goto out; - /* - * We are adding our root, need to adjust up the number of roots, - * otherwise old_roots is the number of roots we want. - */ - if (oper->type == BTRFS_QGROUP_OPER_ADD_SHARED) { - new_roots = old_roots + 1; - } else { - new_roots = old_roots; - old_roots++; - } - fs_info->qgroup_seq += old_roots + 1; - + qgroup_update_counters(fs_info, qgroups, nr_old_roots, nr_new_roots, + num_bytes, seq); /* - * And now the magic happens, bless Arne for having a pretty elegant - * solution for this. + * Bump qgroup_seq to avoid seq overlap */ - qgroup_adjust_counters(fs_info, oper->ref_root, oper->num_bytes, - qgroups, seq, old_roots, new_roots, 0); + fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1; out: spin_unlock(&fs_info->qgroup_lock); - ulist_free(qgroups); - ulist_free(roots); +out_free: ulist_free(tmp); + ulist_free(qgroups); + ulist_free(old_roots); + ulist_free(new_roots); return ret; } -/* - * Process a reference to a shared subtree. This type of operation is - * queued during snapshot removal when we encounter extents which are - * shared between more than one root. - */ -static int qgroup_subtree_accounting(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper) -{ - struct ulist *roots = NULL; - struct ulist_node *unode; - struct ulist_iterator uiter; - struct btrfs_qgroup_list *glist; - struct ulist *parents; - int ret = 0; - int err; - struct btrfs_qgroup *qg; - u64 root_obj = 0; - struct seq_list elem = SEQ_LIST_INIT(elem); - - parents = ulist_alloc(GFP_NOFS); - if (!parents) - return -ENOMEM; - - btrfs_get_tree_mod_seq(fs_info, &elem); - ret = btrfs_find_all_roots(trans, fs_info, oper->bytenr, - elem.seq, &roots); - btrfs_put_tree_mod_seq(fs_info, &elem); - if (ret < 0) - goto out; - - if (roots->nnodes != 1) - goto out; - - ULIST_ITER_INIT(&uiter); - unode = ulist_next(roots, &uiter); /* Only want 1 so no need to loop */ - /* - * If we find our ref root then that means all refs - * this extent has to the root have not yet been - * deleted. In that case, we do nothing and let the - * last ref for this bytenr drive our update. - * - * This can happen for example if an extent is - * referenced multiple times in a snapshot (clone, - * etc). If we are in the middle of snapshot removal, - * queued updates for such an extent will find the - * root if we have not yet finished removing the - * snapshot. - */ - if (unode->val == oper->ref_root) - goto out; - - root_obj = unode->val; - BUG_ON(!root_obj); - - spin_lock(&fs_info->qgroup_lock); - qg = find_qgroup_rb(fs_info, root_obj); - if (!qg) - goto out_unlock; - - qg->excl += oper->num_bytes; - qg->excl_cmpr += oper->num_bytes; - qgroup_dirty(fs_info, qg); - - /* - * Adjust counts for parent groups. First we find all - * parents, then in the 2nd loop we do the adjustment - * while adding parents of the parents to our ulist. - */ - list_for_each_entry(glist, &qg->groups, next_group) { - err = ulist_add(parents, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); - if (err < 0) { - ret = err; - goto out_unlock; - } - } - - ULIST_ITER_INIT(&uiter); - while ((unode = ulist_next(parents, &uiter))) { - qg = u64_to_ptr(unode->aux); - qg->excl += oper->num_bytes; - qg->excl_cmpr += oper->num_bytes; - qgroup_dirty(fs_info, qg); - - /* Add any parents of the parents */ - list_for_each_entry(glist, &qg->groups, next_group) { - err = ulist_add(parents, glist->group->qgroupid, - ptr_to_u64(glist->group), GFP_ATOMIC); - if (err < 0) { - ret = err; - goto out_unlock; - } - } - } - -out_unlock: - spin_unlock(&fs_info->qgroup_lock); - -out: - ulist_free(roots); - ulist_free(parents); - return ret; -} - -/* - * btrfs_qgroup_account_ref is called for every ref that is added to or deleted - * from the fs. First, all roots referencing the extent are searched, and - * then the space is accounted accordingly to the different roots. The - * accounting algorithm works in 3 steps documented inline. - */ -static int btrfs_qgroup_account(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper) +int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) { + struct btrfs_qgroup_extent_record *record; + struct btrfs_delayed_ref_root *delayed_refs; + struct ulist *new_roots = NULL; + struct rb_node *node; + u64 qgroup_to_skip; int ret = 0; - if (!fs_info->quota_enabled) - return 0; - - BUG_ON(!fs_info->quota_root); + delayed_refs = &trans->transaction->delayed_refs; + qgroup_to_skip = delayed_refs->qgroup_to_skip; + while ((node = rb_first(&delayed_refs->dirty_extent_root))) { + record = rb_entry(node, struct btrfs_qgroup_extent_record, + node); - mutex_lock(&fs_info->qgroup_rescan_lock); - if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { - if (fs_info->qgroup_rescan_progress.objectid <= oper->bytenr) { - mutex_unlock(&fs_info->qgroup_rescan_lock); - return 0; + if (!ret) { + /* + * Use (u64)-1 as time_seq to do special search, which + * doesn't lock tree or delayed_refs and search current + * root. It's safe inside commit_transaction(). + */ + ret = btrfs_find_all_roots(trans, fs_info, + record->bytenr, (u64)-1, &new_roots); + if (ret < 0) + goto cleanup; + if (qgroup_to_skip) + ulist_del(new_roots, qgroup_to_skip, 0); + ret = btrfs_qgroup_account_extent(trans, fs_info, + record->bytenr, record->num_bytes, + record->old_roots, new_roots); + record->old_roots = NULL; + new_roots = NULL; } - } - mutex_unlock(&fs_info->qgroup_rescan_lock); - - ASSERT(is_fstree(oper->ref_root)); - - trace_btrfs_qgroup_account(oper); - - switch (oper->type) { - case BTRFS_QGROUP_OPER_ADD_EXCL: - case BTRFS_QGROUP_OPER_SUB_EXCL: - ret = qgroup_excl_accounting(fs_info, oper); - break; - case BTRFS_QGROUP_OPER_ADD_SHARED: - case BTRFS_QGROUP_OPER_SUB_SHARED: - ret = qgroup_shared_accounting(trans, fs_info, oper); - break; - case BTRFS_QGROUP_OPER_SUB_SUBTREE: - ret = qgroup_subtree_accounting(trans, fs_info, oper); - break; - default: - ASSERT(0); - } - return ret; -} - -/* - * Needs to be called everytime we run delayed refs, even if there is an error - * in order to cleanup outstanding operations. - */ -int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info) -{ - struct btrfs_qgroup_operation *oper; - int ret = 0; +cleanup: + ulist_free(record->old_roots); + ulist_free(new_roots); + new_roots = NULL; + rb_erase(node, &delayed_refs->dirty_extent_root); + kfree(record); - while (!list_empty(&trans->qgroup_ref_list)) { - oper = list_first_entry(&trans->qgroup_ref_list, - struct btrfs_qgroup_operation, list); - list_del_init(&oper->list); - if (!ret || !trans->aborted) - ret = btrfs_qgroup_account(trans, fs_info, oper); - spin_lock(&fs_info->qgroup_op_lock); - rb_erase(&oper->n, &fs_info->qgroup_op_tree); - spin_unlock(&fs_info->qgroup_op_lock); - btrfs_put_tree_mod_seq(fs_info, &oper->elem); - kfree(oper); } return ret; } @@ -2637,15 +2188,13 @@ void assert_qgroups_uptodate(struct btrfs_trans_handle *trans) */ static int qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, - struct btrfs_trans_handle *trans, struct ulist *qgroups, - struct ulist *tmp, struct extent_buffer *scratch_leaf) + struct btrfs_trans_handle *trans, + struct extent_buffer *scratch_leaf) { struct btrfs_key found; struct ulist *roots = NULL; struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem); u64 num_bytes; - u64 seq; - int new_roots; int slot; int ret; @@ -2695,33 +2244,15 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, else num_bytes = found.offset; - ulist_reinit(qgroups); ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0, &roots); if (ret < 0) goto out; - spin_lock(&fs_info->qgroup_lock); - seq = fs_info->qgroup_seq; - fs_info->qgroup_seq += roots->nnodes + 1; /* max refcnt */ - - new_roots = 0; - ret = qgroup_calc_old_refcnt(fs_info, 0, tmp, roots, qgroups, - seq, &new_roots, 1); - if (ret < 0) { - spin_unlock(&fs_info->qgroup_lock); - ulist_free(roots); - goto out; - } - - ret = qgroup_adjust_counters(fs_info, 0, num_bytes, qgroups, - seq, 0, new_roots, 1); - if (ret < 0) { - spin_unlock(&fs_info->qgroup_lock); - ulist_free(roots); + /* For rescan, just pass old_roots as NULL */ + ret = btrfs_qgroup_account_extent(trans, fs_info, + found.objectid, num_bytes, NULL, roots); + if (ret < 0) goto out; - } - spin_unlock(&fs_info->qgroup_lock); - ulist_free(roots); } out: btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); @@ -2735,7 +2266,6 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) qgroup_rescan_work); struct btrfs_path *path; struct btrfs_trans_handle *trans = NULL; - struct ulist *tmp = NULL, *qgroups = NULL; struct extent_buffer *scratch_leaf = NULL; int err = -ENOMEM; int ret = 0; @@ -2743,12 +2273,6 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) path = btrfs_alloc_path(); if (!path) goto out; - qgroups = ulist_alloc(GFP_NOFS); - if (!qgroups) - goto out; - tmp = ulist_alloc(GFP_NOFS); - if (!tmp) - goto out; scratch_leaf = kmalloc(sizeof(*scratch_leaf), GFP_NOFS); if (!scratch_leaf) goto out; @@ -2764,7 +2288,7 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) err = -EINTR; } else { err = qgroup_rescan_leaf(fs_info, path, trans, - qgroups, tmp, scratch_leaf); + scratch_leaf); } if (err > 0) btrfs_commit_transaction(trans, fs_info->fs_root); @@ -2774,8 +2298,6 @@ static void btrfs_qgroup_rescan_worker(struct btrfs_work *work) out: kfree(scratch_leaf); - ulist_free(qgroups); - ulist_free(tmp); btrfs_free_path(path); mutex_lock(&fs_info->qgroup_rescan_lock); diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index c5242aa9a..6387dcfa3 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -19,43 +19,18 @@ #ifndef __BTRFS_QGROUP__ #define __BTRFS_QGROUP__ +#include "ulist.h" +#include "delayed-ref.h" + /* - * A description of the operations, all of these operations only happen when we - * are adding the 1st reference for that subvolume in the case of adding space - * or on the last reference delete in the case of subtraction. The only - * exception is the last one, which is added for confusion. - * - * BTRFS_QGROUP_OPER_ADD_EXCL: adding bytes where this subvolume is the only - * one pointing at the bytes we are adding. This is called on the first - * allocation. - * - * BTRFS_QGROUP_OPER_ADD_SHARED: adding bytes where this bytenr is going to be - * shared between subvols. This is called on the creation of a ref that already - * has refs from a different subvolume, so basically reflink. - * - * BTRFS_QGROUP_OPER_SUB_EXCL: removing bytes where this subvolume is the only - * one referencing the range. - * - * BTRFS_QGROUP_OPER_SUB_SHARED: removing bytes where this subvolume shares with - * refs with other subvolumes. + * Record a dirty extent, and info qgroup to update quota on it + * TODO: Use kmem cache to alloc it. */ -enum btrfs_qgroup_operation_type { - BTRFS_QGROUP_OPER_ADD_EXCL, - BTRFS_QGROUP_OPER_ADD_SHARED, - BTRFS_QGROUP_OPER_SUB_EXCL, - BTRFS_QGROUP_OPER_SUB_SHARED, - BTRFS_QGROUP_OPER_SUB_SUBTREE, -}; - -struct btrfs_qgroup_operation { - u64 ref_root; +struct btrfs_qgroup_extent_record { + struct rb_node node; u64 bytenr; u64 num_bytes; - u64 seq; - enum btrfs_qgroup_operation_type type; - struct seq_list elem; - struct rb_node n; - struct list_head list; + struct ulist *old_roots; }; int btrfs_quota_enable(struct btrfs_trans_handle *trans, @@ -79,16 +54,18 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans, int btrfs_read_qgroup_config(struct btrfs_fs_info *fs_info); void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info); struct btrfs_delayed_extent_op; -int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, u64 ref_root, +int btrfs_qgroup_prepare_account_extents(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +struct btrfs_qgroup_extent_record +*btrfs_qgroup_insert_dirty_extent(struct btrfs_delayed_ref_root *delayed_refs, + struct btrfs_qgroup_extent_record *record); +int +btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, u64 num_bytes, - enum btrfs_qgroup_operation_type type, - int mod_seq); -int btrfs_delayed_qgroup_accounting(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info); -void btrfs_remove_qgroup_operation(struct btrfs_trans_handle *trans, - struct btrfs_fs_info *fs_info, - struct btrfs_qgroup_operation *oper); + struct ulist *old_roots, struct ulist *new_roots); +int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); int btrfs_run_qgroups(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info); int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 74b24b01d..88cbb5995 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1847,8 +1847,10 @@ again: } eb = read_tree_block(dest, old_bytenr, old_ptr_gen); - if (!eb || !extent_buffer_uptodate(eb)) { - ret = (!eb) ? -ENOMEM : -EIO; + if (IS_ERR(eb)) { + ret = PTR_ERR(eb); + } else if (!extent_buffer_uptodate(eb)) { + ret = -EIO; free_extent_buffer(eb); break; } @@ -2002,7 +2004,9 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, bytenr = btrfs_node_blockptr(eb, path->slots[i]); eb = read_tree_block(root, bytenr, ptr_gen); - if (!eb || !extent_buffer_uptodate(eb)) { + if (IS_ERR(eb)) { + return PTR_ERR(eb); + } else if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); return -EIO; } @@ -2710,7 +2714,10 @@ static int do_relocation(struct btrfs_trans_handle *trans, blocksize = root->nodesize; generation = btrfs_node_ptr_generation(upper->eb, slot); eb = read_tree_block(root, bytenr, generation); - if (!eb || !extent_buffer_uptodate(eb)) { + if (IS_ERR(eb)) { + err = PTR_ERR(eb); + goto next; + } else if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); err = -EIO; goto next; @@ -2873,7 +2880,9 @@ static int get_tree_block_key(struct reloc_control *rc, BUG_ON(block->key_ready); eb = read_tree_block(rc->extent_root, block->bytenr, block->key.offset); - if (!eb || !extent_buffer_uptodate(eb)) { + if (IS_ERR(eb)) { + return PTR_ERR(eb); + } else if (!extent_buffer_uptodate(eb)) { free_extent_buffer(eb); return -EIO; } @@ -4040,7 +4049,7 @@ restart: if (trans && progress && err == -ENOSPC) { ret = btrfs_force_chunk_alloc(trans, rc->extent_root, rc->block_group->flags); - if (ret == 0) { + if (ret == 1) { err = 0; progress = 0; goto restart; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index ab5811545..94db0fa52 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2662,18 +2662,30 @@ static void scrub_free_parity(struct scrub_parity *sparity) kfree(sparity); } +static void scrub_parity_bio_endio_worker(struct btrfs_work *work) +{ + struct scrub_parity *sparity = container_of(work, struct scrub_parity, + work); + struct scrub_ctx *sctx = sparity->sctx; + + scrub_free_parity(sparity); + scrub_pending_bio_dec(sctx); +} + static void scrub_parity_bio_endio(struct bio *bio, int error) { struct scrub_parity *sparity = (struct scrub_parity *)bio->bi_private; - struct scrub_ctx *sctx = sparity->sctx; if (error) bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap, sparity->nsectors); - scrub_free_parity(sparity); - scrub_pending_bio_dec(sctx); bio_put(bio); + + btrfs_init_work(&sparity->work, btrfs_scrubparity_helper, + scrub_parity_bio_endio_worker, NULL, NULL); + btrfs_queue_work(sparity->sctx->dev_root->fs_info->scrub_parity_workers, + &sparity->work); } static void scrub_parity_check_and_repair(struct scrub_parity *sparity) @@ -3559,7 +3571,6 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, int is_dev_replace) { - int ret = 0; unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND; int max_active = fs_info->thread_pool_size; @@ -3572,27 +3583,36 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, fs_info->scrub_workers = btrfs_alloc_workqueue("btrfs-scrub", flags, max_active, 4); - if (!fs_info->scrub_workers) { - ret = -ENOMEM; - goto out; - } + if (!fs_info->scrub_workers) + goto fail_scrub_workers; + fs_info->scrub_wr_completion_workers = btrfs_alloc_workqueue("btrfs-scrubwrc", flags, max_active, 2); - if (!fs_info->scrub_wr_completion_workers) { - ret = -ENOMEM; - goto out; - } + if (!fs_info->scrub_wr_completion_workers) + goto fail_scrub_wr_completion_workers; + fs_info->scrub_nocow_workers = btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0); - if (!fs_info->scrub_nocow_workers) { - ret = -ENOMEM; - goto out; - } + if (!fs_info->scrub_nocow_workers) + goto fail_scrub_nocow_workers; + fs_info->scrub_parity_workers = + btrfs_alloc_workqueue("btrfs-scrubparity", flags, + max_active, 2); + if (!fs_info->scrub_parity_workers) + goto fail_scrub_parity_workers; } ++fs_info->scrub_workers_refcnt; -out: - return ret; + return 0; + +fail_scrub_parity_workers: + btrfs_destroy_workqueue(fs_info->scrub_nocow_workers); +fail_scrub_nocow_workers: + btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers); +fail_scrub_wr_completion_workers: + btrfs_destroy_workqueue(fs_info->scrub_workers); +fail_scrub_workers: + return -ENOMEM; } static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) @@ -3601,6 +3621,7 @@ static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info) btrfs_destroy_workqueue(fs_info->scrub_workers); btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers); btrfs_destroy_workqueue(fs_info->scrub_nocow_workers); + btrfs_destroy_workqueue(fs_info->scrub_parity_workers); } WARN_ON(fs_info->scrub_workers_refcnt < 0); } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 5cf7838fb..aa72bfd28 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -243,6 +243,7 @@ struct waiting_dir_move { * after this directory is moved, we can try to rmdir the ino rmdir_ino. */ u64 rmdir_ino; + bool orphanized; }; struct orphan_dir_info { @@ -1916,8 +1917,13 @@ static int did_overwrite_ref(struct send_ctx *sctx, goto out; } - /* we know that it is or will be overwritten. check this now */ - if (ow_inode < sctx->send_progress) + /* + * We know that it is or will be overwritten. Check this now. + * The current inode being processed might have been the one that caused + * inode 'ino' to be orphanized, therefore ow_inode can actually be the + * same as sctx->send_progress. + */ + if (ow_inode <= sctx->send_progress) ret = 1; else ret = 0; @@ -2239,6 +2245,8 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, fs_path_reset(dest); while (!stop && ino != BTRFS_FIRST_FREE_OBJECTID) { + struct waiting_dir_move *wdm; + fs_path_reset(name); if (is_waiting_for_rm(sctx, ino)) { @@ -2249,7 +2257,11 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, break; } - if (is_waiting_for_move(sctx, ino)) { + wdm = get_waiting_dir_move(sctx, ino); + if (wdm && wdm->orphanized) { + ret = gen_unique_name(sctx, ino, gen, name); + stop = 1; + } else if (wdm) { ret = get_first_ref(sctx->parent_root, ino, &parent_inode, &parent_gen, name); } else { @@ -2344,8 +2356,12 @@ static int send_subvol_begin(struct send_ctx *sctx) TLV_PUT_U64(sctx, BTRFS_SEND_A_CTRANSID, le64_to_cpu(sctx->send_root->root_item.ctransid)); if (parent_root) { - TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, - sctx->parent_root->root_item.uuid); + if (!btrfs_is_empty_uuid(parent_root->root_item.received_uuid)) + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, + parent_root->root_item.received_uuid); + else + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, + parent_root->root_item.uuid); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, le64_to_cpu(sctx->parent_root->root_item.ctransid)); } @@ -2939,7 +2955,7 @@ static int is_waiting_for_move(struct send_ctx *sctx, u64 ino) return entry != NULL; } -static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino) +static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino, bool orphanized) { struct rb_node **p = &sctx->waiting_dir_moves.rb_node; struct rb_node *parent = NULL; @@ -2950,6 +2966,7 @@ static int add_waiting_dir_move(struct send_ctx *sctx, u64 ino) return -ENOMEM; dm->ino = ino; dm->rmdir_ino = 0; + dm->orphanized = orphanized; while (*p) { parent = *p; @@ -3046,7 +3063,7 @@ static int add_pending_dir_move(struct send_ctx *sctx, goto out; } - ret = add_waiting_dir_move(sctx, pm->ino); + ret = add_waiting_dir_move(sctx, pm->ino, is_orphan); if (ret) goto out; @@ -3369,8 +3386,40 @@ out: return ret; } +/* + * Check if ino ino1 is an ancestor of inode ino2 in the given root. + * Return 1 if true, 0 if false and < 0 on error. + */ +static int is_ancestor(struct btrfs_root *root, + const u64 ino1, + const u64 ino1_gen, + const u64 ino2, + struct fs_path *fs_path) +{ + u64 ino = ino2; + + while (ino > BTRFS_FIRST_FREE_OBJECTID) { + int ret; + u64 parent; + u64 parent_gen; + + fs_path_reset(fs_path); + ret = get_first_ref(root, ino, &parent, &parent_gen, fs_path); + if (ret < 0) { + if (ret == -ENOENT && ino == ino2) + ret = 0; + return ret; + } + if (parent == ino1) + return parent_gen == ino1_gen ? 1 : 0; + ino = parent; + } + return 0; +} + static int wait_for_parent_move(struct send_ctx *sctx, - struct recorded_ref *parent_ref) + struct recorded_ref *parent_ref, + const bool is_orphan) { int ret = 0; u64 ino = parent_ref->dir; @@ -3390,11 +3439,24 @@ static int wait_for_parent_move(struct send_ctx *sctx, * Our current directory inode may not yet be renamed/moved because some * ancestor (immediate or not) has to be renamed/moved first. So find if * such ancestor exists and make sure our own rename/move happens after - * that ancestor is processed. + * that ancestor is processed to avoid path build infinite loops (done + * at get_cur_path()). */ while (ino > BTRFS_FIRST_FREE_OBJECTID) { if (is_waiting_for_move(sctx, ino)) { - ret = 1; + /* + * If the current inode is an ancestor of ino in the + * parent root, we need to delay the rename of the + * current inode, otherwise don't delayed the rename + * because we can end up with a circular dependency + * of renames, resulting in some directories never + * getting the respective rename operations issued in + * the send stream or getting into infinite path build + * loops. + */ + ret = is_ancestor(sctx->parent_root, + sctx->cur_ino, sctx->cur_inode_gen, + ino, path_before); break; } @@ -3436,7 +3498,7 @@ out: ino, &sctx->new_refs, &sctx->deleted_refs, - false); + is_orphan); if (!ret) ret = 1; } @@ -3605,6 +3667,17 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); } } + if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root && + can_rename) { + ret = wait_for_parent_move(sctx, cur, is_orphan); + if (ret < 0) + goto out; + if (ret == 1) { + can_rename = false; + *pending_move = 1; + } + } + /* * link/move the ref to the new place. If we have an orphan * inode, move it and update valid_path. If not, link or move @@ -3625,18 +3698,11 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino); * dirs, we always have one new and one deleted * ref. The deleted ref is ignored later. */ - ret = wait_for_parent_move(sctx, cur); - if (ret < 0) - goto out; - if (ret) { - *pending_move = 1; - } else { - ret = send_rename(sctx, valid_path, - cur->full_path); - if (!ret) - ret = fs_path_copy(valid_path, - cur->full_path); - } + ret = send_rename(sctx, valid_path, + cur->full_path); + if (!ret) + ret = fs_path_copy(valid_path, + cur->full_path); if (ret < 0) goto out; } else { @@ -4524,8 +4590,21 @@ verbose_printk("btrfs: send_clone offset=%llu, len=%d, clone_root=%llu, " if (ret < 0) goto out; - TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, - clone_root->root->root_item.uuid); + /* + * If the parent we're using has a received_uuid set then use that as + * our clone source as that is what we will look for when doing a + * receive. + * + * This covers the case that we create a snapshot off of a received + * subvolume and then use that as the parent and try to receive on a + * different host. + */ + if (!btrfs_is_empty_uuid(clone_root->root->root_item.received_uuid)) + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, + clone_root->root->root_item.received_uuid); + else + TLV_PUT_UUID(sctx, BTRFS_SEND_A_CLONE_UUID, + clone_root->root->root_item.uuid); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_CTRANSID, le64_to_cpu(clone_root->root->root_item.ctransid)); TLV_PUT_PATH(sctx, BTRFS_SEND_A_CLONE_PATH, p); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9e66f5e72..cd7ef34d2 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -135,6 +135,7 @@ static void btrfs_handle_error(struct btrfs_fs_info *fs_info) * __btrfs_std_error decodes expected errors from the caller and * invokes the approciate error response. */ +__cold void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...) { @@ -247,18 +248,11 @@ void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function, * We'll complete the cleanup in btrfs_end_transaction and * btrfs_commit_transaction. */ +__cold void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root, const char *function, unsigned int line, int errno) { - /* - * Report first abort since mount - */ - if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, - &root->fs_info->fs_state)) { - WARN(1, KERN_DEBUG "BTRFS: Transaction aborted (error %d)\n", - errno); - } trans->aborted = errno; /* Nothing used. The other threads that have joined this * transaction may be able to continue. */ @@ -281,6 +275,7 @@ void __btrfs_abort_transaction(struct btrfs_trans_handle *trans, * __btrfs_panic decodes unexpected, fatal errors from the caller, * issues an alert, and either panics or BUGs, depending on mount options. */ +__cold void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int errno, const char *fmt, ...) { @@ -841,33 +836,153 @@ out: return error; } -static struct dentry *get_default_root(struct super_block *sb, - u64 subvol_objectid) +static char *get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, + u64 subvol_objectid) { - struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root = fs_info->tree_root; - struct btrfs_root *new_root; - struct btrfs_dir_item *di; - struct btrfs_path *path; - struct btrfs_key location; - struct inode *inode; - u64 dir_id; - int new = 0; + struct btrfs_root *fs_root; + struct btrfs_root_ref *root_ref; + struct btrfs_inode_ref *inode_ref; + struct btrfs_key key; + struct btrfs_path *path = NULL; + char *name = NULL, *ptr; + u64 dirid; + int len; + int ret; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto err; + } + path->leave_spinning = 1; + + name = kmalloc(PATH_MAX, GFP_NOFS); + if (!name) { + ret = -ENOMEM; + goto err; + } + ptr = name + PATH_MAX - 1; + ptr[0] = '\0'; /* - * We have a specific subvol we want to mount, just setup location and - * go look up the root. + * Walk up the subvolume trees in the tree of tree roots by root + * backrefs until we hit the top-level subvolume. */ - if (subvol_objectid) { - location.objectid = subvol_objectid; - location.type = BTRFS_ROOT_ITEM_KEY; - location.offset = (u64)-1; - goto find_root; + while (subvol_objectid != BTRFS_FS_TREE_OBJECTID) { + key.objectid = subvol_objectid; + key.type = BTRFS_ROOT_BACKREF_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + goto err; + } else if (ret > 0) { + ret = btrfs_previous_item(root, path, subvol_objectid, + BTRFS_ROOT_BACKREF_KEY); + if (ret < 0) { + goto err; + } else if (ret > 0) { + ret = -ENOENT; + goto err; + } + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + subvol_objectid = key.offset; + + root_ref = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_root_ref); + len = btrfs_root_ref_name_len(path->nodes[0], root_ref); + ptr -= len + 1; + if (ptr < name) { + ret = -ENAMETOOLONG; + goto err; + } + read_extent_buffer(path->nodes[0], ptr + 1, + (unsigned long)(root_ref + 1), len); + ptr[0] = '/'; + dirid = btrfs_root_ref_dirid(path->nodes[0], root_ref); + btrfs_release_path(path); + + key.objectid = subvol_objectid; + key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; + fs_root = btrfs_read_fs_root_no_name(fs_info, &key); + if (IS_ERR(fs_root)) { + ret = PTR_ERR(fs_root); + goto err; + } + + /* + * Walk up the filesystem tree by inode refs until we hit the + * root directory. + */ + while (dirid != BTRFS_FIRST_FREE_OBJECTID) { + key.objectid = dirid; + key.type = BTRFS_INODE_REF_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0); + if (ret < 0) { + goto err; + } else if (ret > 0) { + ret = btrfs_previous_item(fs_root, path, dirid, + BTRFS_INODE_REF_KEY); + if (ret < 0) { + goto err; + } else if (ret > 0) { + ret = -ENOENT; + goto err; + } + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + dirid = key.offset; + + inode_ref = btrfs_item_ptr(path->nodes[0], + path->slots[0], + struct btrfs_inode_ref); + len = btrfs_inode_ref_name_len(path->nodes[0], + inode_ref); + ptr -= len + 1; + if (ptr < name) { + ret = -ENAMETOOLONG; + goto err; + } + read_extent_buffer(path->nodes[0], ptr + 1, + (unsigned long)(inode_ref + 1), len); + ptr[0] = '/'; + btrfs_release_path(path); + } } + btrfs_free_path(path); + if (ptr == name + PATH_MAX - 1) { + name[0] = '/'; + name[1] = '\0'; + } else { + memmove(name, ptr, name + PATH_MAX - ptr); + } + return name; + +err: + btrfs_free_path(path); + kfree(name); + return ERR_PTR(ret); +} + +static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objectid) +{ + struct btrfs_root *root = fs_info->tree_root; + struct btrfs_dir_item *di; + struct btrfs_path *path; + struct btrfs_key location; + u64 dir_id; + path = btrfs_alloc_path(); if (!path) - return ERR_PTR(-ENOMEM); + return -ENOMEM; path->leave_spinning = 1; /* @@ -879,58 +994,23 @@ static struct dentry *get_default_root(struct super_block *sb, di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0); if (IS_ERR(di)) { btrfs_free_path(path); - return ERR_CAST(di); + return PTR_ERR(di); } if (!di) { /* * Ok the default dir item isn't there. This is weird since * it's always been there, but don't freak out, just try and - * mount to root most subvolume. + * mount the top-level subvolume. */ btrfs_free_path(path); - dir_id = BTRFS_FIRST_FREE_OBJECTID; - new_root = fs_info->fs_root; - goto setup_root; + *objectid = BTRFS_FS_TREE_OBJECTID; + return 0; } btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); btrfs_free_path(path); - -find_root: - new_root = btrfs_read_fs_root_no_name(fs_info, &location); - if (IS_ERR(new_root)) - return ERR_CAST(new_root); - - if (!(sb->s_flags & MS_RDONLY)) { - int ret; - down_read(&fs_info->cleanup_work_sem); - ret = btrfs_orphan_cleanup(new_root); - up_read(&fs_info->cleanup_work_sem); - if (ret) - return ERR_PTR(ret); - } - - dir_id = btrfs_root_dirid(&new_root->root_item); -setup_root: - location.objectid = dir_id; - location.type = BTRFS_INODE_ITEM_KEY; - location.offset = 0; - - inode = btrfs_iget(sb, &location, new_root, &new); - if (IS_ERR(inode)) - return ERR_CAST(inode); - - /* - * If we're just mounting the root most subvol put the inode and return - * a reference to the dentry. We will have already gotten a reference - * to the inode in btrfs_fill_super so we're good to go. - */ - if (!new && d_inode(sb->s_root) == inode) { - iput(inode); - return dget(sb->s_root); - } - - return d_obtain_root(inode); + *objectid = location.objectid; + return 0; } static int btrfs_fill_super(struct super_block *sb, @@ -1108,6 +1188,10 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",fatal_errors=panic"); if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL) seq_printf(seq, ",commit=%d", info->commit_interval); + seq_printf(seq, ",subvolid=%llu", + BTRFS_I(d_inode(dentry))->root->root_key.objectid); + seq_puts(seq, ",subvol="); + seq_dentry(seq, dentry, " \t\n\\"); return 0; } @@ -1138,107 +1222,139 @@ static inline int is_subvolume_inode(struct inode *inode) } /* - * This will strip out the subvol=%s argument for an argument string and add - * subvolid=0 to make sure we get the actual tree root for path walking to the - * subvol we want. + * This will add subvolid=0 to the argument string while removing any subvol= + * and subvolid= arguments to make sure we get the top-level root for path + * walking to the subvol we want. */ static char *setup_root_args(char *args) { - unsigned len = strlen(args) + 2 + 1; - char *src, *dst, *buf; + char *buf, *dst, *sep; - /* - * We need the same args as before, but with this substitution: - * s!subvol=[^,]+!subvolid=0! - * - * Since the replacement string is up to 2 bytes longer than the - * original, allocate strlen(args) + 2 + 1 bytes. - */ + if (!args) + return kstrdup("subvolid=0", GFP_NOFS); - src = strstr(args, "subvol="); - /* This shouldn't happen, but just in case.. */ - if (!src) - return NULL; - - buf = dst = kmalloc(len, GFP_NOFS); + /* The worst case is that we add ",subvolid=0" to the end. */ + buf = dst = kmalloc(strlen(args) + strlen(",subvolid=0") + 1, GFP_NOFS); if (!buf) return NULL; - /* - * If the subvol= arg is not at the start of the string, - * copy whatever precedes it into buf. - */ - if (src != args) { - *src++ = '\0'; - strcpy(buf, args); - dst += strlen(args); + while (1) { + sep = strchrnul(args, ','); + if (!strstarts(args, "subvol=") && + !strstarts(args, "subvolid=")) { + memcpy(dst, args, sep - args); + dst += sep - args; + *dst++ = ','; + } + if (*sep) + args = sep + 1; + else + break; } - strcpy(dst, "subvolid=0"); - dst += strlen("subvolid=0"); - - /* - * If there is a "," after the original subvol=... string, - * copy that suffix into our buffer. Otherwise, we're done. - */ - src = strchr(src, ','); - if (src) - strcpy(dst, src); return buf; } -static struct dentry *mount_subvol(const char *subvol_name, int flags, - const char *device_name, char *data) +static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid, + int flags, const char *device_name, + char *data) { struct dentry *root; - struct vfsmount *mnt; + struct vfsmount *mnt = NULL; char *newargs; + int ret; newargs = setup_root_args(data); - if (!newargs) - return ERR_PTR(-ENOMEM); - mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, - newargs); + if (!newargs) { + root = ERR_PTR(-ENOMEM); + goto out; + } - if (PTR_RET(mnt) == -EBUSY) { + mnt = vfs_kern_mount(&btrfs_fs_type, flags, device_name, newargs); + if (PTR_ERR_OR_ZERO(mnt) == -EBUSY) { if (flags & MS_RDONLY) { - mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, device_name, - newargs); + mnt = vfs_kern_mount(&btrfs_fs_type, flags & ~MS_RDONLY, + device_name, newargs); } else { - int r; - mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, device_name, - newargs); + mnt = vfs_kern_mount(&btrfs_fs_type, flags | MS_RDONLY, + device_name, newargs); if (IS_ERR(mnt)) { - kfree(newargs); - return ERR_CAST(mnt); + root = ERR_CAST(mnt); + mnt = NULL; + goto out; } - r = btrfs_remount(mnt->mnt_sb, &flags, NULL); - if (r < 0) { - /* FIXME: release vfsmount mnt ??*/ - kfree(newargs); - return ERR_PTR(r); + down_write(&mnt->mnt_sb->s_umount); + ret = btrfs_remount(mnt->mnt_sb, &flags, NULL); + up_write(&mnt->mnt_sb->s_umount); + if (ret < 0) { + root = ERR_PTR(ret); + goto out; } } } + if (IS_ERR(mnt)) { + root = ERR_CAST(mnt); + mnt = NULL; + goto out; + } - kfree(newargs); + if (!subvol_name) { + if (!subvol_objectid) { + ret = get_default_subvol_objectid(btrfs_sb(mnt->mnt_sb), + &subvol_objectid); + if (ret) { + root = ERR_PTR(ret); + goto out; + } + } + subvol_name = get_subvol_name_from_objectid(btrfs_sb(mnt->mnt_sb), + subvol_objectid); + if (IS_ERR(subvol_name)) { + root = ERR_CAST(subvol_name); + subvol_name = NULL; + goto out; + } - if (IS_ERR(mnt)) - return ERR_CAST(mnt); + } root = mount_subtree(mnt, subvol_name); + /* mount_subtree() drops our reference on the vfsmount. */ + mnt = NULL; - if (!IS_ERR(root) && !is_subvolume_inode(d_inode(root))) { + if (!IS_ERR(root)) { struct super_block *s = root->d_sb; - dput(root); - root = ERR_PTR(-EINVAL); - deactivate_locked_super(s); - printk(KERN_ERR "BTRFS: '%s' is not a valid subvolume\n", - subvol_name); + struct inode *root_inode = d_inode(root); + u64 root_objectid = BTRFS_I(root_inode)->root->root_key.objectid; + + ret = 0; + if (!is_subvolume_inode(root_inode)) { + pr_err("BTRFS: '%s' is not a valid subvolume\n", + subvol_name); + ret = -EINVAL; + } + if (subvol_objectid && root_objectid != subvol_objectid) { + /* + * This will also catch a race condition where a + * subvolume which was passed by ID is renamed and + * another subvolume is renamed over the old location. + */ + pr_err("BTRFS: subvol '%s' does not match subvolid %llu\n", + subvol_name, subvol_objectid); + ret = -EINVAL; + } + if (ret) { + dput(root); + root = ERR_PTR(ret); + deactivate_locked_super(s); + } } +out: + mntput(mnt); + kfree(newargs); + kfree(subvol_name); return root; } @@ -1303,7 +1419,6 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, { struct block_device *bdev = NULL; struct super_block *s; - struct dentry *root; struct btrfs_fs_devices *fs_devices = NULL; struct btrfs_fs_info *fs_info = NULL; struct security_mnt_opts new_sec_opts; @@ -1323,10 +1438,10 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, return ERR_PTR(error); } - if (subvol_name) { - root = mount_subvol(subvol_name, flags, device_name, data); - kfree(subvol_name); - return root; + if (subvol_name || subvol_objectid != BTRFS_FS_TREE_OBJECTID) { + /* mount_subvol() will free subvol_name. */ + return mount_subvol(subvol_name, subvol_objectid, flags, + device_name, data); } security_init_mnt_opts(&new_sec_opts); @@ -1392,23 +1507,19 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, error = btrfs_fill_super(s, fs_devices, data, flags & MS_SILENT ? 1 : 0); } - - root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error); - if (IS_ERR(root)) { + if (error) { deactivate_locked_super(s); - error = PTR_ERR(root); goto error_sec_opts; } fs_info = btrfs_sb(s); error = setup_security_options(fs_info, s, &new_sec_opts); if (error) { - dput(root); deactivate_locked_super(s); goto error_sec_opts; } - return root; + return dget(s->s_root); error_close_devices: btrfs_close_devices(fs_devices); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index e8a4c86d2..603b0cc2b 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -33,6 +33,7 @@ #include "volumes.h" static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj); +static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj); static u64 get_features(struct btrfs_fs_info *fs_info, enum btrfs_feature_set set) @@ -428,7 +429,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show); -static struct attribute *btrfs_attrs[] = { +static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(label), BTRFS_ATTR_PTR(nodesize), BTRFS_ATTR_PTR(sectorsize), @@ -438,21 +439,29 @@ static struct attribute *btrfs_attrs[] = { static void btrfs_release_super_kobj(struct kobject *kobj) { - struct btrfs_fs_info *fs_info = to_fs_info(kobj); - complete(&fs_info->kobj_unregister); + struct btrfs_fs_devices *fs_devs = to_fs_devs(kobj); + + memset(&fs_devs->super_kobj, 0, sizeof(struct kobject)); + complete(&fs_devs->kobj_unregister); } static struct kobj_type btrfs_ktype = { .sysfs_ops = &kobj_sysfs_ops, .release = btrfs_release_super_kobj, - .default_attrs = btrfs_attrs, }; +static inline struct btrfs_fs_devices *to_fs_devs(struct kobject *kobj) +{ + if (kobj->ktype != &btrfs_ktype) + return NULL; + return container_of(kobj, struct btrfs_fs_devices, super_kobj); +} + static inline struct btrfs_fs_info *to_fs_info(struct kobject *kobj) { if (kobj->ktype != &btrfs_ktype) return NULL; - return container_of(kobj, struct btrfs_fs_info, super_kobj); + return to_fs_devs(kobj)->fs_info; } #define NUM_FEATURE_BITS 64 @@ -493,12 +502,12 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add) attrs[0] = &fa->kobj_attr.attr; if (add) { int ret; - ret = sysfs_merge_group(&fs_info->super_kobj, + ret = sysfs_merge_group(&fs_info->fs_devices->super_kobj, &agroup); if (ret) return ret; } else - sysfs_unmerge_group(&fs_info->super_kobj, + sysfs_unmerge_group(&fs_info->fs_devices->super_kobj, &agroup); } @@ -506,25 +515,49 @@ static int addrm_unknown_feature_attrs(struct btrfs_fs_info *fs_info, bool add) return 0; } -static void __btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) +static void __btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) +{ + if (fs_devs->device_dir_kobj) { + kobject_del(fs_devs->device_dir_kobj); + kobject_put(fs_devs->device_dir_kobj); + fs_devs->device_dir_kobj = NULL; + } + + if (fs_devs->super_kobj.state_initialized) { + kobject_del(&fs_devs->super_kobj); + kobject_put(&fs_devs->super_kobj); + wait_for_completion(&fs_devs->kobj_unregister); + } +} + +/* when fs_devs is NULL it will remove all fsid kobject */ +void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) { - kobject_del(&fs_info->super_kobj); - kobject_put(&fs_info->super_kobj); - wait_for_completion(&fs_info->kobj_unregister); + struct list_head *fs_uuids = btrfs_get_fs_uuids(); + + if (fs_devs) { + __btrfs_sysfs_remove_fsid(fs_devs); + return; + } + + list_for_each_entry(fs_devs, fs_uuids, list) { + __btrfs_sysfs_remove_fsid(fs_devs); + } } void btrfs_sysfs_remove_one(struct btrfs_fs_info *fs_info) { + btrfs_reset_fs_info_ptr(fs_info); + if (fs_info->space_info_kobj) { sysfs_remove_files(fs_info->space_info_kobj, allocation_attrs); kobject_del(fs_info->space_info_kobj); kobject_put(fs_info->space_info_kobj); } - kobject_del(fs_info->device_dir_kobj); - kobject_put(fs_info->device_dir_kobj); addrm_unknown_feature_attrs(fs_info, false); - sysfs_remove_group(&fs_info->super_kobj, &btrfs_feature_attr_group); - __btrfs_sysfs_remove_one(fs_info); + sysfs_remove_group(&fs_info->fs_devices->super_kobj, &btrfs_feature_attr_group); + sysfs_remove_files(&fs_info->fs_devices->super_kobj, btrfs_attrs); + btrfs_kobj_rm_device(fs_info->fs_devices, NULL); } const char * const btrfs_feature_set_names[3] = { @@ -602,40 +635,60 @@ static void init_feature_attrs(void) } } -int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info, +/* when one_device is NULL, it removes all device links */ + +int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device) { struct hd_struct *disk; struct kobject *disk_kobj; - if (!fs_info->device_dir_kobj) + if (!fs_devices->device_dir_kobj) return -EINVAL; if (one_device && one_device->bdev) { disk = one_device->bdev->bd_part; disk_kobj = &part_to_dev(disk)->kobj; - sysfs_remove_link(fs_info->device_dir_kobj, + sysfs_remove_link(fs_devices->device_dir_kobj, + disk_kobj->name); + } + + if (one_device) + return 0; + + list_for_each_entry(one_device, + &fs_devices->devices, dev_list) { + if (!one_device->bdev) + continue; + disk = one_device->bdev->bd_part; + disk_kobj = &part_to_dev(disk)->kobj; + + sysfs_remove_link(fs_devices->device_dir_kobj, disk_kobj->name); } return 0; } -int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info, - struct btrfs_device *one_device) +int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs) { - int error = 0; - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - struct btrfs_device *dev; - - if (!fs_info->device_dir_kobj) - fs_info->device_dir_kobj = kobject_create_and_add("devices", - &fs_info->super_kobj); + if (!fs_devs->device_dir_kobj) + fs_devs->device_dir_kobj = kobject_create_and_add("devices", + &fs_devs->super_kobj); - if (!fs_info->device_dir_kobj) + if (!fs_devs->device_dir_kobj) return -ENOMEM; + return 0; +} + +int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices, + struct btrfs_device *one_device) +{ + int error = 0; + struct btrfs_device *dev; + list_for_each_entry(dev, &fs_devices->devices, dev_list) { struct hd_struct *disk; struct kobject *disk_kobj; @@ -649,7 +702,7 @@ int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info, disk = dev->bdev->bd_part; disk_kobj = &part_to_dev(disk)->kobj; - error = sysfs_create_link(fs_info->device_dir_kobj, + error = sysfs_create_link(fs_devices->device_dir_kobj, disk_kobj, disk_kobj->name); if (error) break; @@ -667,34 +720,51 @@ static struct dentry *btrfs_debugfs_root_dentry; /* Debugging tunables and exported data */ u64 btrfs_debugfs_test; +/* + * Can be called by the device discovery thread. + * And parent can be specified for seed device + */ +int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, + struct kobject *parent) +{ + int error; + + init_completion(&fs_devs->kobj_unregister); + fs_devs->super_kobj.kset = btrfs_kset; + error = kobject_init_and_add(&fs_devs->super_kobj, + &btrfs_ktype, parent, "%pU", fs_devs->fsid); + return error; +} + int btrfs_sysfs_add_one(struct btrfs_fs_info *fs_info) { int error; + struct btrfs_fs_devices *fs_devs = fs_info->fs_devices; + struct kobject *super_kobj = &fs_devs->super_kobj; + + btrfs_set_fs_info_ptr(fs_info); - init_completion(&fs_info->kobj_unregister); - fs_info->super_kobj.kset = btrfs_kset; - error = kobject_init_and_add(&fs_info->super_kobj, &btrfs_ktype, NULL, - "%pU", fs_info->fsid); + error = btrfs_kobj_add_device(fs_devs, NULL); if (error) return error; - error = sysfs_create_group(&fs_info->super_kobj, - &btrfs_feature_attr_group); + error = sysfs_create_files(super_kobj, btrfs_attrs); if (error) { - __btrfs_sysfs_remove_one(fs_info); + btrfs_kobj_rm_device(fs_devs, NULL); return error; } - error = addrm_unknown_feature_attrs(fs_info, true); + error = sysfs_create_group(super_kobj, + &btrfs_feature_attr_group); if (error) goto failure; - error = btrfs_kobj_add_device(fs_info, NULL); + error = addrm_unknown_feature_attrs(fs_info, true); if (error) goto failure; fs_info->space_info_kobj = kobject_create_and_add("allocation", - &fs_info->super_kobj); + super_kobj); if (!fs_info->space_info_kobj) { error = -ENOMEM; goto failure; diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 3a4bbed72..6392527bc 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -82,8 +82,12 @@ char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); extern const char * const btrfs_feature_set_names[3]; extern struct kobj_type space_info_ktype; extern struct kobj_type btrfs_raid_ktype; -int btrfs_kobj_add_device(struct btrfs_fs_info *fs_info, +int btrfs_kobj_add_device(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); -int btrfs_kobj_rm_device(struct btrfs_fs_info *fs_info, +int btrfs_kobj_rm_device(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); +int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, + struct kobject *parent); +int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs); +void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); #endif /* _BTRFS_SYSFS_H_ */ diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index c32a7ba76..846d277b1 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -21,6 +21,7 @@ #include "../transaction.h" #include "../disk-io.h" #include "../qgroup.h" +#include "../backref.h" static void init_dummy_trans(struct btrfs_trans_handle *trans) { @@ -227,6 +228,8 @@ static int test_no_shared_qgroup(struct btrfs_root *root) { struct btrfs_trans_handle trans; struct btrfs_fs_info *fs_info = root->fs_info; + struct ulist *old_roots = NULL; + struct ulist *new_roots = NULL; int ret; init_dummy_trans(&trans); @@ -238,10 +241,15 @@ static int test_no_shared_qgroup(struct btrfs_root *root) return ret; } - ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, - BTRFS_QGROUP_OPER_ADD_EXCL, 0); + /* + * Since the test trans doesn't havee the complicated delayed refs, + * we can only call btrfs_qgroup_account_extent() directly to test + * quota. + */ + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); if (ret) { - test_msg("Couldn't add space to a qgroup %d\n", ret); + ulist_free(old_roots); + test_msg("Couldn't find old roots: %d\n", ret); return ret; } @@ -249,9 +257,18 @@ static int test_no_shared_qgroup(struct btrfs_root *root) if (ret) return ret; - ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); + if (ret) { + ulist_free(old_roots); + ulist_free(new_roots); + test_msg("Couldn't find old roots: %d\n", ret); + return ret; + } + + ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, + old_roots, new_roots); if (ret) { - test_msg("Delayed qgroup accounting failed %d\n", ret); + test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } @@ -259,21 +276,32 @@ static int test_no_shared_qgroup(struct btrfs_root *root) test_msg("Qgroup counts didn't match expected values\n"); return -EINVAL; } + old_roots = NULL; + new_roots = NULL; + + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + if (ret) { + ulist_free(old_roots); + test_msg("Couldn't find old roots: %d\n", ret); + return ret; + } ret = remove_extent_item(root, 4096, 4096); if (ret) return -EINVAL; - ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, - BTRFS_QGROUP_OPER_SUB_EXCL, 0); + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); if (ret) { - test_msg("Couldn't remove space from the qgroup %d\n", ret); - return -EINVAL; + ulist_free(old_roots); + ulist_free(new_roots); + test_msg("Couldn't find old roots: %d\n", ret); + return ret; } - ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, + old_roots, new_roots); if (ret) { - test_msg("Qgroup accounting failed %d\n", ret); + test_msg("Couldn't account space for a qgroup %d\n", ret); return -EINVAL; } @@ -294,6 +322,8 @@ static int test_multiple_refs(struct btrfs_root *root) { struct btrfs_trans_handle trans; struct btrfs_fs_info *fs_info = root->fs_info; + struct ulist *old_roots = NULL; + struct ulist *new_roots = NULL; int ret; init_dummy_trans(&trans); @@ -307,20 +337,29 @@ static int test_multiple_refs(struct btrfs_root *root) return ret; } + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + if (ret) { + ulist_free(old_roots); + test_msg("Couldn't find old roots: %d\n", ret); + return ret; + } + ret = insert_normal_tree_ref(root, 4096, 4096, 0, 5); if (ret) return ret; - ret = btrfs_qgroup_record_ref(&trans, fs_info, 5, 4096, 4096, - BTRFS_QGROUP_OPER_ADD_EXCL, 0); + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); if (ret) { - test_msg("Couldn't add space to a qgroup %d\n", ret); + ulist_free(old_roots); + ulist_free(new_roots); + test_msg("Couldn't find old roots: %d\n", ret); return ret; } - ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, + old_roots, new_roots); if (ret) { - test_msg("Delayed qgroup accounting failed %d\n", ret); + test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } @@ -329,20 +368,29 @@ static int test_multiple_refs(struct btrfs_root *root) return -EINVAL; } + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + if (ret) { + ulist_free(old_roots); + test_msg("Couldn't find old roots: %d\n", ret); + return ret; + } + ret = add_tree_ref(root, 4096, 4096, 0, 256); if (ret) return ret; - ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096, - BTRFS_QGROUP_OPER_ADD_SHARED, 0); + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); if (ret) { - test_msg("Qgroup record ref failed %d\n", ret); + ulist_free(old_roots); + ulist_free(new_roots); + test_msg("Couldn't find old roots: %d\n", ret); return ret; } - ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, + old_roots, new_roots); if (ret) { - test_msg("Qgroup accounting failed %d\n", ret); + test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } @@ -356,20 +404,29 @@ static int test_multiple_refs(struct btrfs_root *root) return -EINVAL; } + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &old_roots); + if (ret) { + ulist_free(old_roots); + test_msg("Couldn't find old roots: %d\n", ret); + return ret; + } + ret = remove_extent_ref(root, 4096, 4096, 0, 256); if (ret) return ret; - ret = btrfs_qgroup_record_ref(&trans, fs_info, 256, 4096, 4096, - BTRFS_QGROUP_OPER_SUB_SHARED, 0); + ret = btrfs_find_all_roots(&trans, fs_info, 4096, 0, &new_roots); if (ret) { - test_msg("Qgroup record ref failed %d\n", ret); + ulist_free(old_roots); + ulist_free(new_roots); + test_msg("Couldn't find old roots: %d\n", ret); return ret; } - ret = btrfs_delayed_qgroup_accounting(&trans, fs_info); + ret = btrfs_qgroup_account_extent(&trans, fs_info, 4096, 4096, + old_roots, new_roots); if (ret) { - test_msg("Qgroup accounting failed %d\n", ret); + test_msg("Couldn't account space for a qgroup %d\n", ret); return ret; } diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 94e909c5a..f5021fcb1 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -225,12 +225,14 @@ loop: cur_trans->dirty_bg_run = 0; cur_trans->delayed_refs.href_root = RB_ROOT; + cur_trans->delayed_refs.dirty_extent_root = RB_ROOT; atomic_set(&cur_trans->delayed_refs.num_entries, 0); cur_trans->delayed_refs.num_heads_ready = 0; cur_trans->delayed_refs.pending_csums = 0; cur_trans->delayed_refs.num_heads = 0; cur_trans->delayed_refs.flushing = 0; cur_trans->delayed_refs.run_delayed_start = 0; + cur_trans->delayed_refs.qgroup_to_skip = 0; /* * although the tree mod log is per file system and not per transaction, @@ -509,6 +511,7 @@ again: h->transaction = cur_trans; h->blocks_used = 0; h->bytes_reserved = 0; + h->chunk_bytes_reserved = 0; h->root = root; h->delayed_ref_updates = 0; h->use_count = 1; @@ -792,6 +795,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, if (!list_empty(&trans->new_bgs)) btrfs_create_pending_block_groups(trans, root); + btrfs_trans_release_chunk_metadata(trans); + if (lock && !atomic_read(&root->fs_info->open_ioctl_trans) && should_end_transaction(trans, root) && ACCESS_ONCE(cur_trans->state) == TRANS_STATE_RUNNING) { @@ -1290,6 +1295,12 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, if (pending->error) goto no_free_objectid; + /* + * Make qgroup to skip current new snapshot's qgroupid, as it is + * accounted by later btrfs_qgroup_inherit(). + */ + btrfs_set_skip_qgroup(trans, objectid); + btrfs_reloc_pre_snapshot(trans, pending, &to_reserve); if (to_reserve > 0) { @@ -1298,7 +1309,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, to_reserve, BTRFS_RESERVE_NO_FLUSH); if (pending->error) - goto no_free_objectid; + goto clear_skip_qgroup; } key.objectid = objectid; @@ -1396,25 +1407,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, root, ret); goto fail; } - - /* - * We need to flush delayed refs in order to make sure all of our quota - * operations have been done before we call btrfs_qgroup_inherit. - */ - ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto fail; - } - - ret = btrfs_qgroup_inherit(trans, fs_info, - root->root_key.objectid, - objectid, pending->inherit); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto fail; - } - /* see comments in should_cow_block() */ set_bit(BTRFS_ROOT_FORCE_COW, &root->state); smp_wmb(); @@ -1497,11 +1489,37 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; } } + + ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + + /* + * account qgroup counters before qgroup_inherit() + */ + ret = btrfs_qgroup_prepare_account_extents(trans, fs_info); + if (ret) + goto fail; + ret = btrfs_qgroup_account_extents(trans, fs_info); + if (ret) + goto fail; + ret = btrfs_qgroup_inherit(trans, fs_info, + root->root_key.objectid, + objectid, pending->inherit); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto fail; + } + fail: pending->error = ret; dir_item_existed: trans->block_rsv = rsv; trans->bytes_reserved = 0; +clear_skip_qgroup: + btrfs_clear_skip_qgroup(trans); no_free_objectid: kfree(new_root_item); root_item_alloc_fail: @@ -1963,6 +1981,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, goto scrub_continue; } + /* Reocrd old roots for later qgroup accounting */ + ret = btrfs_qgroup_prepare_account_extents(trans, root->fs_info); + if (ret) { + mutex_unlock(&root->fs_info->reloc_mutex); + goto scrub_continue; + } + /* * make sure none of the code above managed to slip in a * delayed item @@ -2004,6 +2029,17 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, */ btrfs_free_log_root_tree(trans, root->fs_info); + /* + * Since fs roots are all committed, we can get a quite accurate + * new_roots. So let's do quota accounting. + */ + ret = btrfs_qgroup_account_extents(trans, root->fs_info); + if (ret < 0) { + mutex_unlock(&root->fs_info->tree_log_mutex); + mutex_unlock(&root->fs_info->reloc_mutex); + goto scrub_continue; + } + ret = commit_cowonly_roots(trans, root); if (ret) { mutex_unlock(&root->fs_info->tree_log_mutex); @@ -2054,6 +2090,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, clear_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags); clear_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags); + btrfs_trans_release_chunk_metadata(trans); + spin_lock(&root->fs_info->trans_lock); cur_trans->state = TRANS_STATE_UNBLOCKED; root->fs_info->running_transaction = NULL; @@ -2114,7 +2152,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, kmem_cache_free(btrfs_trans_handle_cachep, trans); - if (current != root->fs_info->transaction_kthread) + if (current != root->fs_info->transaction_kthread && + current != root->fs_info->cleaner_kthread) btrfs_run_delayed_iputs(root); return ret; @@ -2123,6 +2162,7 @@ scrub_continue: btrfs_scrub_continue(root); cleanup_transaction: btrfs_trans_release_metadata(trans, root); + btrfs_trans_release_chunk_metadata(trans); trans->block_rsv = NULL; if (trans->qgroup_reserved) { btrfs_qgroup_free(root, trans->qgroup_reserved); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 0b2475559..eb09c2067 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -102,6 +102,7 @@ struct btrfs_transaction { struct btrfs_trans_handle { u64 transid; u64 bytes_reserved; + u64 chunk_bytes_reserved; u64 qgroup_reserved; unsigned long use_count; unsigned long blocks_reserved; @@ -153,6 +154,29 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, spin_unlock(&BTRFS_I(inode)->lock); } +/* + * Make qgroup codes to skip given qgroupid, means the old/new_roots for + * qgroup won't contain the qgroupid in it. + */ +static inline void btrfs_set_skip_qgroup(struct btrfs_trans_handle *trans, + u64 qgroupid) +{ + struct btrfs_delayed_ref_root *delayed_refs; + + delayed_refs = &trans->transaction->delayed_refs; + WARN_ON(delayed_refs->qgroup_to_skip); + delayed_refs->qgroup_to_skip = qgroupid; +} + +static inline void btrfs_clear_skip_qgroup(struct btrfs_trans_handle *trans) +{ + struct btrfs_delayed_ref_root *delayed_refs; + + delayed_refs = &trans->transaction->delayed_refs; + WARN_ON(!delayed_refs->qgroup_to_skip); + delayed_refs->qgroup_to_skip = 0; +} + int btrfs_end_transaction(struct btrfs_trans_handle *trans, struct btrfs_root *root); struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index a63719cc9..a4b9c8b2d 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -52,9 +52,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) goto out; - if (btrfs_test_opt(root, SSD)) - goto out; - path = btrfs_alloc_path(); if (!path) return -ENOMEM; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4920fceff..9c45431e6 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3881,12 +3881,6 @@ static int wait_ordered_extents(struct btrfs_trans_handle *trans, &ordered->flags)) continue; - if (ordered->csum_bytes_left) { - btrfs_start_ordered_extent(inode, ordered, 0); - wait_event(ordered->wait, - ordered->csum_bytes_left == 0); - } - list_for_each_entry(sum, &ordered->list, list) { ret = btrfs_csum_file_blocks(trans, log, sum); if (ret) @@ -4123,6 +4117,187 @@ static int logged_inode_size(struct btrfs_root *log, struct inode *inode, return 0; } +/* + * At the moment we always log all xattrs. This is to figure out at log replay + * time which xattrs must have their deletion replayed. If a xattr is missing + * in the log tree and exists in the fs/subvol tree, we delete it. This is + * because if a xattr is deleted, the inode is fsynced and a power failure + * happens, causing the log to be replayed the next time the fs is mounted, + * we want the xattr to not exist anymore (same behaviour as other filesystems + * with a journal, ext3/4, xfs, f2fs, etc). + */ +static int btrfs_log_all_xattrs(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, + struct btrfs_path *path, + struct btrfs_path *dst_path) +{ + int ret; + struct btrfs_key key; + const u64 ino = btrfs_ino(inode); + int ins_nr = 0; + int start_slot = 0; + + key.objectid = ino; + key.type = BTRFS_XATTR_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ret; + + while (true) { + int slot = path->slots[0]; + struct extent_buffer *leaf = path->nodes[0]; + int nritems = btrfs_header_nritems(leaf); + + if (slot >= nritems) { + if (ins_nr > 0) { + u64 last_extent = 0; + + ret = copy_items(trans, inode, dst_path, path, + &last_extent, start_slot, + ins_nr, 1, 0); + /* can't be 1, extent items aren't processed */ + ASSERT(ret <= 0); + if (ret < 0) + return ret; + ins_nr = 0; + } + ret = btrfs_next_leaf(root, path); + if (ret < 0) + return ret; + else if (ret > 0) + break; + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != ino || key.type != BTRFS_XATTR_ITEM_KEY) + break; + + if (ins_nr == 0) + start_slot = slot; + ins_nr++; + path->slots[0]++; + cond_resched(); + } + if (ins_nr > 0) { + u64 last_extent = 0; + + ret = copy_items(trans, inode, dst_path, path, + &last_extent, start_slot, + ins_nr, 1, 0); + /* can't be 1, extent items aren't processed */ + ASSERT(ret <= 0); + if (ret < 0) + return ret; + } + + return 0; +} + +/* + * If the no holes feature is enabled we need to make sure any hole between the + * last extent and the i_size of our inode is explicitly marked in the log. This + * is to make sure that doing something like: + * + * 1) create file with 128Kb of data + * 2) truncate file to 64Kb + * 3) truncate file to 256Kb + * 4) fsync file + * 5) <crash/power failure> + * 6) mount fs and trigger log replay + * + * Will give us a file with a size of 256Kb, the first 64Kb of data match what + * the file had in its first 64Kb of data at step 1 and the last 192Kb of the + * file correspond to a hole. The presence of explicit holes in a log tree is + * what guarantees that log replay will remove/adjust file extent items in the + * fs/subvol tree. + * + * Here we do not need to care about holes between extents, that is already done + * by copy_items(). We also only need to do this in the full sync path, where we + * lookup for extents from the fs/subvol tree only. In the fast path case, we + * lookup the list of modified extent maps and if any represents a hole, we + * insert a corresponding extent representing a hole in the log tree. + */ +static int btrfs_log_trailing_hole(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, + struct btrfs_path *path) +{ + int ret; + struct btrfs_key key; + u64 hole_start; + u64 hole_size; + struct extent_buffer *leaf; + struct btrfs_root *log = root->log_root; + const u64 ino = btrfs_ino(inode); + const u64 i_size = i_size_read(inode); + + if (!btrfs_fs_incompat(root->fs_info, NO_HOLES)) + return 0; + + key.objectid = ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + ASSERT(ret != 0); + if (ret < 0) + return ret; + + ASSERT(path->slots[0] > 0); + path->slots[0]--; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY) { + /* inode does not have any extents */ + hole_start = 0; + hole_size = i_size; + } else { + struct btrfs_file_extent_item *extent; + u64 len; + + /* + * If there's an extent beyond i_size, an explicit hole was + * already inserted by copy_items(). + */ + if (key.offset >= i_size) + return 0; + + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + if (btrfs_file_extent_type(leaf, extent) == + BTRFS_FILE_EXTENT_INLINE) { + len = btrfs_file_extent_inline_len(leaf, + path->slots[0], + extent); + ASSERT(len == i_size); + return 0; + } + + len = btrfs_file_extent_num_bytes(leaf, extent); + /* Last extent goes beyond i_size, no need to log a hole. */ + if (key.offset + len > i_size) + return 0; + hole_start = key.offset + len; + hole_size = i_size - hole_start; + } + btrfs_release_path(path); + + /* Last extent ends at i_size. */ + if (hole_size == 0) + return 0; + + hole_size = ALIGN(hole_size, root->sectorsize); + ret = btrfs_insert_file_extent(trans, log, ino, hole_start, 0, 0, + hole_size, 0, hole_size, 0, 0, 0); + return ret; +} + /* log a single inode in the tree log. * At least one parent directory for this inode must exist in the tree * or be logged already. @@ -4295,6 +4470,25 @@ again: if (min_key.type == BTRFS_INODE_ITEM_KEY) need_log_inode_item = false; + /* Skip xattrs, we log them later with btrfs_log_all_xattrs() */ + if (min_key.type == BTRFS_XATTR_ITEM_KEY) { + if (ins_nr == 0) + goto next_slot; + ret = copy_items(trans, inode, dst_path, path, + &last_extent, ins_start_slot, + ins_nr, inode_only, logged_isize); + if (ret < 0) { + err = ret; + goto out_unlock; + } + ins_nr = 0; + if (ret) { + btrfs_release_path(path); + continue; + } + goto next_slot; + } + src = path->nodes[0]; if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { ins_nr++; @@ -4362,6 +4556,18 @@ next_slot: ins_nr = 0; } + btrfs_release_path(path); + btrfs_release_path(dst_path); + err = btrfs_log_all_xattrs(trans, root, inode, path, dst_path); + if (err) + goto out_unlock; + if (max_key.type >= BTRFS_EXTENT_DATA_KEY && !fast_search) { + btrfs_release_path(path); + btrfs_release_path(dst_path); + err = btrfs_log_trailing_hole(trans, root, inode, path); + if (err) + goto out_unlock; + } log_extents: btrfs_release_path(path); btrfs_release_path(dst_path); diff --git a/fs/btrfs/ulist.c b/fs/btrfs/ulist.c index 840a38b27..91feb2bde 100644 --- a/fs/btrfs/ulist.c +++ b/fs/btrfs/ulist.c @@ -132,6 +132,15 @@ static struct ulist_node *ulist_rbtree_search(struct ulist *ulist, u64 val) return NULL; } +static void ulist_rbtree_erase(struct ulist *ulist, struct ulist_node *node) +{ + rb_erase(&node->rb_node, &ulist->root); + list_del(&node->list); + kfree(node); + BUG_ON(ulist->nnodes == 0); + ulist->nnodes--; +} + static int ulist_rbtree_insert(struct ulist *ulist, struct ulist_node *ins) { struct rb_node **p = &ulist->root.rb_node; @@ -197,9 +206,6 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, node->val = val; node->aux = aux; -#ifdef CONFIG_BTRFS_DEBUG - node->seqnum = ulist->nnodes; -#endif ret = ulist_rbtree_insert(ulist, node); ASSERT(!ret); @@ -209,6 +215,33 @@ int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, return 1; } +/* + * ulist_del - delete one node from ulist + * @ulist: ulist to remove node from + * @val: value to delete + * @aux: aux to delete + * + * The deletion will only be done when *BOTH* val and aux matches. + * Return 0 for successful delete. + * Return > 0 for not found. + */ +int ulist_del(struct ulist *ulist, u64 val, u64 aux) +{ + struct ulist_node *node; + + node = ulist_rbtree_search(ulist, val); + /* Not found */ + if (!node) + return 1; + + if (node->aux != aux) + return 1; + + /* Found and delete */ + ulist_rbtree_erase(ulist, node); + return 0; +} + /** * ulist_next - iterate ulist * @ulist: ulist to iterate @@ -237,15 +270,7 @@ struct ulist_node *ulist_next(struct ulist *ulist, struct ulist_iterator *uiter) uiter->cur_list = uiter->cur_list->next; } else { uiter->cur_list = ulist->nodes.next; -#ifdef CONFIG_BTRFS_DEBUG - uiter->i = 0; -#endif } node = list_entry(uiter->cur_list, struct ulist_node, list); -#ifdef CONFIG_BTRFS_DEBUG - ASSERT(node->seqnum == uiter->i); - ASSERT(uiter->i >= 0 && uiter->i < ulist->nnodes); - uiter->i++; -#endif return node; } diff --git a/fs/btrfs/ulist.h b/fs/btrfs/ulist.h index 4c29db604..a01a2c458 100644 --- a/fs/btrfs/ulist.h +++ b/fs/btrfs/ulist.h @@ -57,6 +57,7 @@ void ulist_free(struct ulist *ulist); int ulist_add(struct ulist *ulist, u64 val, u64 aux, gfp_t gfp_mask); int ulist_add_merge(struct ulist *ulist, u64 val, u64 aux, u64 *old_aux, gfp_t gfp_mask); +int ulist_del(struct ulist *ulist, u64 val, u64 aux); /* just like ulist_add_merge() but take a pointer for the aux data */ static inline int ulist_add_merge_ptr(struct ulist *ulist, u64 val, void *aux, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 174f5e1e0..fbe7c1045 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -52,6 +52,10 @@ static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); +struct list_head *btrfs_get_fs_uuids(void) +{ + return &fs_uuids; +} static struct btrfs_fs_devices *__alloc_fs_devices(void) { @@ -345,7 +349,7 @@ loop_lock: waitqueue_active(&fs_info->async_submit_wait)) wake_up(&fs_info->async_submit_wait); - BUG_ON(atomic_read(&cur->bi_cnt) == 0); + BUG_ON(atomic_read(&cur->__bi_cnt) == 0); /* * if we're doing the sync list, record that our @@ -441,6 +445,61 @@ static void pending_bios_fn(struct btrfs_work *work) run_scheduled_bios(device); } + +void btrfs_free_stale_device(struct btrfs_device *cur_dev) +{ + struct btrfs_fs_devices *fs_devs; + struct btrfs_device *dev; + + if (!cur_dev->name) + return; + + list_for_each_entry(fs_devs, &fs_uuids, list) { + int del = 1; + + if (fs_devs->opened) + continue; + if (fs_devs->seeding) + continue; + + list_for_each_entry(dev, &fs_devs->devices, dev_list) { + + if (dev == cur_dev) + continue; + if (!dev->name) + continue; + + /* + * Todo: This won't be enough. What if the same device + * comes back (with new uuid and) with its mapper path? + * But for now, this does help as mostly an admin will + * either use mapper or non mapper path throughout. + */ + rcu_read_lock(); + del = strcmp(rcu_str_deref(dev->name), + rcu_str_deref(cur_dev->name)); + rcu_read_unlock(); + if (!del) + break; + } + + if (!del) { + /* delete the stale device */ + if (fs_devs->num_devices == 1) { + btrfs_sysfs_remove_fsid(fs_devs); + list_del(&fs_devs->list); + free_fs_devices(fs_devs); + } else { + fs_devs->num_devices--; + list_del(&dev->dev_list); + rcu_string_free(dev->name); + kfree(dev); + } + break; + } + } +} + /* * Add new device to list of registered devices * @@ -556,6 +615,12 @@ static noinline int device_list_add(const char *path, if (!fs_devices->opened) device->generation = found_transid; + /* + * if there is new btrfs on an already registered device, + * then remove the stale device entry. + */ + btrfs_free_stale_device(device); + *fs_devices_ret = fs_devices; return ret; @@ -693,13 +758,13 @@ static void free_device(struct rcu_head *head) static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) { - struct btrfs_device *device; + struct btrfs_device *device, *tmp; if (--fs_devices->opened > 0) return 0; mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry(device, &fs_devices->devices, dev_list) { + list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) { struct btrfs_device *new_device; struct rcu_string *name; @@ -1067,15 +1132,31 @@ again: map = (struct map_lookup *)em->bdev; for (i = 0; i < map->num_stripes; i++) { + u64 end; + if (map->stripes[i].dev != device) continue; if (map->stripes[i].physical >= physical_start + len || map->stripes[i].physical + em->orig_block_len <= physical_start) continue; - *start = map->stripes[i].physical + - em->orig_block_len; - ret = 1; + /* + * Make sure that while processing the pinned list we do + * not override our *start with a lower value, because + * we can have pinned chunks that fall within this + * device hole and that have lower physical addresses + * than the pending chunks we processed before. If we + * do not take this special care we can end up getting + * 2 pending chunks that start at the same physical + * device offsets because the end offset of a pinned + * chunk can be equal to the start offset of some + * pending chunk. + */ + end = map->stripes[i].physical + em->orig_block_len; + if (end > *start) { + *start = end; + ret = 1; + } } } if (search_list == &trans->transaction->pending_chunks) { @@ -1706,7 +1787,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (device->bdev) { device->fs_devices->open_devices--; /* remove sysfs entry */ - btrfs_kobj_rm_device(root->fs_info, device); + btrfs_kobj_rm_device(root->fs_info->fs_devices, device); } call_rcu(&device->rcu, free_device); @@ -1875,6 +1956,9 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, mutex_lock(&uuid_mutex); WARN_ON(!tgtdev); mutex_lock(&fs_info->fs_devices->device_list_mutex); + + btrfs_kobj_rm_device(fs_info->fs_devices, tgtdev); + if (tgtdev->bdev) { btrfs_scratch_superblock(tgtdev); fs_info->fs_devices->open_devices--; @@ -2211,7 +2295,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) tmp + 1); /* add sysfs device entry */ - btrfs_kobj_add_device(root->fs_info, device); + btrfs_kobj_add_device(root->fs_info->fs_devices, device); /* * we've got more storage, clear any full flags on the space @@ -2252,8 +2336,9 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) */ snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", root->fs_info->fsid); - if (kobject_rename(&root->fs_info->super_kobj, fsid_buf)) - goto error_trans; + if (kobject_rename(&root->fs_info->fs_devices->super_kobj, + fsid_buf)) + pr_warn("BTRFS: sysfs: failed to create fsid for sprout\n"); } root->fs_info->num_tolerated_disk_barrier_failures = @@ -2289,7 +2374,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) error_trans: btrfs_end_transaction(trans, root); rcu_string_free(device->name); - btrfs_kobj_rm_device(root->fs_info, device); + btrfs_kobj_rm_device(root->fs_info->fs_devices, device); kfree(device); error: blkdev_put(bdev, FMODE_EXCL); @@ -2609,6 +2694,9 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, return -EINVAL; } map = (struct map_lookup *)em->bdev; + lock_chunks(root->fs_info->chunk_root); + check_system_chunk(trans, extent_root, map->type); + unlock_chunks(root->fs_info->chunk_root); for (i = 0; i < map->num_stripes; i++) { struct btrfs_device *device = map->stripes[i].dev; @@ -2678,6 +2766,20 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, root = root->fs_info->chunk_root; extent_root = root->fs_info->extent_root; + /* + * Prevent races with automatic removal of unused block groups. + * After we relocate and before we remove the chunk with offset + * chunk_offset, automatic removal of the block group can kick in, + * resulting in a failure when calling btrfs_remove_chunk() below. + * + * Make sure to acquire this mutex before doing a tree search (dev + * or chunk trees) to find chunks. Otherwise the cleaner kthread might + * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after + * we release the path used to search the chunk/dev tree and before + * the current task acquires this mutex and calls us. + */ + ASSERT(mutex_is_locked(&root->fs_info->delete_unused_bgs_mutex)); + ret = btrfs_can_relocate(extent_root, chunk_offset); if (ret) return -ENOSPC; @@ -2726,13 +2828,18 @@ again: key.type = BTRFS_CHUNK_ITEM_KEY; while (1) { + mutex_lock(&root->fs_info->delete_unused_bgs_mutex); ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); - if (ret < 0) + if (ret < 0) { + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); goto error; + } BUG_ON(ret == 0); /* Corruption */ ret = btrfs_previous_item(chunk_root, path, key.objectid, key.type); + if (ret) + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); if (ret < 0) goto error; if (ret > 0) @@ -2755,6 +2862,7 @@ again: else BUG_ON(ret); } + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); if (found_key.offset == 0) break; @@ -3211,9 +3319,12 @@ again: goto error; } + mutex_lock(&fs_info->delete_unused_bgs_mutex); ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); - if (ret < 0) + if (ret < 0) { + mutex_unlock(&fs_info->delete_unused_bgs_mutex); goto error; + } /* * this shouldn't happen, it means the last relocate @@ -3225,6 +3336,7 @@ again: ret = btrfs_previous_item(chunk_root, path, 0, BTRFS_CHUNK_ITEM_KEY); if (ret) { + mutex_unlock(&fs_info->delete_unused_bgs_mutex); ret = 0; break; } @@ -3233,8 +3345,10 @@ again: slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &found_key, slot); - if (found_key.objectid != key.objectid) + if (found_key.objectid != key.objectid) { + mutex_unlock(&fs_info->delete_unused_bgs_mutex); break; + } chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); @@ -3247,10 +3361,13 @@ again: ret = should_balance_chunk(chunk_root, leaf, chunk, found_key.offset); btrfs_release_path(path); - if (!ret) + if (!ret) { + mutex_unlock(&fs_info->delete_unused_bgs_mutex); goto loop; + } if (counting) { + mutex_unlock(&fs_info->delete_unused_bgs_mutex); spin_lock(&fs_info->balance_lock); bctl->stat.expected++; spin_unlock(&fs_info->balance_lock); @@ -3260,6 +3377,7 @@ again: ret = btrfs_relocate_chunk(chunk_root, found_key.objectid, found_key.offset); + mutex_unlock(&fs_info->delete_unused_bgs_mutex); if (ret && ret != -ENOSPC) goto error; if (ret == -ENOSPC) { @@ -3908,9 +4026,9 @@ int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info) uuid_root = btrfs_create_tree(trans, fs_info, BTRFS_UUID_TREE_OBJECTID); if (IS_ERR(uuid_root)) { - btrfs_abort_transaction(trans, tree_root, - PTR_ERR(uuid_root)); - return PTR_ERR(uuid_root); + ret = PTR_ERR(uuid_root); + btrfs_abort_transaction(trans, tree_root, ret); + return ret; } fs_info->uuid_root = uuid_root; @@ -3965,6 +4083,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) int slot; int failed = 0; bool retried = false; + bool checked_pending_chunks = false; struct extent_buffer *l; struct btrfs_key key; struct btrfs_super_block *super_copy = root->fs_info->super_copy; @@ -3998,11 +4117,16 @@ again: key.type = BTRFS_DEV_EXTENT_KEY; do { + mutex_lock(&root->fs_info->delete_unused_bgs_mutex); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - if (ret < 0) + if (ret < 0) { + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); goto done; + } ret = btrfs_previous_item(root, path, 0, key.type); + if (ret) + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); if (ret < 0) goto done; if (ret) { @@ -4016,6 +4140,7 @@ again: btrfs_item_key_to_cpu(l, &key, path->slots[0]); if (key.objectid != device->devid) { + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); btrfs_release_path(path); break; } @@ -4024,6 +4149,7 @@ again: length = btrfs_dev_extent_length(l, dev_extent); if (key.offset + length <= new_size) { + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); btrfs_release_path(path); break; } @@ -4033,6 +4159,7 @@ again: btrfs_release_path(path); ret = btrfs_relocate_chunk(root, chunk_objectid, chunk_offset); + mutex_unlock(&root->fs_info->delete_unused_bgs_mutex); if (ret && ret != -ENOSPC) goto done; if (ret == -ENOSPC) @@ -4045,15 +4172,6 @@ again: goto again; } else if (failed && retried) { ret = -ENOSPC; - lock_chunks(root); - - btrfs_device_set_total_bytes(device, old_size); - if (device->writeable) - device->fs_devices->total_rw_bytes += diff; - spin_lock(&root->fs_info->free_chunk_lock); - root->fs_info->free_chunk_space += diff; - spin_unlock(&root->fs_info->free_chunk_lock); - unlock_chunks(root); goto done; } @@ -4065,6 +4183,35 @@ again: } lock_chunks(root); + + /* + * We checked in the above loop all device extents that were already in + * the device tree. However before we have updated the device's + * total_bytes to the new size, we might have had chunk allocations that + * have not complete yet (new block groups attached to transaction + * handles), and therefore their device extents were not yet in the + * device tree and we missed them in the loop above. So if we have any + * pending chunk using a device extent that overlaps the device range + * that we can not use anymore, commit the current transaction and + * repeat the search on the device tree - this way we guarantee we will + * not have chunks using device extents that end beyond 'new_size'. + */ + if (!checked_pending_chunks) { + u64 start = new_size; + u64 len = old_size - new_size; + + if (contains_pending_extent(trans, device, &start, len)) { + unlock_chunks(root); + checked_pending_chunks = true; + failed = 0; + retried = false; + ret = btrfs_commit_transaction(trans, root); + if (ret) + goto done; + goto again; + } + } + btrfs_device_set_disk_total_bytes(device, new_size); if (list_empty(&device->resized_list)) list_add_tail(&device->resized_list, @@ -4079,6 +4226,16 @@ again: btrfs_end_transaction(trans, root); done: btrfs_free_path(path); + if (ret) { + lock_chunks(root); + btrfs_device_set_total_bytes(device, old_size); + if (device->writeable) + device->fs_devices->total_rw_bytes += diff; + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += diff; + spin_unlock(&root->fs_info->free_chunk_lock); + unlock_chunks(root); + } return ret; } @@ -5586,17 +5743,16 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio, int err) { - if (likely(bbio->flags & BTRFS_BIO_ORIG_BIO_SUBMITTED)) - bio_endio_nodec(bio, err); - else - bio_endio(bio, err); + bio->bi_private = bbio->private; + bio->bi_end_io = bbio->end_io; + bio_endio(bio, err); + btrfs_put_bbio(bbio); } static void btrfs_end_bio(struct bio *bio, int err) { struct btrfs_bio *bbio = bio->bi_private; - struct btrfs_device *dev = bbio->stripes[0].dev; int is_orig_bio = 0; if (err) { @@ -5604,6 +5760,7 @@ static void btrfs_end_bio(struct bio *bio, int err) if (err == -EIO || err == -EREMOTEIO) { unsigned int stripe_index = btrfs_io_bio(bio)->stripe_index; + struct btrfs_device *dev; BUG_ON(stripe_index >= bbio->num_stripes); dev = bbio->stripes[stripe_index].dev; @@ -5633,8 +5790,6 @@ static void btrfs_end_bio(struct bio *bio, int err) bio = bbio->orig_bio; } - bio->bi_private = bbio->private; - bio->bi_end_io = bbio->end_io; btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; /* only send an error to the higher layers if it is * beyond the tolerance of the btrfs bio @@ -5816,8 +5971,6 @@ static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical) /* Shoud be the original bio. */ WARN_ON(bio != bbio->orig_bio); - bio->bi_private = bbio->private; - bio->bi_end_io = bbio->end_io; btrfs_io_bio(bio)->mirror_num = bbio->mirror_num; bio->bi_iter.bi_sector = logical >> 9; @@ -5898,10 +6051,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, if (dev_nr < total_devs - 1) { bio = btrfs_bio_clone(first_bio, GFP_NOFS); BUG_ON(!bio); /* -ENOMEM */ - } else { + } else bio = first_bio; - bbio->flags |= BTRFS_BIO_ORIG_BIO_SUBMITTED; - } submit_stripe_bio(root, bbio, bio, bbio->stripes[dev_nr].physical, dev_nr, rw, @@ -6078,6 +6229,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, free_extent_map(em); return -EIO; } + btrfs_warn(root->fs_info, "devid %llu uuid %pU is missing", + devid, uuid); } map->stripes[i].dev->in_fs_metadata = 1; } @@ -6197,10 +6350,11 @@ static int read_one_dev(struct btrfs_root *root, if (!btrfs_test_opt(root, DEGRADED)) return -EIO; - btrfs_warn(root->fs_info, "devid %llu missing", devid); device = add_missing_dev(root, fs_devices, devid, dev_uuid); if (!device) return -ENOMEM; + btrfs_warn(root->fs_info, "devid %llu uuid %pU missing", + devid, dev_uuid); } else { if (!device->bdev && !btrfs_test_opt(root, DEGRADED)) return -EIO; @@ -6728,3 +6882,21 @@ void btrfs_update_commit_device_bytes_used(struct btrfs_root *root, } unlock_chunks(root); } + +void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + while (fs_devices) { + fs_devices->fs_info = fs_info; + fs_devices = fs_devices->seed; + } +} + +void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + while (fs_devices) { + fs_devices->fs_info = NULL; + fs_devices = fs_devices->seed; + } +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index ebc31331a..95842a909 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -253,6 +253,12 @@ struct btrfs_fs_devices { * nonrot flag set */ int rotating; + + struct btrfs_fs_info *fs_info; + /* sysfs kobjects */ + struct kobject super_kobj; + struct kobject *device_dir_kobj; + struct completion kobj_unregister; }; #define BTRFS_BIO_INLINE_CSUM_SIZE 64 @@ -292,8 +298,6 @@ struct btrfs_bio_stripe { struct btrfs_bio; typedef void (btrfs_bio_end_io_t) (struct btrfs_bio *bio, int err); -#define BTRFS_BIO_ORIG_BIO_SUBMITTED (1 << 0) - struct btrfs_bio { atomic_t refs; atomic_t stripes_pending; @@ -537,5 +541,8 @@ static inline void unlock_chunks(struct btrfs_root *root) mutex_unlock(&root->fs_info->chunk_mutex); } +struct list_head *btrfs_get_fs_uuids(void); +void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info); +void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info); #endif diff --git a/fs/buffer.c b/fs/buffer.c index 8c50a2250..1cf7a53a0 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -30,6 +30,7 @@ #include <linux/quotaops.h> #include <linux/highmem.h> #include <linux/export.h> +#include <linux/backing-dev.h> #include <linux/writeback.h> #include <linux/hash.h> #include <linux/suspend.h> @@ -44,6 +45,9 @@ #include <trace/events/block.h> static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); +static int submit_bh_wbc(int rw, struct buffer_head *bh, + unsigned long bio_flags, + struct writeback_control *wbc); #define BH_ENTRY(list) list_entry((list), struct buffer_head, b_assoc_buffers) @@ -623,21 +627,22 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode); * * If warn is true, then emit a warning if the page is not uptodate and has * not been truncated. + * + * The caller must hold mem_cgroup_begin_page_stat() lock. */ -static void __set_page_dirty(struct page *page, - struct address_space *mapping, int warn) +static void __set_page_dirty(struct page *page, struct address_space *mapping, + struct mem_cgroup *memcg, int warn) { unsigned long flags; spin_lock_irqsave(&mapping->tree_lock, flags); if (page->mapping) { /* Race with truncate? */ WARN_ON_ONCE(warn && !PageUptodate(page)); - account_page_dirtied(page, mapping); + account_page_dirtied(page, mapping, memcg); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } spin_unlock_irqrestore(&mapping->tree_lock, flags); - __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } /* @@ -668,6 +673,7 @@ static void __set_page_dirty(struct page *page, int __set_page_dirty_buffers(struct page *page) { int newly_dirty; + struct mem_cgroup *memcg; struct address_space *mapping = page_mapping(page); if (unlikely(!mapping)) @@ -683,11 +689,22 @@ int __set_page_dirty_buffers(struct page *page) bh = bh->b_this_page; } while (bh != head); } + /* + * Use mem_group_begin_page_stat() to keep PageDirty synchronized with + * per-memcg dirty page counters. + */ + memcg = mem_cgroup_begin_page_stat(page); newly_dirty = !TestSetPageDirty(page); spin_unlock(&mapping->private_lock); if (newly_dirty) - __set_page_dirty(page, mapping, 1); + __set_page_dirty(page, mapping, memcg, 1); + + mem_cgroup_end_page_stat(memcg); + + if (newly_dirty) + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + return newly_dirty; } EXPORT_SYMBOL(__set_page_dirty_buffers); @@ -1158,11 +1175,18 @@ void mark_buffer_dirty(struct buffer_head *bh) if (!test_set_buffer_dirty(bh)) { struct page *page = bh->b_page; + struct address_space *mapping = NULL; + struct mem_cgroup *memcg; + + memcg = mem_cgroup_begin_page_stat(page); if (!TestSetPageDirty(page)) { - struct address_space *mapping = page_mapping(page); + mapping = page_mapping(page); if (mapping) - __set_page_dirty(page, mapping, 0); + __set_page_dirty(page, mapping, memcg, 0); } + mem_cgroup_end_page_stat(memcg); + if (mapping) + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } } EXPORT_SYMBOL(mark_buffer_dirty); @@ -1684,8 +1708,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page, struct buffer_head *bh, *head; unsigned int blocksize, bbits; int nr_underway = 0; - int write_op = (wbc->sync_mode == WB_SYNC_ALL ? - WRITE_SYNC : WRITE); + int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE); head = create_page_buffers(page, inode, (1 << BH_Dirty)|(1 << BH_Uptodate)); @@ -1774,7 +1797,7 @@ static int __block_write_full_page(struct inode *inode, struct page *page, do { struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { - submit_bh(write_op, bh); + submit_bh_wbc(write_op, bh, 0, wbc); nr_underway++; } bh = next; @@ -1828,7 +1851,7 @@ recover: struct buffer_head *next = bh->b_this_page; if (buffer_async_write(bh)) { clear_buffer_dirty(bh); - submit_bh(write_op, bh); + submit_bh_wbc(write_op, bh, 0, wbc); nr_underway++; } bh = next; @@ -2450,7 +2473,7 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, * Update file times before taking page lock. We may end up failing the * fault so this update may be superfluous but who really cares... */ - vma_file_update_time(vma); + file_update_time(vma->vm_file); ret = __block_page_mkwrite(vma, vmf, get_block); sb_end_pagefault(sb); @@ -2938,10 +2961,6 @@ static void end_bio_bh_io_sync(struct bio *bio, int err) { struct buffer_head *bh = bio->bi_private; - if (err == -EOPNOTSUPP) { - set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); - } - if (unlikely (test_bit(BIO_QUIET,&bio->bi_flags))) set_bit(BH_Quiet, &bh->b_state); @@ -2997,10 +3016,10 @@ void guard_bio_eod(int rw, struct bio *bio) } } -int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags) +static int submit_bh_wbc(int rw, struct buffer_head *bh, + unsigned long bio_flags, struct writeback_control *wbc) { struct bio *bio; - int ret = 0; BUG_ON(!buffer_locked(bh)); BUG_ON(!buffer_mapped(bh)); @@ -3020,6 +3039,11 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags) */ bio = bio_alloc(GFP_NOIO, 1); + if (wbc) { + wbc_init_bio(wbc, bio); + wbc_account_io(wbc, bh->b_page, bh->b_size); + } + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); bio->bi_bdev = bh->b_bdev; bio->bi_io_vec[0].bv_page = bh->b_page; @@ -3041,20 +3065,19 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags) if (buffer_prio(bh)) rw |= REQ_PRIO; - bio_get(bio); submit_bio(rw, bio); + return 0; +} - if (bio_flagged(bio, BIO_EOPNOTSUPP)) - ret = -EOPNOTSUPP; - - bio_put(bio); - return ret; +int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags) +{ + return submit_bh_wbc(rw, bh, bio_flags, NULL); } EXPORT_SYMBOL_GPL(_submit_bh); int submit_bh(int rw, struct buffer_head *bh) { - return _submit_bh(rw, bh, 0); + return submit_bh_wbc(rw, bh, 0, NULL); } EXPORT_SYMBOL(submit_bh); @@ -3243,8 +3266,8 @@ int try_to_free_buffers(struct page *page) * to synchronise against __set_page_dirty_buffers and prevent the * dirty bit from being lost. */ - if (ret && TestClearPageDirty(page)) - account_page_cleaned(page, mapping); + if (ret) + cancel_dirty_page(page); spin_unlock(&mapping->private_lock); out: if (buffers_to_free) { diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 8c52472d2..aecd0859e 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -43,7 +43,6 @@ struct cachefiles_object { loff_t i_size; /* object size */ unsigned long flags; #define CACHEFILES_OBJECT_ACTIVE 0 /* T if marked active */ -#define CACHEFILES_OBJECT_BURIED 1 /* T if preemptively buried */ atomic_t usage; /* object usage count */ uint8_t type; /* object type */ uint8_t new; /* T if object new */ diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index ab857ab9f..fc1056f5c 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -97,7 +97,8 @@ static noinline void cachefiles_printk_object(struct cachefiles_object *object, * call vfs_unlink(), vfs_rmdir() or vfs_rename() */ static void cachefiles_mark_object_buried(struct cachefiles_cache *cache, - struct dentry *dentry) + struct dentry *dentry, + enum fscache_why_object_killed why) { struct cachefiles_object *object; struct rb_node *p; @@ -132,8 +133,9 @@ found_dentry: pr_err("\n"); pr_err("Error: Can't preemptively bury live object\n"); cachefiles_printk_object(object, NULL); - } else if (test_and_set_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) { - pr_err("Error: Object already preemptively buried\n"); + } else { + if (why != FSCACHE_OBJECT_IS_STALE) + fscache_object_mark_killed(&object->fscache, why); } write_unlock(&cache->active_lock); @@ -265,7 +267,8 @@ requeue: static int cachefiles_bury_object(struct cachefiles_cache *cache, struct dentry *dir, struct dentry *rep, - bool preemptive) + bool preemptive, + enum fscache_why_object_killed why) { struct dentry *grave, *trap; struct path path, path_to_graveyard; @@ -289,7 +292,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache, ret = vfs_unlink(d_inode(dir), rep, NULL); if (preemptive) - cachefiles_mark_object_buried(cache, rep); + cachefiles_mark_object_buried(cache, rep, why); } mutex_unlock(&d_inode(dir)->i_mutex); @@ -394,7 +397,7 @@ try_again: "Rename failed with error %d", ret); if (preemptive) - cachefiles_mark_object_buried(cache, rep); + cachefiles_mark_object_buried(cache, rep, why); } unlock_rename(cache->graveyard, dir); @@ -422,7 +425,7 @@ int cachefiles_delete_object(struct cachefiles_cache *cache, mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); - if (test_bit(CACHEFILES_OBJECT_BURIED, &object->flags)) { + if (test_bit(FSCACHE_OBJECT_KILLED_BY_CACHE, &object->fscache.flags)) { /* object allocation for the same key preemptively deleted this * object's file so that it could create its own file */ _debug("object preemptively buried"); @@ -433,7 +436,8 @@ int cachefiles_delete_object(struct cachefiles_cache *cache, * may have been renamed */ if (dir == object->dentry->d_parent) { ret = cachefiles_bury_object(cache, dir, - object->dentry, false); + object->dentry, false, + FSCACHE_OBJECT_WAS_RETIRED); } else { /* it got moved, presumably by cachefilesd culling it, * so it's no longer in the key path and we can ignore @@ -522,7 +526,7 @@ lookup_again: if (d_is_negative(next)) { ret = cachefiles_has_space(cache, 1, 0); if (ret < 0) - goto create_error; + goto no_space_error; path.dentry = dir; ret = security_path_mkdir(&path, next, 0); @@ -551,7 +555,7 @@ lookup_again: if (d_is_negative(next)) { ret = cachefiles_has_space(cache, 1, 0); if (ret < 0) - goto create_error; + goto no_space_error; path.dentry = dir; ret = security_path_mknod(&path, next, S_IFREG, 0); @@ -602,7 +606,8 @@ lookup_again: * mutex) */ object->dentry = NULL; - ret = cachefiles_bury_object(cache, dir, next, true); + ret = cachefiles_bury_object(cache, dir, next, true, + FSCACHE_OBJECT_IS_STALE); dput(next); next = NULL; @@ -610,6 +615,7 @@ lookup_again: goto delete_error; _debug("redo lookup"); + fscache_object_retrying_stale(&object->fscache); goto lookup_again; } } @@ -662,6 +668,8 @@ lookup_again: _leave(" = 0 [%lu]", d_backing_inode(object->dentry)->i_ino); return 0; +no_space_error: + fscache_object_mark_killed(&object->fscache, FSCACHE_OBJECT_NO_SPACE); create_error: _debug("create error %d", ret); if (ret == -EIO) @@ -927,7 +935,8 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, /* actually remove the victim (drops the dir mutex) */ _debug("bury"); - ret = cachefiles_bury_object(cache, dir, victim, false); + ret = cachefiles_bury_object(cache, dir, victim, false, + FSCACHE_OBJECT_WAS_CULLED); if (ret < 0) goto error; diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 64fa24834..8f84646f1 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -187,10 +187,10 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode, val_size2 = posix_acl_xattr_size(default_acl->a_count); err = -ENOMEM; - tmp_buf = kmalloc(max(val_size1, val_size2), GFP_NOFS); + tmp_buf = kmalloc(max(val_size1, val_size2), GFP_KERNEL); if (!tmp_buf) goto out_err; - pagelist = kmalloc(sizeof(struct ceph_pagelist), GFP_NOFS); + pagelist = kmalloc(sizeof(struct ceph_pagelist), GFP_KERNEL); if (!pagelist) goto out_err; ceph_pagelist_init(pagelist); diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index e162bcd10..890c50971 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -87,17 +87,21 @@ static int ceph_set_page_dirty(struct page *page) inode = mapping->host; ci = ceph_inode(inode); - /* - * Note that we're grabbing a snapc ref here without holding - * any locks! - */ - snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context); - /* dirty the head */ spin_lock(&ci->i_ceph_lock); - if (ci->i_head_snapc == NULL) - ci->i_head_snapc = ceph_get_snap_context(snapc); - ++ci->i_wrbuffer_ref_head; + BUG_ON(ci->i_wr_ref == 0); // caller should hold Fw reference + if (__ceph_have_pending_cap_snap(ci)) { + struct ceph_cap_snap *capsnap = + list_last_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, + ci_item); + snapc = ceph_get_snap_context(capsnap->context); + capsnap->dirty_pages++; + } else { + BUG_ON(!ci->i_head_snapc); + snapc = ceph_get_snap_context(ci->i_head_snapc); + ++ci->i_wrbuffer_ref_head; + } if (ci->i_wrbuffer_ref == 0) ihold(inode); ++ci->i_wrbuffer_ref; @@ -346,7 +350,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) /* build page vector */ nr_pages = calc_pages_for(0, len); - pages = kmalloc(sizeof(*pages) * nr_pages, GFP_NOFS); + pages = kmalloc(sizeof(*pages) * nr_pages, GFP_KERNEL); ret = -ENOMEM; if (!pages) goto out; @@ -358,7 +362,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) dout("start_read %p adding %p idx %lu\n", inode, page, page->index); if (add_to_page_cache_lru(page, &inode->i_data, page->index, - GFP_NOFS)) { + GFP_KERNEL)) { ceph_fscache_uncache_page(inode, page); page_cache_release(page); dout("start_read %p add_to_page_cache failed %p\n", @@ -436,7 +440,7 @@ out: * only snap context we are allowed to write back. */ static struct ceph_snap_context *get_oldest_context(struct inode *inode, - u64 *snap_size) + loff_t *snap_size) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_snap_context *snapc = NULL; @@ -476,8 +480,9 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) struct ceph_osd_client *osdc; struct ceph_snap_context *snapc, *oldest; loff_t page_off = page_offset(page); + loff_t snap_size = -1; long writeback_stat; - u64 truncate_size, snap_size = 0; + u64 truncate_size; u32 truncate_seq; int err = 0, len = PAGE_CACHE_SIZE; @@ -512,7 +517,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) spin_lock(&ci->i_ceph_lock); truncate_seq = ci->i_truncate_seq; truncate_size = ci->i_truncate_size; - if (!snap_size) + if (snap_size == -1) snap_size = i_size_read(inode); spin_unlock(&ci->i_ceph_lock); @@ -695,7 +700,8 @@ static int ceph_writepages_start(struct address_space *mapping, unsigned wsize = 1 << inode->i_blkbits; struct ceph_osd_request *req = NULL; int do_sync = 0; - u64 truncate_size, snap_size; + loff_t snap_size, i_size; + u64 truncate_size; u32 truncate_seq; /* @@ -741,7 +747,7 @@ static int ceph_writepages_start(struct address_space *mapping, retry: /* find oldest snap context with dirty data */ ceph_put_snap_context(snapc); - snap_size = 0; + snap_size = -1; snapc = get_oldest_context(inode, &snap_size); if (!snapc) { /* hmm, why does writepages get called when there @@ -749,16 +755,13 @@ retry: dout(" no snap context with dirty data?\n"); goto out; } - if (snap_size == 0) - snap_size = i_size_read(inode); dout(" oldest snapc is %p seq %lld (%d snaps)\n", snapc, snapc->seq, snapc->num_snaps); spin_lock(&ci->i_ceph_lock); truncate_seq = ci->i_truncate_seq; truncate_size = ci->i_truncate_size; - if (!snap_size) - snap_size = i_size_read(inode); + i_size = i_size_read(inode); spin_unlock(&ci->i_ceph_lock); if (last_snapc && snapc != last_snapc) { @@ -828,8 +831,10 @@ get_more_pages: dout("waiting on writeback %p\n", page); wait_on_page_writeback(page); } - if (page_offset(page) >= snap_size) { - dout("%p page eof %llu\n", page, snap_size); + if (page_offset(page) >= + (snap_size == -1 ? i_size : snap_size)) { + dout("%p page eof %llu\n", page, + (snap_size == -1 ? i_size : snap_size)); done = 1; unlock_page(page); break; @@ -884,7 +889,8 @@ get_more_pages: } if (do_sync) - osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC); + osd_req_op_init(req, 1, + CEPH_OSD_OP_STARTSYNC, 0); req->r_callback = writepages_finish; req->r_inode = inode; @@ -944,10 +950,18 @@ get_more_pages: } /* Format the osd request message and submit the write */ - offset = page_offset(pages[0]); - len = min(snap_size - offset, - (u64)locked_pages << PAGE_CACHE_SHIFT); + len = (u64)locked_pages << PAGE_CACHE_SHIFT; + if (snap_size == -1) { + len = min(len, (u64)i_size_read(inode) - offset); + /* writepages_finish() clears writeback pages + * according to the data length, so make sure + * data length covers all locked pages */ + len = max(len, 1 + + ((u64)(locked_pages - 1) << PAGE_CACHE_SHIFT)); + } else { + len = min(len, snap_size - offset); + } dout("writepages got %d pages at %llu~%llu\n", locked_pages, offset, len); @@ -1032,7 +1046,6 @@ static int ceph_update_writeable_page(struct file *file, { struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; loff_t page_off = pos & PAGE_CACHE_MASK; int pos_in_page = pos & ~PAGE_CACHE_MASK; int end_in_page = pos_in_page + len; @@ -1044,10 +1057,6 @@ retry_locked: /* writepages currently holds page lock, but if we change that later, */ wait_on_page_writeback(page); - /* check snap context */ - BUG_ON(!ci->i_snap_realm); - down_read(&mdsc->snap_rwsem); - BUG_ON(!ci->i_snap_realm->cached_context); snapc = page_snap_context(page); if (snapc && snapc != ci->i_head_snapc) { /* @@ -1055,7 +1064,6 @@ retry_locked: * context! is it writeable now? */ oldest = get_oldest_context(inode, NULL); - up_read(&mdsc->snap_rwsem); if (snapc->seq > oldest->seq) { ceph_put_snap_context(oldest); @@ -1112,7 +1120,6 @@ retry_locked: } /* we need to read it. */ - up_read(&mdsc->snap_rwsem); r = readpage_nounlock(file, page); if (r < 0) goto fail_nosnap; @@ -1157,16 +1164,13 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, /* * we don't do anything in here that simple_write_end doesn't do - * except adjust dirty page accounting and drop read lock on - * mdsc->snap_rwsem. + * except adjust dirty page accounting */ static int ceph_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { struct inode *inode = file_inode(file); - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - struct ceph_mds_client *mdsc = fsc->mdsc; unsigned from = pos & (PAGE_CACHE_SIZE - 1); int check_cap = 0; @@ -1188,7 +1192,6 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, set_page_dirty(page); unlock_page(page); - up_read(&mdsc->snap_rwsem); page_cache_release(page); if (check_cap) @@ -1314,13 +1317,17 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) struct inode *inode = file_inode(vma->vm_file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_file_info *fi = vma->vm_file->private_data; - struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; + struct ceph_cap_flush *prealloc_cf; struct page *page = vmf->page; loff_t off = page_offset(page); loff_t size = i_size_read(inode); size_t len; int want, got, ret; + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + return VM_FAULT_SIGBUS; + if (ci->i_inline_version != CEPH_INLINE_NONE) { struct page *locked_page = NULL; if (off == 0) { @@ -1330,8 +1337,10 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = ceph_uninline_data(vma->vm_file, locked_page); if (locked_page) unlock_page(locked_page); - if (ret < 0) - return VM_FAULT_SIGBUS; + if (ret < 0) { + ret = VM_FAULT_SIGBUS; + goto out_free; + } } if (off + PAGE_CACHE_SIZE <= size) @@ -1353,7 +1362,8 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) break; if (ret != -ERESTARTSYS) { WARN_ON(1); - return VM_FAULT_SIGBUS; + ret = VM_FAULT_SIGBUS; + goto out_free; } } dout("page_mkwrite %p %llu~%zd got cap refs on %s\n", @@ -1373,7 +1383,6 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) if (ret == 0) { /* success. we'll keep the page locked. */ set_page_dirty(page); - up_read(&mdsc->snap_rwsem); ret = VM_FAULT_LOCKED; } else { if (ret == -ENOMEM) @@ -1389,7 +1398,8 @@ out: int dirty; spin_lock(&ci->i_ceph_lock); ci->i_inline_version = CEPH_INLINE_NONE; - dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, + &prealloc_cf); spin_unlock(&ci->i_ceph_lock); if (dirty) __mark_inode_dirty(inode, dirty); @@ -1398,6 +1408,8 @@ out: dout("page_mkwrite %p %llu~%zd dropping cap refs on %s ret %d\n", inode, off, len, ceph_cap_string(got), ret); ceph_put_cap_refs(ci, got); +out_free: + ceph_free_cap_flush(prealloc_cf); return ret; } @@ -1509,8 +1521,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) ceph_vino(inode), 0, &len, 0, 1, CEPH_OSD_OP_CREATE, CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, - ci->i_snap_realm->cached_context, - 0, 0, false); + ceph_empty_snapc, 0, 0, false); if (IS_ERR(req)) { err = PTR_ERR(req); goto out; @@ -1528,7 +1539,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) ceph_vino(inode), 0, &len, 1, 3, CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, - ci->i_snap_realm->cached_context, + ceph_empty_snapc, ci->i_truncate_seq, ci->i_truncate_size, false); if (IS_ERR(req)) { @@ -1597,3 +1608,206 @@ int ceph_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_ops = &ceph_vmops; return 0; } + +enum { + POOL_READ = 1, + POOL_WRITE = 2, +}; + +static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) +{ + struct ceph_fs_client *fsc = ceph_inode_to_client(&ci->vfs_inode); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_osd_request *rd_req = NULL, *wr_req = NULL; + struct rb_node **p, *parent; + struct ceph_pool_perm *perm; + struct page **pages; + int err = 0, err2 = 0, have = 0; + + down_read(&mdsc->pool_perm_rwsem); + p = &mdsc->pool_perm_tree.rb_node; + while (*p) { + perm = rb_entry(*p, struct ceph_pool_perm, node); + if (pool < perm->pool) + p = &(*p)->rb_left; + else if (pool > perm->pool) + p = &(*p)->rb_right; + else { + have = perm->perm; + break; + } + } + up_read(&mdsc->pool_perm_rwsem); + if (*p) + goto out; + + dout("__ceph_pool_perm_get pool %u no perm cached\n", pool); + + down_write(&mdsc->pool_perm_rwsem); + parent = NULL; + while (*p) { + parent = *p; + perm = rb_entry(parent, struct ceph_pool_perm, node); + if (pool < perm->pool) + p = &(*p)->rb_left; + else if (pool > perm->pool) + p = &(*p)->rb_right; + else { + have = perm->perm; + break; + } + } + if (*p) { + up_write(&mdsc->pool_perm_rwsem); + goto out; + } + + rd_req = ceph_osdc_alloc_request(&fsc->client->osdc, + ceph_empty_snapc, + 1, false, GFP_NOFS); + if (!rd_req) { + err = -ENOMEM; + goto out_unlock; + } + + rd_req->r_flags = CEPH_OSD_FLAG_READ; + osd_req_op_init(rd_req, 0, CEPH_OSD_OP_STAT, 0); + rd_req->r_base_oloc.pool = pool; + snprintf(rd_req->r_base_oid.name, sizeof(rd_req->r_base_oid.name), + "%llx.00000000", ci->i_vino.ino); + rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name); + + wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, + ceph_empty_snapc, + 1, false, GFP_NOFS); + if (!wr_req) { + err = -ENOMEM; + goto out_unlock; + } + + wr_req->r_flags = CEPH_OSD_FLAG_WRITE | + CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK; + osd_req_op_init(wr_req, 0, CEPH_OSD_OP_CREATE, CEPH_OSD_OP_FLAG_EXCL); + wr_req->r_base_oloc.pool = pool; + wr_req->r_base_oid = rd_req->r_base_oid; + + /* one page should be large enough for STAT data */ + pages = ceph_alloc_page_vector(1, GFP_KERNEL); + if (IS_ERR(pages)) { + err = PTR_ERR(pages); + goto out_unlock; + } + + osd_req_op_raw_data_in_pages(rd_req, 0, pages, PAGE_SIZE, + 0, false, true); + ceph_osdc_build_request(rd_req, 0, NULL, CEPH_NOSNAP, + &ci->vfs_inode.i_mtime); + err = ceph_osdc_start_request(&fsc->client->osdc, rd_req, false); + + ceph_osdc_build_request(wr_req, 0, NULL, CEPH_NOSNAP, + &ci->vfs_inode.i_mtime); + err2 = ceph_osdc_start_request(&fsc->client->osdc, wr_req, false); + + if (!err) + err = ceph_osdc_wait_request(&fsc->client->osdc, rd_req); + if (!err2) + err2 = ceph_osdc_wait_request(&fsc->client->osdc, wr_req); + + if (err >= 0 || err == -ENOENT) + have |= POOL_READ; + else if (err != -EPERM) + goto out_unlock; + + if (err2 == 0 || err2 == -EEXIST) + have |= POOL_WRITE; + else if (err2 != -EPERM) { + err = err2; + goto out_unlock; + } + + perm = kmalloc(sizeof(*perm), GFP_NOFS); + if (!perm) { + err = -ENOMEM; + goto out_unlock; + } + + perm->pool = pool; + perm->perm = have; + rb_link_node(&perm->node, parent, p); + rb_insert_color(&perm->node, &mdsc->pool_perm_tree); + err = 0; +out_unlock: + up_write(&mdsc->pool_perm_rwsem); + + if (rd_req) + ceph_osdc_put_request(rd_req); + if (wr_req) + ceph_osdc_put_request(wr_req); +out: + if (!err) + err = have; + dout("__ceph_pool_perm_get pool %u result = %d\n", pool, err); + return err; +} + +int ceph_pool_perm_check(struct ceph_inode_info *ci, int need) +{ + u32 pool; + int ret, flags; + + if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode), + NOPOOLPERM)) + return 0; + + spin_lock(&ci->i_ceph_lock); + flags = ci->i_ceph_flags; + pool = ceph_file_layout_pg_pool(ci->i_layout); + spin_unlock(&ci->i_ceph_lock); +check: + if (flags & CEPH_I_POOL_PERM) { + if ((need & CEPH_CAP_FILE_RD) && !(flags & CEPH_I_POOL_RD)) { + dout("ceph_pool_perm_check pool %u no read perm\n", + pool); + return -EPERM; + } + if ((need & CEPH_CAP_FILE_WR) && !(flags & CEPH_I_POOL_WR)) { + dout("ceph_pool_perm_check pool %u no write perm\n", + pool); + return -EPERM; + } + return 0; + } + + ret = __ceph_pool_perm_get(ci, pool); + if (ret < 0) + return ret; + + flags = CEPH_I_POOL_PERM; + if (ret & POOL_READ) + flags |= CEPH_I_POOL_RD; + if (ret & POOL_WRITE) + flags |= CEPH_I_POOL_WR; + + spin_lock(&ci->i_ceph_lock); + if (pool == ceph_file_layout_pg_pool(ci->i_layout)) { + ci->i_ceph_flags = flags; + } else { + pool = ceph_file_layout_pg_pool(ci->i_layout); + flags = ci->i_ceph_flags; + } + spin_unlock(&ci->i_ceph_lock); + goto check; +} + +void ceph_pool_perm_destroy(struct ceph_mds_client *mdsc) +{ + struct ceph_pool_perm *perm; + struct rb_node *n; + + while (!RB_EMPTY_ROOT(&mdsc->pool_perm_tree)) { + n = rb_first(&mdsc->pool_perm_tree); + perm = rb_entry(n, struct ceph_pool_perm, node); + rb_erase(n, &mdsc->pool_perm_tree); + kfree(perm); + } +} diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index be5ea6af8..ddd5e9471 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -833,7 +833,9 @@ int __ceph_caps_used(struct ceph_inode_info *ci) used |= CEPH_CAP_PIN; if (ci->i_rd_ref) used |= CEPH_CAP_FILE_RD; - if (ci->i_rdcache_ref || ci->vfs_inode.i_data.nrpages) + if (ci->i_rdcache_ref || + (!S_ISDIR(ci->vfs_inode.i_mode) && /* ignore readdir cache */ + ci->vfs_inode.i_data.nrpages)) used |= CEPH_CAP_FILE_CACHE; if (ci->i_wr_ref) used |= CEPH_CAP_FILE_WR; @@ -926,16 +928,6 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) /* remove from session list */ spin_lock(&session->s_cap_lock); - /* - * s_cap_reconnect is protected by s_cap_lock. no one changes - * s_cap_gen while session is in the reconnect state. - */ - if (queue_release && - (!session->s_cap_reconnect || - cap->cap_gen == session->s_cap_gen)) - __queue_cap_release(session, ci->i_vino.ino, cap->cap_id, - cap->mseq, cap->issue_seq); - if (session->s_cap_iterator == cap) { /* not yet, we are iterating over this very cap */ dout("__ceph_remove_cap delaying %p removal from session %p\n", @@ -948,6 +940,25 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) } /* protect backpointer with s_cap_lock: see iterate_session_caps */ cap->ci = NULL; + + /* + * s_cap_reconnect is protected by s_cap_lock. no one changes + * s_cap_gen while session is in the reconnect state. + */ + if (queue_release && + (!session->s_cap_reconnect || cap->cap_gen == session->s_cap_gen)) { + cap->queue_release = 1; + if (removed) { + list_add_tail(&cap->session_caps, + &session->s_cap_releases); + session->s_num_cap_releases++; + removed = 0; + } + } else { + cap->queue_release = 0; + } + cap->cap_ino = ci->i_vino.ino; + spin_unlock(&session->s_cap_lock); /* remove from inode list */ @@ -977,8 +988,8 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) static int send_cap_msg(struct ceph_mds_session *session, u64 ino, u64 cid, int op, int caps, int wanted, int dirty, - u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq, - u64 size, u64 max_size, + u32 seq, u64 flush_tid, u64 oldest_flush_tid, + u32 issue_seq, u32 mseq, u64 size, u64 max_size, struct timespec *mtime, struct timespec *atime, u64 time_warp_seq, kuid_t uid, kgid_t gid, umode_t mode, @@ -992,20 +1003,23 @@ static int send_cap_msg(struct ceph_mds_session *session, size_t extra_len; dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s" - " seq %u/%u mseq %u follows %lld size %llu/%llu" + " seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu" " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op), cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted), ceph_cap_string(dirty), - seq, issue_seq, mseq, follows, size, max_size, + seq, issue_seq, flush_tid, oldest_flush_tid, + mseq, follows, size, max_size, xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0); - /* flock buffer size + inline version + inline data size */ - extra_len = 4 + 8 + 4; + /* flock buffer size + inline version + inline data size + + * osd_epoch_barrier + oldest_flush_tid */ + extra_len = 4 + 8 + 4 + 4 + 8; msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len, GFP_NOFS, false); if (!msg) return -ENOMEM; + msg->hdr.version = cpu_to_le16(6); msg->hdr.tid = cpu_to_le64(flush_tid); fc = msg->front.iov_base; @@ -1041,6 +1055,10 @@ static int send_cap_msg(struct ceph_mds_session *session, ceph_encode_64(&p, inline_data ? 0 : CEPH_INLINE_NONE); /* inline data size */ ceph_encode_32(&p, 0); + /* osd_epoch_barrier */ + ceph_encode_32(&p, 0); + /* oldest_flush_tid */ + ceph_encode_64(&p, oldest_flush_tid); fc->xattr_version = cpu_to_le64(xattr_version); if (xattrs_buf) { @@ -1053,44 +1071,6 @@ static int send_cap_msg(struct ceph_mds_session *session, return 0; } -void __queue_cap_release(struct ceph_mds_session *session, - u64 ino, u64 cap_id, u32 migrate_seq, - u32 issue_seq) -{ - struct ceph_msg *msg; - struct ceph_mds_cap_release *head; - struct ceph_mds_cap_item *item; - - BUG_ON(!session->s_num_cap_releases); - msg = list_first_entry(&session->s_cap_releases, - struct ceph_msg, list_head); - - dout(" adding %llx release to mds%d msg %p (%d left)\n", - ino, session->s_mds, msg, session->s_num_cap_releases); - - BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE); - head = msg->front.iov_base; - le32_add_cpu(&head->num, 1); - item = msg->front.iov_base + msg->front.iov_len; - item->ino = cpu_to_le64(ino); - item->cap_id = cpu_to_le64(cap_id); - item->migrate_seq = cpu_to_le32(migrate_seq); - item->seq = cpu_to_le32(issue_seq); - - session->s_num_cap_releases--; - - msg->front.iov_len += sizeof(*item); - if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) { - dout(" release msg %p full\n", msg); - list_move_tail(&msg->list_head, &session->s_cap_releases_done); - } else { - dout(" release msg %p at %d/%d (%d)\n", msg, - (int)le32_to_cpu(head->num), - (int)CEPH_CAPS_PER_RELEASE, - (int)msg->front.iov_len); - } -} - /* * Queue cap releases when an inode is dropped from our cache. Since * inode is about to be destroyed, there is no need for i_ceph_lock. @@ -1127,7 +1107,7 @@ void ceph_queue_caps_release(struct inode *inode) */ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, int op, int used, int want, int retain, int flushing, - unsigned *pflush_tid) + u64 flush_tid, u64 oldest_flush_tid) __releases(cap->ci->i_ceph_lock) { struct ceph_inode_info *ci = cap->ci; @@ -1145,8 +1125,6 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, u64 xattr_version = 0; struct ceph_buffer *xattr_blob = NULL; int delayed = 0; - u64 flush_tid = 0; - int i; int ret; bool inline_data; @@ -1190,26 +1168,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, cap->implemented &= cap->issued | used; cap->mds_wanted = want; - if (flushing) { - /* - * assign a tid for flush operations so we can avoid - * flush1 -> dirty1 -> flush2 -> flushack1 -> mark - * clean type races. track latest tid for every bit - * so we can handle flush AxFw, flush Fw, and have the - * first ack clean Ax. - */ - flush_tid = ++ci->i_cap_flush_last_tid; - if (pflush_tid) - *pflush_tid = flush_tid; - dout(" cap_flush_tid %d\n", (int)flush_tid); - for (i = 0; i < CEPH_CAP_BITS; i++) - if (flushing & (1 << i)) - ci->i_cap_flush_tid[i] = flush_tid; - - follows = ci->i_head_snapc->seq; - } else { - follows = 0; - } + follows = flushing ? ci->i_head_snapc->seq : 0; keep = cap->implemented; seq = cap->seq; @@ -1237,7 +1196,8 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, spin_unlock(&ci->i_ceph_lock); ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id, - op, keep, want, flushing, seq, flush_tid, issue_seq, mseq, + op, keep, want, flushing, seq, + flush_tid, oldest_flush_tid, issue_seq, mseq, size, max_size, &mtime, &atime, time_warp_seq, uid, gid, mode, xattr_version, xattr_blob, follows, inline_data); @@ -1259,14 +1219,14 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, * asynchronously back to the MDS once sync writes complete and dirty * data is written out. * - * Unless @again is true, skip cap_snaps that were already sent to + * Unless @kick is true, skip cap_snaps that were already sent to * the MDS (i.e., during this session). * * Called under i_ceph_lock. Takes s_mutex as needed. */ void __ceph_flush_snaps(struct ceph_inode_info *ci, struct ceph_mds_session **psession, - int again) + int kick) __releases(ci->i_ceph_lock) __acquires(ci->i_ceph_lock) { @@ -1297,11 +1257,8 @@ retry: if (capsnap->dirty_pages || capsnap->writing) break; - /* - * if cap writeback already occurred, we should have dropped - * the capsnap in ceph_put_wrbuffer_cap_refs. - */ - BUG_ON(capsnap->dirty == 0); + /* should be removed by ceph_try_drop_cap_snap() */ + BUG_ON(!capsnap->need_flush); /* pick mds, take s_mutex */ if (ci->i_auth_cap == NULL) { @@ -1310,7 +1267,7 @@ retry: } /* only flush each capsnap once */ - if (!again && !list_empty(&capsnap->flushing_item)) { + if (!kick && !list_empty(&capsnap->flushing_item)) { dout("already flushed %p, skipping\n", capsnap); continue; } @@ -1320,6 +1277,9 @@ retry: if (session && session->s_mds != mds) { dout("oops, wrong session %p mutex\n", session); + if (kick) + goto out; + mutex_unlock(&session->s_mutex); ceph_put_mds_session(session); session = NULL; @@ -1343,20 +1303,22 @@ retry: goto retry; } - capsnap->flush_tid = ++ci->i_cap_flush_last_tid; + spin_lock(&mdsc->cap_dirty_lock); + capsnap->flush_tid = ++mdsc->last_cap_flush_tid; + spin_unlock(&mdsc->cap_dirty_lock); + atomic_inc(&capsnap->nref); - if (!list_empty(&capsnap->flushing_item)) - list_del_init(&capsnap->flushing_item); - list_add_tail(&capsnap->flushing_item, - &session->s_cap_snaps_flushing); + if (list_empty(&capsnap->flushing_item)) + list_add_tail(&capsnap->flushing_item, + &session->s_cap_snaps_flushing); spin_unlock(&ci->i_ceph_lock); dout("flush_snaps %p cap_snap %p follows %lld tid %llu\n", inode, capsnap, capsnap->follows, capsnap->flush_tid); send_cap_msg(session, ceph_vino(inode).ino, 0, CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0, - capsnap->dirty, 0, capsnap->flush_tid, 0, mseq, - capsnap->size, 0, + capsnap->dirty, 0, capsnap->flush_tid, 0, + 0, mseq, capsnap->size, 0, &capsnap->mtime, &capsnap->atime, capsnap->time_warp_seq, capsnap->uid, capsnap->gid, capsnap->mode, @@ -1396,7 +1358,8 @@ static void ceph_flush_snaps(struct ceph_inode_info *ci) * Caller is then responsible for calling __mark_inode_dirty with the * returned flags value. */ -int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) +int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, + struct ceph_cap_flush **pcf) { struct ceph_mds_client *mdsc = ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc; @@ -1416,9 +1379,14 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) ceph_cap_string(was | mask)); ci->i_dirty_caps |= mask; if (was == 0) { - if (!ci->i_head_snapc) + WARN_ON_ONCE(ci->i_prealloc_cap_flush); + swap(ci->i_prealloc_cap_flush, *pcf); + + if (!ci->i_head_snapc) { + WARN_ON_ONCE(!rwsem_is_locked(&mdsc->snap_rwsem)); ci->i_head_snapc = ceph_get_snap_context( ci->i_snap_realm->cached_context); + } dout(" inode %p now dirty snapc %p auth cap %p\n", &ci->vfs_inode, ci->i_head_snapc, ci->i_auth_cap); BUG_ON(!list_empty(&ci->i_dirty_item)); @@ -1429,6 +1397,8 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) ihold(inode); dirty |= I_DIRTY_SYNC; } + } else { + WARN_ON_ONCE(!ci->i_prealloc_cap_flush); } BUG_ON(list_empty(&ci->i_dirty_item)); if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) && @@ -1438,6 +1408,74 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) return dirty; } +static void __add_cap_flushing_to_inode(struct ceph_inode_info *ci, + struct ceph_cap_flush *cf) +{ + struct rb_node **p = &ci->i_cap_flush_tree.rb_node; + struct rb_node *parent = NULL; + struct ceph_cap_flush *other = NULL; + + while (*p) { + parent = *p; + other = rb_entry(parent, struct ceph_cap_flush, i_node); + + if (cf->tid < other->tid) + p = &(*p)->rb_left; + else if (cf->tid > other->tid) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&cf->i_node, parent, p); + rb_insert_color(&cf->i_node, &ci->i_cap_flush_tree); +} + +static void __add_cap_flushing_to_mdsc(struct ceph_mds_client *mdsc, + struct ceph_cap_flush *cf) +{ + struct rb_node **p = &mdsc->cap_flush_tree.rb_node; + struct rb_node *parent = NULL; + struct ceph_cap_flush *other = NULL; + + while (*p) { + parent = *p; + other = rb_entry(parent, struct ceph_cap_flush, g_node); + + if (cf->tid < other->tid) + p = &(*p)->rb_left; + else if (cf->tid > other->tid) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&cf->g_node, parent, p); + rb_insert_color(&cf->g_node, &mdsc->cap_flush_tree); +} + +struct ceph_cap_flush *ceph_alloc_cap_flush(void) +{ + return kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL); +} + +void ceph_free_cap_flush(struct ceph_cap_flush *cf) +{ + if (cf) + kmem_cache_free(ceph_cap_flush_cachep, cf); +} + +static u64 __get_oldest_flush_tid(struct ceph_mds_client *mdsc) +{ + struct rb_node *n = rb_first(&mdsc->cap_flush_tree); + if (n) { + struct ceph_cap_flush *cf = + rb_entry(n, struct ceph_cap_flush, g_node); + return cf->tid; + } + return 0; +} + /* * Add dirty inode to the flushing list. Assigned a seq number so we * can wait for caps to flush without starving. @@ -1445,14 +1483,17 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask) * Called under i_ceph_lock. */ static int __mark_caps_flushing(struct inode *inode, - struct ceph_mds_session *session) + struct ceph_mds_session *session, + u64 *flush_tid, u64 *oldest_flush_tid) { struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_cap_flush *cf = NULL; int flushing; BUG_ON(ci->i_dirty_caps == 0); BUG_ON(list_empty(&ci->i_dirty_item)); + BUG_ON(!ci->i_prealloc_cap_flush); flushing = ci->i_dirty_caps; dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n", @@ -1463,22 +1504,30 @@ static int __mark_caps_flushing(struct inode *inode, ci->i_dirty_caps = 0; dout(" inode %p now !dirty\n", inode); + swap(cf, ci->i_prealloc_cap_flush); + cf->caps = flushing; + spin_lock(&mdsc->cap_dirty_lock); list_del_init(&ci->i_dirty_item); + cf->tid = ++mdsc->last_cap_flush_tid; + __add_cap_flushing_to_mdsc(mdsc, cf); + *oldest_flush_tid = __get_oldest_flush_tid(mdsc); + if (list_empty(&ci->i_flushing_item)) { - ci->i_cap_flush_seq = ++mdsc->cap_flush_seq; list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing); mdsc->num_cap_flushing++; - dout(" inode %p now flushing seq %lld\n", inode, - ci->i_cap_flush_seq); + dout(" inode %p now flushing tid %llu\n", inode, cf->tid); } else { list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing); - dout(" inode %p now flushing (more) seq %lld\n", inode, - ci->i_cap_flush_seq); + dout(" inode %p now flushing (more) tid %llu\n", + inode, cf->tid); } spin_unlock(&mdsc->cap_dirty_lock); + __add_cap_flushing_to_inode(ci, cf); + + *flush_tid = cf->tid; return flushing; } @@ -1524,6 +1573,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, struct ceph_mds_client *mdsc = fsc->mdsc; struct inode *inode = &ci->vfs_inode; struct ceph_cap *cap; + u64 flush_tid, oldest_flush_tid; int file_wanted, used, cap_used; int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */ int issued, implemented, want, retain, revoking, flushing = 0; @@ -1553,13 +1603,13 @@ retry: retry_locked: file_wanted = __ceph_caps_file_wanted(ci); used = __ceph_caps_used(ci); - want = file_wanted | used; issued = __ceph_caps_issued(ci, &implemented); revoking = implemented & ~issued; - retain = want | CEPH_CAP_PIN; + want = file_wanted; + retain = file_wanted | used | CEPH_CAP_PIN; if (!mdsc->stopping && inode->i_nlink > 0) { - if (want) { + if (file_wanted) { retain |= CEPH_CAP_ANY; /* be greedy */ } else if (S_ISDIR(inode->i_mode) && (issued & CEPH_CAP_FILE_SHARED) && @@ -1602,9 +1652,10 @@ retry_locked: * If we fail, it's because pages are locked.... try again later. */ if ((!is_delayed || mdsc->stopping) && - ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ - inode->i_data.nrpages && /* have cached pages */ - (file_wanted == 0 || /* no open files */ + !S_ISDIR(inode->i_mode) && /* ignore readdir cache */ + ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ + inode->i_data.nrpages && /* have cached pages */ + (file_wanted == 0 || /* no open files */ (revoking & (CEPH_CAP_FILE_CACHE| CEPH_CAP_FILE_LAZYIO))) && /* or revoking cache */ !tried_invalidate) { @@ -1742,17 +1793,25 @@ ack: took_snap_rwsem = 1; } - if (cap == ci->i_auth_cap && ci->i_dirty_caps) - flushing = __mark_caps_flushing(inode, session); - else + if (cap == ci->i_auth_cap && ci->i_dirty_caps) { + flushing = __mark_caps_flushing(inode, session, + &flush_tid, + &oldest_flush_tid); + } else { flushing = 0; + flush_tid = 0; + spin_lock(&mdsc->cap_dirty_lock); + oldest_flush_tid = __get_oldest_flush_tid(mdsc); + spin_unlock(&mdsc->cap_dirty_lock); + } mds = cap->mds; /* remember mds, so we don't repeat */ sent++; /* __send_cap drops i_ceph_lock */ delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used, - want, retain, flushing, NULL); + want, retain, flushing, + flush_tid, oldest_flush_tid); goto retry; /* retake i_ceph_lock and restart our cap scan. */ } @@ -1781,12 +1840,13 @@ ack: /* * Try to flush dirty caps back to the auth mds. */ -static int try_flush_caps(struct inode *inode, unsigned *flush_tid) +static int try_flush_caps(struct inode *inode, u64 *ptid) { struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); - int flushing = 0; struct ceph_mds_session *session = NULL; + int flushing = 0; + u64 flush_tid = 0, oldest_flush_tid = 0; retry: spin_lock(&ci->i_ceph_lock); @@ -1811,42 +1871,54 @@ retry: if (cap->session->s_state < CEPH_MDS_SESSION_OPEN) goto out; - flushing = __mark_caps_flushing(inode, session); + flushing = __mark_caps_flushing(inode, session, &flush_tid, + &oldest_flush_tid); /* __send_cap drops i_ceph_lock */ delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want, - cap->issued | cap->implemented, flushing, - flush_tid); - if (!delayed) - goto out_unlocked; + (cap->issued | cap->implemented), + flushing, flush_tid, oldest_flush_tid); - spin_lock(&ci->i_ceph_lock); - __cap_delay_requeue(mdsc, ci); + if (delayed) { + spin_lock(&ci->i_ceph_lock); + __cap_delay_requeue(mdsc, ci); + spin_unlock(&ci->i_ceph_lock); + } + } else { + struct rb_node *n = rb_last(&ci->i_cap_flush_tree); + if (n) { + struct ceph_cap_flush *cf = + rb_entry(n, struct ceph_cap_flush, i_node); + flush_tid = cf->tid; + } + flushing = ci->i_flushing_caps; + spin_unlock(&ci->i_ceph_lock); } out: - spin_unlock(&ci->i_ceph_lock); -out_unlocked: if (session) mutex_unlock(&session->s_mutex); + + *ptid = flush_tid; return flushing; } /* * Return true if we've flushed caps through the given flush_tid. */ -static int caps_are_flushed(struct inode *inode, unsigned tid) +static int caps_are_flushed(struct inode *inode, u64 flush_tid) { struct ceph_inode_info *ci = ceph_inode(inode); - int i, ret = 1; + struct ceph_cap_flush *cf; + struct rb_node *n; + int ret = 1; spin_lock(&ci->i_ceph_lock); - for (i = 0; i < CEPH_CAP_BITS; i++) - if ((ci->i_flushing_caps & (1 << i)) && - ci->i_cap_flush_tid[i] <= tid) { - /* still flushing this bit */ + n = rb_first(&ci->i_cap_flush_tree); + if (n) { + cf = rb_entry(n, struct ceph_cap_flush, i_node); + if (cf->tid <= flush_tid) ret = 0; - break; - } + } spin_unlock(&ci->i_ceph_lock); return ret; } @@ -1864,13 +1936,16 @@ static void sync_write_wait(struct inode *inode) struct ceph_osd_request *req; u64 last_tid; + if (!S_ISREG(inode->i_mode)) + return; + spin_lock(&ci->i_unsafe_lock); if (list_empty(head)) goto out; /* set upper bound as _last_ entry in chain */ - req = list_entry(head->prev, struct ceph_osd_request, - r_unsafe_item); + req = list_last_entry(head, struct ceph_osd_request, + r_unsafe_item); last_tid = req->r_tid; do { @@ -1888,18 +1963,64 @@ static void sync_write_wait(struct inode *inode) */ if (list_empty(head)) break; - req = list_entry(head->next, struct ceph_osd_request, - r_unsafe_item); + req = list_first_entry(head, struct ceph_osd_request, + r_unsafe_item); } while (req->r_tid < last_tid); out: spin_unlock(&ci->i_unsafe_lock); } +/* + * wait for any uncommitted directory operations to commit. + */ +static int unsafe_dirop_wait(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct list_head *head = &ci->i_unsafe_dirops; + struct ceph_mds_request *req; + u64 last_tid; + int ret = 0; + + if (!S_ISDIR(inode->i_mode)) + return 0; + + spin_lock(&ci->i_unsafe_lock); + if (list_empty(head)) + goto out; + + req = list_last_entry(head, struct ceph_mds_request, + r_unsafe_dir_item); + last_tid = req->r_tid; + + do { + ceph_mdsc_get_request(req); + spin_unlock(&ci->i_unsafe_lock); + + dout("unsafe_dirop_wait %p wait on tid %llu (until %llu)\n", + inode, req->r_tid, last_tid); + ret = !wait_for_completion_timeout(&req->r_safe_completion, + ceph_timeout_jiffies(req->r_timeout)); + if (ret) + ret = -EIO; /* timed out */ + + ceph_mdsc_put_request(req); + + spin_lock(&ci->i_unsafe_lock); + if (ret || list_empty(head)) + break; + req = list_first_entry(head, struct ceph_mds_request, + r_unsafe_dir_item); + } while (req->r_tid < last_tid); +out: + spin_unlock(&ci->i_unsafe_lock); + return ret; +} + int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); - unsigned flush_tid; + u64 flush_tid; int ret; int dirty; @@ -1908,25 +2029,30 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret < 0) - return ret; + goto out; + + if (datasync) + goto out; + mutex_lock(&inode->i_mutex); dirty = try_flush_caps(inode, &flush_tid); dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); + ret = unsafe_dirop_wait(inode); + /* * only wait on non-file metadata writeback (the mds * can recover size and mtime, so we don't need to * wait for that) */ - if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { - dout("fsync waiting for flush_tid %u\n", flush_tid); + if (!ret && (dirty & ~CEPH_CAP_ANY_FILE_WR)) { ret = wait_event_interruptible(ci->i_cap_wq, - caps_are_flushed(inode, flush_tid)); + caps_are_flushed(inode, flush_tid)); } - - dout("fsync %p%s done\n", inode, datasync ? " datasync" : ""); mutex_unlock(&inode->i_mutex); +out: + dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret); return ret; } @@ -1939,7 +2065,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) int ceph_write_inode(struct inode *inode, struct writeback_control *wbc) { struct ceph_inode_info *ci = ceph_inode(inode); - unsigned flush_tid; + u64 flush_tid; int err = 0; int dirty; int wait = wbc->sync_mode == WB_SYNC_ALL; @@ -1994,6 +2120,93 @@ static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc, } } +static int __kick_flushing_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session, + struct ceph_inode_info *ci) +{ + struct inode *inode = &ci->vfs_inode; + struct ceph_cap *cap; + struct ceph_cap_flush *cf; + struct rb_node *n; + int delayed = 0; + u64 first_tid = 0; + u64 oldest_flush_tid; + + spin_lock(&mdsc->cap_dirty_lock); + oldest_flush_tid = __get_oldest_flush_tid(mdsc); + spin_unlock(&mdsc->cap_dirty_lock); + + while (true) { + spin_lock(&ci->i_ceph_lock); + cap = ci->i_auth_cap; + if (!(cap && cap->session == session)) { + pr_err("%p auth cap %p not mds%d ???\n", inode, + cap, session->s_mds); + spin_unlock(&ci->i_ceph_lock); + break; + } + + for (n = rb_first(&ci->i_cap_flush_tree); n; n = rb_next(n)) { + cf = rb_entry(n, struct ceph_cap_flush, i_node); + if (cf->tid >= first_tid) + break; + } + if (!n) { + spin_unlock(&ci->i_ceph_lock); + break; + } + + cf = rb_entry(n, struct ceph_cap_flush, i_node); + + first_tid = cf->tid + 1; + + dout("kick_flushing_caps %p cap %p tid %llu %s\n", inode, + cap, cf->tid, ceph_cap_string(cf->caps)); + delayed |= __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, + __ceph_caps_used(ci), + __ceph_caps_wanted(ci), + cap->issued | cap->implemented, + cf->caps, cf->tid, oldest_flush_tid); + } + return delayed; +} + +void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) +{ + struct ceph_inode_info *ci; + struct ceph_cap *cap; + + dout("early_kick_flushing_caps mds%d\n", session->s_mds); + list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { + spin_lock(&ci->i_ceph_lock); + cap = ci->i_auth_cap; + if (!(cap && cap->session == session)) { + pr_err("%p auth cap %p not mds%d ???\n", + &ci->vfs_inode, cap, session->s_mds); + spin_unlock(&ci->i_ceph_lock); + continue; + } + + + /* + * if flushing caps were revoked, we re-send the cap flush + * in client reconnect stage. This guarantees MDS * processes + * the cap flush message before issuing the flushing caps to + * other client. + */ + if ((cap->issued & ci->i_flushing_caps) != + ci->i_flushing_caps) { + spin_unlock(&ci->i_ceph_lock); + if (!__kick_flushing_caps(mdsc, session, ci)) + continue; + spin_lock(&ci->i_ceph_lock); + } + + spin_unlock(&ci->i_ceph_lock); + } +} + void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { @@ -2003,28 +2216,10 @@ void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, dout("kick_flushing_caps mds%d\n", session->s_mds); list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) { - struct inode *inode = &ci->vfs_inode; - struct ceph_cap *cap; - int delayed = 0; - - spin_lock(&ci->i_ceph_lock); - cap = ci->i_auth_cap; - if (cap && cap->session == session) { - dout("kick_flushing_caps %p cap %p %s\n", inode, - cap, ceph_cap_string(ci->i_flushing_caps)); - delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, - __ceph_caps_used(ci), - __ceph_caps_wanted(ci), - cap->issued | cap->implemented, - ci->i_flushing_caps, NULL); - if (delayed) { - spin_lock(&ci->i_ceph_lock); - __cap_delay_requeue(mdsc, ci); - spin_unlock(&ci->i_ceph_lock); - } - } else { - pr_err("%p auth cap %p not mds%d ???\n", inode, - cap, session->s_mds); + int delayed = __kick_flushing_caps(mdsc, session, ci); + if (delayed) { + spin_lock(&ci->i_ceph_lock); + __cap_delay_requeue(mdsc, ci); spin_unlock(&ci->i_ceph_lock); } } @@ -2036,26 +2231,25 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_cap *cap; - int delayed = 0; spin_lock(&ci->i_ceph_lock); cap = ci->i_auth_cap; - dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode, - ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq); + dout("kick_flushing_inode_caps %p flushing %s\n", inode, + ceph_cap_string(ci->i_flushing_caps)); __ceph_flush_snaps(ci, &session, 1); if (ci->i_flushing_caps) { + int delayed; + spin_lock(&mdsc->cap_dirty_lock); list_move_tail(&ci->i_flushing_item, &cap->session->s_cap_flushing); spin_unlock(&mdsc->cap_dirty_lock); - delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, - __ceph_caps_used(ci), - __ceph_caps_wanted(ci), - cap->issued | cap->implemented, - ci->i_flushing_caps, NULL); + spin_unlock(&ci->i_ceph_lock); + + delayed = __kick_flushing_caps(mdsc, session, ci); if (delayed) { spin_lock(&ci->i_ceph_lock); __cap_delay_requeue(mdsc, ci); @@ -2073,7 +2267,8 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, * * Protected by i_ceph_lock. */ -static void __take_cap_refs(struct ceph_inode_info *ci, int got) +static void __take_cap_refs(struct ceph_inode_info *ci, int got, + bool snap_rwsem_locked) { if (got & CEPH_CAP_PIN) ci->i_pin_ref++; @@ -2081,8 +2276,14 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got) ci->i_rd_ref++; if (got & CEPH_CAP_FILE_CACHE) ci->i_rdcache_ref++; - if (got & CEPH_CAP_FILE_WR) + if (got & CEPH_CAP_FILE_WR) { + if (ci->i_wr_ref == 0 && !ci->i_head_snapc) { + BUG_ON(!snap_rwsem_locked); + ci->i_head_snapc = ceph_get_snap_context( + ci->i_snap_realm->cached_context); + } ci->i_wr_ref++; + } if (got & CEPH_CAP_FILE_BUFFER) { if (ci->i_wb_ref == 0) ihold(&ci->vfs_inode); @@ -2100,16 +2301,19 @@ static void __take_cap_refs(struct ceph_inode_info *ci, int got) * requested from the MDS. */ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, - loff_t endoff, int *got, int *check_max, int *err) + loff_t endoff, bool nonblock, int *got, int *err) { struct inode *inode = &ci->vfs_inode; + struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc; int ret = 0; int have, implemented; int file_wanted; + bool snap_rwsem_locked = false; dout("get_cap_refs %p need %s want %s\n", inode, ceph_cap_string(need), ceph_cap_string(want)); +again: spin_lock(&ci->i_ceph_lock); /* make sure file is actually open */ @@ -2125,6 +2329,10 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, /* finish pending truncate */ while (ci->i_truncate_pending) { spin_unlock(&ci->i_ceph_lock); + if (snap_rwsem_locked) { + up_read(&mdsc->snap_rwsem); + snap_rwsem_locked = false; + } __ceph_do_pending_vmtruncate(inode); spin_lock(&ci->i_ceph_lock); } @@ -2136,7 +2344,7 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, dout("get_cap_refs %p endoff %llu > maxsize %llu\n", inode, endoff, ci->i_max_size); if (endoff > ci->i_requested_max_size) { - *check_max = 1; + *err = -EAGAIN; ret = 1; } goto out_unlock; @@ -2164,8 +2372,29 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, inode, ceph_cap_string(have), ceph_cap_string(not), ceph_cap_string(revoking)); if ((revoking & not) == 0) { + if (!snap_rwsem_locked && + !ci->i_head_snapc && + (need & CEPH_CAP_FILE_WR)) { + if (!down_read_trylock(&mdsc->snap_rwsem)) { + /* + * we can not call down_read() when + * task isn't in TASK_RUNNING state + */ + if (nonblock) { + *err = -EAGAIN; + ret = 1; + goto out_unlock; + } + + spin_unlock(&ci->i_ceph_lock); + down_read(&mdsc->snap_rwsem); + snap_rwsem_locked = true; + goto again; + } + snap_rwsem_locked = true; + } *got = need | (have & want); - __take_cap_refs(ci, *got); + __take_cap_refs(ci, *got, true); ret = 1; } } else { @@ -2189,6 +2418,8 @@ static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want, } out_unlock: spin_unlock(&ci->i_ceph_lock); + if (snap_rwsem_locked) + up_read(&mdsc->snap_rwsem); dout("get_cap_refs %p ret %d got %s\n", inode, ret, ceph_cap_string(*got)); @@ -2231,50 +2462,70 @@ static void check_max_size(struct inode *inode, loff_t endoff) int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, loff_t endoff, int *got, struct page **pinned_page) { - int _got, check_max, ret, err = 0; + int _got, ret, err = 0; -retry: - if (endoff > 0) - check_max_size(&ci->vfs_inode, endoff); - _got = 0; - check_max = 0; - ret = wait_event_interruptible(ci->i_cap_wq, - try_get_cap_refs(ci, need, want, endoff, - &_got, &check_max, &err)); - if (err) - ret = err; + ret = ceph_pool_perm_check(ci, need); if (ret < 0) return ret; - if (check_max) - goto retry; + while (true) { + if (endoff > 0) + check_max_size(&ci->vfs_inode, endoff); - if (ci->i_inline_version != CEPH_INLINE_NONE && - (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && - i_size_read(&ci->vfs_inode) > 0) { - struct page *page = find_get_page(ci->vfs_inode.i_mapping, 0); - if (page) { - if (PageUptodate(page)) { - *pinned_page = page; - goto out; - } - page_cache_release(page); - } - /* - * drop cap refs first because getattr while holding - * caps refs can cause deadlock. - */ - ceph_put_cap_refs(ci, _got); + err = 0; _got = 0; + ret = try_get_cap_refs(ci, need, want, endoff, + false, &_got, &err); + if (ret) { + if (err == -EAGAIN) + continue; + if (err < 0) + return err; + } else { + ret = wait_event_interruptible(ci->i_cap_wq, + try_get_cap_refs(ci, need, want, endoff, + true, &_got, &err)); + if (err == -EAGAIN) + continue; + if (err < 0) + ret = err; + if (ret < 0) + return ret; + } - /* getattr request will bring inline data into page cache */ - ret = __ceph_do_getattr(&ci->vfs_inode, NULL, - CEPH_STAT_CAP_INLINE_DATA, true); - if (ret < 0) - return ret; - goto retry; + if (ci->i_inline_version != CEPH_INLINE_NONE && + (_got & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) && + i_size_read(&ci->vfs_inode) > 0) { + struct page *page = + find_get_page(ci->vfs_inode.i_mapping, 0); + if (page) { + if (PageUptodate(page)) { + *pinned_page = page; + break; + } + page_cache_release(page); + } + /* + * drop cap refs first because getattr while + * holding * caps refs can cause deadlock. + */ + ceph_put_cap_refs(ci, _got); + _got = 0; + + /* + * getattr request will bring inline data into + * page cache + */ + ret = __ceph_do_getattr(&ci->vfs_inode, NULL, + CEPH_STAT_CAP_INLINE_DATA, + true); + if (ret < 0) + return ret; + continue; + } + break; } -out: + *got = _got; return 0; } @@ -2286,10 +2537,31 @@ out: void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps) { spin_lock(&ci->i_ceph_lock); - __take_cap_refs(ci, caps); + __take_cap_refs(ci, caps, false); spin_unlock(&ci->i_ceph_lock); } + +/* + * drop cap_snap that is not associated with any snapshot. + * we don't need to send FLUSHSNAP message for it. + */ +static int ceph_try_drop_cap_snap(struct ceph_cap_snap *capsnap) +{ + if (!capsnap->need_flush && + !capsnap->writing && !capsnap->dirty_pages) { + + dout("dropping cap_snap %p follows %llu\n", + capsnap, capsnap->follows); + ceph_put_snap_context(capsnap->context); + list_del(&capsnap->ci_item); + list_del(&capsnap->flushing_item); + ceph_put_cap_snap(capsnap); + return 1; + } + return 0; +} + /* * Release cap refs. * @@ -2303,7 +2575,6 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) { struct inode *inode = &ci->vfs_inode; int last = 0, put = 0, flushsnaps = 0, wake = 0; - struct ceph_cap_snap *capsnap; spin_lock(&ci->i_ceph_lock); if (had & CEPH_CAP_PIN) @@ -2325,17 +2596,24 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) if (had & CEPH_CAP_FILE_WR) if (--ci->i_wr_ref == 0) { last++; - if (!list_empty(&ci->i_cap_snaps)) { - capsnap = list_first_entry(&ci->i_cap_snaps, - struct ceph_cap_snap, - ci_item); - if (capsnap->writing) { - capsnap->writing = 0; - flushsnaps = - __ceph_finish_cap_snap(ci, - capsnap); - wake = 1; - } + if (__ceph_have_pending_cap_snap(ci)) { + struct ceph_cap_snap *capsnap = + list_last_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, + ci_item); + capsnap->writing = 0; + if (ceph_try_drop_cap_snap(capsnap)) + put++; + else if (__ceph_finish_cap_snap(ci, capsnap)) + flushsnaps = 1; + wake = 1; + } + if (ci->i_wrbuffer_ref_head == 0 && + ci->i_dirty_caps == 0 && + ci->i_flushing_caps == 0) { + BUG_ON(!ci->i_head_snapc); + ceph_put_snap_context(ci->i_head_snapc); + ci->i_head_snapc = NULL; } /* see comment in __ceph_remove_cap() */ if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) @@ -2352,7 +2630,7 @@ void ceph_put_cap_refs(struct ceph_inode_info *ci, int had) ceph_flush_snaps(ci); if (wake) wake_up_all(&ci->i_cap_wq); - if (put) + while (put-- > 0) iput(inode); } @@ -2380,7 +2658,9 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, if (ci->i_head_snapc == snapc) { ci->i_wrbuffer_ref_head -= nr; if (ci->i_wrbuffer_ref_head == 0 && - ci->i_dirty_caps == 0 && ci->i_flushing_caps == 0) { + ci->i_wr_ref == 0 && + ci->i_dirty_caps == 0 && + ci->i_flushing_caps == 0) { BUG_ON(!ci->i_head_snapc); ceph_put_snap_context(ci->i_head_snapc); ci->i_head_snapc = NULL; @@ -2401,25 +2681,15 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, capsnap->dirty_pages -= nr; if (capsnap->dirty_pages == 0) { complete_capsnap = 1; - if (capsnap->dirty == 0) - /* cap writeback completed before we created - * the cap_snap; no FLUSHSNAP is needed */ - drop_capsnap = 1; + drop_capsnap = ceph_try_drop_cap_snap(capsnap); } dout("put_wrbuffer_cap_refs on %p cap_snap %p " - " snap %lld %d/%d -> %d/%d %s%s%s\n", + " snap %lld %d/%d -> %d/%d %s%s\n", inode, capsnap, capsnap->context->seq, ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr, ci->i_wrbuffer_ref, capsnap->dirty_pages, last ? " (wrbuffer last)" : "", - complete_capsnap ? " (complete capsnap)" : "", - drop_capsnap ? " (drop capsnap)" : ""); - if (drop_capsnap) { - ceph_put_snap_context(capsnap->context); - list_del(&capsnap->ci_item); - list_del(&capsnap->flushing_item); - ceph_put_cap_snap(capsnap); - } + complete_capsnap ? " (complete capsnap)" : ""); } spin_unlock(&ci->i_ceph_lock); @@ -2526,7 +2796,8 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, * try to invalidate (once). (If there are dirty buffers, we * will invalidate _after_ writeback.) */ - if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && + if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */ + ((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && !ci->i_wrbuffer_ref) { if (try_nonblocking_invalidate(inode)) { @@ -2732,16 +3003,29 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + struct ceph_cap_flush *cf; + struct rb_node *n; + LIST_HEAD(to_remove); unsigned seq = le32_to_cpu(m->seq); int dirty = le32_to_cpu(m->dirty); int cleaned = 0; int drop = 0; - int i; - for (i = 0; i < CEPH_CAP_BITS; i++) - if ((dirty & (1 << i)) && - (u16)flush_tid == ci->i_cap_flush_tid[i]) - cleaned |= 1 << i; + n = rb_first(&ci->i_cap_flush_tree); + while (n) { + cf = rb_entry(n, struct ceph_cap_flush, i_node); + n = rb_next(&cf->i_node); + if (cf->tid == flush_tid) + cleaned = cf->caps; + if (cf->tid <= flush_tid) { + rb_erase(&cf->i_node, &ci->i_cap_flush_tree); + list_add_tail(&cf->list, &to_remove); + } else { + cleaned &= ~cf->caps; + if (!cleaned) + break; + } + } dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s," " flushing %s -> %s\n", @@ -2749,12 +3033,23 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps), ceph_cap_string(ci->i_flushing_caps & ~cleaned)); - if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned)) + if (list_empty(&to_remove) && !cleaned) goto out; ci->i_flushing_caps &= ~cleaned; spin_lock(&mdsc->cap_dirty_lock); + + if (!list_empty(&to_remove)) { + list_for_each_entry(cf, &to_remove, list) + rb_erase(&cf->g_node, &mdsc->cap_flush_tree); + + n = rb_first(&mdsc->cap_flush_tree); + cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL; + if (!cf || cf->tid > flush_tid) + wake_up_all(&mdsc->cap_flushing_wq); + } + if (ci->i_flushing_caps == 0) { list_del_init(&ci->i_flushing_item); if (!list_empty(&session->s_cap_flushing)) @@ -2764,14 +3059,14 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, struct ceph_inode_info, i_flushing_item)->vfs_inode); mdsc->num_cap_flushing--; - wake_up_all(&mdsc->cap_flushing_wq); dout(" inode %p now !flushing\n", inode); if (ci->i_dirty_caps == 0) { dout(" inode %p now clean\n", inode); BUG_ON(!list_empty(&ci->i_dirty_item)); drop = 1; - if (ci->i_wrbuffer_ref_head == 0) { + if (ci->i_wr_ref == 0 && + ci->i_wrbuffer_ref_head == 0) { BUG_ON(!ci->i_head_snapc); ceph_put_snap_context(ci->i_head_snapc); ci->i_head_snapc = NULL; @@ -2785,6 +3080,13 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, out: spin_unlock(&ci->i_ceph_lock); + + while (!list_empty(&to_remove)) { + cf = list_first_entry(&to_remove, + struct ceph_cap_flush, list); + list_del(&cf->list); + ceph_free_cap_flush(cf); + } if (drop) iput(inode); } @@ -2800,6 +3102,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, struct ceph_mds_session *session) { struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; u64 follows = le64_to_cpu(m->snap_follows); struct ceph_cap_snap *capsnap; int drop = 0; @@ -2823,6 +3126,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid, list_del(&capsnap->ci_item); list_del(&capsnap->flushing_item); ceph_put_cap_snap(capsnap); + wake_up_all(&mdsc->cap_flushing_wq); drop = 1; break; } else { @@ -2971,7 +3275,6 @@ retry: mutex_lock_nested(&session->s_mutex, SINGLE_DEPTH_NESTING); } - ceph_add_cap_releases(mdsc, tsession); new_cap = ceph_get_cap(mdsc, NULL); } else { WARN_ON(1); @@ -3167,16 +3470,20 @@ void ceph_handle_caps(struct ceph_mds_session *session, dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, (unsigned)seq); - if (op == CEPH_CAP_OP_IMPORT) - ceph_add_cap_releases(mdsc, session); - if (!inode) { dout(" i don't have ino %llx\n", vino.ino); if (op == CEPH_CAP_OP_IMPORT) { + cap = ceph_get_cap(mdsc, NULL); + cap->cap_ino = vino.ino; + cap->queue_release = 1; + cap->cap_id = cap_id; + cap->mseq = mseq; + cap->seq = seq; spin_lock(&session->s_cap_lock); - __queue_cap_release(session, vino.ino, cap_id, - mseq, seq); + list_add_tail(&cap->session_caps, + &session->s_cap_releases); + session->s_num_cap_releases++; spin_unlock(&session->s_cap_lock); } goto flush_cap_releases; @@ -3252,11 +3559,10 @@ void ceph_handle_caps(struct ceph_mds_session *session, flush_cap_releases: /* - * send any full release message to try to move things + * send any cap release message to try to move things * along for the mds (who clearly thinks we still have this * cap). */ - ceph_add_cap_releases(mdsc, session); ceph_send_cap_releases(mdsc, session); done: diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 4248307fe..9314b4ea2 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -38,7 +38,7 @@ int ceph_init_dentry(struct dentry *dentry) if (dentry->d_fsdata) return 0; - di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS | __GFP_ZERO); + di = kmem_cache_alloc(ceph_dentry_cachep, GFP_KERNEL | __GFP_ZERO); if (!di) return -ENOMEM; /* oh well */ @@ -107,6 +107,27 @@ static int fpos_cmp(loff_t l, loff_t r) } /* + * make note of the last dentry we read, so we can + * continue at the same lexicographical point, + * regardless of what dir changes take place on the + * server. + */ +static int note_last_dentry(struct ceph_file_info *fi, const char *name, + int len, unsigned next_offset) +{ + char *buf = kmalloc(len+1, GFP_KERNEL); + if (!buf) + return -ENOMEM; + kfree(fi->last_name); + fi->last_name = buf; + memcpy(fi->last_name, name, len); + fi->last_name[len] = 0; + fi->next_offset = next_offset; + dout("note_last_dentry '%s'\n", fi->last_name); + return 0; +} + +/* * When possible, we try to satisfy a readdir by peeking at the * dcache. We make this work by carefully ordering dentries on * d_child when we initially get results back from the MDS, and @@ -123,123 +144,113 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, struct ceph_file_info *fi = file->private_data; struct dentry *parent = file->f_path.dentry; struct inode *dir = d_inode(parent); - struct list_head *p; - struct dentry *dentry, *last; + struct dentry *dentry, *last = NULL; struct ceph_dentry_info *di; + unsigned nsize = PAGE_CACHE_SIZE / sizeof(struct dentry *); int err = 0; + loff_t ptr_pos = 0; + struct ceph_readdir_cache_control cache_ctl = {}; - /* claim ref on last dentry we returned */ - last = fi->dentry; - fi->dentry = NULL; - - dout("__dcache_readdir %p v%u at %llu (last %p)\n", - dir, shared_gen, ctx->pos, last); + dout("__dcache_readdir %p v%u at %llu\n", dir, shared_gen, ctx->pos); - spin_lock(&parent->d_lock); - - /* start at beginning? */ - if (ctx->pos == 2 || last == NULL || - fpos_cmp(ctx->pos, ceph_dentry(last)->offset) < 0) { - if (list_empty(&parent->d_subdirs)) - goto out_unlock; - p = parent->d_subdirs.prev; - dout(" initial p %p/%p\n", p->prev, p->next); - } else { - p = last->d_child.prev; + /* we can calculate cache index for the first dirfrag */ + if (ceph_frag_is_leftmost(fpos_frag(ctx->pos))) { + cache_ctl.index = fpos_off(ctx->pos) - 2; + BUG_ON(cache_ctl.index < 0); + ptr_pos = cache_ctl.index * sizeof(struct dentry *); } -more: - dentry = list_entry(p, struct dentry, d_child); - di = ceph_dentry(dentry); - while (1) { - dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next, - d_unhashed(dentry) ? "!hashed" : "hashed", - parent->d_subdirs.prev, parent->d_subdirs.next); - if (p == &parent->d_subdirs) { + while (true) { + pgoff_t pgoff; + bool emit_dentry; + + if (ptr_pos >= i_size_read(dir)) { fi->flags |= CEPH_F_ATEND; - goto out_unlock; + err = 0; + break; + } + + err = -EAGAIN; + pgoff = ptr_pos >> PAGE_CACHE_SHIFT; + if (!cache_ctl.page || pgoff != page_index(cache_ctl.page)) { + ceph_readdir_cache_release(&cache_ctl); + cache_ctl.page = find_lock_page(&dir->i_data, pgoff); + if (!cache_ctl.page) { + dout(" page %lu not found\n", pgoff); + break; + } + /* reading/filling the cache are serialized by + * i_mutex, no need to use page lock */ + unlock_page(cache_ctl.page); + cache_ctl.dentries = kmap(cache_ctl.page); } - spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); + + rcu_read_lock(); + spin_lock(&parent->d_lock); + /* check i_size again here, because empty directory can be + * marked as complete while not holding the i_mutex. */ + if (ceph_dir_is_complete_ordered(dir) && + ptr_pos < i_size_read(dir)) + dentry = cache_ctl.dentries[cache_ctl.index % nsize]; + else + dentry = NULL; + spin_unlock(&parent->d_lock); + if (dentry && !lockref_get_not_dead(&dentry->d_lockref)) + dentry = NULL; + rcu_read_unlock(); + if (!dentry) + break; + + emit_dentry = false; + di = ceph_dentry(dentry); + spin_lock(&dentry->d_lock); if (di->lease_shared_gen == shared_gen && - !d_unhashed(dentry) && d_really_is_positive(dentry) && + d_really_is_positive(dentry) && ceph_snap(d_inode(dentry)) != CEPH_SNAPDIR && ceph_ino(d_inode(dentry)) != CEPH_INO_CEPH && - fpos_cmp(ctx->pos, di->offset) <= 0) - break; - dout(" skipping %p %pd at %llu (%llu)%s%s\n", dentry, - dentry, di->offset, - ctx->pos, d_unhashed(dentry) ? " unhashed" : "", - !d_inode(dentry) ? " null" : ""); + fpos_cmp(ctx->pos, di->offset) <= 0) { + emit_dentry = true; + } spin_unlock(&dentry->d_lock); - p = p->prev; - dentry = list_entry(p, struct dentry, d_child); - di = ceph_dentry(dentry); - } - - dget_dlock(dentry); - spin_unlock(&dentry->d_lock); - spin_unlock(&parent->d_lock); - /* make sure a dentry wasn't dropped while we didn't have parent lock */ - if (!ceph_dir_is_complete_ordered(dir)) { - dout(" lost dir complete on %p; falling back to mds\n", dir); - dput(dentry); - err = -EAGAIN; - goto out; - } + if (emit_dentry) { + dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos, + dentry, dentry, d_inode(dentry)); + ctx->pos = di->offset; + if (!dir_emit(ctx, dentry->d_name.name, + dentry->d_name.len, + ceph_translate_ino(dentry->d_sb, + d_inode(dentry)->i_ino), + d_inode(dentry)->i_mode >> 12)) { + dput(dentry); + err = 0; + break; + } + ctx->pos++; - dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos, - dentry, dentry, d_inode(dentry)); - if (!dir_emit(ctx, dentry->d_name.name, - dentry->d_name.len, - ceph_translate_ino(dentry->d_sb, d_inode(dentry)->i_ino), - d_inode(dentry)->i_mode >> 12)) { - if (last) { - /* remember our position */ - fi->dentry = last; - fi->next_offset = fpos_off(di->offset); + if (last) + dput(last); + last = dentry; + } else { + dput(dentry); } - dput(dentry); - return 0; - } - - ctx->pos = di->offset + 1; - if (last) - dput(last); - last = dentry; - - spin_lock(&parent->d_lock); - p = p->prev; /* advance to next dentry */ - goto more; - -out_unlock: - spin_unlock(&parent->d_lock); -out: - if (last) + cache_ctl.index++; + ptr_pos += sizeof(struct dentry *); + } + ceph_readdir_cache_release(&cache_ctl); + if (last) { + int ret; + di = ceph_dentry(last); + ret = note_last_dentry(fi, last->d_name.name, last->d_name.len, + fpos_off(di->offset) + 1); + if (ret < 0) + err = ret; dput(last); + } return err; } -/* - * make note of the last dentry we read, so we can - * continue at the same lexicographical point, - * regardless of what dir changes take place on the - * server. - */ -static int note_last_dentry(struct ceph_file_info *fi, const char *name, - int len) -{ - kfree(fi->last_name); - fi->last_name = kmalloc(len+1, GFP_NOFS); - if (!fi->last_name) - return -ENOMEM; - memcpy(fi->last_name, name, len); - fi->last_name[len] = 0; - dout("note_last_dentry '%s'\n", fi->last_name); - return 0; -} - static int ceph_readdir(struct file *file, struct dir_context *ctx) { struct ceph_file_info *fi = file->private_data; @@ -280,8 +291,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) /* can we use the dcache? */ spin_lock(&ci->i_ceph_lock); - if ((ctx->pos == 2 || fi->dentry) && - ceph_test_mount_opt(fsc, DCACHE) && + if (ceph_test_mount_opt(fsc, DCACHE) && !ceph_test_mount_opt(fsc, NOASYNCREADDIR) && ceph_snap(inode) != CEPH_SNAPDIR && __ceph_dir_is_complete_ordered(ci) && @@ -296,24 +306,8 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) } else { spin_unlock(&ci->i_ceph_lock); } - if (fi->dentry) { - err = note_last_dentry(fi, fi->dentry->d_name.name, - fi->dentry->d_name.len); - if (err) - return err; - dput(fi->dentry); - fi->dentry = NULL; - } /* proceed with a normal readdir */ - - if (ctx->pos == 2) { - /* note dir version at start of readdir so we can tell - * if any dentries get dropped */ - fi->dir_release_count = atomic_read(&ci->i_release_count); - fi->dir_ordered_count = ci->i_ordered_count; - } - more: /* do we have the correct frag content buffered? */ if (fi->frag != frag || fi->last_readdir == NULL) { @@ -342,12 +336,15 @@ more: req->r_direct_hash = ceph_frag_value(frag); req->r_direct_is_hash = true; if (fi->last_name) { - req->r_path2 = kstrdup(fi->last_name, GFP_NOFS); + req->r_path2 = kstrdup(fi->last_name, GFP_KERNEL); if (!req->r_path2) { ceph_mdsc_put_request(req); return -ENOMEM; } } + req->r_dir_release_cnt = fi->dir_release_count; + req->r_dir_ordered_cnt = fi->dir_ordered_count; + req->r_readdir_cache_idx = fi->readdir_cache_idx; req->r_readdir_offset = fi->next_offset; req->r_args.readdir.frag = cpu_to_le32(frag); @@ -364,26 +361,38 @@ more: (int)req->r_reply_info.dir_end, (int)req->r_reply_info.dir_complete); - if (!req->r_did_prepopulate) { - dout("readdir !did_prepopulate"); - /* preclude from marking dir complete */ - fi->dir_release_count--; - } /* note next offset and last dentry name */ rinfo = &req->r_reply_info; if (le32_to_cpu(rinfo->dir_dir->frag) != frag) { frag = le32_to_cpu(rinfo->dir_dir->frag); - if (ceph_frag_is_leftmost(frag)) - fi->next_offset = 2; - else - fi->next_offset = 0; - off = fi->next_offset; + off = req->r_readdir_offset; + fi->next_offset = off; } + fi->frag = frag; fi->offset = fi->next_offset; fi->last_readdir = req; + if (req->r_did_prepopulate) { + fi->readdir_cache_idx = req->r_readdir_cache_idx; + if (fi->readdir_cache_idx < 0) { + /* preclude from marking dir ordered */ + fi->dir_ordered_count = 0; + } else if (ceph_frag_is_leftmost(frag) && off == 2) { + /* note dir version at start of readdir so + * we can tell if any dentries get dropped */ + fi->dir_release_count = req->r_dir_release_cnt; + fi->dir_ordered_count = req->r_dir_ordered_cnt; + } + } else { + dout("readdir !did_prepopulate"); + /* disable readdir cache */ + fi->readdir_cache_idx = -1; + /* preclude from marking dir complete */ + fi->dir_release_count = 0; + } + if (req->r_reply_info.dir_end) { kfree(fi->last_name); fi->last_name = NULL; @@ -394,10 +403,10 @@ more: } else { err = note_last_dentry(fi, rinfo->dir_dname[rinfo->dir_nr-1], - rinfo->dir_dname_len[rinfo->dir_nr-1]); + rinfo->dir_dname_len[rinfo->dir_nr-1], + fi->next_offset + rinfo->dir_nr); if (err) return err; - fi->next_offset += rinfo->dir_nr; } } @@ -453,16 +462,22 @@ more: * were released during the whole readdir, and we should have * the complete dir contents in our cache. */ - spin_lock(&ci->i_ceph_lock); - if (atomic_read(&ci->i_release_count) == fi->dir_release_count) { - if (ci->i_ordered_count == fi->dir_ordered_count) + if (atomic64_read(&ci->i_release_count) == fi->dir_release_count) { + spin_lock(&ci->i_ceph_lock); + if (fi->dir_ordered_count == atomic64_read(&ci->i_ordered_count)) { dout(" marking %p complete and ordered\n", inode); - else + /* use i_size to track number of entries in + * readdir cache */ + BUG_ON(fi->readdir_cache_idx < 0); + i_size_write(inode, fi->readdir_cache_idx * + sizeof(struct dentry*)); + } else { dout(" marking %p complete\n", inode); + } __ceph_dir_set_complete(ci, fi->dir_release_count, fi->dir_ordered_count); + spin_unlock(&ci->i_ceph_lock); } - spin_unlock(&ci->i_ceph_lock); dout("readdir %p file %p done.\n", inode, file); return 0; @@ -476,14 +491,12 @@ static void reset_readdir(struct ceph_file_info *fi, unsigned frag) } kfree(fi->last_name); fi->last_name = NULL; + fi->dir_release_count = 0; + fi->readdir_cache_idx = -1; if (ceph_frag_is_leftmost(frag)) fi->next_offset = 2; /* compensate for . and .. */ else fi->next_offset = 0; - if (fi->dentry) { - dput(fi->dentry); - fi->dentry = NULL; - } fi->flags &= ~CEPH_F_ATEND; } @@ -497,13 +510,12 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) mutex_lock(&inode->i_mutex); retval = -EINVAL; switch (whence) { - case SEEK_END: - offset += inode->i_size + 2; /* FIXME */ - break; case SEEK_CUR: offset += file->f_pos; case SEEK_SET: break; + case SEEK_END: + retval = -EOPNOTSUPP; default: goto out; } @@ -516,20 +528,18 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) } retval = offset; - /* - * discard buffered readdir content on seekdir(0), or - * seek to new frag, or seek prior to current chunk. - */ if (offset == 0 || fpos_frag(offset) != fi->frag || fpos_off(offset) < fi->offset) { + /* discard buffered readdir content on seekdir(0), or + * seek to new frag, or seek prior to current chunk */ dout("dir_llseek dropping %p content\n", file); reset_readdir(fi, fpos_frag(offset)); + } else if (fpos_cmp(offset, old_offset) > 0) { + /* reset dir_release_count if we did a forward seek */ + fi->dir_release_count = 0; + fi->readdir_cache_idx = -1; } - - /* bump dir_release_count if we did a forward seek */ - if (fpos_cmp(offset, old_offset) > 0) - fi->dir_release_count--; } out: mutex_unlock(&inode->i_mutex); @@ -764,7 +774,7 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry, err = PTR_ERR(req); goto out; } - req->r_path2 = kstrdup(dest, GFP_NOFS); + req->r_path2 = kstrdup(dest, GFP_KERNEL); if (!req->r_path2) { err = -ENOMEM; ceph_mdsc_put_request(req); @@ -985,16 +995,15 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, * to do it here. */ + /* d_move screws up sibling dentries' offsets */ + ceph_dir_clear_complete(old_dir); + ceph_dir_clear_complete(new_dir); + d_move(old_dentry, new_dentry); /* ensure target dentry is invalidated, despite rehashing bug in vfs_rename_dir */ ceph_invalidate_dentry_lease(new_dentry); - - /* d_move screws up sibling dentries' offsets */ - ceph_dir_clear_complete(old_dir); - ceph_dir_clear_complete(new_dir); - } ceph_mdsc_put_request(req); return err; @@ -1189,7 +1198,7 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, return -EISDIR; if (!cf->dir_info) { - cf->dir_info = kmalloc(bufsize, GFP_NOFS); + cf->dir_info = kmalloc(bufsize, GFP_KERNEL); if (!cf->dir_info) return -ENOMEM; cf->dir_info_len = @@ -1224,66 +1233,6 @@ static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size, } /* - * an fsync() on a dir will wait for any uncommitted directory - * operations to commit. - */ -static int ceph_dir_fsync(struct file *file, loff_t start, loff_t end, - int datasync) -{ - struct inode *inode = file_inode(file); - struct ceph_inode_info *ci = ceph_inode(inode); - struct list_head *head = &ci->i_unsafe_dirops; - struct ceph_mds_request *req; - u64 last_tid; - int ret = 0; - - dout("dir_fsync %p\n", inode); - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (ret) - return ret; - mutex_lock(&inode->i_mutex); - - spin_lock(&ci->i_unsafe_lock); - if (list_empty(head)) - goto out; - - req = list_entry(head->prev, - struct ceph_mds_request, r_unsafe_dir_item); - last_tid = req->r_tid; - - do { - ceph_mdsc_get_request(req); - spin_unlock(&ci->i_unsafe_lock); - - dout("dir_fsync %p wait on tid %llu (until %llu)\n", - inode, req->r_tid, last_tid); - if (req->r_timeout) { - unsigned long time_left = wait_for_completion_timeout( - &req->r_safe_completion, - req->r_timeout); - if (time_left > 0) - ret = 0; - else - ret = -EIO; /* timed out */ - } else { - wait_for_completion(&req->r_safe_completion); - } - ceph_mdsc_put_request(req); - - spin_lock(&ci->i_unsafe_lock); - if (ret || list_empty(head)) - break; - req = list_entry(head->next, - struct ceph_mds_request, r_unsafe_dir_item); - } while (req->r_tid < last_tid); -out: - spin_unlock(&ci->i_unsafe_lock); - mutex_unlock(&inode->i_mutex); - - return ret; -} - -/* * We maintain a private dentry LRU. * * FIXME: this needs to be changed to a per-mds lru to be useful. @@ -1353,7 +1302,7 @@ const struct file_operations ceph_dir_fops = { .open = ceph_open, .release = ceph_release, .unlocked_ioctl = ceph_ioctl, - .fsync = ceph_dir_fsync, + .fsync = ceph_fsync, }; const struct file_operations ceph_snapdir_fops = { diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 3b6b522b4..8b79d87ea 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -89,13 +89,14 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) case S_IFDIR: dout("init_file %p %p 0%o (regular)\n", inode, file, inode->i_mode); - cf = kmem_cache_alloc(ceph_file_cachep, GFP_NOFS | __GFP_ZERO); + cf = kmem_cache_alloc(ceph_file_cachep, GFP_KERNEL | __GFP_ZERO); if (cf == NULL) { ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ return -ENOMEM; } cf->fmode = fmode; cf->next_offset = 2; + cf->readdir_cache_idx = -1; file->private_data = cf; BUG_ON(inode->i_fop->release != ceph_release); break; @@ -324,7 +325,6 @@ int ceph_release(struct inode *inode, struct file *file) ceph_mdsc_put_request(cf->last_readdir); kfree(cf->last_name); kfree(cf->dir_info); - dput(cf->dentry); kmem_cache_free(ceph_file_cachep, cf); /* wake up anyone waiting for caps on this inode */ @@ -483,7 +483,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, } } else { num_pages = calc_pages_for(off, len); - pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); + pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); if (IS_ERR(pages)) return PTR_ERR(pages); ret = striped_read(inode, off, len, pages, @@ -557,13 +557,13 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) * objects, rollback on failure, etc.) */ static ssize_t -ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) +ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, + struct ceph_snap_context *snapc) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - struct ceph_snap_context *snapc; struct ceph_vino vino; struct ceph_osd_request *req; struct page **pages; @@ -600,7 +600,6 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) size_t start; ssize_t n; - snapc = ci->i_snap_realm->cached_context; vino = ceph_vino(inode); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, pos, &len, 0, @@ -614,7 +613,7 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) break; } - osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC); + osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); n = iov_iter_get_pages_alloc(from, &pages, len, &start); if (unlikely(n < 0)) { @@ -674,13 +673,13 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) * objects, rollback on failure, etc.) */ static ssize_t -ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) +ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, + struct ceph_snap_context *snapc) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - struct ceph_snap_context *snapc; struct ceph_vino vino; struct ceph_osd_request *req; struct page **pages; @@ -717,7 +716,6 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) size_t left; int n; - snapc = ci->i_snap_realm->cached_context; vino = ceph_vino(inode); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, pos, &len, 0, 1, @@ -736,7 +734,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos) */ num_pages = (len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - pages = ceph_alloc_page_vector(num_pages, GFP_NOFS); + pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); if (IS_ERR(pages)) { ret = PTR_ERR(pages); goto out; @@ -860,7 +858,7 @@ again: struct page *page = NULL; loff_t i_size; if (retry_op == READ_INLINE) { - page = __page_cache_alloc(GFP_NOFS); + page = __page_cache_alloc(GFP_KERNEL); if (!page) return -ENOMEM; } @@ -941,6 +939,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_osd_client *osdc = &ceph_sb_to_client(inode->i_sb)->client->osdc; + struct ceph_cap_flush *prealloc_cf; ssize_t count, written = 0; int err, want, got; loff_t pos; @@ -948,6 +947,10 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + return -ENOMEM; + mutex_lock(&inode->i_mutex); /* We can write back this queue in page reclaim */ @@ -959,7 +962,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) pos = iocb->ki_pos; count = iov_iter_count(from); - err = file_remove_suid(file); + err = file_remove_privs(file); if (err) goto out; @@ -996,14 +999,30 @@ retry_snap: if ((got & (CEPH_CAP_FILE_BUFFER|CEPH_CAP_FILE_LAZYIO)) == 0 || (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC)) { + struct ceph_snap_context *snapc; struct iov_iter data; mutex_unlock(&inode->i_mutex); + + spin_lock(&ci->i_ceph_lock); + if (__ceph_have_pending_cap_snap(ci)) { + struct ceph_cap_snap *capsnap = + list_last_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, + ci_item); + snapc = ceph_get_snap_context(capsnap->context); + } else { + BUG_ON(!ci->i_head_snapc); + snapc = ceph_get_snap_context(ci->i_head_snapc); + } + spin_unlock(&ci->i_ceph_lock); + /* we might need to revert back to that point */ data = *from; if (iocb->ki_flags & IOCB_DIRECT) - written = ceph_sync_direct_write(iocb, &data, pos); + written = ceph_sync_direct_write(iocb, &data, pos, + snapc); else - written = ceph_sync_write(iocb, &data, pos); + written = ceph_sync_write(iocb, &data, pos, snapc); if (written == -EOLDSNAPC) { dout("aio_write %p %llx.%llx %llu~%u" "got EOLDSNAPC, retrying\n", @@ -1014,6 +1033,7 @@ retry_snap: } if (written > 0) iov_iter_advance(from, written); + ceph_put_snap_context(snapc); } else { loff_t old_size = inode->i_size; /* @@ -1035,7 +1055,8 @@ retry_snap: int dirty; spin_lock(&ci->i_ceph_lock); ci->i_inline_version = CEPH_INLINE_NONE; - dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, + &prealloc_cf); spin_unlock(&ci->i_ceph_lock); if (dirty) __mark_inode_dirty(inode, dirty); @@ -1059,6 +1080,7 @@ retry_snap: out: mutex_unlock(&inode->i_mutex); out_unlocked: + ceph_free_cap_flush(prealloc_cf); current->backing_dev_info = NULL; return written ? written : err; } @@ -1255,6 +1277,7 @@ static long ceph_fallocate(struct file *file, int mode, struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->client->osdc; + struct ceph_cap_flush *prealloc_cf; int want, got = 0; int dirty; int ret = 0; @@ -1267,6 +1290,10 @@ static long ceph_fallocate(struct file *file, int mode, if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + return -ENOMEM; + mutex_lock(&inode->i_mutex); if (ceph_snap(inode) != CEPH_NOSNAP) { @@ -1313,7 +1340,8 @@ static long ceph_fallocate(struct file *file, int mode, if (!ret) { spin_lock(&ci->i_ceph_lock); ci->i_inline_version = CEPH_INLINE_NONE; - dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, + &prealloc_cf); spin_unlock(&ci->i_ceph_lock); if (dirty) __mark_inode_dirty(inode, dirty); @@ -1322,6 +1350,7 @@ static long ceph_fallocate(struct file *file, int mode, ceph_put_cap_refs(ci, got); unlock: mutex_unlock(&inode->i_mutex); + ceph_free_cap_flush(prealloc_cf); return ret; } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index e876e1944..96d2bd829 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -6,7 +6,6 @@ #include <linux/string.h> #include <linux/uaccess.h> #include <linux/kernel.h> -#include <linux/namei.h> #include <linux/writeback.h> #include <linux/vmalloc.h> #include <linux/posix_acl.h> @@ -390,9 +389,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_inline_version = 0; ci->i_time_warp_seq = 0; ci->i_ceph_flags = 0; - ci->i_ordered_count = 0; - atomic_set(&ci->i_release_count, 1); - atomic_set(&ci->i_complete_count, 0); + atomic64_set(&ci->i_ordered_count, 1); + atomic64_set(&ci->i_release_count, 1); + atomic64_set(&ci->i_complete_seq[0], 0); + atomic64_set(&ci->i_complete_seq[1], 0); ci->i_symlink = NULL; memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); @@ -416,9 +416,8 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_flushing_caps = 0; INIT_LIST_HEAD(&ci->i_dirty_item); INIT_LIST_HEAD(&ci->i_flushing_item); - ci->i_cap_flush_seq = 0; - ci->i_cap_flush_last_tid = 0; - memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid)); + ci->i_prealloc_cap_flush = NULL; + ci->i_cap_flush_tree = RB_ROOT; init_waitqueue_head(&ci->i_cap_wq); ci->i_hold_caps_min = 0; ci->i_hold_caps_max = 0; @@ -753,7 +752,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page, if (new_version || (new_issued & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR))) { + if (ci->i_layout.fl_pg_pool != info->layout.fl_pg_pool) + ci->i_ceph_flags &= ~CEPH_I_POOL_PERM; ci->i_layout = info->layout; + queue_trunc = ceph_fill_file_size(inode, issued, le32_to_cpu(info->truncate_seq), le64_to_cpu(info->truncate_size), @@ -819,6 +821,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page, else kfree(sym); /* lost a race */ } + inode->i_link = ci->i_symlink; break; case S_IFDIR: inode->i_op = &ceph_dir_iops; @@ -858,9 +861,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page, (issued & CEPH_CAP_FILE_EXCL) == 0 && !__ceph_dir_is_complete(ci)) { dout(" marking %p complete (empty)\n", inode); + i_size_write(inode, 0); __ceph_dir_set_complete(ci, - atomic_read(&ci->i_release_count), - ci->i_ordered_count); + atomic64_read(&ci->i_release_count), + atomic64_read(&ci->i_ordered_count)); } wake = true; @@ -1212,6 +1216,10 @@ retry_lookup: dout("fill_trace doing d_move %p -> %p\n", req->r_old_dentry, dn); + /* d_move screws up sibling dentries' offsets */ + ceph_dir_clear_ordered(dir); + ceph_dir_clear_ordered(olddir); + d_move(req->r_old_dentry, dn); dout(" src %p '%pd' dst %p '%pd'\n", req->r_old_dentry, @@ -1222,10 +1230,6 @@ retry_lookup: rehashing bug in vfs_rename_dir */ ceph_invalidate_dentry_lease(dn); - /* d_move screws up sibling dentries' offsets */ - ceph_dir_clear_ordered(dir); - ceph_dir_clear_ordered(olddir); - dout("dn %p gets new offset %lld\n", req->r_old_dentry, ceph_dentry(req->r_old_dentry)->offset); @@ -1333,6 +1337,49 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, return err; } +void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl) +{ + if (ctl->page) { + kunmap(ctl->page); + page_cache_release(ctl->page); + ctl->page = NULL; + } +} + +static int fill_readdir_cache(struct inode *dir, struct dentry *dn, + struct ceph_readdir_cache_control *ctl, + struct ceph_mds_request *req) +{ + struct ceph_inode_info *ci = ceph_inode(dir); + unsigned nsize = PAGE_CACHE_SIZE / sizeof(struct dentry*); + unsigned idx = ctl->index % nsize; + pgoff_t pgoff = ctl->index / nsize; + + if (!ctl->page || pgoff != page_index(ctl->page)) { + ceph_readdir_cache_release(ctl); + ctl->page = grab_cache_page(&dir->i_data, pgoff); + if (!ctl->page) { + ctl->index = -1; + return -ENOMEM; + } + /* reading/filling the cache are serialized by + * i_mutex, no need to use page lock */ + unlock_page(ctl->page); + ctl->dentries = kmap(ctl->page); + } + + if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) && + req->r_dir_ordered_cnt == atomic64_read(&ci->i_ordered_count)) { + dout("readdir cache dn %p idx %d\n", dn, ctl->index); + ctl->dentries[idx] = dn; + ctl->index++; + } else { + dout("disable readdir cache\n"); + ctl->index = -1; + } + return 0; +} + int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct ceph_mds_session *session) { @@ -1345,8 +1392,11 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, struct inode *snapdir = NULL; struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; struct ceph_dentry_info *di; - u64 r_readdir_offset = req->r_readdir_offset; u32 frag = le32_to_cpu(rhead->args.readdir.frag); + struct ceph_readdir_cache_control cache_ctl = {}; + + if (req->r_aborted) + return readdir_prepopulate_inodes_only(req, session); if (rinfo->dir_dir && le32_to_cpu(rinfo->dir_dir->frag) != frag) { @@ -1354,14 +1404,11 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, frag, le32_to_cpu(rinfo->dir_dir->frag)); frag = le32_to_cpu(rinfo->dir_dir->frag); if (ceph_frag_is_leftmost(frag)) - r_readdir_offset = 2; + req->r_readdir_offset = 2; else - r_readdir_offset = 0; + req->r_readdir_offset = 0; } - if (req->r_aborted) - return readdir_prepopulate_inodes_only(req, session); - if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { snapdir = ceph_get_snapdir(d_inode(parent)); parent = d_find_alias(snapdir); @@ -1374,6 +1421,17 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir); } + if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) { + /* note dir version at start of readdir so we can tell + * if any dentries get dropped */ + struct ceph_inode_info *ci = ceph_inode(d_inode(parent)); + req->r_dir_release_cnt = atomic64_read(&ci->i_release_count); + req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count); + req->r_readdir_cache_idx = 0; + } + + cache_ctl.index = req->r_readdir_cache_idx; + /* FIXME: release caps/leases if error occurs */ for (i = 0; i < rinfo->dir_nr; i++) { struct ceph_vino vino; @@ -1413,13 +1471,6 @@ retry_lookup: d_delete(dn); dput(dn); goto retry_lookup; - } else { - /* reorder parent's d_subdirs */ - spin_lock(&parent->d_lock); - spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED); - list_move(&dn->d_child, &parent->d_subdirs); - spin_unlock(&dn->d_lock); - spin_unlock(&parent->d_lock); } /* inode */ @@ -1436,13 +1487,15 @@ retry_lookup: } } - if (fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, - req->r_request_started, -1, - &req->r_caps_reservation) < 0) { + ret = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, + req->r_request_started, -1, + &req->r_caps_reservation); + if (ret < 0) { pr_err("fill_inode badness on %p\n", in); if (d_really_is_negative(dn)) iput(in); d_drop(dn); + err = ret; goto next_item; } @@ -1458,19 +1511,28 @@ retry_lookup: } di = dn->d_fsdata; - di->offset = ceph_make_fpos(frag, i + r_readdir_offset); + di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset); update_dentry_lease(dn, rinfo->dir_dlease[i], req->r_session, req->r_request_started); + + if (err == 0 && cache_ctl.index >= 0) { + ret = fill_readdir_cache(d_inode(parent), dn, + &cache_ctl, req); + if (ret < 0) + err = ret; + } next_item: if (dn) dput(dn); } - if (err == 0) - req->r_did_prepopulate = true; - out: + if (err == 0) { + req->r_did_prepopulate = true; + req->r_readdir_cache_idx = cache_ctl.index; + } + ceph_readdir_cache_release(&cache_ctl); if (snapdir) { iput(snapdir); dput(parent); @@ -1691,16 +1753,9 @@ retry: /* * symlinks */ -static void *ceph_sym_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - struct ceph_inode_info *ci = ceph_inode(d_inode(dentry)); - nd_set_link(nd, ci->i_symlink); - return NULL; -} - static const struct inode_operations ceph_symlink_iops = { .readlink = generic_readlink, - .follow_link = ceph_sym_follow_link, + .follow_link = simple_follow_link, .setattr = ceph_setattr, .getattr = ceph_getattr, .setxattr = ceph_setxattr, @@ -1719,11 +1774,13 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) const unsigned int ia_valid = attr->ia_valid; struct ceph_mds_request *req; struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; + struct ceph_cap_flush *prealloc_cf; int issued; int release = 0, dirtied = 0; int mask = 0; int err = 0; int inode_dirty_flags = 0; + bool lock_snap_rwsem = false; if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; @@ -1732,13 +1789,31 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) if (err != 0) return err; + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + return -ENOMEM; + req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETATTR, USE_AUTH_MDS); - if (IS_ERR(req)) + if (IS_ERR(req)) { + ceph_free_cap_flush(prealloc_cf); return PTR_ERR(req); + } spin_lock(&ci->i_ceph_lock); issued = __ceph_caps_issued(ci, NULL); + + if (!ci->i_head_snapc && + (issued & (CEPH_CAP_ANY_EXCL | CEPH_CAP_FILE_WR))) { + lock_snap_rwsem = true; + if (!down_read_trylock(&mdsc->snap_rwsem)) { + spin_unlock(&ci->i_ceph_lock); + down_read(&mdsc->snap_rwsem); + spin_lock(&ci->i_ceph_lock); + issued = __ceph_caps_issued(ci, NULL); + } + } + dout("setattr %p issued %s\n", inode, ceph_cap_string(issued)); if (ia_valid & ATTR_UID) { @@ -1881,12 +1956,15 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) dout("setattr %p ATTR_FILE ... hrm!\n", inode); if (dirtied) { - inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied); + inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied, + &prealloc_cf); inode->i_ctime = CURRENT_TIME; } release &= issued; spin_unlock(&ci->i_ceph_lock); + if (lock_snap_rwsem) + up_read(&mdsc->snap_rwsem); if (inode_dirty_flags) __mark_inode_dirty(inode, inode_dirty_flags); @@ -1911,9 +1989,11 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) ceph_mdsc_put_request(req); if (mask & CEPH_SETATTR_SIZE) __ceph_do_pending_vmtruncate(inode); + ceph_free_cap_flush(prealloc_cf); return err; out_put: ceph_mdsc_put_request(req); + ceph_free_cap_flush(prealloc_cf); return err; } diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index 4347039ec..6706bde9a 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -287,7 +287,7 @@ int ceph_encode_locks_to_buffer(struct inode *inode, return 0; spin_lock(&ctx->flc_lock); - list_for_each_entry(lock, &ctx->flc_flock, fl_list) { + list_for_each_entry(lock, &ctx->flc_posix, fl_list) { ++seen_fcntl; if (seen_fcntl > num_fcntl_locks) { err = -ENOSPC; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 84f37f34f..6aa07af67 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -8,6 +8,7 @@ #include <linux/debugfs.h> #include <linux/seq_file.h> #include <linux/utsname.h> +#include <linux/ratelimit.h> #include "super.h" #include "mds_client.h" @@ -458,7 +459,6 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, s->s_cap_reconnect = 0; s->s_cap_iterator = NULL; INIT_LIST_HEAD(&s->s_cap_releases); - INIT_LIST_HEAD(&s->s_cap_releases_done); INIT_LIST_HEAD(&s->s_cap_flushing); INIT_LIST_HEAD(&s->s_cap_snaps_flushing); @@ -629,6 +629,9 @@ static void __register_request(struct ceph_mds_client *mdsc, req->r_uid = current_fsuid(); req->r_gid = current_fsgid(); + if (mdsc->oldest_tid == 0 && req->r_op != CEPH_MDS_OP_SETFILELOCK) + mdsc->oldest_tid = req->r_tid; + if (dir) { struct ceph_inode_info *ci = ceph_inode(dir); @@ -644,6 +647,21 @@ static void __unregister_request(struct ceph_mds_client *mdsc, struct ceph_mds_request *req) { dout("__unregister_request %p tid %lld\n", req, req->r_tid); + + if (req->r_tid == mdsc->oldest_tid) { + struct rb_node *p = rb_next(&req->r_node); + mdsc->oldest_tid = 0; + while (p) { + struct ceph_mds_request *next_req = + rb_entry(p, struct ceph_mds_request, r_node); + if (next_req->r_op != CEPH_MDS_OP_SETFILELOCK) { + mdsc->oldest_tid = next_req->r_tid; + break; + } + p = rb_next(p); + } + } + rb_erase(&req->r_node, &mdsc->request_tree); RB_CLEAR_NODE(&req->r_node); @@ -998,27 +1016,25 @@ void ceph_mdsc_open_export_target_sessions(struct ceph_mds_client *mdsc, * session caps */ -/* - * Free preallocated cap messages assigned to this session - */ -static void cleanup_cap_releases(struct ceph_mds_session *session) +/* caller holds s_cap_lock, we drop it */ +static void cleanup_cap_releases(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session) + __releases(session->s_cap_lock) { - struct ceph_msg *msg; + LIST_HEAD(tmp_list); + list_splice_init(&session->s_cap_releases, &tmp_list); + session->s_num_cap_releases = 0; + spin_unlock(&session->s_cap_lock); - spin_lock(&session->s_cap_lock); - while (!list_empty(&session->s_cap_releases)) { - msg = list_first_entry(&session->s_cap_releases, - struct ceph_msg, list_head); - list_del_init(&msg->list_head); - ceph_msg_put(msg); - } - while (!list_empty(&session->s_cap_releases_done)) { - msg = list_first_entry(&session->s_cap_releases_done, - struct ceph_msg, list_head); - list_del_init(&msg->list_head); - ceph_msg_put(msg); + dout("cleanup_cap_releases mds%d\n", session->s_mds); + while (!list_empty(&tmp_list)) { + struct ceph_cap *cap; + /* zero out the in-progress message */ + cap = list_first_entry(&tmp_list, + struct ceph_cap, session_caps); + list_del(&cap->session_caps); + ceph_put_cap(mdsc, cap); } - spin_unlock(&session->s_cap_lock); } static void cleanup_session_requests(struct ceph_mds_client *mdsc, @@ -1033,7 +1049,8 @@ static void cleanup_session_requests(struct ceph_mds_client *mdsc, req = list_first_entry(&session->s_unsafe, struct ceph_mds_request, r_unsafe_item); list_del_init(&req->r_unsafe_item); - pr_info(" dropping unsafe request %llu\n", req->r_tid); + pr_warn_ratelimited(" dropping unsafe request %llu\n", + req->r_tid); __unregister_request(mdsc, req); } /* zero r_attempts, so kick_requests() will re-send requests */ @@ -1095,10 +1112,16 @@ static int iterate_session_caps(struct ceph_mds_session *session, dout("iterate_session_caps finishing cap %p removal\n", cap); BUG_ON(cap->session != session); + cap->session = NULL; list_del_init(&cap->session_caps); session->s_nr_caps--; - cap->session = NULL; - old_cap = cap; /* put_cap it w/o locks held */ + if (cap->queue_release) { + list_add_tail(&cap->session_caps, + &session->s_cap_releases); + session->s_num_cap_releases++; + } else { + old_cap = cap; /* put_cap it w/o locks held */ + } } if (ret < 0) goto out; @@ -1119,6 +1142,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) { struct ceph_inode_info *ci = ceph_inode(inode); + LIST_HEAD(to_remove); int drop = 0; dout("removing cap %p, ci is %p, inode is %p\n", @@ -1126,12 +1150,27 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, spin_lock(&ci->i_ceph_lock); __ceph_remove_cap(cap, false); if (!ci->i_auth_cap) { + struct ceph_cap_flush *cf; struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc; + while (true) { + struct rb_node *n = rb_first(&ci->i_cap_flush_tree); + if (!n) + break; + cf = rb_entry(n, struct ceph_cap_flush, i_node); + rb_erase(&cf->i_node, &ci->i_cap_flush_tree); + list_add(&cf->list, &to_remove); + } + spin_lock(&mdsc->cap_dirty_lock); + + list_for_each_entry(cf, &to_remove, list) + rb_erase(&cf->g_node, &mdsc->cap_flush_tree); + if (!list_empty(&ci->i_dirty_item)) { - pr_info(" dropping dirty %s state for %p %lld\n", + pr_warn_ratelimited( + " dropping dirty %s state for %p %lld\n", ceph_cap_string(ci->i_dirty_caps), inode, ceph_ino(inode)); ci->i_dirty_caps = 0; @@ -1139,7 +1178,8 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, drop = 1; } if (!list_empty(&ci->i_flushing_item)) { - pr_info(" dropping dirty+flushing %s state for %p %lld\n", + pr_warn_ratelimited( + " dropping dirty+flushing %s state for %p %lld\n", ceph_cap_string(ci->i_flushing_caps), inode, ceph_ino(inode)); ci->i_flushing_caps = 0; @@ -1148,8 +1188,20 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, drop = 1; } spin_unlock(&mdsc->cap_dirty_lock); + + if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) { + list_add(&ci->i_prealloc_cap_flush->list, &to_remove); + ci->i_prealloc_cap_flush = NULL; + } } spin_unlock(&ci->i_ceph_lock); + while (!list_empty(&to_remove)) { + struct ceph_cap_flush *cf; + cf = list_first_entry(&to_remove, + struct ceph_cap_flush, list); + list_del(&cf->list); + ceph_free_cap_flush(cf); + } while (drop--) iput(inode); return 0; @@ -1191,11 +1243,12 @@ static void remove_session_caps(struct ceph_mds_session *session) spin_lock(&session->s_cap_lock); } } - spin_unlock(&session->s_cap_lock); + + // drop cap expires and unlock s_cap_lock + cleanup_cap_releases(session->s_mdsc, session); BUG_ON(session->s_nr_caps > 0); BUG_ON(!list_empty(&session->s_cap_flushing)); - cleanup_cap_releases(session); } /* @@ -1371,7 +1424,8 @@ static int trim_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) inode, cap, ceph_cap_string(mine), ceph_cap_string(oissued), ceph_cap_string(used), ceph_cap_string(wanted)); if (cap == ci->i_auth_cap) { - if (ci->i_dirty_caps | ci->i_flushing_caps) + if (ci->i_dirty_caps || ci->i_flushing_caps || + !list_empty(&ci->i_cap_snaps)) goto out; if ((used | wanted) & CEPH_CAP_ANY_WR) goto out; @@ -1417,121 +1471,80 @@ static int trim_caps(struct ceph_mds_client *mdsc, session->s_trim_caps = 0; } - ceph_add_cap_releases(mdsc, session); ceph_send_cap_releases(mdsc, session); return 0; } -/* - * Allocate cap_release messages. If there is a partially full message - * in the queue, try to allocate enough to cover it's remainder, so that - * we can send it immediately. - * - * Called under s_mutex. - */ -int ceph_add_cap_releases(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session) +static int check_capsnap_flush(struct ceph_inode_info *ci, + u64 want_snap_seq) { - struct ceph_msg *msg, *partial = NULL; - struct ceph_mds_cap_release *head; - int err = -ENOMEM; - int extra = mdsc->fsc->mount_options->cap_release_safety; - int num; - - dout("add_cap_releases %p mds%d extra %d\n", session, session->s_mds, - extra); - - spin_lock(&session->s_cap_lock); - - if (!list_empty(&session->s_cap_releases)) { - msg = list_first_entry(&session->s_cap_releases, - struct ceph_msg, - list_head); - head = msg->front.iov_base; - num = le32_to_cpu(head->num); - if (num) { - dout(" partial %p with (%d/%d)\n", msg, num, - (int)CEPH_CAPS_PER_RELEASE); - extra += CEPH_CAPS_PER_RELEASE - num; - partial = msg; - } - } - while (session->s_num_cap_releases < session->s_nr_caps + extra) { - spin_unlock(&session->s_cap_lock); - msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, PAGE_CACHE_SIZE, - GFP_NOFS, false); - if (!msg) - goto out_unlocked; - dout("add_cap_releases %p msg %p now %d\n", session, msg, - (int)msg->front.iov_len); - head = msg->front.iov_base; - head->num = cpu_to_le32(0); - msg->front.iov_len = sizeof(*head); - spin_lock(&session->s_cap_lock); - list_add(&msg->list_head, &session->s_cap_releases); - session->s_num_cap_releases += CEPH_CAPS_PER_RELEASE; - } - - if (partial) { - head = partial->front.iov_base; - num = le32_to_cpu(head->num); - dout(" queueing partial %p with %d/%d\n", partial, num, - (int)CEPH_CAPS_PER_RELEASE); - list_move_tail(&partial->list_head, - &session->s_cap_releases_done); - session->s_num_cap_releases -= CEPH_CAPS_PER_RELEASE - num; + int ret = 1; + spin_lock(&ci->i_ceph_lock); + if (want_snap_seq > 0 && !list_empty(&ci->i_cap_snaps)) { + struct ceph_cap_snap *capsnap = + list_first_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, ci_item); + ret = capsnap->follows >= want_snap_seq; } - err = 0; - spin_unlock(&session->s_cap_lock); -out_unlocked: - return err; + spin_unlock(&ci->i_ceph_lock); + return ret; } -static int check_cap_flush(struct inode *inode, u64 want_flush_seq) +static int check_caps_flush(struct ceph_mds_client *mdsc, + u64 want_flush_tid) { - struct ceph_inode_info *ci = ceph_inode(inode); - int ret; - spin_lock(&ci->i_ceph_lock); - if (ci->i_flushing_caps) - ret = ci->i_cap_flush_seq >= want_flush_seq; - else - ret = 1; - spin_unlock(&ci->i_ceph_lock); + struct rb_node *n; + struct ceph_cap_flush *cf; + int ret = 1; + + spin_lock(&mdsc->cap_dirty_lock); + n = rb_first(&mdsc->cap_flush_tree); + cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL; + if (cf && cf->tid <= want_flush_tid) { + dout("check_caps_flush still flushing tid %llu <= %llu\n", + cf->tid, want_flush_tid); + ret = 0; + } + spin_unlock(&mdsc->cap_dirty_lock); return ret; } /* * flush all dirty inode data to disk. * - * returns true if we've flushed through want_flush_seq + * returns true if we've flushed through want_flush_tid */ -static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) +static void wait_caps_flush(struct ceph_mds_client *mdsc, + u64 want_flush_tid, u64 want_snap_seq) { int mds; - dout("check_cap_flush want %lld\n", want_flush_seq); + dout("check_caps_flush want %llu snap want %llu\n", + want_flush_tid, want_snap_seq); mutex_lock(&mdsc->mutex); - for (mds = 0; mds < mdsc->max_sessions; mds++) { + for (mds = 0; mds < mdsc->max_sessions; ) { struct ceph_mds_session *session = mdsc->sessions[mds]; struct inode *inode = NULL; - if (!session) + if (!session) { + mds++; continue; + } get_session(session); mutex_unlock(&mdsc->mutex); mutex_lock(&session->s_mutex); - if (!list_empty(&session->s_cap_flushing)) { - struct ceph_inode_info *ci = - list_entry(session->s_cap_flushing.next, - struct ceph_inode_info, - i_flushing_item); - - if (!check_cap_flush(&ci->vfs_inode, want_flush_seq)) { - dout("check_cap_flush still flushing %p " - "seq %lld <= %lld to mds%d\n", - &ci->vfs_inode, ci->i_cap_flush_seq, - want_flush_seq, session->s_mds); + if (!list_empty(&session->s_cap_snaps_flushing)) { + struct ceph_cap_snap *capsnap = + list_first_entry(&session->s_cap_snaps_flushing, + struct ceph_cap_snap, + flushing_item); + struct ceph_inode_info *ci = capsnap->ci; + if (!check_capsnap_flush(ci, want_snap_seq)) { + dout("check_cap_flush still flushing snap %p " + "follows %lld <= %lld to mds%d\n", + &ci->vfs_inode, capsnap->follows, + want_snap_seq, mds); inode = igrab(&ci->vfs_inode); } } @@ -1540,15 +1553,21 @@ static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) if (inode) { wait_event(mdsc->cap_flushing_wq, - check_cap_flush(inode, want_flush_seq)); + check_capsnap_flush(ceph_inode(inode), + want_snap_seq)); iput(inode); + } else { + mds++; } mutex_lock(&mdsc->mutex); } - mutex_unlock(&mdsc->mutex); - dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq); + + wait_event(mdsc->cap_flushing_wq, + check_caps_flush(mdsc, want_flush_tid)); + + dout("check_caps_flush ok, flushed thru %llu\n", want_flush_tid); } /* @@ -1557,60 +1576,74 @@ static void wait_caps_flush(struct ceph_mds_client *mdsc, u64 want_flush_seq) void ceph_send_cap_releases(struct ceph_mds_client *mdsc, struct ceph_mds_session *session) { - struct ceph_msg *msg; + struct ceph_msg *msg = NULL; + struct ceph_mds_cap_release *head; + struct ceph_mds_cap_item *item; + struct ceph_cap *cap; + LIST_HEAD(tmp_list); + int num_cap_releases; - dout("send_cap_releases mds%d\n", session->s_mds); spin_lock(&session->s_cap_lock); - while (!list_empty(&session->s_cap_releases_done)) { - msg = list_first_entry(&session->s_cap_releases_done, - struct ceph_msg, list_head); - list_del_init(&msg->list_head); - spin_unlock(&session->s_cap_lock); - msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); - dout("send_cap_releases mds%d %p\n", session->s_mds, msg); - ceph_con_send(&session->s_con, msg); - spin_lock(&session->s_cap_lock); - } +again: + list_splice_init(&session->s_cap_releases, &tmp_list); + num_cap_releases = session->s_num_cap_releases; + session->s_num_cap_releases = 0; spin_unlock(&session->s_cap_lock); -} -static void discard_cap_releases(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session) -{ - struct ceph_msg *msg; - struct ceph_mds_cap_release *head; - unsigned num; - - dout("discard_cap_releases mds%d\n", session->s_mds); + while (!list_empty(&tmp_list)) { + if (!msg) { + msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPRELEASE, + PAGE_CACHE_SIZE, GFP_NOFS, false); + if (!msg) + goto out_err; + head = msg->front.iov_base; + head->num = cpu_to_le32(0); + msg->front.iov_len = sizeof(*head); + } + cap = list_first_entry(&tmp_list, struct ceph_cap, + session_caps); + list_del(&cap->session_caps); + num_cap_releases--; - if (!list_empty(&session->s_cap_releases)) { - /* zero out the in-progress message */ - msg = list_first_entry(&session->s_cap_releases, - struct ceph_msg, list_head); head = msg->front.iov_base; - num = le32_to_cpu(head->num); - dout("discard_cap_releases mds%d %p %u\n", - session->s_mds, msg, num); - head->num = cpu_to_le32(0); - msg->front.iov_len = sizeof(*head); - session->s_num_cap_releases += num; + le32_add_cpu(&head->num, 1); + item = msg->front.iov_base + msg->front.iov_len; + item->ino = cpu_to_le64(cap->cap_ino); + item->cap_id = cpu_to_le64(cap->cap_id); + item->migrate_seq = cpu_to_le32(cap->mseq); + item->seq = cpu_to_le32(cap->issue_seq); + msg->front.iov_len += sizeof(*item); + + ceph_put_cap(mdsc, cap); + + if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) { + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + dout("send_cap_releases mds%d %p\n", session->s_mds, msg); + ceph_con_send(&session->s_con, msg); + msg = NULL; + } } - /* requeue completed messages */ - while (!list_empty(&session->s_cap_releases_done)) { - msg = list_first_entry(&session->s_cap_releases_done, - struct ceph_msg, list_head); - list_del_init(&msg->list_head); + BUG_ON(num_cap_releases != 0); - head = msg->front.iov_base; - num = le32_to_cpu(head->num); - dout("discard_cap_releases mds%d %p %u\n", session->s_mds, msg, - num); - session->s_num_cap_releases += num; - head->num = cpu_to_le32(0); - msg->front.iov_len = sizeof(*head); - list_add(&msg->list_head, &session->s_cap_releases); + spin_lock(&session->s_cap_lock); + if (!list_empty(&session->s_cap_releases)) + goto again; + spin_unlock(&session->s_cap_lock); + + if (msg) { + msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); + dout("send_cap_releases mds%d %p\n", session->s_mds, msg); + ceph_con_send(&session->s_con, msg); } + return; +out_err: + pr_err("send_cap_releases mds%d, failed to allocate message\n", + session->s_mds); + spin_lock(&session->s_cap_lock); + list_splice(&tmp_list, &session->s_cap_releases); + session->s_num_cap_releases += num_cap_releases; + spin_unlock(&session->s_cap_lock); } /* @@ -1635,7 +1668,8 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, order = get_order(size * num_entries); while (order >= 0) { - rinfo->dir_in = (void*)__get_free_pages(GFP_NOFS | __GFP_NOWARN, + rinfo->dir_in = (void*)__get_free_pages(GFP_KERNEL | + __GFP_NOWARN, order); if (rinfo->dir_in) break; @@ -1697,13 +1731,9 @@ static struct ceph_mds_request *__get_oldest_req(struct ceph_mds_client *mdsc) struct ceph_mds_request, r_node); } -static u64 __get_oldest_tid(struct ceph_mds_client *mdsc) +static inline u64 __get_oldest_tid(struct ceph_mds_client *mdsc) { - struct ceph_mds_request *req = __get_oldest_req(mdsc); - - if (req) - return req->r_tid; - return 0; + return mdsc->oldest_tid; } /* @@ -2267,15 +2297,18 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, /* wait */ mutex_unlock(&mdsc->mutex); dout("do_request waiting\n"); - if (req->r_timeout) { - err = (long)wait_for_completion_killable_timeout( - &req->r_completion, req->r_timeout); - if (err == 0) - err = -EIO; - } else if (req->r_wait_for_completion) { + if (!req->r_timeout && req->r_wait_for_completion) { err = req->r_wait_for_completion(mdsc, req); } else { - err = wait_for_completion_killable(&req->r_completion); + long timeleft = wait_for_completion_killable_timeout( + &req->r_completion, + ceph_timeout_jiffies(req->r_timeout)); + if (timeleft > 0) + err = 0; + else if (!timeleft) + err = -EIO; /* timed out */ + else + err = timeleft; /* killed */ } dout("do_request waited, got %d\n", err); mutex_lock(&mdsc->mutex); @@ -2496,7 +2529,6 @@ out_err: } mutex_unlock(&mdsc->mutex); - ceph_add_cap_releases(mdsc, req->r_session); mutex_unlock(&session->s_mutex); /* kick calling process */ @@ -2888,8 +2920,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, */ session->s_cap_reconnect = 1; /* drop old cap expires; we're about to reestablish that state */ - discard_cap_releases(mdsc, session); - spin_unlock(&session->s_cap_lock); + cleanup_cap_releases(mdsc, session); /* trim unused caps to reduce MDS's cache rejoin time */ if (mdsc->fsc->sb->s_root) @@ -2956,6 +2987,9 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, reply->hdr.data_len = cpu_to_le32(pagelist->length); ceph_msg_data_add_pagelist(reply, pagelist); + + ceph_early_kick_flushing_caps(mdsc, session); + ceph_con_send(&session->s_con, reply); mutex_unlock(&session->s_mutex); @@ -3352,7 +3386,6 @@ static void delayed_work(struct work_struct *work) send_renew_caps(mdsc, s); else ceph_con_keepalive(&s->s_con); - ceph_add_cap_releases(mdsc, s); if (s->s_state == CEPH_MDS_SESSION_OPEN || s->s_state == CEPH_MDS_SESSION_HUNG) ceph_send_cap_releases(mdsc, s); @@ -3390,11 +3423,13 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) atomic_set(&mdsc->num_sessions, 0); mdsc->max_sessions = 0; mdsc->stopping = 0; + mdsc->last_snap_seq = 0; init_rwsem(&mdsc->snap_rwsem); mdsc->snap_realms = RB_ROOT; INIT_LIST_HEAD(&mdsc->snap_empty); spin_lock_init(&mdsc->snap_empty_lock); mdsc->last_tid = 0; + mdsc->oldest_tid = 0; mdsc->request_tree = RB_ROOT; INIT_DELAYED_WORK(&mdsc->delayed_work, delayed_work); mdsc->last_renew_caps = jiffies; @@ -3402,7 +3437,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) spin_lock_init(&mdsc->cap_delay_lock); INIT_LIST_HEAD(&mdsc->snap_flush_list); spin_lock_init(&mdsc->snap_flush_lock); - mdsc->cap_flush_seq = 0; + mdsc->last_cap_flush_tid = 1; + mdsc->cap_flush_tree = RB_ROOT; INIT_LIST_HEAD(&mdsc->cap_dirty); INIT_LIST_HEAD(&mdsc->cap_dirty_migrating); mdsc->num_cap_flushing = 0; @@ -3414,6 +3450,9 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) ceph_caps_init(mdsc); ceph_adjust_min_caps(mdsc, fsc->min_caps); + init_rwsem(&mdsc->pool_perm_rwsem); + mdsc->pool_perm_tree = RB_ROOT; + return 0; } @@ -3423,8 +3462,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) */ static void wait_requests(struct ceph_mds_client *mdsc) { + struct ceph_options *opts = mdsc->fsc->client->options; struct ceph_mds_request *req; - struct ceph_fs_client *fsc = mdsc->fsc; mutex_lock(&mdsc->mutex); if (__get_oldest_req(mdsc)) { @@ -3432,7 +3471,7 @@ static void wait_requests(struct ceph_mds_client *mdsc) dout("wait_requests waiting for requests\n"); wait_for_completion_timeout(&mdsc->safe_umount_waiters, - fsc->client->options->mount_timeout * HZ); + ceph_timeout_jiffies(opts->mount_timeout)); /* tear down remaining requests */ mutex_lock(&mdsc->mutex); @@ -3485,7 +3524,8 @@ restart: nextreq = rb_entry(n, struct ceph_mds_request, r_node); else nextreq = NULL; - if ((req->r_op & CEPH_MDS_OP_WRITE)) { + if (req->r_op != CEPH_MDS_OP_SETFILELOCK && + (req->r_op & CEPH_MDS_OP_WRITE)) { /* write op */ ceph_mdsc_get_request(req); if (nextreq) @@ -3513,7 +3553,7 @@ restart: void ceph_mdsc_sync(struct ceph_mds_client *mdsc) { - u64 want_tid, want_flush; + u64 want_tid, want_flush, want_snap; if (mdsc->fsc->mount_state == CEPH_MOUNT_SHUTDOWN) return; @@ -3525,13 +3565,18 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) ceph_flush_dirty_caps(mdsc); spin_lock(&mdsc->cap_dirty_lock); - want_flush = mdsc->cap_flush_seq; + want_flush = mdsc->last_cap_flush_tid; spin_unlock(&mdsc->cap_dirty_lock); - dout("sync want tid %lld flush_seq %lld\n", want_tid, want_flush); + down_read(&mdsc->snap_rwsem); + want_snap = mdsc->last_snap_seq; + up_read(&mdsc->snap_rwsem); + + dout("sync want tid %lld flush_seq %lld snap_seq %lld\n", + want_tid, want_flush, want_snap); wait_unsafe_requests(mdsc, want_tid); - wait_caps_flush(mdsc, want_flush); + wait_caps_flush(mdsc, want_flush, want_snap); } /* @@ -3549,10 +3594,9 @@ static bool done_closing_sessions(struct ceph_mds_client *mdsc) */ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) { + struct ceph_options *opts = mdsc->fsc->client->options; struct ceph_mds_session *session; int i; - struct ceph_fs_client *fsc = mdsc->fsc; - unsigned long timeout = fsc->client->options->mount_timeout * HZ; dout("close_sessions\n"); @@ -3573,7 +3617,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc) dout("waiting for sessions to close\n"); wait_event_timeout(mdsc->session_close_wq, done_closing_sessions(mdsc), - timeout); + ceph_timeout_jiffies(opts->mount_timeout)); /* tear down remaining sessions */ mutex_lock(&mdsc->mutex); @@ -3607,6 +3651,7 @@ static void ceph_mdsc_stop(struct ceph_mds_client *mdsc) ceph_mdsmap_destroy(mdsc->mdsmap); kfree(mdsc->sessions); ceph_caps_finalize(mdsc); + ceph_pool_perm_destroy(mdsc); } void ceph_mdsc_destroy(struct ceph_fs_client *fsc) diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 1875b5d98..762757e6c 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -139,7 +139,6 @@ struct ceph_mds_session { int s_cap_reconnect; int s_readonly; struct list_head s_cap_releases; /* waiting cap_release messages */ - struct list_head s_cap_releases_done; /* ready to send */ struct ceph_cap *s_cap_iterator; /* protected by mutex */ @@ -228,7 +227,7 @@ struct ceph_mds_request { int r_err; bool r_aborted; - unsigned long r_timeout; /* optional. jiffies */ + unsigned long r_timeout; /* optional. jiffies, 0 is "wait forever" */ unsigned long r_started; /* start time to measure timeout against */ unsigned long r_request_started; /* start time for mds request only, used to measure lease durations */ @@ -254,12 +253,21 @@ struct ceph_mds_request { bool r_got_unsafe, r_got_safe, r_got_result; bool r_did_prepopulate; + long long r_dir_release_cnt; + long long r_dir_ordered_cnt; + int r_readdir_cache_idx; u32 r_readdir_offset; struct ceph_cap_reservation r_caps_reservation; int r_num_caps; }; +struct ceph_pool_perm { + struct rb_node node; + u32 pool; + int perm; +}; + /* * mds client state */ @@ -284,12 +292,15 @@ struct ceph_mds_client { * references (implying they contain no inodes with caps) that * should be destroyed. */ + u64 last_snap_seq; struct rw_semaphore snap_rwsem; struct rb_root snap_realms; struct list_head snap_empty; spinlock_t snap_empty_lock; /* protect snap_empty */ u64 last_tid; /* most recent mds request */ + u64 oldest_tid; /* oldest incomplete mds request, + excluding setfilelock requests */ struct rb_root request_tree; /* pending mds requests */ struct delayed_work delayed_work; /* delayed work */ unsigned long last_renew_caps; /* last time we renewed our caps */ @@ -298,7 +309,8 @@ struct ceph_mds_client { struct list_head snap_flush_list; /* cap_snaps ready to flush */ spinlock_t snap_flush_lock; - u64 cap_flush_seq; + u64 last_cap_flush_tid; + struct rb_root cap_flush_tree; struct list_head cap_dirty; /* inodes with dirty caps */ struct list_head cap_dirty_migrating; /* ...that are migration... */ int num_cap_flushing; /* # caps we are flushing */ @@ -328,6 +340,9 @@ struct ceph_mds_client { spinlock_t dentry_lru_lock; struct list_head dentry_lru; int num_dentry; + + struct rw_semaphore pool_perm_rwsem; + struct rb_root pool_perm_tree; }; extern const char *ceph_mds_op_name(int op); @@ -379,8 +394,6 @@ static inline void ceph_mdsc_put_request(struct ceph_mds_request *req) kref_put(&req->r_kref, ceph_mdsc_release_request); } -extern int ceph_add_cap_releases(struct ceph_mds_client *mdsc, - struct ceph_mds_session *session); extern void ceph_send_cap_releases(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index a97e39f09..233d906ae 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -296,7 +296,7 @@ static int cmpu64_rev(const void *a, const void *b) } -static struct ceph_snap_context *empty_snapc; +struct ceph_snap_context *ceph_empty_snapc; /* * build the snap context for a given realm. @@ -338,9 +338,9 @@ static int build_snap_context(struct ceph_snap_realm *realm) return 0; } - if (num == 0 && realm->seq == empty_snapc->seq) { - ceph_get_snap_context(empty_snapc); - snapc = empty_snapc; + if (num == 0 && realm->seq == ceph_empty_snapc->seq) { + ceph_get_snap_context(ceph_empty_snapc); + snapc = ceph_empty_snapc; goto done; } @@ -436,6 +436,14 @@ static int dup_array(u64 **dst, __le64 *src, u32 num) return 0; } +static bool has_new_snaps(struct ceph_snap_context *o, + struct ceph_snap_context *n) +{ + if (n->num_snaps == 0) + return false; + /* snaps are in descending order */ + return n->snaps[0] > o->seq; +} /* * When a snapshot is applied, the size/mtime inode metadata is queued @@ -455,6 +463,7 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) { struct inode *inode = &ci->vfs_inode; struct ceph_cap_snap *capsnap; + struct ceph_snap_context *old_snapc, *new_snapc; int used, dirty; capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS); @@ -467,6 +476,9 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) used = __ceph_caps_used(ci); dirty = __ceph_caps_dirty(ci); + old_snapc = ci->i_head_snapc; + new_snapc = ci->i_snap_realm->cached_context; + /* * If there is a write in progress, treat that as a dirty Fw, * even though it hasn't completed yet; by the time we finish @@ -481,76 +493,95 @@ void ceph_queue_cap_snap(struct ceph_inode_info *ci) writes in progress now were started before the previous cap_snap. lucky us. */ dout("queue_cap_snap %p already pending\n", inode); - kfree(capsnap); - } else if (ci->i_snap_realm->cached_context == empty_snapc) { - dout("queue_cap_snap %p empty snapc\n", inode); - kfree(capsnap); - } else if (dirty & (CEPH_CAP_AUTH_EXCL|CEPH_CAP_XATTR_EXCL| - CEPH_CAP_FILE_EXCL|CEPH_CAP_FILE_WR)) { - struct ceph_snap_context *snapc = ci->i_head_snapc; - - /* - * if we are a sync write, we may need to go to the snaprealm - * to get the current snapc. - */ - if (!snapc) - snapc = ci->i_snap_realm->cached_context; + goto update_snapc; + } + if (ci->i_wrbuffer_ref_head == 0 && + !(dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR))) { + dout("queue_cap_snap %p nothing dirty|writing\n", inode); + goto update_snapc; + } - dout("queue_cap_snap %p cap_snap %p queuing under %p %s\n", - inode, capsnap, snapc, ceph_cap_string(dirty)); - ihold(inode); + BUG_ON(!old_snapc); - atomic_set(&capsnap->nref, 1); - capsnap->ci = ci; - INIT_LIST_HEAD(&capsnap->ci_item); - INIT_LIST_HEAD(&capsnap->flushing_item); - - capsnap->follows = snapc->seq; - capsnap->issued = __ceph_caps_issued(ci, NULL); - capsnap->dirty = dirty; - - capsnap->mode = inode->i_mode; - capsnap->uid = inode->i_uid; - capsnap->gid = inode->i_gid; - - if (dirty & CEPH_CAP_XATTR_EXCL) { - __ceph_build_xattrs_blob(ci); - capsnap->xattr_blob = - ceph_buffer_get(ci->i_xattrs.blob); - capsnap->xattr_version = ci->i_xattrs.version; - } else { - capsnap->xattr_blob = NULL; - capsnap->xattr_version = 0; + /* + * There is no need to send FLUSHSNAP message to MDS if there is + * no new snapshot. But when there is dirty pages or on-going + * writes, we still need to create cap_snap. cap_snap is needed + * by the write path and page writeback path. + * + * also see ceph_try_drop_cap_snap() + */ + if (has_new_snaps(old_snapc, new_snapc)) { + if (dirty & (CEPH_CAP_ANY_EXCL|CEPH_CAP_FILE_WR)) + capsnap->need_flush = true; + } else { + if (!(used & CEPH_CAP_FILE_WR) && + ci->i_wrbuffer_ref_head == 0) { + dout("queue_cap_snap %p " + "no new_snap|dirty_page|writing\n", inode); + goto update_snapc; } + } - capsnap->inline_data = ci->i_inline_version != CEPH_INLINE_NONE; - - /* dirty page count moved from _head to this cap_snap; - all subsequent writes page dirties occur _after_ this - snapshot. */ - capsnap->dirty_pages = ci->i_wrbuffer_ref_head; - ci->i_wrbuffer_ref_head = 0; - capsnap->context = snapc; - ci->i_head_snapc = - ceph_get_snap_context(ci->i_snap_realm->cached_context); - dout(" new snapc is %p\n", ci->i_head_snapc); - list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); - - if (used & CEPH_CAP_FILE_WR) { - dout("queue_cap_snap %p cap_snap %p snapc %p" - " seq %llu used WR, now pending\n", inode, - capsnap, snapc, snapc->seq); - capsnap->writing = 1; - } else { - /* note mtime, size NOW. */ - __ceph_finish_cap_snap(ci, capsnap); - } + dout("queue_cap_snap %p cap_snap %p queuing under %p %s %s\n", + inode, capsnap, old_snapc, ceph_cap_string(dirty), + capsnap->need_flush ? "" : "no_flush"); + ihold(inode); + + atomic_set(&capsnap->nref, 1); + capsnap->ci = ci; + INIT_LIST_HEAD(&capsnap->ci_item); + INIT_LIST_HEAD(&capsnap->flushing_item); + + capsnap->follows = old_snapc->seq; + capsnap->issued = __ceph_caps_issued(ci, NULL); + capsnap->dirty = dirty; + + capsnap->mode = inode->i_mode; + capsnap->uid = inode->i_uid; + capsnap->gid = inode->i_gid; + + if (dirty & CEPH_CAP_XATTR_EXCL) { + __ceph_build_xattrs_blob(ci); + capsnap->xattr_blob = + ceph_buffer_get(ci->i_xattrs.blob); + capsnap->xattr_version = ci->i_xattrs.version; } else { - dout("queue_cap_snap %p nothing dirty|writing\n", inode); - kfree(capsnap); + capsnap->xattr_blob = NULL; + capsnap->xattr_version = 0; } + capsnap->inline_data = ci->i_inline_version != CEPH_INLINE_NONE; + + /* dirty page count moved from _head to this cap_snap; + all subsequent writes page dirties occur _after_ this + snapshot. */ + capsnap->dirty_pages = ci->i_wrbuffer_ref_head; + ci->i_wrbuffer_ref_head = 0; + capsnap->context = old_snapc; + list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps); + old_snapc = NULL; + + if (used & CEPH_CAP_FILE_WR) { + dout("queue_cap_snap %p cap_snap %p snapc %p" + " seq %llu used WR, now pending\n", inode, + capsnap, old_snapc, old_snapc->seq); + capsnap->writing = 1; + } else { + /* note mtime, size NOW. */ + __ceph_finish_cap_snap(ci, capsnap); + } + capsnap = NULL; + +update_snapc: + if (ci->i_head_snapc) { + ci->i_head_snapc = ceph_get_snap_context(new_snapc); + dout(" new snapc is %p\n", new_snapc); + } spin_unlock(&ci->i_ceph_lock); + + kfree(capsnap); + ceph_put_snap_context(old_snapc); } /* @@ -699,6 +730,8 @@ more: /* queue realm for cap_snap creation */ list_add(&realm->dirty_item, &dirty_realms); + if (realm->seq > mdsc->last_snap_seq) + mdsc->last_snap_seq = realm->seq; invalidate = 1; } else if (!realm->cached_context) { @@ -964,14 +997,14 @@ out: int __init ceph_snap_init(void) { - empty_snapc = ceph_create_snap_context(0, GFP_NOFS); - if (!empty_snapc) + ceph_empty_snapc = ceph_create_snap_context(0, GFP_NOFS); + if (!ceph_empty_snapc) return -ENOMEM; - empty_snapc->seq = 1; + ceph_empty_snapc->seq = 1; return 0; } void ceph_snap_exit(void) { - ceph_put_snap_context(empty_snapc); + ceph_put_snap_context(ceph_empty_snapc); } diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 4e9905374..d1c833c32 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -134,10 +134,12 @@ enum { Opt_noino32, Opt_fscache, Opt_nofscache, + Opt_poolperm, + Opt_nopoolperm, #ifdef CONFIG_CEPH_FS_POSIX_ACL Opt_acl, #endif - Opt_noacl + Opt_noacl, }; static match_table_t fsopt_tokens = { @@ -165,6 +167,8 @@ static match_table_t fsopt_tokens = { {Opt_noino32, "noino32"}, {Opt_fscache, "fsc"}, {Opt_nofscache, "nofsc"}, + {Opt_poolperm, "poolperm"}, + {Opt_nopoolperm, "nopoolperm"}, #ifdef CONFIG_CEPH_FS_POSIX_ACL {Opt_acl, "acl"}, #endif @@ -268,6 +272,13 @@ static int parse_fsopt_token(char *c, void *private) case Opt_nofscache: fsopt->flags &= ~CEPH_MOUNT_OPT_FSCACHE; break; + case Opt_poolperm: + fsopt->flags &= ~CEPH_MOUNT_OPT_NOPOOLPERM; + printk ("pool perm"); + break; + case Opt_nopoolperm: + fsopt->flags |= CEPH_MOUNT_OPT_NOPOOLPERM; + break; #ifdef CONFIG_CEPH_FS_POSIX_ACL case Opt_acl: fsopt->sb_flags |= MS_POSIXACL; @@ -436,6 +447,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) seq_puts(m, ",nodcache"); if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) seq_puts(m, ",fsc"); + if (fsopt->flags & CEPH_MOUNT_OPT_NOPOOLPERM) + seq_puts(m, ",nopoolperm"); #ifdef CONFIG_CEPH_FS_POSIX_ACL if (fsopt->sb_flags & MS_POSIXACL) @@ -609,6 +622,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc) */ struct kmem_cache *ceph_inode_cachep; struct kmem_cache *ceph_cap_cachep; +struct kmem_cache *ceph_cap_flush_cachep; struct kmem_cache *ceph_dentry_cachep; struct kmem_cache *ceph_file_cachep; @@ -634,6 +648,10 @@ static int __init init_caches(void) SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); if (ceph_cap_cachep == NULL) goto bad_cap; + ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); + if (ceph_cap_flush_cachep == NULL) + goto bad_cap_flush; ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD); @@ -652,6 +670,8 @@ static int __init init_caches(void) bad_file: kmem_cache_destroy(ceph_dentry_cachep); bad_dentry: + kmem_cache_destroy(ceph_cap_flush_cachep); +bad_cap_flush: kmem_cache_destroy(ceph_cap_cachep); bad_cap: kmem_cache_destroy(ceph_inode_cachep); @@ -668,6 +688,7 @@ static void destroy_caches(void) kmem_cache_destroy(ceph_inode_cachep); kmem_cache_destroy(ceph_cap_cachep); + kmem_cache_destroy(ceph_cap_flush_cachep); kmem_cache_destroy(ceph_dentry_cachep); kmem_cache_destroy(ceph_file_cachep); @@ -729,7 +750,7 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, req->r_ino1.ino = CEPH_INO_ROOT; req->r_ino1.snap = CEPH_NOSNAP; req->r_started = started; - req->r_timeout = fsc->client->options->mount_timeout * HZ; + req->r_timeout = fsc->client->options->mount_timeout; req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); req->r_num_caps = 2; err = ceph_mdsc_do_request(mdsc, NULL, req); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index fa20e1318..2f2460d23 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -35,6 +35,7 @@ #define CEPH_MOUNT_OPT_INO32 (1<<8) /* 32 bit inos */ #define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */ #define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */ +#define CEPH_MOUNT_OPT_NOPOOLPERM (1<<11) /* no pool permission check */ #define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES | \ CEPH_MOUNT_OPT_DCACHE) @@ -121,11 +122,21 @@ struct ceph_cap { struct rb_node ci_node; /* per-ci cap tree */ struct ceph_mds_session *session; struct list_head session_caps; /* per-session caplist */ - int mds; u64 cap_id; /* unique cap id (mds provided) */ - int issued; /* latest, from the mds */ - int implemented; /* implemented superset of issued (for revocation) */ - int mds_wanted; + union { + /* in-use caps */ + struct { + int issued; /* latest, from the mds */ + int implemented; /* implemented superset of + issued (for revocation) */ + int mds, mds_wanted; + }; + /* caps to release */ + struct { + u64 cap_ino; + int queue_release; + }; + }; u32 seq, issue_seq, mseq; u32 cap_gen; /* active/stale cycle */ unsigned long last_used; @@ -163,6 +174,7 @@ struct ceph_cap_snap { int writing; /* a sync write is still in progress */ int dirty_pages; /* dirty pages awaiting writeback */ bool inline_data; + bool need_flush; }; static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) @@ -174,6 +186,16 @@ static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) } } +struct ceph_cap_flush { + u64 tid; + int caps; + struct rb_node g_node; // global + union { + struct rb_node i_node; // inode + struct list_head list; + }; +}; + /* * The frag tree describes how a directory is fragmented, potentially across * multiple metadata servers. It is also used to indicate points where @@ -259,9 +281,9 @@ struct ceph_inode_info { u32 i_time_warp_seq; unsigned i_ceph_flags; - int i_ordered_count; - atomic_t i_release_count; - atomic_t i_complete_count; + atomic64_t i_release_count; + atomic64_t i_ordered_count; + atomic64_t i_complete_seq[2]; struct ceph_dir_layout i_dir_layout; struct ceph_file_layout i_layout; @@ -283,11 +305,11 @@ struct ceph_inode_info { struct ceph_cap *i_auth_cap; /* authoritative cap, if any */ unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */ struct list_head i_dirty_item, i_flushing_item; - u64 i_cap_flush_seq; /* we need to track cap writeback on a per-cap-bit basis, to allow * overlapping, pipelined cap flushes to the mds. we can probably * reduce the tid to 8 bits if we're concerned about inode size. */ - u16 i_cap_flush_last_tid, i_cap_flush_tid[CEPH_CAP_BITS]; + struct ceph_cap_flush *i_prealloc_cap_flush; + struct rb_root i_cap_flush_tree; wait_queue_head_t i_cap_wq; /* threads waiting on a capability */ unsigned long i_hold_caps_min; /* jiffies */ unsigned long i_hold_caps_max; /* jiffies */ @@ -438,36 +460,46 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, /* * Ceph inode. */ -#define CEPH_I_DIR_ORDERED 1 /* dentries in dir are ordered */ -#define CEPH_I_NODELAY 4 /* do not delay cap release */ -#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */ -#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */ +#define CEPH_I_DIR_ORDERED (1 << 0) /* dentries in dir are ordered */ +#define CEPH_I_NODELAY (1 << 1) /* do not delay cap release */ +#define CEPH_I_FLUSH (1 << 2) /* do not delay flush of dirty metadata */ +#define CEPH_I_NOFLUSH (1 << 3) /* do not flush dirty caps */ +#define CEPH_I_POOL_PERM (1 << 4) /* pool rd/wr bits are valid */ +#define CEPH_I_POOL_RD (1 << 5) /* can read from pool */ +#define CEPH_I_POOL_WR (1 << 6) /* can write to pool */ + static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, - int release_count, int ordered_count) + long long release_count, + long long ordered_count) { - atomic_set(&ci->i_complete_count, release_count); - if (ci->i_ordered_count == ordered_count) - ci->i_ceph_flags |= CEPH_I_DIR_ORDERED; - else - ci->i_ceph_flags &= ~CEPH_I_DIR_ORDERED; + smp_mb__before_atomic(); + atomic64_set(&ci->i_complete_seq[0], release_count); + atomic64_set(&ci->i_complete_seq[1], ordered_count); } static inline void __ceph_dir_clear_complete(struct ceph_inode_info *ci) { - atomic_inc(&ci->i_release_count); + atomic64_inc(&ci->i_release_count); +} + +static inline void __ceph_dir_clear_ordered(struct ceph_inode_info *ci) +{ + atomic64_inc(&ci->i_ordered_count); } static inline bool __ceph_dir_is_complete(struct ceph_inode_info *ci) { - return atomic_read(&ci->i_complete_count) == - atomic_read(&ci->i_release_count); + return atomic64_read(&ci->i_complete_seq[0]) == + atomic64_read(&ci->i_release_count); } static inline bool __ceph_dir_is_complete_ordered(struct ceph_inode_info *ci) { - return __ceph_dir_is_complete(ci) && - (ci->i_ceph_flags & CEPH_I_DIR_ORDERED); + return atomic64_read(&ci->i_complete_seq[0]) == + atomic64_read(&ci->i_release_count) && + atomic64_read(&ci->i_complete_seq[1]) == + atomic64_read(&ci->i_ordered_count); } static inline void ceph_dir_clear_complete(struct inode *inode) @@ -477,20 +509,13 @@ static inline void ceph_dir_clear_complete(struct inode *inode) static inline void ceph_dir_clear_ordered(struct inode *inode) { - struct ceph_inode_info *ci = ceph_inode(inode); - spin_lock(&ci->i_ceph_lock); - ci->i_ordered_count++; - ci->i_ceph_flags &= ~CEPH_I_DIR_ORDERED; - spin_unlock(&ci->i_ceph_lock); + __ceph_dir_clear_ordered(ceph_inode(inode)); } static inline bool ceph_dir_is_complete_ordered(struct inode *inode) { - struct ceph_inode_info *ci = ceph_inode(inode); - bool ret; - spin_lock(&ci->i_ceph_lock); - ret = __ceph_dir_is_complete_ordered(ci); - spin_unlock(&ci->i_ceph_lock); + bool ret = __ceph_dir_is_complete_ordered(ceph_inode(inode)); + smp_rmb(); return ret; } @@ -552,7 +577,10 @@ static inline int __ceph_caps_dirty(struct ceph_inode_info *ci) { return ci->i_dirty_caps | ci->i_flushing_caps; } -extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask); +extern struct ceph_cap_flush *ceph_alloc_cap_flush(void); +extern void ceph_free_cap_flush(struct ceph_cap_flush *cf); +extern int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask, + struct ceph_cap_flush **pcf); extern int __ceph_caps_revoking_other(struct ceph_inode_info *ci, struct ceph_cap *ocap, int mask); @@ -606,16 +634,20 @@ struct ceph_file_info { unsigned offset; /* offset of last chunk, adjusted for . and .. */ unsigned next_offset; /* offset of next chunk (last_name's + 1) */ char *last_name; /* last entry in previous chunk */ - struct dentry *dentry; /* next dentry (for dcache readdir) */ - int dir_release_count; - int dir_ordered_count; + long long dir_release_count; + long long dir_ordered_count; + int readdir_cache_idx; /* used for -o dirstat read() on directory thing */ char *dir_info; int dir_info_len; }; - +struct ceph_readdir_cache_control { + struct page *page; + struct dentry **dentries; + int index; +}; /* * A "snap realm" describes a subset of the file hierarchy sharing @@ -687,6 +719,7 @@ static inline int default_congestion_kb(void) /* snap.c */ +extern struct ceph_snap_context *ceph_empty_snapc; struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, u64 ino); extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc, @@ -713,8 +746,8 @@ extern void ceph_snap_exit(void); static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci) { return !list_empty(&ci->i_cap_snaps) && - list_entry(ci->i_cap_snaps.prev, struct ceph_cap_snap, - ci_item)->writing; + list_last_entry(&ci->i_cap_snaps, struct ceph_cap_snap, + ci_item)->writing; } /* inode.c */ @@ -838,12 +871,12 @@ extern void ceph_put_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap); extern int ceph_is_any_caps(struct inode *inode); -extern void __queue_cap_release(struct ceph_mds_session *session, u64 ino, - u64 cap_id, u32 migrate_seq, u32 issue_seq); extern void ceph_queue_caps_release(struct inode *inode); extern int ceph_write_inode(struct inode *inode, struct writeback_control *wbc); extern int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync); +extern void ceph_early_kick_flushing_caps(struct ceph_mds_client *mdsc, + struct ceph_mds_session *session); extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc, struct ceph_mds_session *session); extern struct ceph_cap *ceph_get_cap_for_mds(struct ceph_inode_info *ci, @@ -879,6 +912,9 @@ extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode); /* addr.c */ extern const struct address_space_operations ceph_aops; extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); +extern int ceph_uninline_data(struct file *filp, struct page *locked_page); +extern int ceph_pool_perm_check(struct ceph_inode_info *ci, int need); +extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); /* file.c */ extern const struct file_operations ceph_file_fops; @@ -890,7 +926,6 @@ extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, extern int ceph_release(struct inode *inode, struct file *filp); extern void ceph_fill_inline_data(struct inode *inode, struct page *locked_page, char *data, size_t len); -int ceph_uninline_data(struct file *filp, struct page *locked_page); /* dir.c */ extern const struct file_operations ceph_dir_fops; extern const struct file_operations ceph_snapdir_fops; @@ -911,6 +946,7 @@ extern void ceph_dentry_lru_del(struct dentry *dn); extern void ceph_invalidate_dentry_lease(struct dentry *dentry); extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn); extern struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry); +extern void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl); /* * our d_ops vary depending on whether the inode is live, diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index cd7ffad40..819163d83 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -911,6 +911,8 @@ int __ceph_setxattr(struct dentry *dentry, const char *name, struct inode *inode = d_inode(dentry); struct ceph_vxattr *vxattr; struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; + struct ceph_cap_flush *prealloc_cf = NULL; int issued; int err; int dirty = 0; @@ -920,6 +922,7 @@ int __ceph_setxattr(struct dentry *dentry, const char *name, char *newval = NULL; struct ceph_inode_xattr *xattr = NULL; int required_blob_size; + bool lock_snap_rwsem = false; if (!ceph_is_valid_xattr(name)) return -EOPNOTSUPP; @@ -948,12 +951,27 @@ int __ceph_setxattr(struct dentry *dentry, const char *name, if (!xattr) goto out; + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + goto out; + spin_lock(&ci->i_ceph_lock); retry: issued = __ceph_caps_issued(ci, NULL); - dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued)); if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) goto do_sync; + + if (!lock_snap_rwsem && !ci->i_head_snapc) { + lock_snap_rwsem = true; + if (!down_read_trylock(&mdsc->snap_rwsem)) { + spin_unlock(&ci->i_ceph_lock); + down_read(&mdsc->snap_rwsem); + spin_lock(&ci->i_ceph_lock); + goto retry; + } + } + + dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued)); __build_xattrs(inode); required_blob_size = __get_required_blob_size(ci, name_len, val_len); @@ -966,7 +984,7 @@ retry: dout(" preaallocating new blob size=%d\n", required_blob_size); blob = ceph_buffer_new(required_blob_size, GFP_NOFS); if (!blob) - goto out; + goto do_sync_unlocked; spin_lock(&ci->i_ceph_lock); if (ci->i_xattrs.prealloc_blob) ceph_buffer_put(ci->i_xattrs.prealloc_blob); @@ -978,21 +996,28 @@ retry: flags, value ? 1 : -1, &xattr); if (!err) { - dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL, + &prealloc_cf); ci->i_xattrs.dirty = true; inode->i_ctime = CURRENT_TIME; } spin_unlock(&ci->i_ceph_lock); + if (lock_snap_rwsem) + up_read(&mdsc->snap_rwsem); if (dirty) __mark_inode_dirty(inode, dirty); + ceph_free_cap_flush(prealloc_cf); return err; do_sync: spin_unlock(&ci->i_ceph_lock); do_sync_unlocked: + if (lock_snap_rwsem) + up_read(&mdsc->snap_rwsem); err = ceph_sync_setxattr(dentry, name, value, size, flags); out: + ceph_free_cap_flush(prealloc_cf); kfree(newname); kfree(newval); kfree(xattr); @@ -1044,10 +1069,13 @@ int __ceph_removexattr(struct dentry *dentry, const char *name) struct inode *inode = d_inode(dentry); struct ceph_vxattr *vxattr; struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_mds_client *mdsc = ceph_sb_to_client(dentry->d_sb)->mdsc; + struct ceph_cap_flush *prealloc_cf = NULL; int issued; int err; int required_blob_size; int dirty; + bool lock_snap_rwsem = false; if (!ceph_is_valid_xattr(name)) return -EOPNOTSUPP; @@ -1060,14 +1088,29 @@ int __ceph_removexattr(struct dentry *dentry, const char *name) if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN)) goto do_sync_unlocked; + prealloc_cf = ceph_alloc_cap_flush(); + if (!prealloc_cf) + return -ENOMEM; + err = -ENOMEM; spin_lock(&ci->i_ceph_lock); retry: issued = __ceph_caps_issued(ci, NULL); - dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued)); - if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) goto do_sync; + + if (!lock_snap_rwsem && !ci->i_head_snapc) { + lock_snap_rwsem = true; + if (!down_read_trylock(&mdsc->snap_rwsem)) { + spin_unlock(&ci->i_ceph_lock); + down_read(&mdsc->snap_rwsem); + spin_lock(&ci->i_ceph_lock); + goto retry; + } + } + + dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued)); + __build_xattrs(inode); required_blob_size = __get_required_blob_size(ci, 0, 0); @@ -1080,7 +1123,7 @@ retry: dout(" preaallocating new blob size=%d\n", required_blob_size); blob = ceph_buffer_new(required_blob_size, GFP_NOFS); if (!blob) - goto out; + goto do_sync_unlocked; spin_lock(&ci->i_ceph_lock); if (ci->i_xattrs.prealloc_blob) ceph_buffer_put(ci->i_xattrs.prealloc_blob); @@ -1090,18 +1133,24 @@ retry: err = __remove_xattr_by_name(ceph_inode(inode), name); - dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL); + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL, + &prealloc_cf); ci->i_xattrs.dirty = true; inode->i_ctime = CURRENT_TIME; spin_unlock(&ci->i_ceph_lock); + if (lock_snap_rwsem) + up_read(&mdsc->snap_rwsem); if (dirty) __mark_inode_dirty(inode, dirty); + ceph_free_cap_flush(prealloc_cf); return err; do_sync: spin_unlock(&ci->i_ceph_lock); do_sync_unlocked: + if (lock_snap_rwsem) + up_read(&mdsc->snap_rwsem); + ceph_free_cap_flush(prealloc_cf); err = ceph_send_removexattr(dentry, name); -out: return err; } diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index a2172f3f6..e7b478b49 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -192,6 +192,15 @@ config CIFS_SMB2 options are also slightly simpler (compared to CIFS) due to protocol improvements. +config CIFS_SMB311 + bool "SMB3.1.1 network file system support (Experimental)" + depends on CIFS_SMB2 && INET + + help + This enables experimental support for the newest, SMB3.1.1, dialect. + This dialect includes improved security negotiation features. + If unsure, say N + config CIFS_FSCACHE bool "Provide CIFS client caching support" depends on CIFS=m && FSCACHE || CIFS=y && FSCACHE=y diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 252f5c158..a782b2290 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -120,7 +120,7 @@ extern struct vfsmount *cifs_dfs_d_automount(struct path *path); #endif /* Functions related to symlinks */ -extern void *cifs_follow_link(struct dentry *direntry, struct nameidata *nd); +extern const char *cifs_follow_link(struct dentry *direntry, void **cookie); extern int cifs_readlink(struct dentry *direntry, char __user *buffer, int buflen); extern int cifs_symlink(struct inode *inode, struct dentry *direntry, diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 22b289a3b..b406a32de 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -171,6 +171,10 @@ enum smb_version { Smb_21, Smb_30, Smb_302, +#ifdef CONFIG_CIFS_SMB311 + Smb_311, +#endif /* SMB311 */ + Smb_version_err }; struct mid_q_entry; @@ -368,6 +372,8 @@ struct smb_version_operations { void (*new_lease_key)(struct cifs_fid *); int (*generate_signingkey)(struct cifs_ses *); int (*calc_signature)(struct smb_rqst *, struct TCP_Server_Info *); + int (*set_integrity)(const unsigned int, struct cifs_tcon *tcon, + struct cifsFileInfo *src_file); int (*query_mf_symlink)(unsigned int, struct cifs_tcon *, struct cifs_sb_info *, const unsigned char *, char *, unsigned int *); @@ -386,6 +392,9 @@ struct smb_version_operations { int (*clone_range)(const unsigned int, struct cifsFileInfo *src_file, struct cifsFileInfo *target_file, u64 src_off, u64 len, u64 dest_off); + int (*duplicate_extents)(const unsigned int, struct cifsFileInfo *src, + struct cifsFileInfo *target_file, u64 src_off, u64 len, + u64 dest_off); int (*validate_negotiate)(const unsigned int, struct cifs_tcon *); ssize_t (*query_all_EAs)(const unsigned int, struct cifs_tcon *, const unsigned char *, const unsigned char *, char *, @@ -1617,4 +1626,8 @@ extern struct smb_version_values smb30_values; #define SMB302_VERSION_STRING "3.02" /*extern struct smb_version_operations smb302_operations;*/ /* not needed yet */ extern struct smb_version_values smb302_values; +#define SMB311_VERSION_STRING "3.1.1" +#define ALT_SMB311_VERSION_STRING "3.11" +extern struct smb_version_operations smb311_operations; +extern struct smb_version_values smb311_values; #endif /* _CIFS_GLOB_H */ diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index 5f9822ac0..47b030da0 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -2255,6 +2255,8 @@ typedef struct { /* List of FileSystemAttributes - see 2.5.1 of MS-FSCC */ +#define FILE_SUPPORTS_SPARSE_VDL 0x10000000 /* faster nonsparse extend */ +#define FILE_SUPPORTS_BLOCK_REFCOUNTING 0x08000000 /* allow ioctl dup extents */ #define FILE_SUPPORT_INTEGRITY_STREAMS 0x04000000 #define FILE_SUPPORTS_USN_JOURNAL 0x02000000 #define FILE_SUPPORTS_OPEN_BY_FILE_ID 0x01000000 @@ -2310,6 +2312,16 @@ typedef struct { /* data block encoding of response to level 263 QPathInfo */ char FileName[1]; } __attribute__((packed)) FILE_ALL_INFO; /* level 0x107 QPathInfo */ +typedef struct { + __le64 AllocationSize; + __le64 EndOfFile; /* size ie offset to first free byte in file */ + __le32 NumberOfLinks; /* hard links */ + __u8 DeletePending; + __u8 Directory; + __u16 Pad; +} __attribute__((packed)) FILE_STANDARD_INFO; /* level 0x102 QPathInfo */ + + /* defines for enumerating possible values of the Unix type field below */ #define UNIX_FILE 0 #define UNIX_DIR 1 diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index f26ffbfc6..672ef35c9 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -625,9 +625,8 @@ CIFSSMBNegotiate(const unsigned int xid, struct cifs_ses *ses) server->negflavor = CIFS_NEGFLAVOR_UNENCAP; memcpy(ses->server->cryptkey, pSMBr->u.EncryptionKey, CIFS_CRYPTO_KEY_SIZE); - } else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC || - server->capabilities & CAP_EXTENDED_SECURITY) && - (pSMBr->EncryptionKeyLength == 0)) { + } else if (pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC || + server->capabilities & CAP_EXTENDED_SECURITY) { server->negflavor = CIFS_NEGFLAVOR_EXTENDED; rc = decode_ext_sec_blob(ses, pSMBr); } else if (server->sec_mode & SECMODE_PW_ENCRYPT) { diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 8383d5ea4..773f4dc77 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -280,6 +280,11 @@ static const match_table_t cifs_smb_version_tokens = { { Smb_21, SMB21_VERSION_STRING }, { Smb_30, SMB30_VERSION_STRING }, { Smb_302, SMB302_VERSION_STRING }, +#ifdef CONFIG_CIFS_SMB311 + { Smb_311, SMB311_VERSION_STRING }, + { Smb_311, ALT_SMB311_VERSION_STRING }, +#endif /* SMB311 */ + { Smb_version_err, NULL } }; static int ip_connect(struct TCP_Server_Info *server); @@ -1133,6 +1138,12 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol) vol->ops = &smb30_operations; /* currently identical with 3.0 */ vol->vals = &smb302_values; break; +#ifdef CONFIG_CIFS_SMB311 + case Smb_311: + vol->ops = &smb311_operations; + vol->vals = &smb311_values; + break; +#endif /* SMB311 */ #endif default: cifs_dbg(VFS, "Unknown vers= option specified: %s\n", value); @@ -3461,6 +3472,8 @@ try_mount_again: else if (ses) cifs_put_smb_ses(ses); + cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_POSIX_PATHS; + free_xid(xid); } #endif diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 8b7898b76..49b8b6e41 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -31,12 +31,15 @@ #include "cifsproto.h" #include "cifs_debug.h" #include "cifsfs.h" +#include <linux/btrfs.h> #define CIFS_IOCTL_MAGIC 0xCF #define CIFS_IOC_COPYCHUNK_FILE _IOW(CIFS_IOCTL_MAGIC, 3, int) +#define CIFS_IOC_SET_INTEGRITY _IO(CIFS_IOCTL_MAGIC, 4) static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file, - unsigned long srcfd, u64 off, u64 len, u64 destoff) + unsigned long srcfd, u64 off, u64 len, u64 destoff, + bool dup_extents) { int rc; struct cifsFileInfo *smb_file_target = dst_file->private_data; @@ -109,9 +112,14 @@ static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file, truncate_inode_pages_range(&target_inode->i_data, destoff, PAGE_CACHE_ALIGN(destoff + len)-1); - if (target_tcon->ses->server->ops->clone_range) + if (dup_extents && target_tcon->ses->server->ops->duplicate_extents) + rc = target_tcon->ses->server->ops->duplicate_extents(xid, + smb_file_src, smb_file_target, off, len, destoff); + else if (!dup_extents && target_tcon->ses->server->ops->clone_range) rc = target_tcon->ses->server->ops->clone_range(xid, smb_file_src, smb_file_target, off, len, destoff); + else + rc = -EOPNOTSUPP; /* force revalidate of size and timestamps of target file now that target is updated on the server */ @@ -205,7 +213,20 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) } break; case CIFS_IOC_COPYCHUNK_FILE: - rc = cifs_ioctl_clone(xid, filep, arg, 0, 0, 0); + rc = cifs_ioctl_clone(xid, filep, arg, 0, 0, 0, false); + break; + case BTRFS_IOC_CLONE: + rc = cifs_ioctl_clone(xid, filep, arg, 0, 0, 0, true); + break; + case CIFS_IOC_SET_INTEGRITY: + if (pSMBFile == NULL) + break; + tcon = tlink_tcon(pSMBFile->tlink); + if (tcon->ses->server->ops->set_integrity) + rc = tcon->ses->server->ops->set_integrity(xid, + tcon, pSMBFile); + else + rc = -EOPNOTSUPP; break; default: cifs_dbg(FYI, "unsupported ioctl\n"); diff --git a/fs/cifs/link.c b/fs/cifs/link.c index e6c707cc6..e3548f73b 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -626,8 +626,8 @@ cifs_hl_exit: return rc; } -void * -cifs_follow_link(struct dentry *direntry, struct nameidata *nd) +const char * +cifs_follow_link(struct dentry *direntry, void **cookie) { struct inode *inode = d_inode(direntry); int rc = -ENOMEM; @@ -643,16 +643,18 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd) tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) { - rc = PTR_ERR(tlink); - tlink = NULL; - goto out; + free_xid(xid); + return ERR_CAST(tlink); } tcon = tlink_tcon(tlink); server = tcon->ses->server; full_path = build_path_from_dentry(direntry); - if (!full_path) - goto out; + if (!full_path) { + free_xid(xid); + cifs_put_tlink(tlink); + return ERR_PTR(-ENOMEM); + } cifs_dbg(FYI, "Full path: %s inode = 0x%p\n", full_path, inode); @@ -670,17 +672,13 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd) &target_path, cifs_sb); kfree(full_path); -out: + free_xid(xid); + cifs_put_tlink(tlink); if (rc != 0) { kfree(target_path); - target_path = ERR_PTR(rc); + return ERR_PTR(rc); } - - free_xid(xid); - if (tlink) - cifs_put_tlink(tlink); - nd_set_link(nd, target_path); - return NULL; + return *cookie = target_path; } int diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 54daee5ad..df91bcf56 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -806,6 +806,53 @@ smb2_set_file_size(const unsigned int xid, struct cifs_tcon *tcon, cfile->fid.volatile_fid, cfile->pid, &eof, false); } +#ifdef CONFIG_CIFS_SMB311 +static int +smb2_duplicate_extents(const unsigned int xid, + struct cifsFileInfo *srcfile, + struct cifsFileInfo *trgtfile, u64 src_off, + u64 len, u64 dest_off) +{ + int rc; + unsigned int ret_data_len; + char *retbuf = NULL; + struct duplicate_extents_to_file dup_ext_buf; + struct cifs_tcon *tcon = tlink_tcon(trgtfile->tlink); + + /* server fileays advertise duplicate extent support with this flag */ + if ((le32_to_cpu(tcon->fsAttrInfo.Attributes) & + FILE_SUPPORTS_BLOCK_REFCOUNTING) == 0) + return -EOPNOTSUPP; + + dup_ext_buf.VolatileFileHandle = srcfile->fid.volatile_fid; + dup_ext_buf.PersistentFileHandle = srcfile->fid.persistent_fid; + dup_ext_buf.SourceFileOffset = cpu_to_le64(src_off); + dup_ext_buf.TargetFileOffset = cpu_to_le64(dest_off); + dup_ext_buf.ByteCount = cpu_to_le64(len); + cifs_dbg(FYI, "duplicate extents: src off %lld dst off %lld len %lld", + src_off, dest_off, len); + + rc = smb2_set_file_size(xid, tcon, trgtfile, dest_off + len, false); + if (rc) + goto duplicate_extents_out; + + rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid, + trgtfile->fid.volatile_fid, + FSCTL_DUPLICATE_EXTENTS_TO_FILE, + true /* is_fsctl */, (char *)&dup_ext_buf, + sizeof(struct duplicate_extents_to_file), + (char **)&retbuf, + &ret_data_len); + + if (ret_data_len > 0) + cifs_dbg(FYI, "non-zero response length in duplicate extents"); + +duplicate_extents_out: + return rc; +} +#endif /* CONFIG_CIFS_SMB311 */ + + static int smb2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, struct cifsFileInfo *cfile) @@ -815,6 +862,28 @@ smb2_set_compression(const unsigned int xid, struct cifs_tcon *tcon, } static int +smb3_set_integrity(const unsigned int xid, struct cifs_tcon *tcon, + struct cifsFileInfo *cfile) +{ + struct fsctl_set_integrity_information_req integr_info; + char *retbuf = NULL; + unsigned int ret_data_len; + + integr_info.ChecksumAlgorithm = cpu_to_le16(CHECKSUM_TYPE_UNCHANGED); + integr_info.Flags = 0; + integr_info.Reserved = 0; + + return SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, + FSCTL_SET_INTEGRITY_INFORMATION, + true /* is_fsctl */, (char *)&integr_info, + sizeof(struct fsctl_set_integrity_information_req), + (char **)&retbuf, + &ret_data_len); + +} + +static int smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon, const char *path, struct cifs_sb_info *cifs_sb, struct cifs_fid *fid, __u16 search_flags, @@ -1624,6 +1693,7 @@ struct smb_version_operations smb30_operations = { .new_lease_key = smb2_new_lease_key, .generate_signingkey = generate_smb3signingkey, .calc_signature = smb3_calc_signature, + .set_integrity = smb3_set_integrity, .is_read_op = smb21_is_read_op, .set_oplock_level = smb3_set_oplock_level, .create_lease_buf = smb3_create_lease_buf, @@ -1635,6 +1705,94 @@ struct smb_version_operations smb30_operations = { .fallocate = smb3_fallocate, }; +#ifdef CONFIG_CIFS_SMB311 +struct smb_version_operations smb311_operations = { + .compare_fids = smb2_compare_fids, + .setup_request = smb2_setup_request, + .setup_async_request = smb2_setup_async_request, + .check_receive = smb2_check_receive, + .add_credits = smb2_add_credits, + .set_credits = smb2_set_credits, + .get_credits_field = smb2_get_credits_field, + .get_credits = smb2_get_credits, + .wait_mtu_credits = smb2_wait_mtu_credits, + .get_next_mid = smb2_get_next_mid, + .read_data_offset = smb2_read_data_offset, + .read_data_length = smb2_read_data_length, + .map_error = map_smb2_to_linux_error, + .find_mid = smb2_find_mid, + .check_message = smb2_check_message, + .dump_detail = smb2_dump_detail, + .clear_stats = smb2_clear_stats, + .print_stats = smb2_print_stats, + .dump_share_caps = smb2_dump_share_caps, + .is_oplock_break = smb2_is_valid_oplock_break, + .downgrade_oplock = smb2_downgrade_oplock, + .need_neg = smb2_need_neg, + .negotiate = smb2_negotiate, + .negotiate_wsize = smb2_negotiate_wsize, + .negotiate_rsize = smb2_negotiate_rsize, + .sess_setup = SMB2_sess_setup, + .logoff = SMB2_logoff, + .tree_connect = SMB2_tcon, + .tree_disconnect = SMB2_tdis, + .qfs_tcon = smb3_qfs_tcon, + .is_path_accessible = smb2_is_path_accessible, + .can_echo = smb2_can_echo, + .echo = SMB2_echo, + .query_path_info = smb2_query_path_info, + .get_srv_inum = smb2_get_srv_inum, + .query_file_info = smb2_query_file_info, + .set_path_size = smb2_set_path_size, + .set_file_size = smb2_set_file_size, + .set_file_info = smb2_set_file_info, + .set_compression = smb2_set_compression, + .mkdir = smb2_mkdir, + .mkdir_setinfo = smb2_mkdir_setinfo, + .rmdir = smb2_rmdir, + .unlink = smb2_unlink, + .rename = smb2_rename_path, + .create_hardlink = smb2_create_hardlink, + .query_symlink = smb2_query_symlink, + .query_mf_symlink = smb3_query_mf_symlink, + .create_mf_symlink = smb3_create_mf_symlink, + .open = smb2_open_file, + .set_fid = smb2_set_fid, + .close = smb2_close_file, + .flush = smb2_flush_file, + .async_readv = smb2_async_readv, + .async_writev = smb2_async_writev, + .sync_read = smb2_sync_read, + .sync_write = smb2_sync_write, + .query_dir_first = smb2_query_dir_first, + .query_dir_next = smb2_query_dir_next, + .close_dir = smb2_close_dir, + .calc_smb_size = smb2_calc_size, + .is_status_pending = smb2_is_status_pending, + .oplock_response = smb2_oplock_response, + .queryfs = smb2_queryfs, + .mand_lock = smb2_mand_lock, + .mand_unlock_range = smb2_unlock_range, + .push_mand_locks = smb2_push_mandatory_locks, + .get_lease_key = smb2_get_lease_key, + .set_lease_key = smb2_set_lease_key, + .new_lease_key = smb2_new_lease_key, + .generate_signingkey = generate_smb3signingkey, + .calc_signature = smb3_calc_signature, + .set_integrity = smb3_set_integrity, + .is_read_op = smb21_is_read_op, + .set_oplock_level = smb3_set_oplock_level, + .create_lease_buf = smb3_create_lease_buf, + .parse_lease_buf = smb3_parse_lease_buf, + .clone_range = smb2_clone_range, + .duplicate_extents = smb2_duplicate_extents, +/* .validate_negotiate = smb3_validate_negotiate, */ /* not used in 3.11 */ + .wp_retry_size = smb2_wp_retry_size, + .dir_needs_close = smb2_dir_needs_close, + .fallocate = smb3_fallocate, +}; +#endif /* CIFS_SMB311 */ + struct smb_version_values smb20_values = { .version_string = SMB20_VERSION_STRING, .protocol_id = SMB20_PROT_ID, @@ -1714,3 +1872,25 @@ struct smb_version_values smb302_values = { .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED, .create_lease_size = sizeof(struct create_lease_v2), }; + +#ifdef CONFIG_CIFS_SMB311 +struct smb_version_values smb311_values = { + .version_string = SMB311_VERSION_STRING, + .protocol_id = SMB311_PROT_ID, + .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU, + .large_lock_type = 0, + .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, + .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, + .unlock_lock_type = SMB2_LOCKFLAG_UNLOCK, + .header_size = sizeof(struct smb2_hdr), + .max_header_size = MAX_SMB2_HDR_SIZE, + .read_rsp_size = sizeof(struct smb2_read_rsp) - 1, + .lock_cmd = SMB2_LOCK, + .cap_unix = 0, + .cap_nt_find = SMB2_NT_FIND, + .cap_large_files = SMB2_LARGE_FILES, + .signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED, + .signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED, + .create_lease_size = sizeof(struct create_lease_v2), +}; +#endif /* SMB311 */ diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 54cbe19d9..b8b4f08ee 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -304,6 +304,59 @@ small_smb2_init(__le16 smb2_command, struct cifs_tcon *tcon, return rc; } +#ifdef CONFIG_CIFS_SMB311 +/* offset is sizeof smb2_negotiate_req - 4 but rounded up to 8 bytes */ +#define OFFSET_OF_NEG_CONTEXT 0x68 /* sizeof(struct smb2_negotiate_req) - 4 */ + + +#define SMB2_PREAUTH_INTEGRITY_CAPABILITIES cpu_to_le16(1) +#define SMB2_ENCRYPTION_CAPABILITIES cpu_to_le16(2) + +static void +build_preauth_ctxt(struct smb2_preauth_neg_context *pneg_ctxt) +{ + pneg_ctxt->ContextType = SMB2_PREAUTH_INTEGRITY_CAPABILITIES; + pneg_ctxt->DataLength = cpu_to_le16(38); + pneg_ctxt->HashAlgorithmCount = cpu_to_le16(1); + pneg_ctxt->SaltLength = cpu_to_le16(SMB311_SALT_SIZE); + get_random_bytes(pneg_ctxt->Salt, SMB311_SALT_SIZE); + pneg_ctxt->HashAlgorithms = SMB2_PREAUTH_INTEGRITY_SHA512; +} + +static void +build_encrypt_ctxt(struct smb2_encryption_neg_context *pneg_ctxt) +{ + pneg_ctxt->ContextType = SMB2_ENCRYPTION_CAPABILITIES; + pneg_ctxt->DataLength = cpu_to_le16(6); + pneg_ctxt->CipherCount = cpu_to_le16(2); + pneg_ctxt->Ciphers[0] = SMB2_ENCRYPTION_AES128_GCM; + pneg_ctxt->Ciphers[1] = SMB2_ENCRYPTION_AES128_CCM; +} + +static void +assemble_neg_contexts(struct smb2_negotiate_req *req) +{ + + /* +4 is to account for the RFC1001 len field */ + char *pneg_ctxt = (char *)req + OFFSET_OF_NEG_CONTEXT + 4; + + build_preauth_ctxt((struct smb2_preauth_neg_context *)pneg_ctxt); + /* Add 2 to size to round to 8 byte boundary */ + pneg_ctxt += 2 + sizeof(struct smb2_preauth_neg_context); + build_encrypt_ctxt((struct smb2_encryption_neg_context *)pneg_ctxt); + req->NegotiateContextOffset = cpu_to_le32(OFFSET_OF_NEG_CONTEXT); + req->NegotiateContextCount = cpu_to_le16(2); + inc_rfc1001_len(req, 4 + sizeof(struct smb2_preauth_neg_context) + 2 + + sizeof(struct smb2_encryption_neg_context)); /* calculate hash */ +} +#else +static void assemble_neg_contexts(struct smb2_negotiate_req *req) +{ + return; +} +#endif /* SMB311 */ + + /* * * SMB2 Worker functions follow: @@ -363,10 +416,12 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) /* ClientGUID must be zero for SMB2.02 dialect */ if (ses->server->vals->protocol_id == SMB20_PROT_ID) memset(req->ClientGUID, 0, SMB2_CLIENT_GUID_SIZE); - else + else { memcpy(req->ClientGUID, server->client_guid, SMB2_CLIENT_GUID_SIZE); - + if (ses->server->vals->protocol_id == SMB311_PROT_ID) + assemble_neg_contexts(req); + } iov[0].iov_base = (char *)req; /* 4 for rfc1002 length field */ iov[0].iov_len = get_rfc1002_length(req) + 4; @@ -393,8 +448,12 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) cifs_dbg(FYI, "negotiated smb3.0 dialect\n"); else if (rsp->DialectRevision == cpu_to_le16(SMB302_PROT_ID)) cifs_dbg(FYI, "negotiated smb3.02 dialect\n"); +#ifdef CONFIG_CIFS_SMB311 + else if (rsp->DialectRevision == cpu_to_le16(SMB311_PROT_ID)) + cifs_dbg(FYI, "negotiated smb3.1.1 dialect\n"); +#endif /* SMB311 */ else { - cifs_dbg(VFS, "Illegal dialect returned by server %d\n", + cifs_dbg(VFS, "Illegal dialect returned by server 0x%x\n", le16_to_cpu(rsp->DialectRevision)); rc = -EIO; goto neg_exit; @@ -572,7 +631,7 @@ ssetup_ntlmssp_authenticate: return rc; req->hdr.SessionId = 0; /* First session, not a reauthenticate */ - req->VcNumber = 0; /* MBZ */ + req->Flags = 0; /* MBZ */ /* to enable echos and oplocks */ req->hdr.CreditRequest = cpu_to_le16(3); diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 70867d54f..451108284 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -136,9 +136,6 @@ struct smb2_transform_hdr { __u64 SessionId; } __packed; -/* Encryption Algorithms */ -#define SMB2_ENCRYPTION_AES128_CCM cpu_to_le16(0x0001) - /* * SMB2 flag definitions */ @@ -191,7 +188,10 @@ struct smb2_negotiate_req { __le16 Reserved; /* MBZ */ __le32 Capabilities; __u8 ClientGUID[SMB2_CLIENT_GUID_SIZE]; - __le64 ClientStartTime; /* MBZ */ + /* In SMB3.02 and earlier next three were MBZ le64 ClientStartTime */ + __le32 NegotiateContextOffset; /* SMB3.1.1 only. MBZ earlier */ + __le16 NegotiateContextCount; /* SMB3.1.1 only. MBZ earlier */ + __le16 Reserved2; __le16 Dialects[1]; /* One dialect (vers=) at a time for now */ } __packed; @@ -200,6 +200,7 @@ struct smb2_negotiate_req { #define SMB21_PROT_ID 0x0210 #define SMB30_PROT_ID 0x0300 #define SMB302_PROT_ID 0x0302 +#define SMB311_PROT_ID 0x0311 #define BAD_PROT_ID 0xFFFF /* SecurityMode flags */ @@ -217,12 +218,38 @@ struct smb2_negotiate_req { #define SMB2_NT_FIND 0x00100000 #define SMB2_LARGE_FILES 0x00200000 +#define SMB311_SALT_SIZE 32 +/* Hash Algorithm Types */ +#define SMB2_PREAUTH_INTEGRITY_SHA512 cpu_to_le16(0x0001) + +struct smb2_preauth_neg_context { + __le16 ContextType; /* 1 */ + __le16 DataLength; + __le32 Reserved; + __le16 HashAlgorithmCount; /* 1 */ + __le16 SaltLength; + __le16 HashAlgorithms; /* HashAlgorithms[0] since only one defined */ + __u8 Salt[SMB311_SALT_SIZE]; +} __packed; + +/* Encryption Algorithms Ciphers */ +#define SMB2_ENCRYPTION_AES128_CCM cpu_to_le16(0x0001) +#define SMB2_ENCRYPTION_AES128_GCM cpu_to_le16(0x0002) + +struct smb2_encryption_neg_context { + __le16 ContextType; /* 2 */ + __le16 DataLength; + __le32 Reserved; + __le16 CipherCount; /* AES-128-GCM and AES-128-CCM */ + __le16 Ciphers[2]; /* Ciphers[0] since only one used now */ +} __packed; + struct smb2_negotiate_rsp { struct smb2_hdr hdr; __le16 StructureSize; /* Must be 65 */ __le16 SecurityMode; __le16 DialectRevision; - __le16 Reserved; /* MBZ */ + __le16 NegotiateContextCount; /* Prior to SMB3.1.1 was Reserved & MBZ */ __u8 ServerGUID[16]; __le32 Capabilities; __le32 MaxTransactSize; @@ -232,14 +259,18 @@ struct smb2_negotiate_rsp { __le64 ServerStartTime; __le16 SecurityBufferOffset; __le16 SecurityBufferLength; - __le32 Reserved2; /* may be any value, ignore */ + __le32 NegotiateContextOffset; /* Pre:SMB3.1.1 was reserved/ignored */ __u8 Buffer[1]; /* variable length GSS security buffer */ } __packed; +/* Flags */ +#define SMB2_SESSION_REQ_FLAG_BINDING 0x01 +#define SMB2_SESSION_REQ_FLAG_ENCRYPT_DATA 0x04 + struct smb2_sess_setup_req { struct smb2_hdr hdr; __le16 StructureSize; /* Must be 25 */ - __u8 VcNumber; + __u8 Flags; __u8 SecurityMode; __le32 Capabilities; __le32 Channel; @@ -274,10 +305,13 @@ struct smb2_logoff_rsp { __le16 Reserved; } __packed; +/* Flags/Reserved for SMB3.1.1 */ +#define SMB2_SHAREFLAG_CLUSTER_RECONNECT 0x0001 + struct smb2_tree_connect_req { struct smb2_hdr hdr; __le16 StructureSize; /* Must be 9 */ - __le16 Reserved; + __le16 Reserved; /* Flags in SMB3.1.1 */ __le16 PathOffset; __le16 PathLength; __u8 Buffer[1]; /* variable length */ @@ -587,6 +621,29 @@ struct copychunk_ioctl_rsp { __le32 TotalBytesWritten; } __packed; +struct fsctl_set_integrity_information_req { + __le16 ChecksumAlgorithm; + __le16 Reserved; + __le32 Flags; +} __packed; + +struct fsctl_get_integrity_information_rsp { + __le16 ChecksumAlgorithm; + __le16 Reserved; + __le32 Flags; + __le32 ChecksumChunkSizeInBytes; + __le32 ClusterSizeInBytes; +} __packed; + +/* Integrity ChecksumAlgorithm choices for above */ +#define CHECKSUM_TYPE_NONE 0x0000 +#define CHECKSUM_TYPE_CRC64 0x0002 +#define CHECKSUM_TYPE_UNCHANGED 0xFFFF /* set only */ + +/* Integrity flags for above */ +#define FSCTL_INTEGRITY_FLAG_CHECKSUM_ENFORCEMENT_OFF 0x00000001 + + struct validate_negotiate_info_req { __le32 Capabilities; __u8 Guid[SMB2_CLIENT_GUID_SIZE]; @@ -620,6 +677,14 @@ struct compress_ioctl { __le16 CompressionState; /* See cifspdu.h for possible flag values */ } __packed; +struct duplicate_extents_to_file { + __u64 PersistentFileHandle; /* source file handle, opaque endianness */ + __u64 VolatileFileHandle; + __le64 SourceFileOffset; + __le64 TargetFileOffset; + __le64 ByteCount; /* Bytes to be copied */ +} __packed; + struct smb2_ioctl_req { struct smb2_hdr hdr; __le16 StructureSize; /* Must be 57 */ diff --git a/fs/cifs/smbfsctl.h b/fs/cifs/smbfsctl.h index 83efa5953..a639d0dab 100644 --- a/fs/cifs/smbfsctl.h +++ b/fs/cifs/smbfsctl.h @@ -75,10 +75,13 @@ #define FSCTL_QUERY_SPARING_INFO 0x00090138 /* BB add struct */ #define FSCTL_SET_ZERO_ON_DEALLOC 0x00090194 /* BB add struct */ #define FSCTL_SET_SHORT_NAME_BEHAVIOR 0x000901B4 /* BB add struct */ +#define FSCTL_GET_INTEGRITY_INFORMATION 0x0009027C #define FSCTL_QUERY_ALLOCATED_RANGES 0x000940CF /* BB add struct */ #define FSCTL_SET_DEFECT_MANAGEMENT 0x00098134 /* BB add struct */ #define FSCTL_FILE_LEVEL_TRIM 0x00098208 /* BB add struct */ +#define FSCTL_DUPLICATE_EXTENTS_TO_FILE 0x00098344 #define FSCTL_SIS_LINK_FILES 0x0009C104 +#define FSCTL_SET_INTEGRITY_INFORMATION 0x0009C280 #define FSCTL_PIPE_PEEK 0x0011400C /* BB add struct */ #define FSCTL_PIPE_TRANSCEIVE 0x0011C017 /* BB add struct */ /* strange that the number for this op is not sequential with previous op */ diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h index d6f7a76a1..f829fe963 100644 --- a/fs/coda/coda_linux.h +++ b/fs/coda/coda_linux.h @@ -79,7 +79,7 @@ void coda_sysctl_clean(void); static inline struct coda_inode_info *ITOC(struct inode *inode) { - return list_entry(inode, struct coda_inode_info, vfs_inode); + return container_of(inode, struct coda_inode_info, vfs_inode); } static __inline__ struct CodaFid *coda_i2f(struct inode *inode) diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 6b8e2f091..48851f6ea 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -896,6 +896,7 @@ COMPATIBLE_IOCTL(FIGETBSZ) /* 'X' - originally XFS but some now in the VFS */ COMPATIBLE_IOCTL(FIFREEZE) COMPATIBLE_IOCTL(FITHAW) +COMPATIBLE_IOCTL(FITRIM) COMPATIBLE_IOCTL(KDGETKEYCODE) COMPATIBLE_IOCTL(KDSETKEYCODE) COMPATIBLE_IOCTL(KDGKBTYPE) diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 8d89f5fd0..eae87575e 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -236,7 +236,7 @@ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent) if (dentry) { spin_lock(&dentry->d_lock); - if (!d_unhashed(dentry) && d_really_is_positive(dentry)) { + if (simple_positive(dentry)) { dget_dlock(dentry); __d_drop(dentry); spin_unlock(&dentry->d_lock); diff --git a/fs/configfs/item.c b/fs/configfs/item.c index e65f9ffbb..b863a09cd 100644 --- a/fs/configfs/item.c +++ b/fs/configfs/item.c @@ -47,12 +47,11 @@ static void config_item_release(struct kref *kref); * config_item_init - initialize item. * @item: item in question. */ -void config_item_init(struct config_item *item) +static void config_item_init(struct config_item *item) { kref_init(&item->ci_kref); INIT_LIST_HEAD(&item->ci_entry); } -EXPORT_SYMBOL(config_item_init); /** * config_item_set_name - Set the name of an item @@ -116,7 +115,7 @@ void config_item_init_type_name(struct config_item *item, const char *name, struct config_item_type *type) { - config_item_set_name(item, name); + config_item_set_name(item, "%s", name); item->ci_type = type; config_item_init(item); } @@ -125,7 +124,7 @@ EXPORT_SYMBOL(config_item_init_type_name); void config_group_init_type_name(struct config_group *group, const char *name, struct config_item_type *type) { - config_item_set_name(&group->cg_item, name); + config_item_set_name(&group->cg_item, "%s", name); group->cg_item.ci_type = type; config_group_init(group); } diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index cc9f2546e..ec5c8325b 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -279,36 +279,27 @@ static int configfs_getlink(struct dentry *dentry, char * path) } -static void *configfs_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *configfs_follow_link(struct dentry *dentry, void **cookie) { - int error = -ENOMEM; unsigned long page = get_zeroed_page(GFP_KERNEL); + int error; - if (page) { - error = configfs_getlink(dentry, (char *)page); - if (!error) { - nd_set_link(nd, (char *)page); - return (void *)page; - } - } - - nd_set_link(nd, ERR_PTR(error)); - return NULL; -} + if (!page) + return ERR_PTR(-ENOMEM); -static void configfs_put_link(struct dentry *dentry, struct nameidata *nd, - void *cookie) -{ - if (cookie) { - unsigned long page = (unsigned long)cookie; - free_page(page); + error = configfs_getlink(dentry, (char *)page); + if (!error) { + return *cookie = (void *)page; } + + free_page(page); + return ERR_PTR(error); } const struct inode_operations configfs_symlink_inode_operations = { .follow_link = configfs_follow_link, .readlink = generic_readlink, - .put_link = configfs_put_link, + .put_link = free_page_put_link, .setattr = configfs_setattr, }; diff --git a/fs/coredump.c b/fs/coredump.c index bbbe139ab..c5ecde6f3 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -70,7 +70,8 @@ static int expand_corename(struct core_name *cn, int size) return 0; } -static int cn_vprintf(struct core_name *cn, const char *fmt, va_list arg) +static __printf(2, 0) int cn_vprintf(struct core_name *cn, const char *fmt, + va_list arg) { int free, need; va_list arg_copy; @@ -93,7 +94,7 @@ again: return -ENOMEM; } -static int cn_printf(struct core_name *cn, const char *fmt, ...) +static __printf(2, 3) int cn_printf(struct core_name *cn, const char *fmt, ...) { va_list arg; int ret; @@ -105,7 +106,8 @@ static int cn_printf(struct core_name *cn, const char *fmt, ...) return ret; } -static int cn_esc_printf(struct core_name *cn, const char *fmt, ...) +static __printf(2, 3) +int cn_esc_printf(struct core_name *cn, const char *fmt, ...) { int cur = cn->used; va_list arg; @@ -138,7 +140,7 @@ static int cn_print_exe_file(struct core_name *cn) goto put_exe_file; } - path = d_path(&exe_file->f_path, pathbuf, PATH_MAX); + path = file_path(exe_file, pathbuf, PATH_MAX); if (IS_ERR(path)) { ret = PTR_ERR(path); goto free_buf; @@ -209,11 +211,15 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm) break; /* uid */ case 'u': - err = cn_printf(cn, "%d", cred->uid); + err = cn_printf(cn, "%u", + from_kuid(&init_user_ns, + cred->uid)); break; /* gid */ case 'g': - err = cn_printf(cn, "%d", cred->gid); + err = cn_printf(cn, "%u", + from_kgid(&init_user_ns, + cred->gid)); break; case 'd': err = cn_printf(cn, "%d", @@ -221,7 +227,8 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm) break; /* signal that caused the coredump */ case 's': - err = cn_printf(cn, "%ld", cprm->siginfo->si_signo); + err = cn_printf(cn, "%d", + cprm->siginfo->si_signo); break; /* UNIX time of coredump */ case 't': { @@ -155,7 +155,7 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, } if (iov_iter_rw(iter) == WRITE) - len = copy_from_iter(addr, max - pos, iter); + len = copy_from_iter_nocache(addr, max - pos, iter); else if (!hole) len = copy_to_iter(addr, max - pos, iter); else @@ -209,7 +209,8 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, } /* Protects against truncate */ - inode_dio_begin(inode); + if (!(flags & DIO_SKIP_DIO_COUNT)) + inode_dio_begin(inode); retval = dax_io(inode, iter, pos, end, get_block, &bh); @@ -219,7 +220,8 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, if ((retval > 0) && end_io) end_io(iocb, pos, retval, bh.b_private); - inode_dio_end(inode); + if (!(flags & DIO_SKIP_DIO_COUNT)) + inode_dio_end(inode); out: return retval; } @@ -309,14 +311,27 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, out: i_mmap_unlock_read(mapping); - if (bh->b_end_io) - bh->b_end_io(bh, 1); - return error; } -static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - get_block_t get_block) +/** + * __dax_fault - handle a page fault on a DAX file + * @vma: The virtual memory area where the fault occurred + * @vmf: The description of the fault + * @get_block: The filesystem method used to translate file offsets to blocks + * @complete_unwritten: The filesystem method used to convert unwritten blocks + * to written so the data written to them is exposed. This is required for + * required by write faults for filesystems that will return unwritten + * extent mappings from @get_block, but it is optional for reads as + * dax_insert_mapping() will always zero unwritten blocks. If the fs does + * not support unwritten extents, the it should pass NULL. + * + * When a page fault occurs, filesystems may call this helper in their + * fault handler for DAX files. __dax_fault() assumes the caller has done all + * the necessary locking for the page fault to proceed successfully. + */ +int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, + get_block_t get_block, dax_iodone_t complete_unwritten) { struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; @@ -417,7 +432,23 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, page_cache_release(page); } + /* + * If we successfully insert the new mapping over an unwritten extent, + * we need to ensure we convert the unwritten extent. If there is an + * error inserting the mapping, the filesystem needs to leave it as + * unwritten to prevent exposure of the stale underlying data to + * userspace, but we still need to call the completion function so + * the private resources on the mapping buffer can be released. We + * indicate what the callback should do via the uptodate variable, same + * as for normal BH based IO completions. + */ error = dax_insert_mapping(inode, &bh, vma, vmf); + if (buffer_unwritten(&bh)) { + if (complete_unwritten) + complete_unwritten(&bh, !error); + else + WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE)); + } out: if (error == -ENOMEM) @@ -434,6 +465,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, } goto out; } +EXPORT_SYMBOL(__dax_fault); /** * dax_fault - handle a page fault on a DAX file @@ -445,7 +477,7 @@ static int do_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, * fault handler for DAX files. */ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - get_block_t get_block) + get_block_t get_block, dax_iodone_t complete_unwritten) { int result; struct super_block *sb = file_inode(vma->vm_file)->i_sb; @@ -454,7 +486,7 @@ int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, sb_start_pagefault(sb); file_update_time(vma->vm_file); } - result = do_dax_fault(vma, vmf, get_block); + result = __dax_fault(vma, vmf, get_block, complete_unwritten); if (vmf->flags & FAULT_FLAG_WRITE) sb_end_pagefault(sb); diff --git a/fs/dcache.c b/fs/dcache.c index 7c39f19bd..9b5fe503f 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -322,17 +322,17 @@ static void dentry_free(struct dentry *dentry) } /** - * dentry_rcuwalk_barrier - invalidate in-progress rcu-walk lookups + * dentry_rcuwalk_invalidate - invalidate in-progress rcu-walk lookups * @dentry: the target dentry * After this call, in-progress rcu-walk path lookup will fail. This * should be called after unhashing, and after changing d_inode (if * the dentry has not already been unhashed). */ -static inline void dentry_rcuwalk_barrier(struct dentry *dentry) +static inline void dentry_rcuwalk_invalidate(struct dentry *dentry) { - assert_spin_locked(&dentry->d_lock); - /* Go through a barrier */ - write_seqcount_barrier(&dentry->d_seq); + lockdep_assert_held(&dentry->d_lock); + /* Go through am invalidation barrier */ + write_seqcount_invalidate(&dentry->d_seq); } /* @@ -372,7 +372,7 @@ static void dentry_unlink_inode(struct dentry * dentry) struct inode *inode = dentry->d_inode; __d_clear_type_and_inode(dentry); hlist_del_init(&dentry->d_u.d_alias); - dentry_rcuwalk_barrier(dentry); + dentry_rcuwalk_invalidate(dentry); spin_unlock(&dentry->d_lock); spin_unlock(&inode->i_lock); if (!inode->i_nlink) @@ -494,7 +494,7 @@ void __d_drop(struct dentry *dentry) __hlist_bl_del(&dentry->d_hash); dentry->d_hash.pprev = NULL; hlist_bl_unlock(b); - dentry_rcuwalk_barrier(dentry); + dentry_rcuwalk_invalidate(dentry); } } EXPORT_SYMBOL(__d_drop); @@ -1167,7 +1167,7 @@ enum d_walk_ret { * * The @enter() and @finish() callbacks are called with d_lock held. */ -void d_walk(struct dentry *parent, void *data, +static void d_walk(struct dentry *parent, void *data, enum d_walk_ret (*enter)(void *, struct dentry *), void (*finish)(void *)) { @@ -1272,7 +1272,6 @@ rename_retry: seq = 1; goto again; } -EXPORT_SYMBOL(d_walk); /* * Search for at least 1 mount point in the dentry's subdirs. @@ -1677,7 +1676,8 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) DCACHE_OP_COMPARE | DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE | - DCACHE_OP_DELETE )); + DCACHE_OP_DELETE | + DCACHE_OP_SELECT_INODE)); dentry->d_op = op; if (!op) return; @@ -1693,6 +1693,8 @@ void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op) dentry->d_flags |= DCACHE_OP_DELETE; if (op->d_prune) dentry->d_flags |= DCACHE_OP_PRUNE; + if (op->d_select_inode) + dentry->d_flags |= DCACHE_OP_SELECT_INODE; } EXPORT_SYMBOL(d_set_d_op); @@ -1756,7 +1758,7 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) if (inode) hlist_add_head(&dentry->d_u.d_alias, &inode->i_dentry); __d_set_inode_and_type(dentry, inode, add_flags); - dentry_rcuwalk_barrier(dentry); + dentry_rcuwalk_invalidate(dentry); spin_unlock(&dentry->d_lock); fsnotify_d_instantiate(dentry, inode); } @@ -3440,22 +3442,15 @@ void __init vfs_caches_init_early(void) inode_init_early(); } -void __init vfs_caches_init(unsigned long mempages) +void __init vfs_caches_init(void) { - unsigned long reserve; - - /* Base hash sizes on available memory, with a reserve equal to - 150% of current kernel size */ - - reserve = min((mempages - nr_free_pages()) * 3/2, mempages - 1); - mempages -= reserve; - names_cachep = kmem_cache_create("names_cache", PATH_MAX, 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); dcache_init(); inode_init(); - files_init(mempages); + files_init(); + files_maxfiles_init(); mnt_init(); bdev_cache_init(); chrdev_init(); diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 830a7e76f..284f9aa00 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -17,7 +17,6 @@ #include <linux/fs.h> #include <linux/seq_file.h> #include <linux/pagemap.h> -#include <linux/namei.h> #include <linux/debugfs.h> #include <linux/io.h> #include <linux/slab.h> @@ -43,17 +42,6 @@ const struct file_operations debugfs_file_operations = { .llseek = noop_llseek, }; -static void *debugfs_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - nd_set_link(nd, d_inode(dentry)->i_private); - return NULL; -} - -const struct inode_operations debugfs_link_operations = { - .readlink = generic_readlink, - .follow_link = debugfs_follow_link, -}; - static int debugfs_u8_set(void *data, u64 val) { *(u8 *)data = val; diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 12756040c..c711be8d6 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -44,11 +44,6 @@ static struct inode *debugfs_get_inode(struct super_block *sb) return inode; } -static inline int debugfs_positive(struct dentry *dentry) -{ - return d_really_is_positive(dentry) && !d_unhashed(dentry); -} - struct debugfs_mount_opts { kuid_t uid; kgid_t gid; @@ -174,7 +169,7 @@ static void debugfs_evict_inode(struct inode *inode) truncate_inode_pages_final(&inode->i_data); clear_inode(inode); if (S_ISLNK(inode->i_mode)) - kfree(inode->i_private); + kfree(inode->i_link); } static const struct super_operations debugfs_super_operations = { @@ -511,8 +506,8 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, return failed_creating(dentry); } inode->i_mode = S_IFLNK | S_IRWXUGO; - inode->i_op = &debugfs_link_operations; - inode->i_private = link; + inode->i_op = &simple_symlink_inode_operations; + inode->i_link = link; d_instantiate(dentry, inode); return end_creating(dentry); } @@ -522,7 +517,7 @@ static int __debugfs_remove(struct dentry *dentry, struct dentry *parent) { int ret = 0; - if (debugfs_positive(dentry)) { + if (simple_positive(dentry)) { dget(dentry); if (d_is_dir(dentry)) ret = simple_rmdir(d_inode(parent), dentry); @@ -602,7 +597,7 @@ void debugfs_remove_recursive(struct dentry *dentry) */ spin_lock(&parent->d_lock); list_for_each_entry(child, &parent->d_subdirs, d_child) { - if (!debugfs_positive(child)) + if (!simple_positive(child)) continue; /* perhaps simple_empty(child) makes more sense */ @@ -623,7 +618,7 @@ void debugfs_remove_recursive(struct dentry *dentry) * from d_subdirs. When releasing the parent->d_lock we can * no longer trust that the next pointer is valid. * Restart the loop. We'll skip this one with the - * debugfs_positive() check. + * simple_positive() check. */ goto loop; } diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index add566303..c35ffdc12 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -142,6 +142,8 @@ static inline struct super_block *pts_sb_from_inode(struct inode *inode) if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC) return inode->i_sb; #endif + if (!devpts_mnt) + return NULL; return devpts_mnt->mnt_sb; } @@ -525,10 +527,14 @@ static struct file_system_type devpts_fs_type = { int devpts_new_index(struct inode *ptmx_inode) { struct super_block *sb = pts_sb_from_inode(ptmx_inode); - struct pts_fs_info *fsi = DEVPTS_SB(sb); + struct pts_fs_info *fsi; int index; int ida_ret; + if (!sb) + return -ENODEV; + + fsi = DEVPTS_SB(sb); retry: if (!ida_pre_get(&fsi->allocated_ptys, GFP_KERNEL)) return -ENOMEM; @@ -584,11 +590,18 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, struct dentry *dentry; struct super_block *sb = pts_sb_from_inode(ptmx_inode); struct inode *inode; - struct dentry *root = sb->s_root; - struct pts_fs_info *fsi = DEVPTS_SB(sb); - struct pts_mount_opts *opts = &fsi->mount_opts; + struct dentry *root; + struct pts_fs_info *fsi; + struct pts_mount_opts *opts; char s[12]; + if (!sb) + return ERR_PTR(-ENODEV); + + root = sb->s_root; + fsi = DEVPTS_SB(sb); + opts = &fsi->mount_opts; + inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); @@ -676,12 +689,16 @@ static int __init init_devpts_fs(void) struct ctl_table_header *table; if (!err) { + struct vfsmount *mnt; + table = register_sysctl_table(pty_root_table); - devpts_mnt = kern_mount(&devpts_fs_type); - if (IS_ERR(devpts_mnt)) { - err = PTR_ERR(devpts_mnt); + mnt = kern_mount(&devpts_fs_type); + if (IS_ERR(mnt)) { + err = PTR_ERR(mnt); unregister_filesystem(&devpts_fs_type); unregister_sysctl_table(table); + } else { + devpts_mnt = mnt; } } return err; diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index d08e079ea..754fd6c0b 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -921,8 +921,8 @@ static int tcp_accept_from_sock(struct connection *con) mutex_unlock(&connections_lock); memset(&peeraddr, 0, sizeof(peeraddr)); - result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM, - IPPROTO_TCP, &newsock); + result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, + SOCK_STREAM, IPPROTO_TCP, &newsock); if (result < 0) return -ENOMEM; @@ -1173,8 +1173,8 @@ static void tcp_connect_to_sock(struct connection *con) goto out; /* Create a socket to communicate with */ - result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM, - IPPROTO_TCP, &sock); + result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, + SOCK_STREAM, IPPROTO_TCP, &sock); if (result < 0) goto out_err; @@ -1258,8 +1258,8 @@ static struct socket *tcp_create_listen_sock(struct connection *con, addr_len = sizeof(struct sockaddr_in6); /* Create a socket to communicate with */ - result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_STREAM, - IPPROTO_TCP, &sock); + result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, + SOCK_STREAM, IPPROTO_TCP, &sock); if (result < 0) { log_print("Can't create listening comms socket"); goto create_out; @@ -1365,8 +1365,8 @@ static int sctp_listen_for_all(void) log_print("Using SCTP for communications"); - result = sock_create_kern(dlm_local_addr[0]->ss_family, SOCK_SEQPACKET, - IPPROTO_SCTP, &sock); + result = sock_create_kern(&init_net, dlm_local_addr[0]->ss_family, + SOCK_SEQPACKET, IPPROTO_SCTP, &sock); if (result < 0) { log_print("Can't create comms socket, check SCTP is loaded"); goto out; diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 8289cb195..5718cb9f7 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -8,7 +8,6 @@ #include <linux/writeback.h> #include <linux/sysctl.h> #include <linux/gfp.h> -#include <linux/export.h> #include "internal.h" /* A global variable is a bit ugly, but it keeps the code simple */ @@ -38,12 +37,6 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) iput(toput_inode); } -/* For TuxOnIce */ -void drop_pagecache(void) -{ - iterate_supers(drop_pagecache_sb, NULL); -} - int drop_caches_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 72afcc629..feef8a9c4 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -325,7 +325,6 @@ ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return rc; switch (cmd) { - case FITRIM: case FS_IOC32_GETFLAGS: case FS_IOC32_SETFLAGS: case FS_IOC32_GETVERSION: diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index fc850b55d..3c4db1172 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -170,7 +170,6 @@ out_unlock: * @directory_inode: inode of the new file's dentry's parent in ecryptfs * @ecryptfs_dentry: New file's dentry in ecryptfs * @mode: The mode of the new file - * @nd: nameidata of ecryptfs' parent's dentry & vfsmount * * Creates the underlying file and the eCryptfs inode which will link to * it. It will also update the eCryptfs directory inode to mimic the @@ -384,7 +383,7 @@ static int ecryptfs_lookup_interpose(struct dentry *dentry, * ecryptfs_lookup * @ecryptfs_dir_inode: The eCryptfs directory inode * @ecryptfs_dentry: The eCryptfs dentry that we are looking up - * @ecryptfs_nd: nameidata; may be NULL + * @flags: lookup flags * * Find a file on disk. If the file does not exist, then we'll add it to the * dentry cache and continue on to read it from the disk. @@ -675,18 +674,16 @@ out: return rc ? ERR_PTR(rc) : buf; } -static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *ecryptfs_follow_link(struct dentry *dentry, void **cookie) { size_t len; char *buf = ecryptfs_readlink_lower(dentry, &len); if (IS_ERR(buf)) - goto out; + return buf; fsstack_copy_attr_atime(d_inode(dentry), d_inode(ecryptfs_dentry_to_lower(dentry))); buf[len] = '\0'; -out: - nd_set_link(nd, buf); - return NULL; + return *cookie = buf; } /** diff --git a/fs/efs/super.c b/fs/efs/super.c index 7fca462ea..c8411a30f 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -67,7 +67,7 @@ static struct kmem_cache * efs_inode_cachep; static struct inode *efs_alloc_inode(struct super_block *sb) { struct efs_inode_info *ei; - ei = (struct efs_inode_info *)kmem_cache_alloc(efs_inode_cachep, GFP_KERNEL); + ei = kmem_cache_alloc(efs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild index b47c7b8dc..a364fd096 100644 --- a/fs/exofs/Kbuild +++ b/fs/exofs/Kbuild @@ -16,5 +16,5 @@ libore-y := ore.o ore_raid.o obj-$(CONFIG_ORE) += libore.o -exofs-y := inode.o file.o symlink.o namei.o dir.o super.o sys.o +exofs-y := inode.o file.o namei.o dir.o super.o sys.o obj-$(CONFIG_EXOFS_FS) += exofs.o diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c index 4deb0b05b..e5bb2abf7 100644 --- a/fs/exofs/dir.c +++ b/fs/exofs/dir.c @@ -44,12 +44,6 @@ static inline void exofs_put_page(struct page *page) page_cache_release(page); } -/* Accesses dir's inode->i_size must be called under inode lock */ -static inline unsigned long dir_pages(struct inode *inode) -{ - return (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; -} - static unsigned exofs_last_byte(struct inode *inode, unsigned long page_nr) { loff_t last_byte = inode->i_size; diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index ad9cac670..2e86086bc 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -207,10 +207,6 @@ extern const struct address_space_operations exofs_aops; extern const struct inode_operations exofs_dir_inode_operations; extern const struct inode_operations exofs_special_inode_operations; -/* symlink.c */ -extern const struct inode_operations exofs_symlink_inode_operations; -extern const struct inode_operations exofs_fast_symlink_inode_operations; - /* exofs_init_comps will initialize an ore_components device array * pointing to a single ore_comp struct, and a round-robin view * of the device table. diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 786e4cc8c..73c64daa0 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -1222,10 +1222,11 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) inode->i_fop = &exofs_dir_operations; inode->i_mapping->a_ops = &exofs_aops; } else if (S_ISLNK(inode->i_mode)) { - if (exofs_inode_is_fast_symlink(inode)) - inode->i_op = &exofs_fast_symlink_inode_operations; - else { - inode->i_op = &exofs_symlink_inode_operations; + if (exofs_inode_is_fast_symlink(inode)) { + inode->i_op = &simple_symlink_inode_operations; + inode->i_link = (char *)oi->i_data; + } else { + inode->i_op = &page_symlink_inode_operations; inode->i_mapping->a_ops = &exofs_aops; } } else { diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c index 5ae25e431..09a6bb1ad 100644 --- a/fs/exofs/namei.c +++ b/fs/exofs/namei.c @@ -113,7 +113,7 @@ static int exofs_symlink(struct inode *dir, struct dentry *dentry, oi = exofs_i(inode); if (l > sizeof(oi->i_data)) { /* slow symlink */ - inode->i_op = &exofs_symlink_inode_operations; + inode->i_op = &page_symlink_inode_operations; inode->i_mapping->a_ops = &exofs_aops; memset(oi->i_data, 0, sizeof(oi->i_data)); @@ -122,7 +122,8 @@ static int exofs_symlink(struct inode *dir, struct dentry *dentry, goto out_fail; } else { /* fast symlink */ - inode->i_op = &exofs_fast_symlink_inode_operations; + inode->i_op = &simple_symlink_inode_operations; + inode->i_link = (char *)oi->i_data; memcpy(oi->i_data, symname, l); inode->i_size = l-1; } diff --git a/fs/exofs/symlink.c b/fs/exofs/symlink.c deleted file mode 100644 index 6f6f3a4c1..000000000 --- a/fs/exofs/symlink.c +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (C) 2005, 2006 - * Avishay Traeger (avishay@gmail.com) - * Copyright (C) 2008, 2009 - * Boaz Harrosh <ooo@electrozaur.com> - * - * Copyrights for code taken from ext2: - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * from - * linux/fs/minix/inode.c - * Copyright (C) 1991, 1992 Linus Torvalds - * - * This file is part of exofs. - * - * exofs is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation. Since it is based on ext2, and the only - * valid version of GPL for the Linux kernel is version 2, the only valid - * version of GPL for exofs is version 2. - * - * exofs is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with exofs; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ - -#include <linux/namei.h> - -#include "exofs.h" - -static void *exofs_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - struct exofs_i_info *oi = exofs_i(d_inode(dentry)); - - nd_set_link(nd, (char *)oi->i_data); - return NULL; -} - -const struct inode_operations exofs_symlink_inode_operations = { - .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, -}; - -const struct inode_operations exofs_fast_symlink_inode_operations = { - .readlink = generic_readlink, - .follow_link = exofs_follow_link, -}; diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 796b491e6..0c6638b40 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -70,11 +70,6 @@ static inline void ext2_put_page(struct page *page) page_cache_release(page); } -static inline unsigned long dir_pages(struct inode *inode) -{ - return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; -} - /* * Return the offset into page `page_nr' of the last valid * byte in that page, plus one. diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 3a0a6c640..3b57c9f83 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -28,12 +28,12 @@ #ifdef CONFIG_FS_DAX static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { - return dax_fault(vma, vmf, ext2_get_block); + return dax_fault(vma, vmf, ext2_get_block, NULL); } static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { - return dax_mkwrite(vma, vmf, ext2_get_block); + return dax_mkwrite(vma, vmf, ext2_get_block, NULL); } static const struct vm_operations_struct ext2_dax_vm_ops = { diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index f460ae36d..5c09776d3 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -1403,6 +1403,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) inode->i_mapping->a_ops = &ext2_aops; } else if (S_ISLNK(inode->i_mode)) { if (ext2_inode_is_fast_symlink(inode)) { + inode->i_link = (char *)ei->i_data; inode->i_op = &ext2_fast_symlink_inode_operations; nd_terminate_link(ei->i_data, inode->i_size, sizeof(ei->i_data) - 1); diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 3e074a9cc..13ec54a99 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -189,7 +189,8 @@ static int ext2_symlink (struct inode * dir, struct dentry * dentry, } else { /* fast symlink */ inode->i_op = &ext2_fast_symlink_inode_operations; - memcpy((char*)(EXT2_I(inode)->i_data),symname,l); + inode->i_link = (char*)EXT2_I(inode)->i_data; + memcpy(inode->i_link, symname, l); inode->i_size = l-1; } mark_inode_dirty(inode); diff --git a/fs/ext2/super.c b/fs/ext2/super.c index d0e746e96..900e19cf9 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -882,6 +882,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | ((EXT2_SB(sb)->s_mount_opt & EXT2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + sb->s_iflags |= SB_I_CGROUPWB; if (le32_to_cpu(es->s_rev_level) == EXT2_GOOD_OLD_REV && (EXT2_HAS_COMPAT_FEATURE(sb, ~0U) || diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c index 20608f17c..ae17179f3 100644 --- a/fs/ext2/symlink.c +++ b/fs/ext2/symlink.c @@ -19,14 +19,6 @@ #include "ext2.h" #include "xattr.h" -#include <linux/namei.h> - -static void *ext2_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - struct ext2_inode_info *ei = EXT2_I(d_inode(dentry)); - nd_set_link(nd, (char *)ei->i_data); - return NULL; -} const struct inode_operations ext2_symlink_inode_operations = { .readlink = generic_readlink, @@ -43,7 +35,7 @@ const struct inode_operations ext2_symlink_inode_operations = { const struct inode_operations ext2_fast_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = ext2_follow_link, + .follow_link = simple_follow_link, .setattr = ext2_setattr, #ifdef CONFIG_EXT2_FS_XATTR .setxattr = generic_setxattr, diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 2ee2dc435..6c7e5468a 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -2999,6 +2999,7 @@ struct inode *ext3_iget(struct super_block *sb, unsigned long ino) inode->i_op = &ext3_fast_symlink_inode_operations; nd_terminate_link(ei->i_data, inode->i_size, sizeof(ei->i_data) - 1); + inode->i_link = (char *)ei->i_data; } else { inode->i_op = &ext3_symlink_inode_operations; ext3_set_aops(inode); diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 4264b9bd0..c9e767cd4 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -2308,7 +2308,8 @@ retry: } } else { inode->i_op = &ext3_fast_symlink_inode_operations; - memcpy((char*)&EXT3_I(inode)->i_data,symname,l); + inode->i_link = (char*)&EXT3_I(inode)->i_data; + memcpy(inode->i_link, symname, l); inode->i_size = l-1; } EXT3_I(inode)->i_disksize = inode->i_size; diff --git a/fs/ext3/super.c b/fs/ext3/super.c index a9312f0a5..5ed0044fb 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -1908,7 +1908,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) sbi->s_mount_state = le16_to_cpu(es->s_state); sbi->s_addr_per_block_bits = ilog2(EXT3_ADDR_PER_BLOCK(sb)); sbi->s_desc_per_block_bits = ilog2(EXT3_DESC_PER_BLOCK(sb)); - for (i=0; i < 4; i++) + for (i = 0; i < 4; i++) sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); sbi->s_def_hash_version = es->s_def_hash_version; i = le32_to_cpu(es->s_flags); diff --git a/fs/ext3/symlink.c b/fs/ext3/symlink.c index ea96df3c5..c08c59094 100644 --- a/fs/ext3/symlink.c +++ b/fs/ext3/symlink.c @@ -17,17 +17,9 @@ * ext3 symlink handling code */ -#include <linux/namei.h> #include "ext3.h" #include "xattr.h" -static void * ext3_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - struct ext3_inode_info *ei = EXT3_I(d_inode(dentry)); - nd_set_link(nd, (char*)ei->i_data); - return NULL; -} - const struct inode_operations ext3_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, @@ -43,7 +35,7 @@ const struct inode_operations ext3_symlink_inode_operations = { const struct inode_operations ext3_fast_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = ext3_follow_link, + .follow_link = simple_follow_link, .setattr = ext3_setattr, #ifdef CONFIG_EXT3_FS_XATTR .setxattr = generic_setxattr, diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 024f2284d..bf8bc8aba 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -72,6 +72,7 @@ config EXT4_ENCRYPTION select CRYPTO_ECB select CRYPTO_XTS select CRYPTO_CTS + select CRYPTO_CTR select CRYPTO_SHA256 select KEYS select ENCRYPTED_KEYS diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 955bf49a7..cd6ea29be 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -369,7 +369,7 @@ static void ext4_validate_block_bitmap(struct super_block *sb, struct ext4_group_info *grp = ext4_get_group_info(sb, block_group); struct ext4_sb_info *sbi = EXT4_SB(sb); - if (buffer_verified(bh)) + if (buffer_verified(bh) || EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) return; ext4_lock_group(sb, block_group); @@ -446,7 +446,7 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) unlock_buffer(bh); if (err) ext4_error(sb, "Checksum bad for grp %u", block_group); - return bh; + goto verify; } ext4_unlock_group(sb, block_group); if (buffer_uptodate(bh)) { diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index 8ff15273a..457315581 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -55,6 +55,9 @@ static mempool_t *ext4_bounce_page_pool; static LIST_HEAD(ext4_free_crypto_ctxs); static DEFINE_SPINLOCK(ext4_crypto_ctx_lock); +static struct kmem_cache *ext4_crypto_ctx_cachep; +struct kmem_cache *ext4_crypt_info_cachep; + /** * ext4_release_crypto_ctx() - Releases an encryption context * @ctx: The encryption context to release. @@ -68,18 +71,12 @@ void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx) { unsigned long flags; - if (ctx->bounce_page) { - if (ctx->flags & EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL) - __free_page(ctx->bounce_page); - else - mempool_free(ctx->bounce_page, ext4_bounce_page_pool); - ctx->bounce_page = NULL; - } - ctx->control_page = NULL; + if (ctx->flags & EXT4_WRITE_PATH_FL && ctx->w.bounce_page) + mempool_free(ctx->w.bounce_page, ext4_bounce_page_pool); + ctx->w.bounce_page = NULL; + ctx->w.control_page = NULL; if (ctx->flags & EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL) { - if (ctx->tfm) - crypto_free_tfm(ctx->tfm); - kfree(ctx); + kmem_cache_free(ext4_crypto_ctx_cachep, ctx); } else { spin_lock_irqsave(&ext4_crypto_ctx_lock, flags); list_add(&ctx->free_list, &ext4_free_crypto_ctxs); @@ -88,23 +85,6 @@ void ext4_release_crypto_ctx(struct ext4_crypto_ctx *ctx) } /** - * ext4_alloc_and_init_crypto_ctx() - Allocates and inits an encryption context - * @mask: The allocation mask. - * - * Return: An allocated and initialized encryption context on success. An error - * value or NULL otherwise. - */ -static struct ext4_crypto_ctx *ext4_alloc_and_init_crypto_ctx(gfp_t mask) -{ - struct ext4_crypto_ctx *ctx = kzalloc(sizeof(struct ext4_crypto_ctx), - mask); - - if (!ctx) - return ERR_PTR(-ENOMEM); - return ctx; -} - -/** * ext4_get_crypto_ctx() - Gets an encryption context * @inode: The inode for which we are doing the crypto * @@ -118,10 +98,10 @@ struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode) struct ext4_crypto_ctx *ctx = NULL; int res = 0; unsigned long flags; - struct ext4_encryption_key *key = &EXT4_I(inode)->i_encryption_key; + struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; - if (!ext4_read_workqueue) - ext4_init_crypto(); + if (ci == NULL) + return ERR_PTR(-ENOKEY); /* * We first try getting the ctx from a free list because in @@ -140,50 +120,16 @@ struct ext4_crypto_ctx *ext4_get_crypto_ctx(struct inode *inode) list_del(&ctx->free_list); spin_unlock_irqrestore(&ext4_crypto_ctx_lock, flags); if (!ctx) { - ctx = ext4_alloc_and_init_crypto_ctx(GFP_NOFS); - if (IS_ERR(ctx)) { - res = PTR_ERR(ctx); + ctx = kmem_cache_zalloc(ext4_crypto_ctx_cachep, GFP_NOFS); + if (!ctx) { + res = -ENOMEM; goto out; } ctx->flags |= EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL; } else { ctx->flags &= ~EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL; } - - /* Allocate a new Crypto API context if we don't already have - * one or if it isn't the right mode. */ - BUG_ON(key->mode == EXT4_ENCRYPTION_MODE_INVALID); - if (ctx->tfm && (ctx->mode != key->mode)) { - crypto_free_tfm(ctx->tfm); - ctx->tfm = NULL; - ctx->mode = EXT4_ENCRYPTION_MODE_INVALID; - } - if (!ctx->tfm) { - switch (key->mode) { - case EXT4_ENCRYPTION_MODE_AES_256_XTS: - ctx->tfm = crypto_ablkcipher_tfm( - crypto_alloc_ablkcipher("xts(aes)", 0, 0)); - break; - case EXT4_ENCRYPTION_MODE_AES_256_GCM: - /* TODO(mhalcrow): AEAD w/ gcm(aes); - * crypto_aead_setauthsize() */ - ctx->tfm = ERR_PTR(-ENOTSUPP); - break; - default: - BUG(); - } - if (IS_ERR_OR_NULL(ctx->tfm)) { - res = PTR_ERR(ctx->tfm); - ctx->tfm = NULL; - goto out; - } - ctx->mode = key->mode; - } - BUG_ON(key->size != ext4_encryption_key_size(key->mode)); - - /* There shouldn't be a bounce page attached to the crypto - * context at this point. */ - BUG_ON(ctx->bounce_page); + ctx->flags &= ~EXT4_WRITE_PATH_FL; out: if (res) { @@ -204,20 +150,8 @@ void ext4_exit_crypto(void) { struct ext4_crypto_ctx *pos, *n; - list_for_each_entry_safe(pos, n, &ext4_free_crypto_ctxs, free_list) { - if (pos->bounce_page) { - if (pos->flags & - EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL) { - __free_page(pos->bounce_page); - } else { - mempool_free(pos->bounce_page, - ext4_bounce_page_pool); - } - } - if (pos->tfm) - crypto_free_tfm(pos->tfm); - kfree(pos); - } + list_for_each_entry_safe(pos, n, &ext4_free_crypto_ctxs, free_list) + kmem_cache_free(ext4_crypto_ctx_cachep, pos); INIT_LIST_HEAD(&ext4_free_crypto_ctxs); if (ext4_bounce_page_pool) mempool_destroy(ext4_bounce_page_pool); @@ -225,6 +159,12 @@ void ext4_exit_crypto(void) if (ext4_read_workqueue) destroy_workqueue(ext4_read_workqueue); ext4_read_workqueue = NULL; + if (ext4_crypto_ctx_cachep) + kmem_cache_destroy(ext4_crypto_ctx_cachep); + ext4_crypto_ctx_cachep = NULL; + if (ext4_crypt_info_cachep) + kmem_cache_destroy(ext4_crypt_info_cachep); + ext4_crypt_info_cachep = NULL; } /** @@ -237,23 +177,31 @@ void ext4_exit_crypto(void) */ int ext4_init_crypto(void) { - int i, res; + int i, res = -ENOMEM; mutex_lock(&crypto_init); if (ext4_read_workqueue) goto already_initialized; ext4_read_workqueue = alloc_workqueue("ext4_crypto", WQ_HIGHPRI, 0); - if (!ext4_read_workqueue) { - res = -ENOMEM; + if (!ext4_read_workqueue) + goto fail; + + ext4_crypto_ctx_cachep = KMEM_CACHE(ext4_crypto_ctx, + SLAB_RECLAIM_ACCOUNT); + if (!ext4_crypto_ctx_cachep) + goto fail; + + ext4_crypt_info_cachep = KMEM_CACHE(ext4_crypt_info, + SLAB_RECLAIM_ACCOUNT); + if (!ext4_crypt_info_cachep) goto fail; - } for (i = 0; i < num_prealloc_crypto_ctxs; i++) { struct ext4_crypto_ctx *ctx; - ctx = ext4_alloc_and_init_crypto_ctx(GFP_KERNEL); - if (IS_ERR(ctx)) { - res = PTR_ERR(ctx); + ctx = kmem_cache_zalloc(ext4_crypto_ctx_cachep, GFP_NOFS); + if (!ctx) { + res = -ENOMEM; goto fail; } list_add(&ctx->free_list, &ext4_free_crypto_ctxs); @@ -317,32 +265,11 @@ static int ext4_page_crypto(struct ext4_crypto_ctx *ctx, struct ablkcipher_request *req = NULL; DECLARE_EXT4_COMPLETION_RESULT(ecr); struct scatterlist dst, src; - struct ext4_inode_info *ei = EXT4_I(inode); - struct crypto_ablkcipher *atfm = __crypto_ablkcipher_cast(ctx->tfm); + struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; + struct crypto_ablkcipher *tfm = ci->ci_ctfm; int res = 0; - BUG_ON(!ctx->tfm); - BUG_ON(ctx->mode != ei->i_encryption_key.mode); - - if (ctx->mode != EXT4_ENCRYPTION_MODE_AES_256_XTS) { - printk_ratelimited(KERN_ERR - "%s: unsupported crypto algorithm: %d\n", - __func__, ctx->mode); - return -ENOTSUPP; - } - - crypto_ablkcipher_clear_flags(atfm, ~0); - crypto_tfm_set_flags(ctx->tfm, CRYPTO_TFM_REQ_WEAK_KEY); - - res = crypto_ablkcipher_setkey(atfm, ei->i_encryption_key.raw, - ei->i_encryption_key.size); - if (res) { - printk_ratelimited(KERN_ERR - "%s: crypto_ablkcipher_setkey() failed\n", - __func__); - return res; - } - req = ablkcipher_request_alloc(atfm, GFP_NOFS); + req = ablkcipher_request_alloc(tfm, GFP_NOFS); if (!req) { printk_ratelimited(KERN_ERR "%s: crypto_request_alloc() failed\n", @@ -384,6 +311,15 @@ static int ext4_page_crypto(struct ext4_crypto_ctx *ctx, return 0; } +static struct page *alloc_bounce_page(struct ext4_crypto_ctx *ctx) +{ + ctx->w.bounce_page = mempool_alloc(ext4_bounce_page_pool, GFP_NOWAIT); + if (ctx->w.bounce_page == NULL) + return ERR_PTR(-ENOMEM); + ctx->flags |= EXT4_WRITE_PATH_FL; + return ctx->w.bounce_page; +} + /** * ext4_encrypt() - Encrypts a page * @inode: The inode for which the encryption should take place @@ -413,27 +349,17 @@ struct page *ext4_encrypt(struct inode *inode, return (struct page *) ctx; /* The encryption operation will require a bounce page. */ - ciphertext_page = alloc_page(GFP_NOFS); - if (!ciphertext_page) { - /* This is a potential bottleneck, but at least we'll have - * forward progress. */ - ciphertext_page = mempool_alloc(ext4_bounce_page_pool, - GFP_NOFS); - if (WARN_ON_ONCE(!ciphertext_page)) { - ciphertext_page = mempool_alloc(ext4_bounce_page_pool, - GFP_NOFS | __GFP_WAIT); - } - ctx->flags &= ~EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL; - } else { - ctx->flags |= EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL; - } - ctx->bounce_page = ciphertext_page; - ctx->control_page = plaintext_page; + ciphertext_page = alloc_bounce_page(ctx); + if (IS_ERR(ciphertext_page)) + goto errout; + ctx->w.control_page = plaintext_page; err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, plaintext_page->index, plaintext_page, ciphertext_page); if (err) { + ciphertext_page = ERR_PTR(err); + errout: ext4_release_crypto_ctx(ctx); - return ERR_PTR(err); + return ciphertext_page; } SetPagePrivate(ciphertext_page); set_page_private(ciphertext_page, (unsigned long)ctx); @@ -470,8 +396,8 @@ int ext4_decrypt_one(struct inode *inode, struct page *page) struct ext4_crypto_ctx *ctx = ext4_get_crypto_ctx(inode); - if (!ctx) - return -ENOMEM; + if (IS_ERR(ctx)) + return PTR_ERR(ctx); ret = ext4_decrypt(ctx, page); ext4_release_crypto_ctx(ctx); return ret; @@ -493,21 +419,11 @@ int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex) if (IS_ERR(ctx)) return PTR_ERR(ctx); - ciphertext_page = alloc_page(GFP_NOFS); - if (!ciphertext_page) { - /* This is a potential bottleneck, but at least we'll have - * forward progress. */ - ciphertext_page = mempool_alloc(ext4_bounce_page_pool, - GFP_NOFS); - if (WARN_ON_ONCE(!ciphertext_page)) { - ciphertext_page = mempool_alloc(ext4_bounce_page_pool, - GFP_NOFS | __GFP_WAIT); - } - ctx->flags &= ~EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL; - } else { - ctx->flags |= EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL; + ciphertext_page = alloc_bounce_page(ctx); + if (IS_ERR(ciphertext_page)) { + err = PTR_ERR(ciphertext_page); + goto errout; } - ctx->bounce_page = ciphertext_page; while (len--) { err = ext4_page_crypto(ctx, inode, EXT4_ENCRYPT, lblk, @@ -529,6 +445,7 @@ int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex) goto errout; } err = submit_bio_wait(WRITE, bio); + bio_put(bio); if (err) goto errout; } diff --git a/fs/ext4/crypto_fname.c b/fs/ext4/crypto_fname.c index fded02f72..7dc4eb559 100644 --- a/fs/ext4/crypto_fname.c +++ b/fs/ext4/crypto_fname.c @@ -48,6 +48,12 @@ bool ext4_valid_filenames_enc_mode(uint32_t mode) return (mode == EXT4_ENCRYPTION_MODE_AES_256_CTS); } +static unsigned max_name_len(struct inode *inode) +{ + return S_ISLNK(inode->i_mode) ? inode->i_sb->s_blocksize : + EXT4_NAME_LEN; +} + /** * ext4_fname_encrypt() - * @@ -55,43 +61,52 @@ bool ext4_valid_filenames_enc_mode(uint32_t mode) * ciphertext. Errors are returned as negative numbers. We trust the caller to * allocate sufficient memory to oname string. */ -static int ext4_fname_encrypt(struct ext4_fname_crypto_ctx *ctx, +static int ext4_fname_encrypt(struct inode *inode, const struct qstr *iname, struct ext4_str *oname) { u32 ciphertext_len; struct ablkcipher_request *req = NULL; DECLARE_EXT4_COMPLETION_RESULT(ecr); - struct crypto_ablkcipher *tfm = ctx->ctfm; + struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; + struct crypto_ablkcipher *tfm = ci->ci_ctfm; int res = 0; char iv[EXT4_CRYPTO_BLOCK_SIZE]; - struct scatterlist sg[1]; - int padding = 4 << (ctx->flags & EXT4_POLICY_FLAGS_PAD_MASK); - char *workbuf; + struct scatterlist src_sg, dst_sg; + int padding = 4 << (ci->ci_flags & EXT4_POLICY_FLAGS_PAD_MASK); + char *workbuf, buf[32], *alloc_buf = NULL; + unsigned lim = max_name_len(inode); - if (iname->len <= 0 || iname->len > ctx->lim) + if (iname->len <= 0 || iname->len > lim) return -EIO; ciphertext_len = (iname->len < EXT4_CRYPTO_BLOCK_SIZE) ? EXT4_CRYPTO_BLOCK_SIZE : iname->len; ciphertext_len = ext4_fname_crypto_round_up(ciphertext_len, padding); - ciphertext_len = (ciphertext_len > ctx->lim) - ? ctx->lim : ciphertext_len; + ciphertext_len = (ciphertext_len > lim) + ? lim : ciphertext_len; + + if (ciphertext_len <= sizeof(buf)) { + workbuf = buf; + } else { + alloc_buf = kmalloc(ciphertext_len, GFP_NOFS); + if (!alloc_buf) + return -ENOMEM; + workbuf = alloc_buf; + } /* Allocate request */ req = ablkcipher_request_alloc(tfm, GFP_NOFS); if (!req) { printk_ratelimited( KERN_ERR "%s: crypto_request_alloc() failed\n", __func__); + kfree(alloc_buf); return -ENOMEM; } ablkcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, ext4_dir_crypt_complete, &ecr); - /* Map the workpage */ - workbuf = kmap(ctx->workpage); - /* Copy the input */ memcpy(workbuf, iname->name, iname->len); if (iname->len < ciphertext_len) @@ -101,21 +116,16 @@ static int ext4_fname_encrypt(struct ext4_fname_crypto_ctx *ctx, memset(iv, 0, EXT4_CRYPTO_BLOCK_SIZE); /* Create encryption request */ - sg_init_table(sg, 1); - sg_set_page(sg, ctx->workpage, PAGE_SIZE, 0); - ablkcipher_request_set_crypt(req, sg, sg, ciphertext_len, iv); + sg_init_one(&src_sg, workbuf, ciphertext_len); + sg_init_one(&dst_sg, oname->name, ciphertext_len); + ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv); res = crypto_ablkcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { BUG_ON(req->base.data != &ecr); wait_for_completion(&ecr.completion); res = ecr.res; } - if (res >= 0) { - /* Copy the result to output */ - memcpy(oname->name, workbuf, ciphertext_len); - res = ciphertext_len; - } - kunmap(ctx->workpage); + kfree(alloc_buf); ablkcipher_request_free(req); if (res < 0) { printk_ratelimited( @@ -132,20 +142,21 @@ static int ext4_fname_encrypt(struct ext4_fname_crypto_ctx *ctx, * Errors are returned as negative numbers. * We trust the caller to allocate sufficient memory to oname string. */ -static int ext4_fname_decrypt(struct ext4_fname_crypto_ctx *ctx, +static int ext4_fname_decrypt(struct inode *inode, const struct ext4_str *iname, struct ext4_str *oname) { struct ext4_str tmp_in[2], tmp_out[1]; struct ablkcipher_request *req = NULL; DECLARE_EXT4_COMPLETION_RESULT(ecr); - struct scatterlist sg[1]; - struct crypto_ablkcipher *tfm = ctx->ctfm; + struct scatterlist src_sg, dst_sg; + struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; + struct crypto_ablkcipher *tfm = ci->ci_ctfm; int res = 0; char iv[EXT4_CRYPTO_BLOCK_SIZE]; - char *workbuf; + unsigned lim = max_name_len(inode); - if (iname->len <= 0 || iname->len > ctx->lim) + if (iname->len <= 0 || iname->len > lim) return -EIO; tmp_in[0].name = iname->name; @@ -163,31 +174,19 @@ static int ext4_fname_decrypt(struct ext4_fname_crypto_ctx *ctx, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, ext4_dir_crypt_complete, &ecr); - /* Map the workpage */ - workbuf = kmap(ctx->workpage); - - /* Copy the input */ - memcpy(workbuf, iname->name, iname->len); - /* Initialize IV */ memset(iv, 0, EXT4_CRYPTO_BLOCK_SIZE); /* Create encryption request */ - sg_init_table(sg, 1); - sg_set_page(sg, ctx->workpage, PAGE_SIZE, 0); - ablkcipher_request_set_crypt(req, sg, sg, iname->len, iv); + sg_init_one(&src_sg, iname->name, iname->len); + sg_init_one(&dst_sg, oname->name, oname->len); + ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv); res = crypto_ablkcipher_decrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { BUG_ON(req->base.data != &ecr); wait_for_completion(&ecr.completion); res = ecr.res; } - if (res >= 0) { - /* Copy the result to output */ - memcpy(oname->name, workbuf, iname->len); - res = iname->len; - } - kunmap(ctx->workpage); ablkcipher_request_free(req); if (res < 0) { printk_ratelimited( @@ -254,207 +253,6 @@ static int digest_decode(const char *src, int len, char *dst) } /** - * ext4_free_fname_crypto_ctx() - - * - * Frees up a crypto context. - */ -void ext4_free_fname_crypto_ctx(struct ext4_fname_crypto_ctx *ctx) -{ - if (ctx == NULL || IS_ERR(ctx)) - return; - - if (ctx->ctfm && !IS_ERR(ctx->ctfm)) - crypto_free_ablkcipher(ctx->ctfm); - if (ctx->htfm && !IS_ERR(ctx->htfm)) - crypto_free_hash(ctx->htfm); - if (ctx->workpage && !IS_ERR(ctx->workpage)) - __free_page(ctx->workpage); - kfree(ctx); -} - -/** - * ext4_put_fname_crypto_ctx() - - * - * Return: The crypto context onto free list. If the free list is above a - * threshold, completely frees up the context, and returns the memory. - * - * TODO: Currently we directly free the crypto context. Eventually we should - * add code it to return to free list. Such an approach will increase - * efficiency of directory lookup. - */ -void ext4_put_fname_crypto_ctx(struct ext4_fname_crypto_ctx **ctx) -{ - if (*ctx == NULL || IS_ERR(*ctx)) - return; - ext4_free_fname_crypto_ctx(*ctx); - *ctx = NULL; -} - -/** - * ext4_search_fname_crypto_ctx() - - */ -static struct ext4_fname_crypto_ctx *ext4_search_fname_crypto_ctx( - const struct ext4_encryption_key *key) -{ - return NULL; -} - -/** - * ext4_alloc_fname_crypto_ctx() - - */ -struct ext4_fname_crypto_ctx *ext4_alloc_fname_crypto_ctx( - const struct ext4_encryption_key *key) -{ - struct ext4_fname_crypto_ctx *ctx; - - ctx = kmalloc(sizeof(struct ext4_fname_crypto_ctx), GFP_NOFS); - if (ctx == NULL) - return ERR_PTR(-ENOMEM); - if (key->mode == EXT4_ENCRYPTION_MODE_INVALID) { - /* This will automatically set key mode to invalid - * As enum for ENCRYPTION_MODE_INVALID is zero */ - memset(&ctx->key, 0, sizeof(ctx->key)); - } else { - memcpy(&ctx->key, key, sizeof(struct ext4_encryption_key)); - } - ctx->has_valid_key = (EXT4_ENCRYPTION_MODE_INVALID == key->mode) - ? 0 : 1; - ctx->ctfm_key_is_ready = 0; - ctx->ctfm = NULL; - ctx->htfm = NULL; - ctx->workpage = NULL; - return ctx; -} - -/** - * ext4_get_fname_crypto_ctx() - - * - * Allocates a free crypto context and initializes it to hold - * the crypto material for the inode. - * - * Return: NULL if not encrypted. Error value on error. Valid pointer otherwise. - */ -struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx( - struct inode *inode, u32 max_ciphertext_len) -{ - struct ext4_fname_crypto_ctx *ctx; - struct ext4_inode_info *ei = EXT4_I(inode); - int res; - - /* Check if the crypto policy is set on the inode */ - res = ext4_encrypted_inode(inode); - if (res == 0) - return NULL; - - if (!ext4_has_encryption_key(inode)) - ext4_generate_encryption_key(inode); - - /* Get a crypto context based on the key. - * A new context is allocated if no context matches the requested key. - */ - ctx = ext4_search_fname_crypto_ctx(&(ei->i_encryption_key)); - if (ctx == NULL) - ctx = ext4_alloc_fname_crypto_ctx(&(ei->i_encryption_key)); - if (IS_ERR(ctx)) - return ctx; - - ctx->flags = ei->i_crypt_policy_flags; - if (ctx->has_valid_key) { - if (ctx->key.mode != EXT4_ENCRYPTION_MODE_AES_256_CTS) { - printk_once(KERN_WARNING - "ext4: unsupported key mode %d\n", - ctx->key.mode); - return ERR_PTR(-ENOKEY); - } - - /* As a first cut, we will allocate new tfm in every call. - * later, we will keep the tfm around, in case the key gets - * re-used */ - if (ctx->ctfm == NULL) { - ctx->ctfm = crypto_alloc_ablkcipher("cts(cbc(aes))", - 0, 0); - } - if (IS_ERR(ctx->ctfm)) { - res = PTR_ERR(ctx->ctfm); - printk( - KERN_DEBUG "%s: error (%d) allocating crypto tfm\n", - __func__, res); - ctx->ctfm = NULL; - ext4_put_fname_crypto_ctx(&ctx); - return ERR_PTR(res); - } - if (ctx->ctfm == NULL) { - printk( - KERN_DEBUG "%s: could not allocate crypto tfm\n", - __func__); - ext4_put_fname_crypto_ctx(&ctx); - return ERR_PTR(-ENOMEM); - } - if (ctx->workpage == NULL) - ctx->workpage = alloc_page(GFP_NOFS); - if (IS_ERR(ctx->workpage)) { - res = PTR_ERR(ctx->workpage); - printk( - KERN_DEBUG "%s: error (%d) allocating work page\n", - __func__, res); - ctx->workpage = NULL; - ext4_put_fname_crypto_ctx(&ctx); - return ERR_PTR(res); - } - if (ctx->workpage == NULL) { - printk( - KERN_DEBUG "%s: could not allocate work page\n", - __func__); - ext4_put_fname_crypto_ctx(&ctx); - return ERR_PTR(-ENOMEM); - } - ctx->lim = max_ciphertext_len; - crypto_ablkcipher_clear_flags(ctx->ctfm, ~0); - crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctx->ctfm), - CRYPTO_TFM_REQ_WEAK_KEY); - - /* If we are lucky, we will get a context that is already - * set up with the right key. Else, we will have to - * set the key */ - if (!ctx->ctfm_key_is_ready) { - /* Since our crypto objectives for filename encryption - * are pretty weak, - * we directly use the inode master key */ - res = crypto_ablkcipher_setkey(ctx->ctfm, - ctx->key.raw, ctx->key.size); - if (res) { - ext4_put_fname_crypto_ctx(&ctx); - return ERR_PTR(-EIO); - } - ctx->ctfm_key_is_ready = 1; - } else { - /* In the current implementation, key should never be - * marked "ready" for a context that has just been - * allocated. So we should never reach here */ - BUG(); - } - } - if (ctx->htfm == NULL) - ctx->htfm = crypto_alloc_hash("sha256", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(ctx->htfm)) { - res = PTR_ERR(ctx->htfm); - printk(KERN_DEBUG "%s: error (%d) allocating hash tfm\n", - __func__, res); - ctx->htfm = NULL; - ext4_put_fname_crypto_ctx(&ctx); - return ERR_PTR(res); - } - if (ctx->htfm == NULL) { - printk(KERN_DEBUG "%s: could not allocate hash tfm\n", - __func__); - ext4_put_fname_crypto_ctx(&ctx); - return ERR_PTR(-ENOMEM); - } - - return ctx; -} - -/** * ext4_fname_crypto_round_up() - * * Return: The next multiple of block size @@ -464,44 +262,29 @@ u32 ext4_fname_crypto_round_up(u32 size, u32 blksize) return ((size+blksize-1)/blksize)*blksize; } -/** - * ext4_fname_crypto_namelen_on_disk() - - */ -int ext4_fname_crypto_namelen_on_disk(struct ext4_fname_crypto_ctx *ctx, - u32 namelen) +unsigned ext4_fname_encrypted_size(struct inode *inode, u32 ilen) { - u32 ciphertext_len; - int padding = 4 << (ctx->flags & EXT4_POLICY_FLAGS_PAD_MASK); - - if (ctx == NULL) - return -EIO; - if (!(ctx->has_valid_key)) - return -EACCES; - ciphertext_len = (namelen < EXT4_CRYPTO_BLOCK_SIZE) ? - EXT4_CRYPTO_BLOCK_SIZE : namelen; - ciphertext_len = ext4_fname_crypto_round_up(ciphertext_len, padding); - ciphertext_len = (ciphertext_len > ctx->lim) - ? ctx->lim : ciphertext_len; - return (int) ciphertext_len; + struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; + int padding = 32; + + if (ci) + padding = 4 << (ci->ci_flags & EXT4_POLICY_FLAGS_PAD_MASK); + if (ilen < EXT4_CRYPTO_BLOCK_SIZE) + ilen = EXT4_CRYPTO_BLOCK_SIZE; + return ext4_fname_crypto_round_up(ilen, padding); } -/** - * ext4_fname_crypto_alloc_obuff() - +/* + * ext4_fname_crypto_alloc_buffer() - * * Allocates an output buffer that is sufficient for the crypto operation * specified by the context and the direction. */ -int ext4_fname_crypto_alloc_buffer(struct ext4_fname_crypto_ctx *ctx, +int ext4_fname_crypto_alloc_buffer(struct inode *inode, u32 ilen, struct ext4_str *crypto_str) { - unsigned int olen; - int padding = 4 << (ctx->flags & EXT4_POLICY_FLAGS_PAD_MASK); + unsigned int olen = ext4_fname_encrypted_size(inode, ilen); - if (!ctx) - return -EIO; - if (padding < EXT4_CRYPTO_BLOCK_SIZE) - padding = EXT4_CRYPTO_BLOCK_SIZE; - olen = ext4_fname_crypto_round_up(ilen, padding); crypto_str->len = olen; if (olen < EXT4_FNAME_CRYPTO_DIGEST_SIZE*2) olen = EXT4_FNAME_CRYPTO_DIGEST_SIZE*2; @@ -529,7 +312,7 @@ void ext4_fname_crypto_free_buffer(struct ext4_str *crypto_str) /** * ext4_fname_disk_to_usr() - converts a filename from disk space to user space */ -int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, +int _ext4_fname_disk_to_usr(struct inode *inode, struct dx_hash_info *hinfo, const struct ext4_str *iname, struct ext4_str *oname) @@ -537,8 +320,6 @@ int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, char buf[24]; int ret; - if (ctx == NULL) - return -EIO; if (iname->len < 3) { /*Check for . and .. */ if (iname->name[0] == '.' && iname->name[iname->len-1] == '.') { @@ -548,8 +329,8 @@ int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, return oname->len; } } - if (ctx->has_valid_key) - return ext4_fname_decrypt(ctx, iname, oname); + if (EXT4_I(inode)->i_crypt_info) + return ext4_fname_decrypt(inode, iname, oname); if (iname->len <= EXT4_FNAME_CRYPTO_DIGEST_SIZE) { ret = digest_encode(iname->name, iname->len, oname->name); @@ -568,7 +349,7 @@ int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, return ret + 1; } -int ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, +int ext4_fname_disk_to_usr(struct inode *inode, struct dx_hash_info *hinfo, const struct ext4_dir_entry_2 *de, struct ext4_str *oname) @@ -576,21 +357,20 @@ int ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, struct ext4_str iname = {.name = (unsigned char *) de->name, .len = de->name_len }; - return _ext4_fname_disk_to_usr(ctx, hinfo, &iname, oname); + return _ext4_fname_disk_to_usr(inode, hinfo, &iname, oname); } /** * ext4_fname_usr_to_disk() - converts a filename from user space to disk space */ -int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx, +int ext4_fname_usr_to_disk(struct inode *inode, const struct qstr *iname, struct ext4_str *oname) { int res; + struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; - if (ctx == NULL) - return -EIO; if (iname->len < 3) { /*Check for . and .. */ if (iname->name[0] == '.' && @@ -601,8 +381,8 @@ int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx, return oname->len; } } - if (ctx->has_valid_key) { - res = ext4_fname_encrypt(ctx, iname, oname); + if (ci) { + res = ext4_fname_encrypt(inode, iname, oname); return res; } /* Without a proper key, a user is not allowed to modify the filenames @@ -611,109 +391,79 @@ int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx, return -EACCES; } -/* - * Calculate the htree hash from a filename from user space - */ -int ext4_fname_usr_to_hash(struct ext4_fname_crypto_ctx *ctx, - const struct qstr *iname, - struct dx_hash_info *hinfo) +int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, + int lookup, struct ext4_filename *fname) { - struct ext4_str tmp; - int ret = 0; - char buf[EXT4_FNAME_CRYPTO_DIGEST_SIZE+1]; + struct ext4_crypt_info *ci; + int ret = 0, bigname = 0; + + memset(fname, 0, sizeof(struct ext4_filename)); + fname->usr_fname = iname; - if (!ctx || + if (!ext4_encrypted_inode(dir) || ((iname->name[0] == '.') && ((iname->len == 1) || ((iname->name[1] == '.') && (iname->len == 2))))) { - ext4fs_dirhash(iname->name, iname->len, hinfo); + fname->disk_name.name = (unsigned char *) iname->name; + fname->disk_name.len = iname->len; return 0; } - - if (!ctx->has_valid_key && iname->name[0] == '_') { - if (iname->len != 33) - return -ENOENT; - ret = digest_decode(iname->name+1, iname->len, buf); - if (ret != 24) - return -ENOENT; - memcpy(&hinfo->hash, buf, 4); - memcpy(&hinfo->minor_hash, buf + 4, 4); + ret = ext4_get_encryption_info(dir); + if (ret) + return ret; + ci = EXT4_I(dir)->i_crypt_info; + if (ci) { + ret = ext4_fname_crypto_alloc_buffer(dir, iname->len, + &fname->crypto_buf); + if (ret < 0) + return ret; + ret = ext4_fname_encrypt(dir, iname, &fname->crypto_buf); + if (ret < 0) + goto errout; + fname->disk_name.name = fname->crypto_buf.name; + fname->disk_name.len = fname->crypto_buf.len; return 0; } + if (!lookup) + return -EACCES; - if (!ctx->has_valid_key && iname->name[0] != '_') { - if (iname->len > 43) - return -ENOENT; - ret = digest_decode(iname->name, iname->len, buf); - ext4fs_dirhash(buf, ret, hinfo); - return 0; + /* We don't have the key and we are doing a lookup; decode the + * user-supplied name + */ + if (iname->name[0] == '_') + bigname = 1; + if ((bigname && (iname->len != 33)) || + (!bigname && (iname->len > 43))) + return -ENOENT; + + fname->crypto_buf.name = kmalloc(32, GFP_KERNEL); + if (fname->crypto_buf.name == NULL) + return -ENOMEM; + ret = digest_decode(iname->name + bigname, iname->len - bigname, + fname->crypto_buf.name); + if (ret < 0) { + ret = -ENOENT; + goto errout; } - - /* First encrypt the plaintext name */ - ret = ext4_fname_crypto_alloc_buffer(ctx, iname->len, &tmp); - if (ret < 0) - return ret; - - ret = ext4_fname_encrypt(ctx, iname, &tmp); - if (ret >= 0) { - ext4fs_dirhash(tmp.name, tmp.len, hinfo); - ret = 0; + fname->crypto_buf.len = ret; + if (bigname) { + memcpy(&fname->hinfo.hash, fname->crypto_buf.name, 4); + memcpy(&fname->hinfo.minor_hash, fname->crypto_buf.name + 4, 4); + } else { + fname->disk_name.name = fname->crypto_buf.name; + fname->disk_name.len = fname->crypto_buf.len; } - - ext4_fname_crypto_free_buffer(&tmp); + return 0; +errout: + kfree(fname->crypto_buf.name); + fname->crypto_buf.name = NULL; return ret; } -int ext4_fname_match(struct ext4_fname_crypto_ctx *ctx, struct ext4_str *cstr, - int len, const char * const name, - struct ext4_dir_entry_2 *de) +void ext4_fname_free_filename(struct ext4_filename *fname) { - int ret = -ENOENT; - int bigname = (*name == '_'); - - if (ctx->has_valid_key) { - if (cstr->name == NULL) { - struct qstr istr; - - ret = ext4_fname_crypto_alloc_buffer(ctx, len, cstr); - if (ret < 0) - goto errout; - istr.name = name; - istr.len = len; - ret = ext4_fname_encrypt(ctx, &istr, cstr); - if (ret < 0) - goto errout; - } - } else { - if (cstr->name == NULL) { - cstr->name = kmalloc(32, GFP_KERNEL); - if (cstr->name == NULL) - return -ENOMEM; - if ((bigname && (len != 33)) || - (!bigname && (len > 43))) - goto errout; - ret = digest_decode(name+bigname, len-bigname, - cstr->name); - if (ret < 0) { - ret = -ENOENT; - goto errout; - } - cstr->len = ret; - } - if (bigname) { - if (de->name_len < 16) - return 0; - ret = memcmp(de->name + de->name_len - 16, - cstr->name + 8, 16); - return (ret == 0) ? 1 : 0; - } - } - if (de->name_len != cstr->len) - return 0; - ret = memcmp(de->name, cstr->name, cstr->len); - return (ret == 0) ? 1 : 0; -errout: - kfree(cstr->name); - cstr->name = NULL; - return ret; + kfree(fname->crypto_buf.name); + fname->crypto_buf.name = NULL; + fname->usr_fname = NULL; + fname->disk_name.name = NULL; } diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c index 52170d0b7..442d24e8e 100644 --- a/fs/ext4/crypto_key.c +++ b/fs/ext4/crypto_key.c @@ -84,14 +84,38 @@ out: return res; } -/** - * ext4_generate_encryption_key() - generates an encryption key - * @inode: The inode to generate the encryption key for. - */ -int ext4_generate_encryption_key(struct inode *inode) +void ext4_free_crypt_info(struct ext4_crypt_info *ci) +{ + if (!ci) + return; + + if (ci->ci_keyring_key) + key_put(ci->ci_keyring_key); + crypto_free_ablkcipher(ci->ci_ctfm); + kmem_cache_free(ext4_crypt_info_cachep, ci); +} + +void ext4_free_encryption_info(struct inode *inode, + struct ext4_crypt_info *ci) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_crypt_info *prev; + + if (ci == NULL) + ci = ACCESS_ONCE(ei->i_crypt_info); + if (ci == NULL) + return; + prev = cmpxchg(&ei->i_crypt_info, ci, NULL); + if (prev != ci) + return; + + ext4_free_crypt_info(ci); +} + +int _ext4_get_encryption_info(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_encryption_key *crypt_key = &ei->i_encryption_key; + struct ext4_crypt_info *crypt_info; char full_key_descriptor[EXT4_KEY_DESC_PREFIX_SIZE + (EXT4_KEY_DESCRIPTOR_SIZE * 2) + 1]; struct key *keyring_key = NULL; @@ -99,31 +123,76 @@ int ext4_generate_encryption_key(struct inode *inode) struct ext4_encryption_context ctx; struct user_key_payload *ukp; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - int res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, - &ctx, sizeof(ctx)); + struct crypto_ablkcipher *ctfm; + const char *cipher_str; + char raw_key[EXT4_MAX_KEY_SIZE]; + char mode; + int res; - if (res != sizeof(ctx)) { - if (res > 0) - res = -EINVAL; - goto out; + if (!ext4_read_workqueue) { + res = ext4_init_crypto(); + if (res) + return res; + } + +retry: + crypt_info = ACCESS_ONCE(ei->i_crypt_info); + if (crypt_info) { + if (!crypt_info->ci_keyring_key || + key_validate(crypt_info->ci_keyring_key) == 0) + return 0; + ext4_free_encryption_info(inode, crypt_info); + goto retry; } + + res = ext4_xattr_get(inode, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, + &ctx, sizeof(ctx)); + if (res < 0) { + if (!DUMMY_ENCRYPTION_ENABLED(sbi)) + return res; + ctx.contents_encryption_mode = EXT4_ENCRYPTION_MODE_AES_256_XTS; + ctx.filenames_encryption_mode = + EXT4_ENCRYPTION_MODE_AES_256_CTS; + ctx.flags = 0; + } else if (res != sizeof(ctx)) + return -EINVAL; res = 0; - ei->i_crypt_policy_flags = ctx.flags; + crypt_info = kmem_cache_alloc(ext4_crypt_info_cachep, GFP_KERNEL); + if (!crypt_info) + return -ENOMEM; + + crypt_info->ci_flags = ctx.flags; + crypt_info->ci_data_mode = ctx.contents_encryption_mode; + crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; + crypt_info->ci_ctfm = NULL; + crypt_info->ci_keyring_key = NULL; + memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, + sizeof(crypt_info->ci_master_key)); if (S_ISREG(inode->i_mode)) - crypt_key->mode = ctx.contents_encryption_mode; + mode = crypt_info->ci_data_mode; else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - crypt_key->mode = ctx.filenames_encryption_mode; - else { - printk(KERN_ERR "ext4 crypto: Unsupported inode type.\n"); + mode = crypt_info->ci_filename_mode; + else BUG(); + switch (mode) { + case EXT4_ENCRYPTION_MODE_AES_256_XTS: + cipher_str = "xts(aes)"; + break; + case EXT4_ENCRYPTION_MODE_AES_256_CTS: + cipher_str = "cts(cbc(aes))"; + break; + default: + printk_once(KERN_WARNING + "ext4: unsupported key mode %d (ino %u)\n", + mode, (unsigned) inode->i_ino); + res = -ENOKEY; + goto out; } - crypt_key->size = ext4_encryption_key_size(crypt_key->mode); - BUG_ON(!crypt_key->size); if (DUMMY_ENCRYPTION_ENABLED(sbi)) { - memset(crypt_key->raw, 0x42, EXT4_AES_256_XTS_KEY_SIZE); - goto out; + memset(raw_key, 0x42, EXT4_AES_256_XTS_KEY_SIZE); + goto got_key; } memcpy(full_key_descriptor, EXT4_KEY_DESC_PREFIX, EXT4_KEY_DESC_PREFIX_SIZE); @@ -138,6 +207,7 @@ int ext4_generate_encryption_key(struct inode *inode) keyring_key = NULL; goto out; } + crypt_info->ci_keyring_key = keyring_key; BUG_ON(keyring_key->type != &key_type_logon); ukp = ((struct user_key_payload *)keyring_key->payload.data); if (ukp->datalen != sizeof(struct ext4_encryption_key)) { @@ -148,19 +218,43 @@ int ext4_generate_encryption_key(struct inode *inode) BUILD_BUG_ON(EXT4_AES_128_ECB_KEY_SIZE != EXT4_KEY_DERIVATION_NONCE_SIZE); BUG_ON(master_key->size != EXT4_AES_256_XTS_KEY_SIZE); - res = ext4_derive_key_aes(ctx.nonce, master_key->raw, crypt_key->raw); + res = ext4_derive_key_aes(ctx.nonce, master_key->raw, + raw_key); +got_key: + ctfm = crypto_alloc_ablkcipher(cipher_str, 0, 0); + if (!ctfm || IS_ERR(ctfm)) { + res = ctfm ? PTR_ERR(ctfm) : -ENOMEM; + printk(KERN_DEBUG + "%s: error %d (inode %u) allocating crypto tfm\n", + __func__, res, (unsigned) inode->i_ino); + goto out; + } + crypt_info->ci_ctfm = ctfm; + crypto_ablkcipher_clear_flags(ctfm, ~0); + crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctfm), + CRYPTO_TFM_REQ_WEAK_KEY); + res = crypto_ablkcipher_setkey(ctfm, raw_key, + ext4_encryption_key_size(mode)); + if (res) + goto out; + memzero_explicit(raw_key, sizeof(raw_key)); + if (cmpxchg(&ei->i_crypt_info, NULL, crypt_info) != NULL) { + ext4_free_crypt_info(crypt_info); + goto retry; + } + return 0; + out: - if (keyring_key) - key_put(keyring_key); - if (res < 0) - crypt_key->mode = EXT4_ENCRYPTION_MODE_INVALID; + if (res == -ENOKEY) + res = 0; + ext4_free_crypt_info(crypt_info); + memzero_explicit(raw_key, sizeof(raw_key)); return res; } int ext4_has_encryption_key(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_encryption_key *crypt_key = &ei->i_encryption_key; - return (crypt_key->mode != EXT4_ENCRYPTION_MODE_INVALID); + return (ei->i_crypt_info != NULL); } diff --git a/fs/ext4/crypto_policy.c b/fs/ext4/crypto_policy.c index a6d6291ae..02c4e5df7 100644 --- a/fs/ext4/crypto_policy.c +++ b/fs/ext4/crypto_policy.c @@ -51,6 +51,10 @@ static int ext4_create_encryption_context_from_policy( struct ext4_encryption_context ctx; int res = 0; + res = ext4_convert_inline_data(inode); + if (res) + return res; + ctx.format = EXT4_ENCRYPTION_CONTEXT_FORMAT_V1; memcpy(ctx.master_key_descriptor, policy->master_key_descriptor, EXT4_KEY_DESCRIPTOR_SIZE); @@ -89,6 +93,8 @@ int ext4_process_policy(const struct ext4_encryption_policy *policy, return -EINVAL; if (!ext4_inode_has_encryption_context(inode)) { + if (!S_ISDIR(inode->i_mode)) + return -EINVAL; if (!ext4_empty_dir(inode)) return -ENOTEMPTY; return ext4_create_encryption_context_from_policy(inode, @@ -126,7 +132,7 @@ int ext4_get_policy(struct inode *inode, struct ext4_encryption_policy *policy) int ext4_is_child_context_consistent_with_parent(struct inode *parent, struct inode *child) { - struct ext4_encryption_context parent_ctx, child_ctx; + struct ext4_crypt_info *parent_ci, *child_ci; int res; if ((parent == NULL) || (child == NULL)) { @@ -136,26 +142,28 @@ int ext4_is_child_context_consistent_with_parent(struct inode *parent, /* no restrictions if the parent directory is not encrypted */ if (!ext4_encrypted_inode(parent)) return 1; - res = ext4_xattr_get(parent, EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, - &parent_ctx, sizeof(parent_ctx)); - if (res != sizeof(parent_ctx)) - return 0; /* if the child directory is not encrypted, this is always a problem */ if (!ext4_encrypted_inode(child)) return 0; - res = ext4_xattr_get(child, EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, - &child_ctx, sizeof(child_ctx)); - if (res != sizeof(child_ctx)) + res = ext4_get_encryption_info(parent); + if (res) return 0; - return (memcmp(parent_ctx.master_key_descriptor, - child_ctx.master_key_descriptor, + res = ext4_get_encryption_info(child); + if (res) + return 0; + parent_ci = EXT4_I(parent)->i_crypt_info; + child_ci = EXT4_I(child)->i_crypt_info; + if (!parent_ci && !child_ci) + return 1; + if (!parent_ci || !child_ci) + return 0; + + return (memcmp(parent_ci->ci_master_key, + child_ci->ci_master_key, EXT4_KEY_DESCRIPTOR_SIZE) == 0 && - (parent_ctx.contents_encryption_mode == - child_ctx.contents_encryption_mode) && - (parent_ctx.filenames_encryption_mode == - child_ctx.filenames_encryption_mode)); + (parent_ci->ci_data_mode == child_ci->ci_data_mode) && + (parent_ci->ci_filename_mode == child_ci->ci_filename_mode) && + (parent_ci->ci_flags == child_ci->ci_flags)); } /** @@ -168,31 +176,40 @@ int ext4_is_child_context_consistent_with_parent(struct inode *parent, int ext4_inherit_context(struct inode *parent, struct inode *child) { struct ext4_encryption_context ctx; - int res = ext4_xattr_get(parent, EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, - &ctx, sizeof(ctx)); + struct ext4_crypt_info *ci; + int res; + + res = ext4_get_encryption_info(parent); + if (res < 0) + return res; + ci = EXT4_I(parent)->i_crypt_info; + if (ci == NULL) + return -ENOKEY; - if (res != sizeof(ctx)) { - if (DUMMY_ENCRYPTION_ENABLED(EXT4_SB(parent->i_sb))) { - ctx.format = EXT4_ENCRYPTION_CONTEXT_FORMAT_V1; - ctx.contents_encryption_mode = - EXT4_ENCRYPTION_MODE_AES_256_XTS; - ctx.filenames_encryption_mode = - EXT4_ENCRYPTION_MODE_AES_256_CTS; - ctx.flags = 0; - memset(ctx.master_key_descriptor, 0x42, - EXT4_KEY_DESCRIPTOR_SIZE); - res = 0; - } else { - goto out; - } + ctx.format = EXT4_ENCRYPTION_CONTEXT_FORMAT_V1; + if (DUMMY_ENCRYPTION_ENABLED(EXT4_SB(parent->i_sb))) { + ctx.contents_encryption_mode = EXT4_ENCRYPTION_MODE_AES_256_XTS; + ctx.filenames_encryption_mode = + EXT4_ENCRYPTION_MODE_AES_256_CTS; + ctx.flags = 0; + memset(ctx.master_key_descriptor, 0x42, + EXT4_KEY_DESCRIPTOR_SIZE); + res = 0; + } else { + ctx.contents_encryption_mode = ci->ci_data_mode; + ctx.filenames_encryption_mode = ci->ci_filename_mode; + ctx.flags = ci->ci_flags; + memcpy(ctx.master_key_descriptor, ci->ci_master_key, + EXT4_KEY_DESCRIPTOR_SIZE); } get_random_bytes(ctx.nonce, EXT4_KEY_DERIVATION_NONCE_SIZE); res = ext4_xattr_set(child, EXT4_XATTR_INDEX_ENCRYPTION, EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, sizeof(ctx), 0); -out: - if (!res) + if (!res) { ext4_set_inode_flag(child, EXT4_INODE_ENCRYPT); + ext4_clear_inode_state(child, EXT4_STATE_MAY_INLINE_DATA); + res = ext4_get_encryption_info(child); + } return res; } diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 5665d82d2..f9e149119 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -110,7 +110,6 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) struct super_block *sb = inode->i_sb; struct buffer_head *bh = NULL; int dir_has_error = 0; - struct ext4_fname_crypto_ctx *enc_ctx = NULL; struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}; if (is_dx_dir(inode)) { @@ -134,16 +133,11 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) return err; } - enc_ctx = ext4_get_fname_crypto_ctx(inode, EXT4_NAME_LEN); - if (IS_ERR(enc_ctx)) - return PTR_ERR(enc_ctx); - if (enc_ctx) { - err = ext4_fname_crypto_alloc_buffer(enc_ctx, EXT4_NAME_LEN, + if (ext4_encrypted_inode(inode)) { + err = ext4_fname_crypto_alloc_buffer(inode, EXT4_NAME_LEN, &fname_crypto_str); - if (err < 0) { - ext4_put_fname_crypto_ctx(&enc_ctx); + if (err < 0) return err; - } } offset = ctx->pos & (sb->s_blocksize - 1); @@ -239,17 +233,19 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); if (le32_to_cpu(de->inode)) { - if (enc_ctx == NULL) { - /* Directory is not encrypted */ + if (!ext4_encrypted_inode(inode)) { if (!dir_emit(ctx, de->name, de->name_len, le32_to_cpu(de->inode), get_dtype(sb, de->file_type))) goto done; } else { + int save_len = fname_crypto_str.len; + /* Directory is encrypted */ - err = ext4_fname_disk_to_usr(enc_ctx, + err = ext4_fname_disk_to_usr(inode, NULL, de, &fname_crypto_str); + fname_crypto_str.len = save_len; if (err < 0) goto errout; if (!dir_emit(ctx, @@ -272,7 +268,6 @@ done: err = 0; errout: #ifdef CONFIG_EXT4_FS_ENCRYPTION - ext4_put_fname_crypto_ctx(&enc_ctx); ext4_fname_crypto_free_buffer(&fname_crypto_str); #endif brelse(bh); @@ -598,6 +593,13 @@ finished: return 0; } +static int ext4_dir_open(struct inode * inode, struct file * filp) +{ + if (ext4_encrypted_inode(inode)) + return ext4_get_encryption_info(inode) ? -EACCES : 0; + return 0; +} + static int ext4_release_dir(struct inode *inode, struct file *filp) { if (filp->private_data) @@ -640,5 +642,6 @@ const struct file_operations ext4_dir_operations = { .compat_ioctl = ext4_compat_ioctl, #endif .fsync = ext4_sync_file, + .open = ext4_dir_open, .release = ext4_release_dir, }; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 9a83f149a..f5e9f0422 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -69,15 +69,6 @@ #define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif -#define EXT4_ERROR_INODE(inode, fmt, a...) \ - ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a) - -#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \ - ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a) - -#define EXT4_ERROR_FILE(file, block, fmt, a...) \ - ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a) - /* data type for block offset of block group */ typedef int ext4_grpblk_t; @@ -90,6 +81,11 @@ typedef __u32 ext4_lblk_t; /* data type for block group number */ typedef unsigned int ext4_group_t; +enum SHIFT_DIRECTION { + SHIFT_LEFT = 0, + SHIFT_RIGHT, +}; + /* * Flags used in mballoc's allocation_context flags field. * @@ -911,7 +907,6 @@ struct ext4_inode_info { /* on-disk additional length */ __u16 i_extra_isize; - char i_crypt_policy_flags; /* Indicate the inline data space. */ u16 i_inline_off; @@ -955,7 +950,7 @@ struct ext4_inode_info { #ifdef CONFIG_EXT4_FS_ENCRYPTION /* Encryption params */ - struct ext4_encryption_key i_encryption_key; + struct ext4_crypt_info *i_crypt_info; #endif }; @@ -1374,12 +1369,6 @@ struct ext4_sb_info { struct ratelimit_state s_err_ratelimit_state; struct ratelimit_state s_warning_ratelimit_state; struct ratelimit_state s_msg_ratelimit_state; - -#ifdef CONFIG_EXT4_FS_ENCRYPTION - /* Encryption */ - uint32_t s_file_encryption_mode; - uint32_t s_dir_encryption_mode; -#endif }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -1838,6 +1827,17 @@ struct dx_hash_info */ #define HASH_NB_ALWAYS 1 +struct ext4_filename { + const struct qstr *usr_fname; + struct ext4_str disk_name; + struct dx_hash_info hinfo; +#ifdef CONFIG_EXT4_FS_ENCRYPTION + struct ext4_str crypto_buf; +#endif +}; + +#define fname_name(p) ((p)->disk_name.name) +#define fname_len(p) ((p)->disk_name.len) /* * Describe an inode's exact location on disk and in memory @@ -2054,6 +2054,7 @@ int ext4_get_policy(struct inode *inode, struct ext4_encryption_policy *policy); /* crypto.c */ +extern struct kmem_cache *ext4_crypt_info_cachep; bool ext4_valid_contents_enc_mode(uint32_t mode); uint32_t ext4_validate_encryption_key_size(uint32_t mode, uint32_t size); extern struct workqueue_struct *ext4_read_workqueue; @@ -2085,57 +2086,84 @@ static inline int ext4_sb_has_crypto(struct super_block *sb) /* crypto_fname.c */ bool ext4_valid_filenames_enc_mode(uint32_t mode); u32 ext4_fname_crypto_round_up(u32 size, u32 blksize); -int ext4_fname_crypto_alloc_buffer(struct ext4_fname_crypto_ctx *ctx, +unsigned ext4_fname_encrypted_size(struct inode *inode, u32 ilen); +int ext4_fname_crypto_alloc_buffer(struct inode *inode, u32 ilen, struct ext4_str *crypto_str); -int _ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, +int _ext4_fname_disk_to_usr(struct inode *inode, struct dx_hash_info *hinfo, const struct ext4_str *iname, struct ext4_str *oname); -int ext4_fname_disk_to_usr(struct ext4_fname_crypto_ctx *ctx, +int ext4_fname_disk_to_usr(struct inode *inode, struct dx_hash_info *hinfo, const struct ext4_dir_entry_2 *de, struct ext4_str *oname); -int ext4_fname_usr_to_disk(struct ext4_fname_crypto_ctx *ctx, +int ext4_fname_usr_to_disk(struct inode *inode, const struct qstr *iname, struct ext4_str *oname); -int ext4_fname_usr_to_hash(struct ext4_fname_crypto_ctx *ctx, - const struct qstr *iname, - struct dx_hash_info *hinfo); -int ext4_fname_crypto_namelen_on_disk(struct ext4_fname_crypto_ctx *ctx, - u32 namelen); -int ext4_fname_match(struct ext4_fname_crypto_ctx *ctx, struct ext4_str *cstr, - int len, const char * const name, - struct ext4_dir_entry_2 *de); - - #ifdef CONFIG_EXT4_FS_ENCRYPTION -void ext4_put_fname_crypto_ctx(struct ext4_fname_crypto_ctx **ctx); -struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx(struct inode *inode, - u32 max_len); void ext4_fname_crypto_free_buffer(struct ext4_str *crypto_str); +int ext4_fname_setup_filename(struct inode *dir, const struct qstr *iname, + int lookup, struct ext4_filename *fname); +void ext4_fname_free_filename(struct ext4_filename *fname); #else static inline -void ext4_put_fname_crypto_ctx(struct ext4_fname_crypto_ctx **ctx) { } -static inline -struct ext4_fname_crypto_ctx *ext4_get_fname_crypto_ctx(struct inode *inode, - u32 max_len) +int ext4_setup_fname_crypto(struct inode *inode) { - return NULL; + return 0; } static inline void ext4_fname_crypto_free_buffer(struct ext4_str *p) { } +static inline int ext4_fname_setup_filename(struct inode *dir, + const struct qstr *iname, + int lookup, struct ext4_filename *fname) +{ + fname->usr_fname = iname; + fname->disk_name.name = (unsigned char *) iname->name; + fname->disk_name.len = iname->len; + return 0; +} +static inline void ext4_fname_free_filename(struct ext4_filename *fname) { } #endif /* crypto_key.c */ -int ext4_generate_encryption_key(struct inode *inode); +void ext4_free_crypt_info(struct ext4_crypt_info *ci); +void ext4_free_encryption_info(struct inode *inode, struct ext4_crypt_info *ci); +int _ext4_get_encryption_info(struct inode *inode); #ifdef CONFIG_EXT4_FS_ENCRYPTION int ext4_has_encryption_key(struct inode *inode); + +static inline int ext4_get_encryption_info(struct inode *inode) +{ + struct ext4_crypt_info *ci = EXT4_I(inode)->i_crypt_info; + + if (!ci || + (ci->ci_keyring_key && + (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | + (1 << KEY_FLAG_REVOKED) | + (1 << KEY_FLAG_DEAD))))) + return _ext4_get_encryption_info(inode); + return 0; +} + +static inline struct ext4_crypt_info *ext4_encryption_info(struct inode *inode) +{ + return EXT4_I(inode)->i_crypt_info; +} + #else static inline int ext4_has_encryption_key(struct inode *inode) { return 0; } +static inline int ext4_get_encryption_info(struct inode *inode) +{ + return 0; +} +static inline struct ext4_crypt_info *ext4_encryption_info(struct inode *inode) +{ + return NULL; +} #endif @@ -2156,14 +2184,13 @@ extern void ext4_htree_free_dir_info(struct dir_private_info *p); extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, struct buffer_head *bh, void *buf, int buf_size, - const char *name, int namelen, + struct ext4_filename *fname, struct ext4_dir_entry_2 **dest_de); int ext4_insert_dentry(struct inode *dir, - struct inode *inode, - struct ext4_dir_entry_2 *de, - int buf_size, - const struct qstr *iname, - const char *name, int namelen); + struct inode *inode, + struct ext4_dir_entry_2 *de, + int buf_size, + struct ext4_filename *fname); static inline void ext4_update_dx_flag(struct inode *inode) { if (!EXT4_HAS_COMPAT_FEATURE(inode->i_sb, @@ -2317,13 +2344,14 @@ extern int ext4_orphan_add(handle_t *, struct inode *); extern int ext4_orphan_del(handle_t *, struct inode *); extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, __u32 start_minor_hash, __u32 *next_hash); -extern int search_dir(struct buffer_head *bh, - char *search_buf, - int buf_size, - struct inode *dir, - const struct qstr *d_name, - unsigned int offset, - struct ext4_dir_entry_2 **res_dir); +extern int ext4_search_dir(struct buffer_head *bh, + char *search_buf, + int buf_size, + struct inode *dir, + struct ext4_filename *fname, + const struct qstr *d_name, + unsigned int offset, + struct ext4_dir_entry_2 **res_dir); extern int ext4_generic_delete_entry(handle_t *handle, struct inode *dir, struct ext4_dir_entry_2 *de_del, @@ -2368,6 +2396,9 @@ void __ext4_abort(struct super_block *, const char *, unsigned int, extern __printf(4, 5) void __ext4_warning(struct super_block *, const char *, unsigned int, const char *, ...); +extern __printf(4, 5) +void __ext4_warning_inode(const struct inode *inode, const char *function, + unsigned int line, const char *fmt, ...); extern __printf(3, 4) void __ext4_msg(struct super_block *, const char *, const char *, ...); extern void __dump_mmp_msg(struct super_block *, struct mmp_struct *mmp, @@ -2378,6 +2409,15 @@ void __ext4_grp_locked_error(const char *, unsigned int, unsigned long, ext4_fsblk_t, const char *, ...); +#define EXT4_ERROR_INODE(inode, fmt, a...) \ + ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a) + +#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \ + ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a) + +#define EXT4_ERROR_FILE(file, block, fmt, a...) \ + ext4_error_file((file), __func__, __LINE__, (block), (fmt), ## a) + #ifdef CONFIG_PRINTK #define ext4_error_inode(inode, func, line, block, fmt, ...) \ @@ -2390,6 +2430,8 @@ void __ext4_grp_locked_error(const char *, unsigned int, __ext4_abort(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) #define ext4_warning(sb, fmt, ...) \ __ext4_warning(sb, __func__, __LINE__, fmt, ##__VA_ARGS__) +#define ext4_warning_inode(inode, fmt, ...) \ + __ext4_warning_inode(inode, __func__, __LINE__, fmt, ##__VA_ARGS__) #define ext4_msg(sb, level, fmt, ...) \ __ext4_msg(sb, level, fmt, ##__VA_ARGS__) #define dump_mmp_msg(sb, mmp, msg) \ @@ -2425,6 +2467,11 @@ do { \ no_printk(fmt, ##__VA_ARGS__); \ __ext4_warning(sb, "", 0, " "); \ } while (0) +#define ext4_warning_inode(inode, fmt, ...) \ +do { \ + no_printk(fmt, ##__VA_ARGS__); \ + __ext4_warning_inode(inode, "", 0, " "); \ +} while (0) #define ext4_msg(sb, level, fmt, ...) \ do { \ no_printk(fmt, ##__VA_ARGS__); \ @@ -2768,7 +2815,9 @@ extern int ext4_da_write_inline_data_begin(struct address_space *mapping, extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct page *page); -extern int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, +extern int ext4_try_add_inline_entry(handle_t *handle, + struct ext4_filename *fname, + struct dentry *dentry, struct inode *inode); extern int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent, @@ -2782,6 +2831,7 @@ extern int htree_inlinedir_to_tree(struct file *dir_file, __u32 start_hash, __u32 start_minor_hash, int *has_inline_data); extern struct buffer_head *ext4_find_inline_entry(struct inode *dir, + struct ext4_filename *fname, const struct qstr *d_name, struct ext4_dir_entry_2 **res_dir, int *has_inline_data); @@ -2847,6 +2897,7 @@ extern int ext4_mpage_readpages(struct address_space *mapping, unsigned nr_pages); /* symlink.c */ +extern const struct inode_operations ext4_encrypted_symlink_inode_operations; extern const struct inode_operations ext4_symlink_inode_operations; extern const struct inode_operations ext4_fast_symlink_inode_operations; @@ -2912,6 +2963,7 @@ extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); extern int ext4_ext_precache(struct inode *inode); extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); +extern int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len); extern int ext4_swap_extents(handle_t *handle, struct inode *inode1, struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2, ext4_lblk_t count, diff --git a/fs/ext4/ext4_crypto.h b/fs/ext4/ext4_crypto.h index d75159c10..ac7d4e813 100644 --- a/fs/ext4/ext4_crypto.h +++ b/fs/ext4/ext4_crypto.h @@ -66,24 +66,39 @@ struct ext4_encryption_context { #define EXT4_KEY_DESC_PREFIX "ext4:" #define EXT4_KEY_DESC_PREFIX_SIZE 5 +/* This is passed in from userspace into the kernel keyring */ struct ext4_encryption_key { - uint32_t mode; - char raw[EXT4_MAX_KEY_SIZE]; - uint32_t size; + __u32 mode; + char raw[EXT4_MAX_KEY_SIZE]; + __u32 size; +} __attribute__((__packed__)); + +struct ext4_crypt_info { + char ci_data_mode; + char ci_filename_mode; + char ci_flags; + struct crypto_ablkcipher *ci_ctfm; + struct key *ci_keyring_key; + char ci_master_key[EXT4_KEY_DESCRIPTOR_SIZE]; }; #define EXT4_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 -#define EXT4_BOUNCE_PAGE_REQUIRES_FREE_ENCRYPT_FL 0x00000002 +#define EXT4_WRITE_PATH_FL 0x00000002 struct ext4_crypto_ctx { - struct crypto_tfm *tfm; /* Crypto API context */ - struct page *bounce_page; /* Ciphertext page on write path */ - struct page *control_page; /* Original page on write path */ - struct bio *bio; /* The bio for this context */ - struct work_struct work; /* Work queue for read complete path */ - struct list_head free_list; /* Free list */ - int flags; /* Flags */ - int mode; /* Encryption mode for tfm */ + union { + struct { + struct page *bounce_page; /* Ciphertext page */ + struct page *control_page; /* Original page */ + } w; + struct { + struct bio *bio; + struct work_struct work; + } r; + struct list_head free_list; /* Free list */ + }; + char flags; /* Flags */ + char mode; /* Encryption mode for tfm */ }; struct ext4_completion_result { @@ -121,18 +136,6 @@ struct ext4_str { u32 len; }; -struct ext4_fname_crypto_ctx { - u32 lim; - char tmp_buf[EXT4_CRYPTO_BLOCK_SIZE]; - struct crypto_ablkcipher *ctfm; - struct crypto_hash *htfm; - struct page *workpage; - struct ext4_encryption_key key; - unsigned flags : 8; - unsigned has_valid_key : 1; - unsigned ctfm_key_is_ready : 1; -}; - /** * For encrypted symlinks, the ciphertext length is stored at the beginning * of the string in little-endian format. diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 87ba10d1d..2553aa8b6 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -39,6 +39,7 @@ #include <linux/slab.h> #include <asm/uaccess.h> #include <linux/fiemap.h> +#include <linux/backing-dev.h> #include "ext4_jbd2.h" #include "ext4_extents.h" #include "xattr.h" @@ -4456,6 +4457,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ar.flags |= EXT4_MB_HINT_NOPREALLOC; if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) ar.flags |= EXT4_MB_DELALLOC_RESERVED; + if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) + ar.flags |= EXT4_MB_USE_RESERVED; newblock = ext4_mb_new_blocks(handle, &ar, &err); if (!newblock) goto out2; @@ -4663,6 +4666,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, int ret = 0; int ret2 = 0; int retries = 0; + int depth = 0; struct ext4_map_blocks map; unsigned int credits; loff_t epos; @@ -4677,13 +4681,32 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, if (len <= EXT_UNWRITTEN_MAX_LEN) flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + /* * credits to insert 1 extent into extent tree */ credits = ext4_chunk_trans_blocks(inode, len); + /* + * We can only call ext_depth() on extent based inodes + */ + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + depth = ext_depth(inode); + else + depth = -1; retry: while (ret >= 0 && len) { + /* + * Recalculate credits when extent tree depth changes. + */ + if (depth >= 0 && depth != ext_depth(inode)) { + credits = ext4_chunk_trans_blocks(inode, len); + depth = ext_depth(inode); + } + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); if (IS_ERR(handle)) { @@ -4725,6 +4748,8 @@ retry: goto retry; } + ext4_inode_resume_unlocked_dio(inode); + return ret > 0 ? ret2 : ret; } @@ -4912,12 +4937,14 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) * bug we should fix.... */ if (ext4_encrypted_inode(inode) && - (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))) + (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE | + FALLOC_FL_ZERO_RANGE))) return -EOPNOTSUPP; /* Return error if mode is not supported */ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | - FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE)) + FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | + FALLOC_FL_INSERT_RANGE)) return -EOPNOTSUPP; if (mode & FALLOC_FL_PUNCH_HOLE) @@ -4930,6 +4957,9 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (mode & FALLOC_FL_COLLAPSE_RANGE) return ext4_collapse_range(inode, offset, len); + if (mode & FALLOC_FL_INSERT_RANGE) + return ext4_insert_range(inode, offset, len); + if (mode & FALLOC_FL_ZERO_RANGE) return ext4_zero_range(file, offset, len, mode); @@ -5224,13 +5254,13 @@ ext4_access_path(handle_t *handle, struct inode *inode, /* * ext4_ext_shift_path_extents: * Shift the extents of a path structure lying between path[depth].p_ext - * and EXT_LAST_EXTENT(path[depth].p_hdr) downwards, by subtracting shift - * from starting block for each extent. + * and EXT_LAST_EXTENT(path[depth].p_hdr), by @shift blocks. @SHIFT tells + * if it is right shift or left shift operation. */ static int ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, struct inode *inode, handle_t *handle, - ext4_lblk_t *start) + enum SHIFT_DIRECTION SHIFT) { int depth, err = 0; struct ext4_extent *ex_start, *ex_last; @@ -5252,19 +5282,25 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, if (ex_start == EXT_FIRST_EXTENT(path[depth].p_hdr)) update = 1; - *start = le32_to_cpu(ex_last->ee_block) + - ext4_ext_get_actual_len(ex_last); - while (ex_start <= ex_last) { - le32_add_cpu(&ex_start->ee_block, -shift); - /* Try to merge to the left. */ - if ((ex_start > - EXT_FIRST_EXTENT(path[depth].p_hdr)) && - ext4_ext_try_to_merge_right(inode, - path, ex_start - 1)) + if (SHIFT == SHIFT_LEFT) { + le32_add_cpu(&ex_start->ee_block, + -shift); + /* Try to merge to the left. */ + if ((ex_start > + EXT_FIRST_EXTENT(path[depth].p_hdr)) + && + ext4_ext_try_to_merge_right(inode, + path, ex_start - 1)) + ex_last--; + else + ex_start++; + } else { + le32_add_cpu(&ex_last->ee_block, shift); + ext4_ext_try_to_merge_right(inode, path, + ex_last); ex_last--; - else - ex_start++; + } } err = ext4_ext_dirty(handle, inode, path + depth); if (err) @@ -5279,7 +5315,10 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, if (err) goto out; - le32_add_cpu(&path[depth].p_idx->ei_block, -shift); + if (SHIFT == SHIFT_LEFT) + le32_add_cpu(&path[depth].p_idx->ei_block, -shift); + else + le32_add_cpu(&path[depth].p_idx->ei_block, shift); err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto out; @@ -5297,19 +5336,20 @@ out: /* * ext4_ext_shift_extents: - * All the extents which lies in the range from start to the last allocated - * block for the file are shifted downwards by shift blocks. + * All the extents which lies in the range from @start to the last allocated + * block for the @inode are shifted either towards left or right (depending + * upon @SHIFT) by @shift blocks. * On success, 0 is returned, error otherwise. */ static int ext4_ext_shift_extents(struct inode *inode, handle_t *handle, - ext4_lblk_t start, ext4_lblk_t shift) + ext4_lblk_t start, ext4_lblk_t shift, + enum SHIFT_DIRECTION SHIFT) { struct ext4_ext_path *path; int ret = 0, depth; struct ext4_extent *extent; - ext4_lblk_t stop_block; - ext4_lblk_t ex_start, ex_end; + ext4_lblk_t stop, *iterator, ex_start, ex_end; /* Let path point to the last extent */ path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0); @@ -5321,58 +5361,84 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, if (!extent) goto out; - stop_block = le32_to_cpu(extent->ee_block) + + stop = le32_to_cpu(extent->ee_block) + ext4_ext_get_actual_len(extent); - /* Nothing to shift, if hole is at the end of file */ - if (start >= stop_block) - goto out; + /* + * In case of left shift, Don't start shifting extents until we make + * sure the hole is big enough to accommodate the shift. + */ + if (SHIFT == SHIFT_LEFT) { + path = ext4_find_extent(inode, start - 1, &path, 0); + if (IS_ERR(path)) + return PTR_ERR(path); + depth = path->p_depth; + extent = path[depth].p_ext; + if (extent) { + ex_start = le32_to_cpu(extent->ee_block); + ex_end = le32_to_cpu(extent->ee_block) + + ext4_ext_get_actual_len(extent); + } else { + ex_start = 0; + ex_end = 0; + } - /* - * Don't start shifting extents until we make sure the hole is big - * enough to accomodate the shift. - */ - path = ext4_find_extent(inode, start - 1, &path, 0); - if (IS_ERR(path)) - return PTR_ERR(path); - depth = path->p_depth; - extent = path[depth].p_ext; - if (extent) { - ex_start = le32_to_cpu(extent->ee_block); - ex_end = le32_to_cpu(extent->ee_block) + - ext4_ext_get_actual_len(extent); - } else { - ex_start = 0; - ex_end = 0; + if ((start == ex_start && shift > ex_start) || + (shift > start - ex_end)) { + ext4_ext_drop_refs(path); + kfree(path); + return -EINVAL; + } } - if ((start == ex_start && shift > ex_start) || - (shift > start - ex_end)) - return -EINVAL; + /* + * In case of left shift, iterator points to start and it is increased + * till we reach stop. In case of right shift, iterator points to stop + * and it is decreased till we reach start. + */ + if (SHIFT == SHIFT_LEFT) + iterator = &start; + else + iterator = &stop; /* Its safe to start updating extents */ - while (start < stop_block) { - path = ext4_find_extent(inode, start, &path, 0); + while (start < stop) { + path = ext4_find_extent(inode, *iterator, &path, 0); if (IS_ERR(path)) return PTR_ERR(path); depth = path->p_depth; extent = path[depth].p_ext; if (!extent) { EXT4_ERROR_INODE(inode, "unexpected hole at %lu", - (unsigned long) start); + (unsigned long) *iterator); return -EIO; } - if (start > le32_to_cpu(extent->ee_block)) { + if (SHIFT == SHIFT_LEFT && *iterator > + le32_to_cpu(extent->ee_block)) { /* Hole, move to the next extent */ if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) { path[depth].p_ext++; } else { - start = ext4_ext_next_allocated_block(path); + *iterator = ext4_ext_next_allocated_block(path); continue; } } + + if (SHIFT == SHIFT_LEFT) { + extent = EXT_LAST_EXTENT(path[depth].p_hdr); + *iterator = le32_to_cpu(extent->ee_block) + + ext4_ext_get_actual_len(extent); + } else { + extent = EXT_FIRST_EXTENT(path[depth].p_hdr); + *iterator = le32_to_cpu(extent->ee_block) > 0 ? + le32_to_cpu(extent->ee_block) - 1 : 0; + /* Update path extent in case we need to stop */ + while (le32_to_cpu(extent->ee_block) < start) + extent++; + path[depth].p_ext = extent; + } ret = ext4_ext_shift_path_extents(path, shift, inode, - handle, &start); + handle, SHIFT); if (ret) break; } @@ -5485,7 +5551,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) ext4_discard_preallocations(inode); ret = ext4_ext_shift_extents(inode, handle, punch_stop, - punch_stop - punch_start); + punch_stop - punch_start, SHIFT_LEFT); if (ret) { up_write(&EXT4_I(inode)->i_data_sem); goto out_stop; @@ -5510,6 +5576,174 @@ out_mutex: return ret; } +/* + * ext4_insert_range: + * This function implements the FALLOC_FL_INSERT_RANGE flag of fallocate. + * The data blocks starting from @offset to the EOF are shifted by @len + * towards right to create a hole in the @inode. Inode size is increased + * by len bytes. + * Returns 0 on success, error otherwise. + */ +int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) +{ + struct super_block *sb = inode->i_sb; + handle_t *handle; + struct ext4_ext_path *path; + struct ext4_extent *extent; + ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0; + unsigned int credits, ee_len; + int ret = 0, depth, split_flag = 0; + loff_t ioffset; + + /* + * We need to test this early because xfstests assumes that an + * insert range of (0, 1) will return EOPNOTSUPP if the file + * system does not support insert range. + */ + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + return -EOPNOTSUPP; + + /* Insert range works only on fs block size aligned offsets. */ + if (offset & (EXT4_CLUSTER_SIZE(sb) - 1) || + len & (EXT4_CLUSTER_SIZE(sb) - 1)) + return -EINVAL; + + if (!S_ISREG(inode->i_mode)) + return -EOPNOTSUPP; + + trace_ext4_insert_range(inode, offset, len); + + offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb); + len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb); + + /* Call ext4_force_commit to flush all data in case of data=journal */ + if (ext4_should_journal_data(inode)) { + ret = ext4_force_commit(inode->i_sb); + if (ret) + return ret; + } + + /* + * Need to round down to align start offset to page size boundary + * for page size > block size. + */ + ioffset = round_down(offset, PAGE_SIZE); + + /* Write out all dirty pages */ + ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, + LLONG_MAX); + if (ret) + return ret; + + /* Take mutex lock */ + mutex_lock(&inode->i_mutex); + + /* Currently just for extent based files */ + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + ret = -EOPNOTSUPP; + goto out_mutex; + } + + /* Check for wrap through zero */ + if (inode->i_size + len > inode->i_sb->s_maxbytes) { + ret = -EFBIG; + goto out_mutex; + } + + /* Offset should be less than i_size */ + if (offset >= i_size_read(inode)) { + ret = -EINVAL; + goto out_mutex; + } + + truncate_pagecache(inode, ioffset); + + /* Wait for existing dio to complete */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + + credits = ext4_writepage_trans_blocks(inode); + handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out_dio; + } + + /* Expand file to avoid data loss if there is error while shifting */ + inode->i_size += len; + EXT4_I(inode)->i_disksize += len; + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ret = ext4_mark_inode_dirty(handle, inode); + if (ret) + goto out_stop; + + down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); + + path = ext4_find_extent(inode, offset_lblk, NULL, 0); + if (IS_ERR(path)) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + + depth = ext_depth(inode); + extent = path[depth].p_ext; + if (extent) { + ee_start_lblk = le32_to_cpu(extent->ee_block); + ee_len = ext4_ext_get_actual_len(extent); + + /* + * If offset_lblk is not the starting block of extent, split + * the extent @offset_lblk + */ + if ((offset_lblk > ee_start_lblk) && + (offset_lblk < (ee_start_lblk + ee_len))) { + if (ext4_ext_is_unwritten(extent)) + split_flag = EXT4_EXT_MARK_UNWRIT1 | + EXT4_EXT_MARK_UNWRIT2; + ret = ext4_split_extent_at(handle, inode, &path, + offset_lblk, split_flag, + EXT4_EX_NOCACHE | + EXT4_GET_BLOCKS_PRE_IO | + EXT4_GET_BLOCKS_METADATA_NOFAIL); + } + + ext4_ext_drop_refs(path); + kfree(path); + if (ret < 0) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + } + + ret = ext4_es_remove_extent(inode, offset_lblk, + EXT_MAX_BLOCKS - offset_lblk); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_stop; + } + + /* + * if offset_lblk lies in a hole which is at start of file, use + * ee_start_lblk to shift extents + */ + ret = ext4_ext_shift_extents(inode, handle, + ee_start_lblk > offset_lblk ? ee_start_lblk : offset_lblk, + len_lblk, SHIFT_RIGHT); + + up_write(&EXT4_I(inode)->i_data_sem); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + +out_stop: + ext4_journal_stop(handle); +out_dio: + ext4_inode_resume_unlocked_dio(inode); +out_mutex: + mutex_unlock(&inode->i_mutex); + return ret; +} + /** * ext4_swap_extents - Swap extents between two inodes * @@ -5542,7 +5776,7 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem)); BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem)); BUG_ON(!mutex_is_locked(&inode1->i_mutex)); - BUG_ON(!mutex_is_locked(&inode1->i_mutex)); + BUG_ON(!mutex_is_locked(&inode2->i_mutex)); *erp = ext4_es_remove_extent(inode1, lblk1, count); if (unlikely(*erp)) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 0613c256c..bc313ac5d 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -192,15 +192,27 @@ out: } #ifdef CONFIG_FS_DAX +static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate) +{ + struct inode *inode = bh->b_assoc_map->host; + /* XXX: breaks on 32-bit > 16GB. Is that even supported? */ + loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits; + int err; + if (!uptodate) + return; + WARN_ON(!buffer_unwritten(bh)); + err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size); +} + static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { - return dax_fault(vma, vmf, ext4_get_block); + return dax_fault(vma, vmf, ext4_get_block, ext4_end_io_unwritten); /* Is this the right get_block? */ } static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { - return dax_mkwrite(vma, vmf, ext4_get_block); + return dax_mkwrite(vma, vmf, ext4_get_block, ext4_end_io_unwritten); } static const struct vm_operations_struct ext4_dax_vm_ops = { @@ -223,9 +235,11 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) struct inode *inode = file->f_mapping->host; if (ext4_encrypted_inode(inode)) { - int err = ext4_generate_encryption_key(inode); + int err = ext4_get_encryption_info(inode); if (err) return 0; + if (ext4_encryption_info(inode) == NULL) + return -ENOKEY; } file_accessed(file); if (IS_DAX(file_inode(file))) { @@ -278,6 +292,13 @@ static int ext4_file_open(struct inode * inode, struct file * filp) ext4_journal_stop(handle); } } + if (ext4_encrypted_inode(inode)) { + ret = ext4_get_encryption_info(inode); + if (ret) + return -EACCES; + if (ext4_encryption_info(inode) == NULL) + return -ENOKEY; + } /* * Set up the jbd2_inode if we are opening the inode for * writing and the journal is present @@ -287,13 +308,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp) if (ret < 0) return ret; } - ret = dquot_file_open(inode, filp); - if (!ret && ext4_encrypted_inode(inode)) { - ret = ext4_generate_encryption_key(inode); - if (ret) - ret = -EACCES; - } - return ret; + return dquot_file_open(inode, filp); } /* diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 1eaa6cb96..173c1ae21 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -726,11 +726,25 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, ext4_group_t i; ext4_group_t flex_group; struct ext4_group_info *grp; + int encrypt = 0; /* Cannot create files in a deleted directory */ if (!dir || !dir->i_nlink) return ERR_PTR(-EPERM); + if ((ext4_encrypted_inode(dir) || + DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))) && + (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) { + err = ext4_get_encryption_info(dir); + if (err) + return ERR_PTR(err); + if (ext4_encryption_info(dir) == NULL) + return ERR_PTR(-EPERM); + if (!handle) + nblocks += EXT4_DATA_TRANS_BLOCKS(dir->i_sb); + encrypt = 1; + } + sb = dir->i_sb; ngroups = ext4_get_groups_count(sb); trace_ext4_request_inode(dir, mode); @@ -996,12 +1010,6 @@ got: ei->i_block_group = group; ei->i_last_alloc_group = ~0; - /* If the directory encrypted, then we should encrypt the inode. */ - if ((S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) && - (ext4_encrypted_inode(dir) || - DUMMY_ENCRYPTION_ENABLED(sbi))) - ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); - ext4_set_inode_flags(inode); if (IS_DIRSYNC(inode)) ext4_handle_sync(handle); @@ -1034,28 +1042,9 @@ got: ext4_set_inode_state(inode, EXT4_STATE_NEW); ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; -#ifdef CONFIG_EXT4_FS_ENCRYPTION - if ((sbi->s_file_encryption_mode == EXT4_ENCRYPTION_MODE_INVALID) && - (sbi->s_dir_encryption_mode == EXT4_ENCRYPTION_MODE_INVALID)) { - ei->i_inline_off = 0; - if (EXT4_HAS_INCOMPAT_FEATURE(sb, - EXT4_FEATURE_INCOMPAT_INLINE_DATA)) - ext4_set_inode_state(inode, - EXT4_STATE_MAY_INLINE_DATA); - } else { - /* Inline data and encryption are incompatible - * We turn off inline data since encryption is enabled */ - ei->i_inline_off = 1; - if (EXT4_HAS_INCOMPAT_FEATURE(sb, - EXT4_FEATURE_INCOMPAT_INLINE_DATA)) - ext4_clear_inode_state(inode, - EXT4_STATE_MAY_INLINE_DATA); - } -#else ei->i_inline_off = 0; if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_INLINE_DATA)) ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); -#endif ret = inode; err = dquot_alloc_inode(inode); if (err) @@ -1082,6 +1071,12 @@ got: ei->i_datasync_tid = handle->h_transaction->t_tid; } + if (encrypt) { + err = ext4_inherit_context(dir, inode); + if (err) + goto fail_free_drop; + } + err = ext4_mark_inode_dirty(handle, inode); if (err) { ext4_std_error(sb, err); diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 94ae6874c..4f6ac499f 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -576,6 +576,8 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, ar.flags = EXT4_MB_HINT_DATA; if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) ar.flags |= EXT4_MB_DELALLOC_RESERVED; + if (flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) + ar.flags |= EXT4_MB_USE_RESERVED; ar.goal = ext4_find_goal(inode, map->m_lblk, partial); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 095c7a258..cd944a7a9 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -995,20 +995,18 @@ void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh, * and -EEXIST if directory entry already exists. */ static int ext4_add_dirent_to_inline(handle_t *handle, + struct ext4_filename *fname, struct dentry *dentry, struct inode *inode, struct ext4_iloc *iloc, void *inline_start, int inline_size) { struct inode *dir = d_inode(dentry->d_parent); - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; int err; struct ext4_dir_entry_2 *de; - err = ext4_find_dest_de(dir, inode, iloc->bh, - inline_start, inline_size, - name, namelen, &de); + err = ext4_find_dest_de(dir, inode, iloc->bh, inline_start, + inline_size, fname, &de); if (err) return err; @@ -1016,8 +1014,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, err = ext4_journal_get_write_access(handle, iloc->bh); if (err) return err; - ext4_insert_dentry(dir, inode, de, inline_size, &dentry->d_name, - name, namelen); + ext4_insert_dentry(dir, inode, de, inline_size, fname); ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size); @@ -1248,8 +1245,8 @@ out: * If succeeds, return 0. If not, extended the inline dir and copied data to * the new created block. */ -int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, - struct inode *inode) +int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, + struct dentry *dentry, struct inode *inode) { int ret, inline_size; void *inline_start; @@ -1268,7 +1265,7 @@ int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, EXT4_INLINE_DOTDOT_SIZE; inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; - ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc, + ret = ext4_add_dirent_to_inline(handle, fname, dentry, inode, &iloc, inline_start, inline_size); if (ret != -ENOSPC) goto out; @@ -1289,8 +1286,9 @@ int ext4_try_add_inline_entry(handle_t *handle, struct dentry *dentry, if (inline_size) { inline_start = ext4_get_inline_xattr_pos(dir, &iloc); - ret = ext4_add_dirent_to_inline(handle, dentry, inode, &iloc, - inline_start, inline_size); + ret = ext4_add_dirent_to_inline(handle, fname, dentry, + inode, &iloc, inline_start, + inline_size); if (ret != -ENOSPC) goto out; @@ -1611,6 +1609,7 @@ out: } struct buffer_head *ext4_find_inline_entry(struct inode *dir, + struct ext4_filename *fname, const struct qstr *d_name, struct ext4_dir_entry_2 **res_dir, int *has_inline_data) @@ -1632,8 +1631,8 @@ struct buffer_head *ext4_find_inline_entry(struct inode *dir, inline_start = (void *)ext4_raw_inode(&iloc)->i_block + EXT4_INLINE_DOTDOT_SIZE; inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; - ret = search_dir(iloc.bh, inline_start, inline_size, - dir, d_name, 0, res_dir); + ret = ext4_search_dir(iloc.bh, inline_start, inline_size, + dir, fname, d_name, 0, res_dir); if (ret == 1) goto out_find; if (ret < 0) @@ -1645,8 +1644,8 @@ struct buffer_head *ext4_find_inline_entry(struct inode *dir, inline_start = ext4_get_inline_xattr_pos(dir, &iloc); inline_size = ext4_get_inline_size(dir) - EXT4_MIN_INLINE_DATA_SIZE; - ret = search_dir(iloc.bh, inline_start, inline_size, - dir, d_name, 0, res_dir); + ret = ext4_search_dir(iloc.bh, inline_start, inline_size, + dir, fname, d_name, 0, res_dir); if (ret == 1) goto out_find; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 966c61482..cecf9aa10 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -656,18 +656,6 @@ has_zeroout: return retval; } -static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate) -{ - struct inode *inode = bh->b_assoc_map->host; - /* XXX: breaks on 32-bit > 16GB. Is that even supported? */ - loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits; - int err; - if (!uptodate) - return; - WARN_ON(!buffer_unwritten(bh)); - err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size); -} - /* Maximum number of blocks we map for direct IO at once. */ #define DIO_MAX_BLOCKS 4096 @@ -705,10 +693,15 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; - if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) { + if (IS_DAX(inode) && buffer_unwritten(bh)) { + /* + * dgc: I suspect unwritten conversion on ext4+DAX is + * fundamentally broken here when there are concurrent + * read/write in progress on this inode. + */ + WARN_ON_ONCE(io_end); bh->b_assoc_map = inode->i_mapping; bh->b_private = (void *)(unsigned long)iblock; - bh->b_end_io = ext4_end_io_unwritten; } if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN) set_buffer_defer_completion(bh); @@ -731,18 +724,18 @@ int ext4_get_block(struct inode *inode, sector_t iblock, * `handle' can be NULL if create is zero */ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, - ext4_lblk_t block, int create) + ext4_lblk_t block, int map_flags) { struct ext4_map_blocks map; struct buffer_head *bh; + int create = map_flags & EXT4_GET_BLOCKS_CREATE; int err; J_ASSERT(handle != NULL || create == 0); map.m_lblk = block; map.m_len = 1; - err = ext4_map_blocks(handle, inode, &map, - create ? EXT4_GET_BLOCKS_CREATE : 0); + err = ext4_map_blocks(handle, inode, &map, map_flags); if (err == 0) return create ? ERR_PTR(-ENOSPC) : NULL; @@ -788,11 +781,11 @@ errout: } struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, - ext4_lblk_t block, int create) + ext4_lblk_t block, int map_flags) { struct buffer_head *bh; - bh = ext4_getblk(handle, inode, block, create); + bh = ext4_getblk(handle, inode, block, map_flags); if (IS_ERR(bh)) return bh; if (!bh || buffer_uptodate(bh)) @@ -1261,13 +1254,12 @@ static int ext4_journalled_write_end(struct file *file, } /* - * Reserve a single cluster located at lblock + * Reserve space for a single cluster */ -static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) +static int ext4_da_reserve_space(struct inode *inode) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); - unsigned int md_needed; int ret; /* @@ -1279,25 +1271,14 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock) if (ret) return ret; - /* - * recalculate the amount of metadata blocks to reserve - * in order to allocate nrblocks - * worse case is one extent per block - */ spin_lock(&ei->i_block_reservation_lock); - /* - * ext4_calc_metadata_amount() has side effects, which we have - * to be prepared undo if we fail to claim space. - */ - md_needed = 0; - trace_ext4_da_reserve_space(inode, 0); - if (ext4_claim_free_clusters(sbi, 1, 0)) { spin_unlock(&ei->i_block_reservation_lock); dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1)); return -ENOSPC; } ei->i_reserved_data_blocks++; + trace_ext4_da_reserve_space(inode); spin_unlock(&ei->i_block_reservation_lock); return 0; /* success */ @@ -1575,9 +1556,9 @@ add_delayed: * then we don't need to reserve it again. However we still need * to reserve metadata for every block we're going to write. */ - if (EXT4_SB(inode->i_sb)->s_cluster_ratio <= 1 || + if (EXT4_SB(inode->i_sb)->s_cluster_ratio == 1 || !ext4_find_delalloc_cluster(inode, map->m_lblk)) { - ret = ext4_da_reserve_space(inode, iblock); + ret = ext4_da_reserve_space(inode); if (ret) { /* not enough space to reserve */ retval = ret; @@ -4237,8 +4218,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) inode->i_op = &ext4_dir_inode_operations; inode->i_fop = &ext4_dir_operations; } else if (S_ISLNK(inode->i_mode)) { - if (ext4_inode_is_fast_symlink(inode) && - !ext4_encrypted_inode(inode)) { + if (ext4_encrypted_inode(inode)) { + inode->i_op = &ext4_encrypted_symlink_inode_operations; + ext4_set_aops(inode); + } else if (ext4_inode_is_fast_symlink(inode)) { + inode->i_link = (char *)ei->i_data; inode->i_op = &ext4_fast_symlink_inode_operations; nd_terminate_link(ei->i_data, inode->i_size, sizeof(ei->i_data) - 1); @@ -4707,8 +4691,10 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) ext4_journal_stop(handle); } - if (attr->ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) { + if (attr->ia_valid & ATTR_SIZE) { handle_t *handle; + loff_t oldsize = inode->i_size; + int shrink = (attr->ia_size <= inode->i_size); if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -4716,24 +4702,26 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_size > sbi->s_bitmap_maxbytes) return -EFBIG; } + if (!S_ISREG(inode->i_mode)) + return -EINVAL; if (IS_I_VERSION(inode) && attr->ia_size != inode->i_size) inode_inc_iversion(inode); - if (S_ISREG(inode->i_mode) && + if (ext4_should_order_data(inode) && (attr->ia_size < inode->i_size)) { - if (ext4_should_order_data(inode)) { - error = ext4_begin_ordered_truncate(inode, + error = ext4_begin_ordered_truncate(inode, attr->ia_size); - if (error) - goto err_out; - } + if (error) + goto err_out; + } + if (attr->ia_size != inode->i_size) { handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); if (IS_ERR(handle)) { error = PTR_ERR(handle); goto err_out; } - if (ext4_handle_valid(handle)) { + if (ext4_handle_valid(handle) && shrink) { error = ext4_orphan_add(handle, inode); orphan = 1; } @@ -4752,15 +4740,13 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) up_write(&EXT4_I(inode)->i_data_sem); ext4_journal_stop(handle); if (error) { - ext4_orphan_del(NULL, inode); + if (orphan) + ext4_orphan_del(NULL, inode); goto err_out; } - } else { - loff_t oldsize = inode->i_size; - - i_size_write(inode, attr->ia_size); - pagecache_isize_extended(inode, oldsize, inode->i_size); } + if (!shrink) + pagecache_isize_extended(inode, oldsize, inode->i_size); /* * Blocks are going to be removed from the inode. Wait @@ -4780,13 +4766,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) * in data=journal mode to make pages freeable. */ truncate_pagecache(inode, inode->i_size); + if (shrink) + ext4_truncate(inode); } - /* - * We want to call ext4_truncate() even if attr->ia_size == - * inode->i_size for cases like truncation of fallocated space - */ - if (attr->ia_valid & ATTR_SIZE) - ext4_truncate(inode); if (!rc) { setattr_copy(inode, attr); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 2cb9e178d..1346cfa35 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -31,14 +31,11 @@ static void memswap(void *a, void *b, size_t len) { unsigned char *ap, *bp; - unsigned char tmp; ap = (unsigned char *)a; bp = (unsigned char *)b; while (len-- > 0) { - tmp = *ap; - *ap = *bp; - *bp = tmp; + swap(*ap, *bp); ap++; bp++; } @@ -675,8 +672,8 @@ encryption_policy_out: if (err) return err; } - if (copy_to_user((void *) arg, sbi->s_es->s_encrypt_pw_salt, - 16)) + if (copy_to_user((void __user *) arg, + sbi->s_es->s_encrypt_pw_salt, 16)) return -EFAULT; return 0; } @@ -690,7 +687,7 @@ encryption_policy_out: err = ext4_get_policy(inode, &policy); if (err) return err; - if (copy_to_user((void *)arg, &policy, sizeof(policy))) + if (copy_to_user((void __user *)arg, &policy, sizeof(policy))) return -EFAULT; return 0; #else @@ -758,7 +755,6 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return err; } case EXT4_IOC_MOVE_EXT: - case FITRIM: case EXT4_IOC_RESIZE_FS: case EXT4_IOC_PRECACHE_EXTENTS: case EXT4_IOC_SET_ENCRYPTION_POLICY: diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 41260489d..34b610ea5 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -26,6 +26,7 @@ #include <linux/log2.h> #include <linux/module.h> #include <linux/slab.h> +#include <linux/backing-dev.h> #include <trace/events/ext4.h> #ifdef CONFIG_EXT4_DEBUG @@ -882,10 +883,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore) /* wait for I/O completion */ for (i = 0, group = first_group; i < groups_per_page; i++, group++) { - if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i])) { + if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i])) err = -EIO; - goto out; - } } first_block = page->index * blocks_per_page; @@ -898,6 +897,11 @@ static int ext4_mb_init_cache(struct page *page, char *incore) /* skip initialized uptodate buddy */ continue; + if (!buffer_verified(bh[group - first_group])) + /* Skip faulty bitmaps */ + continue; + err = 0; + /* * data carry information regarding this * particular group in the format specified @@ -2008,7 +2012,12 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, } } -/* This is now called BEFORE we load the buddy bitmap. */ +/* + * This is now called BEFORE we load the buddy bitmap. + * Returns either 1 or 0 indicating that the group is either suitable + * for the allocation or not. In addition it can also return negative + * error code when something goes wrong. + */ static int ext4_mb_good_group(struct ext4_allocation_context *ac, ext4_group_t group, int cr) { @@ -2031,7 +2040,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { int ret = ext4_mb_init_group(ac->ac_sb, group); if (ret) - return 0; + return ret; } fragments = grp->bb_fragments; @@ -2078,7 +2087,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { ext4_group_t ngroups, group, i; int cr; - int err = 0; + int err = 0, first_err = 0; struct ext4_sb_info *sbi; struct super_block *sb; struct ext4_buddy e4b; @@ -2145,6 +2154,7 @@ repeat: group = ac->ac_g_ex.fe_group; for (i = 0; i < ngroups; group++, i++) { + int ret = 0; cond_resched(); /* * Artificially restricted ngroups for non-extent @@ -2154,8 +2164,12 @@ repeat: group = 0; /* This now checks without needing the buddy page */ - if (!ext4_mb_good_group(ac, group, cr)) + ret = ext4_mb_good_group(ac, group, cr); + if (ret <= 0) { + if (!first_err) + first_err = ret; continue; + } err = ext4_mb_load_buddy(sb, group, &e4b); if (err) @@ -2167,9 +2181,12 @@ repeat: * We need to check again after locking the * block group */ - if (!ext4_mb_good_group(ac, group, cr)) { + ret = ext4_mb_good_group(ac, group, cr); + if (ret <= 0) { ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); + if (!first_err) + first_err = ret; continue; } @@ -2216,6 +2233,8 @@ repeat: } } out: + if (!err && ac->ac_status != AC_STATUS_FOUND && first_err) + err = first_err; return err; } @@ -2257,12 +2276,9 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) group--; if (group == 0) - seq_printf(seq, "#%-5s: %-5s %-5s %-5s " - "[ %-5s %-5s %-5s %-5s %-5s %-5s %-5s " - "%-5s %-5s %-5s %-5s %-5s %-5s %-5s ]\n", - "group", "free", "frags", "first", - "2^0", "2^1", "2^2", "2^3", "2^4", "2^5", "2^6", - "2^7", "2^8", "2^9", "2^10", "2^11", "2^12", "2^13"); + seq_puts(seq, "#group: free frags first [" + " 2^0 2^1 2^2 2^3 2^4 2^5 2^6 " + " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]"); i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + sizeof(struct ext4_group_info); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 370420bfa..fb6f11709 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -166,12 +166,9 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2, */ wait_on_page_writeback(page[0]); wait_on_page_writeback(page[1]); - if (inode1 > inode2) { - struct page *tmp; - tmp = page[0]; - page[0] = page[1]; - page[1] = tmp; - } + if (inode1 > inode2) + swap(page[0], page[1]); + return 0; } @@ -574,12 +571,16 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } - /* TODO: This is non obvious task to swap blocks for inodes with full - jornaling enabled */ + + /* TODO: it's not obvious how to swap blocks for inodes with full + journaling enabled */ if (ext4_should_journal_data(orig_inode) || ext4_should_journal_data(donor_inode)) { - return -EINVAL; + ext4_msg(orig_inode->i_sb, KERN_ERR, + "Online defrag not supported with data journaling"); + return -EOPNOTSUPP; } + /* Protect orig and donor inodes against a truncate */ lock_two_nondirectories(orig_inode, donor_inode); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 814f3beb4..011dcfb5c 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -61,7 +61,7 @@ static struct buffer_head *ext4_append(handle_t *handle, *block = inode->i_size >> inode->i_sb->s_blocksize_bits; - bh = ext4_bread(handle, inode, *block, 1); + bh = ext4_bread(handle, inode, *block, EXT4_GET_BLOCKS_CREATE); if (IS_ERR(bh)) return bh; inode->i_size += inode->i_sb->s_blocksize; @@ -84,12 +84,13 @@ typedef enum { } dirblock_type_t; #define ext4_read_dirblock(inode, block, type) \ - __ext4_read_dirblock((inode), (block), (type), __LINE__) + __ext4_read_dirblock((inode), (block), (type), __func__, __LINE__) static struct buffer_head *__ext4_read_dirblock(struct inode *inode, - ext4_lblk_t block, - dirblock_type_t type, - unsigned int line) + ext4_lblk_t block, + dirblock_type_t type, + const char *func, + unsigned int line) { struct buffer_head *bh; struct ext4_dir_entry *dirent; @@ -97,15 +98,17 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, bh = ext4_bread(NULL, inode, block, 0); if (IS_ERR(bh)) { - __ext4_warning(inode->i_sb, __func__, line, - "error %ld reading directory block " - "(ino %lu, block %lu)", PTR_ERR(bh), inode->i_ino, - (unsigned long) block); + __ext4_warning(inode->i_sb, func, line, + "inode #%lu: lblock %lu: comm %s: " + "error %ld reading directory block", + inode->i_ino, (unsigned long)block, + current->comm, PTR_ERR(bh)); return bh; } if (!bh) { - ext4_error_inode(inode, __func__, line, block, "Directory hole found"); + ext4_error_inode(inode, func, line, block, + "Directory hole found"); return ERR_PTR(-EIO); } dirent = (struct ext4_dir_entry *) bh->b_data; @@ -119,7 +122,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, is_dx_block = 1; } if (!is_dx_block && type == INDEX) { - ext4_error_inode(inode, __func__, line, block, + ext4_error_inode(inode, func, line, block, "directory leaf block found instead of index block"); return ERR_PTR(-EIO); } @@ -136,8 +139,8 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, if (ext4_dx_csum_verify(inode, dirent)) set_buffer_verified(bh); else { - ext4_error_inode(inode, __func__, line, block, - "Directory index failed checksum"); + ext4_error_inode(inode, func, line, block, + "Directory index failed checksum"); brelse(bh); return ERR_PTR(-EIO); } @@ -146,8 +149,8 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, if (ext4_dirent_csum_verify(inode, dirent)) set_buffer_verified(bh); else { - ext4_error_inode(inode, __func__, line, block, - "Directory block failed checksum"); + ext4_error_inode(inode, func, line, block, + "Directory block failed checksum"); brelse(bh); return ERR_PTR(-EIO); } @@ -248,7 +251,7 @@ static void dx_set_count(struct dx_entry *entries, unsigned value); static void dx_set_limit(struct dx_entry *entries, unsigned value); static unsigned dx_root_limit(struct inode *dir, unsigned infosize); static unsigned dx_node_limit(struct inode *dir); -static struct dx_frame *dx_probe(const struct qstr *d_name, +static struct dx_frame *dx_probe(struct ext4_filename *fname, struct inode *dir, struct dx_hash_info *hinfo, struct dx_frame *frame); @@ -267,10 +270,10 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, struct dx_frame *frames, __u32 *start_hash); static struct buffer_head * ext4_dx_find_entry(struct inode *dir, - const struct qstr *d_name, + struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir); -static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, - struct inode *inode); +static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, + struct dentry *dentry, struct inode *inode); /* checksumming functions */ void initialize_dirent_tail(struct ext4_dir_entry_tail *t, @@ -327,10 +330,14 @@ static __le32 ext4_dirent_csum(struct inode *inode, return cpu_to_le32(csum); } -static void warn_no_space_for_csum(struct inode *inode) +#define warn_no_space_for_csum(inode) \ + __warn_no_space_for_csum((inode), __func__, __LINE__) + +static void __warn_no_space_for_csum(struct inode *inode, const char *func, + unsigned int line) { - ext4_warning(inode->i_sb, "no space in directory inode %lu leaf for " - "checksum. Please run e2fsck -D.", inode->i_ino); + __ext4_warning_inode(inode, func, line, + "No space for directory leaf checksum. Please run e2fsck -D."); } int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent) @@ -607,17 +614,15 @@ static struct stats dx_show_leaf(struct inode *dir, char *name; struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}; - struct ext4_fname_crypto_ctx *ctx = NULL; - int res; + int res = 0; name = de->name; len = de->name_len; - ctx = ext4_get_fname_crypto_ctx(dir, - EXT4_NAME_LEN); - if (IS_ERR(ctx)) { - printk(KERN_WARNING "Error acquiring" - " crypto ctxt--skipping crypto\n"); - ctx = NULL; + if (ext4_encrypted_inode(inode)) + res = ext4_get_encryption_info(dir); + if (res) { + printk(KERN_WARNING "Error setting up" + " fname crypto: %d\n", res); } if (ctx == NULL) { /* Directory is not encrypted */ @@ -637,7 +642,6 @@ static struct stats dx_show_leaf(struct inode *dir, "allocating crypto " "buffer--skipping " "crypto\n"); - ext4_put_fname_crypto_ctx(&ctx); ctx = NULL; } res = ext4_fname_disk_to_usr(ctx, NULL, de, @@ -658,7 +662,6 @@ static struct stats dx_show_leaf(struct inode *dir, printk("%*.s:(E)%x.%u ", len, name, h.hash, (unsigned) ((char *) de - base)); - ext4_put_fname_crypto_ctx(&ctx); ext4_fname_crypto_free_buffer( &fname_crypto_str); } @@ -724,7 +727,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, * back to userspace. */ static struct dx_frame * -dx_probe(const struct qstr *d_name, struct inode *dir, +dx_probe(struct ext4_filename *fname, struct inode *dir, struct dx_hash_info *hinfo, struct dx_frame *frame_in) { unsigned count, indirect; @@ -742,56 +745,41 @@ dx_probe(const struct qstr *d_name, struct inode *dir, if (root->info.hash_version != DX_HASH_TEA && root->info.hash_version != DX_HASH_HALF_MD4 && root->info.hash_version != DX_HASH_LEGACY) { - ext4_warning(dir->i_sb, "Unrecognised inode hash code %d", - root->info.hash_version); + ext4_warning_inode(dir, "Unrecognised inode hash code %u", + root->info.hash_version); goto fail; } + if (fname) + hinfo = &fname->hinfo; hinfo->hash_version = root->info.hash_version; if (hinfo->hash_version <= DX_HASH_TEA) hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; -#ifdef CONFIG_EXT4_FS_ENCRYPTION - if (d_name) { - struct ext4_fname_crypto_ctx *ctx = NULL; - int res; - - /* Check if the directory is encrypted */ - ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); - if (IS_ERR(ctx)) { - ret_err = ERR_PTR(PTR_ERR(ctx)); - goto fail; - } - res = ext4_fname_usr_to_hash(ctx, d_name, hinfo); - if (res < 0) { - ret_err = ERR_PTR(res); - goto fail; - } - ext4_put_fname_crypto_ctx(&ctx); - } -#else - if (d_name) - ext4fs_dirhash(d_name->name, d_name->len, hinfo); -#endif + if (fname && fname_name(fname)) + ext4fs_dirhash(fname_name(fname), fname_len(fname), hinfo); hash = hinfo->hash; if (root->info.unused_flags & 1) { - ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x", - root->info.unused_flags); + ext4_warning_inode(dir, "Unimplemented hash flags: %#06x", + root->info.unused_flags); goto fail; } - if ((indirect = root->info.indirect_levels) > 1) { - ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x", - root->info.indirect_levels); + indirect = root->info.indirect_levels; + if (indirect > 1) { + ext4_warning_inode(dir, "Unimplemented hash depth: %#06x", + root->info.indirect_levels); goto fail; } - entries = (struct dx_entry *) (((char *)&root->info) + - root->info.info_length); + entries = (struct dx_entry *)(((char *)&root->info) + + root->info.info_length); if (dx_get_limit(entries) != dx_root_limit(dir, root->info.info_length)) { - ext4_warning(dir->i_sb, "dx entry: limit != root limit"); + ext4_warning_inode(dir, "dx entry: limit %u != root limit %u", + dx_get_limit(entries), + dx_root_limit(dir, root->info.info_length)); goto fail; } @@ -799,15 +787,16 @@ dx_probe(const struct qstr *d_name, struct inode *dir, while (1) { count = dx_get_count(entries); if (!count || count > dx_get_limit(entries)) { - ext4_warning(dir->i_sb, - "dx entry: no count or count > limit"); + ext4_warning_inode(dir, + "dx entry: count %u beyond limit %u", + count, dx_get_limit(entries)); goto fail; } p = entries + 1; q = entries + count - 1; while (p <= q) { - m = p + (q - p)/2; + m = p + (q - p) / 2; dxtrace(printk(".")); if (dx_get_hash(m) > hash) q = m - 1; @@ -831,7 +820,8 @@ dx_probe(const struct qstr *d_name, struct inode *dir, } at = p - 1; - dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at))); + dxtrace(printk(" %x->%u\n", at == entries ? 0 : dx_get_hash(at), + dx_get_block(at))); frame->entries = entries; frame->at = at; if (!indirect--) @@ -845,9 +835,10 @@ dx_probe(const struct qstr *d_name, struct inode *dir, } entries = ((struct dx_node *) frame->bh->b_data)->entries; - if (dx_get_limit(entries) != dx_node_limit (dir)) { - ext4_warning(dir->i_sb, - "dx entry: limit != node limit"); + if (dx_get_limit(entries) != dx_node_limit(dir)) { + ext4_warning_inode(dir, + "dx entry: limit %u != node limit %u", + dx_get_limit(entries), dx_node_limit(dir)); goto fail; } } @@ -858,18 +849,17 @@ fail: } if (ret_err == ERR_PTR(ERR_BAD_DX_DIR)) - ext4_warning(dir->i_sb, - "Corrupt dir inode %lu, running e2fsck is " - "recommended.", dir->i_ino); + ext4_warning_inode(dir, + "Corrupt directory, running e2fsck is recommended"); return ret_err; } -static void dx_release (struct dx_frame *frames) +static void dx_release(struct dx_frame *frames) { if (frames[0].bh == NULL) return; - if (((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels) + if (((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels) brelse(frames[1].bh); brelse(frames[0].bh); } @@ -962,7 +952,6 @@ static int htree_dirblock_to_tree(struct file *dir_file, struct buffer_head *bh; struct ext4_dir_entry_2 *de, *top; int err = 0, count = 0; - struct ext4_fname_crypto_ctx *ctx = NULL; struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}, tmp_str; dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n", @@ -977,17 +966,15 @@ static int htree_dirblock_to_tree(struct file *dir_file, EXT4_DIR_REC_LEN(0)); #ifdef CONFIG_EXT4_FS_ENCRYPTION /* Check if the directory is encrypted */ - ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); - if (IS_ERR(ctx)) { - err = PTR_ERR(ctx); - brelse(bh); - return err; - } - if (ctx != NULL) { - err = ext4_fname_crypto_alloc_buffer(ctx, EXT4_NAME_LEN, + if (ext4_encrypted_inode(dir)) { + err = ext4_get_encryption_info(dir); + if (err < 0) { + brelse(bh); + return err; + } + err = ext4_fname_crypto_alloc_buffer(dir, EXT4_NAME_LEN, &fname_crypto_str); if (err < 0) { - ext4_put_fname_crypto_ctx(&ctx); brelse(bh); return err; } @@ -1008,16 +995,17 @@ static int htree_dirblock_to_tree(struct file *dir_file, continue; if (de->inode == 0) continue; - if (ctx == NULL) { - /* Directory is not encrypted */ + if (!ext4_encrypted_inode(dir)) { tmp_str.name = de->name; tmp_str.len = de->name_len; err = ext4_htree_store_dirent(dir_file, hinfo->hash, hinfo->minor_hash, de, &tmp_str); } else { + int save_len = fname_crypto_str.len; + /* Directory is encrypted */ - err = ext4_fname_disk_to_usr(ctx, hinfo, de, + err = ext4_fname_disk_to_usr(dir, hinfo, de, &fname_crypto_str); if (err < 0) { count = err; @@ -1026,6 +1014,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, err = ext4_htree_store_dirent(dir_file, hinfo->hash, hinfo->minor_hash, de, &fname_crypto_str); + fname_crypto_str.len = save_len; } if (err != 0) { count = err; @@ -1036,7 +1025,6 @@ static int htree_dirblock_to_tree(struct file *dir_file, errout: brelse(bh); #ifdef CONFIG_EXT4_FS_ENCRYPTION - ext4_put_fname_crypto_ctx(&ctx); ext4_fname_crypto_free_buffer(&fname_crypto_str); #endif return count; @@ -1155,12 +1143,13 @@ errout: static inline int search_dirblock(struct buffer_head *bh, struct inode *dir, + struct ext4_filename *fname, const struct qstr *d_name, unsigned int offset, struct ext4_dir_entry_2 **res_dir) { - return search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir, - d_name, offset, res_dir); + return ext4_search_dir(bh, bh->b_data, dir->i_sb->s_blocksize, dir, + fname, d_name, offset, res_dir); } /* @@ -1242,54 +1231,54 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) * `len <= EXT4_NAME_LEN' is guaranteed by caller. * `de != NULL' is guaranteed by caller. */ -static inline int ext4_match(struct ext4_fname_crypto_ctx *ctx, - struct ext4_str *fname_crypto_str, - int len, const char * const name, +static inline int ext4_match(struct ext4_filename *fname, struct ext4_dir_entry_2 *de) { - int res; + const void *name = fname_name(fname); + u32 len = fname_len(fname); if (!de->inode) return 0; #ifdef CONFIG_EXT4_FS_ENCRYPTION - if (ctx) - return ext4_fname_match(ctx, fname_crypto_str, len, name, de); + if (unlikely(!name)) { + if (fname->usr_fname->name[0] == '_') { + int ret; + if (de->name_len < 16) + return 0; + ret = memcmp(de->name + de->name_len - 16, + fname->crypto_buf.name + 8, 16); + return (ret == 0) ? 1 : 0; + } + name = fname->crypto_buf.name; + len = fname->crypto_buf.len; + } #endif - if (len != de->name_len) + if (de->name_len != len) return 0; - res = memcmp(name, de->name, len); - return (res == 0) ? 1 : 0; + return (memcmp(de->name, name, len) == 0) ? 1 : 0; } /* * Returns 0 if not found, -1 on failure, and 1 on success */ -int search_dir(struct buffer_head *bh, char *search_buf, int buf_size, - struct inode *dir, const struct qstr *d_name, - unsigned int offset, struct ext4_dir_entry_2 **res_dir) +int ext4_search_dir(struct buffer_head *bh, char *search_buf, int buf_size, + struct inode *dir, struct ext4_filename *fname, + const struct qstr *d_name, + unsigned int offset, struct ext4_dir_entry_2 **res_dir) { struct ext4_dir_entry_2 * de; char * dlimit; int de_len; - const char *name = d_name->name; - int namelen = d_name->len; - struct ext4_fname_crypto_ctx *ctx = NULL; - struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}; int res; - ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); - if (IS_ERR(ctx)) - return -1; - de = (struct ext4_dir_entry_2 *)search_buf; dlimit = search_buf + buf_size; while ((char *) de < dlimit) { /* this code is executed quadratically often */ /* do minimal checking `by hand' */ if ((char *) de + de->name_len <= dlimit) { - res = ext4_match(ctx, &fname_crypto_str, namelen, - name, de); + res = ext4_match(fname, de); if (res < 0) { res = -1; goto return_result; @@ -1322,8 +1311,6 @@ int search_dir(struct buffer_head *bh, char *search_buf, int buf_size, res = 0; return_result: - ext4_put_fname_crypto_ctx(&ctx); - ext4_fname_crypto_free_buffer(&fname_crypto_str); return res; } @@ -1370,7 +1357,8 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, buffer */ int num = 0; ext4_lblk_t nblocks; - int i, namelen; + int i, namelen, retval; + struct ext4_filename fname; *res_dir = NULL; sb = dir->i_sb; @@ -1378,14 +1366,18 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, if (namelen > EXT4_NAME_LEN) return NULL; + retval = ext4_fname_setup_filename(dir, d_name, 1, &fname); + if (retval) + return ERR_PTR(retval); + if (ext4_has_inline_data(dir)) { int has_inline_data = 1; - ret = ext4_find_inline_entry(dir, d_name, res_dir, + ret = ext4_find_inline_entry(dir, &fname, d_name, res_dir, &has_inline_data); if (has_inline_data) { if (inlined) *inlined = 1; - return ret; + goto cleanup_and_exit; } } @@ -1400,14 +1392,14 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, goto restart; } if (is_dx(dir)) { - bh = ext4_dx_find_entry(dir, d_name, res_dir); + ret = ext4_dx_find_entry(dir, &fname, res_dir); /* * On success, or if the error was file not found, * return. Otherwise, fall back to doing a search the * old fashioned way. */ - if (!IS_ERR(bh) || PTR_ERR(bh) != ERR_BAD_DX_DIR) - return bh; + if (!IS_ERR(ret) || PTR_ERR(ret) != ERR_BAD_DX_DIR) + goto cleanup_and_exit; dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " "falling back\n")); } @@ -1438,8 +1430,10 @@ restart: num++; bh = ext4_getblk(NULL, dir, b++, 0); if (unlikely(IS_ERR(bh))) { - if (ra_max == 0) - return bh; + if (ra_max == 0) { + ret = bh; + goto cleanup_and_exit; + } break; } bh_use[ra_max] = bh; @@ -1469,7 +1463,7 @@ restart: goto next; } set_buffer_verified(bh); - i = search_dirblock(bh, dir, d_name, + i = search_dirblock(bh, dir, &fname, d_name, block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); if (i == 1) { EXT4_I(dir)->i_dir_start_lookup = block; @@ -1500,15 +1494,17 @@ cleanup_and_exit: /* Clean up the read-ahead blocks */ for (; ra_ptr < ra_max; ra_ptr++) brelse(bh_use[ra_ptr]); + ext4_fname_free_filename(&fname); return ret; } -static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, - struct ext4_dir_entry_2 **res_dir) +static struct buffer_head * ext4_dx_find_entry(struct inode *dir, + struct ext4_filename *fname, + struct ext4_dir_entry_2 **res_dir) { struct super_block * sb = dir->i_sb; - struct dx_hash_info hinfo; struct dx_frame frames[2], *frame; + const struct qstr *d_name = fname->usr_fname; struct buffer_head *bh; ext4_lblk_t block; int retval; @@ -1516,7 +1512,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q #ifdef CONFIG_EXT4_FS_ENCRYPTION *res_dir = NULL; #endif - frame = dx_probe(d_name, dir, &hinfo, frames); + frame = dx_probe(fname, dir, NULL, frames); if (IS_ERR(frame)) return (struct buffer_head *) frame; do { @@ -1525,7 +1521,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q if (IS_ERR(bh)) goto errout; - retval = search_dirblock(bh, dir, d_name, + retval = search_dirblock(bh, dir, fname, d_name, block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); if (retval == 1) @@ -1537,12 +1533,12 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q } /* Check to see if we should continue to search */ - retval = ext4_htree_next_block(dir, hinfo.hash, frame, + retval = ext4_htree_next_block(dir, fname->hinfo.hash, frame, frames, NULL); if (retval < 0) { - ext4_warning(sb, - "error %d reading index page in directory #%lu", - retval, dir->i_ino); + ext4_warning_inode(dir, + "error %d reading directory index block", + retval); bh = ERR_PTR(retval); goto errout; } @@ -1796,32 +1792,16 @@ journal_error: int ext4_find_dest_de(struct inode *dir, struct inode *inode, struct buffer_head *bh, void *buf, int buf_size, - const char *name, int namelen, + struct ext4_filename *fname, struct ext4_dir_entry_2 **dest_de) { struct ext4_dir_entry_2 *de; - unsigned short reclen = EXT4_DIR_REC_LEN(namelen); + unsigned short reclen = EXT4_DIR_REC_LEN(fname_len(fname)); int nlen, rlen; unsigned int offset = 0; char *top; - struct ext4_fname_crypto_ctx *ctx = NULL; - struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}; int res; - ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); - if (IS_ERR(ctx)) - return -1; - - if (ctx != NULL) { - /* Calculate record length needed to store the entry */ - res = ext4_fname_crypto_namelen_on_disk(ctx, namelen); - if (res < 0) { - ext4_put_fname_crypto_ctx(&ctx); - return res; - } - reclen = EXT4_DIR_REC_LEN(res); - } - de = (struct ext4_dir_entry_2 *)buf; top = buf + buf_size - reclen; while ((char *) de <= top) { @@ -1831,7 +1811,7 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, goto return_result; } /* Provide crypto context and crypto buffer to ext4 match */ - res = ext4_match(ctx, &fname_crypto_str, namelen, name, de); + res = ext4_match(fname, de); if (res < 0) goto return_result; if (res > 0) { @@ -1853,8 +1833,6 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, res = 0; } return_result: - ext4_put_fname_crypto_ctx(&ctx); - ext4_fname_crypto_free_buffer(&fname_crypto_str); return res; } @@ -1862,39 +1840,10 @@ int ext4_insert_dentry(struct inode *dir, struct inode *inode, struct ext4_dir_entry_2 *de, int buf_size, - const struct qstr *iname, - const char *name, int namelen) + struct ext4_filename *fname) { int nlen, rlen; - struct ext4_fname_crypto_ctx *ctx = NULL; - struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}; - struct ext4_str tmp_str; - int res; - - ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); - if (IS_ERR(ctx)) - return -EIO; - /* By default, the input name would be written to the disk */ - tmp_str.name = (unsigned char *)name; - tmp_str.len = namelen; - if (ctx != NULL) { - /* Directory is encrypted */ - res = ext4_fname_crypto_alloc_buffer(ctx, EXT4_NAME_LEN, - &fname_crypto_str); - if (res < 0) { - ext4_put_fname_crypto_ctx(&ctx); - return -ENOMEM; - } - res = ext4_fname_usr_to_disk(ctx, iname, &fname_crypto_str); - if (res < 0) { - ext4_put_fname_crypto_ctx(&ctx); - ext4_fname_crypto_free_buffer(&fname_crypto_str); - return res; - } - tmp_str.name = fname_crypto_str.name; - tmp_str.len = fname_crypto_str.len; - } nlen = EXT4_DIR_REC_LEN(de->name_len); rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); @@ -1908,11 +1857,8 @@ int ext4_insert_dentry(struct inode *dir, de->file_type = EXT4_FT_UNKNOWN; de->inode = cpu_to_le32(inode->i_ino); ext4_set_de_type(inode->i_sb, de, inode->i_mode); - de->name_len = tmp_str.len; - - memcpy(de->name, tmp_str.name, tmp_str.len); - ext4_put_fname_crypto_ctx(&ctx); - ext4_fname_crypto_free_buffer(&fname_crypto_str); + de->name_len = fname_len(fname); + memcpy(de->name, fname_name(fname), fname_len(fname)); return 0; } @@ -1924,13 +1870,11 @@ int ext4_insert_dentry(struct inode *dir, * space. It will return -ENOSPC if no space is available, and -EIO * and -EEXIST if directory entry already exists. */ -static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, +static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, + struct inode *dir, struct inode *inode, struct ext4_dir_entry_2 *de, struct buffer_head *bh) { - struct inode *dir = d_inode(dentry->d_parent); - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; unsigned int blocksize = dir->i_sb->s_blocksize; int csum_size = 0; int err; @@ -1939,9 +1883,8 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, csum_size = sizeof(struct ext4_dir_entry_tail); if (!de) { - err = ext4_find_dest_de(dir, inode, - bh, bh->b_data, blocksize - csum_size, - name, namelen, &de); + err = ext4_find_dest_de(dir, inode, bh, bh->b_data, + blocksize - csum_size, fname, &de); if (err) return err; } @@ -1954,8 +1897,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, /* By now the buffer is marked for journaling. Due to crypto operations, * the following function call may fail */ - err = ext4_insert_dentry(dir, inode, de, blocksize, &dentry->d_name, - name, namelen); + err = ext4_insert_dentry(dir, inode, de, blocksize, fname); if (err < 0) return err; @@ -1985,17 +1927,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, * This converts a one block unindexed directory to a 3 block indexed * directory, and adds the dentry to the indexed directory. */ -static int make_indexed_dir(handle_t *handle, struct dentry *dentry, +static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, + struct dentry *dentry, struct inode *inode, struct buffer_head *bh) { struct inode *dir = d_inode(dentry->d_parent); -#ifdef CONFIG_EXT4_FS_ENCRYPTION - struct ext4_fname_crypto_ctx *ctx = NULL; - int res; -#else - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; -#endif struct buffer_head *bh2; struct dx_root *root; struct dx_frame frames[2], *frame; @@ -2006,17 +1942,10 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, unsigned len; int retval; unsigned blocksize; - struct dx_hash_info hinfo; ext4_lblk_t block; struct fake_dirent *fde; int csum_size = 0; -#ifdef CONFIG_EXT4_FS_ENCRYPTION - ctx = ext4_get_fname_crypto_ctx(dir, EXT4_NAME_LEN); - if (IS_ERR(ctx)) - return PTR_ERR(ctx); -#endif - if (ext4_has_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); @@ -2078,22 +2007,12 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info))); /* Initialize as for dx_probe */ - hinfo.hash_version = root->info.hash_version; - if (hinfo.hash_version <= DX_HASH_TEA) - hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; - hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; -#ifdef CONFIG_EXT4_FS_ENCRYPTION - res = ext4_fname_usr_to_hash(ctx, &dentry->d_name, &hinfo); - if (res < 0) { - ext4_put_fname_crypto_ctx(&ctx); - ext4_mark_inode_dirty(handle, dir); - brelse(bh); - return res; - } - ext4_put_fname_crypto_ctx(&ctx); -#else - ext4fs_dirhash(name, namelen, &hinfo); -#endif + fname->hinfo.hash_version = root->info.hash_version; + if (fname->hinfo.hash_version <= DX_HASH_TEA) + fname->hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; + fname->hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; + ext4fs_dirhash(fname_name(fname), fname_len(fname), &fname->hinfo); + memset(frames, 0, sizeof(frames)); frame = frames; frame->entries = entries; @@ -2108,14 +2027,14 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, if (retval) goto out_frames; - de = do_split(handle,dir, &bh, frame, &hinfo); + de = do_split(handle,dir, &bh, frame, &fname->hinfo); if (IS_ERR(de)) { retval = PTR_ERR(de); goto out_frames; } dx_release(frames); - retval = add_dirent_to_buf(handle, dentry, inode, de, bh); + retval = add_dirent_to_buf(handle, fname, dir, inode, de, bh); brelse(bh); return retval; out_frames: @@ -2147,6 +2066,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, struct ext4_dir_entry_2 *de; struct ext4_dir_entry_tail *t; struct super_block *sb; + struct ext4_filename fname; int retval; int dx_fallback=0; unsigned blocksize; @@ -2161,10 +2081,15 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, if (!dentry->d_name.len) return -EINVAL; + retval = ext4_fname_setup_filename(dir, &dentry->d_name, 0, &fname); + if (retval) + return retval; + if (ext4_has_inline_data(dir)) { - retval = ext4_try_add_inline_entry(handle, dentry, inode); + retval = ext4_try_add_inline_entry(handle, &fname, + dentry, inode); if (retval < 0) - return retval; + goto out; if (retval == 1) { retval = 0; goto out; @@ -2172,7 +2097,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, } if (is_dx(dir)) { - retval = ext4_dx_add_entry(handle, dentry, inode); + retval = ext4_dx_add_entry(handle, &fname, dentry, inode); if (!retval || (retval != ERR_BAD_DX_DIR)) goto out; ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); @@ -2182,24 +2107,31 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, blocks = dir->i_size >> sb->s_blocksize_bits; for (block = 0; block < blocks; block++) { bh = ext4_read_dirblock(dir, block, DIRENT); - if (IS_ERR(bh)) - return PTR_ERR(bh); - - retval = add_dirent_to_buf(handle, dentry, inode, NULL, bh); + if (IS_ERR(bh)) { + retval = PTR_ERR(bh); + bh = NULL; + goto out; + } + retval = add_dirent_to_buf(handle, &fname, dir, inode, + NULL, bh); if (retval != -ENOSPC) goto out; if (blocks == 1 && !dx_fallback && EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) { - retval = make_indexed_dir(handle, dentry, inode, bh); + retval = make_indexed_dir(handle, &fname, dentry, + inode, bh); bh = NULL; /* make_indexed_dir releases bh */ goto out; } brelse(bh); } bh = ext4_append(handle, dir, &block); - if (IS_ERR(bh)) - return PTR_ERR(bh); + if (IS_ERR(bh)) { + retval = PTR_ERR(bh); + bh = NULL; + goto out; + } de = (struct ext4_dir_entry_2 *) bh->b_data; de->inode = 0; de->rec_len = ext4_rec_len_to_disk(blocksize - csum_size, blocksize); @@ -2209,8 +2141,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, initialize_dirent_tail(t, blocksize); } - retval = add_dirent_to_buf(handle, dentry, inode, de, bh); + retval = add_dirent_to_buf(handle, &fname, dir, inode, de, bh); out: + ext4_fname_free_filename(&fname); brelse(bh); if (retval == 0) ext4_set_inode_state(inode, EXT4_STATE_NEWENTRY); @@ -2220,19 +2153,18 @@ out: /* * Returns 0 for success, or a negative error value */ -static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, - struct inode *inode) +static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, + struct dentry *dentry, struct inode *inode) { struct dx_frame frames[2], *frame; struct dx_entry *entries, *at; - struct dx_hash_info hinfo; struct buffer_head *bh; struct inode *dir = d_inode(dentry->d_parent); struct super_block *sb = dir->i_sb; struct ext4_dir_entry_2 *de; int err; - frame = dx_probe(&dentry->d_name, dir, &hinfo, frames); + frame = dx_probe(fname, dir, NULL, frames); if (IS_ERR(frame)) return PTR_ERR(frame); entries = frame->entries; @@ -2249,7 +2181,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (err) goto journal_error; - err = add_dirent_to_buf(handle, dentry, inode, NULL, bh); + err = add_dirent_to_buf(handle, fname, dir, inode, NULL, bh); if (err != -ENOSPC) goto cleanup; @@ -2267,7 +2199,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (levels && (dx_get_count(frames->entries) == dx_get_limit(frames->entries))) { - ext4_warning(sb, "Directory index full!"); + ext4_warning_inode(dir, "Directory index full!"); err = -ENOSPC; goto cleanup; } @@ -2345,12 +2277,12 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, goto cleanup; } } - de = do_split(handle, dir, &bh, frame, &hinfo); + de = do_split(handle, dir, &bh, frame, &fname->hinfo); if (IS_ERR(de)) { err = PTR_ERR(de); goto cleanup; } - err = add_dirent_to_buf(handle, dentry, inode, de, bh); + err = add_dirent_to_buf(handle, fname, dir, inode, de, bh); goto cleanup; journal_error: @@ -2517,20 +2449,7 @@ retry: inode->i_op = &ext4_file_inode_operations; inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); - err = 0; -#ifdef CONFIG_EXT4_FS_ENCRYPTION - if (!err && (ext4_encrypted_inode(dir) || - DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb)))) { - err = ext4_inherit_context(dir, inode); - if (err) { - clear_nlink(inode); - unlock_new_inode(inode); - iput(inode); - } - } -#endif - if (!err) - err = ext4_add_nondir(handle, dentry, inode); + err = ext4_add_nondir(handle, dentry, inode); if (!err && IS_DIRSYNC(dir)) ext4_handle_sync(handle); } @@ -2711,14 +2630,6 @@ retry: err = ext4_init_new_dir(handle, dir, inode); if (err) goto out_clear_inode; -#ifdef CONFIG_EXT4_FS_ENCRYPTION - if (ext4_encrypted_inode(dir) || - DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))) { - err = ext4_inherit_context(dir, inode); - if (err) - goto out_clear_inode; - } -#endif err = ext4_mark_inode_dirty(handle, inode); if (!err) err = ext4_add_entry(handle, dentry, inode); @@ -2779,12 +2690,9 @@ int ext4_empty_dir(struct inode *inode) de = (struct ext4_dir_entry_2 *) bh->b_data; de1 = ext4_next_entry(de, sb->s_blocksize); if (le32_to_cpu(de->inode) != inode->i_ino || - !le32_to_cpu(de1->inode) || - strcmp(".", de->name) || - strcmp("..", de1->name)) { - ext4_warning(inode->i_sb, - "bad directory (dir #%lu) - no `.' or `..'", - inode->i_ino); + le32_to_cpu(de1->inode) == 0 || + strcmp(".", de->name) || strcmp("..", de1->name)) { + ext4_warning_inode(inode, "directory missing '.' and/or '..'"); brelse(bh); return 1; } @@ -3037,8 +2945,9 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) if (retval) goto end_rmdir; if (!EXT4_DIR_LINK_EMPTY(inode)) - ext4_warning(inode->i_sb, - "empty directory has too many links (%d)", + ext4_warning_inode(inode, + "empty directory '%.*s' has too many links (%u)", + dentry->d_name.len, dentry->d_name.name, inode->i_nlink); inode->i_version++; clear_nlink(inode); @@ -3098,10 +3007,9 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - if (!inode->i_nlink) { - ext4_warning(inode->i_sb, - "Deleting nonexistent file (%lu), %d", - inode->i_ino, inode->i_nlink); + if (inode->i_nlink == 0) { + ext4_warning_inode(inode, "Deleting file '%.*s' with no links", + dentry->d_name.len, dentry->d_name.name); set_nlink(inode, 1); } retval = ext4_delete_entry(handle, dir, de, bh); @@ -3140,10 +3048,23 @@ static int ext4_symlink(struct inode *dir, encryption_required = (ext4_encrypted_inode(dir) || DUMMY_ENCRYPTION_ENABLED(EXT4_SB(dir->i_sb))); - if (encryption_required) - disk_link.len = encrypted_symlink_data_len(len) + 1; - if (disk_link.len > dir->i_sb->s_blocksize) - return -ENAMETOOLONG; + if (encryption_required) { + err = ext4_get_encryption_info(dir); + if (err) + return err; + if (ext4_encryption_info(dir) == NULL) + return -EPERM; + disk_link.len = (ext4_fname_encrypted_size(dir, len) + + sizeof(struct ext4_encrypted_symlink_data)); + sd = kzalloc(disk_link.len, GFP_KERNEL); + if (!sd) + return -ENOMEM; + } + + if (disk_link.len > dir->i_sb->s_blocksize) { + err = -ENAMETOOLONG; + goto err_free_sd; + } dquot_initialize(dir); @@ -3174,42 +3095,29 @@ static int ext4_symlink(struct inode *dir, if (IS_ERR(inode)) { if (handle) ext4_journal_stop(handle); - return PTR_ERR(inode); + err = PTR_ERR(inode); + goto err_free_sd; } if (encryption_required) { - struct ext4_fname_crypto_ctx *ctx = NULL; struct qstr istr; struct ext4_str ostr; - sd = kzalloc(disk_link.len, GFP_NOFS); - if (!sd) { - err = -ENOMEM; - goto err_drop_inode; - } - err = ext4_inherit_context(dir, inode); - if (err) - goto err_drop_inode; - ctx = ext4_get_fname_crypto_ctx(inode, - inode->i_sb->s_blocksize); - if (IS_ERR_OR_NULL(ctx)) { - /* We just set the policy, so ctx should not be NULL */ - err = (ctx == NULL) ? -EIO : PTR_ERR(ctx); - goto err_drop_inode; - } istr.name = (const unsigned char *) symname; istr.len = len; ostr.name = sd->encrypted_path; - err = ext4_fname_usr_to_disk(ctx, &istr, &ostr); - ext4_put_fname_crypto_ctx(&ctx); + ostr.len = disk_link.len; + err = ext4_fname_usr_to_disk(inode, &istr, &ostr); if (err < 0) goto err_drop_inode; sd->len = cpu_to_le16(ostr.len); disk_link.name = (char *) sd; + inode->i_op = &ext4_encrypted_symlink_inode_operations; } if ((disk_link.len > EXT4_N_BLOCKS * 4)) { - inode->i_op = &ext4_symlink_inode_operations; + if (!encryption_required) + inode->i_op = &ext4_symlink_inode_operations; ext4_set_aops(inode); /* * We cannot call page_symlink() with transaction started @@ -3249,9 +3157,10 @@ static int ext4_symlink(struct inode *dir, } else { /* clear the extent format for fast symlink */ ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); - inode->i_op = encryption_required ? - &ext4_symlink_inode_operations : - &ext4_fast_symlink_inode_operations; + if (!encryption_required) { + inode->i_op = &ext4_fast_symlink_inode_operations; + inode->i_link = (char *)&EXT4_I(inode)->i_data; + } memcpy((char *)&EXT4_I(inode)->i_data, disk_link.name, disk_link.len); inode->i_size = disk_link.len - 1; @@ -3268,10 +3177,11 @@ static int ext4_symlink(struct inode *dir, err_drop_inode: if (handle) ext4_journal_stop(handle); - kfree(sd); clear_nlink(inode); unlock_new_inode(inode); iput(inode); +err_free_sd: + kfree(sd); return err; } @@ -3487,9 +3397,9 @@ static void ext4_rename_delete(handle_t *handle, struct ext4_renament *ent, } if (retval) { - ext4_warning(ent->dir->i_sb, - "Deleting old file (%lu), %d, error=%d", - ent->dir->i_ino, ent->dir->i_nlink, retval); + ext4_warning_inode(ent->dir, + "Deleting old file: nlink %d, error=%d", + ent->dir->i_nlink, retval); } } @@ -3759,6 +3669,15 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, u8 new_file_type; int retval; + if ((ext4_encrypted_inode(old_dir) || + ext4_encrypted_inode(new_dir)) && + (old_dir != new_dir) && + (!ext4_is_child_context_consistent_with_parent(new_dir, + old.inode) || + !ext4_is_child_context_consistent_with_parent(old_dir, + new.inode))) + return -EPERM; + dquot_initialize(old.dir); dquot_initialize(new.dir); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 5765f88b3..5602450f0 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -84,7 +84,7 @@ static void ext4_finish_bio(struct bio *bio) /* The bounce data pages are unmapped. */ data_page = page; ctx = (struct ext4_crypto_ctx *)page_private(data_page); - page = ctx->control_page; + page = ctx->w.control_page; } #endif @@ -359,7 +359,6 @@ void ext4_io_submit(struct ext4_io_submit *io) if (bio) { bio_get(io->io_bio); submit_bio(io->io_op, io->io_bio); - BUG_ON(bio_flagged(io->io_bio, BIO_EOPNOTSUPP)); bio_put(io->io_bio); } io->io_bio = NULL; diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 171b9ac4b..ec3ef93a5 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -54,8 +54,8 @@ static void completion_pages(struct work_struct *work) { #ifdef CONFIG_EXT4_FS_ENCRYPTION struct ext4_crypto_ctx *ctx = - container_of(work, struct ext4_crypto_ctx, work); - struct bio *bio = ctx->bio; + container_of(work, struct ext4_crypto_ctx, r.work); + struct bio *bio = ctx->r.bio; struct bio_vec *bv; int i; @@ -109,9 +109,9 @@ static void mpage_end_io(struct bio *bio, int err) if (err) { ext4_release_crypto_ctx(ctx); } else { - INIT_WORK(&ctx->work, completion_pages); - ctx->bio = bio; - queue_work(ext4_read_workqueue, &ctx->work); + INIT_WORK(&ctx->r.work, completion_pages); + ctx->r.bio = bio; + queue_work(ext4_read_workqueue, &ctx->r.work); return; } } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ca12affdb..58987b5c5 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -24,6 +24,7 @@ #include <linux/slab.h> #include <linux/init.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/parser.h> #include <linux/buffer_head.h> #include <linux/exportfs.h> @@ -451,7 +452,7 @@ void __ext4_error_file(struct file *file, const char *function, es = EXT4_SB(inode->i_sb)->s_es; es->s_last_error_ino = cpu_to_le32(inode->i_ino); if (ext4_error_ratelimit(inode->i_sb)) { - path = d_path(&(file->f_path), pathname, sizeof(pathname)); + path = file_path(file, pathname, sizeof(pathname)); if (IS_ERR(path)) path = "(unknown)"; va_start(args, fmt); @@ -591,14 +592,17 @@ void __ext4_msg(struct super_block *sb, va_end(args); } +#define ext4_warning_ratelimit(sb) \ + ___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), \ + "EXT4-fs warning") + void __ext4_warning(struct super_block *sb, const char *function, unsigned int line, const char *fmt, ...) { struct va_format vaf; va_list args; - if (!___ratelimit(&(EXT4_SB(sb)->s_warning_ratelimit_state), - "EXT4-fs warning")) + if (!ext4_warning_ratelimit(sb)) return; va_start(args, fmt); @@ -609,6 +613,24 @@ void __ext4_warning(struct super_block *sb, const char *function, va_end(args); } +void __ext4_warning_inode(const struct inode *inode, const char *function, + unsigned int line, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + if (!ext4_warning_ratelimit(inode->i_sb)) + return; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: " + "inode #%lu: comm %s: %pV\n", inode->i_sb->s_id, + function, line, inode->i_ino, current->comm, &vaf); + va_end(args); +} + void __ext4_grp_locked_error(const char *function, unsigned int line, struct super_block *sb, ext4_group_t grp, unsigned long ino, ext4_fsblk_t block, @@ -880,9 +902,8 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) atomic_set(&ei->i_unwritten, 0); INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); #ifdef CONFIG_EXT4_FS_ENCRYPTION - ei->i_encryption_key.mode = EXT4_ENCRYPTION_MODE_INVALID; + ei->i_crypt_info = NULL; #endif - return &ei->vfs_inode; } @@ -959,6 +980,10 @@ void ext4_clear_inode(struct inode *inode) jbd2_free_inode(EXT4_I(inode)->jinode); EXT4_I(inode)->jinode = NULL; } +#ifdef CONFIG_EXT4_FS_ENCRYPTION + if (EXT4_I(inode)->i_crypt_info) + ext4_free_encryption_info(inode, EXT4_I(inode)->i_crypt_info); +#endif } static struct inode *ext4_nfs_get_inode(struct super_block *sb, @@ -3421,7 +3446,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) unsigned long journal_devnum = 0; unsigned long def_mount_opts; struct inode *root; - char *cp; const char *descr; int ret = -ENOMEM; int blocksize, clustersize; @@ -3450,15 +3474,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (sb->s_bdev->bd_part) sbi->s_sectors_written_start = part_stat_read(sb->s_bdev->bd_part, sectors[1]); -#ifdef CONFIG_EXT4_FS_ENCRYPTION - /* Modes of operations for file and directory encryption. */ - sbi->s_file_encryption_mode = EXT4_ENCRYPTION_MODE_AES_256_XTS; - sbi->s_dir_encryption_mode = EXT4_ENCRYPTION_MODE_INVALID; -#endif /* Cleanup superblock name */ - for (cp = sb->s_id; (cp = strchr(cp, '/'));) - *cp = '!'; + strreplace(sb->s_id, '/', '!'); /* -EINVAL is default */ ret = -EINVAL; @@ -4068,7 +4086,15 @@ no_journal: } } - if (unlikely(sbi->s_mount_flags & EXT4_MF_TEST_DUMMY_ENCRYPTION) && + if ((DUMMY_ENCRYPTION_ENABLED(sbi) || + EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) && + (blocksize != PAGE_CACHE_SIZE)) { + ext4_msg(sb, KERN_ERR, + "Unsupported blocksize for fs encryption"); + goto failed_mount_wq; + } + + if (DUMMY_ENCRYPTION_ENABLED(sbi) && !(sb->s_flags & MS_RDONLY) && !EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT)) { EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_ENCRYPT); @@ -5414,6 +5440,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, struct inode *inode = sb_dqopt(sb)->files[type]; ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); int err, offset = off & (sb->s_blocksize - 1); + int retries = 0; struct buffer_head *bh; handle_t *handle = journal_current_handle(); @@ -5434,7 +5461,12 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, return -EIO; } - bh = ext4_bread(handle, inode, blk, 1); + do { + bh = ext4_bread(handle, inode, blk, + EXT4_GET_BLOCKS_CREATE | + EXT4_GET_BLOCKS_METADATA_NOFAIL); + } while (IS_ERR(bh) && (PTR_ERR(bh) == -ENOSPC) && + ext4_should_retry_alloc(inode->i_sb, &retries)); if (IS_ERR(bh)) return PTR_ERR(bh); if (!bh) @@ -5651,6 +5683,7 @@ out7: static void __exit ext4_exit_fs(void) { + ext4_exit_crypto(); ext4_destroy_lazyinit_thread(); unregister_as_ext2(); unregister_as_ext3(); diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index 187b78920..c677f2c10 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -23,34 +23,28 @@ #include "xattr.h" #ifdef CONFIG_EXT4_FS_ENCRYPTION -static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *ext4_encrypted_follow_link(struct dentry *dentry, void **cookie) { struct page *cpage = NULL; char *caddr, *paddr = NULL; struct ext4_str cstr, pstr; struct inode *inode = d_inode(dentry); - struct ext4_fname_crypto_ctx *ctx = NULL; struct ext4_encrypted_symlink_data *sd; loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1); int res; u32 plen, max_size = inode->i_sb->s_blocksize; - if (!ext4_encrypted_inode(inode)) - return page_follow_link_light(dentry, nd); - - ctx = ext4_get_fname_crypto_ctx(inode, inode->i_sb->s_blocksize); - if (IS_ERR(ctx)) - return ctx; + res = ext4_get_encryption_info(inode); + if (res) + return ERR_PTR(res); if (ext4_inode_is_fast_symlink(inode)) { caddr = (char *) EXT4_I(inode)->i_data; max_size = sizeof(EXT4_I(inode)->i_data); } else { cpage = read_mapping_page(inode->i_mapping, 0, NULL); - if (IS_ERR(cpage)) { - ext4_put_fname_crypto_ctx(&ctx); - return cpage; - } + if (IS_ERR(cpage)) + return ERR_CAST(cpage); caddr = kmap(cpage); caddr[size] = 0; } @@ -74,21 +68,19 @@ static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd) goto errout; } pstr.name = paddr; - res = _ext4_fname_disk_to_usr(ctx, NULL, &cstr, &pstr); + pstr.len = plen; + res = _ext4_fname_disk_to_usr(inode, NULL, &cstr, &pstr); if (res < 0) goto errout; /* Null-terminate the name */ if (res <= plen) paddr[res] = '\0'; - nd_set_link(nd, paddr); - ext4_put_fname_crypto_ctx(&ctx); if (cpage) { kunmap(cpage); page_cache_release(cpage); } - return NULL; + return *cookie = paddr; errout: - ext4_put_fname_crypto_ctx(&ctx); if (cpage) { kunmap(cpage); page_cache_release(cpage); @@ -97,36 +89,22 @@ errout: return ERR_PTR(res); } -static void ext4_put_link(struct dentry *dentry, struct nameidata *nd, - void *cookie) -{ - struct page *page = cookie; - - if (!page) { - kfree(nd_get_link(nd)); - } else { - kunmap(page); - page_cache_release(page); - } -} +const struct inode_operations ext4_encrypted_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = ext4_encrypted_follow_link, + .put_link = kfree_put_link, + .setattr = ext4_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ext4_listxattr, + .removexattr = generic_removexattr, +}; #endif -static void *ext4_follow_fast_link(struct dentry *dentry, struct nameidata *nd) -{ - struct ext4_inode_info *ei = EXT4_I(d_inode(dentry)); - nd_set_link(nd, (char *) ei->i_data); - return NULL; -} - const struct inode_operations ext4_symlink_inode_operations = { .readlink = generic_readlink, -#ifdef CONFIG_EXT4_FS_ENCRYPTION - .follow_link = ext4_follow_link, - .put_link = ext4_put_link, -#else .follow_link = page_follow_link_light, .put_link = page_put_link, -#endif .setattr = ext4_setattr, .setxattr = generic_setxattr, .getxattr = generic_getxattr, @@ -136,7 +114,7 @@ const struct inode_operations ext4_symlink_inode_operations = { const struct inode_operations ext4_fast_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = ext4_follow_fast_link, + .follow_link = simple_follow_link, .setattr = ext4_setattr, .setxattr = generic_setxattr, .getxattr = generic_getxattr, diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 05f0f663f..c62976200 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -72,6 +72,25 @@ config F2FS_CHECK_FS If you want to improve the performance, say N. +config F2FS_FS_ENCRYPTION + bool "F2FS Encryption" + depends on F2FS_FS + depends on F2FS_FS_XATTR + select CRYPTO_AES + select CRYPTO_CBC + select CRYPTO_ECB + select CRYPTO_XTS + select CRYPTO_CTS + select CRYPTO_CTR + select CRYPTO_SHA256 + select KEYS + select ENCRYPTED_KEYS + help + Enable encryption of f2fs files and directories. This + feature is similar to ecryptfs, but it is more memory + efficient since it avoids caching the encrypted and + decrypted pages in the page cache. + config F2FS_IO_TRACE bool "F2FS IO tracer" depends on F2FS_FS diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile index d92397731..396be1a39 100644 --- a/fs/f2fs/Makefile +++ b/fs/f2fs/Makefile @@ -6,3 +6,5 @@ f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o +f2fs-$(CONFIG_F2FS_FS_ENCRYPTION) += crypto_policy.o crypto.o \ + crypto_key.o crypto_fname.o diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 4320ffab3..c8f25f724 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -334,51 +334,45 @@ static int f2fs_acl_create(struct inode *dir, umode_t *mode, struct page *dpage) { struct posix_acl *p; + struct posix_acl *clone; int ret; + *acl = NULL; + *default_acl = NULL; + if (S_ISLNK(*mode) || !IS_POSIXACL(dir)) - goto no_acl; + return 0; p = __f2fs_get_acl(dir, ACL_TYPE_DEFAULT, dpage); - if (IS_ERR(p)) { - if (p == ERR_PTR(-EOPNOTSUPP)) - goto apply_umask; - return PTR_ERR(p); + if (!p || p == ERR_PTR(-EOPNOTSUPP)) { + *mode &= ~current_umask(); + return 0; } + if (IS_ERR(p)) + return PTR_ERR(p); - if (!p) - goto apply_umask; - - *acl = f2fs_acl_clone(p, GFP_NOFS); - if (!*acl) + clone = f2fs_acl_clone(p, GFP_NOFS); + if (!clone) goto no_mem; - ret = f2fs_acl_create_masq(*acl, mode); + ret = f2fs_acl_create_masq(clone, mode); if (ret < 0) goto no_mem_clone; - if (ret == 0) { - posix_acl_release(*acl); - *acl = NULL; - } + if (ret == 0) + posix_acl_release(clone); + else + *acl = clone; - if (!S_ISDIR(*mode)) { + if (!S_ISDIR(*mode)) posix_acl_release(p); - *default_acl = NULL; - } else { + else *default_acl = p; - } - return 0; -apply_umask: - *mode &= ~current_umask(); -no_acl: - *default_acl = NULL; - *acl = NULL; return 0; no_mem_clone: - posix_acl_release(*acl); + posix_acl_release(clone); no_mem: posix_acl_release(p); return -ENOMEM; diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index a5e17a2a0..b70bbe1a6 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -52,9 +52,11 @@ struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) struct address_space *mapping = META_MAPPING(sbi); struct page *page; struct f2fs_io_info fio = { + .sbi = sbi, .type = META, .rw = READ_SYNC | REQ_META | REQ_PRIO, .blk_addr = index, + .encrypted_page = NULL, }; repeat: page = grab_cache_page(mapping, index); @@ -65,7 +67,9 @@ repeat: if (PageUptodate(page)) goto out; - if (f2fs_submit_page_bio(sbi, page, &fio)) + fio.page = page; + + if (f2fs_submit_page_bio(&fio)) goto repeat; lock_page(page); @@ -77,8 +81,7 @@ out: return page; } -static inline bool is_valid_blkaddr(struct f2fs_sb_info *sbi, - block_t blkaddr, int type) +bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) { switch (type) { case META_NAT: @@ -118,8 +121,10 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type struct page *page; block_t blkno = start; struct f2fs_io_info fio = { + .sbi = sbi, .type = META, - .rw = READ_SYNC | REQ_META | REQ_PRIO + .rw = READ_SYNC | REQ_META | REQ_PRIO, + .encrypted_page = NULL, }; for (; nrpages-- > 0; blkno++) { @@ -161,7 +166,8 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type continue; } - f2fs_submit_page_mbio(sbi, page, &fio); + fio.page = page; + f2fs_submit_page_mbio(&fio); f2fs_put_page(page, 0); } out: @@ -510,7 +516,12 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) grab_meta_page(sbi, start_blk + index); index = 1; - spin_lock(&im->ino_lock); + + /* + * we don't need to do spin_lock(&im->ino_lock) here, since all the + * orphan inode operations are covered under f2fs_lock_op(). + * And, spin_lock should be avoided due to page operations below. + */ head = &im->ino_list; /* loop for each orphan inode entry and write them in Jornal block */ @@ -550,8 +561,6 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) set_page_dirty(page); f2fs_put_page(page, 1); } - - spin_unlock(&im->ino_lock); } static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, @@ -879,10 +888,8 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num; nid_t last_nid = nm_i->next_scan_nid; block_t start_blk; - struct page *cp_page; unsigned int data_sum_blocks, orphan_blocks; __u32 crc32 = 0; - void *kaddr; int i; int cp_payload_blks = __cp_payload(sbi); @@ -979,19 +986,11 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) start_blk = __start_cp_addr(sbi); /* write out checkpoint buffer at block 0 */ - cp_page = grab_meta_page(sbi, start_blk++); - kaddr = page_address(cp_page); - memcpy(kaddr, ckpt, F2FS_BLKSIZE); - set_page_dirty(cp_page); - f2fs_put_page(cp_page, 1); - - for (i = 1; i < 1 + cp_payload_blks; i++) { - cp_page = grab_meta_page(sbi, start_blk++); - kaddr = page_address(cp_page); - memcpy(kaddr, (char *)ckpt + i * F2FS_BLKSIZE, F2FS_BLKSIZE); - set_page_dirty(cp_page); - f2fs_put_page(cp_page, 1); - } + update_meta_page(sbi, ckpt, start_blk++); + + for (i = 1; i < 1 + cp_payload_blks; i++) + update_meta_page(sbi, (char *)ckpt + i * F2FS_BLKSIZE, + start_blk++); if (orphan_num) { write_orphan_inodes(sbi, start_blk); @@ -1006,11 +1005,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) } /* writeout checkpoint block */ - cp_page = grab_meta_page(sbi, start_blk); - kaddr = page_address(cp_page); - memcpy(kaddr, ckpt, F2FS_BLKSIZE); - set_page_dirty(cp_page); - f2fs_put_page(cp_page, 1); + update_meta_page(sbi, ckpt, start_blk); /* wait for previous submitted node/meta pages writeback */ wait_on_all_pages_writeback(sbi); @@ -1036,7 +1031,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (unlikely(f2fs_cp_error(sbi))) return; - clear_prefree_segments(sbi); + clear_prefree_segments(sbi, cpc); clear_sbi_flag(sbi, SBI_IS_DIRTY); } @@ -1051,7 +1046,8 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) mutex_lock(&sbi->cp_mutex); if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) && - (cpc->reason == CP_FASTBOOT || cpc->reason == CP_SYNC)) + (cpc->reason == CP_FASTBOOT || cpc->reason == CP_SYNC || + (cpc->reason == CP_DISCARD && !sbi->discard_blks))) goto out; if (unlikely(f2fs_cp_error(sbi))) goto out; diff --git a/fs/f2fs/crypto.c b/fs/f2fs/crypto.c new file mode 100644 index 000000000..4a62ef14e --- /dev/null +++ b/fs/f2fs/crypto.c @@ -0,0 +1,491 @@ +/* + * linux/fs/f2fs/crypto.c + * + * Copied from linux/fs/ext4/crypto.c + * + * Copyright (C) 2015, Google, Inc. + * Copyright (C) 2015, Motorola Mobility + * + * This contains encryption functions for f2fs + * + * Written by Michael Halcrow, 2014. + * + * Filename encryption additions + * Uday Savagaonkar, 2014 + * Encryption policy handling additions + * Ildar Muslukhov, 2014 + * Remove ext4_encrypted_zeroout(), + * add f2fs_restore_and_release_control_page() + * Jaegeuk Kim, 2015. + * + * This has not yet undergone a rigorous security audit. + * + * The usage of AES-XTS should conform to recommendations in NIST + * Special Publication 800-38E and IEEE P1619/D16. + */ +#include <crypto/hash.h> +#include <crypto/sha.h> +#include <keys/user-type.h> +#include <keys/encrypted-type.h> +#include <linux/crypto.h> +#include <linux/ecryptfs.h> +#include <linux/gfp.h> +#include <linux/kernel.h> +#include <linux/key.h> +#include <linux/list.h> +#include <linux/mempool.h> +#include <linux/module.h> +#include <linux/mutex.h> +#include <linux/random.h> +#include <linux/scatterlist.h> +#include <linux/spinlock_types.h> +#include <linux/f2fs_fs.h> +#include <linux/ratelimit.h> +#include <linux/bio.h> + +#include "f2fs.h" +#include "xattr.h" + +/* Encryption added and removed here! (L: */ + +static unsigned int num_prealloc_crypto_pages = 32; +static unsigned int num_prealloc_crypto_ctxs = 128; + +module_param(num_prealloc_crypto_pages, uint, 0444); +MODULE_PARM_DESC(num_prealloc_crypto_pages, + "Number of crypto pages to preallocate"); +module_param(num_prealloc_crypto_ctxs, uint, 0444); +MODULE_PARM_DESC(num_prealloc_crypto_ctxs, + "Number of crypto contexts to preallocate"); + +static mempool_t *f2fs_bounce_page_pool; + +static LIST_HEAD(f2fs_free_crypto_ctxs); +static DEFINE_SPINLOCK(f2fs_crypto_ctx_lock); + +static struct workqueue_struct *f2fs_read_workqueue; +static DEFINE_MUTEX(crypto_init); + +static struct kmem_cache *f2fs_crypto_ctx_cachep; +struct kmem_cache *f2fs_crypt_info_cachep; + +/** + * f2fs_release_crypto_ctx() - Releases an encryption context + * @ctx: The encryption context to release. + * + * If the encryption context was allocated from the pre-allocated pool, returns + * it to that pool. Else, frees it. + * + * If there's a bounce page in the context, this frees that. + */ +void f2fs_release_crypto_ctx(struct f2fs_crypto_ctx *ctx) +{ + unsigned long flags; + + if (ctx->flags & F2FS_WRITE_PATH_FL && ctx->w.bounce_page) { + mempool_free(ctx->w.bounce_page, f2fs_bounce_page_pool); + ctx->w.bounce_page = NULL; + } + ctx->w.control_page = NULL; + if (ctx->flags & F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL) { + kmem_cache_free(f2fs_crypto_ctx_cachep, ctx); + } else { + spin_lock_irqsave(&f2fs_crypto_ctx_lock, flags); + list_add(&ctx->free_list, &f2fs_free_crypto_ctxs); + spin_unlock_irqrestore(&f2fs_crypto_ctx_lock, flags); + } +} + +/** + * f2fs_get_crypto_ctx() - Gets an encryption context + * @inode: The inode for which we are doing the crypto + * + * Allocates and initializes an encryption context. + * + * Return: An allocated and initialized encryption context on success; error + * value or NULL otherwise. + */ +struct f2fs_crypto_ctx *f2fs_get_crypto_ctx(struct inode *inode) +{ + struct f2fs_crypto_ctx *ctx = NULL; + unsigned long flags; + struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; + + if (ci == NULL) + return ERR_PTR(-ENOKEY); + + /* + * We first try getting the ctx from a free list because in + * the common case the ctx will have an allocated and + * initialized crypto tfm, so it's probably a worthwhile + * optimization. For the bounce page, we first try getting it + * from the kernel allocator because that's just about as fast + * as getting it from a list and because a cache of free pages + * should generally be a "last resort" option for a filesystem + * to be able to do its job. + */ + spin_lock_irqsave(&f2fs_crypto_ctx_lock, flags); + ctx = list_first_entry_or_null(&f2fs_free_crypto_ctxs, + struct f2fs_crypto_ctx, free_list); + if (ctx) + list_del(&ctx->free_list); + spin_unlock_irqrestore(&f2fs_crypto_ctx_lock, flags); + if (!ctx) { + ctx = kmem_cache_zalloc(f2fs_crypto_ctx_cachep, GFP_NOFS); + if (!ctx) + return ERR_PTR(-ENOMEM); + ctx->flags |= F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL; + } else { + ctx->flags &= ~F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL; + } + ctx->flags &= ~F2FS_WRITE_PATH_FL; + return ctx; +} + +/* + * Call f2fs_decrypt on every single page, reusing the encryption + * context. + */ +static void completion_pages(struct work_struct *work) +{ + struct f2fs_crypto_ctx *ctx = + container_of(work, struct f2fs_crypto_ctx, r.work); + struct bio *bio = ctx->r.bio; + struct bio_vec *bv; + int i; + + bio_for_each_segment_all(bv, bio, i) { + struct page *page = bv->bv_page; + int ret = f2fs_decrypt(ctx, page); + + if (ret) { + WARN_ON_ONCE(1); + SetPageError(page); + } else + SetPageUptodate(page); + unlock_page(page); + } + f2fs_release_crypto_ctx(ctx); + bio_put(bio); +} + +void f2fs_end_io_crypto_work(struct f2fs_crypto_ctx *ctx, struct bio *bio) +{ + INIT_WORK(&ctx->r.work, completion_pages); + ctx->r.bio = bio; + queue_work(f2fs_read_workqueue, &ctx->r.work); +} + +static void f2fs_crypto_destroy(void) +{ + struct f2fs_crypto_ctx *pos, *n; + + list_for_each_entry_safe(pos, n, &f2fs_free_crypto_ctxs, free_list) + kmem_cache_free(f2fs_crypto_ctx_cachep, pos); + INIT_LIST_HEAD(&f2fs_free_crypto_ctxs); + if (f2fs_bounce_page_pool) + mempool_destroy(f2fs_bounce_page_pool); + f2fs_bounce_page_pool = NULL; +} + +/** + * f2fs_crypto_initialize() - Set up for f2fs encryption. + * + * We only call this when we start accessing encrypted files, since it + * results in memory getting allocated that wouldn't otherwise be used. + * + * Return: Zero on success, non-zero otherwise. + */ +int f2fs_crypto_initialize(void) +{ + int i, res = -ENOMEM; + + if (f2fs_bounce_page_pool) + return 0; + + mutex_lock(&crypto_init); + if (f2fs_bounce_page_pool) + goto already_initialized; + + for (i = 0; i < num_prealloc_crypto_ctxs; i++) { + struct f2fs_crypto_ctx *ctx; + + ctx = kmem_cache_zalloc(f2fs_crypto_ctx_cachep, GFP_KERNEL); + if (!ctx) + goto fail; + list_add(&ctx->free_list, &f2fs_free_crypto_ctxs); + } + + /* must be allocated at the last step to avoid race condition above */ + f2fs_bounce_page_pool = + mempool_create_page_pool(num_prealloc_crypto_pages, 0); + if (!f2fs_bounce_page_pool) + goto fail; + +already_initialized: + mutex_unlock(&crypto_init); + return 0; +fail: + f2fs_crypto_destroy(); + mutex_unlock(&crypto_init); + return res; +} + +/** + * f2fs_exit_crypto() - Shutdown the f2fs encryption system + */ +void f2fs_exit_crypto(void) +{ + f2fs_crypto_destroy(); + + if (f2fs_read_workqueue) + destroy_workqueue(f2fs_read_workqueue); + if (f2fs_crypto_ctx_cachep) + kmem_cache_destroy(f2fs_crypto_ctx_cachep); + if (f2fs_crypt_info_cachep) + kmem_cache_destroy(f2fs_crypt_info_cachep); +} + +int __init f2fs_init_crypto(void) +{ + int res = -ENOMEM; + + f2fs_read_workqueue = alloc_workqueue("f2fs_crypto", WQ_HIGHPRI, 0); + if (!f2fs_read_workqueue) + goto fail; + + f2fs_crypto_ctx_cachep = KMEM_CACHE(f2fs_crypto_ctx, + SLAB_RECLAIM_ACCOUNT); + if (!f2fs_crypto_ctx_cachep) + goto fail; + + f2fs_crypt_info_cachep = KMEM_CACHE(f2fs_crypt_info, + SLAB_RECLAIM_ACCOUNT); + if (!f2fs_crypt_info_cachep) + goto fail; + + return 0; +fail: + f2fs_exit_crypto(); + return res; +} + +void f2fs_restore_and_release_control_page(struct page **page) +{ + struct f2fs_crypto_ctx *ctx; + struct page *bounce_page; + + /* The bounce data pages are unmapped. */ + if ((*page)->mapping) + return; + + /* The bounce data page is unmapped. */ + bounce_page = *page; + ctx = (struct f2fs_crypto_ctx *)page_private(bounce_page); + + /* restore control page */ + *page = ctx->w.control_page; + + f2fs_restore_control_page(bounce_page); +} + +void f2fs_restore_control_page(struct page *data_page) +{ + struct f2fs_crypto_ctx *ctx = + (struct f2fs_crypto_ctx *)page_private(data_page); + + set_page_private(data_page, (unsigned long)NULL); + ClearPagePrivate(data_page); + unlock_page(data_page); + f2fs_release_crypto_ctx(ctx); +} + +/** + * f2fs_crypt_complete() - The completion callback for page encryption + * @req: The asynchronous encryption request context + * @res: The result of the encryption operation + */ +static void f2fs_crypt_complete(struct crypto_async_request *req, int res) +{ + struct f2fs_completion_result *ecr = req->data; + + if (res == -EINPROGRESS) + return; + ecr->res = res; + complete(&ecr->completion); +} + +typedef enum { + F2FS_DECRYPT = 0, + F2FS_ENCRYPT, +} f2fs_direction_t; + +static int f2fs_page_crypto(struct f2fs_crypto_ctx *ctx, + struct inode *inode, + f2fs_direction_t rw, + pgoff_t index, + struct page *src_page, + struct page *dest_page) +{ + u8 xts_tweak[F2FS_XTS_TWEAK_SIZE]; + struct ablkcipher_request *req = NULL; + DECLARE_F2FS_COMPLETION_RESULT(ecr); + struct scatterlist dst, src; + struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; + struct crypto_ablkcipher *tfm = ci->ci_ctfm; + int res = 0; + + req = ablkcipher_request_alloc(tfm, GFP_NOFS); + if (!req) { + printk_ratelimited(KERN_ERR + "%s: crypto_request_alloc() failed\n", + __func__); + return -ENOMEM; + } + ablkcipher_request_set_callback( + req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + f2fs_crypt_complete, &ecr); + + BUILD_BUG_ON(F2FS_XTS_TWEAK_SIZE < sizeof(index)); + memcpy(xts_tweak, &index, sizeof(index)); + memset(&xts_tweak[sizeof(index)], 0, + F2FS_XTS_TWEAK_SIZE - sizeof(index)); + + sg_init_table(&dst, 1); + sg_set_page(&dst, dest_page, PAGE_CACHE_SIZE, 0); + sg_init_table(&src, 1); + sg_set_page(&src, src_page, PAGE_CACHE_SIZE, 0); + ablkcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE, + xts_tweak); + if (rw == F2FS_DECRYPT) + res = crypto_ablkcipher_decrypt(req); + else + res = crypto_ablkcipher_encrypt(req); + if (res == -EINPROGRESS || res == -EBUSY) { + BUG_ON(req->base.data != &ecr); + wait_for_completion(&ecr.completion); + res = ecr.res; + } + ablkcipher_request_free(req); + if (res) { + printk_ratelimited(KERN_ERR + "%s: crypto_ablkcipher_encrypt() returned %d\n", + __func__, res); + return res; + } + return 0; +} + +static struct page *alloc_bounce_page(struct f2fs_crypto_ctx *ctx) +{ + ctx->w.bounce_page = mempool_alloc(f2fs_bounce_page_pool, GFP_NOWAIT); + if (ctx->w.bounce_page == NULL) + return ERR_PTR(-ENOMEM); + ctx->flags |= F2FS_WRITE_PATH_FL; + return ctx->w.bounce_page; +} + +/** + * f2fs_encrypt() - Encrypts a page + * @inode: The inode for which the encryption should take place + * @plaintext_page: The page to encrypt. Must be locked. + * + * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx + * encryption context. + * + * Called on the page write path. The caller must call + * f2fs_restore_control_page() on the returned ciphertext page to + * release the bounce buffer and the encryption context. + * + * Return: An allocated page with the encrypted content on success. Else, an + * error value or NULL. + */ +struct page *f2fs_encrypt(struct inode *inode, + struct page *plaintext_page) +{ + struct f2fs_crypto_ctx *ctx; + struct page *ciphertext_page = NULL; + int err; + + BUG_ON(!PageLocked(plaintext_page)); + + ctx = f2fs_get_crypto_ctx(inode); + if (IS_ERR(ctx)) + return (struct page *)ctx; + + /* The encryption operation will require a bounce page. */ + ciphertext_page = alloc_bounce_page(ctx); + if (IS_ERR(ciphertext_page)) + goto err_out; + + ctx->w.control_page = plaintext_page; + err = f2fs_page_crypto(ctx, inode, F2FS_ENCRYPT, plaintext_page->index, + plaintext_page, ciphertext_page); + if (err) { + ciphertext_page = ERR_PTR(err); + goto err_out; + } + + SetPagePrivate(ciphertext_page); + set_page_private(ciphertext_page, (unsigned long)ctx); + lock_page(ciphertext_page); + return ciphertext_page; + +err_out: + f2fs_release_crypto_ctx(ctx); + return ciphertext_page; +} + +/** + * f2fs_decrypt() - Decrypts a page in-place + * @ctx: The encryption context. + * @page: The page to decrypt. Must be locked. + * + * Decrypts page in-place using the ctx encryption context. + * + * Called from the read completion callback. + * + * Return: Zero on success, non-zero otherwise. + */ +int f2fs_decrypt(struct f2fs_crypto_ctx *ctx, struct page *page) +{ + BUG_ON(!PageLocked(page)); + + return f2fs_page_crypto(ctx, page->mapping->host, + F2FS_DECRYPT, page->index, page, page); +} + +/* + * Convenience function which takes care of allocating and + * deallocating the encryption context + */ +int f2fs_decrypt_one(struct inode *inode, struct page *page) +{ + struct f2fs_crypto_ctx *ctx = f2fs_get_crypto_ctx(inode); + int ret; + + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + ret = f2fs_decrypt(ctx, page); + f2fs_release_crypto_ctx(ctx); + return ret; +} + +bool f2fs_valid_contents_enc_mode(uint32_t mode) +{ + return (mode == F2FS_ENCRYPTION_MODE_AES_256_XTS); +} + +/** + * f2fs_validate_encryption_key_size() - Validate the encryption key size + * @mode: The key mode. + * @size: The key size to validate. + * + * Return: The validated key size for @mode. Zero if invalid. + */ +uint32_t f2fs_validate_encryption_key_size(uint32_t mode, uint32_t size) +{ + if (size == f2fs_encryption_key_size(mode)) + return size; + return 0; +} diff --git a/fs/f2fs/crypto_fname.c b/fs/f2fs/crypto_fname.c new file mode 100644 index 000000000..ab377d496 --- /dev/null +++ b/fs/f2fs/crypto_fname.c @@ -0,0 +1,440 @@ +/* + * linux/fs/f2fs/crypto_fname.c + * + * Copied from linux/fs/ext4/crypto.c + * + * Copyright (C) 2015, Google, Inc. + * Copyright (C) 2015, Motorola Mobility + * + * This contains functions for filename crypto management in f2fs + * + * Written by Uday Savagaonkar, 2014. + * + * Adjust f2fs dentry structure + * Jaegeuk Kim, 2015. + * + * This has not yet undergone a rigorous security audit. + */ +#include <crypto/hash.h> +#include <crypto/sha.h> +#include <keys/encrypted-type.h> +#include <keys/user-type.h> +#include <linux/crypto.h> +#include <linux/gfp.h> +#include <linux/kernel.h> +#include <linux/key.h> +#include <linux/list.h> +#include <linux/mempool.h> +#include <linux/random.h> +#include <linux/scatterlist.h> +#include <linux/spinlock_types.h> +#include <linux/f2fs_fs.h> +#include <linux/ratelimit.h> + +#include "f2fs.h" +#include "f2fs_crypto.h" +#include "xattr.h" + +/** + * f2fs_dir_crypt_complete() - + */ +static void f2fs_dir_crypt_complete(struct crypto_async_request *req, int res) +{ + struct f2fs_completion_result *ecr = req->data; + + if (res == -EINPROGRESS) + return; + ecr->res = res; + complete(&ecr->completion); +} + +bool f2fs_valid_filenames_enc_mode(uint32_t mode) +{ + return (mode == F2FS_ENCRYPTION_MODE_AES_256_CTS); +} + +static unsigned max_name_len(struct inode *inode) +{ + return S_ISLNK(inode->i_mode) ? inode->i_sb->s_blocksize : + F2FS_NAME_LEN; +} + +/** + * f2fs_fname_encrypt() - + * + * This function encrypts the input filename, and returns the length of the + * ciphertext. Errors are returned as negative numbers. We trust the caller to + * allocate sufficient memory to oname string. + */ +static int f2fs_fname_encrypt(struct inode *inode, + const struct qstr *iname, struct f2fs_str *oname) +{ + u32 ciphertext_len; + struct ablkcipher_request *req = NULL; + DECLARE_F2FS_COMPLETION_RESULT(ecr); + struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; + struct crypto_ablkcipher *tfm = ci->ci_ctfm; + int res = 0; + char iv[F2FS_CRYPTO_BLOCK_SIZE]; + struct scatterlist src_sg, dst_sg; + int padding = 4 << (ci->ci_flags & F2FS_POLICY_FLAGS_PAD_MASK); + char *workbuf, buf[32], *alloc_buf = NULL; + unsigned lim = max_name_len(inode); + + if (iname->len <= 0 || iname->len > lim) + return -EIO; + + ciphertext_len = (iname->len < F2FS_CRYPTO_BLOCK_SIZE) ? + F2FS_CRYPTO_BLOCK_SIZE : iname->len; + ciphertext_len = f2fs_fname_crypto_round_up(ciphertext_len, padding); + ciphertext_len = (ciphertext_len > lim) ? lim : ciphertext_len; + + if (ciphertext_len <= sizeof(buf)) { + workbuf = buf; + } else { + alloc_buf = kmalloc(ciphertext_len, GFP_NOFS); + if (!alloc_buf) + return -ENOMEM; + workbuf = alloc_buf; + } + + /* Allocate request */ + req = ablkcipher_request_alloc(tfm, GFP_NOFS); + if (!req) { + printk_ratelimited(KERN_ERR + "%s: crypto_request_alloc() failed\n", __func__); + kfree(alloc_buf); + return -ENOMEM; + } + ablkcipher_request_set_callback(req, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + f2fs_dir_crypt_complete, &ecr); + + /* Copy the input */ + memcpy(workbuf, iname->name, iname->len); + if (iname->len < ciphertext_len) + memset(workbuf + iname->len, 0, ciphertext_len - iname->len); + + /* Initialize IV */ + memset(iv, 0, F2FS_CRYPTO_BLOCK_SIZE); + + /* Create encryption request */ + sg_init_one(&src_sg, workbuf, ciphertext_len); + sg_init_one(&dst_sg, oname->name, ciphertext_len); + ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv); + res = crypto_ablkcipher_encrypt(req); + if (res == -EINPROGRESS || res == -EBUSY) { + BUG_ON(req->base.data != &ecr); + wait_for_completion(&ecr.completion); + res = ecr.res; + } + kfree(alloc_buf); + ablkcipher_request_free(req); + if (res < 0) { + printk_ratelimited(KERN_ERR + "%s: Error (error code %d)\n", __func__, res); + } + oname->len = ciphertext_len; + return res; +} + +/* + * f2fs_fname_decrypt() + * This function decrypts the input filename, and returns + * the length of the plaintext. + * Errors are returned as negative numbers. + * We trust the caller to allocate sufficient memory to oname string. + */ +static int f2fs_fname_decrypt(struct inode *inode, + const struct f2fs_str *iname, struct f2fs_str *oname) +{ + struct ablkcipher_request *req = NULL; + DECLARE_F2FS_COMPLETION_RESULT(ecr); + struct scatterlist src_sg, dst_sg; + struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; + struct crypto_ablkcipher *tfm = ci->ci_ctfm; + int res = 0; + char iv[F2FS_CRYPTO_BLOCK_SIZE]; + unsigned lim = max_name_len(inode); + + if (iname->len <= 0 || iname->len > lim) + return -EIO; + + /* Allocate request */ + req = ablkcipher_request_alloc(tfm, GFP_NOFS); + if (!req) { + printk_ratelimited(KERN_ERR + "%s: crypto_request_alloc() failed\n", __func__); + return -ENOMEM; + } + ablkcipher_request_set_callback(req, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + f2fs_dir_crypt_complete, &ecr); + + /* Initialize IV */ + memset(iv, 0, F2FS_CRYPTO_BLOCK_SIZE); + + /* Create decryption request */ + sg_init_one(&src_sg, iname->name, iname->len); + sg_init_one(&dst_sg, oname->name, oname->len); + ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv); + res = crypto_ablkcipher_decrypt(req); + if (res == -EINPROGRESS || res == -EBUSY) { + BUG_ON(req->base.data != &ecr); + wait_for_completion(&ecr.completion); + res = ecr.res; + } + ablkcipher_request_free(req); + if (res < 0) { + printk_ratelimited(KERN_ERR + "%s: Error in f2fs_fname_decrypt (error code %d)\n", + __func__, res); + return res; + } + + oname->len = strnlen(oname->name, iname->len); + return oname->len; +} + +static const char *lookup_table = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; + +/** + * f2fs_fname_encode_digest() - + * + * Encodes the input digest using characters from the set [a-zA-Z0-9_+]. + * The encoded string is roughly 4/3 times the size of the input string. + */ +static int digest_encode(const char *src, int len, char *dst) +{ + int i = 0, bits = 0, ac = 0; + char *cp = dst; + + while (i < len) { + ac += (((unsigned char) src[i]) << bits); + bits += 8; + do { + *cp++ = lookup_table[ac & 0x3f]; + ac >>= 6; + bits -= 6; + } while (bits >= 6); + i++; + } + if (bits) + *cp++ = lookup_table[ac & 0x3f]; + return cp - dst; +} + +static int digest_decode(const char *src, int len, char *dst) +{ + int i = 0, bits = 0, ac = 0; + const char *p; + char *cp = dst; + + while (i < len) { + p = strchr(lookup_table, src[i]); + if (p == NULL || src[i] == 0) + return -2; + ac += (p - lookup_table) << bits; + bits += 6; + if (bits >= 8) { + *cp++ = ac & 0xff; + ac >>= 8; + bits -= 8; + } + i++; + } + if (ac) + return -1; + return cp - dst; +} + +/** + * f2fs_fname_crypto_round_up() - + * + * Return: The next multiple of block size + */ +u32 f2fs_fname_crypto_round_up(u32 size, u32 blksize) +{ + return ((size + blksize - 1) / blksize) * blksize; +} + +/** + * f2fs_fname_crypto_alloc_obuff() - + * + * Allocates an output buffer that is sufficient for the crypto operation + * specified by the context and the direction. + */ +int f2fs_fname_crypto_alloc_buffer(struct inode *inode, + u32 ilen, struct f2fs_str *crypto_str) +{ + unsigned int olen; + int padding = 16; + struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; + + if (ci) + padding = 4 << (ci->ci_flags & F2FS_POLICY_FLAGS_PAD_MASK); + if (padding < F2FS_CRYPTO_BLOCK_SIZE) + padding = F2FS_CRYPTO_BLOCK_SIZE; + olen = f2fs_fname_crypto_round_up(ilen, padding); + crypto_str->len = olen; + if (olen < F2FS_FNAME_CRYPTO_DIGEST_SIZE * 2) + olen = F2FS_FNAME_CRYPTO_DIGEST_SIZE * 2; + /* Allocated buffer can hold one more character to null-terminate the + * string */ + crypto_str->name = kmalloc(olen + 1, GFP_NOFS); + if (!(crypto_str->name)) + return -ENOMEM; + return 0; +} + +/** + * f2fs_fname_crypto_free_buffer() - + * + * Frees the buffer allocated for crypto operation. + */ +void f2fs_fname_crypto_free_buffer(struct f2fs_str *crypto_str) +{ + if (!crypto_str) + return; + kfree(crypto_str->name); + crypto_str->name = NULL; +} + +/** + * f2fs_fname_disk_to_usr() - converts a filename from disk space to user space + */ +int f2fs_fname_disk_to_usr(struct inode *inode, + f2fs_hash_t *hash, + const struct f2fs_str *iname, + struct f2fs_str *oname) +{ + const struct qstr qname = FSTR_TO_QSTR(iname); + char buf[24]; + int ret; + + if (is_dot_dotdot(&qname)) { + oname->name[0] = '.'; + oname->name[iname->len - 1] = '.'; + oname->len = iname->len; + return oname->len; + } + + if (F2FS_I(inode)->i_crypt_info) + return f2fs_fname_decrypt(inode, iname, oname); + + if (iname->len <= F2FS_FNAME_CRYPTO_DIGEST_SIZE) { + ret = digest_encode(iname->name, iname->len, oname->name); + oname->len = ret; + return ret; + } + if (hash) { + memcpy(buf, hash, 4); + memset(buf + 4, 0, 4); + } else + memset(buf, 0, 8); + memcpy(buf + 8, iname->name + iname->len - 16, 16); + oname->name[0] = '_'; + ret = digest_encode(buf, 24, oname->name + 1); + oname->len = ret + 1; + return ret + 1; +} + +/** + * f2fs_fname_usr_to_disk() - converts a filename from user space to disk space + */ +int f2fs_fname_usr_to_disk(struct inode *inode, + const struct qstr *iname, + struct f2fs_str *oname) +{ + int res; + struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; + + if (is_dot_dotdot(iname)) { + oname->name[0] = '.'; + oname->name[iname->len - 1] = '.'; + oname->len = iname->len; + return oname->len; + } + + if (ci) { + res = f2fs_fname_encrypt(inode, iname, oname); + return res; + } + /* Without a proper key, a user is not allowed to modify the filenames + * in a directory. Consequently, a user space name cannot be mapped to + * a disk-space name */ + return -EACCES; +} + +int f2fs_fname_setup_filename(struct inode *dir, const struct qstr *iname, + int lookup, struct f2fs_filename *fname) +{ + struct f2fs_crypt_info *ci; + int ret = 0, bigname = 0; + + memset(fname, 0, sizeof(struct f2fs_filename)); + fname->usr_fname = iname; + + if (!f2fs_encrypted_inode(dir) || is_dot_dotdot(iname)) { + fname->disk_name.name = (unsigned char *)iname->name; + fname->disk_name.len = iname->len; + return 0; + } + ret = f2fs_get_encryption_info(dir); + if (ret) + return ret; + ci = F2FS_I(dir)->i_crypt_info; + if (ci) { + ret = f2fs_fname_crypto_alloc_buffer(dir, iname->len, + &fname->crypto_buf); + if (ret < 0) + return ret; + ret = f2fs_fname_encrypt(dir, iname, &fname->crypto_buf); + if (ret < 0) + goto errout; + fname->disk_name.name = fname->crypto_buf.name; + fname->disk_name.len = fname->crypto_buf.len; + return 0; + } + if (!lookup) + return -EACCES; + + /* We don't have the key and we are doing a lookup; decode the + * user-supplied name + */ + if (iname->name[0] == '_') + bigname = 1; + if ((bigname && (iname->len != 33)) || + (!bigname && (iname->len > 43))) + return -ENOENT; + + fname->crypto_buf.name = kmalloc(32, GFP_KERNEL); + if (fname->crypto_buf.name == NULL) + return -ENOMEM; + ret = digest_decode(iname->name + bigname, iname->len - bigname, + fname->crypto_buf.name); + if (ret < 0) { + ret = -ENOENT; + goto errout; + } + fname->crypto_buf.len = ret; + if (bigname) { + memcpy(&fname->hash, fname->crypto_buf.name, 4); + } else { + fname->disk_name.name = fname->crypto_buf.name; + fname->disk_name.len = fname->crypto_buf.len; + } + return 0; +errout: + f2fs_fname_crypto_free_buffer(&fname->crypto_buf); + return ret; +} + +void f2fs_fname_free_filename(struct f2fs_filename *fname) +{ + kfree(fname->crypto_buf.name); + fname->crypto_buf.name = NULL; + fname->usr_fname = NULL; + fname->disk_name.name = NULL; +} diff --git a/fs/f2fs/crypto_key.c b/fs/f2fs/crypto_key.c new file mode 100644 index 000000000..95b8f936f --- /dev/null +++ b/fs/f2fs/crypto_key.c @@ -0,0 +1,255 @@ +/* + * linux/fs/f2fs/crypto_key.c + * + * Copied from linux/fs/f2fs/crypto_key.c + * + * Copyright (C) 2015, Google, Inc. + * + * This contains encryption key functions for f2fs + * + * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. + */ +#include <keys/encrypted-type.h> +#include <keys/user-type.h> +#include <linux/random.h> +#include <linux/scatterlist.h> +#include <uapi/linux/keyctl.h> +#include <crypto/hash.h> +#include <linux/f2fs_fs.h> + +#include "f2fs.h" +#include "xattr.h" + +static void derive_crypt_complete(struct crypto_async_request *req, int rc) +{ + struct f2fs_completion_result *ecr = req->data; + + if (rc == -EINPROGRESS) + return; + + ecr->res = rc; + complete(&ecr->completion); +} + +/** + * f2fs_derive_key_aes() - Derive a key using AES-128-ECB + * @deriving_key: Encryption key used for derivatio. + * @source_key: Source key to which to apply derivation. + * @derived_key: Derived key. + * + * Return: Zero on success; non-zero otherwise. + */ +static int f2fs_derive_key_aes(char deriving_key[F2FS_AES_128_ECB_KEY_SIZE], + char source_key[F2FS_AES_256_XTS_KEY_SIZE], + char derived_key[F2FS_AES_256_XTS_KEY_SIZE]) +{ + int res = 0; + struct ablkcipher_request *req = NULL; + DECLARE_F2FS_COMPLETION_RESULT(ecr); + struct scatterlist src_sg, dst_sg; + struct crypto_ablkcipher *tfm = crypto_alloc_ablkcipher("ecb(aes)", 0, + 0); + + if (IS_ERR(tfm)) { + res = PTR_ERR(tfm); + tfm = NULL; + goto out; + } + crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY); + req = ablkcipher_request_alloc(tfm, GFP_NOFS); + if (!req) { + res = -ENOMEM; + goto out; + } + ablkcipher_request_set_callback(req, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + derive_crypt_complete, &ecr); + res = crypto_ablkcipher_setkey(tfm, deriving_key, + F2FS_AES_128_ECB_KEY_SIZE); + if (res < 0) + goto out; + + sg_init_one(&src_sg, source_key, F2FS_AES_256_XTS_KEY_SIZE); + sg_init_one(&dst_sg, derived_key, F2FS_AES_256_XTS_KEY_SIZE); + ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, + F2FS_AES_256_XTS_KEY_SIZE, NULL); + res = crypto_ablkcipher_encrypt(req); + if (res == -EINPROGRESS || res == -EBUSY) { + BUG_ON(req->base.data != &ecr); + wait_for_completion(&ecr.completion); + res = ecr.res; + } +out: + if (req) + ablkcipher_request_free(req); + if (tfm) + crypto_free_ablkcipher(tfm); + return res; +} + +static void f2fs_free_crypt_info(struct f2fs_crypt_info *ci) +{ + if (!ci) + return; + + if (ci->ci_keyring_key) + key_put(ci->ci_keyring_key); + crypto_free_ablkcipher(ci->ci_ctfm); + kmem_cache_free(f2fs_crypt_info_cachep, ci); +} + +void f2fs_free_encryption_info(struct inode *inode, struct f2fs_crypt_info *ci) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_crypt_info *prev; + + if (ci == NULL) + ci = ACCESS_ONCE(fi->i_crypt_info); + if (ci == NULL) + return; + prev = cmpxchg(&fi->i_crypt_info, ci, NULL); + if (prev != ci) + return; + + f2fs_free_crypt_info(ci); +} + +int _f2fs_get_encryption_info(struct inode *inode) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_crypt_info *crypt_info; + char full_key_descriptor[F2FS_KEY_DESC_PREFIX_SIZE + + (F2FS_KEY_DESCRIPTOR_SIZE * 2) + 1]; + struct key *keyring_key = NULL; + struct f2fs_encryption_key *master_key; + struct f2fs_encryption_context ctx; + struct user_key_payload *ukp; + struct crypto_ablkcipher *ctfm; + const char *cipher_str; + char raw_key[F2FS_MAX_KEY_SIZE]; + char mode; + int res; + + res = f2fs_crypto_initialize(); + if (res) + return res; +retry: + crypt_info = ACCESS_ONCE(fi->i_crypt_info); + if (crypt_info) { + if (!crypt_info->ci_keyring_key || + key_validate(crypt_info->ci_keyring_key) == 0) + return 0; + f2fs_free_encryption_info(inode, crypt_info); + goto retry; + } + + res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, + F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, + &ctx, sizeof(ctx), NULL); + if (res < 0) + return res; + else if (res != sizeof(ctx)) + return -EINVAL; + res = 0; + + crypt_info = kmem_cache_alloc(f2fs_crypt_info_cachep, GFP_NOFS); + if (!crypt_info) + return -ENOMEM; + + crypt_info->ci_flags = ctx.flags; + crypt_info->ci_data_mode = ctx.contents_encryption_mode; + crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; + crypt_info->ci_ctfm = NULL; + crypt_info->ci_keyring_key = NULL; + memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, + sizeof(crypt_info->ci_master_key)); + if (S_ISREG(inode->i_mode)) + mode = crypt_info->ci_data_mode; + else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + mode = crypt_info->ci_filename_mode; + else + BUG(); + + switch (mode) { + case F2FS_ENCRYPTION_MODE_AES_256_XTS: + cipher_str = "xts(aes)"; + break; + case F2FS_ENCRYPTION_MODE_AES_256_CTS: + cipher_str = "cts(cbc(aes))"; + break; + default: + printk_once(KERN_WARNING + "f2fs: unsupported key mode %d (ino %u)\n", + mode, (unsigned) inode->i_ino); + res = -ENOKEY; + goto out; + } + + memcpy(full_key_descriptor, F2FS_KEY_DESC_PREFIX, + F2FS_KEY_DESC_PREFIX_SIZE); + sprintf(full_key_descriptor + F2FS_KEY_DESC_PREFIX_SIZE, + "%*phN", F2FS_KEY_DESCRIPTOR_SIZE, + ctx.master_key_descriptor); + full_key_descriptor[F2FS_KEY_DESC_PREFIX_SIZE + + (2 * F2FS_KEY_DESCRIPTOR_SIZE)] = '\0'; + keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL); + if (IS_ERR(keyring_key)) { + res = PTR_ERR(keyring_key); + keyring_key = NULL; + goto out; + } + crypt_info->ci_keyring_key = keyring_key; + BUG_ON(keyring_key->type != &key_type_logon); + ukp = ((struct user_key_payload *)keyring_key->payload.data); + if (ukp->datalen != sizeof(struct f2fs_encryption_key)) { + res = -EINVAL; + goto out; + } + master_key = (struct f2fs_encryption_key *)ukp->data; + BUILD_BUG_ON(F2FS_AES_128_ECB_KEY_SIZE != + F2FS_KEY_DERIVATION_NONCE_SIZE); + BUG_ON(master_key->size != F2FS_AES_256_XTS_KEY_SIZE); + res = f2fs_derive_key_aes(ctx.nonce, master_key->raw, + raw_key); + if (res) + goto out; + + ctfm = crypto_alloc_ablkcipher(cipher_str, 0, 0); + if (!ctfm || IS_ERR(ctfm)) { + res = ctfm ? PTR_ERR(ctfm) : -ENOMEM; + printk(KERN_DEBUG + "%s: error %d (inode %u) allocating crypto tfm\n", + __func__, res, (unsigned) inode->i_ino); + goto out; + } + crypt_info->ci_ctfm = ctfm; + crypto_ablkcipher_clear_flags(ctfm, ~0); + crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctfm), + CRYPTO_TFM_REQ_WEAK_KEY); + res = crypto_ablkcipher_setkey(ctfm, raw_key, + f2fs_encryption_key_size(mode)); + if (res) + goto out; + + memzero_explicit(raw_key, sizeof(raw_key)); + if (cmpxchg(&fi->i_crypt_info, NULL, crypt_info) != NULL) { + f2fs_free_crypt_info(crypt_info); + goto retry; + } + return 0; + +out: + if (res == -ENOKEY && !S_ISREG(inode->i_mode)) + res = 0; + + f2fs_free_crypt_info(crypt_info); + memzero_explicit(raw_key, sizeof(raw_key)); + return res; +} + +int f2fs_has_encryption_key(struct inode *inode) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + + return (fi->i_crypt_info != NULL); +} diff --git a/fs/f2fs/crypto_policy.c b/fs/f2fs/crypto_policy.c new file mode 100644 index 000000000..d4a96af51 --- /dev/null +++ b/fs/f2fs/crypto_policy.c @@ -0,0 +1,209 @@ +/* + * copied from linux/fs/ext4/crypto_policy.c + * + * Copyright (C) 2015, Google, Inc. + * Copyright (C) 2015, Motorola Mobility. + * + * This contains encryption policy functions for f2fs with some modifications + * to support f2fs-specific xattr APIs. + * + * Written by Michael Halcrow, 2015. + * Modified by Jaegeuk Kim, 2015. + */ +#include <linux/random.h> +#include <linux/string.h> +#include <linux/types.h> +#include <linux/f2fs_fs.h> + +#include "f2fs.h" +#include "xattr.h" + +static int f2fs_inode_has_encryption_context(struct inode *inode) +{ + int res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, + F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, NULL, 0, NULL); + return (res > 0); +} + +/* + * check whether the policy is consistent with the encryption context + * for the inode + */ +static int f2fs_is_encryption_context_consistent_with_policy( + struct inode *inode, const struct f2fs_encryption_policy *policy) +{ + struct f2fs_encryption_context ctx; + int res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, + F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, + sizeof(ctx), NULL); + + if (res != sizeof(ctx)) + return 0; + + return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor, + F2FS_KEY_DESCRIPTOR_SIZE) == 0 && + (ctx.flags == policy->flags) && + (ctx.contents_encryption_mode == + policy->contents_encryption_mode) && + (ctx.filenames_encryption_mode == + policy->filenames_encryption_mode)); +} + +static int f2fs_create_encryption_context_from_policy( + struct inode *inode, const struct f2fs_encryption_policy *policy) +{ + struct f2fs_encryption_context ctx; + + ctx.format = F2FS_ENCRYPTION_CONTEXT_FORMAT_V1; + memcpy(ctx.master_key_descriptor, policy->master_key_descriptor, + F2FS_KEY_DESCRIPTOR_SIZE); + + if (!f2fs_valid_contents_enc_mode(policy->contents_encryption_mode)) { + printk(KERN_WARNING + "%s: Invalid contents encryption mode %d\n", __func__, + policy->contents_encryption_mode); + return -EINVAL; + } + + if (!f2fs_valid_filenames_enc_mode(policy->filenames_encryption_mode)) { + printk(KERN_WARNING + "%s: Invalid filenames encryption mode %d\n", __func__, + policy->filenames_encryption_mode); + return -EINVAL; + } + + if (policy->flags & ~F2FS_POLICY_FLAGS_VALID) + return -EINVAL; + + ctx.contents_encryption_mode = policy->contents_encryption_mode; + ctx.filenames_encryption_mode = policy->filenames_encryption_mode; + ctx.flags = policy->flags; + BUILD_BUG_ON(sizeof(ctx.nonce) != F2FS_KEY_DERIVATION_NONCE_SIZE); + get_random_bytes(ctx.nonce, F2FS_KEY_DERIVATION_NONCE_SIZE); + + return f2fs_setxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, + F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, + sizeof(ctx), NULL, XATTR_CREATE); +} + +int f2fs_process_policy(const struct f2fs_encryption_policy *policy, + struct inode *inode) +{ + if (policy->version != 0) + return -EINVAL; + + if (!S_ISDIR(inode->i_mode)) + return -EINVAL; + + if (!f2fs_inode_has_encryption_context(inode)) { + if (!f2fs_empty_dir(inode)) + return -ENOTEMPTY; + return f2fs_create_encryption_context_from_policy(inode, + policy); + } + + if (f2fs_is_encryption_context_consistent_with_policy(inode, policy)) + return 0; + + printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n", + __func__); + return -EINVAL; +} + +int f2fs_get_policy(struct inode *inode, struct f2fs_encryption_policy *policy) +{ + struct f2fs_encryption_context ctx; + int res; + + if (!f2fs_encrypted_inode(inode)) + return -ENODATA; + + res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, + F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, + &ctx, sizeof(ctx), NULL); + if (res != sizeof(ctx)) + return -ENODATA; + if (ctx.format != F2FS_ENCRYPTION_CONTEXT_FORMAT_V1) + return -EINVAL; + + policy->version = 0; + policy->contents_encryption_mode = ctx.contents_encryption_mode; + policy->filenames_encryption_mode = ctx.filenames_encryption_mode; + policy->flags = ctx.flags; + memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor, + F2FS_KEY_DESCRIPTOR_SIZE); + return 0; +} + +int f2fs_is_child_context_consistent_with_parent(struct inode *parent, + struct inode *child) +{ + struct f2fs_crypt_info *parent_ci, *child_ci; + int res; + + if ((parent == NULL) || (child == NULL)) { + pr_err("parent %p child %p\n", parent, child); + BUG_ON(1); + } + + /* no restrictions if the parent directory is not encrypted */ + if (!f2fs_encrypted_inode(parent)) + return 1; + /* if the child directory is not encrypted, this is always a problem */ + if (!f2fs_encrypted_inode(child)) + return 0; + res = f2fs_get_encryption_info(parent); + if (res) + return 0; + res = f2fs_get_encryption_info(child); + if (res) + return 0; + parent_ci = F2FS_I(parent)->i_crypt_info; + child_ci = F2FS_I(child)->i_crypt_info; + if (!parent_ci && !child_ci) + return 1; + if (!parent_ci || !child_ci) + return 0; + + return (memcmp(parent_ci->ci_master_key, + child_ci->ci_master_key, + F2FS_KEY_DESCRIPTOR_SIZE) == 0 && + (parent_ci->ci_data_mode == child_ci->ci_data_mode) && + (parent_ci->ci_filename_mode == child_ci->ci_filename_mode) && + (parent_ci->ci_flags == child_ci->ci_flags)); +} + +/** + * f2fs_inherit_context() - Sets a child context from its parent + * @parent: Parent inode from which the context is inherited. + * @child: Child inode that inherits the context from @parent. + * + * Return: Zero on success, non-zero otherwise + */ +int f2fs_inherit_context(struct inode *parent, struct inode *child, + struct page *ipage) +{ + struct f2fs_encryption_context ctx; + struct f2fs_crypt_info *ci; + int res; + + res = f2fs_get_encryption_info(parent); + if (res < 0) + return res; + + ci = F2FS_I(parent)->i_crypt_info; + BUG_ON(ci == NULL); + + ctx.format = F2FS_ENCRYPTION_CONTEXT_FORMAT_V1; + + ctx.contents_encryption_mode = ci->ci_data_mode; + ctx.filenames_encryption_mode = ci->ci_filename_mode; + ctx.flags = ci->ci_flags; + memcpy(ctx.master_key_descriptor, ci->ci_master_key, + F2FS_KEY_DESCRIPTOR_SIZE); + + get_random_bytes(ctx.nonce, F2FS_KEY_DERIVATION_NONCE_SIZE); + return f2fs_setxattr(child, F2FS_XATTR_INDEX_ENCRYPTION, + F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, + sizeof(ctx), ipage, XATTR_CREATE); +} diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 1e1aae669..f71e19a9d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -18,6 +18,7 @@ #include <linux/bio.h> #include <linux/prefetch.h> #include <linux/uio.h> +#include <linux/cleancache.h> #include "f2fs.h" #include "node.h" @@ -33,6 +34,15 @@ static void f2fs_read_end_io(struct bio *bio, int err) struct bio_vec *bvec; int i; + if (f2fs_bio_encrypted(bio)) { + if (err) { + f2fs_release_crypto_ctx(bio->bi_private); + } else { + f2fs_end_io_crypto_work(bio->bi_private, bio); + return; + } + } + bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; @@ -56,6 +66,8 @@ static void f2fs_write_end_io(struct bio *bio, int err) bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; + f2fs_restore_and_release_control_page(&page); + if (unlikely(err)) { set_page_dirty(page); set_bit(AS_EIO, &page->mapping->flags); @@ -86,7 +98,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, bio->bi_bdev = sbi->sb->s_bdev; bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr); bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io; - bio->bi_private = sbi; + bio->bi_private = is_read ? NULL : sbi; return bio; } @@ -133,16 +145,16 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, * Fill the locked page with data located in the block address. * Return unlocked page. */ -int f2fs_submit_page_bio(struct f2fs_sb_info *sbi, struct page *page, - struct f2fs_io_info *fio) +int f2fs_submit_page_bio(struct f2fs_io_info *fio) { struct bio *bio; + struct page *page = fio->encrypted_page ? fio->encrypted_page : fio->page; trace_f2fs_submit_page_bio(page, fio); - f2fs_trace_ios(page, fio, 0); + f2fs_trace_ios(fio, 0); /* Allocate a new bio */ - bio = __bio_alloc(sbi, fio->blk_addr, 1, is_read_io(fio->rw)); + bio = __bio_alloc(fio->sbi, fio->blk_addr, 1, is_read_io(fio->rw)); if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { bio_put(bio); @@ -154,12 +166,13 @@ int f2fs_submit_page_bio(struct f2fs_sb_info *sbi, struct page *page, return 0; } -void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page, - struct f2fs_io_info *fio) +void f2fs_submit_page_mbio(struct f2fs_io_info *fio) { + struct f2fs_sb_info *sbi = fio->sbi; enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); struct f2fs_bio_info *io; bool is_read = is_read_io(fio->rw); + struct page *bio_page; io = is_read ? &sbi->read_io : &sbi->write_io[btype]; @@ -181,17 +194,19 @@ alloc_new: io->fio = *fio; } - if (bio_add_page(io->bio, page, PAGE_CACHE_SIZE, 0) < + bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page; + + if (bio_add_page(io->bio, bio_page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { __submit_merged_bio(io); goto alloc_new; } io->last_block_in_bio = fio->blk_addr; - f2fs_trace_ios(page, fio, 0); + f2fs_trace_ios(fio, 0); up_write(&io->io_rwsem); - trace_f2fs_submit_page_mbio(page, fio); + trace_f2fs_submit_page_mbio(fio->page, fio); } /* @@ -251,19 +266,6 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) return err; } -static void f2fs_map_bh(struct super_block *sb, pgoff_t pgofs, - struct extent_info *ei, struct buffer_head *bh_result) -{ - unsigned int blkbits = sb->s_blocksize_bits; - size_t max_size = bh_result->b_size; - size_t mapped_size; - - clear_buffer_new(bh_result); - map_bh(bh_result, sb, ei->blk + pgofs - ei->fofs); - mapped_size = (ei->fofs + ei->len - pgofs) << blkbits; - bh_result->b_size = min(max_size, mapped_size); -} - static bool lookup_extent_info(struct inode *inode, pgoff_t pgofs, struct extent_info *ei) { @@ -905,7 +907,7 @@ void f2fs_update_extent_cache(struct dnode_of_data *dn) sync_inode_page(dn); } -struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) +struct page *get_read_data_page(struct inode *inode, pgoff_t index, int rw) { struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; @@ -913,83 +915,15 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) struct extent_info ei; int err; struct f2fs_io_info fio = { + .sbi = F2FS_I_SB(inode), .type = DATA, - .rw = sync ? READ_SYNC : READA, + .rw = rw, + .encrypted_page = NULL, }; - /* - * If sync is false, it needs to check its block allocation. - * This is need and triggered by two flows: - * gc and truncate_partial_data_page. - */ - if (!sync) - goto search; - - page = find_get_page(mapping, index); - if (page && PageUptodate(page)) - return page; - f2fs_put_page(page, 0); -search: - if (f2fs_lookup_extent_cache(inode, index, &ei)) { - dn.data_blkaddr = ei.blk + index - ei.fofs; - goto got_it; - } - - set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, index, LOOKUP_NODE); - if (err) - return ERR_PTR(err); - f2fs_put_dnode(&dn); + if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) + return read_mapping_page(mapping, index, NULL); - if (dn.data_blkaddr == NULL_ADDR) - return ERR_PTR(-ENOENT); - - /* By fallocate(), there is no cached page, but with NEW_ADDR */ - if (unlikely(dn.data_blkaddr == NEW_ADDR)) - return ERR_PTR(-EINVAL); - -got_it: - page = grab_cache_page(mapping, index); - if (!page) - return ERR_PTR(-ENOMEM); - - if (PageUptodate(page)) { - unlock_page(page); - return page; - } - - fio.blk_addr = dn.data_blkaddr; - err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, &fio); - if (err) - return ERR_PTR(err); - - if (sync) { - wait_on_page_locked(page); - if (unlikely(!PageUptodate(page))) { - f2fs_put_page(page, 0); - return ERR_PTR(-EIO); - } - } - return page; -} - -/* - * If it tries to access a hole, return an error. - * Because, the callers, functions in dir.c and GC, should be able to know - * whether this page exists or not. - */ -struct page *get_lock_data_page(struct inode *inode, pgoff_t index) -{ - struct address_space *mapping = inode->i_mapping; - struct dnode_of_data dn; - struct page *page; - struct extent_info ei; - int err; - struct f2fs_io_info fio = { - .type = DATA, - .rw = READ_SYNC, - }; -repeat: page = grab_cache_page(mapping, index); if (!page) return ERR_PTR(-ENOMEM); @@ -1011,10 +945,11 @@ repeat: f2fs_put_page(page, 1); return ERR_PTR(-ENOENT); } - got_it: - if (PageUptodate(page)) + if (PageUptodate(page)) { + unlock_page(page); return page; + } /* * A new dentry page is allocated but not able to be written, since its @@ -1025,14 +960,58 @@ got_it: if (dn.data_blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_CACHE_SIZE); SetPageUptodate(page); + unlock_page(page); return page; } fio.blk_addr = dn.data_blkaddr; - err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, &fio); + fio.page = page; + err = f2fs_submit_page_bio(&fio); if (err) return ERR_PTR(err); + return page; +} + +struct page *find_data_page(struct inode *inode, pgoff_t index) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page; + + page = find_get_page(mapping, index); + if (page && PageUptodate(page)) + return page; + f2fs_put_page(page, 0); + page = get_read_data_page(inode, index, READ_SYNC); + if (IS_ERR(page)) + return page; + + if (PageUptodate(page)) + return page; + + wait_on_page_locked(page); + if (unlikely(!PageUptodate(page))) { + f2fs_put_page(page, 0); + return ERR_PTR(-EIO); + } + return page; +} + +/* + * If it tries to access a hole, return an error. + * Because, the callers, functions in dir.c and GC, should be able to know + * whether this page exists or not. + */ +struct page *get_lock_data_page(struct inode *inode, pgoff_t index) +{ + struct address_space *mapping = inode->i_mapping; + struct page *page; +repeat: + page = get_read_data_page(inode, index, READ_SYNC); + if (IS_ERR(page)) + return page; + + /* wait for read completion */ lock_page(page); if (unlikely(!PageUptodate(page))) { f2fs_put_page(page, 1); @@ -1060,46 +1039,37 @@ struct page *get_new_data_page(struct inode *inode, struct page *page; struct dnode_of_data dn; int err; +repeat: + page = grab_cache_page(mapping, index); + if (!page) + return ERR_PTR(-ENOMEM); set_new_dnode(&dn, inode, ipage, NULL, 0); err = f2fs_reserve_block(&dn, index); - if (err) + if (err) { + f2fs_put_page(page, 1); return ERR_PTR(err); -repeat: - page = grab_cache_page(mapping, index); - if (!page) { - err = -ENOMEM; - goto put_err; } + if (!ipage) + f2fs_put_dnode(&dn); if (PageUptodate(page)) - return page; + goto got_it; if (dn.data_blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_CACHE_SIZE); SetPageUptodate(page); } else { - struct f2fs_io_info fio = { - .type = DATA, - .rw = READ_SYNC, - .blk_addr = dn.data_blkaddr, - }; - err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, &fio); - if (err) - goto put_err; + f2fs_put_page(page, 1); - lock_page(page); - if (unlikely(!PageUptodate(page))) { - f2fs_put_page(page, 1); - err = -EIO; - goto put_err; - } - if (unlikely(page->mapping != mapping)) { - f2fs_put_page(page, 1); + page = get_read_data_page(inode, index, READ_SYNC); + if (IS_ERR(page)) goto repeat; - } - } + /* wait for read completion */ + lock_page(page); + } +got_it: if (new_i_size && i_size_read(inode) < ((index + 1) << PAGE_CACHE_SHIFT)) { i_size_write(inode, ((index + 1) << PAGE_CACHE_SHIFT)); @@ -1107,10 +1077,6 @@ repeat: set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR); } return page; - -put_err: - f2fs_put_dnode(&dn); - return ERR_PTR(err); } static int __allocate_data_block(struct dnode_of_data *dn) @@ -1208,18 +1174,18 @@ out: } /* - * get_data_block() now supported readahead/bmap/rw direct_IO with mapped bh. + * f2fs_map_blocks() now supported readahead/bmap/rw direct_IO with + * f2fs_map_blocks structure. * If original data blocks are allocated, then give them to blockdev. * Otherwise, * a. preallocate requested block addresses * b. do not use extent cache for better performance * c. give the block addresses to blockdev */ -static int __get_data_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create, bool fiemap) +static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, + int create, bool fiemap) { - unsigned int blkbits = inode->i_sb->s_blocksize_bits; - unsigned maxblocks = bh_result->b_size >> blkbits; + unsigned int maxblocks = map->m_len; struct dnode_of_data dn; int mode = create ? ALLOC_NODE : LOOKUP_NODE_RA; pgoff_t pgofs, end_offset; @@ -1227,11 +1193,16 @@ static int __get_data_block(struct inode *inode, sector_t iblock, struct extent_info ei; bool allocated = false; - /* Get the page offset from the block offset(iblock) */ - pgofs = (pgoff_t)(iblock >> (PAGE_CACHE_SHIFT - blkbits)); + map->m_len = 0; + map->m_flags = 0; + + /* it only supports block size == page size */ + pgofs = (pgoff_t)map->m_lblk; if (f2fs_lookup_extent_cache(inode, pgofs, &ei)) { - f2fs_map_bh(inode->i_sb, pgofs, &ei, bh_result); + map->m_pblk = ei.blk + pgofs - ei.fofs; + map->m_len = min((pgoff_t)maxblocks, ei.fofs + ei.len - pgofs); + map->m_flags = F2FS_MAP_MAPPED; goto out; } @@ -1250,21 +1221,23 @@ static int __get_data_block(struct inode *inode, sector_t iblock, goto put_out; if (dn.data_blkaddr != NULL_ADDR) { - clear_buffer_new(bh_result); - map_bh(bh_result, inode->i_sb, dn.data_blkaddr); + map->m_flags = F2FS_MAP_MAPPED; + map->m_pblk = dn.data_blkaddr; + if (dn.data_blkaddr == NEW_ADDR) + map->m_flags |= F2FS_MAP_UNWRITTEN; } else if (create) { err = __allocate_data_block(&dn); if (err) goto put_out; allocated = true; - set_buffer_new(bh_result); - map_bh(bh_result, inode->i_sb, dn.data_blkaddr); + map->m_flags = F2FS_MAP_NEW | F2FS_MAP_MAPPED; + map->m_pblk = dn.data_blkaddr; } else { goto put_out; } end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); - bh_result->b_size = (((size_t)1) << blkbits); + map->m_len = 1; dn.ofs_in_node++; pgofs++; @@ -1288,22 +1261,25 @@ get_next: end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); } - if (maxblocks > (bh_result->b_size >> blkbits)) { + if (maxblocks > map->m_len) { block_t blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); if (blkaddr == NULL_ADDR && create) { err = __allocate_data_block(&dn); if (err) goto sync_out; allocated = true; - set_buffer_new(bh_result); + map->m_flags |= F2FS_MAP_NEW; blkaddr = dn.data_blkaddr; } /* Give more consecutive addresses for the readahead */ - if (blkaddr == (bh_result->b_blocknr + ofs)) { + if ((map->m_pblk != NEW_ADDR && + blkaddr == (map->m_pblk + ofs)) || + (map->m_pblk == NEW_ADDR && + blkaddr == NEW_ADDR)) { ofs++; dn.ofs_in_node++; pgofs++; - bh_result->b_size += (((size_t)1) << blkbits); + map->m_len++; goto get_next; } } @@ -1316,10 +1292,28 @@ unlock_out: if (create) f2fs_unlock_op(F2FS_I_SB(inode)); out: - trace_f2fs_get_data_block(inode, iblock, bh_result, err); + trace_f2fs_map_blocks(inode, map, err); return err; } +static int __get_data_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create, bool fiemap) +{ + struct f2fs_map_blocks map; + int ret; + + map.m_lblk = iblock; + map.m_len = bh->b_size >> inode->i_blkbits; + + ret = f2fs_map_blocks(inode, &map, create, fiemap); + if (!ret) { + map_bh(bh, inode->i_sb, map.m_pblk); + bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags; + bh->b_size = map.m_len << inode->i_blkbits; + } + return ret; +} + static int get_data_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { @@ -1332,11 +1326,268 @@ static int get_data_block_fiemap(struct inode *inode, sector_t iblock, return __get_data_block(inode, iblock, bh_result, create, true); } +static inline sector_t logical_to_blk(struct inode *inode, loff_t offset) +{ + return (offset >> inode->i_blkbits); +} + +static inline loff_t blk_to_logical(struct inode *inode, sector_t blk) +{ + return (blk << inode->i_blkbits); +} + int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { - return generic_block_fiemap(inode, fieinfo, - start, len, get_data_block_fiemap); + struct buffer_head map_bh; + sector_t start_blk, last_blk; + loff_t isize = i_size_read(inode); + u64 logical = 0, phys = 0, size = 0; + u32 flags = 0; + bool past_eof = false, whole_file = false; + int ret = 0; + + ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); + if (ret) + return ret; + + mutex_lock(&inode->i_mutex); + + if (len >= isize) { + whole_file = true; + len = isize; + } + + if (logical_to_blk(inode, len) == 0) + len = blk_to_logical(inode, 1); + + start_blk = logical_to_blk(inode, start); + last_blk = logical_to_blk(inode, start + len - 1); +next: + memset(&map_bh, 0, sizeof(struct buffer_head)); + map_bh.b_size = len; + + ret = get_data_block_fiemap(inode, start_blk, &map_bh, 0); + if (ret) + goto out; + + /* HOLE */ + if (!buffer_mapped(&map_bh)) { + start_blk++; + + if (!past_eof && blk_to_logical(inode, start_blk) >= isize) + past_eof = 1; + + if (past_eof && size) { + flags |= FIEMAP_EXTENT_LAST; + ret = fiemap_fill_next_extent(fieinfo, logical, + phys, size, flags); + } else if (size) { + ret = fiemap_fill_next_extent(fieinfo, logical, + phys, size, flags); + size = 0; + } + + /* if we have holes up to/past EOF then we're done */ + if (start_blk > last_blk || past_eof || ret) + goto out; + } else { + if (start_blk > last_blk && !whole_file) { + ret = fiemap_fill_next_extent(fieinfo, logical, + phys, size, flags); + goto out; + } + + /* + * if size != 0 then we know we already have an extent + * to add, so add it. + */ + if (size) { + ret = fiemap_fill_next_extent(fieinfo, logical, + phys, size, flags); + if (ret) + goto out; + } + + logical = blk_to_logical(inode, start_blk); + phys = blk_to_logical(inode, map_bh.b_blocknr); + size = map_bh.b_size; + flags = 0; + if (buffer_unwritten(&map_bh)) + flags = FIEMAP_EXTENT_UNWRITTEN; + + start_blk += logical_to_blk(inode, size); + + /* + * If we are past the EOF, then we need to make sure as + * soon as we find a hole that the last extent we found + * is marked with FIEMAP_EXTENT_LAST + */ + if (!past_eof && logical + size >= isize) + past_eof = true; + } + cond_resched(); + if (fatal_signal_pending(current)) + ret = -EINTR; + else + goto next; +out: + if (ret == 1) + ret = 0; + + mutex_unlock(&inode->i_mutex); + return ret; +} + +/* + * This function was originally taken from fs/mpage.c, and customized for f2fs. + * Major change was from block_size == page_size in f2fs by default. + */ +static int f2fs_mpage_readpages(struct address_space *mapping, + struct list_head *pages, struct page *page, + unsigned nr_pages) +{ + struct bio *bio = NULL; + unsigned page_idx; + sector_t last_block_in_bio = 0; + struct inode *inode = mapping->host; + const unsigned blkbits = inode->i_blkbits; + const unsigned blocksize = 1 << blkbits; + sector_t block_in_file; + sector_t last_block; + sector_t last_block_in_file; + sector_t block_nr; + struct block_device *bdev = inode->i_sb->s_bdev; + struct f2fs_map_blocks map; + + map.m_pblk = 0; + map.m_lblk = 0; + map.m_len = 0; + map.m_flags = 0; + + for (page_idx = 0; nr_pages; page_idx++, nr_pages--) { + + prefetchw(&page->flags); + if (pages) { + page = list_entry(pages->prev, struct page, lru); + list_del(&page->lru); + if (add_to_page_cache_lru(page, mapping, + page->index, GFP_KERNEL)) + goto next_page; + } + + block_in_file = (sector_t)page->index; + last_block = block_in_file + nr_pages; + last_block_in_file = (i_size_read(inode) + blocksize - 1) >> + blkbits; + if (last_block > last_block_in_file) + last_block = last_block_in_file; + + /* + * Map blocks using the previous result first. + */ + if ((map.m_flags & F2FS_MAP_MAPPED) && + block_in_file > map.m_lblk && + block_in_file < (map.m_lblk + map.m_len)) + goto got_it; + + /* + * Then do more f2fs_map_blocks() calls until we are + * done with this page. + */ + map.m_flags = 0; + + if (block_in_file < last_block) { + map.m_lblk = block_in_file; + map.m_len = last_block - block_in_file; + + if (f2fs_map_blocks(inode, &map, 0, false)) + goto set_error_page; + } +got_it: + if ((map.m_flags & F2FS_MAP_MAPPED)) { + block_nr = map.m_pblk + block_in_file - map.m_lblk; + SetPageMappedToDisk(page); + + if (!PageUptodate(page) && !cleancache_get_page(page)) { + SetPageUptodate(page); + goto confused; + } + } else { + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + SetPageUptodate(page); + unlock_page(page); + goto next_page; + } + + /* + * This page will go to BIO. Do we need to send this + * BIO off first? + */ + if (bio && (last_block_in_bio != block_nr - 1)) { +submit_and_realloc: + submit_bio(READ, bio); + bio = NULL; + } + if (bio == NULL) { + struct f2fs_crypto_ctx *ctx = NULL; + + if (f2fs_encrypted_inode(inode) && + S_ISREG(inode->i_mode)) { + struct page *cpage; + + ctx = f2fs_get_crypto_ctx(inode); + if (IS_ERR(ctx)) + goto set_error_page; + + /* wait the page to be moved by cleaning */ + cpage = find_lock_page( + META_MAPPING(F2FS_I_SB(inode)), + block_nr); + if (cpage) { + f2fs_wait_on_page_writeback(cpage, + DATA); + f2fs_put_page(cpage, 1); + } + } + + bio = bio_alloc(GFP_KERNEL, + min_t(int, nr_pages, bio_get_nr_vecs(bdev))); + if (!bio) { + if (ctx) + f2fs_release_crypto_ctx(ctx); + goto set_error_page; + } + bio->bi_bdev = bdev; + bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(block_nr); + bio->bi_end_io = f2fs_read_end_io; + bio->bi_private = ctx; + } + + if (bio_add_page(bio, page, blocksize, 0) < blocksize) + goto submit_and_realloc; + + last_block_in_bio = block_nr; + goto next_page; +set_error_page: + SetPageError(page); + zero_user_segment(page, 0, PAGE_CACHE_SIZE); + unlock_page(page); + goto next_page; +confused: + if (bio) { + submit_bio(READ, bio); + bio = NULL; + } + unlock_page(page); +next_page: + if (pages) + page_cache_release(page); + } + BUG_ON(pages && !list_empty(pages)); + if (bio) + submit_bio(READ, bio); + return 0; } static int f2fs_read_data_page(struct file *file, struct page *page) @@ -1350,8 +1601,7 @@ static int f2fs_read_data_page(struct file *file, struct page *page) if (f2fs_has_inline_data(inode)) ret = f2fs_read_inline_data(inode, page); if (ret == -EAGAIN) - ret = mpage_readpage(page, get_data_block); - + ret = f2fs_mpage_readpages(page->mapping, NULL, page, 1); return ret; } @@ -1365,11 +1615,12 @@ static int f2fs_read_data_pages(struct file *file, if (f2fs_has_inline_data(inode)) return 0; - return mpage_readpages(mapping, pages, nr_pages, get_data_block); + return f2fs_mpage_readpages(mapping, pages, NULL, nr_pages); } -int do_write_data_page(struct page *page, struct f2fs_io_info *fio) +int do_write_data_page(struct f2fs_io_info *fio) { + struct page *page = fio->page; struct inode *inode = page->mapping->host; struct dnode_of_data dn; int err = 0; @@ -1387,6 +1638,14 @@ int do_write_data_page(struct page *page, struct f2fs_io_info *fio) goto out_writepage; } + if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { + fio->encrypted_page = f2fs_encrypt(inode, fio->page); + if (IS_ERR(fio->encrypted_page)) { + err = PTR_ERR(fio->encrypted_page); + goto out_writepage; + } + } + set_page_writeback(page); /* @@ -1396,11 +1655,11 @@ int do_write_data_page(struct page *page, struct f2fs_io_info *fio) if (unlikely(fio->blk_addr != NEW_ADDR && !is_cold_data(page) && need_inplace_update(inode))) { - rewrite_data_page(page, fio); + rewrite_data_page(fio); set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE); trace_f2fs_do_write_data_page(page, IPU); } else { - write_data_page(page, &dn, fio); + write_data_page(&dn, fio); set_data_blkaddr(&dn); f2fs_update_extent_cache(&dn); trace_f2fs_do_write_data_page(page, OPU); @@ -1425,8 +1684,11 @@ static int f2fs_write_data_page(struct page *page, bool need_balance_fs = false; int err = 0; struct f2fs_io_info fio = { + .sbi = sbi, .type = DATA, .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE, + .page = page, + .encrypted_page = NULL, }; trace_f2fs_writepage(page, DATA); @@ -1456,7 +1718,7 @@ write: if (S_ISDIR(inode->i_mode)) { if (unlikely(f2fs_cp_error(sbi))) goto redirty_out; - err = do_write_data_page(page, &fio); + err = do_write_data_page(&fio); goto done; } @@ -1476,7 +1738,7 @@ write: if (f2fs_has_inline_data(inode)) err = f2fs_write_inline_data(inode, page); if (err == -EAGAIN) - err = do_write_data_page(page, &fio); + err = do_write_data_page(&fio); f2fs_unlock_op(sbi); done: if (err && err != -ENOENT) @@ -1645,11 +1907,14 @@ put_next: zero_user_segment(page, 0, PAGE_CACHE_SIZE); } else { struct f2fs_io_info fio = { + .sbi = sbi, .type = DATA, .rw = READ_SYNC, .blk_addr = dn.data_blkaddr, + .page = page, + .encrypted_page = NULL, }; - err = f2fs_submit_page_bio(sbi, page, &fio); + err = f2fs_submit_page_bio(&fio); if (err) goto fail; @@ -1663,6 +1928,15 @@ put_next: f2fs_put_page(page, 1); goto repeat; } + + /* avoid symlink page */ + if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { + err = f2fs_decrypt_one(inode, page); + if (err) { + f2fs_put_page(page, 1); + goto fail; + } + } } out: SetPageUptodate(page); @@ -1733,6 +2007,9 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, return err; } + if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) + return 0; + if (check_direct_IO(inode, iter, offset)) return 0; @@ -1795,8 +2072,6 @@ static int f2fs_set_data_page_dirty(struct page *page) return 1; } - mark_inode_dirty(inode); - if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); update_dirty_page(inode, page); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index f5388f372..75176e0dd 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -94,7 +94,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) static void update_sit_info(struct f2fs_sb_info *sbi) { struct f2fs_stat_info *si = F2FS_STAT(sbi); - unsigned int blks_per_sec, hblks_per_sec, total_vblocks, bimodal, dist; + unsigned long long blks_per_sec, hblks_per_sec, total_vblocks; + unsigned long long bimodal, dist; unsigned int segno, vblocks; int ndirty = 0; @@ -112,10 +113,10 @@ static void update_sit_info(struct f2fs_sb_info *sbi) ndirty++; } } - dist = MAIN_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100; - si->bimodal = bimodal / dist; + dist = div_u64(MAIN_SECS(sbi) * hblks_per_sec * hblks_per_sec, 100); + si->bimodal = div_u64(bimodal, dist); if (si->dirty_count) - si->avg_vblocks = total_vblocks / ndirty; + si->avg_vblocks = div_u64(total_vblocks, ndirty); else si->avg_vblocks = 0; } @@ -143,7 +144,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->base_mem += sizeof(struct sit_info); si->base_mem += MAIN_SEGS(sbi) * sizeof(struct seg_entry); si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi)); - si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); + si->base_mem += 3 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); si->base_mem += SIT_VBLOCK_MAP_SIZE; if (sbi->segs_per_sec > 1) si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 3a3302ab7..a34ebd831 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -76,20 +76,10 @@ static unsigned long dir_block_index(unsigned int level, return bidx; } -static bool early_match_name(size_t namelen, f2fs_hash_t namehash, - struct f2fs_dir_entry *de) -{ - if (le16_to_cpu(de->name_len) != namelen) - return false; - - if (de->hash_code != namehash) - return false; - - return true; -} - static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, - struct qstr *name, int *max_slots, + struct f2fs_filename *fname, + f2fs_hash_t namehash, + int *max_slots, struct page **res_page) { struct f2fs_dentry_block *dentry_blk; @@ -98,9 +88,8 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, dentry_blk = (struct f2fs_dentry_block *)kmap(dentry_page); - make_dentry_ptr(&d, (void *)dentry_blk, 1); - de = find_target_dentry(name, max_slots, &d); - + make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); + de = find_target_dentry(fname, namehash, max_slots, &d); if (de) *res_page = dentry_page; else @@ -114,13 +103,15 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, return de; } -struct f2fs_dir_entry *find_target_dentry(struct qstr *name, int *max_slots, - struct f2fs_dentry_ptr *d) +struct f2fs_dir_entry *find_target_dentry(struct f2fs_filename *fname, + f2fs_hash_t namehash, int *max_slots, + struct f2fs_dentry_ptr *d) { struct f2fs_dir_entry *de; unsigned long bit_pos = 0; - f2fs_hash_t namehash = f2fs_dentry_hash(name); int max_len = 0; + struct f2fs_str de_name = FSTR_INIT(NULL, 0); + struct f2fs_str *name = &fname->disk_name; if (max_slots) *max_slots = 0; @@ -132,8 +123,18 @@ struct f2fs_dir_entry *find_target_dentry(struct qstr *name, int *max_slots, } de = &d->dentry[bit_pos]; - if (early_match_name(name->len, namehash, de) && - !memcmp(d->filename[bit_pos], name->name, name->len)) + + /* encrypted case */ + de_name.name = d->filename[bit_pos]; + de_name.len = le16_to_cpu(de->name_len); + + /* show encrypted name */ + if (fname->hash) { + if (de->hash_code == fname->hash) + goto found; + } else if (de_name.len == name->len && + de->hash_code == namehash && + !memcmp(de_name.name, name->name, name->len)) goto found; if (max_slots && max_len > *max_slots) @@ -155,16 +156,21 @@ found: } static struct f2fs_dir_entry *find_in_level(struct inode *dir, - unsigned int level, struct qstr *name, - f2fs_hash_t namehash, struct page **res_page) + unsigned int level, + struct f2fs_filename *fname, + struct page **res_page) { - int s = GET_DENTRY_SLOTS(name->len); + struct qstr name = FSTR_TO_QSTR(&fname->disk_name); + int s = GET_DENTRY_SLOTS(name.len); unsigned int nbucket, nblock; unsigned int bidx, end_block; struct page *dentry_page; struct f2fs_dir_entry *de = NULL; bool room = false; int max_slots; + f2fs_hash_t namehash; + + namehash = f2fs_dentry_hash(&name); f2fs_bug_on(F2FS_I_SB(dir), level > MAX_DIR_HASH_DEPTH); @@ -177,13 +183,14 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, for (; bidx < end_block; bidx++) { /* no need to allocate new dentry pages to all the indices */ - dentry_page = find_data_page(dir, bidx, true); + dentry_page = find_data_page(dir, bidx); if (IS_ERR(dentry_page)) { room = true; continue; } - de = find_in_block(dentry_page, name, &max_slots, res_page); + de = find_in_block(dentry_page, fname, namehash, &max_slots, + res_page); if (de) break; @@ -211,30 +218,34 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, { unsigned long npages = dir_blocks(dir); struct f2fs_dir_entry *de = NULL; - f2fs_hash_t name_hash; unsigned int max_depth; unsigned int level; + struct f2fs_filename fname; + int err; *res_page = NULL; - if (f2fs_has_inline_dentry(dir)) - return find_in_inline_dir(dir, child, res_page); + err = f2fs_fname_setup_filename(dir, child, 1, &fname); + if (err) + return NULL; + + if (f2fs_has_inline_dentry(dir)) { + de = find_in_inline_dir(dir, &fname, res_page); + goto out; + } if (npages == 0) - return NULL; + goto out; - name_hash = f2fs_dentry_hash(child); max_depth = F2FS_I(dir)->i_current_depth; for (level = 0; level < max_depth; level++) { - de = find_in_level(dir, level, child, name_hash, res_page); + de = find_in_level(dir, level, &fname, res_page); if (de) break; } - if (!de && F2FS_I(dir)->chash != name_hash) { - F2FS_I(dir)->chash = name_hash; - F2FS_I(dir)->clevel = level - 1; - } +out: + f2fs_fname_free_filename(&fname); return de; } @@ -303,10 +314,14 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage) set_page_dirty(ipage); } -int update_dent_inode(struct inode *inode, const struct qstr *name) +int update_dent_inode(struct inode *inode, struct inode *to, + const struct qstr *name) { struct page *page; + if (file_enc_name(to)) + return 0; + page = get_node_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(page)) return PTR_ERR(page); @@ -356,7 +371,7 @@ static int make_empty_dir(struct inode *inode, dentry_blk = kmap_atomic(dentry_page); - make_dentry_ptr(&d, (void *)dentry_blk, 1); + make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); do_make_empty_dir(inode, parent, &d); kunmap_atomic(dentry_blk); @@ -390,6 +405,12 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, err = f2fs_init_security(inode, dir, name, page); if (err) goto put_error; + + if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) { + err = f2fs_inherit_context(dir, inode, page); + if (err) + goto put_error; + } } else { page = get_node_page(F2FS_I_SB(dir), inode->i_ino); if (IS_ERR(page)) @@ -501,24 +522,33 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, unsigned long bidx, block; f2fs_hash_t dentry_hash; unsigned int nbucket, nblock; - size_t namelen = name->len; struct page *dentry_page = NULL; struct f2fs_dentry_block *dentry_blk = NULL; struct f2fs_dentry_ptr d; - int slots = GET_DENTRY_SLOTS(namelen); struct page *page = NULL; - int err = 0; + struct f2fs_filename fname; + struct qstr new_name; + int slots, err; + + err = f2fs_fname_setup_filename(dir, name, 0, &fname); + if (err) + return err; + + new_name.name = fname_name(&fname); + new_name.len = fname_len(&fname); if (f2fs_has_inline_dentry(dir)) { - err = f2fs_add_inline_entry(dir, name, inode, ino, mode); + err = f2fs_add_inline_entry(dir, &new_name, inode, ino, mode); if (!err || err != -EAGAIN) - return err; + goto out; else err = 0; } - dentry_hash = f2fs_dentry_hash(name); level = 0; + slots = GET_DENTRY_SLOTS(new_name.len); + dentry_hash = f2fs_dentry_hash(&new_name); + current_depth = F2FS_I(dir)->i_current_depth; if (F2FS_I(dir)->chash == dentry_hash) { level = F2FS_I(dir)->clevel; @@ -526,8 +556,10 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, } start: - if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) - return -ENOSPC; + if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) { + err = -ENOSPC; + goto out; + } /* Increase the depth, if required */ if (level == current_depth) @@ -541,8 +573,10 @@ start: for (block = bidx; block <= (bidx + nblock - 1); block++) { dentry_page = get_new_data_page(dir, NULL, block, true); - if (IS_ERR(dentry_page)) - return PTR_ERR(dentry_page); + if (IS_ERR(dentry_page)) { + err = PTR_ERR(dentry_page); + goto out; + } dentry_blk = kmap(dentry_page); bit_pos = room_for_filename(&dentry_blk->dentry_bitmap, @@ -562,15 +596,17 @@ add_dentry: if (inode) { down_write(&F2FS_I(inode)->i_sem); - page = init_inode_metadata(inode, dir, name, NULL); + page = init_inode_metadata(inode, dir, &new_name, NULL); if (IS_ERR(page)) { err = PTR_ERR(page); goto fail; } + if (f2fs_encrypted_inode(dir)) + file_set_enc_name(inode); } - make_dentry_ptr(&d, (void *)dentry_blk, 1); - f2fs_update_dentry(ino, mode, &d, name, dentry_hash, bit_pos); + make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); + f2fs_update_dentry(ino, mode, &d, &new_name, dentry_hash, bit_pos); set_page_dirty(dentry_page); @@ -592,6 +628,8 @@ fail: } kunmap(dentry_page); f2fs_put_page(dentry_page, 1); +out: + f2fs_fname_free_filename(&fname); return err; } @@ -729,11 +767,12 @@ bool f2fs_empty_dir(struct inode *dir) } bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, - unsigned int start_pos) + unsigned int start_pos, struct f2fs_str *fstr) { unsigned char d_type = DT_UNKNOWN; unsigned int bit_pos; struct f2fs_dir_entry *de = NULL; + struct f2fs_str de_name = FSTR_INIT(NULL, 0); bit_pos = ((unsigned long)ctx->pos % d->max); @@ -747,8 +786,24 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, d_type = f2fs_filetype_table[de->file_type]; else d_type = DT_UNKNOWN; - if (!dir_emit(ctx, d->filename[bit_pos], - le16_to_cpu(de->name_len), + + /* encrypted case */ + de_name.name = d->filename[bit_pos]; + de_name.len = le16_to_cpu(de->name_len); + + if (f2fs_encrypted_inode(d->inode)) { + int save_len = fstr->len; + int ret; + + ret = f2fs_fname_disk_to_usr(d->inode, &de->hash_code, + &de_name, fstr); + de_name = *fstr; + fstr->len = save_len; + if (ret < 0) + return true; + } + + if (!dir_emit(ctx, de_name.name, de_name.len, le32_to_cpu(de->ino), d_type)) return true; @@ -767,9 +822,24 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) struct file_ra_state *ra = &file->f_ra; unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK); struct f2fs_dentry_ptr d; + struct f2fs_str fstr = FSTR_INIT(NULL, 0); + int err = 0; - if (f2fs_has_inline_dentry(inode)) - return f2fs_read_inline_dir(file, ctx); + if (f2fs_encrypted_inode(inode)) { + err = f2fs_get_encryption_info(inode); + if (err) + return err; + + err = f2fs_fname_crypto_alloc_buffer(inode, F2FS_NAME_LEN, + &fstr); + if (err < 0) + return err; + } + + if (f2fs_has_inline_dentry(inode)) { + err = f2fs_read_inline_dir(file, ctx, &fstr); + goto out; + } /* readahead for multi pages of dir */ if (npages - n > 1 && !ra_has_index(ra, n)) @@ -783,9 +853,9 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) dentry_blk = kmap(dentry_page); - make_dentry_ptr(&d, (void *)dentry_blk, 1); + make_dentry_ptr(inode, &d, (void *)dentry_blk, 1); - if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK)) + if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr)) goto stop; ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK; @@ -798,8 +868,9 @@ stop: kunmap(dentry_page); f2fs_put_page(dentry_page, 1); } - - return 0; +out: + f2fs_fname_crypto_free_buffer(&fstr); + return err; } const struct file_operations f2fs_dir_operations = { @@ -808,4 +879,7 @@ const struct file_operations f2fs_dir_operations = { .iterate = f2fs_readdir, .fsync = f2fs_sync_file, .unlocked_ioctl = f2fs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = f2fs_compat_ioctl, +#endif }; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8de34ab6d..a8327ed73 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -70,6 +70,15 @@ struct f2fs_mount_info { unsigned int opt; }; +#define F2FS_FEATURE_ENCRYPT 0x0001 + +#define F2FS_HAS_FEATURE(sb, mask) \ + ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) +#define F2FS_SET_FEATURE(sb, mask) \ + F2FS_SB(sb)->raw_super->feature |= cpu_to_le32(mask) +#define F2FS_CLEAR_FEATURE(sb, mask) \ + F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask) + #define CRCPOLY_LE 0xedb88320 static inline __u32 f2fs_crc32(void *buf, size_t len) @@ -110,6 +119,8 @@ enum { #define DEF_BATCHED_TRIM_SECTIONS 32 #define BATCHED_TRIM_SEGMENTS(sbi) \ (SM_I(sbi)->trim_sections * (sbi)->segs_per_sec) +#define BATCHED_TRIM_BLOCKS(sbi) \ + (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) struct cp_control { int reason; @@ -218,6 +229,13 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, #define F2FS_IOC_RELEASE_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 4) #define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5) +#define F2FS_IOC_SET_ENCRYPTION_POLICY \ + _IOR('f', 19, struct f2fs_encryption_policy) +#define F2FS_IOC_GET_ENCRYPTION_PWSALT \ + _IOW('f', 20, __u8[16]) +#define F2FS_IOC_GET_ENCRYPTION_POLICY \ + _IOW('f', 21, struct f2fs_encryption_policy) + /* * should be same as XFS_IOC_GOINGDOWN. * Flags for going down operation used by FS_IOC_GOINGDOWN @@ -239,16 +257,38 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, * For INODE and NODE manager */ /* for directory operations */ +struct f2fs_str { + unsigned char *name; + u32 len; +}; + +struct f2fs_filename { + const struct qstr *usr_fname; + struct f2fs_str disk_name; + f2fs_hash_t hash; +#ifdef CONFIG_F2FS_FS_ENCRYPTION + struct f2fs_str crypto_buf; +#endif +}; + +#define FSTR_INIT(n, l) { .name = n, .len = l } +#define FSTR_TO_QSTR(f) QSTR_INIT((f)->name, (f)->len) +#define fname_name(p) ((p)->disk_name.name) +#define fname_len(p) ((p)->disk_name.len) + struct f2fs_dentry_ptr { + struct inode *inode; const void *bitmap; struct f2fs_dir_entry *dentry; __u8 (*filename)[F2FS_SLOT_LEN]; int max; }; -static inline void make_dentry_ptr(struct f2fs_dentry_ptr *d, - void *src, int type) +static inline void make_dentry_ptr(struct inode *inode, + struct f2fs_dentry_ptr *d, void *src, int type) { + d->inode = inode; + if (type == 1) { struct f2fs_dentry_block *t = (struct f2fs_dentry_block *)src; d->max = NR_DENTRY_IN_BLOCK; @@ -315,10 +355,51 @@ struct extent_tree { }; /* + * This structure is taken from ext4_map_blocks. + * + * Note that, however, f2fs uses NEW and MAPPED flags for f2fs_map_blocks(). + */ +#define F2FS_MAP_NEW (1 << BH_New) +#define F2FS_MAP_MAPPED (1 << BH_Mapped) +#define F2FS_MAP_UNWRITTEN (1 << BH_Unwritten) +#define F2FS_MAP_FLAGS (F2FS_MAP_NEW | F2FS_MAP_MAPPED |\ + F2FS_MAP_UNWRITTEN) + +struct f2fs_map_blocks { + block_t m_pblk; + block_t m_lblk; + unsigned int m_len; + unsigned int m_flags; +}; + +/* * i_advise uses FADVISE_XXX_BIT. We can add additional hints later. */ #define FADVISE_COLD_BIT 0x01 #define FADVISE_LOST_PINO_BIT 0x02 +#define FADVISE_ENCRYPT_BIT 0x04 +#define FADVISE_ENC_NAME_BIT 0x08 + +#define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT) +#define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) +#define file_set_cold(inode) set_file(inode, FADVISE_COLD_BIT) +#define file_lost_pino(inode) set_file(inode, FADVISE_LOST_PINO_BIT) +#define file_clear_cold(inode) clear_file(inode, FADVISE_COLD_BIT) +#define file_got_pino(inode) clear_file(inode, FADVISE_LOST_PINO_BIT) +#define file_is_encrypt(inode) is_file(inode, FADVISE_ENCRYPT_BIT) +#define file_set_encrypt(inode) set_file(inode, FADVISE_ENCRYPT_BIT) +#define file_clear_encrypt(inode) clear_file(inode, FADVISE_ENCRYPT_BIT) +#define file_enc_name(inode) is_file(inode, FADVISE_ENC_NAME_BIT) +#define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT) + +/* Encryption algorithms */ +#define F2FS_ENCRYPTION_MODE_INVALID 0 +#define F2FS_ENCRYPTION_MODE_AES_256_XTS 1 +#define F2FS_ENCRYPTION_MODE_AES_256_GCM 2 +#define F2FS_ENCRYPTION_MODE_AES_256_CBC 3 +#define F2FS_ENCRYPTION_MODE_AES_256_CTS 4 + +#include "f2fs_crypto.h" #define DEF_DIR_LEVEL 0 @@ -346,6 +427,11 @@ struct f2fs_inode_info { struct radix_tree_root inmem_root; /* radix tree for inmem pages */ struct list_head inmem_pages; /* inmemory pages managed by f2fs */ struct mutex inmem_lock; /* lock for inmemory pages */ + +#ifdef CONFIG_F2FS_FS_ENCRYPTION + /* Encryption params */ + struct f2fs_crypt_info *i_crypt_info; +#endif }; static inline void get_extent_info(struct extent_info *ext, @@ -571,9 +657,12 @@ enum page_type { }; struct f2fs_io_info { + struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ int rw; /* contains R/RS/W/WS with REQ_META/REQ_PRIO */ block_t blk_addr; /* block address to be written */ + struct page *page; /* page to be written */ + struct page *encrypted_page; /* encrypted page */ }; #define is_read_io(rw) (((rw) & 1) == READ) @@ -666,6 +755,7 @@ struct f2fs_sb_info { block_t user_block_count; /* # of user blocks */ block_t total_valid_block_count; /* # of valid blocks */ block_t alloc_valid_block_count; /* # of allocated blocks */ + block_t discard_blks; /* discard command candidats */ block_t last_valid_block_count; /* for recovery */ u32 s_next_generation; /* for NFS support */ atomic_t nr_pages[NR_COUNT_TYPE]; /* # of pages, see count_type */ @@ -1193,6 +1283,24 @@ static inline int f2fs_test_bit(unsigned int nr, char *addr) return mask & *addr; } +static inline void f2fs_set_bit(unsigned int nr, char *addr) +{ + int mask; + + addr += (nr >> 3); + mask = 1 << (7 - (nr & 0x07)); + *addr |= mask; +} + +static inline void f2fs_clear_bit(unsigned int nr, char *addr) +{ + int mask; + + addr += (nr >> 3); + mask = 1 << (7 - (nr & 0x07)); + *addr &= ~mask; +} + static inline int f2fs_test_and_set_bit(unsigned int nr, char *addr) { int mask; @@ -1391,6 +1499,21 @@ static inline void f2fs_dentry_kunmap(struct inode *dir, struct page *page) kunmap(page); } +static inline int is_file(struct inode *inode, int type) +{ + return F2FS_I(inode)->i_advise & type; +} + +static inline void set_file(struct inode *inode, int type) +{ + F2FS_I(inode)->i_advise |= type; +} + +static inline void clear_file(struct inode *inode, int type) +{ + F2FS_I(inode)->i_advise &= ~type; +} + static inline int f2fs_readonly(struct super_block *sb) { return sb->s_flags & MS_RDONLY; @@ -1407,6 +1530,17 @@ static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi) sbi->sb->s_flags |= MS_RDONLY; } +static inline bool is_dot_dotdot(const struct qstr *str) +{ + if (str->len == 1 && str->name[0] == '.') + return true; + + if (str->len == 2 && str->name[0] == '.' && str->name[1] == '.') + return true; + + return false; +} + #define get_inode_mode(i) \ ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) @@ -1453,10 +1587,11 @@ struct dentry *f2fs_get_parent(struct dentry *child); */ extern unsigned char f2fs_filetype_table[F2FS_FT_MAX]; void set_de_type(struct f2fs_dir_entry *, umode_t); -struct f2fs_dir_entry *find_target_dentry(struct qstr *, int *, - struct f2fs_dentry_ptr *); + +struct f2fs_dir_entry *find_target_dentry(struct f2fs_filename *, + f2fs_hash_t, int *, struct f2fs_dentry_ptr *); bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, - unsigned int); + unsigned int, struct f2fs_str *); void do_make_empty_dir(struct inode *, struct inode *, struct f2fs_dentry_ptr *); struct page *init_inode_metadata(struct inode *, struct inode *, @@ -1470,7 +1605,7 @@ struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **); ino_t f2fs_inode_by_name(struct inode *, struct qstr *); void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, struct page *, struct inode *); -int update_dent_inode(struct inode *, const struct qstr *); +int update_dent_inode(struct inode *, struct inode *, const struct qstr *); void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *, const struct qstr *, f2fs_hash_t , unsigned int); int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *, nid_t, @@ -1478,7 +1613,6 @@ int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *, nid_t, void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *, struct inode *); int f2fs_do_tmpfile(struct inode *, struct inode *); -int f2fs_make_empty(struct inode *, struct inode *); bool f2fs_empty_dir(struct inode *); static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) @@ -1490,6 +1624,7 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) /* * super.c */ +int f2fs_commit_super(struct f2fs_sb_info *, bool); int f2fs_sync_fs(struct super_block *, int); extern __printf(3, 4) void f2fs_msg(struct super_block *, const char *, const char *, ...); @@ -1506,8 +1641,8 @@ struct dnode_of_data; struct node_info; bool available_free_memory(struct f2fs_sb_info *, int); +int need_dentry_mark(struct f2fs_sb_info *, nid_t); bool is_checkpointed_node(struct f2fs_sb_info *, nid_t); -bool has_fsynced_inode(struct f2fs_sb_info *, nid_t); bool need_inode_block_update(struct f2fs_sb_info *, nid_t); void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); @@ -1548,21 +1683,20 @@ int create_flush_cmd_control(struct f2fs_sb_info *); void destroy_flush_cmd_control(struct f2fs_sb_info *); void invalidate_blocks(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); -void clear_prefree_segments(struct f2fs_sb_info *); +void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *); void release_discard_addrs(struct f2fs_sb_info *); void discard_next_dnode(struct f2fs_sb_info *, block_t); int npages_for_summary_flush(struct f2fs_sb_info *, bool); void allocate_new_segments(struct f2fs_sb_info *); int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *); struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); +void update_meta_page(struct f2fs_sb_info *, void *, block_t); void write_meta_page(struct f2fs_sb_info *, struct page *); -void write_node_page(struct f2fs_sb_info *, struct page *, - unsigned int, struct f2fs_io_info *); -void write_data_page(struct page *, struct dnode_of_data *, - struct f2fs_io_info *); -void rewrite_data_page(struct page *, struct f2fs_io_info *); -void recover_data_page(struct f2fs_sb_info *, struct page *, - struct f2fs_summary *, block_t, block_t); +void write_node_page(unsigned int, struct f2fs_io_info *); +void write_data_page(struct dnode_of_data *, struct f2fs_io_info *); +void rewrite_data_page(struct f2fs_io_info *); +void f2fs_replace_block(struct f2fs_sb_info *, struct dnode_of_data *, + block_t, block_t, unsigned char, bool); void allocate_data_block(struct f2fs_sb_info *, struct page *, block_t, block_t *, struct f2fs_summary *, int); void f2fs_wait_on_page_writeback(struct page *, enum page_type); @@ -1581,6 +1715,7 @@ void destroy_segment_manager_caches(void); */ struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); +bool is_valid_blkaddr(struct f2fs_sb_info *, block_t, int); int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int); void ra_meta_pages_cond(struct f2fs_sb_info *, pgoff_t); long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); @@ -1607,10 +1742,8 @@ void destroy_checkpoint_caches(void); * data.c */ void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, int); -int f2fs_submit_page_bio(struct f2fs_sb_info *, struct page *, - struct f2fs_io_info *); -void f2fs_submit_page_mbio(struct f2fs_sb_info *, struct page *, - struct f2fs_io_info *); +int f2fs_submit_page_bio(struct f2fs_io_info *); +void f2fs_submit_page_mbio(struct f2fs_io_info *); void set_data_blkaddr(struct dnode_of_data *); int reserve_new_block(struct dnode_of_data *); int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); @@ -1619,10 +1752,11 @@ void f2fs_destroy_extent_tree(struct inode *); void f2fs_init_extent_cache(struct inode *, struct f2fs_extent *); void f2fs_update_extent_cache(struct dnode_of_data *); void f2fs_preserve_extent_tree(struct inode *); -struct page *find_data_page(struct inode *, pgoff_t, bool); +struct page *get_read_data_page(struct inode *, pgoff_t, int); +struct page *find_data_page(struct inode *, pgoff_t); struct page *get_lock_data_page(struct inode *, pgoff_t); struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); -int do_write_data_page(struct page *, struct f2fs_io_info *); +int do_write_data_page(struct f2fs_io_info *); int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64); void init_extent_cache_info(struct f2fs_sb_info *); int __init create_extent_cache(void); @@ -1787,13 +1921,15 @@ extern const struct address_space_operations f2fs_node_aops; extern const struct address_space_operations f2fs_meta_aops; extern const struct inode_operations f2fs_dir_inode_operations; extern const struct inode_operations f2fs_symlink_inode_operations; +extern const struct inode_operations f2fs_encrypted_symlink_inode_operations; extern const struct inode_operations f2fs_special_inode_operations; extern struct kmem_cache *inode_entry_slab; /* * inline.c */ -bool f2fs_may_inline(struct inode *); +bool f2fs_may_inline_data(struct inode *); +bool f2fs_may_inline_dentry(struct inode *); void read_inline_data(struct page *, struct page *); bool truncate_inline_inode(struct page *, u64); int f2fs_read_inline_data(struct inode *, struct page *); @@ -1801,8 +1937,8 @@ int f2fs_convert_inline_page(struct dnode_of_data *, struct page *); int f2fs_convert_inline_inode(struct inode *); int f2fs_write_inline_data(struct inode *, struct page *); bool recover_inline_data(struct inode *, struct page *); -struct f2fs_dir_entry *find_in_inline_dir(struct inode *, struct qstr *, - struct page **); +struct f2fs_dir_entry *find_in_inline_dir(struct inode *, + struct f2fs_filename *, struct page **); struct f2fs_dir_entry *f2fs_parent_inline_dir(struct inode *, struct page **); int make_empty_inline_dir(struct inode *inode, struct inode *, struct page *); int f2fs_add_inline_entry(struct inode *, const struct qstr *, struct inode *, @@ -1810,5 +1946,137 @@ int f2fs_add_inline_entry(struct inode *, const struct qstr *, struct inode *, void f2fs_delete_inline_entry(struct f2fs_dir_entry *, struct page *, struct inode *, struct inode *); bool f2fs_empty_inline_dir(struct inode *); -int f2fs_read_inline_dir(struct file *, struct dir_context *); +int f2fs_read_inline_dir(struct file *, struct dir_context *, + struct f2fs_str *); + +/* + * crypto support + */ +static inline int f2fs_encrypted_inode(struct inode *inode) +{ +#ifdef CONFIG_F2FS_FS_ENCRYPTION + return file_is_encrypt(inode); +#else + return 0; +#endif +} + +static inline void f2fs_set_encrypted_inode(struct inode *inode) +{ +#ifdef CONFIG_F2FS_FS_ENCRYPTION + file_set_encrypt(inode); +#endif +} + +static inline bool f2fs_bio_encrypted(struct bio *bio) +{ +#ifdef CONFIG_F2FS_FS_ENCRYPTION + return unlikely(bio->bi_private != NULL); +#else + return false; +#endif +} + +static inline int f2fs_sb_has_crypto(struct super_block *sb) +{ +#ifdef CONFIG_F2FS_FS_ENCRYPTION + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_ENCRYPT); +#else + return 0; +#endif +} + +static inline bool f2fs_may_encrypt(struct inode *inode) +{ +#ifdef CONFIG_F2FS_FS_ENCRYPTION + mode_t mode = inode->i_mode; + + return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)); +#else + return 0; +#endif +} + +/* crypto_policy.c */ +int f2fs_is_child_context_consistent_with_parent(struct inode *, + struct inode *); +int f2fs_inherit_context(struct inode *, struct inode *, struct page *); +int f2fs_process_policy(const struct f2fs_encryption_policy *, struct inode *); +int f2fs_get_policy(struct inode *, struct f2fs_encryption_policy *); + +/* crypt.c */ +extern struct kmem_cache *f2fs_crypt_info_cachep; +bool f2fs_valid_contents_enc_mode(uint32_t); +uint32_t f2fs_validate_encryption_key_size(uint32_t, uint32_t); +struct f2fs_crypto_ctx *f2fs_get_crypto_ctx(struct inode *); +void f2fs_release_crypto_ctx(struct f2fs_crypto_ctx *); +struct page *f2fs_encrypt(struct inode *, struct page *); +int f2fs_decrypt(struct f2fs_crypto_ctx *, struct page *); +int f2fs_decrypt_one(struct inode *, struct page *); +void f2fs_end_io_crypto_work(struct f2fs_crypto_ctx *, struct bio *); + +/* crypto_key.c */ +void f2fs_free_encryption_info(struct inode *, struct f2fs_crypt_info *); +int _f2fs_get_encryption_info(struct inode *inode); + +/* crypto_fname.c */ +bool f2fs_valid_filenames_enc_mode(uint32_t); +u32 f2fs_fname_crypto_round_up(u32, u32); +int f2fs_fname_crypto_alloc_buffer(struct inode *, u32, struct f2fs_str *); +int f2fs_fname_disk_to_usr(struct inode *, f2fs_hash_t *, + const struct f2fs_str *, struct f2fs_str *); +int f2fs_fname_usr_to_disk(struct inode *, const struct qstr *, + struct f2fs_str *); + +#ifdef CONFIG_F2FS_FS_ENCRYPTION +void f2fs_restore_and_release_control_page(struct page **); +void f2fs_restore_control_page(struct page *); + +int __init f2fs_init_crypto(void); +int f2fs_crypto_initialize(void); +void f2fs_exit_crypto(void); + +int f2fs_has_encryption_key(struct inode *); + +static inline int f2fs_get_encryption_info(struct inode *inode) +{ + struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; + + if (!ci || + (ci->ci_keyring_key && + (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | + (1 << KEY_FLAG_REVOKED) | + (1 << KEY_FLAG_DEAD))))) + return _f2fs_get_encryption_info(inode); + return 0; +} + +void f2fs_fname_crypto_free_buffer(struct f2fs_str *); +int f2fs_fname_setup_filename(struct inode *, const struct qstr *, + int lookup, struct f2fs_filename *); +void f2fs_fname_free_filename(struct f2fs_filename *); +#else +static inline void f2fs_restore_and_release_control_page(struct page **p) { } +static inline void f2fs_restore_control_page(struct page *p) { } + +static inline int __init f2fs_init_crypto(void) { return 0; } +static inline void f2fs_exit_crypto(void) { } + +static inline int f2fs_has_encryption_key(struct inode *i) { return 0; } +static inline int f2fs_get_encryption_info(struct inode *i) { return 0; } +static inline void f2fs_fname_crypto_free_buffer(struct f2fs_str *p) { } + +static inline int f2fs_fname_setup_filename(struct inode *dir, + const struct qstr *iname, + int lookup, struct f2fs_filename *fname) +{ + memset(fname, 0, sizeof(struct f2fs_filename)); + fname->usr_fname = iname; + fname->disk_name.name = (unsigned char *)iname->name; + fname->disk_name.len = iname->len; + return 0; +} + +static inline void f2fs_fname_free_filename(struct f2fs_filename *fname) { } +#endif #endif diff --git a/fs/f2fs/f2fs_crypto.h b/fs/f2fs/f2fs_crypto.h new file mode 100644 index 000000000..c2c1c2b63 --- /dev/null +++ b/fs/f2fs/f2fs_crypto.h @@ -0,0 +1,151 @@ +/* + * linux/fs/f2fs/f2fs_crypto.h + * + * Copied from linux/fs/ext4/ext4_crypto.h + * + * Copyright (C) 2015, Google, Inc. + * + * This contains encryption header content for f2fs + * + * Written by Michael Halcrow, 2015. + * Modified by Jaegeuk Kim, 2015. + */ +#ifndef _F2FS_CRYPTO_H +#define _F2FS_CRYPTO_H + +#include <linux/fs.h> + +#define F2FS_KEY_DESCRIPTOR_SIZE 8 + +/* Policy provided via an ioctl on the topmost directory */ +struct f2fs_encryption_policy { + char version; + char contents_encryption_mode; + char filenames_encryption_mode; + char flags; + char master_key_descriptor[F2FS_KEY_DESCRIPTOR_SIZE]; +} __attribute__((__packed__)); + +#define F2FS_ENCRYPTION_CONTEXT_FORMAT_V1 1 +#define F2FS_KEY_DERIVATION_NONCE_SIZE 16 + +#define F2FS_POLICY_FLAGS_PAD_4 0x00 +#define F2FS_POLICY_FLAGS_PAD_8 0x01 +#define F2FS_POLICY_FLAGS_PAD_16 0x02 +#define F2FS_POLICY_FLAGS_PAD_32 0x03 +#define F2FS_POLICY_FLAGS_PAD_MASK 0x03 +#define F2FS_POLICY_FLAGS_VALID 0x03 + +/** + * Encryption context for inode + * + * Protector format: + * 1 byte: Protector format (1 = this version) + * 1 byte: File contents encryption mode + * 1 byte: File names encryption mode + * 1 byte: Flags + * 8 bytes: Master Key descriptor + * 16 bytes: Encryption Key derivation nonce + */ +struct f2fs_encryption_context { + char format; + char contents_encryption_mode; + char filenames_encryption_mode; + char flags; + char master_key_descriptor[F2FS_KEY_DESCRIPTOR_SIZE]; + char nonce[F2FS_KEY_DERIVATION_NONCE_SIZE]; +} __attribute__((__packed__)); + +/* Encryption parameters */ +#define F2FS_XTS_TWEAK_SIZE 16 +#define F2FS_AES_128_ECB_KEY_SIZE 16 +#define F2FS_AES_256_GCM_KEY_SIZE 32 +#define F2FS_AES_256_CBC_KEY_SIZE 32 +#define F2FS_AES_256_CTS_KEY_SIZE 32 +#define F2FS_AES_256_XTS_KEY_SIZE 64 +#define F2FS_MAX_KEY_SIZE 64 + +#define F2FS_KEY_DESC_PREFIX "f2fs:" +#define F2FS_KEY_DESC_PREFIX_SIZE 5 + +struct f2fs_encryption_key { + __u32 mode; + char raw[F2FS_MAX_KEY_SIZE]; + __u32 size; +} __attribute__((__packed__)); + +struct f2fs_crypt_info { + char ci_data_mode; + char ci_filename_mode; + char ci_flags; + struct crypto_ablkcipher *ci_ctfm; + struct key *ci_keyring_key; + char ci_master_key[F2FS_KEY_DESCRIPTOR_SIZE]; +}; + +#define F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 +#define F2FS_WRITE_PATH_FL 0x00000002 + +struct f2fs_crypto_ctx { + union { + struct { + struct page *bounce_page; /* Ciphertext page */ + struct page *control_page; /* Original page */ + } w; + struct { + struct bio *bio; + struct work_struct work; + } r; + struct list_head free_list; /* Free list */ + }; + char flags; /* Flags */ +}; + +struct f2fs_completion_result { + struct completion completion; + int res; +}; + +#define DECLARE_F2FS_COMPLETION_RESULT(ecr) \ + struct f2fs_completion_result ecr = { \ + COMPLETION_INITIALIZER((ecr).completion), 0 } + +static inline int f2fs_encryption_key_size(int mode) +{ + switch (mode) { + case F2FS_ENCRYPTION_MODE_AES_256_XTS: + return F2FS_AES_256_XTS_KEY_SIZE; + case F2FS_ENCRYPTION_MODE_AES_256_GCM: + return F2FS_AES_256_GCM_KEY_SIZE; + case F2FS_ENCRYPTION_MODE_AES_256_CBC: + return F2FS_AES_256_CBC_KEY_SIZE; + case F2FS_ENCRYPTION_MODE_AES_256_CTS: + return F2FS_AES_256_CTS_KEY_SIZE; + default: + BUG(); + } + return 0; +} + +#define F2FS_FNAME_NUM_SCATTER_ENTRIES 4 +#define F2FS_CRYPTO_BLOCK_SIZE 16 +#define F2FS_FNAME_CRYPTO_DIGEST_SIZE 32 + +/** + * For encrypted symlinks, the ciphertext length is stored at the beginning + * of the string in little-endian format. + */ +struct f2fs_encrypted_symlink_data { + __le16 len; + char encrypted_path[1]; +} __attribute__((__packed__)); + +/** + * This function is used to calculate the disk space required to + * store a filename of length l in encrypted symlink format. + */ +static inline u32 encrypted_symlink_data_len(u32 l) +{ + return (l + sizeof(struct f2fs_encrypted_symlink_data) - 1); +} +#endif /* _F2FS_CRYPTO_H */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 2b52e48d7..b0f38c3b3 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -20,6 +20,7 @@ #include <linux/uaccess.h> #include <linux/mount.h> #include <linux/pagevec.h> +#include <linux/random.h> #include "f2fs.h" #include "node.h" @@ -105,7 +106,7 @@ static int get_parent_ino(struct inode *inode, nid_t *pino) if (!dentry) return 0; - if (update_dent_inode(inode, &dentry->d_name)) { + if (update_dent_inode(inode, inode, &dentry->d_name)) { dput(dentry); return 0; } @@ -122,6 +123,8 @@ static inline bool need_do_checkpoint(struct inode *inode) if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1) need_cp = true; + else if (file_enc_name(inode) && need_dentry_mark(sbi, inode->i_ino)) + need_cp = true; else if (file_wrong_pino(inode)) need_cp = true; else if (!space_for_roll_forward(sbi)) @@ -271,7 +274,7 @@ flush_out: ret = f2fs_issue_flush(sbi); out: trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); - f2fs_trace_ios(NULL, NULL, 1); + f2fs_trace_ios(NULL, 1); return ret; } @@ -407,6 +410,12 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); + if (f2fs_encrypted_inode(inode)) { + int err = f2fs_get_encryption_info(inode); + if (err) + return 0; + } + /* we don't need to use inline_data strictly */ if (f2fs_has_inline_data(inode)) { int err = f2fs_convert_inline_inode(inode); @@ -419,6 +428,18 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) return 0; } +static int f2fs_file_open(struct inode *inode, struct file *filp) +{ + int ret = generic_file_open(inode, filp); + + if (!ret && f2fs_encrypted_inode(inode)) { + ret = f2fs_get_encryption_info(inode); + if (ret) + ret = -EACCES; + } + return ret; +} + int truncate_data_blocks_range(struct dnode_of_data *dn, int count) { int nr_free = 0, ofs = dn->ofs_in_node; @@ -461,28 +482,32 @@ void truncate_data_blocks(struct dnode_of_data *dn) } static int truncate_partial_data_page(struct inode *inode, u64 from, - bool force) + bool cache_only) { unsigned offset = from & (PAGE_CACHE_SIZE - 1); + pgoff_t index = from >> PAGE_CACHE_SHIFT; + struct address_space *mapping = inode->i_mapping; struct page *page; - if (!offset && !force) + if (!offset && !cache_only) return 0; - page = find_data_page(inode, from >> PAGE_CACHE_SHIFT, force); - if (IS_ERR(page)) + if (cache_only) { + page = grab_cache_page(mapping, index); + if (page && PageUptodate(page)) + goto truncate_out; + f2fs_put_page(page, 1); return 0; + } - lock_page(page); - if (unlikely(!PageUptodate(page) || - page->mapping != inode->i_mapping)) - goto out; - + page = get_lock_data_page(inode, index); + if (IS_ERR(page)) + return 0; +truncate_out: f2fs_wait_on_page_writeback(page, DATA); zero_user(page, offset, PAGE_CACHE_SIZE - offset); - if (!force) + if (!cache_only || !f2fs_encrypted_inode(inode) || !S_ISREG(inode->i_mode)) set_page_dirty(page); -out: f2fs_put_page(page, 1); return 0; } @@ -560,7 +585,7 @@ void f2fs_truncate(struct inode *inode) trace_f2fs_truncate(inode); /* we should check inline_data size */ - if (f2fs_has_inline_data(inode) && !f2fs_may_inline(inode)) { + if (f2fs_has_inline_data(inode) && !f2fs_may_inline_data(inode)) { if (f2fs_convert_inline_inode(inode)) return; } @@ -622,16 +647,20 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) return err; if (attr->ia_valid & ATTR_SIZE) { - if (attr->ia_size != i_size_read(inode)) { + if (f2fs_encrypted_inode(inode) && + f2fs_get_encryption_info(inode)) + return -EACCES; + + if (attr->ia_size <= i_size_read(inode)) { truncate_setsize(inode, attr->ia_size); f2fs_truncate(inode); f2fs_balance_fs(F2FS_I_SB(inode)); } else { /* - * giving a chance to truncate blocks past EOF which - * are fallocated with FALLOC_FL_KEEP_SIZE. + * do not trim all blocks after i_size if target size is + * larger than i_size. */ - f2fs_truncate(inode); + truncate_setsize(inode, attr->ia_size); } } @@ -718,10 +747,6 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; - /* skip punching hole beyond i_size */ - if (offset >= inode->i_size) - return ret; - if (f2fs_has_inline_data(inode)) { ret = f2fs_convert_inline_inode(inode); if (ret) @@ -765,6 +790,320 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) return ret; } +static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct dnode_of_data dn; + pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; + int ret = 0; + + f2fs_lock_op(sbi); + + for (; end < nrpages; start++, end++) { + block_t new_addr, old_addr; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = get_dnode_of_data(&dn, end, LOOKUP_NODE_RA); + if (ret && ret != -ENOENT) { + goto out; + } else if (ret == -ENOENT) { + new_addr = NULL_ADDR; + } else { + new_addr = dn.data_blkaddr; + truncate_data_blocks_range(&dn, 1); + f2fs_put_dnode(&dn); + } + + if (new_addr == NULL_ADDR) { + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = get_dnode_of_data(&dn, start, LOOKUP_NODE_RA); + if (ret && ret != -ENOENT) + goto out; + else if (ret == -ENOENT) + continue; + + if (dn.data_blkaddr == NULL_ADDR) { + f2fs_put_dnode(&dn); + continue; + } else { + truncate_data_blocks_range(&dn, 1); + } + + f2fs_put_dnode(&dn); + } else { + struct page *ipage; + + ipage = get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) { + ret = PTR_ERR(ipage); + goto out; + } + + set_new_dnode(&dn, inode, ipage, NULL, 0); + ret = f2fs_reserve_block(&dn, start); + if (ret) + goto out; + + old_addr = dn.data_blkaddr; + if (old_addr != NEW_ADDR && new_addr == NEW_ADDR) { + dn.data_blkaddr = NULL_ADDR; + f2fs_update_extent_cache(&dn); + invalidate_blocks(sbi, old_addr); + + dn.data_blkaddr = new_addr; + set_data_blkaddr(&dn); + } else if (new_addr != NEW_ADDR) { + struct node_info ni; + + get_node_info(sbi, dn.nid, &ni); + f2fs_replace_block(sbi, &dn, old_addr, new_addr, + ni.version, true); + } + + f2fs_put_dnode(&dn); + } + } + ret = 0; +out: + f2fs_unlock_op(sbi); + return ret; +} + +static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) +{ + pgoff_t pg_start, pg_end; + loff_t new_size; + int ret; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + if (offset + len >= i_size_read(inode)) + return -EINVAL; + + /* collapse range should be aligned to block size of f2fs. */ + if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1)) + return -EINVAL; + + pg_start = offset >> PAGE_CACHE_SHIFT; + pg_end = (offset + len) >> PAGE_CACHE_SHIFT; + + /* write out all dirty pages from offset */ + ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); + if (ret) + return ret; + + truncate_pagecache(inode, offset); + + ret = f2fs_do_collapse(inode, pg_start, pg_end); + if (ret) + return ret; + + new_size = i_size_read(inode) - len; + + ret = truncate_blocks(inode, new_size, true); + if (!ret) + i_size_write(inode, new_size); + + return ret; +} + +static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, + int mode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct address_space *mapping = inode->i_mapping; + pgoff_t index, pg_start, pg_end; + loff_t new_size = i_size_read(inode); + loff_t off_start, off_end; + int ret = 0; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + ret = inode_newsize_ok(inode, (len + offset)); + if (ret) + return ret; + + f2fs_balance_fs(sbi); + + if (f2fs_has_inline_data(inode)) { + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + } + + ret = filemap_write_and_wait_range(mapping, offset, offset + len - 1); + if (ret) + return ret; + + truncate_pagecache_range(inode, offset, offset + len - 1); + + pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; + pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; + + off_start = offset & (PAGE_CACHE_SIZE - 1); + off_end = (offset + len) & (PAGE_CACHE_SIZE - 1); + + if (pg_start == pg_end) { + fill_zero(inode, pg_start, off_start, off_end - off_start); + if (offset + len > new_size) + new_size = offset + len; + new_size = max_t(loff_t, new_size, offset + len); + } else { + if (off_start) { + fill_zero(inode, pg_start++, off_start, + PAGE_CACHE_SIZE - off_start); + new_size = max_t(loff_t, new_size, + pg_start << PAGE_CACHE_SHIFT); + } + + for (index = pg_start; index < pg_end; index++) { + struct dnode_of_data dn; + struct page *ipage; + + f2fs_lock_op(sbi); + + ipage = get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) { + ret = PTR_ERR(ipage); + f2fs_unlock_op(sbi); + goto out; + } + + set_new_dnode(&dn, inode, ipage, NULL, 0); + ret = f2fs_reserve_block(&dn, index); + if (ret) { + f2fs_unlock_op(sbi); + goto out; + } + + if (dn.data_blkaddr != NEW_ADDR) { + invalidate_blocks(sbi, dn.data_blkaddr); + + dn.data_blkaddr = NEW_ADDR; + set_data_blkaddr(&dn); + + dn.data_blkaddr = NULL_ADDR; + f2fs_update_extent_cache(&dn); + } + f2fs_put_dnode(&dn); + f2fs_unlock_op(sbi); + + new_size = max_t(loff_t, new_size, + (index + 1) << PAGE_CACHE_SHIFT); + } + + if (off_end) { + fill_zero(inode, pg_end, 0, off_end); + new_size = max_t(loff_t, new_size, offset + len); + } + } + +out: + if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) { + i_size_write(inode, new_size); + mark_inode_dirty(inode); + update_inode_page(inode); + } + + return ret; +} + +static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + pgoff_t pg_start, pg_end, delta, nrpages, idx; + loff_t new_size; + int ret; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + new_size = i_size_read(inode) + len; + if (new_size > inode->i_sb->s_maxbytes) + return -EFBIG; + + if (offset >= i_size_read(inode)) + return -EINVAL; + + /* insert range should be aligned to block size of f2fs. */ + if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1)) + return -EINVAL; + + f2fs_balance_fs(sbi); + + ret = truncate_blocks(inode, i_size_read(inode), true); + if (ret) + return ret; + + /* write out all dirty pages from offset */ + ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); + if (ret) + return ret; + + truncate_pagecache(inode, offset); + + pg_start = offset >> PAGE_CACHE_SHIFT; + pg_end = (offset + len) >> PAGE_CACHE_SHIFT; + delta = pg_end - pg_start; + nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; + + for (idx = nrpages - 1; idx >= pg_start && idx != -1; idx--) { + struct dnode_of_data dn; + struct page *ipage; + block_t new_addr, old_addr; + + f2fs_lock_op(sbi); + + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = get_dnode_of_data(&dn, idx, LOOKUP_NODE_RA); + if (ret && ret != -ENOENT) { + goto out; + } else if (ret == -ENOENT) { + goto next; + } else if (dn.data_blkaddr == NULL_ADDR) { + f2fs_put_dnode(&dn); + goto next; + } else { + new_addr = dn.data_blkaddr; + truncate_data_blocks_range(&dn, 1); + f2fs_put_dnode(&dn); + } + + ipage = get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) { + ret = PTR_ERR(ipage); + goto out; + } + + set_new_dnode(&dn, inode, ipage, NULL, 0); + ret = f2fs_reserve_block(&dn, idx + delta); + if (ret) + goto out; + + old_addr = dn.data_blkaddr; + f2fs_bug_on(sbi, old_addr != NEW_ADDR); + + if (new_addr != NEW_ADDR) { + struct node_info ni; + + get_node_info(sbi, dn.nid, &ni); + f2fs_replace_block(sbi, &dn, old_addr, new_addr, + ni.version, true); + } + f2fs_put_dnode(&dn); +next: + f2fs_unlock_op(sbi); + } + + i_size_write(inode, new_size); + return 0; +out: + f2fs_unlock_op(sbi); + return ret; +} + static int expand_inode_data(struct inode *inode, loff_t offset, loff_t len, int mode) { @@ -830,23 +1169,40 @@ static long f2fs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); - long ret; + long ret = 0; + + if (f2fs_encrypted_inode(inode) && + (mode & (FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_INSERT_RANGE))) + return -EOPNOTSUPP; - if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) + if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | + FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE | + FALLOC_FL_INSERT_RANGE)) return -EOPNOTSUPP; mutex_lock(&inode->i_mutex); - if (mode & FALLOC_FL_PUNCH_HOLE) + if (mode & FALLOC_FL_PUNCH_HOLE) { + if (offset >= inode->i_size) + goto out; + ret = punch_hole(inode, offset, len); - else + } else if (mode & FALLOC_FL_COLLAPSE_RANGE) { + ret = f2fs_collapse_range(inode, offset, len); + } else if (mode & FALLOC_FL_ZERO_RANGE) { + ret = f2fs_zero_range(inode, offset, len, mode); + } else if (mode & FALLOC_FL_INSERT_RANGE) { + ret = f2fs_insert_range(inode, offset, len); + } else { ret = expand_inode_data(inode, offset, len, mode); + } if (!ret) { inode->i_mtime = inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); } +out: mutex_unlock(&inode->i_mutex); trace_f2fs_fallocate(inode, mode, offset, len, ret); @@ -975,12 +1331,13 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) if (ret) return ret; - if (f2fs_is_atomic_file(inode)) + if (f2fs_is_atomic_file(inode)) { + clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); commit_inmem_pages(inode, false); + } ret = f2fs_sync_file(filp, 0, LONG_MAX, 0); mnt_drop_write_file(filp); - clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); return ret; } @@ -1031,15 +1388,13 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) f2fs_balance_fs(F2FS_I_SB(inode)); if (f2fs_is_atomic_file(inode)) { - commit_inmem_pages(inode, false); clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); + commit_inmem_pages(inode, false); } - if (f2fs_is_volatile_file(inode)) { + if (f2fs_is_volatile_file(inode)) clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); - filemap_fdatawrite(inode->i_mapping); - set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); - } + mnt_drop_write_file(filp); return ret; } @@ -1109,6 +1464,86 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) return 0; } +static bool uuid_is_nonzero(__u8 u[16]) +{ + int i; + + for (i = 0; i < 16; i++) + if (u[i]) + return true; + return false; +} + +static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) +{ +#ifdef CONFIG_F2FS_FS_ENCRYPTION + struct f2fs_encryption_policy policy; + struct inode *inode = file_inode(filp); + + if (copy_from_user(&policy, (struct f2fs_encryption_policy __user *)arg, + sizeof(policy))) + return -EFAULT; + + return f2fs_process_policy(&policy, inode); +#else + return -EOPNOTSUPP; +#endif +} + +static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) +{ +#ifdef CONFIG_F2FS_FS_ENCRYPTION + struct f2fs_encryption_policy policy; + struct inode *inode = file_inode(filp); + int err; + + err = f2fs_get_policy(inode, &policy); + if (err) + return err; + + if (copy_to_user((struct f2fs_encryption_policy __user *)arg, &policy, + sizeof(policy))) + return -EFAULT; + return 0; +#else + return -EOPNOTSUPP; +#endif +} + +static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int err; + + if (!f2fs_sb_has_crypto(inode->i_sb)) + return -EOPNOTSUPP; + + if (uuid_is_nonzero(sbi->raw_super->encrypt_pw_salt)) + goto got_it; + + err = mnt_want_write_file(filp); + if (err) + return err; + + /* update superblock with uuid */ + generate_random_uuid(sbi->raw_super->encrypt_pw_salt); + + err = f2fs_commit_super(sbi, false); + + mnt_drop_write_file(filp); + if (err) { + /* undo new data */ + memset(sbi->raw_super->encrypt_pw_salt, 0, 16); + return err; + } +got_it: + if (copy_to_user((__u8 __user *)arg, sbi->raw_super->encrypt_pw_salt, + 16)) + return -EFAULT; + return 0; +} + long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch (cmd) { @@ -1132,11 +1567,29 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_shutdown(filp, arg); case FITRIM: return f2fs_ioc_fitrim(filp, arg); + case F2FS_IOC_SET_ENCRYPTION_POLICY: + return f2fs_ioc_set_encryption_policy(filp, arg); + case F2FS_IOC_GET_ENCRYPTION_POLICY: + return f2fs_ioc_get_encryption_policy(filp, arg); + case F2FS_IOC_GET_ENCRYPTION_PWSALT: + return f2fs_ioc_get_encryption_pwsalt(filp, arg); default: return -ENOTTY; } } +static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + + if (f2fs_encrypted_inode(inode) && + !f2fs_has_encryption_key(inode) && + f2fs_get_encryption_info(inode)) + return -EACCES; + + return generic_file_write_iter(iocb, from); +} + #ifdef CONFIG_COMPAT long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -1157,8 +1610,8 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) const struct file_operations f2fs_file_operations = { .llseek = f2fs_llseek, .read_iter = generic_file_read_iter, - .write_iter = generic_file_write_iter, - .open = generic_file_open, + .write_iter = f2fs_file_write_iter, + .open = f2fs_file_open, .release = f2fs_release_file, .mmap = f2fs_file_mmap, .fsync = f2fs_sync_file, diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index ed58211fe..22fb5ef37 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -518,12 +518,91 @@ static int check_dnode(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, return 1; } -static void move_data_page(struct inode *inode, struct page *page, int gc_type) +static void move_encrypted_block(struct inode *inode, block_t bidx) { struct f2fs_io_info fio = { + .sbi = F2FS_I_SB(inode), .type = DATA, - .rw = WRITE_SYNC, + .rw = READ_SYNC, + .encrypted_page = NULL, }; + struct dnode_of_data dn; + struct f2fs_summary sum; + struct node_info ni; + struct page *page; + int err; + + /* do not read out */ + page = grab_cache_page(inode->i_mapping, bidx); + if (!page) + return; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + err = get_dnode_of_data(&dn, bidx, LOOKUP_NODE); + if (err) + goto out; + + if (unlikely(dn.data_blkaddr == NULL_ADDR)) + goto put_out; + + get_node_info(fio.sbi, dn.nid, &ni); + set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); + + /* read page */ + fio.page = page; + fio.blk_addr = dn.data_blkaddr; + + fio.encrypted_page = grab_cache_page(META_MAPPING(fio.sbi), fio.blk_addr); + if (!fio.encrypted_page) + goto put_out; + + err = f2fs_submit_page_bio(&fio); + if (err) + goto put_page_out; + + /* write page */ + lock_page(fio.encrypted_page); + + if (unlikely(!PageUptodate(fio.encrypted_page))) + goto put_page_out; + if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi))) + goto put_page_out; + + set_page_dirty(fio.encrypted_page); + f2fs_wait_on_page_writeback(fio.encrypted_page, META); + if (clear_page_dirty_for_io(fio.encrypted_page)) + dec_page_count(fio.sbi, F2FS_DIRTY_META); + + set_page_writeback(fio.encrypted_page); + + /* allocate block address */ + f2fs_wait_on_page_writeback(dn.node_page, NODE); + allocate_data_block(fio.sbi, NULL, fio.blk_addr, + &fio.blk_addr, &sum, CURSEG_COLD_DATA); + fio.rw = WRITE_SYNC; + f2fs_submit_page_mbio(&fio); + + dn.data_blkaddr = fio.blk_addr; + set_data_blkaddr(&dn); + f2fs_update_extent_cache(&dn); + set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); + if (page->index == 0) + set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); +put_page_out: + f2fs_put_page(fio.encrypted_page, 1); +put_out: + f2fs_put_dnode(&dn); +out: + f2fs_put_page(page, 1); +} + +static void move_data_page(struct inode *inode, block_t bidx, int gc_type) +{ + struct page *page; + + page = get_lock_data_page(inode, bidx); + if (IS_ERR(page)) + return; if (gc_type == BG_GC) { if (PageWriteback(page)) @@ -531,12 +610,19 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type) set_page_dirty(page); set_cold_data(page); } else { + struct f2fs_io_info fio = { + .sbi = F2FS_I_SB(inode), + .type = DATA, + .rw = WRITE_SYNC, + .page = page, + .encrypted_page = NULL, + }; + set_page_dirty(page); f2fs_wait_on_page_writeback(page, DATA); - if (clear_page_dirty_for_io(page)) inode_dec_dirty_pages(inode); set_cold_data(page); - do_write_data_page(page, &fio); + do_write_data_page(&fio); clear_cold_data(page); } out: @@ -599,10 +685,16 @@ next_step: if (IS_ERR(inode) || is_bad_inode(inode)) continue; - start_bidx = start_bidx_of_node(nofs, F2FS_I(inode)); + /* if encrypted inode, let's go phase 3 */ + if (f2fs_encrypted_inode(inode) && + S_ISREG(inode->i_mode)) { + add_gc_inode(gc_list, inode); + continue; + } - data_page = find_data_page(inode, - start_bidx + ofs_in_node, false); + start_bidx = start_bidx_of_node(nofs, F2FS_I(inode)); + data_page = get_read_data_page(inode, + start_bidx + ofs_in_node, READA); if (IS_ERR(data_page)) { iput(inode); continue; @@ -616,12 +708,12 @@ next_step: /* phase 3 */ inode = find_gc_inode(gc_list, dni.ino); if (inode) { - start_bidx = start_bidx_of_node(nofs, F2FS_I(inode)); - data_page = get_lock_data_page(inode, - start_bidx + ofs_in_node); - if (IS_ERR(data_page)) - continue; - move_data_page(inode, data_page, gc_type); + start_bidx = start_bidx_of_node(nofs, F2FS_I(inode)) + + ofs_in_node; + if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) + move_encrypted_block(inode, start_bidx); + else + move_data_page(inode, start_bidx, gc_type); stat_inc_data_blk_count(sbi, 1, gc_type); } } @@ -670,6 +762,15 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, sum = page_address(sum_page); + /* + * this is to avoid deadlock: + * - lock_page(sum_page) - f2fs_replace_block + * - check_valid_map() - mutex_lock(sentry_lock) + * - mutex_lock(sentry_lock) - change_curseg() + * - lock_page(sum_page) + */ + unlock_page(sum_page); + switch (GET_SUM_TYPE((&sum->footer))) { case SUM_TYPE_NODE: gc_node_segment(sbi, sum->entries, segno, gc_type); @@ -683,7 +784,7 @@ static void do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer)), gc_type); stat_inc_call_count(sbi->stat_info); - f2fs_put_page(sum_page, 1); + f2fs_put_page(sum_page, 0); } int f2fs_gc(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index a844fcfb9..71b7206c4 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -79,8 +79,7 @@ f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info) const unsigned char *name = name_info->name; size_t len = name_info->len; - if ((len <= 2) && (name[0] == '.') && - (name[1] == '.' || name[1] == '\0')) + if (is_dot_dotdot(name_info)) return 0; /* Initialize the default seed for the hash checksum functions */ diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 8140e4f0e..a13ffcc32 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -13,7 +13,7 @@ #include "f2fs.h" -bool f2fs_may_inline(struct inode *inode) +bool f2fs_may_inline_data(struct inode *inode) { if (!test_opt(F2FS_I_SB(inode), INLINE_DATA)) return false; @@ -27,6 +27,20 @@ bool f2fs_may_inline(struct inode *inode) if (i_size_read(inode) > MAX_INLINE_DATA) return false; + if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) + return false; + + return true; +} + +bool f2fs_may_inline_dentry(struct inode *inode) +{ + if (!test_opt(F2FS_I_SB(inode), INLINE_DENTRY)) + return false; + + if (!S_ISDIR(inode->i_mode)) + return false; + return true; } @@ -95,8 +109,11 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) { void *src_addr, *dst_addr; struct f2fs_io_info fio = { + .sbi = F2FS_I_SB(dn->inode), .type = DATA, .rw = WRITE_SYNC | REQ_PRIO, + .page = page, + .encrypted_page = NULL, }; int dirty, err; @@ -124,13 +141,15 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) kunmap_atomic(dst_addr); SetPageUptodate(page); no_update: + set_page_dirty(page); + /* clear dirty state */ dirty = clear_page_dirty_for_io(page); /* write data page to try to make data consistent */ set_page_writeback(page); fio.blk_addr = dn->data_blkaddr; - write_data_page(page, dn, &fio); + write_data_page(dn, &fio); set_data_blkaddr(dn); f2fs_update_extent_cache(dn); f2fs_wait_on_page_writeback(page, DATA); @@ -267,23 +286,26 @@ process_inline: } struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, - struct qstr *name, struct page **res_page) + struct f2fs_filename *fname, struct page **res_page) { struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); struct f2fs_inline_dentry *inline_dentry; + struct qstr name = FSTR_TO_QSTR(&fname->disk_name); struct f2fs_dir_entry *de; struct f2fs_dentry_ptr d; struct page *ipage; + f2fs_hash_t namehash; ipage = get_node_page(sbi, dir->i_ino); if (IS_ERR(ipage)) return NULL; - inline_dentry = inline_data_addr(ipage); + namehash = f2fs_dentry_hash(&name); - make_dentry_ptr(&d, (void *)inline_dentry, 2); - de = find_target_dentry(name, NULL, &d); + inline_dentry = inline_data_addr(ipage); + make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2); + de = find_target_dentry(fname, namehash, NULL, &d); unlock_page(ipage); if (de) *res_page = ipage; @@ -325,7 +347,7 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent, dentry_blk = inline_data_addr(ipage); - make_dentry_ptr(&d, (void *)dentry_blk, 2); + make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2); do_make_empty_dir(inode, parent, &d); set_page_dirty(ipage); @@ -429,7 +451,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name, f2fs_wait_on_page_writeback(ipage, NODE); name_hash = f2fs_dentry_hash(name); - make_dentry_ptr(&d, (void *)dentry_blk, 2); + make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2); f2fs_update_dentry(ino, mode, &d, name, name_hash, bit_pos); set_page_dirty(ipage); @@ -506,7 +528,8 @@ bool f2fs_empty_inline_dir(struct inode *dir) return true; } -int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx) +int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, + struct f2fs_str *fstr) { struct inode *inode = file_inode(file); struct f2fs_inline_dentry *inline_dentry = NULL; @@ -522,9 +545,9 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx) inline_dentry = inline_data_addr(ipage); - make_dentry_ptr(&d, (void *)inline_dentry, 2); + make_dentry_ptr(inode, &d, (void *)inline_dentry, 2); - if (!f2fs_fill_dentries(ctx, &d, 0)) + if (!f2fs_fill_dentries(ctx, &d, 0, fstr)) ctx->pos = NR_INLINE_DENTRY; f2fs_put_page(ipage, 1); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index e622ec954..2550868dc 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -198,7 +198,10 @@ make_now: inode->i_mapping->a_ops = &f2fs_dblock_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO); } else if (S_ISLNK(inode->i_mode)) { - inode->i_op = &f2fs_symlink_inode_operations; + if (f2fs_encrypted_inode(inode)) + inode->i_op = &f2fs_encrypted_symlink_inode_operations; + else + inode->i_op = &f2fs_symlink_inode_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { @@ -359,6 +362,10 @@ no_delete: if (is_inode_flag_set(F2FS_I(inode), FI_UPDATE_WRITE)) add_dirty_inode(sbi, inode->i_ino, UPDATE_INO); out_clear: +#ifdef CONFIG_F2FS_FS_ENCRYPTION + if (F2FS_I(inode)->i_crypt_info) + f2fs_free_encryption_info(inode, F2FS_I(inode)->i_crypt_info); +#endif clear_inode(inode); } diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 658e8079a..fdbae21ee 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -56,11 +56,18 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) goto out; } - if (f2fs_may_inline(inode)) + /* If the directory encrypted, then we should encrypt the inode. */ + if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) + f2fs_set_encrypted_inode(inode); + + if (f2fs_may_inline_data(inode)) set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); - if (test_opt(sbi, INLINE_DENTRY) && S_ISDIR(inode->i_mode)) + if (f2fs_may_inline_dentry(inode)) set_inode_flag(F2FS_I(inode), FI_INLINE_DENTRY); + stat_inc_inline_inode(inode); + stat_inc_inline_dir(inode); + trace_f2fs_new_inode(inode, 0); mark_inode_dirty(inode); return inode; @@ -136,7 +143,6 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, alloc_nid_done(sbi, ino); - stat_inc_inline_inode(inode); d_instantiate(dentry, inode); unlock_new_inode(inode); @@ -155,6 +161,10 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, struct f2fs_sb_info *sbi = F2FS_I_SB(dir); int err; + if (f2fs_encrypted_inode(dir) && + !f2fs_is_child_context_consistent_with_parent(dir, inode)) + return -EPERM; + f2fs_balance_fs(sbi); inode->i_ctime = CURRENT_TIME; @@ -232,32 +242,34 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, struct inode *inode = NULL; struct f2fs_dir_entry *de; struct page *page; + nid_t ino; + int err = 0; if (dentry->d_name.len > F2FS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); de = f2fs_find_entry(dir, &dentry->d_name, &page); - if (de) { - nid_t ino = le32_to_cpu(de->ino); - f2fs_dentry_kunmap(dir, page); - f2fs_put_page(page, 0); + if (!de) + return d_splice_alias(inode, dentry); - inode = f2fs_iget(dir->i_sb, ino); - if (IS_ERR(inode)) - return ERR_CAST(inode); + ino = le32_to_cpu(de->ino); + f2fs_dentry_kunmap(dir, page); + f2fs_put_page(page, 0); - if (f2fs_has_inline_dots(inode)) { - int err; + inode = f2fs_iget(dir->i_sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); - err = __recover_dot_dentries(inode, dir->i_ino); - if (err) { - iget_failed(inode); - return ERR_PTR(err); - } - } + if (f2fs_has_inline_dots(inode)) { + err = __recover_dot_dentries(inode, dir->i_ino); + if (err) + goto err_out; } - return d_splice_alias(inode, dentry); + +err_out: + iget_failed(inode); + return ERR_PTR(err); } static int f2fs_unlink(struct inode *dir, struct dentry *dentry) @@ -296,19 +308,15 @@ fail: return err; } -static void *f2fs_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *f2fs_follow_link(struct dentry *dentry, void **cookie) { - struct page *page = page_follow_link_light(dentry, nd); - - if (IS_ERR_OR_NULL(page)) - return page; - - /* this is broken symlink case */ - if (*nd_get_link(nd) == 0) { - page_put_link(dentry, nd, page); - return ERR_PTR(-ENOENT); + const char *link = page_follow_link_light(dentry, cookie); + if (!IS_ERR(link) && !*link) { + /* this is broken symlink case */ + page_put_link(NULL, *cookie); + link = ERR_PTR(-ENOENT); } - return page; + return link; } static int f2fs_symlink(struct inode *dir, struct dentry *dentry, @@ -316,16 +324,26 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; - size_t symlen = strlen(symname) + 1; + size_t len = strlen(symname); + size_t p_len; + char *p_str; + struct f2fs_str disk_link = FSTR_INIT(NULL, 0); + struct f2fs_encrypted_symlink_data *sd = NULL; int err; + if (len > dir->i_sb->s_blocksize) + return -ENAMETOOLONG; + f2fs_balance_fs(sbi); inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO); if (IS_ERR(inode)) return PTR_ERR(inode); - inode->i_op = &f2fs_symlink_inode_operations; + if (f2fs_encrypted_inode(inode)) + inode->i_op = &f2fs_encrypted_symlink_inode_operations; + else + inode->i_op = &f2fs_symlink_inode_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; f2fs_lock_op(sbi); @@ -333,10 +351,46 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, if (err) goto out; f2fs_unlock_op(sbi); - - err = page_symlink(inode, symname, symlen); alloc_nid_done(sbi, inode->i_ino); + if (f2fs_encrypted_inode(dir)) { + struct qstr istr = QSTR_INIT(symname, len); + + err = f2fs_get_encryption_info(inode); + if (err) + goto err_out; + + err = f2fs_fname_crypto_alloc_buffer(inode, len, &disk_link); + if (err) + goto err_out; + + err = f2fs_fname_usr_to_disk(inode, &istr, &disk_link); + if (err < 0) + goto err_out; + + p_len = encrypted_symlink_data_len(disk_link.len) + 1; + + if (p_len > dir->i_sb->s_blocksize) { + err = -ENAMETOOLONG; + goto err_out; + } + + sd = kzalloc(p_len, GFP_NOFS); + if (!sd) { + err = -ENOMEM; + goto err_out; + } + memcpy(sd->encrypted_path, disk_link.name, disk_link.len); + sd->len = cpu_to_le16(disk_link.len); + p_str = (char *)sd; + } else { + p_len = len + 1; + p_str = (char *)symname; + } + + err = page_symlink(inode, p_str, p_len); + +err_out: d_instantiate(dentry, inode); unlock_new_inode(inode); @@ -349,10 +403,14 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, * If the symlink path is stored into inline_data, there is no * performance regression. */ - filemap_write_and_wait_range(inode->i_mapping, 0, symlen - 1); + if (!err) + filemap_write_and_wait_range(inode->i_mapping, 0, p_len - 1); if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); + + kfree(sd); + f2fs_fname_crypto_free_buffer(&disk_link); return err; out: handle_failed_inode(inode); @@ -383,7 +441,6 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) goto out_fail; f2fs_unlock_op(sbi); - stat_inc_inline_dir(inode); alloc_nid_done(sbi, inode->i_ino); d_instantiate(dentry, inode); @@ -445,19 +502,101 @@ out: return err; } +static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, + umode_t mode, struct inode **whiteout) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + struct inode *inode; + int err; + + if (!whiteout) + f2fs_balance_fs(sbi); + + inode = f2fs_new_inode(dir, mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); + + if (whiteout) { + init_special_inode(inode, inode->i_mode, WHITEOUT_DEV); + inode->i_op = &f2fs_special_inode_operations; + } else { + inode->i_op = &f2fs_file_inode_operations; + inode->i_fop = &f2fs_file_operations; + inode->i_mapping->a_ops = &f2fs_dblock_aops; + } + + f2fs_lock_op(sbi); + err = acquire_orphan_inode(sbi); + if (err) + goto out; + + err = f2fs_do_tmpfile(inode, dir); + if (err) + goto release_out; + + /* + * add this non-linked tmpfile to orphan list, in this way we could + * remove all unused data of tmpfile after abnormal power-off. + */ + add_orphan_inode(sbi, inode->i_ino); + f2fs_unlock_op(sbi); + + alloc_nid_done(sbi, inode->i_ino); + + if (whiteout) { + inode_dec_link_count(inode); + *whiteout = inode; + } else { + d_tmpfile(dentry, inode); + } + unlock_new_inode(inode); + return 0; + +release_out: + release_orphan_inode(sbi); +out: + handle_failed_inode(inode); + return err; +} + +static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + if (f2fs_encrypted_inode(dir)) { + int err = f2fs_get_encryption_info(dir); + if (err) + return err; + } + + return __f2fs_tmpfile(dir, dentry, mode, NULL); +} + +static int f2fs_create_whiteout(struct inode *dir, struct inode **whiteout) +{ + return __f2fs_tmpfile(dir, NULL, S_IFCHR | WHITEOUT_MODE, whiteout); +} + static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir); struct inode *old_inode = d_inode(old_dentry); struct inode *new_inode = d_inode(new_dentry); + struct inode *whiteout = NULL; struct page *old_dir_page; - struct page *old_page, *new_page; + struct page *old_page, *new_page = NULL; struct f2fs_dir_entry *old_dir_entry = NULL; struct f2fs_dir_entry *old_entry; struct f2fs_dir_entry *new_entry; int err = -ENOENT; + if ((old_dir != new_dir) && f2fs_encrypted_inode(new_dir) && + !f2fs_is_child_context_consistent_with_parent(new_dir, + old_inode)) { + err = -EPERM; + goto out; + } + f2fs_balance_fs(sbi); old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); @@ -471,17 +610,23 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_old; } + if (flags & RENAME_WHITEOUT) { + err = f2fs_create_whiteout(old_dir, &whiteout); + if (err) + goto out_dir; + } + if (new_inode) { err = -ENOTEMPTY; if (old_dir_entry && !f2fs_empty_dir(new_inode)) - goto out_dir; + goto out_whiteout; err = -ENOENT; new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, &new_page); if (!new_entry) - goto out_dir; + goto out_whiteout; f2fs_lock_op(sbi); @@ -489,7 +634,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (err) goto put_out_dir; - if (update_dent_inode(old_inode, &new_dentry->d_name)) { + if (update_dent_inode(old_inode, new_inode, + &new_dentry->d_name)) { release_orphan_inode(sbi); goto put_out_dir; } @@ -518,7 +664,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, err = f2fs_add_link(new_dentry, old_inode); if (err) { f2fs_unlock_op(sbi); - goto out_dir; + goto out_whiteout; } if (old_dir_entry) { @@ -529,6 +675,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, down_write(&F2FS_I(old_inode)->i_sem); file_lost_pino(old_inode); + if (new_inode && file_enc_name(new_inode)) + file_set_enc_name(old_inode); up_write(&F2FS_I(old_inode)->i_sem); old_inode->i_ctime = CURRENT_TIME; @@ -536,8 +684,18 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_delete_entry(old_entry, old_page, old_dir, NULL); + if (whiteout) { + whiteout->i_state |= I_LINKABLE; + set_inode_flag(F2FS_I(whiteout), FI_INC_LINK); + err = f2fs_add_link(old_dentry, whiteout); + if (err) + goto put_out_dir; + whiteout->i_state &= ~I_LINKABLE; + iput(whiteout); + } + if (old_dir_entry) { - if (old_dir != new_dir) { + if (old_dir != new_dir && !whiteout) { f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); update_inode_page(old_inode); @@ -558,8 +716,13 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, put_out_dir: f2fs_unlock_op(sbi); - f2fs_dentry_kunmap(new_dir, new_page); - f2fs_put_page(new_page, 0); + if (new_page) { + f2fs_dentry_kunmap(new_dir, new_page); + f2fs_put_page(new_page, 0); + } +out_whiteout: + if (whiteout) + iput(whiteout); out_dir: if (old_dir_entry) { f2fs_dentry_kunmap(old_inode, old_dir_page); @@ -585,6 +748,14 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, int old_nlink = 0, new_nlink = 0; int err = -ENOENT; + if ((f2fs_encrypted_inode(old_dir) || f2fs_encrypted_inode(new_dir)) && + (old_dir != new_dir) && + (!f2fs_is_child_context_consistent_with_parent(new_dir, + old_inode) || + !f2fs_is_child_context_consistent_with_parent(old_dir, + new_inode))) + return -EPERM; + f2fs_balance_fs(sbi); old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); @@ -631,13 +802,17 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_lock_op(sbi); - err = update_dent_inode(old_inode, &new_dentry->d_name); + err = update_dent_inode(old_inode, new_inode, &new_dentry->d_name); if (err) goto out_unlock; + if (file_enc_name(new_inode)) + file_set_enc_name(old_inode); - err = update_dent_inode(new_inode, &old_dentry->d_name); + err = update_dent_inode(new_inode, old_inode, &old_dentry->d_name); if (err) goto out_undo; + if (file_enc_name(old_inode)) + file_set_enc_name(new_inode); /* update ".." directory entry info of old dentry */ if (old_dir_entry) @@ -695,8 +870,11 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_sync_fs(sbi->sb, 1); return 0; out_undo: - /* Still we may fail to recover name info of f2fs_inode here */ - update_dent_inode(old_inode, &old_dentry->d_name); + /* + * Still we may fail to recover name info of f2fs_inode here + * Drop it, once its name is set as encrypted + */ + update_dent_inode(old_inode, old_inode, &old_dentry->d_name); out_unlock: f2fs_unlock_op(sbi); out_new_dir: @@ -723,7 +901,7 @@ static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { - if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; if (flags & RENAME_EXCHANGE) { @@ -734,53 +912,85 @@ static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry, * VFS has already handled the new dentry existence case, * here, we just deal with "RENAME_NOREPLACE" as regular rename. */ - return f2fs_rename(old_dir, old_dentry, new_dir, new_dentry); + return f2fs_rename(old_dir, old_dentry, new_dir, new_dentry, flags); } -static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +#ifdef CONFIG_F2FS_FS_ENCRYPTION +static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cookie) { - struct f2fs_sb_info *sbi = F2FS_I_SB(dir); - struct inode *inode; - int err; - - inode = f2fs_new_inode(dir, mode); - if (IS_ERR(inode)) - return PTR_ERR(inode); - - inode->i_op = &f2fs_file_inode_operations; - inode->i_fop = &f2fs_file_operations; - inode->i_mapping->a_ops = &f2fs_dblock_aops; - - f2fs_lock_op(sbi); - err = acquire_orphan_inode(sbi); - if (err) - goto out; - - err = f2fs_do_tmpfile(inode, dir); - if (err) - goto release_out; - - /* - * add this non-linked tmpfile to orphan list, in this way we could - * remove all unused data of tmpfile after abnormal power-off. - */ - add_orphan_inode(sbi, inode->i_ino); - f2fs_unlock_op(sbi); - - alloc_nid_done(sbi, inode->i_ino); + struct page *cpage = NULL; + char *caddr, *paddr = NULL; + struct f2fs_str cstr; + struct f2fs_str pstr = FSTR_INIT(NULL, 0); + struct inode *inode = d_inode(dentry); + struct f2fs_encrypted_symlink_data *sd; + loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1); + u32 max_size = inode->i_sb->s_blocksize; + int res; + + res = f2fs_get_encryption_info(inode); + if (res) + return ERR_PTR(res); + + cpage = read_mapping_page(inode->i_mapping, 0, NULL); + if (IS_ERR(cpage)) + return ERR_CAST(cpage); + caddr = kmap(cpage); + caddr[size] = 0; + + /* Symlink is encrypted */ + sd = (struct f2fs_encrypted_symlink_data *)caddr; + cstr.name = sd->encrypted_path; + cstr.len = le16_to_cpu(sd->len); - stat_inc_inline_inode(inode); - d_tmpfile(dentry, inode); - unlock_new_inode(inode); - return 0; + /* this is broken symlink case */ + if (cstr.name[0] == 0 && cstr.len == 0) { + res = -ENOENT; + goto errout; + } -release_out: - release_orphan_inode(sbi); -out: - handle_failed_inode(inode); - return err; + if ((cstr.len + sizeof(struct f2fs_encrypted_symlink_data) - 1) > + max_size) { + /* Symlink data on the disk is corrupted */ + res = -EIO; + goto errout; + } + res = f2fs_fname_crypto_alloc_buffer(inode, cstr.len, &pstr); + if (res) + goto errout; + + res = f2fs_fname_disk_to_usr(inode, NULL, &cstr, &pstr); + if (res < 0) + goto errout; + + paddr = pstr.name; + + /* Null-terminate the name */ + paddr[res] = '\0'; + + kunmap(cpage); + page_cache_release(cpage); + return *cookie = paddr; +errout: + f2fs_fname_crypto_free_buffer(&pstr); + kunmap(cpage); + page_cache_release(cpage); + return ERR_PTR(res); } +const struct inode_operations f2fs_encrypted_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = f2fs_encrypted_follow_link, + .put_link = kfree_put_link, + .getattr = f2fs_getattr, + .setattr = f2fs_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = f2fs_listxattr, + .removexattr = generic_removexattr, +}; +#endif + const struct inode_operations f2fs_dir_inode_operations = { .create = f2fs_create, .lookup = f2fs_lookup, diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 8ab0cf193..7dd63b794 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -53,7 +53,7 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) PAGE_CACHE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); } else if (type == DIRTY_DENTS) { - if (sbi->sb->s_bdi->dirty_exceeded) + if (sbi->sb->s_bdi->wb.dirty_exceeded) return false; mem_size = get_pages(sbi, F2FS_DIRTY_DENTS); res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); @@ -70,7 +70,7 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) sizeof(struct extent_node)) >> PAGE_CACHE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else { - if (sbi->sb->s_bdi->dirty_exceeded) + if (sbi->sb->s_bdi->wb.dirty_exceeded) return false; } return res; @@ -195,32 +195,35 @@ static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i, start, nr); } -bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) +int need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; - bool is_cp = true; + bool need = false; down_read(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); - if (e && !get_nat_flag(e, IS_CHECKPOINTED)) - is_cp = false; + if (e) { + if (!get_nat_flag(e, IS_CHECKPOINTED) && + !get_nat_flag(e, HAS_FSYNCED_INODE)) + need = true; + } up_read(&nm_i->nat_tree_lock); - return is_cp; + return need; } -bool has_fsynced_inode(struct f2fs_sb_info *sbi, nid_t ino) +bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; - bool fsynced = false; + bool is_cp = true; down_read(&nm_i->nat_tree_lock); - e = __lookup_nat_cache(nm_i, ino); - if (e && get_nat_flag(e, HAS_FSYNCED_INODE)) - fsynced = true; + e = __lookup_nat_cache(nm_i, nid); + if (e && !get_nat_flag(e, IS_CHECKPOINTED)) + is_cp = false; up_read(&nm_i->nat_tree_lock); - return fsynced; + return is_cp; } bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino) @@ -312,7 +315,8 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, __set_nat_cache_dirty(nm_i, e); /* update fsync_mark if its inode nat entry is still alive */ - e = __lookup_nat_cache(nm_i, ni->ino); + if (ni->nid != ni->ino) + e = __lookup_nat_cache(nm_i, ni->ino); if (e) { if (fsync_done && ni->nid == ni->ino) set_nat_flag(e, HAS_FSYNCED_INODE, true); @@ -995,8 +999,11 @@ static int read_node_page(struct page *page, int rw) struct f2fs_sb_info *sbi = F2FS_P_SB(page); struct node_info ni; struct f2fs_io_info fio = { + .sbi = sbi, .type = NODE, .rw = rw, + .page = page, + .encrypted_page = NULL, }; get_node_info(sbi, page->index, &ni); @@ -1011,7 +1018,7 @@ static int read_node_page(struct page *page, int rw) return LOCKED_PAGE; fio.blk_addr = ni.blk_addr; - return f2fs_submit_page_bio(sbi, page, &fio); + return f2fs_submit_page_bio(&fio); } /* @@ -1204,13 +1211,9 @@ continue_unlock: /* called by fsync() */ if (ino && IS_DNODE(page)) { set_fsync_mark(page, 1); - if (IS_INODE(page)) { - if (!is_checkpointed_node(sbi, ino) && - !has_fsynced_inode(sbi, ino)) - set_dentry_mark(page, 1); - else - set_dentry_mark(page, 0); - } + if (IS_INODE(page)) + set_dentry_mark(page, + need_dentry_mark(sbi, ino)); nwritten++; } else { set_fsync_mark(page, 0); @@ -1293,8 +1296,11 @@ static int f2fs_write_node_page(struct page *page, nid_t nid; struct node_info ni; struct f2fs_io_info fio = { + .sbi = sbi, .type = NODE, .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE, + .page = page, + .encrypted_page = NULL, }; trace_f2fs_writepage(page, NODE); @@ -1329,7 +1335,7 @@ static int f2fs_write_node_page(struct page *page, set_page_writeback(page); fio.blk_addr = ni.blk_addr; - write_node_page(sbi, page, nid, &fio); + write_node_page(nid, &fio); set_node_addr(sbi, &ni, fio.blk_addr, is_fsync_dnode(page)); dec_page_count(sbi, F2FS_DIRTY_NODES); up_read(&sbi->node_write); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index c56026f17..7427e956a 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -343,28 +343,6 @@ static inline nid_t get_nid(struct page *p, int off, bool i) * - Mark cold node blocks in their node footer * - Mark cold data pages in page cache */ -static inline int is_file(struct inode *inode, int type) -{ - return F2FS_I(inode)->i_advise & type; -} - -static inline void set_file(struct inode *inode, int type) -{ - F2FS_I(inode)->i_advise |= type; -} - -static inline void clear_file(struct inode *inode, int type) -{ - F2FS_I(inode)->i_advise &= ~type; -} - -#define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT) -#define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) -#define file_set_cold(inode) set_file(inode, FADVISE_COLD_BIT) -#define file_lost_pino(inode) set_file(inode, FADVISE_LOST_PINO_BIT) -#define file_clear_cold(inode) clear_file(inode, FADVISE_COLD_BIT) -#define file_got_pino(inode) clear_file(inode, FADVISE_LOST_PINO_BIT) - static inline int is_cold_data(struct page *page) { return PageChecked(page); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 8d8ea99f2..24a8c1d4f 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -83,6 +83,11 @@ static int recover_dentry(struct inode *inode, struct page *ipage) goto out; } + if (file_enc_name(inode)) { + iput(dir); + return 0; + } + name.len = le32_to_cpu(raw_inode->i_namelen); name.name = raw_inode->i_name; @@ -143,6 +148,7 @@ out: static void recover_inode(struct inode *inode, struct page *page) { struct f2fs_inode *raw = F2FS_INODE(page); + char *name; inode->i_mode = le16_to_cpu(raw->i_mode); i_size_write(inode, le64_to_cpu(raw->i_size)); @@ -153,8 +159,13 @@ static void recover_inode(struct inode *inode, struct page *page) inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec); + if (file_enc_name(inode)) + name = "<encrypted>"; + else + name = F2FS_INODE(page)->i_name; + f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode: ino = %x, name = %s", - ino_of_node(page), F2FS_INODE(page)->i_name); + ino_of_node(page), name); } static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) @@ -174,7 +185,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) while (1) { struct fsync_inode_entry *entry; - if (blkaddr < MAIN_BLKADDR(sbi) || blkaddr >= MAX_BLKADDR(sbi)) + if (!is_valid_blkaddr(sbi, blkaddr, META_POR)) return 0; page = get_meta_page(sbi, blkaddr); @@ -349,7 +360,6 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, struct f2fs_inode_info *fi = F2FS_I(inode); unsigned int start, end; struct dnode_of_data dn; - struct f2fs_summary sum; struct node_info ni; int err = 0, recovered = 0; @@ -396,7 +406,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, dest = datablock_addr(page, dn.ofs_in_node); if (src != dest && dest != NEW_ADDR && dest != NULL_ADDR && - dest >= MAIN_BLKADDR(sbi) && dest < MAX_BLKADDR(sbi)) { + is_valid_blkaddr(sbi, dest, META_POR)) { if (src == NULL_ADDR) { err = reserve_new_block(&dn); @@ -409,13 +419,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, if (err) goto err; - set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); - /* write dummy data page */ - recover_data_page(sbi, NULL, &sum, src, dest); - dn.data_blkaddr = dest; - set_data_blkaddr(&dn); - f2fs_update_extent_cache(&dn); + f2fs_replace_block(sbi, &dn, src, dest, + ni.version, false); recovered++; } dn.ofs_in_node++; @@ -454,7 +460,7 @@ static int recover_data(struct f2fs_sb_info *sbi, while (1) { struct fsync_inode_entry *entry; - if (blkaddr < MAIN_BLKADDR(sbi) || blkaddr >= MAX_BLKADDR(sbi)) + if (!is_valid_blkaddr(sbi, blkaddr, META_POR)) break; ra_meta_pages_cond(sbi, blkaddr); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f93966094..61b97f9cb 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -75,6 +75,14 @@ static inline unsigned long __reverse_ffs(unsigned long word) static unsigned long __find_rev_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { + while (!f2fs_test_bit(offset, (unsigned char *)addr)) + offset++; + + if (offset > size) + offset = size; + + return offset; +#if 0 const unsigned long *p = addr + BIT_WORD(offset); unsigned long result = offset & ~(BITS_PER_LONG - 1); unsigned long tmp; @@ -121,11 +129,20 @@ found_first: return result + size; /* Nope. */ found_middle: return result + __reverse_ffs(tmp); +#endif } static unsigned long __find_rev_next_zero_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { + while (f2fs_test_bit(offset, (unsigned char *)addr)) + offset++; + + if (offset > size) + offset = size; + + return offset; +#if 0 const unsigned long *p = addr + BIT_WORD(offset); unsigned long result = offset & ~(BITS_PER_LONG - 1); unsigned long tmp; @@ -173,6 +190,7 @@ found_first: return result + size; /* Nope. */ found_middle: return result + __reverse_ffz(tmp); +#endif } void register_inmem_page(struct inode *inode, struct page *page) @@ -216,8 +234,10 @@ void commit_inmem_pages(struct inode *inode, bool abort) struct inmem_pages *cur, *tmp; bool submit_bio = false; struct f2fs_io_info fio = { + .sbi = sbi, .type = DATA, .rw = WRITE_SYNC | REQ_PRIO, + .encrypted_page = NULL, }; /* @@ -237,11 +257,13 @@ void commit_inmem_pages(struct inode *inode, bool abort) if (!abort) { lock_page(cur->page); if (cur->page->mapping == inode->i_mapping) { + set_page_dirty(cur->page); f2fs_wait_on_page_writeback(cur->page, DATA); if (clear_page_dirty_for_io(cur->page)) inode_dec_dirty_pages(inode); trace_f2fs_commit_inmem_page(cur->page, INMEM); - do_write_data_page(cur->page, &fio); + fio.page = cur->page; + do_write_data_page(&fio); submit_bio = true; } f2fs_put_page(cur->page, 1); @@ -466,22 +488,43 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi, { sector_t start = SECTOR_FROM_BLOCK(blkstart); sector_t len = SECTOR_FROM_BLOCK(blklen); + struct seg_entry *se; + unsigned int offset; + block_t i; + + for (i = blkstart; i < blkstart + blklen; i++) { + se = get_seg_entry(sbi, GET_SEGNO(sbi, i)); + offset = GET_BLKOFF_FROM_SEG0(sbi, i); + + if (!f2fs_test_and_set_bit(offset, se->discard_map)) + sbi->discard_blks--; + } trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0); } void discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr) { - if (f2fs_issue_discard(sbi, blkaddr, 1)) { - struct page *page = grab_meta_page(sbi, blkaddr); - /* zero-filled page */ - set_page_dirty(page); - f2fs_put_page(page, 1); + int err = -ENOTSUPP; + + if (test_opt(sbi, DISCARD)) { + struct seg_entry *se = get_seg_entry(sbi, + GET_SEGNO(sbi, blkaddr)); + unsigned int offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); + + if (f2fs_test_bit(offset, se->discard_map)) + return; + + err = f2fs_issue_discard(sbi, blkaddr, 1); } + + if (err) + update_meta_page(sbi, NULL, blkaddr); } static void __add_discard_entry(struct f2fs_sb_info *sbi, - struct cp_control *cpc, unsigned int start, unsigned int end) + struct cp_control *cpc, struct seg_entry *se, + unsigned int start, unsigned int end) { struct list_head *head = &SM_I(sbi)->discard_list; struct discard_entry *new, *last; @@ -502,7 +545,6 @@ static void __add_discard_entry(struct f2fs_sb_info *sbi, list_add_tail(&new->list, head); done: SM_I(sbi)->nr_discards += end - start; - cpc->trimmed += end - start; } static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) @@ -512,41 +554,24 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct seg_entry *se = get_seg_entry(sbi, cpc->trim_start); unsigned long *cur_map = (unsigned long *)se->cur_valid_map; unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; + unsigned long *discard_map = (unsigned long *)se->discard_map; unsigned long *dmap = SIT_I(sbi)->tmp_map; unsigned int start = 0, end = -1; bool force = (cpc->reason == CP_DISCARD); int i; - if (!force && (!test_opt(sbi, DISCARD) || - SM_I(sbi)->nr_discards >= SM_I(sbi)->max_discards)) + if (se->valid_blocks == max_blocks) return; - if (force && !se->valid_blocks) { - struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - /* - * if this segment is registered in the prefree list, then - * we should skip adding a discard candidate, and let the - * checkpoint do that later. - */ - mutex_lock(&dirty_i->seglist_lock); - if (test_bit(cpc->trim_start, dirty_i->dirty_segmap[PRE])) { - mutex_unlock(&dirty_i->seglist_lock); - cpc->trimmed += sbi->blocks_per_seg; + if (!force) { + if (!test_opt(sbi, DISCARD) || !se->valid_blocks || + SM_I(sbi)->nr_discards >= SM_I(sbi)->max_discards) return; - } - mutex_unlock(&dirty_i->seglist_lock); - - __add_discard_entry(sbi, cpc, 0, sbi->blocks_per_seg); - return; } - /* zero block will be discarded through the prefree list */ - if (!se->valid_blocks || se->valid_blocks == max_blocks) - return; - /* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */ for (i = 0; i < entries; i++) - dmap[i] = force ? ~ckpt_map[i] : + dmap[i] = force ? ~ckpt_map[i] & ~discard_map[i] : (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i]; while (force || SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) { @@ -555,11 +580,7 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) break; end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1); - - if (force && end - start < cpc->trim_minlen) - continue; - - __add_discard_entry(sbi, cpc, start, end); + __add_discard_entry(sbi, cpc, se, start, end); } } @@ -589,7 +610,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) mutex_unlock(&dirty_i->seglist_lock); } -void clear_prefree_segments(struct f2fs_sb_info *sbi) +void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct list_head *head = &(SM_I(sbi)->discard_list); struct discard_entry *entry, *this; @@ -622,7 +643,11 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi) /* send small discards */ list_for_each_entry_safe(entry, this, head, list) { + if (cpc->reason == CP_DISCARD && entry->len < cpc->trim_minlen) + goto skip; f2fs_issue_discard(sbi, entry->blkaddr, entry->len); + cpc->trimmed += entry->len; +skip: list_del(&entry->list); SM_I(sbi)->nr_discards -= entry->len; kmem_cache_free(discard_entry_slab, entry); @@ -673,9 +698,13 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) if (del > 0) { if (f2fs_test_and_set_bit(offset, se->cur_valid_map)) f2fs_bug_on(sbi, 1); + if (!f2fs_test_and_set_bit(offset, se->discard_map)) + sbi->discard_blks--; } else { if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map)) f2fs_bug_on(sbi, 1); + if (f2fs_test_and_clear_bit(offset, se->discard_map)) + sbi->discard_blks++; } if (!f2fs_test_bit(offset, se->ckpt_valid_map)) se->ckpt_valid_blocks += del; @@ -769,16 +798,25 @@ struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno) return get_meta_page(sbi, GET_SUM_BLOCK(sbi, segno)); } -static void write_sum_page(struct f2fs_sb_info *sbi, - struct f2fs_summary_block *sum_blk, block_t blk_addr) +void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr) { struct page *page = grab_meta_page(sbi, blk_addr); - void *kaddr = page_address(page); - memcpy(kaddr, sum_blk, PAGE_CACHE_SIZE); + void *dst = page_address(page); + + if (src) + memcpy(dst, src, PAGE_CACHE_SIZE); + else + memset(dst, 0, PAGE_CACHE_SIZE); set_page_dirty(page); f2fs_put_page(page, 1); } +static void write_sum_page(struct f2fs_sb_info *sbi, + struct f2fs_summary_block *sum_blk, block_t blk_addr) +{ + update_meta_page(sbi, (void *)sum_blk, blk_addr); +} + static int is_next_segment_free(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -1060,8 +1098,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) unsigned int start_segno, end_segno; struct cp_control cpc; - if (range->minlen > SEGMENT_SIZE(sbi) || start >= MAX_BLKADDR(sbi) || - range->len < sbi->blocksize) + if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize) return -EINVAL; cpc.trimmed = 0; @@ -1073,12 +1110,19 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 : GET_SEGNO(sbi, end); cpc.reason = CP_DISCARD; - cpc.trim_minlen = F2FS_BYTES_TO_BLK(range->minlen); + cpc.trim_minlen = max_t(__u64, 1, F2FS_BYTES_TO_BLK(range->minlen)); /* do checkpoint to issue discard commands safely */ for (; start_segno <= end_segno; start_segno = cpc.trim_end + 1) { cpc.trim_start = start_segno; - cpc.trim_end = min_t(unsigned int, rounddown(start_segno + + + if (sbi->discard_blks == 0) + break; + else if (sbi->discard_blks < BATCHED_TRIM_BLOCKS(sbi)) + cpc.trim_end = end_segno; + else + cpc.trim_end = min_t(unsigned int, + rounddown(start_segno + BATCHED_TRIM_SEGMENTS(sbi), sbi->segs_per_sec) - 1, end_segno); @@ -1206,84 +1250,95 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, mutex_unlock(&curseg->curseg_mutex); } -static void do_write_page(struct f2fs_sb_info *sbi, struct page *page, - struct f2fs_summary *sum, - struct f2fs_io_info *fio) +static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { - int type = __get_segment_type(page, fio->type); + int type = __get_segment_type(fio->page, fio->type); - allocate_data_block(sbi, page, fio->blk_addr, &fio->blk_addr, sum, type); + allocate_data_block(fio->sbi, fio->page, fio->blk_addr, + &fio->blk_addr, sum, type); /* writeout dirty page into bdev */ - f2fs_submit_page_mbio(sbi, page, fio); + f2fs_submit_page_mbio(fio); } void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) { struct f2fs_io_info fio = { + .sbi = sbi, .type = META, .rw = WRITE_SYNC | REQ_META | REQ_PRIO, .blk_addr = page->index, + .page = page, + .encrypted_page = NULL, }; set_page_writeback(page); - f2fs_submit_page_mbio(sbi, page, &fio); + f2fs_submit_page_mbio(&fio); } -void write_node_page(struct f2fs_sb_info *sbi, struct page *page, - unsigned int nid, struct f2fs_io_info *fio) +void write_node_page(unsigned int nid, struct f2fs_io_info *fio) { struct f2fs_summary sum; + set_summary(&sum, nid, 0, 0); - do_write_page(sbi, page, &sum, fio); + do_write_page(&sum, fio); } -void write_data_page(struct page *page, struct dnode_of_data *dn, - struct f2fs_io_info *fio) +void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio) { - struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + struct f2fs_sb_info *sbi = fio->sbi; struct f2fs_summary sum; struct node_info ni; f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR); get_node_info(sbi, dn->nid, &ni); set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); - do_write_page(sbi, page, &sum, fio); + do_write_page(&sum, fio); dn->data_blkaddr = fio->blk_addr; } -void rewrite_data_page(struct page *page, struct f2fs_io_info *fio) +void rewrite_data_page(struct f2fs_io_info *fio) { - stat_inc_inplace_blocks(F2FS_P_SB(page)); - f2fs_submit_page_mbio(F2FS_P_SB(page), page, fio); + stat_inc_inplace_blocks(fio->sbi); + f2fs_submit_page_mbio(fio); } -void recover_data_page(struct f2fs_sb_info *sbi, - struct page *page, struct f2fs_summary *sum, - block_t old_blkaddr, block_t new_blkaddr) +static void __f2fs_replace_block(struct f2fs_sb_info *sbi, + struct f2fs_summary *sum, + block_t old_blkaddr, block_t new_blkaddr, + bool recover_curseg) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg; unsigned int segno, old_cursegno; struct seg_entry *se; int type; + unsigned short old_blkoff; segno = GET_SEGNO(sbi, new_blkaddr); se = get_seg_entry(sbi, segno); type = se->type; - if (se->valid_blocks == 0 && !IS_CURSEG(sbi, segno)) { - if (old_blkaddr == NULL_ADDR) - type = CURSEG_COLD_DATA; - else + if (!recover_curseg) { + /* for recovery flow */ + if (se->valid_blocks == 0 && !IS_CURSEG(sbi, segno)) { + if (old_blkaddr == NULL_ADDR) + type = CURSEG_COLD_DATA; + else + type = CURSEG_WARM_DATA; + } + } else { + if (!IS_CURSEG(sbi, segno)) type = CURSEG_WARM_DATA; } + curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); mutex_lock(&sit_i->sentry_lock); old_cursegno = curseg->segno; + old_blkoff = curseg->next_blkoff; /* change the current segment */ if (segno != curseg->segno) { @@ -1297,30 +1352,67 @@ void recover_data_page(struct f2fs_sb_info *sbi, refresh_sit_entry(sbi, old_blkaddr, new_blkaddr); locate_dirty_segment(sbi, old_cursegno); + if (recover_curseg) { + if (old_cursegno != curseg->segno) { + curseg->next_segno = old_cursegno; + change_curseg(sbi, type, true); + } + curseg->next_blkoff = old_blkoff; + } + mutex_unlock(&sit_i->sentry_lock); mutex_unlock(&curseg->curseg_mutex); } +void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, + block_t old_addr, block_t new_addr, + unsigned char version, bool recover_curseg) +{ + struct f2fs_summary sum; + + set_summary(&sum, dn->nid, dn->ofs_in_node, version); + + __f2fs_replace_block(sbi, &sum, old_addr, new_addr, recover_curseg); + + dn->data_blkaddr = new_addr; + set_data_blkaddr(dn); + f2fs_update_extent_cache(dn); +} + static inline bool is_merged_page(struct f2fs_sb_info *sbi, struct page *page, enum page_type type) { enum page_type btype = PAGE_TYPE_OF_BIO(type); struct f2fs_bio_info *io = &sbi->write_io[btype]; struct bio_vec *bvec; + struct page *target; int i; down_read(&io->io_rwsem); - if (!io->bio) - goto out; + if (!io->bio) { + up_read(&io->io_rwsem); + return false; + } bio_for_each_segment_all(bvec, io->bio, i) { - if (page == bvec->bv_page) { + + if (bvec->bv_page->mapping) { + target = bvec->bv_page; + } else { + struct f2fs_crypto_ctx *ctx; + + /* encrypted page */ + ctx = (struct f2fs_crypto_ctx *)page_private( + bvec->bv_page); + target = ctx->w.control_page; + } + + if (page == target) { up_read(&io->io_rwsem); return true; } } -out: up_read(&io->io_rwsem); return false; } @@ -1857,8 +1949,11 @@ static int build_sit_info(struct f2fs_sb_info *sbi) = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); sit_i->sentries[start].ckpt_valid_map = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); - if (!sit_i->sentries[start].cur_valid_map - || !sit_i->sentries[start].ckpt_valid_map) + sit_i->sentries[start].discard_map + = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + if (!sit_i->sentries[start].cur_valid_map || + !sit_i->sentries[start].ckpt_valid_map || + !sit_i->sentries[start].discard_map) return -ENOMEM; } @@ -1996,6 +2091,11 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) got_it: check_block_count(sbi, start, &sit); seg_info_from_raw_sit(se, &sit); + + /* build discard map only one time */ + memcpy(se->discard_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += sbi->blocks_per_seg - se->valid_blocks; + if (sbi->segs_per_sec > 1) { struct sec_entry *e = get_sec_entry(sbi, start); e->valid_blocks += se->valid_blocks; @@ -2245,6 +2345,7 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) for (start = 0; start < MAIN_SEGS(sbi); start++) { kfree(sit_i->sentries[start].cur_valid_map); kfree(sit_i->sentries[start].ckpt_valid_map); + kfree(sit_i->sentries[start].discard_map); } } kfree(sit_i->tmp_map); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 85d7fa751..79e7b879a 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -9,6 +9,7 @@ * published by the Free Software Foundation. */ #include <linux/blkdev.h> +#include <linux/backing-dev.h> /* constant macro */ #define NULL_SEGNO ((unsigned int)(~0)) @@ -163,6 +164,7 @@ struct seg_entry { */ unsigned short ckpt_valid_blocks; unsigned char *ckpt_valid_map; + unsigned char *discard_map; unsigned char type; /* segment type like CURSEG_XXX_TYPE */ unsigned long long mtime; /* modification time of the segment */ }; @@ -713,7 +715,7 @@ static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi) */ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) { - if (sbi->sb->s_bdi->dirty_exceeded) + if (sbi->sb->s_bdi->wb.dirty_exceeded) return 0; if (type == DATA) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index b2dd1b01f..a06b0b46f 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -258,6 +258,7 @@ static void init_once(void *foo) static int parse_options(struct super_block *sb, char *options) { struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct request_queue *q; substring_t args[MAX_OPT_ARGS]; char *p, *name; int arg = 0; @@ -302,7 +303,14 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; break; case Opt_discard: - set_opt(sbi, DISCARD); + q = bdev_get_queue(sb->s_bdev); + if (blk_queue_discard(q)) { + set_opt(sbi, DISCARD); + } else { + f2fs_msg(sb, KERN_WARNING, + "mounting with \"discard\" option, but " + "the device does not support discard"); + } break; case Opt_noheap: set_opt(sbi, NOHEAP); @@ -416,6 +424,9 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) /* Will be used by directory only */ fi->i_dir_level = F2FS_SB(sb)->dir_level; +#ifdef CONFIG_F2FS_FS_ENCRYPTION + fi->i_crypt_info = NULL; +#endif return &fi->vfs_inode; } @@ -428,8 +439,31 @@ static int f2fs_drop_inode(struct inode *inode) * - f2fs_gc -> iput -> evict * - inode_wait_for_writeback(inode) */ - if (!inode_unhashed(inode) && inode->i_state & I_SYNC) + if (!inode_unhashed(inode) && inode->i_state & I_SYNC) { + if (!inode->i_nlink && !is_bad_inode(inode)) { + spin_unlock(&inode->i_lock); + + /* some remained atomic pages should discarded */ + if (f2fs_is_atomic_file(inode)) + commit_inmem_pages(inode, true); + + sb_start_intwrite(inode->i_sb); + i_size_write(inode, 0); + + if (F2FS_HAS_BLOCKS(inode)) + f2fs_truncate(inode); + + sb_end_intwrite(inode->i_sb); + +#ifdef CONFIG_F2FS_FS_ENCRYPTION + if (F2FS_I(inode)->i_crypt_info) + f2fs_free_encryption_info(inode, + F2FS_I(inode)->i_crypt_info); +#endif + spin_lock(&inode->i_lock); + } return 0; + } return generic_drop_inode(inode); } @@ -520,7 +554,7 @@ int f2fs_sync_fs(struct super_block *sb, int sync) } else { f2fs_balance_fs(sbi); } - f2fs_trace_ios(NULL, NULL, 1); + f2fs_trace_ios(NULL, 1); return 0; } @@ -658,6 +692,22 @@ static const struct file_operations f2fs_seq_segment_info_fops = { .release = single_release, }; +static void default_options(struct f2fs_sb_info *sbi) +{ + /* init some FS parameters */ + sbi->active_logs = NR_CURSEG_TYPE; + + set_opt(sbi, BG_GC); + set_opt(sbi, INLINE_DATA); + +#ifdef CONFIG_F2FS_FS_XATTR + set_opt(sbi, XATTR_USER); +#endif +#ifdef CONFIG_F2FS_FS_POSIX_ACL + set_opt(sbi, POSIX_ACL); +#endif +} + static int f2fs_remount(struct super_block *sb, int *flags, char *data) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -676,7 +726,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) active_logs = sbi->active_logs; sbi->mount_opt.opt = 0; - sbi->active_logs = NR_CURSEG_TYPE; + default_options(sbi); /* parse mount options */ err = parse_options(sb, data); @@ -929,29 +979,36 @@ static void init_sb_info(struct f2fs_sb_info *sbi) */ static int read_raw_super_block(struct super_block *sb, struct f2fs_super_block **raw_super, - struct buffer_head **raw_super_buf) + struct buffer_head **raw_super_buf, + int *recovery) { int block = 0; + struct buffer_head *buffer; + struct f2fs_super_block *super; + int err = 0; retry: - *raw_super_buf = sb_bread(sb, block); - if (!*raw_super_buf) { + buffer = sb_bread(sb, block); + if (!buffer) { + *recovery = 1; f2fs_msg(sb, KERN_ERR, "Unable to read %dth superblock", block + 1); if (block == 0) { block++; goto retry; } else { - return -EIO; + err = -EIO; + goto out; } } - *raw_super = (struct f2fs_super_block *) - ((char *)(*raw_super_buf)->b_data + F2FS_SUPER_OFFSET); + super = (struct f2fs_super_block *) + ((char *)(buffer)->b_data + F2FS_SUPER_OFFSET); /* sanity checking of raw super */ - if (sanity_check_raw_super(sb, *raw_super)) { - brelse(*raw_super_buf); + if (sanity_check_raw_super(sb, super)) { + brelse(buffer); + *recovery = 1; f2fs_msg(sb, KERN_ERR, "Can't find valid F2FS filesystem in %dth superblock", block + 1); @@ -959,25 +1016,76 @@ retry: block++; goto retry; } else { - return -EINVAL; + err = -EINVAL; + goto out; } } + if (!*raw_super) { + *raw_super_buf = buffer; + *raw_super = super; + } else { + /* already have a valid superblock */ + brelse(buffer); + } + + /* check the validity of the second superblock */ + if (block == 0) { + block++; + goto retry; + } + +out: + /* No valid superblock */ + if (!*raw_super) + return err; + return 0; } +int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) +{ + struct buffer_head *sbh = sbi->raw_super_buf; + sector_t block = sbh->b_blocknr; + int err; + + /* write back-up superblock first */ + sbh->b_blocknr = block ? 0 : 1; + mark_buffer_dirty(sbh); + err = sync_dirty_buffer(sbh); + + sbh->b_blocknr = block; + + /* if we are in recovery path, skip writing valid superblock */ + if (recover || err) + goto out; + + /* write current valid superblock */ + mark_buffer_dirty(sbh); + err = sync_dirty_buffer(sbh); +out: + clear_buffer_write_io_error(sbh); + set_buffer_uptodate(sbh); + return err; +} + static int f2fs_fill_super(struct super_block *sb, void *data, int silent) { struct f2fs_sb_info *sbi; - struct f2fs_super_block *raw_super = NULL; + struct f2fs_super_block *raw_super; struct buffer_head *raw_super_buf; struct inode *root; - long err = -EINVAL; + long err; bool retry = true, need_fsck = false; char *options = NULL; - int i; + int recovery, i; try_onemore: + err = -EINVAL; + raw_super = NULL; + raw_super_buf = NULL; + recovery = 0; + /* allocate memory for f2fs-specific super block info */ sbi = kzalloc(sizeof(struct f2fs_sb_info), GFP_KERNEL); if (!sbi) @@ -989,23 +1097,12 @@ try_onemore: goto free_sbi; } - err = read_raw_super_block(sb, &raw_super, &raw_super_buf); + err = read_raw_super_block(sb, &raw_super, &raw_super_buf, &recovery); if (err) goto free_sbi; sb->s_fs_info = sbi; - /* init some FS parameters */ - sbi->active_logs = NR_CURSEG_TYPE; - - set_opt(sbi, BG_GC); - set_opt(sbi, INLINE_DATA); - -#ifdef CONFIG_F2FS_FS_XATTR - set_opt(sbi, XATTR_USER); -#endif -#ifdef CONFIG_F2FS_FS_POSIX_ACL - set_opt(sbi, POSIX_ACL); -#endif + default_options(sbi); /* parse mount options */ options = kstrdup((const char *)data, GFP_KERNEL); if (data && !options) { @@ -1148,14 +1245,6 @@ try_onemore: proc_create_data("segment_info", S_IRUGO, sbi->s_proc, &f2fs_seq_segment_info_fops, sb); - if (test_opt(sbi, DISCARD)) { - struct request_queue *q = bdev_get_queue(sb->s_bdev); - if (!blk_queue_discard(q)) - f2fs_msg(sb, KERN_WARNING, - "mounting with \"discard\" option, but " - "the device does not support discard"); - } - sbi->s_kobj.kset = f2fs_kset; init_completion(&sbi->s_kobj_unregister); err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL, @@ -1198,6 +1287,13 @@ try_onemore: goto free_kobj; } kfree(options); + + /* recover broken superblock */ + if (recovery && !f2fs_readonly(sb) && !bdev_read_only(sb->s_bdev)) { + f2fs_msg(sb, KERN_INFO, "Recover invalid superblock"); + f2fs_commit_super(sbi, true); + } + return 0; free_kobj: @@ -1305,13 +1401,18 @@ static int __init init_f2fs_fs(void) err = -ENOMEM; goto free_extent_cache; } - err = register_filesystem(&f2fs_fs_type); + err = f2fs_init_crypto(); if (err) goto free_kset; + err = register_filesystem(&f2fs_fs_type); + if (err) + goto free_crypto; f2fs_create_root_stats(); f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); return 0; +free_crypto: + f2fs_exit_crypto(); free_kset: kset_unregister(f2fs_kset); free_extent_cache: @@ -1333,6 +1434,7 @@ static void __exit exit_f2fs_fs(void) remove_proc_entry("fs/f2fs", NULL); f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); + f2fs_exit_crypto(); destroy_extent_cache(); destroy_checkpoint_caches(); destroy_segment_manager_caches(); diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c index 875aa8179..145fb659a 100644 --- a/fs/f2fs/trace.c +++ b/fs/f2fs/trace.c @@ -80,7 +80,7 @@ out: radix_tree_preload_end(); } -void f2fs_trace_ios(struct page *page, struct f2fs_io_info *fio, int flush) +void f2fs_trace_ios(struct f2fs_io_info *fio, int flush) { struct inode *inode; pid_t pid; @@ -91,8 +91,8 @@ void f2fs_trace_ios(struct page *page, struct f2fs_io_info *fio, int flush) return; } - inode = page->mapping->host; - pid = page_private(page); + inode = fio->page->mapping->host; + pid = page_private(fio->page); major = MAJOR(inode->i_sb->s_dev); minor = MINOR(inode->i_sb->s_dev); diff --git a/fs/f2fs/trace.h b/fs/f2fs/trace.h index 1041dbeb5..67db24ac1 100644 --- a/fs/f2fs/trace.h +++ b/fs/f2fs/trace.h @@ -33,12 +33,12 @@ struct last_io_info { }; extern void f2fs_trace_pid(struct page *); -extern void f2fs_trace_ios(struct page *, struct f2fs_io_info *, int); +extern void f2fs_trace_ios(struct f2fs_io_info *, int); extern void f2fs_build_trace_ios(void); extern void f2fs_destroy_trace_ios(void); #else #define f2fs_trace_pid(p) -#define f2fs_trace_ios(p, i, n) +#define f2fs_trace_ios(i, n) #define f2fs_build_trace_ios() #define f2fs_destroy_trace_ios() diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 9757f65a0..07449b980 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -584,6 +584,9 @@ static int __f2fs_setxattr(struct inode *inode, int index, inode->i_ctime = CURRENT_TIME; clear_inode_flag(fi, FI_ACL_MODE); } + if (index == F2FS_XATTR_INDEX_ENCRYPTION && + !strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT)) + f2fs_set_encrypted_inode(inode); if (ipage) update_inode(inode, ipage); diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index 969d792ca..71a7100d5 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -35,6 +35,10 @@ #define F2FS_XATTR_INDEX_LUSTRE 5 #define F2FS_XATTR_INDEX_SECURITY 6 #define F2FS_XATTR_INDEX_ADVISE 7 +/* Should be same as EXT4_XATTR_INDEX_ENCRYPTION */ +#define F2FS_XATTR_INDEX_ENCRYPTION 9 + +#define F2FS_XATTR_NAME_ENCRYPTION_CONTEXT "c" struct f2fs_xattr_header { __le32 h_magic; /* magic number for identification */ diff --git a/fs/fat/file.c b/fs/fat/file.c index 442d50a0e..a08f10399 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -11,6 +11,7 @@ #include <linux/compat.h> #include <linux/mount.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/fsnotify.h> #include <linux/security.h> #include "fat.h" diff --git a/fs/fat/inode.c b/fs/fat/inode.c index c06774658..509411dd3 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -18,6 +18,7 @@ #include <linux/parser.h> #include <linux/uio.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <asm/unaligned.h> #include "fat.h" @@ -147,6 +147,13 @@ static int expand_fdtable(struct files_struct *files, int nr) spin_unlock(&files->file_lock); new_fdt = alloc_fdtable(nr); + + /* make sure all __fd_install() have seen resize_in_progress + * or have finished their rcu_read_lock_sched() section. + */ + if (atomic_read(&files->count) > 1) + synchronize_sched(); + spin_lock(&files->file_lock); if (!new_fdt) return -ENOMEM; @@ -158,21 +165,14 @@ static int expand_fdtable(struct files_struct *files, int nr) __free_fdtable(new_fdt); return -EMFILE; } - /* - * Check again since another task may have expanded the fd table while - * we dropped the lock - */ cur_fdt = files_fdtable(files); - if (nr >= cur_fdt->max_fds) { - /* Continue as planned */ - copy_fdtable(new_fdt, cur_fdt); - rcu_assign_pointer(files->fdt, new_fdt); - if (cur_fdt != &files->fdtab) - call_rcu(&cur_fdt->rcu, free_fdtable_rcu); - } else { - /* Somebody else expanded, so undo our attempt */ - __free_fdtable(new_fdt); - } + BUG_ON(nr < cur_fdt->max_fds); + copy_fdtable(new_fdt, cur_fdt); + rcu_assign_pointer(files->fdt, new_fdt); + if (cur_fdt != &files->fdtab) + call_rcu(&cur_fdt->rcu, free_fdtable_rcu); + /* coupled with smp_rmb() in __fd_install() */ + smp_wmb(); return 1; } @@ -185,21 +185,38 @@ static int expand_fdtable(struct files_struct *files, int nr) * The files->file_lock should be held on entry, and will be held on exit. */ static int expand_files(struct files_struct *files, int nr) + __releases(files->file_lock) + __acquires(files->file_lock) { struct fdtable *fdt; + int expanded = 0; +repeat: fdt = files_fdtable(files); /* Do we need to expand? */ if (nr < fdt->max_fds) - return 0; + return expanded; /* Can we expand? */ if (nr >= sysctl_nr_open) return -EMFILE; + if (unlikely(files->resize_in_progress)) { + spin_unlock(&files->file_lock); + expanded = 1; + wait_event(files->resize_wait, !files->resize_in_progress); + spin_lock(&files->file_lock); + goto repeat; + } + /* All good, so we try */ - return expand_fdtable(files, nr); + files->resize_in_progress = true; + expanded = expand_fdtable(files, nr); + files->resize_in_progress = false; + + wake_up_all(&files->resize_wait); + return expanded; } static inline void __set_close_on_exec(int fd, struct fdtable *fdt) @@ -256,6 +273,8 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) atomic_set(&newf->count, 1); spin_lock_init(&newf->file_lock); + newf->resize_in_progress = false; + init_waitqueue_head(&newf->resize_wait); newf->next_fd = 0; new_fdt = &newf->fdtab; new_fdt->max_fds = NR_OPEN_DEFAULT; @@ -553,11 +572,21 @@ void __fd_install(struct files_struct *files, unsigned int fd, struct file *file) { struct fdtable *fdt; - spin_lock(&files->file_lock); - fdt = files_fdtable(files); + + might_sleep(); + rcu_read_lock_sched(); + + while (unlikely(files->resize_in_progress)) { + rcu_read_unlock_sched(); + wait_event(files->resize_wait, !files->resize_in_progress); + rcu_read_lock_sched(); + } + /* coupled with smp_wmb() in expand_fdtable() */ + smp_rmb(); + fdt = rcu_dereference_sched(files->fdt); BUG_ON(fdt->fd[fd] != NULL); rcu_assign_pointer(fdt->fd[fd], file); - spin_unlock(&files->file_lock); + rcu_read_unlock_sched(); } void fd_install(unsigned int fd, struct file *file) @@ -635,11 +664,17 @@ static struct file *__fget(unsigned int fd, fmode_t mask) struct file *file; rcu_read_lock(); +loop: file = fcheck_files(files, fd); if (file) { - /* File object ref couldn't be taken */ - if ((file->f_mode & mask) || !get_file_rcu(file)) + /* File object ref couldn't be taken. + * dup2() atomicity guarantee is the reason + * we loop to catch the new file (or NULL pointer) + */ + if (file->f_mode & mask) file = NULL; + else if (!get_file_rcu(file)) + goto loop; } rcu_read_unlock(); diff --git a/fs/file_table.c b/fs/file_table.c index 3cea02778..ad17e05eb 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -20,12 +20,12 @@ #include <linux/cdev.h> #include <linux/fsnotify.h> #include <linux/sysctl.h> -#include <linux/lglock.h> #include <linux/percpu_counter.h> #include <linux/percpu.h> #include <linux/hardirq.h> #include <linux/task_work.h> #include <linux/ima.h> +#include <linux/swap.h> #include <linux/atomic.h> @@ -147,7 +147,6 @@ over: } return ERR_PTR(-ENFILE); } -EXPORT_SYMBOL(get_empty_filp); /** * alloc_file - allocate and initialize a 'struct file' @@ -309,21 +308,25 @@ void put_filp(struct file *file) file_free(file); } } -EXPORT_SYMBOL(put_filp); -void __init files_init(unsigned long mempages) +void __init files_init(void) { - unsigned long n; - filp_cachep = kmem_cache_create("filp", sizeof(struct file), 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + percpu_counter_init(&nr_files, 0, GFP_KERNEL); +} - /* - * One file with associated inode and dcache is very roughly 1K. - * Per default don't use more than 10% of our memory for files. - */ +/* + * One file with associated inode and dcache is very roughly 1K. Per default + * do not use more than 10% of our memory for files. + */ +void __init files_maxfiles_init(void) +{ + unsigned long n; + unsigned long memreserve = (totalram_pages - nr_free_pages()) * 3/2; + + memreserve = min(memreserve, totalram_pages - 1); + n = ((totalram_pages - memreserve) * (PAGE_SIZE / 1024)) / 10; - n = (mempages * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = max_t(unsigned long, n, NR_FILE); - percpu_counter_init(&nr_files, 0, GFP_KERNEL); } diff --git a/fs/freevxfs/vxfs_extern.h b/fs/freevxfs/vxfs_extern.h index 881aa3d21..e3dcb4467 100644 --- a/fs/freevxfs/vxfs_extern.h +++ b/fs/freevxfs/vxfs_extern.h @@ -50,9 +50,6 @@ extern daddr_t vxfs_bmap1(struct inode *, long); /* vxfs_fshead.c */ extern int vxfs_read_fshead(struct super_block *); -/* vxfs_immed.c */ -extern const struct inode_operations vxfs_immed_symlink_iops; - /* vxfs_inode.c */ extern const struct address_space_operations vxfs_immed_aops; extern struct kmem_cache *vxfs_inode_cachep; diff --git a/fs/freevxfs/vxfs_immed.c b/fs/freevxfs/vxfs_immed.c index 8b9229e2c..cb84f0fcc 100644 --- a/fs/freevxfs/vxfs_immed.c +++ b/fs/freevxfs/vxfs_immed.c @@ -32,29 +32,15 @@ */ #include <linux/fs.h> #include <linux/pagemap.h> -#include <linux/namei.h> #include "vxfs.h" #include "vxfs_extern.h" #include "vxfs_inode.h" -static void * vxfs_immed_follow_link(struct dentry *, struct nameidata *); - static int vxfs_immed_readpage(struct file *, struct page *); /* - * Inode operations for immed symlinks. - * - * Unliked all other operations we do not go through the pagecache, - * but do all work directly on the inode. - */ -const struct inode_operations vxfs_immed_symlink_iops = { - .readlink = generic_readlink, - .follow_link = vxfs_immed_follow_link, -}; - -/* * Address space operations for immed files and directories. */ const struct address_space_operations vxfs_immed_aops = { @@ -62,26 +48,6 @@ const struct address_space_operations vxfs_immed_aops = { }; /** - * vxfs_immed_follow_link - follow immed symlink - * @dp: dentry for the link - * @np: pathname lookup data for the current path walk - * - * Description: - * vxfs_immed_follow_link restarts the pathname lookup with - * the data obtained from @dp. - * - * Returns: - * Zero on success, else a negative error code. - */ -static void * -vxfs_immed_follow_link(struct dentry *dp, struct nameidata *np) -{ - struct vxfs_inode_info *vip = VXFS_INO(d_inode(dp)); - nd_set_link(np, vip->vii_immed.vi_immed); - return NULL; -} - -/** * vxfs_immed_readpage - read part of an immed inode into pagecache * @file: file context (unused) * @page: page frame to fill in. diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c index 363e3ae25..ef73ed674 100644 --- a/fs/freevxfs/vxfs_inode.c +++ b/fs/freevxfs/vxfs_inode.c @@ -35,6 +35,7 @@ #include <linux/pagemap.h> #include <linux/kernel.h> #include <linux/slab.h> +#include <linux/namei.h> #include "vxfs.h" #include "vxfs_inode.h" @@ -327,8 +328,10 @@ vxfs_iget(struct super_block *sbp, ino_t ino) ip->i_op = &page_symlink_inode_operations; ip->i_mapping->a_ops = &vxfs_aops; } else { - ip->i_op = &vxfs_immed_symlink_iops; - vip->vii_immed.vi_immed[ip->i_size] = '\0'; + ip->i_op = &simple_symlink_inode_operations; + ip->i_link = vip->vii_immed.vi_immed; + nd_terminate_link(ip->i_link, ip->i_size, + sizeof(vip->vii_immed.vi_immed) - 1); } } else init_special_inode(ip, ip->i_mode, old_decode_dev(vip->vii_rdev)); diff --git a/fs/freevxfs/vxfs_lookup.c b/fs/freevxfs/vxfs_lookup.c index 99c7f0a37..484b32d32 100644 --- a/fs/freevxfs/vxfs_lookup.c +++ b/fs/freevxfs/vxfs_lookup.c @@ -61,13 +61,6 @@ const struct file_operations vxfs_dir_operations = { .iterate = vxfs_readdir, }; - -static inline u_long -dir_pages(struct inode *inode) -{ - return (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; -} - static inline u_long dir_blocks(struct inode *ip) { diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 32a8bbd7a..5fa588e93 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -27,6 +27,7 @@ #include <linux/backing-dev.h> #include <linux/tracepoint.h> #include <linux/device.h> +#include <linux/memcontrol.h> #include "internal.h" /* @@ -34,6 +35,10 @@ */ #define MIN_WRITEBACK_PAGES (4096UL >> (PAGE_CACHE_SHIFT - 10)) +struct wb_completion { + atomic_t cnt; +}; + /* * Passed into wb_writeback(), essentially a subset of writeback_control */ @@ -47,13 +52,29 @@ struct wb_writeback_work { unsigned int range_cyclic:1; unsigned int for_background:1; unsigned int for_sync:1; /* sync(2) WB_SYNC_ALL writeback */ + unsigned int auto_free:1; /* free on completion */ + unsigned int single_wait:1; + unsigned int single_done:1; enum wb_reason reason; /* why was writeback initiated? */ struct list_head list; /* pending work list */ - struct completion *done; /* set if the caller waits */ + struct wb_completion *done; /* set if the caller waits */ }; /* + * If one wants to wait for one or more wb_writeback_works, each work's + * ->done should be set to a wb_completion defined using the following + * macro. Once all work items are issued with wb_queue_work(), the caller + * can wait for the completion of all using wb_wait_for_completion(). Work + * items which are waited upon aren't freed automatically on completion. + */ +#define DEFINE_WB_COMPLETION_ONSTACK(cmpl) \ + struct wb_completion cmpl = { \ + .cnt = ATOMIC_INIT(1), \ + } + + +/* * If an inode is constantly having its pages dirtied, but then the * updates stop dirtytime_expire_interval seconds in the past, it's * possible for the worst case time between when an inode has its @@ -65,35 +86,6 @@ struct wb_writeback_work { */ unsigned int dirtytime_expire_interval = 12 * 60 * 60; -/** - * writeback_in_progress - determine whether there is writeback in progress - * @bdi: the device's backing_dev_info structure. - * - * Determine whether there is writeback waiting to be handled against a - * backing device. - */ -int writeback_in_progress(struct backing_dev_info *bdi) -{ - return test_bit(BDI_writeback_running, &bdi->state); -} -EXPORT_SYMBOL(writeback_in_progress); - -struct backing_dev_info *inode_to_bdi(struct inode *inode) -{ - struct super_block *sb; - - if (!inode) - return &noop_backing_dev_info; - - sb = inode->i_sb; -#ifdef CONFIG_BLOCK - if (sb_is_blkdev_sb(sb)) - return blk_get_backing_dev_info(I_BDEV(inode)); -#endif - return sb->s_bdi; -} -EXPORT_SYMBOL_GPL(inode_to_bdi); - static inline struct inode *wb_inode(struct list_head *head) { return list_entry(head, struct inode, i_wb_list); @@ -109,45 +101,831 @@ static inline struct inode *wb_inode(struct list_head *head) EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepage); -static void bdi_wakeup_thread(struct backing_dev_info *bdi) +static bool wb_io_lists_populated(struct bdi_writeback *wb) +{ + if (wb_has_dirty_io(wb)) { + return false; + } else { + set_bit(WB_has_dirty_io, &wb->state); + WARN_ON_ONCE(!wb->avg_write_bandwidth); + atomic_long_add(wb->avg_write_bandwidth, + &wb->bdi->tot_write_bandwidth); + return true; + } +} + +static void wb_io_lists_depopulated(struct bdi_writeback *wb) { - spin_lock_bh(&bdi->wb_lock); - if (test_bit(BDI_registered, &bdi->state)) - mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); - spin_unlock_bh(&bdi->wb_lock); + if (wb_has_dirty_io(wb) && list_empty(&wb->b_dirty) && + list_empty(&wb->b_io) && list_empty(&wb->b_more_io)) { + clear_bit(WB_has_dirty_io, &wb->state); + WARN_ON_ONCE(atomic_long_sub_return(wb->avg_write_bandwidth, + &wb->bdi->tot_write_bandwidth) < 0); + } } -static void bdi_queue_work(struct backing_dev_info *bdi, - struct wb_writeback_work *work) +/** + * inode_wb_list_move_locked - move an inode onto a bdi_writeback IO list + * @inode: inode to be moved + * @wb: target bdi_writeback + * @head: one of @wb->b_{dirty|io|more_io} + * + * Move @inode->i_wb_list to @list of @wb and set %WB_has_dirty_io. + * Returns %true if @inode is the first occupant of the !dirty_time IO + * lists; otherwise, %false. + */ +static bool inode_wb_list_move_locked(struct inode *inode, + struct bdi_writeback *wb, + struct list_head *head) { - trace_writeback_queue(bdi, work); + assert_spin_locked(&wb->list_lock); + + list_move(&inode->i_wb_list, head); - spin_lock_bh(&bdi->wb_lock); - if (!test_bit(BDI_registered, &bdi->state)) { - if (work->done) - complete(work->done); + /* dirty_time doesn't count as dirty_io until expiration */ + if (head != &wb->b_dirty_time) + return wb_io_lists_populated(wb); + + wb_io_lists_depopulated(wb); + return false; +} + +/** + * inode_wb_list_del_locked - remove an inode from its bdi_writeback IO list + * @inode: inode to be removed + * @wb: bdi_writeback @inode is being removed from + * + * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and + * clear %WB_has_dirty_io if all are empty afterwards. + */ +static void inode_wb_list_del_locked(struct inode *inode, + struct bdi_writeback *wb) +{ + assert_spin_locked(&wb->list_lock); + + list_del_init(&inode->i_wb_list); + wb_io_lists_depopulated(wb); +} + +static void wb_wakeup(struct bdi_writeback *wb) +{ + spin_lock_bh(&wb->work_lock); + if (test_bit(WB_registered, &wb->state)) + mod_delayed_work(bdi_wq, &wb->dwork, 0); + spin_unlock_bh(&wb->work_lock); +} + +static void wb_queue_work(struct bdi_writeback *wb, + struct wb_writeback_work *work) +{ + trace_writeback_queue(wb->bdi, work); + + spin_lock_bh(&wb->work_lock); + if (!test_bit(WB_registered, &wb->state)) { + if (work->single_wait) + work->single_done = 1; goto out_unlock; } - list_add_tail(&work->list, &bdi->work_list); - mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0); + if (work->done) + atomic_inc(&work->done->cnt); + list_add_tail(&work->list, &wb->work_list); + mod_delayed_work(bdi_wq, &wb->dwork, 0); out_unlock: - spin_unlock_bh(&bdi->wb_lock); + spin_unlock_bh(&wb->work_lock); +} + +/** + * wb_wait_for_completion - wait for completion of bdi_writeback_works + * @bdi: bdi work items were issued to + * @done: target wb_completion + * + * Wait for one or more work items issued to @bdi with their ->done field + * set to @done, which should have been defined with + * DEFINE_WB_COMPLETION_ONSTACK(). This function returns after all such + * work items are completed. Work items which are waited upon aren't freed + * automatically on completion. + */ +static void wb_wait_for_completion(struct backing_dev_info *bdi, + struct wb_completion *done) +{ + atomic_dec(&done->cnt); /* put down the initial count */ + wait_event(bdi->wb_waitq, !atomic_read(&done->cnt)); +} + +#ifdef CONFIG_CGROUP_WRITEBACK + +/* parameters for foreign inode detection, see wb_detach_inode() */ +#define WB_FRN_TIME_SHIFT 13 /* 1s = 2^13, upto 8 secs w/ 16bit */ +#define WB_FRN_TIME_AVG_SHIFT 3 /* avg = avg * 7/8 + new * 1/8 */ +#define WB_FRN_TIME_CUT_DIV 2 /* ignore rounds < avg / 2 */ +#define WB_FRN_TIME_PERIOD (2 * (1 << WB_FRN_TIME_SHIFT)) /* 2s */ + +#define WB_FRN_HIST_SLOTS 16 /* inode->i_wb_frn_history is 16bit */ +#define WB_FRN_HIST_UNIT (WB_FRN_TIME_PERIOD / WB_FRN_HIST_SLOTS) + /* each slot's duration is 2s / 16 */ +#define WB_FRN_HIST_THR_SLOTS (WB_FRN_HIST_SLOTS / 2) + /* if foreign slots >= 8, switch */ +#define WB_FRN_HIST_MAX_SLOTS (WB_FRN_HIST_THR_SLOTS / 2 + 1) + /* one round can affect upto 5 slots */ + +void __inode_attach_wb(struct inode *inode, struct page *page) +{ + struct backing_dev_info *bdi = inode_to_bdi(inode); + struct bdi_writeback *wb = NULL; + + if (inode_cgwb_enabled(inode)) { + struct cgroup_subsys_state *memcg_css; + + if (page) { + memcg_css = mem_cgroup_css_from_page(page); + wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); + } else { + /* must pin memcg_css, see wb_get_create() */ + memcg_css = task_get_css(current, memory_cgrp_id); + wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); + css_put(memcg_css); + } + } + + if (!wb) + wb = &bdi->wb; + + /* + * There may be multiple instances of this function racing to + * update the same inode. Use cmpxchg() to tell the winner. + */ + if (unlikely(cmpxchg(&inode->i_wb, NULL, wb))) + wb_put(wb); +} + +/** + * locked_inode_to_wb_and_lock_list - determine a locked inode's wb and lock it + * @inode: inode of interest with i_lock held + * + * Returns @inode's wb with its list_lock held. @inode->i_lock must be + * held on entry and is released on return. The returned wb is guaranteed + * to stay @inode's associated wb until its list_lock is released. + */ +static struct bdi_writeback * +locked_inode_to_wb_and_lock_list(struct inode *inode) + __releases(&inode->i_lock) + __acquires(&wb->list_lock) +{ + while (true) { + struct bdi_writeback *wb = inode_to_wb(inode); + + /* + * inode_to_wb() association is protected by both + * @inode->i_lock and @wb->list_lock but list_lock nests + * outside i_lock. Drop i_lock and verify that the + * association hasn't changed after acquiring list_lock. + */ + wb_get(wb); + spin_unlock(&inode->i_lock); + spin_lock(&wb->list_lock); + wb_put(wb); /* not gonna deref it anymore */ + + /* i_wb may have changed inbetween, can't use inode_to_wb() */ + if (likely(wb == inode->i_wb)) + return wb; /* @inode already has ref */ + + spin_unlock(&wb->list_lock); + cpu_relax(); + spin_lock(&inode->i_lock); + } +} + +/** + * inode_to_wb_and_lock_list - determine an inode's wb and lock it + * @inode: inode of interest + * + * Same as locked_inode_to_wb_and_lock_list() but @inode->i_lock isn't held + * on entry. + */ +static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode) + __acquires(&wb->list_lock) +{ + spin_lock(&inode->i_lock); + return locked_inode_to_wb_and_lock_list(inode); +} + +struct inode_switch_wbs_context { + struct inode *inode; + struct bdi_writeback *new_wb; + + struct rcu_head rcu_head; + struct work_struct work; +}; + +static void inode_switch_wbs_work_fn(struct work_struct *work) +{ + struct inode_switch_wbs_context *isw = + container_of(work, struct inode_switch_wbs_context, work); + struct inode *inode = isw->inode; + struct address_space *mapping = inode->i_mapping; + struct bdi_writeback *old_wb = inode->i_wb; + struct bdi_writeback *new_wb = isw->new_wb; + struct radix_tree_iter iter; + bool switched = false; + void **slot; + + /* + * By the time control reaches here, RCU grace period has passed + * since I_WB_SWITCH assertion and all wb stat update transactions + * between unlocked_inode_to_wb_begin/end() are guaranteed to be + * synchronizing against mapping->tree_lock. + * + * Grabbing old_wb->list_lock, inode->i_lock and mapping->tree_lock + * gives us exclusion against all wb related operations on @inode + * including IO list manipulations and stat updates. + */ + if (old_wb < new_wb) { + spin_lock(&old_wb->list_lock); + spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING); + } else { + spin_lock(&new_wb->list_lock); + spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING); + } + spin_lock(&inode->i_lock); + spin_lock_irq(&mapping->tree_lock); + + /* + * Once I_FREEING is visible under i_lock, the eviction path owns + * the inode and we shouldn't modify ->i_wb_list. + */ + if (unlikely(inode->i_state & I_FREEING)) + goto skip_switch; + + /* + * Count and transfer stats. Note that PAGECACHE_TAG_DIRTY points + * to possibly dirty pages while PAGECACHE_TAG_WRITEBACK points to + * pages actually under underwriteback. + */ + radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0, + PAGECACHE_TAG_DIRTY) { + struct page *page = radix_tree_deref_slot_protected(slot, + &mapping->tree_lock); + if (likely(page) && PageDirty(page)) { + __dec_wb_stat(old_wb, WB_RECLAIMABLE); + __inc_wb_stat(new_wb, WB_RECLAIMABLE); + } + } + + radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, 0, + PAGECACHE_TAG_WRITEBACK) { + struct page *page = radix_tree_deref_slot_protected(slot, + &mapping->tree_lock); + if (likely(page)) { + WARN_ON_ONCE(!PageWriteback(page)); + __dec_wb_stat(old_wb, WB_WRITEBACK); + __inc_wb_stat(new_wb, WB_WRITEBACK); + } + } + + wb_get(new_wb); + + /* + * Transfer to @new_wb's IO list if necessary. The specific list + * @inode was on is ignored and the inode is put on ->b_dirty which + * is always correct including from ->b_dirty_time. The transfer + * preserves @inode->dirtied_when ordering. + */ + if (!list_empty(&inode->i_wb_list)) { + struct inode *pos; + + inode_wb_list_del_locked(inode, old_wb); + inode->i_wb = new_wb; + list_for_each_entry(pos, &new_wb->b_dirty, i_wb_list) + if (time_after_eq(inode->dirtied_when, + pos->dirtied_when)) + break; + inode_wb_list_move_locked(inode, new_wb, pos->i_wb_list.prev); + } else { + inode->i_wb = new_wb; + } + + /* ->i_wb_frn updates may race wbc_detach_inode() but doesn't matter */ + inode->i_wb_frn_winner = 0; + inode->i_wb_frn_avg_time = 0; + inode->i_wb_frn_history = 0; + switched = true; +skip_switch: + /* + * Paired with load_acquire in unlocked_inode_to_wb_begin() and + * ensures that the new wb is visible if they see !I_WB_SWITCH. + */ + smp_store_release(&inode->i_state, inode->i_state & ~I_WB_SWITCH); + + spin_unlock_irq(&mapping->tree_lock); + spin_unlock(&inode->i_lock); + spin_unlock(&new_wb->list_lock); + spin_unlock(&old_wb->list_lock); + + if (switched) { + wb_wakeup(new_wb); + wb_put(old_wb); + } + wb_put(new_wb); + + iput(inode); + kfree(isw); } -static void -__bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, - bool range_cyclic, enum wb_reason reason) +static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head) +{ + struct inode_switch_wbs_context *isw = container_of(rcu_head, + struct inode_switch_wbs_context, rcu_head); + + /* needs to grab bh-unsafe locks, bounce to work item */ + INIT_WORK(&isw->work, inode_switch_wbs_work_fn); + schedule_work(&isw->work); +} + +/** + * inode_switch_wbs - change the wb association of an inode + * @inode: target inode + * @new_wb_id: ID of the new wb + * + * Switch @inode's wb association to the wb identified by @new_wb_id. The + * switching is performed asynchronously and may fail silently. + */ +static void inode_switch_wbs(struct inode *inode, int new_wb_id) +{ + struct backing_dev_info *bdi = inode_to_bdi(inode); + struct cgroup_subsys_state *memcg_css; + struct inode_switch_wbs_context *isw; + + /* noop if seems to be already in progress */ + if (inode->i_state & I_WB_SWITCH) + return; + + isw = kzalloc(sizeof(*isw), GFP_ATOMIC); + if (!isw) + return; + + /* find and pin the new wb */ + rcu_read_lock(); + memcg_css = css_from_id(new_wb_id, &memory_cgrp_subsys); + if (memcg_css) + isw->new_wb = wb_get_create(bdi, memcg_css, GFP_ATOMIC); + rcu_read_unlock(); + if (!isw->new_wb) + goto out_free; + + /* while holding I_WB_SWITCH, no one else can update the association */ + spin_lock(&inode->i_lock); + if (inode->i_state & (I_WB_SWITCH | I_FREEING) || + inode_to_wb(inode) == isw->new_wb) { + spin_unlock(&inode->i_lock); + goto out_free; + } + inode->i_state |= I_WB_SWITCH; + spin_unlock(&inode->i_lock); + + ihold(inode); + isw->inode = inode; + + /* + * In addition to synchronizing among switchers, I_WB_SWITCH tells + * the RCU protected stat update paths to grab the mapping's + * tree_lock so that stat transfer can synchronize against them. + * Let's continue after I_WB_SWITCH is guaranteed to be visible. + */ + call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn); + return; + +out_free: + if (isw->new_wb) + wb_put(isw->new_wb); + kfree(isw); +} + +/** + * wbc_attach_and_unlock_inode - associate wbc with target inode and unlock it + * @wbc: writeback_control of interest + * @inode: target inode + * + * @inode is locked and about to be written back under the control of @wbc. + * Record @inode's writeback context into @wbc and unlock the i_lock. On + * writeback completion, wbc_detach_inode() should be called. This is used + * to track the cgroup writeback context. + */ +void wbc_attach_and_unlock_inode(struct writeback_control *wbc, + struct inode *inode) +{ + if (!inode_cgwb_enabled(inode)) { + spin_unlock(&inode->i_lock); + return; + } + + wbc->wb = inode_to_wb(inode); + wbc->inode = inode; + + wbc->wb_id = wbc->wb->memcg_css->id; + wbc->wb_lcand_id = inode->i_wb_frn_winner; + wbc->wb_tcand_id = 0; + wbc->wb_bytes = 0; + wbc->wb_lcand_bytes = 0; + wbc->wb_tcand_bytes = 0; + + wb_get(wbc->wb); + spin_unlock(&inode->i_lock); + + /* + * A dying wb indicates that the memcg-blkcg mapping has changed + * and a new wb is already serving the memcg. Switch immediately. + */ + if (unlikely(wb_dying(wbc->wb))) + inode_switch_wbs(inode, wbc->wb_id); +} + +/** + * wbc_detach_inode - disassociate wbc from inode and perform foreign detection + * @wbc: writeback_control of the just finished writeback + * + * To be called after a writeback attempt of an inode finishes and undoes + * wbc_attach_and_unlock_inode(). Can be called under any context. + * + * As concurrent write sharing of an inode is expected to be very rare and + * memcg only tracks page ownership on first-use basis severely confining + * the usefulness of such sharing, cgroup writeback tracks ownership + * per-inode. While the support for concurrent write sharing of an inode + * is deemed unnecessary, an inode being written to by different cgroups at + * different points in time is a lot more common, and, more importantly, + * charging only by first-use can too readily lead to grossly incorrect + * behaviors (single foreign page can lead to gigabytes of writeback to be + * incorrectly attributed). + * + * To resolve this issue, cgroup writeback detects the majority dirtier of + * an inode and transfers the ownership to it. To avoid unnnecessary + * oscillation, the detection mechanism keeps track of history and gives + * out the switch verdict only if the foreign usage pattern is stable over + * a certain amount of time and/or writeback attempts. + * + * On each writeback attempt, @wbc tries to detect the majority writer + * using Boyer-Moore majority vote algorithm. In addition to the byte + * count from the majority voting, it also counts the bytes written for the + * current wb and the last round's winner wb (max of last round's current + * wb, the winner from two rounds ago, and the last round's majority + * candidate). Keeping track of the historical winner helps the algorithm + * to semi-reliably detect the most active writer even when it's not the + * absolute majority. + * + * Once the winner of the round is determined, whether the winner is + * foreign or not and how much IO time the round consumed is recorded in + * inode->i_wb_frn_history. If the amount of recorded foreign IO time is + * over a certain threshold, the switch verdict is given. + */ +void wbc_detach_inode(struct writeback_control *wbc) +{ + struct bdi_writeback *wb = wbc->wb; + struct inode *inode = wbc->inode; + unsigned long avg_time, max_bytes, max_time; + u16 history; + int max_id; + + if (!wb) + return; + + history = inode->i_wb_frn_history; + avg_time = inode->i_wb_frn_avg_time; + + /* pick the winner of this round */ + if (wbc->wb_bytes >= wbc->wb_lcand_bytes && + wbc->wb_bytes >= wbc->wb_tcand_bytes) { + max_id = wbc->wb_id; + max_bytes = wbc->wb_bytes; + } else if (wbc->wb_lcand_bytes >= wbc->wb_tcand_bytes) { + max_id = wbc->wb_lcand_id; + max_bytes = wbc->wb_lcand_bytes; + } else { + max_id = wbc->wb_tcand_id; + max_bytes = wbc->wb_tcand_bytes; + } + + /* + * Calculate the amount of IO time the winner consumed and fold it + * into the running average kept per inode. If the consumed IO + * time is lower than avag / WB_FRN_TIME_CUT_DIV, ignore it for + * deciding whether to switch or not. This is to prevent one-off + * small dirtiers from skewing the verdict. + */ + max_time = DIV_ROUND_UP((max_bytes >> PAGE_SHIFT) << WB_FRN_TIME_SHIFT, + wb->avg_write_bandwidth); + if (avg_time) + avg_time += (max_time >> WB_FRN_TIME_AVG_SHIFT) - + (avg_time >> WB_FRN_TIME_AVG_SHIFT); + else + avg_time = max_time; /* immediate catch up on first run */ + + if (max_time >= avg_time / WB_FRN_TIME_CUT_DIV) { + int slots; + + /* + * The switch verdict is reached if foreign wb's consume + * more than a certain proportion of IO time in a + * WB_FRN_TIME_PERIOD. This is loosely tracked by 16 slot + * history mask where each bit represents one sixteenth of + * the period. Determine the number of slots to shift into + * history from @max_time. + */ + slots = min(DIV_ROUND_UP(max_time, WB_FRN_HIST_UNIT), + (unsigned long)WB_FRN_HIST_MAX_SLOTS); + history <<= slots; + if (wbc->wb_id != max_id) + history |= (1U << slots) - 1; + + /* + * Switch if the current wb isn't the consistent winner. + * If there are multiple closely competing dirtiers, the + * inode may switch across them repeatedly over time, which + * is okay. The main goal is avoiding keeping an inode on + * the wrong wb for an extended period of time. + */ + if (hweight32(history) > WB_FRN_HIST_THR_SLOTS) + inode_switch_wbs(inode, max_id); + } + + /* + * Multiple instances of this function may race to update the + * following fields but we don't mind occassional inaccuracies. + */ + inode->i_wb_frn_winner = max_id; + inode->i_wb_frn_avg_time = min(avg_time, (unsigned long)U16_MAX); + inode->i_wb_frn_history = history; + + wb_put(wbc->wb); + wbc->wb = NULL; +} + +/** + * wbc_account_io - account IO issued during writeback + * @wbc: writeback_control of the writeback in progress + * @page: page being written out + * @bytes: number of bytes being written out + * + * @bytes from @page are about to written out during the writeback + * controlled by @wbc. Keep the book for foreign inode detection. See + * wbc_detach_inode(). + */ +void wbc_account_io(struct writeback_control *wbc, struct page *page, + size_t bytes) +{ + int id; + + /* + * pageout() path doesn't attach @wbc to the inode being written + * out. This is intentional as we don't want the function to block + * behind a slow cgroup. Ultimately, we want pageout() to kick off + * regular writeback instead of writing things out itself. + */ + if (!wbc->wb) + return; + + rcu_read_lock(); + id = mem_cgroup_css_from_page(page)->id; + rcu_read_unlock(); + + if (id == wbc->wb_id) { + wbc->wb_bytes += bytes; + return; + } + + if (id == wbc->wb_lcand_id) + wbc->wb_lcand_bytes += bytes; + + /* Boyer-Moore majority vote algorithm */ + if (!wbc->wb_tcand_bytes) + wbc->wb_tcand_id = id; + if (id == wbc->wb_tcand_id) + wbc->wb_tcand_bytes += bytes; + else + wbc->wb_tcand_bytes -= min(bytes, wbc->wb_tcand_bytes); +} +EXPORT_SYMBOL_GPL(wbc_account_io); + +/** + * inode_congested - test whether an inode is congested + * @inode: inode to test for congestion + * @cong_bits: mask of WB_[a]sync_congested bits to test + * + * Tests whether @inode is congested. @cong_bits is the mask of congestion + * bits to test and the return value is the mask of set bits. + * + * If cgroup writeback is enabled for @inode, the congestion state is + * determined by whether the cgwb (cgroup bdi_writeback) for the blkcg + * associated with @inode is congested; otherwise, the root wb's congestion + * state is used. + */ +int inode_congested(struct inode *inode, int cong_bits) +{ + /* + * Once set, ->i_wb never becomes NULL while the inode is alive. + * Start transaction iff ->i_wb is visible. + */ + if (inode && inode_to_wb_is_valid(inode)) { + struct bdi_writeback *wb; + bool locked, congested; + + wb = unlocked_inode_to_wb_begin(inode, &locked); + congested = wb_congested(wb, cong_bits); + unlocked_inode_to_wb_end(inode, locked); + return congested; + } + + return wb_congested(&inode_to_bdi(inode)->wb, cong_bits); +} +EXPORT_SYMBOL_GPL(inode_congested); + +/** + * wb_wait_for_single_work - wait for completion of a single bdi_writeback_work + * @bdi: bdi the work item was issued to + * @work: work item to wait for + * + * Wait for the completion of @work which was issued to one of @bdi's + * bdi_writeback's. The caller must have set @work->single_wait before + * issuing it. This wait operates independently fo + * wb_wait_for_completion() and also disables automatic freeing of @work. + */ +static void wb_wait_for_single_work(struct backing_dev_info *bdi, + struct wb_writeback_work *work) +{ + if (WARN_ON_ONCE(!work->single_wait)) + return; + + wait_event(bdi->wb_waitq, work->single_done); + + /* + * Paired with smp_wmb() in wb_do_writeback() and ensures that all + * modifications to @work prior to assertion of ->single_done is + * visible to the caller once this function returns. + */ + smp_rmb(); +} + +/** + * wb_split_bdi_pages - split nr_pages to write according to bandwidth + * @wb: target bdi_writeback to split @nr_pages to + * @nr_pages: number of pages to write for the whole bdi + * + * Split @wb's portion of @nr_pages according to @wb's write bandwidth in + * relation to the total write bandwidth of all wb's w/ dirty inodes on + * @wb->bdi. + */ +static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages) +{ + unsigned long this_bw = wb->avg_write_bandwidth; + unsigned long tot_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth); + + if (nr_pages == LONG_MAX) + return LONG_MAX; + + /* + * This may be called on clean wb's and proportional distribution + * may not make sense, just use the original @nr_pages in those + * cases. In general, we wanna err on the side of writing more. + */ + if (!tot_bw || this_bw >= tot_bw) + return nr_pages; + else + return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw); +} + +/** + * wb_clone_and_queue_work - clone a wb_writeback_work and issue it to a wb + * @wb: target bdi_writeback + * @base_work: source wb_writeback_work + * + * Try to make a clone of @base_work and issue it to @wb. If cloning + * succeeds, %true is returned; otherwise, @base_work is issued directly + * and %false is returned. In the latter case, the caller is required to + * wait for @base_work's completion using wb_wait_for_single_work(). + * + * A clone is auto-freed on completion. @base_work never is. + */ +static bool wb_clone_and_queue_work(struct bdi_writeback *wb, + struct wb_writeback_work *base_work) { struct wb_writeback_work *work; + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (work) { + *work = *base_work; + work->auto_free = 1; + work->single_wait = 0; + } else { + work = base_work; + work->auto_free = 0; + work->single_wait = 1; + } + work->single_done = 0; + wb_queue_work(wb, work); + return work != base_work; +} + +/** + * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi + * @bdi: target backing_dev_info + * @base_work: wb_writeback_work to issue + * @skip_if_busy: skip wb's which already have writeback in progress + * + * Split and issue @base_work to all wb's (bdi_writeback's) of @bdi which + * have dirty inodes. If @base_work->nr_page isn't %LONG_MAX, it's + * distributed to the busy wbs according to each wb's proportion in the + * total active write bandwidth of @bdi. + */ +static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, + struct wb_writeback_work *base_work, + bool skip_if_busy) +{ + long nr_pages = base_work->nr_pages; + int next_blkcg_id = 0; + struct bdi_writeback *wb; + struct wb_iter iter; + + might_sleep(); +restart: + rcu_read_lock(); + bdi_for_each_wb(wb, bdi, &iter, next_blkcg_id) { + /* SYNC_ALL writes out I_DIRTY_TIME too */ + if (!wb_has_dirty_io(wb) && + (base_work->sync_mode == WB_SYNC_NONE || + list_empty(&wb->b_dirty_time))) + continue; + if (skip_if_busy && writeback_in_progress(wb)) + continue; + + base_work->nr_pages = wb_split_bdi_pages(wb, nr_pages); + if (!wb_clone_and_queue_work(wb, base_work)) { + next_blkcg_id = wb->blkcg_css->id + 1; + rcu_read_unlock(); + wb_wait_for_single_work(bdi, base_work); + goto restart; + } + } + rcu_read_unlock(); +} + +#else /* CONFIG_CGROUP_WRITEBACK */ + +static struct bdi_writeback * +locked_inode_to_wb_and_lock_list(struct inode *inode) + __releases(&inode->i_lock) + __acquires(&wb->list_lock) +{ + struct bdi_writeback *wb = inode_to_wb(inode); + + spin_unlock(&inode->i_lock); + spin_lock(&wb->list_lock); + return wb; +} + +static struct bdi_writeback *inode_to_wb_and_lock_list(struct inode *inode) + __acquires(&wb->list_lock) +{ + struct bdi_writeback *wb = inode_to_wb(inode); + + spin_lock(&wb->list_lock); + return wb; +} + +static long wb_split_bdi_pages(struct bdi_writeback *wb, long nr_pages) +{ + return nr_pages; +} + +static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, + struct wb_writeback_work *base_work, + bool skip_if_busy) +{ + might_sleep(); + + if (!skip_if_busy || !writeback_in_progress(&bdi->wb)) { + base_work->auto_free = 0; + base_work->single_wait = 0; + base_work->single_done = 0; + wb_queue_work(&bdi->wb, base_work); + } +} + +#endif /* CONFIG_CGROUP_WRITEBACK */ + +void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, + bool range_cyclic, enum wb_reason reason) +{ + struct wb_writeback_work *work; + + if (!wb_has_dirty_io(wb)) + return; + /* * This is WB_SYNC_NONE writeback, so if allocation fails just * wakeup the thread for old dirty data writeback */ work = kzalloc(sizeof(*work), GFP_ATOMIC); if (!work) { - trace_writeback_nowork(bdi); - bdi_wakeup_thread(bdi); + trace_writeback_nowork(wb->bdi); + wb_wakeup(wb); return; } @@ -155,46 +933,29 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, work->nr_pages = nr_pages; work->range_cyclic = range_cyclic; work->reason = reason; + work->auto_free = 1; - bdi_queue_work(bdi, work); -} - -/** - * bdi_start_writeback - start writeback - * @bdi: the backing device to write from - * @nr_pages: the number of pages to write - * @reason: reason why some writeback work was initiated - * - * Description: - * This does WB_SYNC_NONE opportunistic writeback. The IO is only - * started when this function returns, we make no guarantees on - * completion. Caller need not hold sb s_umount semaphore. - * - */ -void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, - enum wb_reason reason) -{ - __bdi_start_writeback(bdi, nr_pages, true, reason); + wb_queue_work(wb, work); } /** - * bdi_start_background_writeback - start background writeback - * @bdi: the backing device to write from + * wb_start_background_writeback - start background writeback + * @wb: bdi_writback to write from * * Description: * This makes sure WB_SYNC_NONE background writeback happens. When - * this function returns, it is only guaranteed that for given BDI + * this function returns, it is only guaranteed that for given wb * some IO is happening if we are over background dirty threshold. * Caller need not hold sb s_umount semaphore. */ -void bdi_start_background_writeback(struct backing_dev_info *bdi) +void wb_start_background_writeback(struct bdi_writeback *wb) { /* * We just wake up the flusher thread. It will perform background * writeback as soon as there is no other work to do. */ - trace_writeback_wake_background(bdi); - bdi_wakeup_thread(bdi); + trace_writeback_wake_background(wb->bdi); + wb_wakeup(wb); } /* @@ -202,11 +963,11 @@ void bdi_start_background_writeback(struct backing_dev_info *bdi) */ void inode_wb_list_del(struct inode *inode) { - struct backing_dev_info *bdi = inode_to_bdi(inode); + struct bdi_writeback *wb; - spin_lock(&bdi->wb.list_lock); - list_del_init(&inode->i_wb_list); - spin_unlock(&bdi->wb.list_lock); + wb = inode_to_wb_and_lock_list(inode); + inode_wb_list_del_locked(inode, wb); + spin_unlock(&wb->list_lock); } /* @@ -220,7 +981,6 @@ void inode_wb_list_del(struct inode *inode) */ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) { - assert_spin_locked(&wb->list_lock); if (!list_empty(&wb->b_dirty)) { struct inode *tail; @@ -228,7 +988,7 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) if (time_before(inode->dirtied_when, tail->dirtied_when)) inode->dirtied_when = jiffies; } - list_move(&inode->i_wb_list, &wb->b_dirty); + inode_wb_list_move_locked(inode, wb, &wb->b_dirty); } /* @@ -236,8 +996,7 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) */ static void requeue_io(struct inode *inode, struct bdi_writeback *wb) { - assert_spin_locked(&wb->list_lock); - list_move(&inode->i_wb_list, &wb->b_more_io); + inode_wb_list_move_locked(inode, wb, &wb->b_more_io); } static void inode_sync_complete(struct inode *inode) @@ -346,6 +1105,8 @@ static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work) moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work); moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io, EXPIRE_DIRTY_ATIME, work); + if (moved) + wb_io_lists_populated(wb); trace_writeback_queue_io(wb, work, moved); } @@ -471,10 +1232,10 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, redirty_tail(inode, wb); } else if (inode->i_state & I_DIRTY_TIME) { inode->dirtied_when = jiffies; - list_move(&inode->i_wb_list, &wb->b_dirty_time); + inode_wb_list_move_locked(inode, wb, &wb->b_dirty_time); } else { /* The inode is clean. Remove from writeback lists. */ - list_del_init(&inode->i_wb_list); + inode_wb_list_del_locked(inode, wb); } } @@ -605,10 +1366,11 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK))) goto out; inode->i_state |= I_SYNC; - spin_unlock(&inode->i_lock); + wbc_attach_and_unlock_inode(wbc, inode); ret = __writeback_single_inode(inode, wbc); + wbc_detach_inode(wbc); spin_lock(&wb->list_lock); spin_lock(&inode->i_lock); /* @@ -616,7 +1378,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, * touch it. See comment above for explanation. */ if (!(inode->i_state & I_DIRTY_ALL)) - list_del_init(&inode->i_wb_list); + inode_wb_list_del_locked(inode, wb); spin_unlock(&wb->list_lock); inode_sync_complete(inode); out: @@ -624,7 +1386,7 @@ out: return ret; } -static long writeback_chunk_size(struct backing_dev_info *bdi, +static long writeback_chunk_size(struct bdi_writeback *wb, struct wb_writeback_work *work) { long pages; @@ -645,8 +1407,8 @@ static long writeback_chunk_size(struct backing_dev_info *bdi, if (work->sync_mode == WB_SYNC_ALL || work->tagged_writepages) pages = LONG_MAX; else { - pages = min(bdi->avg_write_bandwidth / 2, - global_dirty_limit / DIRTY_SCOPE); + pages = min(wb->avg_write_bandwidth / 2, + global_wb_domain.dirty_limit / DIRTY_SCOPE); pages = min(pages, work->nr_pages); pages = round_down(pages + MIN_WRITEBACK_PAGES, MIN_WRITEBACK_PAGES); @@ -741,9 +1503,9 @@ static long writeback_sb_inodes(struct super_block *sb, continue; } inode->i_state |= I_SYNC; - spin_unlock(&inode->i_lock); + wbc_attach_and_unlock_inode(&wbc, inode); - write_chunk = writeback_chunk_size(wb->bdi, work); + write_chunk = writeback_chunk_size(wb, work); wbc.nr_to_write = write_chunk; wbc.pages_skipped = 0; @@ -753,6 +1515,7 @@ static long writeback_sb_inodes(struct super_block *sb, */ __writeback_single_inode(inode, &wbc); + wbc_detach_inode(&wbc); work->nr_pages -= write_chunk - wbc.nr_to_write; wrote += write_chunk - wbc.nr_to_write; spin_lock(&wb->list_lock); @@ -830,33 +1593,6 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, return nr_pages - work.nr_pages; } -static bool over_bground_thresh(struct backing_dev_info *bdi) -{ - unsigned long background_thresh, dirty_thresh; - - global_dirty_limits(&background_thresh, &dirty_thresh); - - if (global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS) > background_thresh) - return true; - - if (bdi_stat(bdi, BDI_RECLAIMABLE) > - bdi_dirty_limit(bdi, background_thresh)) - return true; - - return false; -} - -/* - * Called under wb->list_lock. If there are multiple wb per bdi, - * only the flusher working on the first wb should do it. - */ -static void wb_update_bandwidth(struct bdi_writeback *wb, - unsigned long start_time) -{ - __bdi_update_bandwidth(wb->bdi, 0, 0, 0, 0, 0, start_time); -} - /* * Explicit flushing or periodic writeback of "old" data. * @@ -899,14 +1635,14 @@ static long wb_writeback(struct bdi_writeback *wb, * after the other works are all done. */ if ((work->for_background || work->for_kupdate) && - !list_empty(&wb->bdi->work_list)) + !list_empty(&wb->work_list)) break; /* * For background writeout, stop when we are below the * background dirty threshold */ - if (work->for_background && !over_bground_thresh(wb->bdi)) + if (work->for_background && !wb_over_bg_thresh(wb)) break; /* @@ -970,18 +1706,17 @@ static long wb_writeback(struct bdi_writeback *wb, /* * Return the next wb_writeback_work struct that hasn't been processed yet. */ -static struct wb_writeback_work * -get_next_work_item(struct backing_dev_info *bdi) +static struct wb_writeback_work *get_next_work_item(struct bdi_writeback *wb) { struct wb_writeback_work *work = NULL; - spin_lock_bh(&bdi->wb_lock); - if (!list_empty(&bdi->work_list)) { - work = list_entry(bdi->work_list.next, + spin_lock_bh(&wb->work_lock); + if (!list_empty(&wb->work_list)) { + work = list_entry(wb->work_list.next, struct wb_writeback_work, list); list_del_init(&work->list); } - spin_unlock_bh(&bdi->wb_lock); + spin_unlock_bh(&wb->work_lock); return work; } @@ -998,7 +1733,7 @@ static unsigned long get_nr_dirty_pages(void) static long wb_check_background_flush(struct bdi_writeback *wb) { - if (over_bground_thresh(wb->bdi)) { + if (wb_over_bg_thresh(wb)) { struct wb_writeback_work work = { .nr_pages = LONG_MAX, @@ -1053,25 +1788,33 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) */ static long wb_do_writeback(struct bdi_writeback *wb) { - struct backing_dev_info *bdi = wb->bdi; struct wb_writeback_work *work; long wrote = 0; - set_bit(BDI_writeback_running, &wb->bdi->state); - while ((work = get_next_work_item(bdi)) != NULL) { + set_bit(WB_writeback_running, &wb->state); + while ((work = get_next_work_item(wb)) != NULL) { + struct wb_completion *done = work->done; + bool need_wake_up = false; - trace_writeback_exec(bdi, work); + trace_writeback_exec(wb->bdi, work); wrote += wb_writeback(wb, work); - /* - * Notify the caller of completion if this is a synchronous - * work item, otherwise just free it. - */ - if (work->done) - complete(work->done); - else + if (work->single_wait) { + WARN_ON_ONCE(work->auto_free); + /* paired w/ rmb in wb_wait_for_single_work() */ + smp_wmb(); + work->single_done = 1; + need_wake_up = true; + } else if (work->auto_free) { kfree(work); + } + + if (done && atomic_dec_and_test(&done->cnt)) + need_wake_up = true; + + if (need_wake_up) + wake_up_all(&wb->bdi->wb_waitq); } /* @@ -1079,7 +1822,7 @@ static long wb_do_writeback(struct bdi_writeback *wb) */ wrote += wb_check_old_data_flush(wb); wrote += wb_check_background_flush(wb); - clear_bit(BDI_writeback_running, &wb->bdi->state); + clear_bit(WB_writeback_running, &wb->state); return wrote; } @@ -1088,43 +1831,42 @@ static long wb_do_writeback(struct bdi_writeback *wb) * Handle writeback of dirty data for the device backed by this bdi. Also * reschedules periodically and does kupdated style flushing. */ -void bdi_writeback_workfn(struct work_struct *work) +void wb_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(to_delayed_work(work), struct bdi_writeback, dwork); - struct backing_dev_info *bdi = wb->bdi; long pages_written; - set_worker_desc("flush-%s", dev_name(bdi->dev)); + set_worker_desc("flush-%s", dev_name(wb->bdi->dev)); current->flags |= PF_SWAPWRITE; if (likely(!current_is_workqueue_rescuer() || - !test_bit(BDI_registered, &bdi->state))) { + !test_bit(WB_registered, &wb->state))) { /* - * The normal path. Keep writing back @bdi until its + * The normal path. Keep writing back @wb until its * work_list is empty. Note that this path is also taken - * if @bdi is shutting down even when we're running off the + * if @wb is shutting down even when we're running off the * rescuer as work_list needs to be drained. */ do { pages_written = wb_do_writeback(wb); trace_writeback_pages_written(pages_written); - } while (!list_empty(&bdi->work_list)); + } while (!list_empty(&wb->work_list)); } else { /* * bdi_wq can't get enough workers and we're running off * the emergency worker. Don't hog it. Hopefully, 1024 is * enough for efficient IO. */ - pages_written = writeback_inodes_wb(&bdi->wb, 1024, + pages_written = writeback_inodes_wb(wb, 1024, WB_REASON_FORKER_THREAD); trace_writeback_pages_written(pages_written); } - if (!list_empty(&bdi->work_list)) + if (!list_empty(&wb->work_list)) mod_delayed_work(bdi_wq, &wb->dwork, 0); else if (wb_has_dirty_io(wb) && dirty_writeback_interval) - bdi_wakeup_thread_delayed(bdi); + wb_wakeup_delayed(wb); current->flags &= ~PF_SWAPWRITE; } @@ -1142,9 +1884,15 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason) rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { + struct bdi_writeback *wb; + struct wb_iter iter; + if (!bdi_has_dirty_io(bdi)) continue; - __bdi_start_writeback(bdi, nr_pages, false, reason); + + bdi_for_each_wb(wb, bdi, &iter, 0) + wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages), + false, reason); } rcu_read_unlock(); } @@ -1173,9 +1921,12 @@ static void wakeup_dirtytime_writeback(struct work_struct *w) rcu_read_lock(); list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) { - if (list_empty(&bdi->wb.b_dirty_time)) - continue; - bdi_wakeup_thread(bdi); + struct bdi_writeback *wb; + struct wb_iter iter; + + bdi_for_each_wb(wb, bdi, &iter, 0) + if (!list_empty(&bdi->wb.b_dirty_time)) + wb_wakeup(&bdi->wb); } rcu_read_unlock(); schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ); @@ -1249,7 +2000,6 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode) void __mark_inode_dirty(struct inode *inode, int flags) { struct super_block *sb = inode->i_sb; - struct backing_dev_info *bdi = NULL; int dirtytime; trace_writeback_mark_inode_dirty(inode, flags); @@ -1289,6 +2039,8 @@ void __mark_inode_dirty(struct inode *inode, int flags) if ((inode->i_state & flags) != flags) { const int was_dirty = inode->i_state & I_DIRTY; + inode_attach_wb(inode, NULL); + if (flags & I_DIRTY_INODE) inode->i_state &= ~I_DIRTY_TIME; inode->i_state |= flags; @@ -1317,38 +2069,39 @@ void __mark_inode_dirty(struct inode *inode, int flags) * reposition it (that would break b_dirty time-ordering). */ if (!was_dirty) { + struct bdi_writeback *wb; + struct list_head *dirty_list; bool wakeup_bdi = false; - bdi = inode_to_bdi(inode); - spin_unlock(&inode->i_lock); - spin_lock(&bdi->wb.list_lock); - if (bdi_cap_writeback_dirty(bdi)) { - WARN(!test_bit(BDI_registered, &bdi->state), - "bdi-%s not registered\n", bdi->name); + wb = locked_inode_to_wb_and_lock_list(inode); - /* - * If this is the first dirty inode for this - * bdi, we have to wake-up the corresponding - * bdi thread to make sure background - * write-back happens later. - */ - if (!wb_has_dirty_io(&bdi->wb)) - wakeup_bdi = true; - } + WARN(bdi_cap_writeback_dirty(wb->bdi) && + !test_bit(WB_registered, &wb->state), + "bdi-%s not registered\n", wb->bdi->name); inode->dirtied_when = jiffies; if (dirtytime) inode->dirtied_time_when = jiffies; + if (inode->i_state & (I_DIRTY_INODE | I_DIRTY_PAGES)) - list_move(&inode->i_wb_list, &bdi->wb.b_dirty); + dirty_list = &wb->b_dirty; else - list_move(&inode->i_wb_list, - &bdi->wb.b_dirty_time); - spin_unlock(&bdi->wb.list_lock); + dirty_list = &wb->b_dirty_time; + + wakeup_bdi = inode_wb_list_move_locked(inode, wb, + dirty_list); + + spin_unlock(&wb->list_lock); trace_writeback_dirty_inode_enqueue(inode); - if (wakeup_bdi) - bdi_wakeup_thread_delayed(bdi); + /* + * If this is the first dirty inode for this bdi, + * we have to wake-up the corresponding bdi thread + * to make sure background write-back happens + * later. + */ + if (bdi_cap_writeback_dirty(wb->bdi) && wakeup_bdi) + wb_wakeup_delayed(wb); return; } } @@ -1411,6 +2164,28 @@ static void wait_sb_inodes(struct super_block *sb) iput(old_inode); } +static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, + enum wb_reason reason, bool skip_if_busy) +{ + DEFINE_WB_COMPLETION_ONSTACK(done); + struct wb_writeback_work work = { + .sb = sb, + .sync_mode = WB_SYNC_NONE, + .tagged_writepages = 1, + .done = &done, + .nr_pages = nr, + .reason = reason, + }; + struct backing_dev_info *bdi = sb->s_bdi; + + if (!bdi_has_dirty_io(bdi) || bdi == &noop_backing_dev_info) + return; + WARN_ON(!rwsem_is_locked(&sb->s_umount)); + + bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy); + wb_wait_for_completion(bdi, &done); +} + /** * writeback_inodes_sb_nr - writeback dirty inodes from given super_block * @sb: the superblock @@ -1425,21 +2200,7 @@ void writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, enum wb_reason reason) { - DECLARE_COMPLETION_ONSTACK(done); - struct wb_writeback_work work = { - .sb = sb, - .sync_mode = WB_SYNC_NONE, - .tagged_writepages = 1, - .done = &done, - .nr_pages = nr, - .reason = reason, - }; - - if (sb->s_bdi == &noop_backing_dev_info) - return; - WARN_ON(!rwsem_is_locked(&sb->s_umount)); - bdi_queue_work(sb->s_bdi, &work); - wait_for_completion(&done); + __writeback_inodes_sb_nr(sb, nr, reason, false); } EXPORT_SYMBOL(writeback_inodes_sb_nr); @@ -1467,19 +2228,15 @@ EXPORT_SYMBOL(writeback_inodes_sb); * Invoke writeback_inodes_sb_nr if no writeback is currently underway. * Returns 1 if writeback was started, 0 if not. */ -int try_to_writeback_inodes_sb_nr(struct super_block *sb, - unsigned long nr, - enum wb_reason reason) +bool try_to_writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, + enum wb_reason reason) { - if (writeback_in_progress(sb->s_bdi)) - return 1; - if (!down_read_trylock(&sb->s_umount)) - return 0; + return false; - writeback_inodes_sb_nr(sb, nr, reason); + __writeback_inodes_sb_nr(sb, nr, reason, true); up_read(&sb->s_umount); - return 1; + return true; } EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr); @@ -1491,7 +2248,7 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb_nr); * Implement by try_to_writeback_inodes_sb_nr() * Returns 1 if writeback was started, 0 if not. */ -int try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) +bool try_to_writeback_inodes_sb(struct super_block *sb, enum wb_reason reason) { return try_to_writeback_inodes_sb_nr(sb, get_nr_dirty_pages(), reason); } @@ -1506,7 +2263,7 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb); */ void sync_inodes_sb(struct super_block *sb) { - DECLARE_COMPLETION_ONSTACK(done); + DEFINE_WB_COMPLETION_ONSTACK(done); struct wb_writeback_work work = { .sb = sb, .sync_mode = WB_SYNC_ALL, @@ -1516,14 +2273,19 @@ void sync_inodes_sb(struct super_block *sb) .reason = WB_REASON_SYNC, .for_sync = 1, }; + struct backing_dev_info *bdi = sb->s_bdi; - /* Nothing to do? */ - if (sb->s_bdi == &noop_backing_dev_info) + /* + * Can't skip on !bdi_has_dirty() because we should wait for !dirty + * inodes under writeback and I_DIRTY_TIME inodes ignored by + * bdi_has_dirty() need to be written out too. + */ + if (bdi == &noop_backing_dev_info) return; WARN_ON(!rwsem_is_locked(&sb->s_umount)); - bdi_queue_work(sb->s_bdi, &work); - wait_for_completion(&done); + bdi_split_work_to_wbs(bdi, &work, false); + wb_wait_for_completion(bdi, &done); wait_sb_inodes(sb); } diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 89acec742..d403c69be 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -327,7 +327,8 @@ static int fscache_alloc_object(struct fscache_cache *cache, object_already_extant: ret = -ENOBUFS; - if (fscache_object_is_dead(object)) { + if (fscache_object_is_dying(object) || + fscache_cache_is_broken(object)) { spin_unlock(&cookie->lock); goto error; } @@ -671,7 +672,7 @@ int __fscache_check_consistency(struct fscache_cookie *cookie) if (!op) return -ENOMEM; - fscache_operation_init(op, NULL, NULL); + fscache_operation_init(op, NULL, NULL, NULL); op->flags = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING) | (1 << FSCACHE_OP_UNUSE_COOKIE); @@ -695,8 +696,7 @@ int __fscache_check_consistency(struct fscache_cookie *cookie) /* the work queue now carries its own ref on the object */ spin_unlock(&cookie->lock); - ret = fscache_wait_for_operation_activation(object, op, - NULL, NULL, NULL); + ret = fscache_wait_for_operation_activation(object, op, NULL, NULL); if (ret == 0) { /* ask the cache to honour the operation */ ret = object->cache->ops->check_consistency(op); diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 7872a62ef..97ec45110 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -124,8 +124,7 @@ extern int fscache_submit_exclusive_op(struct fscache_object *, struct fscache_operation *); extern int fscache_submit_op(struct fscache_object *, struct fscache_operation *); -extern int fscache_cancel_op(struct fscache_operation *, - void (*)(struct fscache_operation *)); +extern int fscache_cancel_op(struct fscache_operation *, bool); extern void fscache_cancel_all_ops(struct fscache_object *); extern void fscache_abort_object(struct fscache_object *); extern void fscache_start_operations(struct fscache_object *); @@ -138,8 +137,7 @@ extern int fscache_wait_for_deferred_lookup(struct fscache_cookie *); extern int fscache_wait_for_operation_activation(struct fscache_object *, struct fscache_operation *, atomic_t *, - atomic_t *, - void (*)(struct fscache_operation *)); + atomic_t *); extern void fscache_invalidate_writes(struct fscache_cookie *); /* @@ -164,6 +162,7 @@ extern atomic_t fscache_n_op_pend; extern atomic_t fscache_n_op_run; extern atomic_t fscache_n_op_enqueue; extern atomic_t fscache_n_op_deferred_release; +extern atomic_t fscache_n_op_initialised; extern atomic_t fscache_n_op_release; extern atomic_t fscache_n_op_gc; extern atomic_t fscache_n_op_cancelled; @@ -271,6 +270,11 @@ extern atomic_t fscache_n_cop_write_page; extern atomic_t fscache_n_cop_uncache_page; extern atomic_t fscache_n_cop_dissociate_pages; +extern atomic_t fscache_n_cache_no_space_reject; +extern atomic_t fscache_n_cache_stale_objects; +extern atomic_t fscache_n_cache_retired_objects; +extern atomic_t fscache_n_cache_culled_objects; + static inline void fscache_stat(atomic_t *stat) { atomic_inc(stat); diff --git a/fs/fscache/object.c b/fs/fscache/object.c index da032daf0..9e792e30f 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -328,6 +328,17 @@ void fscache_object_init(struct fscache_object *object, EXPORT_SYMBOL(fscache_object_init); /* + * Mark the object as no longer being live, making sure that we synchronise + * against op submission. + */ +static inline void fscache_mark_object_dead(struct fscache_object *object) +{ + spin_lock(&object->lock); + clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags); + spin_unlock(&object->lock); +} + +/* * Abort object initialisation before we start it. */ static const struct fscache_state *fscache_abort_initialisation(struct fscache_object *object, @@ -610,6 +621,8 @@ static const struct fscache_state *fscache_lookup_failure(struct fscache_object object->cache->ops->lookup_complete(object); fscache_stat_d(&fscache_n_cop_lookup_complete); + set_bit(FSCACHE_OBJECT_KILLED_BY_CACHE, &object->flags); + cookie = object->cookie; set_bit(FSCACHE_COOKIE_UNAVAILABLE, &cookie->flags); if (test_and_clear_bit(FSCACHE_COOKIE_LOOKING_UP, &cookie->flags)) @@ -629,7 +642,7 @@ static const struct fscache_state *fscache_kill_object(struct fscache_object *ob _enter("{OBJ%x,%d,%d},%d", object->debug_id, object->n_ops, object->n_children, event); - clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags); + fscache_mark_object_dead(object); object->oob_event_mask = 0; if (list_empty(&object->dependents) && @@ -948,7 +961,8 @@ static const struct fscache_state *_fscache_invalidate_object(struct fscache_obj if (!op) goto nomem; - fscache_operation_init(op, object->cache->ops->invalidate_object, NULL); + fscache_operation_init(op, object->cache->ops->invalidate_object, + NULL, NULL); op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE) | (1 << FSCACHE_OP_UNUSE_COOKIE); @@ -974,13 +988,13 @@ static const struct fscache_state *_fscache_invalidate_object(struct fscache_obj return transit_to(UPDATE_OBJECT); nomem: - clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags); + fscache_mark_object_dead(object); fscache_unuse_cookie(object); _leave(" [ENOMEM]"); return transit_to(KILL_OBJECT); submit_op_failed: - clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags); + fscache_mark_object_dead(object); spin_unlock(&cookie->lock); fscache_unuse_cookie(object); kfree(op); @@ -1016,3 +1030,50 @@ static const struct fscache_state *fscache_update_object(struct fscache_object * _leave(""); return transit_to(WAIT_FOR_CMD); } + +/** + * fscache_object_retrying_stale - Note retrying stale object + * @object: The object that will be retried + * + * Note that an object lookup found an on-disk object that was adjudged to be + * stale and has been deleted. The lookup will be retried. + */ +void fscache_object_retrying_stale(struct fscache_object *object) +{ + fscache_stat(&fscache_n_cache_no_space_reject); +} +EXPORT_SYMBOL(fscache_object_retrying_stale); + +/** + * fscache_object_mark_killed - Note that an object was killed + * @object: The object that was culled + * @why: The reason the object was killed. + * + * Note that an object was killed. Returns true if the object was + * already marked killed, false if it wasn't. + */ +void fscache_object_mark_killed(struct fscache_object *object, + enum fscache_why_object_killed why) +{ + if (test_and_set_bit(FSCACHE_OBJECT_KILLED_BY_CACHE, &object->flags)) { + pr_err("Error: Object already killed by cache [%s]\n", + object->cache->identifier); + return; + } + + switch (why) { + case FSCACHE_OBJECT_NO_SPACE: + fscache_stat(&fscache_n_cache_no_space_reject); + break; + case FSCACHE_OBJECT_IS_STALE: + fscache_stat(&fscache_n_cache_stale_objects); + break; + case FSCACHE_OBJECT_WAS_RETIRED: + fscache_stat(&fscache_n_cache_retired_objects); + break; + case FSCACHE_OBJECT_WAS_CULLED: + fscache_stat(&fscache_n_cache_culled_objects); + break; + } +} +EXPORT_SYMBOL(fscache_object_mark_killed); diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index e7b87a0e5..de67745e1 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -20,6 +20,35 @@ atomic_t fscache_op_debug_id; EXPORT_SYMBOL(fscache_op_debug_id); +static void fscache_operation_dummy_cancel(struct fscache_operation *op) +{ +} + +/** + * fscache_operation_init - Do basic initialisation of an operation + * @op: The operation to initialise + * @release: The release function to assign + * + * Do basic initialisation of an operation. The caller must still set flags, + * object and processor if needed. + */ +void fscache_operation_init(struct fscache_operation *op, + fscache_operation_processor_t processor, + fscache_operation_cancel_t cancel, + fscache_operation_release_t release) +{ + INIT_WORK(&op->work, fscache_op_work_func); + atomic_set(&op->usage, 1); + op->state = FSCACHE_OP_ST_INITIALISED; + op->debug_id = atomic_inc_return(&fscache_op_debug_id); + op->processor = processor; + op->cancel = cancel ?: fscache_operation_dummy_cancel; + op->release = release; + INIT_LIST_HEAD(&op->pend_link); + fscache_stat(&fscache_n_op_initialised); +} +EXPORT_SYMBOL(fscache_operation_init); + /** * fscache_enqueue_operation - Enqueue an operation for processing * @op: The operation to enqueue @@ -76,6 +105,43 @@ static void fscache_run_op(struct fscache_object *object, } /* + * report an unexpected submission + */ +static void fscache_report_unexpected_submission(struct fscache_object *object, + struct fscache_operation *op, + const struct fscache_state *ostate) +{ + static bool once_only; + struct fscache_operation *p; + unsigned n; + + if (once_only) + return; + once_only = true; + + kdebug("unexpected submission OP%x [OBJ%x %s]", + op->debug_id, object->debug_id, object->state->name); + kdebug("objstate=%s [%s]", object->state->name, ostate->name); + kdebug("objflags=%lx", object->flags); + kdebug("objevent=%lx [%lx]", object->events, object->event_mask); + kdebug("ops=%u inp=%u exc=%u", + object->n_ops, object->n_in_progress, object->n_exclusive); + + if (!list_empty(&object->pending_ops)) { + n = 0; + list_for_each_entry(p, &object->pending_ops, pend_link) { + ASSERTCMP(p->object, ==, object); + kdebug("%p %p", op->processor, op->release); + n++; + } + + kdebug("n=%u", n); + } + + dump_stack(); +} + +/* * submit an exclusive operation for an object * - other ops are excluded from running simultaneously with this one * - this gets any extra refs it needs on an op @@ -83,6 +149,8 @@ static void fscache_run_op(struct fscache_object *object, int fscache_submit_exclusive_op(struct fscache_object *object, struct fscache_operation *op) { + const struct fscache_state *ostate; + unsigned long flags; int ret; _enter("{OBJ%x OP%x},", object->debug_id, op->debug_id); @@ -95,8 +163,21 @@ int fscache_submit_exclusive_op(struct fscache_object *object, ASSERTCMP(object->n_ops, >=, object->n_exclusive); ASSERT(list_empty(&op->pend_link)); + ostate = object->state; + smp_rmb(); + op->state = FSCACHE_OP_ST_PENDING; - if (fscache_object_is_active(object)) { + flags = READ_ONCE(object->flags); + if (unlikely(!(flags & BIT(FSCACHE_OBJECT_IS_LIVE)))) { + fscache_stat(&fscache_n_op_rejected); + op->cancel(op); + op->state = FSCACHE_OP_ST_CANCELLED; + ret = -ENOBUFS; + } else if (unlikely(fscache_cache_is_broken(object))) { + op->cancel(op); + op->state = FSCACHE_OP_ST_CANCELLED; + ret = -EIO; + } else if (flags & BIT(FSCACHE_OBJECT_IS_AVAILABLE)) { op->object = object; object->n_ops++; object->n_exclusive++; /* reads and writes must wait */ @@ -118,7 +199,7 @@ int fscache_submit_exclusive_op(struct fscache_object *object, /* need to issue a new write op after this */ clear_bit(FSCACHE_OBJECT_PENDING_WRITE, &object->flags); ret = 0; - } else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) { + } else if (flags & BIT(FSCACHE_OBJECT_IS_LOOKED_UP)) { op->object = object; object->n_ops++; object->n_exclusive++; /* reads and writes must wait */ @@ -126,12 +207,15 @@ int fscache_submit_exclusive_op(struct fscache_object *object, list_add_tail(&op->pend_link, &object->pending_ops); fscache_stat(&fscache_n_op_pend); ret = 0; + } else if (flags & BIT(FSCACHE_OBJECT_KILLED_BY_CACHE)) { + op->cancel(op); + op->state = FSCACHE_OP_ST_CANCELLED; + ret = -ENOBUFS; } else { - /* If we're in any other state, there must have been an I/O - * error of some nature. - */ - ASSERT(test_bit(FSCACHE_IOERROR, &object->cache->flags)); - ret = -EIO; + fscache_report_unexpected_submission(object, op, ostate); + op->cancel(op); + op->state = FSCACHE_OP_ST_CANCELLED; + ret = -ENOBUFS; } spin_unlock(&object->lock); @@ -139,43 +223,6 @@ int fscache_submit_exclusive_op(struct fscache_object *object, } /* - * report an unexpected submission - */ -static void fscache_report_unexpected_submission(struct fscache_object *object, - struct fscache_operation *op, - const struct fscache_state *ostate) -{ - static bool once_only; - struct fscache_operation *p; - unsigned n; - - if (once_only) - return; - once_only = true; - - kdebug("unexpected submission OP%x [OBJ%x %s]", - op->debug_id, object->debug_id, object->state->name); - kdebug("objstate=%s [%s]", object->state->name, ostate->name); - kdebug("objflags=%lx", object->flags); - kdebug("objevent=%lx [%lx]", object->events, object->event_mask); - kdebug("ops=%u inp=%u exc=%u", - object->n_ops, object->n_in_progress, object->n_exclusive); - - if (!list_empty(&object->pending_ops)) { - n = 0; - list_for_each_entry(p, &object->pending_ops, pend_link) { - ASSERTCMP(p->object, ==, object); - kdebug("%p %p", op->processor, op->release); - n++; - } - - kdebug("n=%u", n); - } - - dump_stack(); -} - -/* * submit an operation for an object * - objects may be submitted only in the following states: * - during object creation (write ops may be submitted) @@ -187,6 +234,7 @@ int fscache_submit_op(struct fscache_object *object, struct fscache_operation *op) { const struct fscache_state *ostate; + unsigned long flags; int ret; _enter("{OBJ%x OP%x},{%u}", @@ -204,7 +252,17 @@ int fscache_submit_op(struct fscache_object *object, smp_rmb(); op->state = FSCACHE_OP_ST_PENDING; - if (fscache_object_is_active(object)) { + flags = READ_ONCE(object->flags); + if (unlikely(!(flags & BIT(FSCACHE_OBJECT_IS_LIVE)))) { + fscache_stat(&fscache_n_op_rejected); + op->cancel(op); + op->state = FSCACHE_OP_ST_CANCELLED; + ret = -ENOBUFS; + } else if (unlikely(fscache_cache_is_broken(object))) { + op->cancel(op); + op->state = FSCACHE_OP_ST_CANCELLED; + ret = -EIO; + } else if (flags & BIT(FSCACHE_OBJECT_IS_AVAILABLE)) { op->object = object; object->n_ops++; @@ -222,23 +280,21 @@ int fscache_submit_op(struct fscache_object *object, fscache_run_op(object, op); } ret = 0; - } else if (test_bit(FSCACHE_OBJECT_IS_LOOKED_UP, &object->flags)) { + } else if (flags & BIT(FSCACHE_OBJECT_IS_LOOKED_UP)) { op->object = object; object->n_ops++; atomic_inc(&op->usage); list_add_tail(&op->pend_link, &object->pending_ops); fscache_stat(&fscache_n_op_pend); ret = 0; - } else if (fscache_object_is_dying(object)) { - fscache_stat(&fscache_n_op_rejected); + } else if (flags & BIT(FSCACHE_OBJECT_KILLED_BY_CACHE)) { + op->cancel(op); op->state = FSCACHE_OP_ST_CANCELLED; ret = -ENOBUFS; - } else if (!test_bit(FSCACHE_IOERROR, &object->cache->flags)) { + } else { fscache_report_unexpected_submission(object, op, ostate); ASSERT(!fscache_object_is_active(object)); - op->state = FSCACHE_OP_ST_CANCELLED; - ret = -ENOBUFS; - } else { + op->cancel(op); op->state = FSCACHE_OP_ST_CANCELLED; ret = -ENOBUFS; } @@ -293,9 +349,10 @@ void fscache_start_operations(struct fscache_object *object) * cancel an operation that's pending on an object */ int fscache_cancel_op(struct fscache_operation *op, - void (*do_cancel)(struct fscache_operation *)) + bool cancel_in_progress_op) { struct fscache_object *object = op->object; + bool put = false; int ret; _enter("OBJ%x OP%x}", op->object->debug_id, op->debug_id); @@ -309,19 +366,37 @@ int fscache_cancel_op(struct fscache_operation *op, ret = -EBUSY; if (op->state == FSCACHE_OP_ST_PENDING) { ASSERT(!list_empty(&op->pend_link)); - fscache_stat(&fscache_n_op_cancelled); list_del_init(&op->pend_link); - if (do_cancel) - do_cancel(op); + put = true; + + fscache_stat(&fscache_n_op_cancelled); + op->cancel(op); + op->state = FSCACHE_OP_ST_CANCELLED; + if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) + object->n_exclusive--; + if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) + wake_up_bit(&op->flags, FSCACHE_OP_WAITING); + ret = 0; + } else if (op->state == FSCACHE_OP_ST_IN_PROGRESS && cancel_in_progress_op) { + ASSERTCMP(object->n_in_progress, >, 0); + if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) + object->n_exclusive--; + object->n_in_progress--; + if (object->n_in_progress == 0) + fscache_start_operations(object); + + fscache_stat(&fscache_n_op_cancelled); + op->cancel(op); op->state = FSCACHE_OP_ST_CANCELLED; if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) object->n_exclusive--; if (test_and_clear_bit(FSCACHE_OP_WAITING, &op->flags)) wake_up_bit(&op->flags, FSCACHE_OP_WAITING); - fscache_put_operation(op); ret = 0; } + if (put) + fscache_put_operation(op); spin_unlock(&object->lock); _leave(" = %d", ret); return ret; @@ -345,6 +420,7 @@ void fscache_cancel_all_ops(struct fscache_object *object) list_del_init(&op->pend_link); ASSERTCMP(op->state, ==, FSCACHE_OP_ST_PENDING); + op->cancel(op); op->state = FSCACHE_OP_ST_CANCELLED; if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) @@ -377,8 +453,12 @@ void fscache_op_complete(struct fscache_operation *op, bool cancelled) spin_lock(&object->lock); - op->state = cancelled ? - FSCACHE_OP_ST_CANCELLED : FSCACHE_OP_ST_COMPLETE; + if (!cancelled) { + op->state = FSCACHE_OP_ST_COMPLETE; + } else { + op->cancel(op); + op->state = FSCACHE_OP_ST_CANCELLED; + } if (test_bit(FSCACHE_OP_EXCLUSIVE, &op->flags)) object->n_exclusive--; @@ -409,9 +489,9 @@ void fscache_put_operation(struct fscache_operation *op) return; _debug("PUT OP"); - ASSERTIFCMP(op->state != FSCACHE_OP_ST_COMPLETE, + ASSERTIFCMP(op->state != FSCACHE_OP_ST_INITIALISED && + op->state != FSCACHE_OP_ST_COMPLETE, op->state, ==, FSCACHE_OP_ST_CANCELLED); - op->state = FSCACHE_OP_ST_DEAD; fscache_stat(&fscache_n_op_release); @@ -419,37 +499,39 @@ void fscache_put_operation(struct fscache_operation *op) op->release(op); op->release = NULL; } + op->state = FSCACHE_OP_ST_DEAD; object = op->object; + if (likely(object)) { + if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags)) + atomic_dec(&object->n_reads); + if (test_bit(FSCACHE_OP_UNUSE_COOKIE, &op->flags)) + fscache_unuse_cookie(object); + + /* now... we may get called with the object spinlock held, so we + * complete the cleanup here only if we can immediately acquire the + * lock, and defer it otherwise */ + if (!spin_trylock(&object->lock)) { + _debug("defer put"); + fscache_stat(&fscache_n_op_deferred_release); + + cache = object->cache; + spin_lock(&cache->op_gc_list_lock); + list_add_tail(&op->pend_link, &cache->op_gc_list); + spin_unlock(&cache->op_gc_list_lock); + schedule_work(&cache->op_gc); + _leave(" [defer]"); + return; + } - if (test_bit(FSCACHE_OP_DEC_READ_CNT, &op->flags)) - atomic_dec(&object->n_reads); - if (test_bit(FSCACHE_OP_UNUSE_COOKIE, &op->flags)) - fscache_unuse_cookie(object); - - /* now... we may get called with the object spinlock held, so we - * complete the cleanup here only if we can immediately acquire the - * lock, and defer it otherwise */ - if (!spin_trylock(&object->lock)) { - _debug("defer put"); - fscache_stat(&fscache_n_op_deferred_release); + ASSERTCMP(object->n_ops, >, 0); + object->n_ops--; + if (object->n_ops == 0) + fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED); - cache = object->cache; - spin_lock(&cache->op_gc_list_lock); - list_add_tail(&op->pend_link, &cache->op_gc_list); - spin_unlock(&cache->op_gc_list_lock); - schedule_work(&cache->op_gc); - _leave(" [defer]"); - return; + spin_unlock(&object->lock); } - ASSERTCMP(object->n_ops, >, 0); - object->n_ops--; - if (object->n_ops == 0) - fscache_raise_event(object, FSCACHE_OBJECT_EV_CLEARED); - - spin_unlock(&object->lock); - kfree(op); _leave(" [done]"); } diff --git a/fs/fscache/page.c b/fs/fscache/page.c index de33b3fcc..483bbc613 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -213,7 +213,7 @@ int __fscache_attr_changed(struct fscache_cookie *cookie) return -ENOMEM; } - fscache_operation_init(op, fscache_attr_changed_op, NULL); + fscache_operation_init(op, fscache_attr_changed_op, NULL, NULL); op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE) | (1 << FSCACHE_OP_UNUSE_COOKIE); @@ -239,7 +239,7 @@ nobufs_dec: wake_cookie = __fscache_unuse_cookie(cookie); nobufs: spin_unlock(&cookie->lock); - kfree(op); + fscache_put_operation(op); if (wake_cookie) __fscache_wake_unused_cookie(cookie); fscache_stat(&fscache_n_attr_changed_nobufs); @@ -249,6 +249,17 @@ nobufs: EXPORT_SYMBOL(__fscache_attr_changed); /* + * Handle cancellation of a pending retrieval op + */ +static void fscache_do_cancel_retrieval(struct fscache_operation *_op) +{ + struct fscache_retrieval *op = + container_of(_op, struct fscache_retrieval, op); + + atomic_set(&op->n_pages, 0); +} + +/* * release a retrieval op reference */ static void fscache_release_retrieval_op(struct fscache_operation *_op) @@ -258,11 +269,12 @@ static void fscache_release_retrieval_op(struct fscache_operation *_op) _enter("{OP%x}", op->op.debug_id); - ASSERTCMP(atomic_read(&op->n_pages), ==, 0); + ASSERTIFCMP(op->op.state != FSCACHE_OP_ST_INITIALISED, + atomic_read(&op->n_pages), ==, 0); fscache_hist(fscache_retrieval_histogram, op->start_time); if (op->context) - fscache_put_context(op->op.object->cookie, op->context); + fscache_put_context(op->cookie, op->context); _leave(""); } @@ -285,15 +297,24 @@ static struct fscache_retrieval *fscache_alloc_retrieval( return NULL; } - fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op); + fscache_operation_init(&op->op, NULL, + fscache_do_cancel_retrieval, + fscache_release_retrieval_op); op->op.flags = FSCACHE_OP_MYTHREAD | (1UL << FSCACHE_OP_WAITING) | (1UL << FSCACHE_OP_UNUSE_COOKIE); + op->cookie = cookie; op->mapping = mapping; op->end_io_func = end_io_func; op->context = context; op->start_time = jiffies; INIT_LIST_HEAD(&op->to_do); + + /* Pin the netfs read context in case we need to do the actual netfs + * read because we've encountered a cache read failure. + */ + if (context) + fscache_get_context(op->cookie, context); return op; } @@ -330,24 +351,12 @@ int fscache_wait_for_deferred_lookup(struct fscache_cookie *cookie) } /* - * Handle cancellation of a pending retrieval op - */ -static void fscache_do_cancel_retrieval(struct fscache_operation *_op) -{ - struct fscache_retrieval *op = - container_of(_op, struct fscache_retrieval, op); - - atomic_set(&op->n_pages, 0); -} - -/* * wait for an object to become active (or dead) */ int fscache_wait_for_operation_activation(struct fscache_object *object, struct fscache_operation *op, atomic_t *stat_op_waits, - atomic_t *stat_object_dead, - void (*do_cancel)(struct fscache_operation *)) + atomic_t *stat_object_dead) { int ret; @@ -359,7 +368,7 @@ int fscache_wait_for_operation_activation(struct fscache_object *object, fscache_stat(stat_op_waits); if (wait_on_bit(&op->flags, FSCACHE_OP_WAITING, TASK_INTERRUPTIBLE) != 0) { - ret = fscache_cancel_op(op, do_cancel); + ret = fscache_cancel_op(op, false); if (ret == 0) return -ERESTARTSYS; @@ -377,11 +386,13 @@ check_if_dead: _leave(" = -ENOBUFS [cancelled]"); return -ENOBUFS; } - if (unlikely(fscache_object_is_dead(object))) { - pr_err("%s() = -ENOBUFS [obj dead %d]\n", __func__, op->state); - fscache_cancel_op(op, do_cancel); + if (unlikely(fscache_object_is_dying(object) || + fscache_cache_is_broken(object))) { + enum fscache_operation_state state = op->state; + fscache_cancel_op(op, true); if (stat_object_dead) fscache_stat(stat_object_dead); + _leave(" = -ENOBUFS [obj dead %d]", state); return -ENOBUFS; } return 0; @@ -453,17 +464,12 @@ int __fscache_read_or_alloc_page(struct fscache_cookie *cookie, fscache_stat(&fscache_n_retrieval_ops); - /* pin the netfs read context in case we need to do the actual netfs - * read because we've encountered a cache read failure */ - fscache_get_context(object->cookie, op->context); - /* we wait for the operation to become active, and then process it * *here*, in this thread, and not in the thread pool */ ret = fscache_wait_for_operation_activation( object, &op->op, __fscache_stat(&fscache_n_retrieval_op_waits), - __fscache_stat(&fscache_n_retrievals_object_dead), - fscache_do_cancel_retrieval); + __fscache_stat(&fscache_n_retrievals_object_dead)); if (ret < 0) goto error; @@ -503,7 +509,7 @@ nobufs_unlock: spin_unlock(&cookie->lock); if (wake_cookie) __fscache_wake_unused_cookie(cookie); - kfree(op); + fscache_put_retrieval(op); nobufs: fscache_stat(&fscache_n_retrievals_nobufs); _leave(" = -ENOBUFS"); @@ -584,17 +590,12 @@ int __fscache_read_or_alloc_pages(struct fscache_cookie *cookie, fscache_stat(&fscache_n_retrieval_ops); - /* pin the netfs read context in case we need to do the actual netfs - * read because we've encountered a cache read failure */ - fscache_get_context(object->cookie, op->context); - /* we wait for the operation to become active, and then process it * *here*, in this thread, and not in the thread pool */ ret = fscache_wait_for_operation_activation( object, &op->op, __fscache_stat(&fscache_n_retrieval_op_waits), - __fscache_stat(&fscache_n_retrievals_object_dead), - fscache_do_cancel_retrieval); + __fscache_stat(&fscache_n_retrievals_object_dead)); if (ret < 0) goto error; @@ -632,7 +633,7 @@ nobufs_unlock_dec: wake_cookie = __fscache_unuse_cookie(cookie); nobufs_unlock: spin_unlock(&cookie->lock); - kfree(op); + fscache_put_retrieval(op); if (wake_cookie) __fscache_wake_unused_cookie(cookie); nobufs: @@ -700,8 +701,7 @@ int __fscache_alloc_page(struct fscache_cookie *cookie, ret = fscache_wait_for_operation_activation( object, &op->op, __fscache_stat(&fscache_n_alloc_op_waits), - __fscache_stat(&fscache_n_allocs_object_dead), - fscache_do_cancel_retrieval); + __fscache_stat(&fscache_n_allocs_object_dead)); if (ret < 0) goto error; @@ -726,7 +726,7 @@ nobufs_unlock_dec: wake_cookie = __fscache_unuse_cookie(cookie); nobufs_unlock: spin_unlock(&cookie->lock); - kfree(op); + fscache_put_retrieval(op); if (wake_cookie) __fscache_wake_unused_cookie(cookie); nobufs: @@ -944,7 +944,7 @@ int __fscache_write_page(struct fscache_cookie *cookie, if (!op) goto nomem; - fscache_operation_init(&op->op, fscache_write_op, + fscache_operation_init(&op->op, fscache_write_op, NULL, fscache_release_write_op); op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING) | @@ -1016,7 +1016,7 @@ already_pending: spin_unlock(&object->lock); spin_unlock(&cookie->lock); radix_tree_preload_end(); - kfree(op); + fscache_put_operation(&op->op); fscache_stat(&fscache_n_stores_ok); _leave(" = 0"); return 0; @@ -1036,7 +1036,7 @@ nobufs_unlock_obj: nobufs: spin_unlock(&cookie->lock); radix_tree_preload_end(); - kfree(op); + fscache_put_operation(&op->op); if (wake_cookie) __fscache_wake_unused_cookie(cookie); fscache_stat(&fscache_n_stores_nobufs); @@ -1044,7 +1044,7 @@ nobufs: return -ENOBUFS; nomem_free: - kfree(op); + fscache_put_operation(&op->op); nomem: fscache_stat(&fscache_n_stores_oom); _leave(" = -ENOMEM"); diff --git a/fs/fscache/stats.c b/fs/fscache/stats.c index 40d13c70e..7cfa0aacd 100644 --- a/fs/fscache/stats.c +++ b/fs/fscache/stats.c @@ -23,6 +23,7 @@ atomic_t fscache_n_op_run; atomic_t fscache_n_op_enqueue; atomic_t fscache_n_op_requeue; atomic_t fscache_n_op_deferred_release; +atomic_t fscache_n_op_initialised; atomic_t fscache_n_op_release; atomic_t fscache_n_op_gc; atomic_t fscache_n_op_cancelled; @@ -130,6 +131,11 @@ atomic_t fscache_n_cop_write_page; atomic_t fscache_n_cop_uncache_page; atomic_t fscache_n_cop_dissociate_pages; +atomic_t fscache_n_cache_no_space_reject; +atomic_t fscache_n_cache_stale_objects; +atomic_t fscache_n_cache_retired_objects; +atomic_t fscache_n_cache_culled_objects; + /* * display the general statistics */ @@ -246,7 +252,8 @@ static int fscache_stats_show(struct seq_file *m, void *v) atomic_read(&fscache_n_op_enqueue), atomic_read(&fscache_n_op_cancelled), atomic_read(&fscache_n_op_rejected)); - seq_printf(m, "Ops : dfr=%u rel=%u gc=%u\n", + seq_printf(m, "Ops : ini=%u dfr=%u rel=%u gc=%u\n", + atomic_read(&fscache_n_op_initialised), atomic_read(&fscache_n_op_deferred_release), atomic_read(&fscache_n_op_release), atomic_read(&fscache_n_op_gc)); @@ -271,6 +278,11 @@ static int fscache_stats_show(struct seq_file *m, void *v) atomic_read(&fscache_n_cop_write_page), atomic_read(&fscache_n_cop_uncache_page), atomic_read(&fscache_n_cop_dissociate_pages)); + seq_printf(m, "CacheEv: nsp=%d stl=%d rtr=%d cul=%d\n", + atomic_read(&fscache_n_cache_no_space_reject), + atomic_read(&fscache_n_cache_stale_objects), + atomic_read(&fscache_n_cache_retired_objects), + atomic_read(&fscache_n_cache_culled_objects)); return 0; } diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index e5bbf748b..eae2c1126 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -489,6 +489,7 @@ static void cuse_fc_release(struct fuse_conn *fc) */ static int cuse_channel_open(struct inode *inode, struct file *file) { + struct fuse_dev *fud; struct cuse_conn *cc; int rc; @@ -499,17 +500,22 @@ static int cuse_channel_open(struct inode *inode, struct file *file) fuse_conn_init(&cc->fc); + fud = fuse_dev_alloc(&cc->fc); + if (!fud) { + kfree(cc); + return -ENOMEM; + } + INIT_LIST_HEAD(&cc->list); cc->fc.release = cuse_fc_release; - cc->fc.connected = 1; cc->fc.initialized = 1; rc = cuse_send_init(cc); if (rc) { - fuse_conn_put(&cc->fc); + fuse_dev_free(fud); return rc; } - file->private_data = &cc->fc; /* channel owns base reference to cc */ + file->private_data = fud; return 0; } @@ -527,7 +533,8 @@ static int cuse_channel_open(struct inode *inode, struct file *file) */ static int cuse_channel_release(struct inode *inode, struct file *file) { - struct cuse_conn *cc = fc_to_cc(file->private_data); + struct fuse_dev *fud = file->private_data; + struct cuse_conn *cc = fc_to_cc(fud->fc); int rc; /* remove from the conntbl, no more access from this point on */ diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index c8b68ab2e..ebb5e3745 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -25,13 +25,13 @@ MODULE_ALIAS("devname:fuse"); static struct kmem_cache *fuse_req_cachep; -static struct fuse_conn *fuse_get_conn(struct file *file) +static struct fuse_dev *fuse_get_dev(struct file *file) { /* * Lockless access is OK, because file->private data is set * once during mount and is valid until the file is released. */ - return file->private_data; + return ACCESS_ONCE(file->private_data); } static void fuse_request_init(struct fuse_req *req, struct page **pages, @@ -48,6 +48,7 @@ static void fuse_request_init(struct fuse_req *req, struct page **pages, req->pages = pages; req->page_descs = page_descs; req->max_pages = npages; + __set_bit(FR_PENDING, &req->flags); } static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags) @@ -168,6 +169,10 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages, if (!fc->connected) goto out; + err = -ECONNREFUSED; + if (fc->conn_error) + goto out; + req = fuse_request_alloc(npages); err = -ENOMEM; if (!req) { @@ -177,8 +182,10 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages, } fuse_req_init_context(req); - req->waiting = 1; - req->background = for_background; + __set_bit(FR_WAITING, &req->flags); + if (for_background) + __set_bit(FR_BACKGROUND, &req->flags); + return req; out: @@ -268,15 +275,15 @@ struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc, req = get_reserved_req(fc, file); fuse_req_init_context(req); - req->waiting = 1; - req->background = 0; + __set_bit(FR_WAITING, &req->flags); + __clear_bit(FR_BACKGROUND, &req->flags); return req; } void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) { if (atomic_dec_and_test(&req->count)) { - if (unlikely(req->background)) { + if (test_bit(FR_BACKGROUND, &req->flags)) { /* * We get here in the unlikely case that a background * request was allocated but not sent @@ -287,8 +294,10 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) spin_unlock(&fc->lock); } - if (req->waiting) + if (test_bit(FR_WAITING, &req->flags)) { + __clear_bit(FR_WAITING, &req->flags); atomic_dec(&fc->num_waiting); + } if (req->stolen_file) put_reserved_req(fc, req); @@ -309,46 +318,38 @@ static unsigned len_args(unsigned numargs, struct fuse_arg *args) return nbytes; } -static u64 fuse_get_unique(struct fuse_conn *fc) +static u64 fuse_get_unique(struct fuse_iqueue *fiq) { - fc->reqctr++; - /* zero is special */ - if (fc->reqctr == 0) - fc->reqctr = 1; - - return fc->reqctr; + return ++fiq->reqctr; } -static void queue_request(struct fuse_conn *fc, struct fuse_req *req) +static void queue_request(struct fuse_iqueue *fiq, struct fuse_req *req) { req->in.h.len = sizeof(struct fuse_in_header) + len_args(req->in.numargs, (struct fuse_arg *) req->in.args); - list_add_tail(&req->list, &fc->pending); - req->state = FUSE_REQ_PENDING; - if (!req->waiting) { - req->waiting = 1; - atomic_inc(&fc->num_waiting); - } - wake_up(&fc->waitq); - kill_fasync(&fc->fasync, SIGIO, POLL_IN); + list_add_tail(&req->list, &fiq->pending); + wake_up_locked(&fiq->waitq); + kill_fasync(&fiq->fasync, SIGIO, POLL_IN); } void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget, u64 nodeid, u64 nlookup) { + struct fuse_iqueue *fiq = &fc->iq; + forget->forget_one.nodeid = nodeid; forget->forget_one.nlookup = nlookup; - spin_lock(&fc->lock); - if (fc->connected) { - fc->forget_list_tail->next = forget; - fc->forget_list_tail = forget; - wake_up(&fc->waitq); - kill_fasync(&fc->fasync, SIGIO, POLL_IN); + spin_lock(&fiq->waitq.lock); + if (fiq->connected) { + fiq->forget_list_tail->next = forget; + fiq->forget_list_tail = forget; + wake_up_locked(&fiq->waitq); + kill_fasync(&fiq->fasync, SIGIO, POLL_IN); } else { kfree(forget); } - spin_unlock(&fc->lock); + spin_unlock(&fiq->waitq.lock); } static void flush_bg_queue(struct fuse_conn *fc) @@ -356,12 +357,15 @@ static void flush_bg_queue(struct fuse_conn *fc) while (fc->active_background < fc->max_background && !list_empty(&fc->bg_queue)) { struct fuse_req *req; + struct fuse_iqueue *fiq = &fc->iq; req = list_entry(fc->bg_queue.next, struct fuse_req, list); list_del(&req->list); fc->active_background++; - req->in.h.unique = fuse_get_unique(fc); - queue_request(fc, req); + spin_lock(&fiq->waitq.lock); + req->in.h.unique = fuse_get_unique(fiq); + queue_request(fiq, req); + spin_unlock(&fiq->waitq.lock); } } @@ -372,20 +376,22 @@ static void flush_bg_queue(struct fuse_conn *fc) * was closed. The requester thread is woken up (if still waiting), * the 'end' callback is called if given, else the reference to the * request is released - * - * Called with fc->lock, unlocks it */ static void request_end(struct fuse_conn *fc, struct fuse_req *req) -__releases(fc->lock) { - void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; - req->end = NULL; - list_del(&req->list); - list_del(&req->intr_entry); - req->state = FUSE_REQ_FINISHED; - if (req->background) { - req->background = 0; + struct fuse_iqueue *fiq = &fc->iq; + + if (test_and_set_bit(FR_FINISHED, &req->flags)) + return; + spin_lock(&fiq->waitq.lock); + list_del_init(&req->intr_entry); + spin_unlock(&fiq->waitq.lock); + WARN_ON(test_bit(FR_PENDING, &req->flags)); + WARN_ON(test_bit(FR_SENT, &req->flags)); + if (test_bit(FR_BACKGROUND, &req->flags)) { + spin_lock(&fc->lock); + clear_bit(FR_BACKGROUND, &req->flags); if (fc->num_background == fc->max_background) fc->blocked = 0; @@ -401,122 +407,105 @@ __releases(fc->lock) fc->num_background--; fc->active_background--; flush_bg_queue(fc); + spin_unlock(&fc->lock); } - spin_unlock(&fc->lock); wake_up(&req->waitq); - if (end) - end(fc, req); + if (req->end) + req->end(fc, req); fuse_put_request(fc, req); } -static void wait_answer_interruptible(struct fuse_conn *fc, - struct fuse_req *req) -__releases(fc->lock) -__acquires(fc->lock) -{ - if (signal_pending(current)) - return; - - spin_unlock(&fc->lock); - wait_event_interruptible(req->waitq, req->state == FUSE_REQ_FINISHED); - spin_lock(&fc->lock); -} - -static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req) +static void queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) { - list_add_tail(&req->intr_entry, &fc->interrupts); - wake_up(&fc->waitq); - kill_fasync(&fc->fasync, SIGIO, POLL_IN); + spin_lock(&fiq->waitq.lock); + if (list_empty(&req->intr_entry)) { + list_add_tail(&req->intr_entry, &fiq->interrupts); + wake_up_locked(&fiq->waitq); + } + spin_unlock(&fiq->waitq.lock); + kill_fasync(&fiq->fasync, SIGIO, POLL_IN); } static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) -__releases(fc->lock) -__acquires(fc->lock) { + struct fuse_iqueue *fiq = &fc->iq; + int err; + if (!fc->no_interrupt) { /* Any signal may interrupt this */ - wait_answer_interruptible(fc, req); - - if (req->aborted) - goto aborted; - if (req->state == FUSE_REQ_FINISHED) + err = wait_event_interruptible(req->waitq, + test_bit(FR_FINISHED, &req->flags)); + if (!err) return; - req->interrupted = 1; - if (req->state == FUSE_REQ_SENT) - queue_interrupt(fc, req); + set_bit(FR_INTERRUPTED, &req->flags); + /* matches barrier in fuse_dev_do_read() */ + smp_mb__after_atomic(); + if (test_bit(FR_SENT, &req->flags)) + queue_interrupt(fiq, req); } - if (!req->force) { + if (!test_bit(FR_FORCE, &req->flags)) { sigset_t oldset; /* Only fatal signals may interrupt this */ block_sigs(&oldset); - wait_answer_interruptible(fc, req); + err = wait_event_interruptible(req->waitq, + test_bit(FR_FINISHED, &req->flags)); restore_sigs(&oldset); - if (req->aborted) - goto aborted; - if (req->state == FUSE_REQ_FINISHED) + if (!err) return; + spin_lock(&fiq->waitq.lock); /* Request is not yet in userspace, bail out */ - if (req->state == FUSE_REQ_PENDING) { + if (test_bit(FR_PENDING, &req->flags)) { list_del(&req->list); + spin_unlock(&fiq->waitq.lock); __fuse_put_request(req); req->out.h.error = -EINTR; return; } + spin_unlock(&fiq->waitq.lock); } /* * Either request is already in userspace, or it was forced. * Wait it out. */ - spin_unlock(&fc->lock); - wait_event(req->waitq, req->state == FUSE_REQ_FINISHED); - spin_lock(&fc->lock); - - if (!req->aborted) - return; - - aborted: - BUG_ON(req->state != FUSE_REQ_FINISHED); - if (req->locked) { - /* This is uninterruptible sleep, because data is - being copied to/from the buffers of req. During - locked state, there mustn't be any filesystem - operation (e.g. page fault), since that could lead - to deadlock */ - spin_unlock(&fc->lock); - wait_event(req->waitq, !req->locked); - spin_lock(&fc->lock); - } + wait_event(req->waitq, test_bit(FR_FINISHED, &req->flags)); } static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) { - BUG_ON(req->background); - spin_lock(&fc->lock); - if (!fc->connected) + struct fuse_iqueue *fiq = &fc->iq; + + BUG_ON(test_bit(FR_BACKGROUND, &req->flags)); + spin_lock(&fiq->waitq.lock); + if (!fiq->connected) { + spin_unlock(&fiq->waitq.lock); req->out.h.error = -ENOTCONN; - else if (fc->conn_error) - req->out.h.error = -ECONNREFUSED; - else { - req->in.h.unique = fuse_get_unique(fc); - queue_request(fc, req); + } else { + req->in.h.unique = fuse_get_unique(fiq); + queue_request(fiq, req); /* acquire extra reference, since request is still needed after request_end() */ __fuse_get_request(req); + spin_unlock(&fiq->waitq.lock); request_wait_answer(fc, req); + /* Pairs with smp_wmb() in request_end() */ + smp_rmb(); } - spin_unlock(&fc->lock); } void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) { - req->isreply = 1; + __set_bit(FR_ISREPLY, &req->flags); + if (!test_bit(FR_WAITING, &req->flags)) { + __set_bit(FR_WAITING, &req->flags); + atomic_inc(&fc->num_waiting); + } __fuse_request_send(fc, req); } EXPORT_SYMBOL_GPL(fuse_request_send); @@ -586,10 +575,20 @@ ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args) return ret; } -static void fuse_request_send_nowait_locked(struct fuse_conn *fc, - struct fuse_req *req) +/* + * Called under fc->lock + * + * fc->connected must have been checked previously + */ +void fuse_request_send_background_locked(struct fuse_conn *fc, + struct fuse_req *req) { - BUG_ON(!req->background); + BUG_ON(!test_bit(FR_BACKGROUND, &req->flags)); + if (!test_bit(FR_WAITING, &req->flags)) { + __set_bit(FR_WAITING, &req->flags); + atomic_inc(&fc->num_waiting); + } + __set_bit(FR_ISREPLY, &req->flags); fc->num_background++; if (fc->num_background == fc->max_background) fc->blocked = 1; @@ -602,54 +601,40 @@ static void fuse_request_send_nowait_locked(struct fuse_conn *fc, flush_bg_queue(fc); } -static void fuse_request_send_nowait(struct fuse_conn *fc, struct fuse_req *req) +void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) { + BUG_ON(!req->end); spin_lock(&fc->lock); if (fc->connected) { - fuse_request_send_nowait_locked(fc, req); + fuse_request_send_background_locked(fc, req); spin_unlock(&fc->lock); } else { + spin_unlock(&fc->lock); req->out.h.error = -ENOTCONN; - request_end(fc, req); + req->end(fc, req); + fuse_put_request(fc, req); } } - -void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) -{ - req->isreply = 1; - fuse_request_send_nowait(fc, req); -} EXPORT_SYMBOL_GPL(fuse_request_send_background); static int fuse_request_send_notify_reply(struct fuse_conn *fc, struct fuse_req *req, u64 unique) { int err = -ENODEV; + struct fuse_iqueue *fiq = &fc->iq; - req->isreply = 0; + __clear_bit(FR_ISREPLY, &req->flags); req->in.h.unique = unique; - spin_lock(&fc->lock); - if (fc->connected) { - queue_request(fc, req); + spin_lock(&fiq->waitq.lock); + if (fiq->connected) { + queue_request(fiq, req); err = 0; } - spin_unlock(&fc->lock); + spin_unlock(&fiq->waitq.lock); return err; } -/* - * Called under fc->lock - * - * fc->connected must have been checked previously - */ -void fuse_request_send_background_locked(struct fuse_conn *fc, - struct fuse_req *req) -{ - req->isreply = 1; - fuse_request_send_nowait_locked(fc, req); -} - void fuse_force_forget(struct file *file, u64 nodeid) { struct inode *inode = file_inode(file); @@ -665,7 +650,7 @@ void fuse_force_forget(struct file *file, u64 nodeid) req->in.numargs = 1; req->in.args[0].size = sizeof(inarg); req->in.args[0].value = &inarg; - req->isreply = 0; + __clear_bit(FR_ISREPLY, &req->flags); __fuse_request_send(fc, req); /* ignore errors */ fuse_put_request(fc, req); @@ -676,38 +661,39 @@ void fuse_force_forget(struct file *file, u64 nodeid) * anything that could cause a page-fault. If the request was already * aborted bail out. */ -static int lock_request(struct fuse_conn *fc, struct fuse_req *req) +static int lock_request(struct fuse_req *req) { int err = 0; if (req) { - spin_lock(&fc->lock); - if (req->aborted) + spin_lock(&req->waitq.lock); + if (test_bit(FR_ABORTED, &req->flags)) err = -ENOENT; else - req->locked = 1; - spin_unlock(&fc->lock); + set_bit(FR_LOCKED, &req->flags); + spin_unlock(&req->waitq.lock); } return err; } /* - * Unlock request. If it was aborted during being locked, the - * requester thread is currently waiting for it to be unlocked, so - * wake it up. + * Unlock request. If it was aborted while locked, caller is responsible + * for unlocking and ending the request. */ -static void unlock_request(struct fuse_conn *fc, struct fuse_req *req) +static int unlock_request(struct fuse_req *req) { + int err = 0; if (req) { - spin_lock(&fc->lock); - req->locked = 0; - if (req->aborted) - wake_up(&req->waitq); - spin_unlock(&fc->lock); + spin_lock(&req->waitq.lock); + if (test_bit(FR_ABORTED, &req->flags)) + err = -ENOENT; + else + clear_bit(FR_LOCKED, &req->flags); + spin_unlock(&req->waitq.lock); } + return err; } struct fuse_copy_state { - struct fuse_conn *fc; int write; struct fuse_req *req; struct iov_iter *iter; @@ -721,13 +707,10 @@ struct fuse_copy_state { unsigned move_pages:1; }; -static void fuse_copy_init(struct fuse_copy_state *cs, - struct fuse_conn *fc, - int write, +static void fuse_copy_init(struct fuse_copy_state *cs, int write, struct iov_iter *iter) { memset(cs, 0, sizeof(*cs)); - cs->fc = fc; cs->write = write; cs->iter = iter; } @@ -760,7 +743,10 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) struct page *page; int err; - unlock_request(cs->fc, cs->req); + err = unlock_request(cs->req); + if (err) + return err; + fuse_copy_finish(cs); if (cs->pipebufs) { struct pipe_buffer *buf = cs->pipebufs; @@ -809,7 +795,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) iov_iter_advance(cs->iter, err); } - return lock_request(cs->fc, cs->req); + return lock_request(cs->req); } /* Do as much copy to/from userspace buffer as we can */ @@ -860,7 +846,10 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) struct page *newpage; struct pipe_buffer *buf = cs->pipebufs; - unlock_request(cs->fc, cs->req); + err = unlock_request(cs->req); + if (err) + return err; + fuse_copy_finish(cs); err = buf->ops->confirm(cs->pipe, buf); @@ -914,12 +903,12 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) lru_cache_add_file(newpage); err = 0; - spin_lock(&cs->fc->lock); - if (cs->req->aborted) + spin_lock(&cs->req->waitq.lock); + if (test_bit(FR_ABORTED, &cs->req->flags)) err = -ENOENT; else *pagep = newpage; - spin_unlock(&cs->fc->lock); + spin_unlock(&cs->req->waitq.lock); if (err) { unlock_page(newpage); @@ -939,7 +928,7 @@ out_fallback: cs->pg = buf->page; cs->offset = buf->offset; - err = lock_request(cs->fc, cs->req); + err = lock_request(cs->req); if (err) return err; @@ -950,11 +939,15 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page, unsigned offset, unsigned count) { struct pipe_buffer *buf; + int err; if (cs->nr_segs == cs->pipe->buffers) return -EIO; - unlock_request(cs->fc, cs->req); + err = unlock_request(cs->req); + if (err) + return err; + fuse_copy_finish(cs); buf = cs->pipebufs; @@ -1065,36 +1058,15 @@ static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs, return err; } -static int forget_pending(struct fuse_conn *fc) +static int forget_pending(struct fuse_iqueue *fiq) { - return fc->forget_list_head.next != NULL; + return fiq->forget_list_head.next != NULL; } -static int request_pending(struct fuse_conn *fc) +static int request_pending(struct fuse_iqueue *fiq) { - return !list_empty(&fc->pending) || !list_empty(&fc->interrupts) || - forget_pending(fc); -} - -/* Wait until a request is available on the pending list */ -static void request_wait(struct fuse_conn *fc) -__releases(fc->lock) -__acquires(fc->lock) -{ - DECLARE_WAITQUEUE(wait, current); - - add_wait_queue_exclusive(&fc->waitq, &wait); - while (fc->connected && !request_pending(fc)) { - set_current_state(TASK_INTERRUPTIBLE); - if (signal_pending(current)) - break; - - spin_unlock(&fc->lock); - schedule(); - spin_lock(&fc->lock); - } - set_current_state(TASK_RUNNING); - remove_wait_queue(&fc->waitq, &wait); + return !list_empty(&fiq->pending) || !list_empty(&fiq->interrupts) || + forget_pending(fiq); } /* @@ -1103,11 +1075,12 @@ __acquires(fc->lock) * Unlike other requests this is assembled on demand, without a need * to allocate a separate fuse_req structure. * - * Called with fc->lock held, releases it + * Called with fiq->waitq.lock held, releases it */ -static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_copy_state *cs, +static int fuse_read_interrupt(struct fuse_iqueue *fiq, + struct fuse_copy_state *cs, size_t nbytes, struct fuse_req *req) -__releases(fc->lock) +__releases(fiq->waitq.lock) { struct fuse_in_header ih; struct fuse_interrupt_in arg; @@ -1115,7 +1088,7 @@ __releases(fc->lock) int err; list_del_init(&req->intr_entry); - req->intr_unique = fuse_get_unique(fc); + req->intr_unique = fuse_get_unique(fiq); memset(&ih, 0, sizeof(ih)); memset(&arg, 0, sizeof(arg)); ih.len = reqsize; @@ -1123,7 +1096,7 @@ __releases(fc->lock) ih.unique = req->intr_unique; arg.unique = req->in.h.unique; - spin_unlock(&fc->lock); + spin_unlock(&fiq->waitq.lock); if (nbytes < reqsize) return -EINVAL; @@ -1135,21 +1108,21 @@ __releases(fc->lock) return err ? err : reqsize; } -static struct fuse_forget_link *dequeue_forget(struct fuse_conn *fc, +static struct fuse_forget_link *dequeue_forget(struct fuse_iqueue *fiq, unsigned max, unsigned *countp) { - struct fuse_forget_link *head = fc->forget_list_head.next; + struct fuse_forget_link *head = fiq->forget_list_head.next; struct fuse_forget_link **newhead = &head; unsigned count; for (count = 0; *newhead != NULL && count < max; count++) newhead = &(*newhead)->next; - fc->forget_list_head.next = *newhead; + fiq->forget_list_head.next = *newhead; *newhead = NULL; - if (fc->forget_list_head.next == NULL) - fc->forget_list_tail = &fc->forget_list_head; + if (fiq->forget_list_head.next == NULL) + fiq->forget_list_tail = &fiq->forget_list_head; if (countp != NULL) *countp = count; @@ -1157,24 +1130,24 @@ static struct fuse_forget_link *dequeue_forget(struct fuse_conn *fc, return head; } -static int fuse_read_single_forget(struct fuse_conn *fc, +static int fuse_read_single_forget(struct fuse_iqueue *fiq, struct fuse_copy_state *cs, size_t nbytes) -__releases(fc->lock) +__releases(fiq->waitq.lock) { int err; - struct fuse_forget_link *forget = dequeue_forget(fc, 1, NULL); + struct fuse_forget_link *forget = dequeue_forget(fiq, 1, NULL); struct fuse_forget_in arg = { .nlookup = forget->forget_one.nlookup, }; struct fuse_in_header ih = { .opcode = FUSE_FORGET, .nodeid = forget->forget_one.nodeid, - .unique = fuse_get_unique(fc), + .unique = fuse_get_unique(fiq), .len = sizeof(ih) + sizeof(arg), }; - spin_unlock(&fc->lock); + spin_unlock(&fiq->waitq.lock); kfree(forget); if (nbytes < ih.len) return -EINVAL; @@ -1190,9 +1163,9 @@ __releases(fc->lock) return ih.len; } -static int fuse_read_batch_forget(struct fuse_conn *fc, +static int fuse_read_batch_forget(struct fuse_iqueue *fiq, struct fuse_copy_state *cs, size_t nbytes) -__releases(fc->lock) +__releases(fiq->waitq.lock) { int err; unsigned max_forgets; @@ -1201,18 +1174,18 @@ __releases(fc->lock) struct fuse_batch_forget_in arg = { .count = 0 }; struct fuse_in_header ih = { .opcode = FUSE_BATCH_FORGET, - .unique = fuse_get_unique(fc), + .unique = fuse_get_unique(fiq), .len = sizeof(ih) + sizeof(arg), }; if (nbytes < ih.len) { - spin_unlock(&fc->lock); + spin_unlock(&fiq->waitq.lock); return -EINVAL; } max_forgets = (nbytes - ih.len) / sizeof(struct fuse_forget_one); - head = dequeue_forget(fc, max_forgets, &count); - spin_unlock(&fc->lock); + head = dequeue_forget(fiq, max_forgets, &count); + spin_unlock(&fiq->waitq.lock); arg.count = count; ih.len += count * sizeof(struct fuse_forget_one); @@ -1239,14 +1212,15 @@ __releases(fc->lock) return ih.len; } -static int fuse_read_forget(struct fuse_conn *fc, struct fuse_copy_state *cs, +static int fuse_read_forget(struct fuse_conn *fc, struct fuse_iqueue *fiq, + struct fuse_copy_state *cs, size_t nbytes) -__releases(fc->lock) +__releases(fiq->waitq.lock) { - if (fc->minor < 16 || fc->forget_list_head.next->next == NULL) - return fuse_read_single_forget(fc, cs, nbytes); + if (fc->minor < 16 || fiq->forget_list_head.next->next == NULL) + return fuse_read_single_forget(fiq, cs, nbytes); else - return fuse_read_batch_forget(fc, cs, nbytes); + return fuse_read_batch_forget(fiq, cs, nbytes); } /* @@ -1258,46 +1232,51 @@ __releases(fc->lock) * request_end(). Otherwise add it to the processing list, and set * the 'sent' flag. */ -static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file, +static ssize_t fuse_dev_do_read(struct fuse_dev *fud, struct file *file, struct fuse_copy_state *cs, size_t nbytes) { - int err; + ssize_t err; + struct fuse_conn *fc = fud->fc; + struct fuse_iqueue *fiq = &fc->iq; + struct fuse_pqueue *fpq = &fud->pq; struct fuse_req *req; struct fuse_in *in; unsigned reqsize; restart: - spin_lock(&fc->lock); + spin_lock(&fiq->waitq.lock); err = -EAGAIN; - if ((file->f_flags & O_NONBLOCK) && fc->connected && - !request_pending(fc)) + if ((file->f_flags & O_NONBLOCK) && fiq->connected && + !request_pending(fiq)) goto err_unlock; - request_wait(fc); - err = -ENODEV; - if (!fc->connected) + err = wait_event_interruptible_exclusive_locked(fiq->waitq, + !fiq->connected || request_pending(fiq)); + if (err) goto err_unlock; - err = -ERESTARTSYS; - if (!request_pending(fc)) + + err = -ENODEV; + if (!fiq->connected) goto err_unlock; - if (!list_empty(&fc->interrupts)) { - req = list_entry(fc->interrupts.next, struct fuse_req, + if (!list_empty(&fiq->interrupts)) { + req = list_entry(fiq->interrupts.next, struct fuse_req, intr_entry); - return fuse_read_interrupt(fc, cs, nbytes, req); + return fuse_read_interrupt(fiq, cs, nbytes, req); } - if (forget_pending(fc)) { - if (list_empty(&fc->pending) || fc->forget_batch-- > 0) - return fuse_read_forget(fc, cs, nbytes); + if (forget_pending(fiq)) { + if (list_empty(&fiq->pending) || fiq->forget_batch-- > 0) + return fuse_read_forget(fc, fiq, cs, nbytes); - if (fc->forget_batch <= -8) - fc->forget_batch = 16; + if (fiq->forget_batch <= -8) + fiq->forget_batch = 16; } - req = list_entry(fc->pending.next, struct fuse_req, list); - req->state = FUSE_REQ_READING; - list_move(&req->list, &fc->io); + req = list_entry(fiq->pending.next, struct fuse_req, list); + clear_bit(FR_PENDING, &req->flags); + list_del_init(&req->list); + spin_unlock(&fiq->waitq.lock); in = &req->in; reqsize = in->h.len; @@ -1310,37 +1289,48 @@ static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file, request_end(fc, req); goto restart; } - spin_unlock(&fc->lock); + spin_lock(&fpq->lock); + list_add(&req->list, &fpq->io); + spin_unlock(&fpq->lock); cs->req = req; err = fuse_copy_one(cs, &in->h, sizeof(in->h)); if (!err) err = fuse_copy_args(cs, in->numargs, in->argpages, (struct fuse_arg *) in->args, 0); fuse_copy_finish(cs); - spin_lock(&fc->lock); - req->locked = 0; - if (req->aborted) { - request_end(fc, req); - return -ENODEV; + spin_lock(&fpq->lock); + clear_bit(FR_LOCKED, &req->flags); + if (!fpq->connected) { + err = -ENODEV; + goto out_end; } if (err) { req->out.h.error = -EIO; - request_end(fc, req); - return err; + goto out_end; } - if (!req->isreply) - request_end(fc, req); - else { - req->state = FUSE_REQ_SENT; - list_move_tail(&req->list, &fc->processing); - if (req->interrupted) - queue_interrupt(fc, req); - spin_unlock(&fc->lock); + if (!test_bit(FR_ISREPLY, &req->flags)) { + err = reqsize; + goto out_end; } + list_move_tail(&req->list, &fpq->processing); + spin_unlock(&fpq->lock); + set_bit(FR_SENT, &req->flags); + /* matches barrier in request_wait_answer() */ + smp_mb__after_atomic(); + if (test_bit(FR_INTERRUPTED, &req->flags)) + queue_interrupt(fiq, req); + return reqsize; +out_end: + if (!test_bit(FR_PRIVATE, &req->flags)) + list_del_init(&req->list); + spin_unlock(&fpq->lock); + request_end(fc, req); + return err; + err_unlock: - spin_unlock(&fc->lock); + spin_unlock(&fiq->waitq.lock); return err; } @@ -1359,16 +1349,17 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, struct iov_iter *to) { struct fuse_copy_state cs; struct file *file = iocb->ki_filp; - struct fuse_conn *fc = fuse_get_conn(file); - if (!fc) + struct fuse_dev *fud = fuse_get_dev(file); + + if (!fud) return -EPERM; if (!iter_is_iovec(to)) return -EINVAL; - fuse_copy_init(&cs, fc, 1, to); + fuse_copy_init(&cs, 1, to); - return fuse_dev_do_read(fc, file, &cs, iov_iter_count(to)); + return fuse_dev_do_read(fud, file, &cs, iov_iter_count(to)); } static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, @@ -1380,18 +1371,19 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, int do_wakeup = 0; struct pipe_buffer *bufs; struct fuse_copy_state cs; - struct fuse_conn *fc = fuse_get_conn(in); - if (!fc) + struct fuse_dev *fud = fuse_get_dev(in); + + if (!fud) return -EPERM; bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL); if (!bufs) return -ENOMEM; - fuse_copy_init(&cs, fc, 1, NULL); + fuse_copy_init(&cs, 1, NULL); cs.pipebufs = bufs; cs.pipe = pipe; - ret = fuse_dev_do_read(fc, in, &cs, len); + ret = fuse_dev_do_read(fud, in, &cs, len); if (ret < 0) goto out; @@ -1830,11 +1822,11 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, } /* Look up request on processing list by unique ID */ -static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique) +static struct fuse_req *request_find(struct fuse_pqueue *fpq, u64 unique) { struct fuse_req *req; - list_for_each_entry(req, &fc->processing, list) { + list_for_each_entry(req, &fpq->processing, list) { if (req->in.h.unique == unique || req->intr_unique == unique) return req; } @@ -1871,10 +1863,12 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out, * it from the list and copy the rest of the buffer to the request. * The request is finished by calling request_end() */ -static ssize_t fuse_dev_do_write(struct fuse_conn *fc, +static ssize_t fuse_dev_do_write(struct fuse_dev *fud, struct fuse_copy_state *cs, size_t nbytes) { int err; + struct fuse_conn *fc = fud->fc; + struct fuse_pqueue *fpq = &fud->pq; struct fuse_req *req; struct fuse_out_header oh; @@ -1902,63 +1896,60 @@ static ssize_t fuse_dev_do_write(struct fuse_conn *fc, if (oh.error <= -1000 || oh.error > 0) goto err_finish; - spin_lock(&fc->lock); + spin_lock(&fpq->lock); err = -ENOENT; - if (!fc->connected) - goto err_unlock; + if (!fpq->connected) + goto err_unlock_pq; - req = request_find(fc, oh.unique); + req = request_find(fpq, oh.unique); if (!req) - goto err_unlock; + goto err_unlock_pq; - if (req->aborted) { - spin_unlock(&fc->lock); - fuse_copy_finish(cs); - spin_lock(&fc->lock); - request_end(fc, req); - return -ENOENT; - } /* Is it an interrupt reply? */ if (req->intr_unique == oh.unique) { + spin_unlock(&fpq->lock); + err = -EINVAL; if (nbytes != sizeof(struct fuse_out_header)) - goto err_unlock; + goto err_finish; if (oh.error == -ENOSYS) fc->no_interrupt = 1; else if (oh.error == -EAGAIN) - queue_interrupt(fc, req); + queue_interrupt(&fc->iq, req); - spin_unlock(&fc->lock); fuse_copy_finish(cs); return nbytes; } - req->state = FUSE_REQ_WRITING; - list_move(&req->list, &fc->io); + clear_bit(FR_SENT, &req->flags); + list_move(&req->list, &fpq->io); req->out.h = oh; - req->locked = 1; + set_bit(FR_LOCKED, &req->flags); + spin_unlock(&fpq->lock); cs->req = req; if (!req->out.page_replace) cs->move_pages = 0; - spin_unlock(&fc->lock); err = copy_out_args(cs, &req->out, nbytes); fuse_copy_finish(cs); - spin_lock(&fc->lock); - req->locked = 0; - if (!err) { - if (req->aborted) - err = -ENOENT; - } else if (!req->aborted) + spin_lock(&fpq->lock); + clear_bit(FR_LOCKED, &req->flags); + if (!fpq->connected) + err = -ENOENT; + else if (err) req->out.h.error = -EIO; + if (!test_bit(FR_PRIVATE, &req->flags)) + list_del_init(&req->list); + spin_unlock(&fpq->lock); + request_end(fc, req); return err ? err : nbytes; - err_unlock: - spin_unlock(&fc->lock); + err_unlock_pq: + spin_unlock(&fpq->lock); err_finish: fuse_copy_finish(cs); return err; @@ -1967,16 +1958,17 @@ static ssize_t fuse_dev_do_write(struct fuse_conn *fc, static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from) { struct fuse_copy_state cs; - struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp); - if (!fc) + struct fuse_dev *fud = fuse_get_dev(iocb->ki_filp); + + if (!fud) return -EPERM; if (!iter_is_iovec(from)) return -EINVAL; - fuse_copy_init(&cs, fc, 0, from); + fuse_copy_init(&cs, 0, from); - return fuse_dev_do_write(fc, &cs, iov_iter_count(from)); + return fuse_dev_do_write(fud, &cs, iov_iter_count(from)); } static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, @@ -1987,12 +1979,12 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, unsigned idx; struct pipe_buffer *bufs; struct fuse_copy_state cs; - struct fuse_conn *fc; + struct fuse_dev *fud; size_t rem; ssize_t ret; - fc = fuse_get_conn(out); - if (!fc) + fud = fuse_get_dev(out); + if (!fud) return -EPERM; bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL); @@ -2039,7 +2031,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, } pipe_unlock(pipe); - fuse_copy_init(&cs, fc, 0, NULL); + fuse_copy_init(&cs, 0, NULL); cs.pipebufs = bufs; cs.nr_segs = nbuf; cs.pipe = pipe; @@ -2047,7 +2039,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, if (flags & SPLICE_F_MOVE) cs.move_pages = 1; - ret = fuse_dev_do_write(fc, &cs, len); + ret = fuse_dev_do_write(fud, &cs, len); for (idx = 0; idx < nbuf; idx++) { struct pipe_buffer *buf = &bufs[idx]; @@ -2061,18 +2053,21 @@ out: static unsigned fuse_dev_poll(struct file *file, poll_table *wait) { unsigned mask = POLLOUT | POLLWRNORM; - struct fuse_conn *fc = fuse_get_conn(file); - if (!fc) + struct fuse_iqueue *fiq; + struct fuse_dev *fud = fuse_get_dev(file); + + if (!fud) return POLLERR; - poll_wait(file, &fc->waitq, wait); + fiq = &fud->fc->iq; + poll_wait(file, &fiq->waitq, wait); - spin_lock(&fc->lock); - if (!fc->connected) + spin_lock(&fiq->waitq.lock); + if (!fiq->connected) mask = POLLERR; - else if (request_pending(fc)) + else if (request_pending(fiq)) mask |= POLLIN | POLLRDNORM; - spin_unlock(&fc->lock); + spin_unlock(&fiq->waitq.lock); return mask; } @@ -2083,67 +2078,18 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait) * This function releases and reacquires fc->lock */ static void end_requests(struct fuse_conn *fc, struct list_head *head) -__releases(fc->lock) -__acquires(fc->lock) { while (!list_empty(head)) { struct fuse_req *req; req = list_entry(head->next, struct fuse_req, list); req->out.h.error = -ECONNABORTED; - request_end(fc, req); - spin_lock(&fc->lock); - } -} - -/* - * Abort requests under I/O - * - * The requests are set to aborted and finished, and the request - * waiter is woken up. This will make request_wait_answer() wait - * until the request is unlocked and then return. - * - * If the request is asynchronous, then the end function needs to be - * called after waiting for the request to be unlocked (if it was - * locked). - */ -static void end_io_requests(struct fuse_conn *fc) -__releases(fc->lock) -__acquires(fc->lock) -{ - while (!list_empty(&fc->io)) { - struct fuse_req *req = - list_entry(fc->io.next, struct fuse_req, list); - void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; - - req->aborted = 1; - req->out.h.error = -ECONNABORTED; - req->state = FUSE_REQ_FINISHED; + clear_bit(FR_PENDING, &req->flags); + clear_bit(FR_SENT, &req->flags); list_del_init(&req->list); - wake_up(&req->waitq); - if (end) { - req->end = NULL; - __fuse_get_request(req); - spin_unlock(&fc->lock); - wait_event(req->waitq, !req->locked); - end(fc, req); - fuse_put_request(fc, req); - spin_lock(&fc->lock); - } + request_end(fc, req); } } -static void end_queued_requests(struct fuse_conn *fc) -__releases(fc->lock) -__acquires(fc->lock) -{ - fc->max_background = UINT_MAX; - flush_bg_queue(fc); - end_requests(fc, &fc->pending); - end_requests(fc, &fc->processing); - while (forget_pending(fc)) - kfree(dequeue_forget(fc, 1, NULL)); -} - static void end_polls(struct fuse_conn *fc) { struct rb_node *p; @@ -2162,67 +2108,164 @@ static void end_polls(struct fuse_conn *fc) /* * Abort all requests. * - * Emergency exit in case of a malicious or accidental deadlock, or - * just a hung filesystem. - * - * The same effect is usually achievable through killing the - * filesystem daemon and all users of the filesystem. The exception - * is the combination of an asynchronous request and the tricky - * deadlock (see Documentation/filesystems/fuse.txt). + * Emergency exit in case of a malicious or accidental deadlock, or just a hung + * filesystem. * - * During the aborting, progression of requests from the pending and - * processing lists onto the io list, and progression of new requests - * onto the pending list is prevented by req->connected being false. + * The same effect is usually achievable through killing the filesystem daemon + * and all users of the filesystem. The exception is the combination of an + * asynchronous request and the tricky deadlock (see + * Documentation/filesystems/fuse.txt). * - * Progression of requests under I/O to the processing list is - * prevented by the req->aborted flag being true for these requests. - * For this reason requests on the io list must be aborted first. + * Aborting requests under I/O goes as follows: 1: Separate out unlocked + * requests, they should be finished off immediately. Locked requests will be + * finished after unlock; see unlock_request(). 2: Finish off the unlocked + * requests. It is possible that some request will finish before we can. This + * is OK, the request will in that case be removed from the list before we touch + * it. */ void fuse_abort_conn(struct fuse_conn *fc) { + struct fuse_iqueue *fiq = &fc->iq; + spin_lock(&fc->lock); if (fc->connected) { + struct fuse_dev *fud; + struct fuse_req *req, *next; + LIST_HEAD(to_end1); + LIST_HEAD(to_end2); + fc->connected = 0; fc->blocked = 0; fuse_set_initialized(fc); - end_io_requests(fc); - end_queued_requests(fc); + list_for_each_entry(fud, &fc->devices, entry) { + struct fuse_pqueue *fpq = &fud->pq; + + spin_lock(&fpq->lock); + fpq->connected = 0; + list_for_each_entry_safe(req, next, &fpq->io, list) { + req->out.h.error = -ECONNABORTED; + spin_lock(&req->waitq.lock); + set_bit(FR_ABORTED, &req->flags); + if (!test_bit(FR_LOCKED, &req->flags)) { + set_bit(FR_PRIVATE, &req->flags); + list_move(&req->list, &to_end1); + } + spin_unlock(&req->waitq.lock); + } + list_splice_init(&fpq->processing, &to_end2); + spin_unlock(&fpq->lock); + } + fc->max_background = UINT_MAX; + flush_bg_queue(fc); + + spin_lock(&fiq->waitq.lock); + fiq->connected = 0; + list_splice_init(&fiq->pending, &to_end2); + while (forget_pending(fiq)) + kfree(dequeue_forget(fiq, 1, NULL)); + wake_up_all_locked(&fiq->waitq); + spin_unlock(&fiq->waitq.lock); + kill_fasync(&fiq->fasync, SIGIO, POLL_IN); end_polls(fc); - wake_up_all(&fc->waitq); wake_up_all(&fc->blocked_waitq); - kill_fasync(&fc->fasync, SIGIO, POLL_IN); + spin_unlock(&fc->lock); + + while (!list_empty(&to_end1)) { + req = list_first_entry(&to_end1, struct fuse_req, list); + __fuse_get_request(req); + list_del_init(&req->list); + request_end(fc, req); + } + end_requests(fc, &to_end2); + } else { + spin_unlock(&fc->lock); } - spin_unlock(&fc->lock); } EXPORT_SYMBOL_GPL(fuse_abort_conn); int fuse_dev_release(struct inode *inode, struct file *file) { - struct fuse_conn *fc = fuse_get_conn(file); - if (fc) { - spin_lock(&fc->lock); - fc->connected = 0; - fc->blocked = 0; - fuse_set_initialized(fc); - end_queued_requests(fc); - end_polls(fc); - wake_up_all(&fc->blocked_waitq); - spin_unlock(&fc->lock); - fuse_conn_put(fc); - } + struct fuse_dev *fud = fuse_get_dev(file); + + if (fud) { + struct fuse_conn *fc = fud->fc; + struct fuse_pqueue *fpq = &fud->pq; + WARN_ON(!list_empty(&fpq->io)); + end_requests(fc, &fpq->processing); + /* Are we the last open device? */ + if (atomic_dec_and_test(&fc->dev_count)) { + WARN_ON(fc->iq.fasync != NULL); + fuse_abort_conn(fc); + } + fuse_dev_free(fud); + } return 0; } EXPORT_SYMBOL_GPL(fuse_dev_release); static int fuse_dev_fasync(int fd, struct file *file, int on) { - struct fuse_conn *fc = fuse_get_conn(file); - if (!fc) + struct fuse_dev *fud = fuse_get_dev(file); + + if (!fud) return -EPERM; /* No locking - fasync_helper does its own locking */ - return fasync_helper(fd, file, on, &fc->fasync); + return fasync_helper(fd, file, on, &fud->fc->iq.fasync); +} + +static int fuse_device_clone(struct fuse_conn *fc, struct file *new) +{ + struct fuse_dev *fud; + + if (new->private_data) + return -EINVAL; + + fud = fuse_dev_alloc(fc); + if (!fud) + return -ENOMEM; + + new->private_data = fud; + atomic_inc(&fc->dev_count); + + return 0; +} + +static long fuse_dev_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + int err = -ENOTTY; + + if (cmd == FUSE_DEV_IOC_CLONE) { + int oldfd; + + err = -EFAULT; + if (!get_user(oldfd, (__u32 __user *) arg)) { + struct file *old = fget(oldfd); + + err = -EINVAL; + if (old) { + struct fuse_dev *fud = NULL; + + /* + * Check against file->f_op because CUSE + * uses the same ioctl handler. + */ + if (old->f_op == file->f_op && + old->f_cred->user_ns == file->f_cred->user_ns) + fud = fuse_get_dev(old); + + if (fud) { + mutex_lock(&fuse_mutex); + err = fuse_device_clone(fud->fc, file); + mutex_unlock(&fuse_mutex); + } + fput(old); + } + } + } + return err; } const struct file_operations fuse_dev_operations = { @@ -2236,6 +2279,8 @@ const struct file_operations fuse_dev_operations = { .poll = fuse_dev_poll, .release = fuse_dev_release, .fasync = fuse_dev_fasync, + .unlocked_ioctl = fuse_dev_ioctl, + .compat_ioctl = fuse_dev_ioctl, }; EXPORT_SYMBOL_GPL(fuse_dev_operations); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 0572bca49..5e2e08712 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1365,7 +1365,7 @@ static int fuse_readdir(struct file *file, struct dir_context *ctx) return err; } -static char *read_link(struct dentry *dentry) +static const char *fuse_follow_link(struct dentry *dentry, void **cookie) { struct inode *inode = d_inode(dentry); struct fuse_conn *fc = get_fuse_conn(inode); @@ -1389,28 +1389,12 @@ static char *read_link(struct dentry *dentry) link = ERR_PTR(ret); } else { link[ret] = '\0'; + *cookie = link; } fuse_invalidate_atime(inode); return link; } -static void free_link(char *link) -{ - if (!IS_ERR(link)) - free_page((unsigned long) link); -} - -static void *fuse_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - nd_set_link(nd, read_link(dentry)); - return NULL; -} - -static void fuse_put_link(struct dentry *dentry, struct nameidata *nd, void *c) -{ - free_link(nd_get_link(nd)); -} - static int fuse_dir_open(struct inode *inode, struct file *file) { return fuse_open_common(inode, file, true); @@ -1926,7 +1910,7 @@ static const struct inode_operations fuse_common_inode_operations = { static const struct inode_operations fuse_symlink_inode_operations = { .setattr = fuse_setattr, .follow_link = fuse_follow_link, - .put_link = fuse_put_link, + .put_link = free_page_put_link, .readlink = generic_readlink, .getattr = fuse_getattr, .setxattr = fuse_setxattr, diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 5ef05b5c4..f523f2f04 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -96,17 +96,17 @@ static void fuse_file_put(struct fuse_file *ff, bool sync) * Drop the release request when client does not * implement 'open' */ - req->background = 0; + __clear_bit(FR_BACKGROUND, &req->flags); iput(req->misc.release.inode); fuse_put_request(ff->fc, req); } else if (sync) { - req->background = 0; + __clear_bit(FR_BACKGROUND, &req->flags); fuse_request_send(ff->fc, req); iput(req->misc.release.inode); fuse_put_request(ff->fc, req); } else { req->end = fuse_release_end; - req->background = 1; + __set_bit(FR_BACKGROUND, &req->flags); fuse_request_send_background(ff->fc, req); } kfree(ff); @@ -299,8 +299,8 @@ void fuse_sync_release(struct fuse_file *ff, int flags) { WARN_ON(atomic_read(&ff->count) > 1); fuse_prepare_release(ff, flags, FUSE_RELEASE); - ff->reserved_req->force = 1; - ff->reserved_req->background = 0; + __set_bit(FR_FORCE, &ff->reserved_req->flags); + __clear_bit(FR_BACKGROUND, &ff->reserved_req->flags); fuse_request_send(ff->fc, ff->reserved_req); fuse_put_request(ff->fc, ff->reserved_req); kfree(ff); @@ -426,7 +426,7 @@ static int fuse_flush(struct file *file, fl_owner_t id) req->in.numargs = 1; req->in.args[0].size = sizeof(inarg); req->in.args[0].value = &inarg; - req->force = 1; + __set_bit(FR_FORCE, &req->flags); fuse_request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); @@ -1169,7 +1169,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (err <= 0) goto out; - err = file_remove_suid(file); + err = file_remove_privs(file); if (err) goto out; @@ -1445,9 +1445,9 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) list_del(&req->writepages_entry); for (i = 0; i < req->num_pages; i++) { - dec_bdi_stat(bdi, BDI_WRITEBACK); + dec_wb_stat(&bdi->wb, WB_WRITEBACK); dec_zone_page_state(req->pages[i], NR_WRITEBACK_TEMP); - bdi_writeout_inc(bdi); + wb_writeout_inc(&bdi->wb); } wake_up(&fi->page_waitq); } @@ -1611,7 +1611,8 @@ static int fuse_writepage_locked(struct page *page) if (!req) goto err; - req->background = 1; /* writeback always goes to bg_queue */ + /* writeback always goes to bg_queue */ + __set_bit(FR_BACKGROUND, &req->flags); tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); if (!tmp_page) goto err_free; @@ -1634,7 +1635,7 @@ static int fuse_writepage_locked(struct page *page) req->end = fuse_writepage_end; req->inode = inode; - inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK); + inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); spin_lock(&fc->lock); @@ -1742,16 +1743,15 @@ static bool fuse_writepage_in_flight(struct fuse_req *new_req, } } - if (old_req->num_pages == 1 && (old_req->state == FUSE_REQ_INIT || - old_req->state == FUSE_REQ_PENDING)) { + if (old_req->num_pages == 1 && test_bit(FR_PENDING, &old_req->flags)) { struct backing_dev_info *bdi = inode_to_bdi(page->mapping->host); copy_highpage(old_req->pages[0], page); spin_unlock(&fc->lock); - dec_bdi_stat(bdi, BDI_WRITEBACK); + dec_wb_stat(&bdi->wb, WB_WRITEBACK); dec_zone_page_state(page, NR_WRITEBACK_TEMP); - bdi_writeout_inc(bdi); + wb_writeout_inc(&bdi->wb); fuse_writepage_free(fc, new_req); fuse_request_free(new_req); goto out; @@ -1830,7 +1830,7 @@ static int fuse_writepages_fill(struct page *page, req->misc.write.in.write_flags |= FUSE_WRITE_CACHE; req->misc.write.next = NULL; req->in.argpages = 1; - req->background = 1; + __set_bit(FR_BACKGROUND, &req->flags); req->num_pages = 0; req->end = fuse_writepage_end; req->inode = inode; @@ -1848,7 +1848,7 @@ static int fuse_writepages_fill(struct page *page, req->page_descs[req->num_pages].offset = 0; req->page_descs[req->num_pages].length = PAGE_SIZE; - inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK); + inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); err = 0; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 7354dc142..405113101 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -241,16 +241,6 @@ struct fuse_args { #define FUSE_ARGS(args) struct fuse_args args = {} -/** The request state */ -enum fuse_req_state { - FUSE_REQ_INIT = 0, - FUSE_REQ_PENDING, - FUSE_REQ_READING, - FUSE_REQ_SENT, - FUSE_REQ_WRITING, - FUSE_REQ_FINISHED -}; - /** The request IO state (for asynchronous processing) */ struct fuse_io_priv { int async; @@ -267,7 +257,40 @@ struct fuse_io_priv { }; /** + * Request flags + * + * FR_ISREPLY: set if the request has reply + * FR_FORCE: force sending of the request even if interrupted + * FR_BACKGROUND: request is sent in the background + * FR_WAITING: request is counted as "waiting" + * FR_ABORTED: the request was aborted + * FR_INTERRUPTED: the request has been interrupted + * FR_LOCKED: data is being copied to/from the request + * FR_PENDING: request is not yet in userspace + * FR_SENT: request is in userspace, waiting for an answer + * FR_FINISHED: request is finished + * FR_PRIVATE: request is on private list + */ +enum fuse_req_flag { + FR_ISREPLY, + FR_FORCE, + FR_BACKGROUND, + FR_WAITING, + FR_ABORTED, + FR_INTERRUPTED, + FR_LOCKED, + FR_PENDING, + FR_SENT, + FR_FINISHED, + FR_PRIVATE, +}; + +/** * A request to the client + * + * .waitq.lock protects the following fields: + * - FR_ABORTED + * - FR_LOCKED (may also be modified under fc->lock, tested under both) */ struct fuse_req { /** This can be on either pending processing or io lists in @@ -283,35 +306,8 @@ struct fuse_req { /** Unique ID for the interrupt request */ u64 intr_unique; - /* - * The following bitfields are either set once before the - * request is queued or setting/clearing them is protected by - * fuse_conn->lock - */ - - /** True if the request has reply */ - unsigned isreply:1; - - /** Force sending of the request even if interrupted */ - unsigned force:1; - - /** The request was aborted */ - unsigned aborted:1; - - /** Request is sent in the background */ - unsigned background:1; - - /** The request has been interrupted */ - unsigned interrupted:1; - - /** Data is being copied to/from the request */ - unsigned locked:1; - - /** Request is counted as "waiting" */ - unsigned waiting:1; - - /** State of the request */ - enum fuse_req_state state; + /* Request flags, updated with test/set/clear_bit() */ + unsigned long flags; /** The request input */ struct fuse_in in; @@ -380,6 +376,61 @@ struct fuse_req { struct file *stolen_file; }; +struct fuse_iqueue { + /** Connection established */ + unsigned connected; + + /** Readers of the connection are waiting on this */ + wait_queue_head_t waitq; + + /** The next unique request id */ + u64 reqctr; + + /** The list of pending requests */ + struct list_head pending; + + /** Pending interrupts */ + struct list_head interrupts; + + /** Queue of pending forgets */ + struct fuse_forget_link forget_list_head; + struct fuse_forget_link *forget_list_tail; + + /** Batching of FORGET requests (positive indicates FORGET batch) */ + int forget_batch; + + /** O_ASYNC requests */ + struct fasync_struct *fasync; +}; + +struct fuse_pqueue { + /** Connection established */ + unsigned connected; + + /** Lock protecting accessess to members of this structure */ + spinlock_t lock; + + /** The list of requests being processed */ + struct list_head processing; + + /** The list of requests under I/O */ + struct list_head io; +}; + +/** + * Fuse device instance + */ +struct fuse_dev { + /** Fuse connection for this device */ + struct fuse_conn *fc; + + /** Processing queue */ + struct fuse_pqueue pq; + + /** list entry on fc->devices */ + struct list_head entry; +}; + /** * A Fuse connection. * @@ -394,6 +445,9 @@ struct fuse_conn { /** Refcount */ atomic_t count; + /** Number of fuse_dev's */ + atomic_t dev_count; + struct rcu_head rcu; /** The user id for this mount */ @@ -411,17 +465,8 @@ struct fuse_conn { /** Maximum write size */ unsigned max_write; - /** Readers of the connection are waiting on this */ - wait_queue_head_t waitq; - - /** The list of pending requests */ - struct list_head pending; - - /** The list of requests being processed */ - struct list_head processing; - - /** The list of requests under I/O */ - struct list_head io; + /** Input queue */ + struct fuse_iqueue iq; /** The next unique kernel file handle */ u64 khctr; @@ -444,16 +489,6 @@ struct fuse_conn { /** The list of background requests set aside for later queuing */ struct list_head bg_queue; - /** Pending interrupts */ - struct list_head interrupts; - - /** Queue of pending forgets */ - struct fuse_forget_link forget_list_head; - struct fuse_forget_link *forget_list_tail; - - /** Batching of FORGET requests (positive indicates FORGET batch) */ - int forget_batch; - /** Flag indicating that INIT reply has been received. Allocating * any fuse request will be suspended until the flag is set */ int initialized; @@ -469,9 +504,6 @@ struct fuse_conn { /** waitq for reserved requests */ wait_queue_head_t reserved_req_waitq; - /** The next unique request id */ - u64 reqctr; - /** Connection established, cleared on umount, connection abort and device release */ unsigned connected; @@ -594,9 +626,6 @@ struct fuse_conn { /** number of dentries used in the above array */ int ctl_ndents; - /** O_ASYNC requests */ - struct fasync_struct *fasync; - /** Key for lock owner ID scrambling */ u32 scramble_key[4]; @@ -614,6 +643,9 @@ struct fuse_conn { /** Read/write semaphore to hold when accessing sb. */ struct rw_semaphore killsb; + + /** List of device instances belonging to this connection */ + struct list_head devices; }; static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) @@ -826,6 +858,9 @@ void fuse_conn_init(struct fuse_conn *fc); */ void fuse_conn_put(struct fuse_conn *fc); +struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc); +void fuse_dev_free(struct fuse_dev *fud); + /** * Add connection to control filesystem */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 708d69711..2913db2a5 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -362,8 +362,8 @@ static void fuse_send_destroy(struct fuse_conn *fc) if (req && fc->conn_init) { fc->destroy_req = NULL; req->in.h.opcode = FUSE_DESTROY; - req->force = 1; - req->background = 0; + __set_bit(FR_FORCE, &req->flags); + __clear_bit(FR_BACKGROUND, &req->flags); fuse_request_send(fc, req); fuse_put_request(fc, req); } @@ -567,30 +567,46 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root) return 0; } +static void fuse_iqueue_init(struct fuse_iqueue *fiq) +{ + memset(fiq, 0, sizeof(struct fuse_iqueue)); + init_waitqueue_head(&fiq->waitq); + INIT_LIST_HEAD(&fiq->pending); + INIT_LIST_HEAD(&fiq->interrupts); + fiq->forget_list_tail = &fiq->forget_list_head; + fiq->connected = 1; +} + +static void fuse_pqueue_init(struct fuse_pqueue *fpq) +{ + memset(fpq, 0, sizeof(struct fuse_pqueue)); + spin_lock_init(&fpq->lock); + INIT_LIST_HEAD(&fpq->processing); + INIT_LIST_HEAD(&fpq->io); + fpq->connected = 1; +} + void fuse_conn_init(struct fuse_conn *fc) { memset(fc, 0, sizeof(*fc)); spin_lock_init(&fc->lock); init_rwsem(&fc->killsb); atomic_set(&fc->count, 1); - init_waitqueue_head(&fc->waitq); + atomic_set(&fc->dev_count, 1); init_waitqueue_head(&fc->blocked_waitq); init_waitqueue_head(&fc->reserved_req_waitq); - INIT_LIST_HEAD(&fc->pending); - INIT_LIST_HEAD(&fc->processing); - INIT_LIST_HEAD(&fc->io); - INIT_LIST_HEAD(&fc->interrupts); + fuse_iqueue_init(&fc->iq); INIT_LIST_HEAD(&fc->bg_queue); INIT_LIST_HEAD(&fc->entry); - fc->forget_list_tail = &fc->forget_list_head; + INIT_LIST_HEAD(&fc->devices); atomic_set(&fc->num_waiting, 0); fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND; fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD; fc->khctr = 0; fc->polled_files = RB_ROOT; - fc->reqctr = 0; fc->blocked = 0; fc->initialized = 0; + fc->connected = 1; fc->attr_version = 1; get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); } @@ -930,6 +946,7 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) static void fuse_free_conn(struct fuse_conn *fc) { + WARN_ON(!list_empty(&fc->devices)); kfree_rcu(fc, rcu); } @@ -975,8 +992,42 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) return 0; } +struct fuse_dev *fuse_dev_alloc(struct fuse_conn *fc) +{ + struct fuse_dev *fud; + + fud = kzalloc(sizeof(struct fuse_dev), GFP_KERNEL); + if (fud) { + fud->fc = fuse_conn_get(fc); + fuse_pqueue_init(&fud->pq); + + spin_lock(&fc->lock); + list_add_tail(&fud->entry, &fc->devices); + spin_unlock(&fc->lock); + } + + return fud; +} +EXPORT_SYMBOL_GPL(fuse_dev_alloc); + +void fuse_dev_free(struct fuse_dev *fud) +{ + struct fuse_conn *fc = fud->fc; + + if (fc) { + spin_lock(&fc->lock); + list_del(&fud->entry); + spin_unlock(&fc->lock); + + fuse_conn_put(fc); + } + kfree(fud); +} +EXPORT_SYMBOL_GPL(fuse_dev_free); + static int fuse_fill_super(struct super_block *sb, void *data, int silent) { + struct fuse_dev *fud; struct fuse_conn *fc; struct inode *root; struct fuse_mount_data d; @@ -1028,11 +1079,15 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) fuse_conn_init(fc); fc->release = fuse_free_conn; + fud = fuse_dev_alloc(fc); + if (!fud) + goto err_put_conn; + fc->dev = sb->s_dev; fc->sb = sb; err = fuse_bdi_init(fc, sb); if (err) - goto err_put_conn; + goto err_dev_free; sb->s_bdi = &fc->bdi; @@ -1053,14 +1108,14 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) root = fuse_get_root_inode(sb, d.rootmode); root_dentry = d_make_root(root); if (!root_dentry) - goto err_put_conn; + goto err_dev_free; /* only now - we want root dentry with NULL ->d_op */ sb->s_d_op = &fuse_dentry_operations; init_req = fuse_request_alloc(0); if (!init_req) goto err_put_root; - init_req->background = 1; + __set_bit(FR_BACKGROUND, &init_req->flags); if (is_bdev) { fc->destroy_req = fuse_request_alloc(0); @@ -1079,8 +1134,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) list_add_tail(&fc->entry, &fuse_conn_list); sb->s_root = root_dentry; - fc->connected = 1; - file->private_data = fuse_conn_get(fc); + file->private_data = fud; mutex_unlock(&fuse_mutex); /* * atomic_dec_and_test() in fput() provides the necessary @@ -1099,6 +1153,8 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) fuse_request_free(init_req); err_put_root: dput(root_dentry); + err_dev_free: + fuse_dev_free(fud); err_put_conn: fuse_bdi_destroy(fc); fuse_conn_put(fc); diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 5551fea0a..1caee0534 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -171,6 +171,7 @@ static int __gfs2_jdata_writepage(struct page *page, struct writeback_control *w /** * gfs2_jdata_writepage - Write complete page * @page: Page to write + * @wbc: The writeback control * * Returns: errno * @@ -221,9 +222,10 @@ static int gfs2_writepages(struct address_space *mapping, * gfs2_write_jdata_pagevec - Write back a pagevec's worth of pages * @mapping: The mapping * @wbc: The writeback control - * @writepage: The writepage function to call for each page * @pvec: The vector of pages * @nr_pages: The number of pages to write + * @end: End position + * @done_index: Page index * * Returns: non-zero if loop should terminate, zero otherwise */ @@ -333,8 +335,6 @@ continue_unlock: * gfs2_write_cache_jdata - Like write_cache_pages but different * @mapping: The mapping to write * @wbc: The writeback control - * @writepage: The writepage function to call - * @data: The data to pass to writepage * * The reason that we use our own function here is that we need to * start transactions before we grab page locks. This allows us @@ -588,6 +588,10 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos, /** * gfs2_readpages - Read a bunch of pages at once + * @file: The file to read from + * @mapping: Address space info + * @pages: List of pages to read + * @nr_pages: Number of pages to read * * Some notes: * 1. This is only for readahead, so we can simply ignore any things @@ -853,7 +857,7 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, * @mapping: The address space to write to * @pos: The file position * @len: The length of the data - * @copied: + * @copied: How much was actually copied by the VFS * @page: The page that has been written * @fsdata: The fsdata (unused in GFS2) * diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 31892871e..cf4ab8915 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -180,7 +180,7 @@ void gfs2_set_inode_flags(struct inode *inode) flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_NOSEC); if ((ip->i_eattr == 0) && !is_sxid(inode->i_mode)) - inode->i_flags |= S_NOSEC; + flags |= S_NOSEC; if (ip->i_diskflags & GFS2_DIF_IMMUTABLE) flags |= S_IMMUTABLE; if (ip->i_diskflags & GFS2_DIF_APPENDONLY) @@ -917,7 +917,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t le struct gfs2_holder gh; int ret; - if (mode & ~FALLOC_FL_KEEP_SIZE) + if ((mode & ~FALLOC_FL_KEEP_SIZE) || gfs2_is_jdata(ip)) return -EOPNOTSUPP; mutex_lock(&inode->i_mutex); diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 0fa8062f8..a38e38f7b 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1076,7 +1076,8 @@ void gfs2_glock_dq(struct gfs2_holder *gh) !test_bit(GLF_DEMOTE, &gl->gl_flags)) fast_path = 1; } - if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl)) + if (!test_bit(GLF_LFLUSH, &gl->gl_flags) && demote_ok(gl) && + (glops->go_flags & GLOF_LRU)) gfs2_glock_add_to_lru(gl); trace_gfs2_glock_queue(gh, 0); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index fe91951c3..fa3fa5e94 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -144,6 +144,12 @@ static void rgrp_go_sync(struct gfs2_glock *gl) struct gfs2_rgrpd *rgd; int error; + spin_lock(&gl->gl_spin); + rgd = gl->gl_object; + if (rgd) + gfs2_rgrp_brelse(rgd); + spin_unlock(&gl->gl_spin); + if (!test_and_clear_bit(GLF_DIRTY, &gl->gl_flags)) return; GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE); @@ -175,15 +181,17 @@ static void rgrp_go_inval(struct gfs2_glock *gl, int flags) { struct gfs2_sbd *sdp = gl->gl_sbd; struct address_space *mapping = &sdp->sd_aspace; + struct gfs2_rgrpd *rgd = gl->gl_object; + + if (rgd) + gfs2_rgrp_brelse(rgd); WARN_ON_ONCE(!(flags & DIO_METADATA)); gfs2_assert_withdraw(sdp, !atomic_read(&gl->gl_ail_count)); truncate_inode_pages_range(mapping, gl->gl_vm.start, gl->gl_vm.end); - if (gl->gl_object) { - struct gfs2_rgrpd *rgd = (struct gfs2_rgrpd *)gl->gl_object; + if (rgd) rgd->rd_flags &= ~GFS2_RDF_UPTODATE; - } } /** @@ -561,7 +569,7 @@ const struct gfs2_glock_operations gfs2_inode_glops = { .go_lock = inode_go_lock, .go_dump = inode_go_dump, .go_type = LM_TYPE_INODE, - .go_flags = GLOF_ASPACE, + .go_flags = GLOF_ASPACE | GLOF_LRU, }; const struct gfs2_glock_operations gfs2_rgrp_glops = { @@ -584,10 +592,12 @@ const struct gfs2_glock_operations gfs2_freeze_glops = { const struct gfs2_glock_operations gfs2_iopen_glops = { .go_type = LM_TYPE_IOPEN, .go_callback = iopen_go_callback, + .go_flags = GLOF_LRU, }; const struct gfs2_glock_operations gfs2_flock_glops = { .go_type = LM_TYPE_FLOCK, + .go_flags = GLOF_LRU, }; const struct gfs2_glock_operations gfs2_nondisk_glops = { @@ -596,7 +606,7 @@ const struct gfs2_glock_operations gfs2_nondisk_glops = { const struct gfs2_glock_operations gfs2_quota_glops = { .go_type = LM_TYPE_QUOTA, - .go_flags = GLOF_LVB, + .go_flags = GLOF_LVB | GLOF_LRU, }; const struct gfs2_glock_operations gfs2_journal_glops = { diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 58b75abf6..a1ec7c20e 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -225,6 +225,7 @@ struct gfs2_glock_operations { const unsigned long go_flags; #define GLOF_ASPACE 1 #define GLOF_LVB 2 +#define GLOF_LRU 4 }; enum { @@ -432,6 +433,7 @@ enum { QDF_CHANGE = 1, QDF_LOCKED = 2, QDF_REFRESH = 3, + QDF_QMSG_QUIET = 4, }; struct gfs2_quota_data { diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 1b3ca7a2e..063fdfcf8 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1227,8 +1227,8 @@ static int gfs2_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, */ static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry, - struct file *file, unsigned flags, - umode_t mode, int *opened) + struct file *file, unsigned flags, + umode_t mode, int *opened) { struct dentry *d; bool excl = !!(flags & O_EXCL); @@ -1307,6 +1307,35 @@ static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to) } /** + * update_moved_ino - Update an inode that's being moved + * @ip: The inode being moved + * @ndip: The parent directory of the new filename + * @dir_rename: True of ip is a directory + * + * Returns: errno + */ + +static int update_moved_ino(struct gfs2_inode *ip, struct gfs2_inode *ndip, + int dir_rename) +{ + int error; + struct buffer_head *dibh; + + if (dir_rename) + return gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR); + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + return error; + ip->i_inode.i_ctime = CURRENT_TIME; + gfs2_trans_add_meta(ip->i_gl, dibh); + gfs2_dinode_out(ip, dibh->b_data); + brelse(dibh); + return 0; +} + + +/** * gfs2_rename - Rename a file * @odir: Parent directory of old file name * @odentry: The old dentry of the file @@ -1354,7 +1383,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, if (S_ISDIR(ip->i_inode.i_mode)) { dir_rename = 1; - /* don't move a dirctory into it's subdir */ + /* don't move a directory into its subdir */ error = gfs2_ok_to_move(ip, ndip); if (error) goto out_gunlock_r; @@ -1494,20 +1523,9 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, if (nip) error = gfs2_unlink_inode(ndip, ndentry); - if (dir_rename) { - error = gfs2_dir_mvino(ip, &gfs2_qdotdot, ndip, DT_DIR); - if (error) - goto out_end_trans; - } else { - struct buffer_head *dibh; - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - goto out_end_trans; - ip->i_inode.i_ctime = CURRENT_TIME; - gfs2_trans_add_meta(ip->i_gl, dibh); - gfs2_dinode_out(ip, dibh->b_data); - brelse(dibh); - } + error = update_moved_ino(ip, ndip, dir_rename); + if (error) + goto out_end_trans; error = gfs2_dir_del(odip, odentry); if (error) @@ -1539,6 +1557,161 @@ out: } /** + * gfs2_exchange - exchange two files + * @odir: Parent directory of old file name + * @odentry: The old dentry of the file + * @ndir: Parent directory of new file name + * @ndentry: The new dentry of the file + * @flags: The rename flags + * + * Returns: errno + */ + +static int gfs2_exchange(struct inode *odir, struct dentry *odentry, + struct inode *ndir, struct dentry *ndentry, + unsigned int flags) +{ + struct gfs2_inode *odip = GFS2_I(odir); + struct gfs2_inode *ndip = GFS2_I(ndir); + struct gfs2_inode *oip = GFS2_I(odentry->d_inode); + struct gfs2_inode *nip = GFS2_I(ndentry->d_inode); + struct gfs2_sbd *sdp = GFS2_SB(odir); + struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }; + unsigned int num_gh; + unsigned int x; + umode_t old_mode = oip->i_inode.i_mode; + umode_t new_mode = nip->i_inode.i_mode; + int error; + + error = gfs2_rindex_update(sdp); + if (error) + return error; + + if (odip != ndip) { + error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE, + 0, &r_gh); + if (error) + goto out; + + if (S_ISDIR(old_mode)) { + /* don't move a directory into its subdir */ + error = gfs2_ok_to_move(oip, ndip); + if (error) + goto out_gunlock_r; + } + + if (S_ISDIR(new_mode)) { + /* don't move a directory into its subdir */ + error = gfs2_ok_to_move(nip, odip); + if (error) + goto out_gunlock_r; + } + } + + num_gh = 1; + gfs2_holder_init(odip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); + if (odip != ndip) { + gfs2_holder_init(ndip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); + num_gh++; + } + gfs2_holder_init(oip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); + num_gh++; + + gfs2_holder_init(nip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh); + num_gh++; + + for (x = 0; x < num_gh; x++) { + error = gfs2_glock_nq(ghs + x); + if (error) + goto out_gunlock; + } + + error = -ENOENT; + if (oip->i_inode.i_nlink == 0 || nip->i_inode.i_nlink == 0) + goto out_gunlock; + + error = gfs2_unlink_ok(odip, &odentry->d_name, oip); + if (error) + goto out_gunlock; + error = gfs2_unlink_ok(ndip, &ndentry->d_name, nip); + if (error) + goto out_gunlock; + + if (S_ISDIR(old_mode)) { + error = gfs2_permission(odentry->d_inode, MAY_WRITE); + if (error) + goto out_gunlock; + } + if (S_ISDIR(new_mode)) { + error = gfs2_permission(ndentry->d_inode, MAY_WRITE); + if (error) + goto out_gunlock; + } + error = gfs2_trans_begin(sdp, 4 * RES_DINODE + 4 * RES_LEAF, 0); + if (error) + goto out_gunlock; + + error = update_moved_ino(oip, ndip, S_ISDIR(old_mode)); + if (error) + goto out_end_trans; + + error = update_moved_ino(nip, odip, S_ISDIR(new_mode)); + if (error) + goto out_end_trans; + + error = gfs2_dir_mvino(ndip, &ndentry->d_name, oip, + IF2DT(old_mode)); + if (error) + goto out_end_trans; + + error = gfs2_dir_mvino(odip, &odentry->d_name, nip, + IF2DT(new_mode)); + if (error) + goto out_end_trans; + + if (odip != ndip) { + if (S_ISDIR(new_mode) && !S_ISDIR(old_mode)) { + inc_nlink(&odip->i_inode); + drop_nlink(&ndip->i_inode); + } else if (S_ISDIR(old_mode) && !S_ISDIR(new_mode)) { + inc_nlink(&ndip->i_inode); + drop_nlink(&odip->i_inode); + } + } + mark_inode_dirty(&ndip->i_inode); + if (odip != ndip) + mark_inode_dirty(&odip->i_inode); + +out_end_trans: + gfs2_trans_end(sdp); +out_gunlock: + while (x--) { + gfs2_glock_dq(ghs + x); + gfs2_holder_uninit(ghs + x); + } +out_gunlock_r: + if (r_gh.gh_gl) + gfs2_glock_dq_uninit(&r_gh); +out: + return error; +} + +static int gfs2_rename2(struct inode *odir, struct dentry *odentry, + struct inode *ndir, struct dentry *ndentry, + unsigned int flags) +{ + flags &= ~RENAME_NOREPLACE; + + if (flags & ~RENAME_EXCHANGE) + return -EINVAL; + + if (flags & RENAME_EXCHANGE) + return gfs2_exchange(odir, odentry, ndir, ndentry, flags); + + return gfs2_rename(odir, odentry, ndir, ndentry); +} + +/** * gfs2_follow_link - Follow a symbolic link * @dentry: The dentry of the link * @nd: Data that we pass to vfs_follow_link() @@ -1548,7 +1721,7 @@ out: * Returns: 0 on success or error code */ -static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *gfs2_follow_link(struct dentry *dentry, void **cookie) { struct gfs2_inode *ip = GFS2_I(d_inode(dentry)); struct gfs2_holder i_gh; @@ -1561,8 +1734,7 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd) error = gfs2_glock_nq(&i_gh); if (error) { gfs2_holder_uninit(&i_gh); - nd_set_link(nd, ERR_PTR(error)); - return NULL; + return ERR_PTR(error); } size = (unsigned int)i_size_read(&ip->i_inode); @@ -1586,8 +1758,9 @@ static void *gfs2_follow_link(struct dentry *dentry, struct nameidata *nd) brelse(dibh); out: gfs2_glock_dq_uninit(&i_gh); - nd_set_link(nd, buf); - return NULL; + if (!IS_ERR(buf)) + *cookie = buf; + return buf; } /** @@ -1716,7 +1889,7 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) if (!uid_eq(ouid, NO_UID_QUOTA_CHANGE) || !gid_eq(ogid, NO_GID_QUOTA_CHANGE)) { - gfs2_quota_change(ip, -ap.target, ouid, ogid); + gfs2_quota_change(ip, -(s64)ap.target, ouid, ogid); gfs2_quota_change(ip, ap.target, nuid, ngid); } @@ -1943,7 +2116,7 @@ const struct inode_operations gfs2_dir_iops = { .mkdir = gfs2_mkdir, .rmdir = gfs2_unlink, .mknod = gfs2_mknod, - .rename = gfs2_rename, + .rename2 = gfs2_rename2, .permission = gfs2_permission, .setattr = gfs2_setattr, .getattr = gfs2_getattr, diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 35b49f44c..1e3a93f2f 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -756,6 +756,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) } } + sdp->sd_log_idle = 1; set_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags); gfs2_glock_dq_uninit(&ji_gh); jindex = 0; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index e3065cb9a..9b61f92fc 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -649,9 +649,117 @@ static void do_qc(struct gfs2_quota_data *qd, s64 change) slot_hold(qd); } + if (change < 0) /* Reset quiet flag if we freed some blocks */ + clear_bit(QDF_QMSG_QUIET, &qd->qd_flags); mutex_unlock(&sdp->sd_quota_mutex); } +static int gfs2_write_buf_to_page(struct gfs2_inode *ip, unsigned long index, + unsigned off, void *buf, unsigned bytes) +{ + struct inode *inode = &ip->i_inode; + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct address_space *mapping = inode->i_mapping; + struct page *page; + struct buffer_head *bh; + void *kaddr; + u64 blk; + unsigned bsize = sdp->sd_sb.sb_bsize, bnum = 0, boff = 0; + unsigned to_write = bytes, pg_off = off; + int done = 0; + + blk = index << (PAGE_CACHE_SHIFT - sdp->sd_sb.sb_bsize_shift); + boff = off % bsize; + + page = find_or_create_page(mapping, index, GFP_NOFS); + if (!page) + return -ENOMEM; + if (!page_has_buffers(page)) + create_empty_buffers(page, bsize, 0); + + bh = page_buffers(page); + while (!done) { + /* Find the beginning block within the page */ + if (pg_off >= ((bnum * bsize) + bsize)) { + bh = bh->b_this_page; + bnum++; + blk++; + continue; + } + if (!buffer_mapped(bh)) { + gfs2_block_map(inode, blk, bh, 1); + if (!buffer_mapped(bh)) + goto unlock_out; + /* If it's a newly allocated disk block, zero it */ + if (buffer_new(bh)) + zero_user(page, bnum * bsize, bh->b_size); + } + if (PageUptodate(page)) + set_buffer_uptodate(bh); + if (!buffer_uptodate(bh)) { + ll_rw_block(READ | REQ_META, 1, &bh); + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + goto unlock_out; + } + gfs2_trans_add_data(ip->i_gl, bh); + + /* If we need to write to the next block as well */ + if (to_write > (bsize - boff)) { + pg_off += (bsize - boff); + to_write -= (bsize - boff); + boff = pg_off % bsize; + continue; + } + done = 1; + } + + /* Write to the page, now that we have setup the buffer(s) */ + kaddr = kmap_atomic(page); + memcpy(kaddr + off, buf, bytes); + flush_dcache_page(page); + kunmap_atomic(kaddr); + unlock_page(page); + page_cache_release(page); + + return 0; + +unlock_out: + unlock_page(page); + page_cache_release(page); + return -EIO; +} + +static int gfs2_write_disk_quota(struct gfs2_inode *ip, struct gfs2_quota *qp, + loff_t loc) +{ + unsigned long pg_beg; + unsigned pg_off, nbytes, overflow = 0; + int pg_oflow = 0, error; + void *ptr; + + nbytes = sizeof(struct gfs2_quota); + + pg_beg = loc >> PAGE_CACHE_SHIFT; + pg_off = loc % PAGE_CACHE_SIZE; + + /* If the quota straddles a page boundary, split the write in two */ + if ((pg_off + nbytes) > PAGE_CACHE_SIZE) { + pg_oflow = 1; + overflow = (pg_off + nbytes) - PAGE_CACHE_SIZE; + } + + ptr = qp; + error = gfs2_write_buf_to_page(ip, pg_beg, pg_off, ptr, + nbytes - overflow); + /* If there's an overflow, write the remaining bytes to the next page */ + if (!error && pg_oflow) + error = gfs2_write_buf_to_page(ip, pg_beg + 1, 0, + ptr + nbytes - overflow, + overflow); + return error; +} + /** * gfs2_adjust_quota - adjust record of current block usage * @ip: The quota inode @@ -672,15 +780,8 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, { struct inode *inode = &ip->i_inode; struct gfs2_sbd *sdp = GFS2_SB(inode); - struct address_space *mapping = inode->i_mapping; - unsigned long index = loc >> PAGE_CACHE_SHIFT; - unsigned offset = loc & (PAGE_CACHE_SIZE - 1); - unsigned blocksize, iblock, pos; - struct buffer_head *bh; - struct page *page; - void *kaddr, *ptr; struct gfs2_quota q; - int err, nbytes; + int err; u64 size; if (gfs2_is_stuffed(ip)) { @@ -694,8 +795,11 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, if (err < 0) return err; + loc -= sizeof(q); /* gfs2_internal_read would've advanced the loc ptr */ err = -EIO; be64_add_cpu(&q.qu_value, change); + if (((s64)be64_to_cpu(q.qu_value)) < 0) + q.qu_value = 0; /* Never go negative on quota usage */ qd->qd_qb.qb_value = q.qu_value; if (fdq) { if (fdq->d_fieldmask & QC_SPC_SOFT) { @@ -712,79 +816,16 @@ static int gfs2_adjust_quota(struct gfs2_inode *ip, loff_t loc, } } - /* Write the quota into the quota file on disk */ - ptr = &q; - nbytes = sizeof(struct gfs2_quota); -get_a_page: - page = find_or_create_page(mapping, index, GFP_NOFS); - if (!page) - return -ENOMEM; - - blocksize = inode->i_sb->s_blocksize; - iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); - - if (!page_has_buffers(page)) - create_empty_buffers(page, blocksize, 0); - - bh = page_buffers(page); - pos = blocksize; - while (offset >= pos) { - bh = bh->b_this_page; - iblock++; - pos += blocksize; - } - - if (!buffer_mapped(bh)) { - gfs2_block_map(inode, iblock, bh, 1); - if (!buffer_mapped(bh)) - goto unlock_out; - /* If it's a newly allocated disk block for quota, zero it */ - if (buffer_new(bh)) - zero_user(page, pos - blocksize, bh->b_size); - } - - if (PageUptodate(page)) - set_buffer_uptodate(bh); - - if (!buffer_uptodate(bh)) { - ll_rw_block(READ | REQ_META, 1, &bh); - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) - goto unlock_out; - } - - gfs2_trans_add_data(ip->i_gl, bh); - - kaddr = kmap_atomic(page); - if (offset + sizeof(struct gfs2_quota) > PAGE_CACHE_SIZE) - nbytes = PAGE_CACHE_SIZE - offset; - memcpy(kaddr + offset, ptr, nbytes); - flush_dcache_page(page); - kunmap_atomic(kaddr); - unlock_page(page); - page_cache_release(page); - - /* If quota straddles page boundary, we need to update the rest of the - * quota at the beginning of the next page */ - if ((offset + sizeof(struct gfs2_quota)) > PAGE_CACHE_SIZE) { - ptr = ptr + nbytes; - nbytes = sizeof(struct gfs2_quota) - nbytes; - offset = 0; - index++; - goto get_a_page; + err = gfs2_write_disk_quota(ip, &q, loc); + if (!err) { + size = loc + sizeof(struct gfs2_quota); + if (size > inode->i_size) + i_size_write(inode, size); + inode->i_mtime = inode->i_atime = CURRENT_TIME; + mark_inode_dirty(inode); + set_bit(QDF_REFRESH, &qd->qd_flags); } - size = loc + sizeof(struct gfs2_quota); - if (size > inode->i_size) - i_size_write(inode, size); - inode->i_mtime = inode->i_atime = CURRENT_TIME; - mark_inode_dirty(inode); - set_bit(QDF_REFRESH, &qd->qd_flags); - return 0; - -unlock_out: - unlock_page(page); - page_cache_release(page); return err; } @@ -1148,10 +1189,13 @@ int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid, /* If no min_target specified or we don't meet * min_target, return -EDQUOT */ if (!ap->min_target || ap->min_target > ap->allowed) { - print_message(qd, "exceeded"); - quota_send_warning(qd->qd_id, - sdp->sd_vfs->s_dev, - QUOTA_NL_BHARDWARN); + if (!test_and_set_bit(QDF_QMSG_QUIET, + &qd->qd_flags)) { + print_message(qd, "exceeded"); + quota_send_warning(qd->qd_id, + sdp->sd_vfs->s_dev, + QUOTA_NL_BHARDWARN); + } error = -EDQUOT; break; } @@ -1648,6 +1692,8 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid, /* Apply changes */ error = gfs2_adjust_quota(ip, offset, 0, qd, fdq); + if (!error) + clear_bit(QDF_QMSG_QUIET, &qd->qd_flags); gfs2_trans_end(sdp); out_release: diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 6af2396a3..c6c62321d 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -978,10 +978,10 @@ static void set_rgrp_preferences(struct gfs2_sbd *sdp) rgd->rd_flags |= GFS2_RDF_PREFERRED; for (i = 0; i < sdp->sd_journals; i++) { rgd = gfs2_rgrpd_get_next(rgd); - if (rgd == first) + if (!rgd || rgd == first) break; } - } while (rgd != first); + } while (rgd && rgd != first); } /** @@ -1244,14 +1244,13 @@ int gfs2_rgrp_go_lock(struct gfs2_holder *gh) } /** - * gfs2_rgrp_go_unlock - Release RG bitmaps read in with gfs2_rgrp_bh_get() - * @gh: The glock holder for the resource group + * gfs2_rgrp_brelse - Release RG bitmaps read in with gfs2_rgrp_bh_get() + * @rgd: The resource group * */ -void gfs2_rgrp_go_unlock(struct gfs2_holder *gh) +void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd) { - struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object; int x, length = rgd->rd_length; for (x = 0; x < length; x++) { @@ -1264,6 +1263,22 @@ void gfs2_rgrp_go_unlock(struct gfs2_holder *gh) } +/** + * gfs2_rgrp_go_unlock - Unlock a rgrp glock + * @gh: The glock holder for the resource group + * + */ + +void gfs2_rgrp_go_unlock(struct gfs2_holder *gh) +{ + struct gfs2_rgrpd *rgd = gh->gh_gl->gl_object; + int demote_requested = test_bit(GLF_DEMOTE, &gh->gh_gl->gl_flags) | + test_bit(GLF_PENDING_DEMOTE, &gh->gh_gl->gl_flags); + + if (rgd && demote_requested) + gfs2_rgrp_brelse(rgd); +} + int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, struct buffer_head *bh, const struct gfs2_bitmap *bi, unsigned minlen, u64 *ptrimmed) @@ -1711,10 +1726,8 @@ static int gfs2_rbm_find(struct gfs2_rbm *rbm, u8 state, u32 *minext, return ret; bitmap_full: /* Mark bitmap as full and fall through */ - if ((state == GFS2_BLKST_FREE) && initial_offset == 0) { - struct gfs2_bitmap *bi = rbm_bi(rbm); + if ((state == GFS2_BLKST_FREE) && initial_offset == 0) set_bit(GBF_FULL, &bi->bi_flags); - } next_bitmap: /* Find next bitmap in the rgrp */ rbm->offset = 0; @@ -1850,14 +1863,23 @@ static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops) const struct gfs2_sbd *sdp = gl->gl_sbd; struct gfs2_lkstats *st; s64 r_dcount, l_dcount; - s64 r_srttb, l_srttb; + s64 l_srttb, a_srttb = 0; s64 srttb_diff; s64 sqr_diff; s64 var; + int cpu, nonzero = 0; preempt_disable(); + for_each_present_cpu(cpu) { + st = &per_cpu_ptr(sdp->sd_lkstats, cpu)->lkstats[LM_TYPE_RGRP]; + if (st->stats[GFS2_LKS_SRTTB]) { + a_srttb += st->stats[GFS2_LKS_SRTTB]; + nonzero++; + } + } st = &this_cpu_ptr(sdp->sd_lkstats)->lkstats[LM_TYPE_RGRP]; - r_srttb = st->stats[GFS2_LKS_SRTTB]; + if (nonzero) + do_div(a_srttb, nonzero); r_dcount = st->stats[GFS2_LKS_DCOUNT]; var = st->stats[GFS2_LKS_SRTTVARB] + gl->gl_stats.stats[GFS2_LKS_SRTTVARB]; @@ -1866,10 +1888,10 @@ static bool gfs2_rgrp_congested(const struct gfs2_rgrpd *rgd, int loops) l_srttb = gl->gl_stats.stats[GFS2_LKS_SRTTB]; l_dcount = gl->gl_stats.stats[GFS2_LKS_DCOUNT]; - if ((l_dcount < 1) || (r_dcount < 1) || (r_srttb == 0)) + if ((l_dcount < 1) || (r_dcount < 1) || (a_srttb == 0)) return false; - srttb_diff = r_srttb - l_srttb; + srttb_diff = a_srttb - l_srttb; sqr_diff = srttb_diff * srttb_diff; var *= 2; diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 68972ecfb..c0ab33fa3 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -36,6 +36,7 @@ extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp); extern int gfs2_rindex_update(struct gfs2_sbd *sdp); extern void gfs2_free_clones(struct gfs2_rgrpd *rgd); extern int gfs2_rgrp_go_lock(struct gfs2_holder *gh); +extern void gfs2_rgrp_brelse(struct gfs2_rgrpd *rgd); extern void gfs2_rgrp_go_unlock(struct gfs2_holder *gh); extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 859c6edbf..298244594 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -748,7 +748,7 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc) if (wbc->sync_mode == WB_SYNC_ALL) gfs2_log_flush(GFS2_SB(inode), ip->i_gl, NORMAL_FLUSH); - if (bdi->dirty_exceeded) + if (bdi->wb.dirty_exceeded) gfs2_ail1_flush(sdp, wbc); else filemap_fdatawrite(metamapping); diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index ae8e8811f..c9ff1cf7d 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -101,8 +101,11 @@ static ssize_t freeze_show(struct gfs2_sbd *sdp, char *buf) static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len) { - int error; - int n = simple_strtol(buf, NULL, 0); + int error, n; + + error = kstrtoint(buf, 0, &n); + if (error) + return error; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -134,10 +137,16 @@ static ssize_t withdraw_show(struct gfs2_sbd *sdp, char *buf) static ssize_t withdraw_store(struct gfs2_sbd *sdp, const char *buf, size_t len) { + int error, val; + if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (simple_strtol(buf, NULL, 0) != 1) + error = kstrtoint(buf, 0, &val); + if (error) + return error; + + if (val != 1) return -EINVAL; gfs2_lm_withdraw(sdp, "withdrawing from cluster at user's request\n"); @@ -148,10 +157,16 @@ static ssize_t withdraw_store(struct gfs2_sbd *sdp, const char *buf, size_t len) static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf, size_t len) { + int error, val; + if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (simple_strtol(buf, NULL, 0) != 1) + error = kstrtoint(buf, 0, &val); + if (error) + return error; + + if (val != 1) return -EINVAL; gfs2_statfs_sync(sdp->sd_vfs, 0); @@ -161,10 +176,16 @@ static ssize_t statfs_sync_store(struct gfs2_sbd *sdp, const char *buf, static ssize_t quota_sync_store(struct gfs2_sbd *sdp, const char *buf, size_t len) { + int error, val; + if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (simple_strtol(buf, NULL, 0) != 1) + error = kstrtoint(buf, 0, &val); + if (error) + return error; + + if (val != 1) return -EINVAL; gfs2_quota_sync(sdp->sd_vfs, 0); @@ -181,7 +202,9 @@ static ssize_t quota_refresh_user_store(struct gfs2_sbd *sdp, const char *buf, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - id = simple_strtoul(buf, NULL, 0); + error = kstrtou32(buf, 0, &id); + if (error) + return error; qid = make_kqid(current_user_ns(), USRQUOTA, id); if (!qid_valid(qid)) @@ -201,7 +224,9 @@ static ssize_t quota_refresh_group_store(struct gfs2_sbd *sdp, const char *buf, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - id = simple_strtoul(buf, NULL, 0); + error = kstrtou32(buf, 0, &id); + if (error) + return error; qid = make_kqid(current_user_ns(), GRPQUOTA, id); if (!qid_valid(qid)) @@ -324,10 +349,11 @@ static ssize_t block_show(struct gfs2_sbd *sdp, char *buf) static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len) { struct lm_lockstruct *ls = &sdp->sd_lockstruct; - ssize_t ret = len; - int val; + int ret, val; - val = simple_strtol(buf, NULL, 0); + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; if (val == 1) set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags); @@ -336,9 +362,9 @@ static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len) smp_mb__after_atomic(); gfs2_glock_thaw(sdp); } else { - ret = -EINVAL; + return -EINVAL; } - return ret; + return len; } static ssize_t wdack_show(struct gfs2_sbd *sdp, char *buf) @@ -350,17 +376,18 @@ static ssize_t wdack_show(struct gfs2_sbd *sdp, char *buf) static ssize_t wdack_store(struct gfs2_sbd *sdp, const char *buf, size_t len) { - ssize_t ret = len; - int val; + int ret, val; - val = simple_strtol(buf, NULL, 0); + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; if ((val == 1) && !strcmp(sdp->sd_lockstruct.ls_ops->lm_proto_name, "lock_dlm")) complete(&sdp->sd_wdack); else - ret = -EINVAL; - return ret; + return -EINVAL; + return len; } static ssize_t lkfirst_show(struct gfs2_sbd *sdp, char *buf) @@ -553,11 +580,14 @@ static ssize_t tune_set(struct gfs2_sbd *sdp, unsigned int *field, { struct gfs2_tune *gt = &sdp->sd_tune; unsigned int x; + int error; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - x = simple_strtoul(buf, NULL, 0); + error = kstrtouint(buf, 0, &x); + if (error) + return error; if (check_zero && !x) return -EINVAL; diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index 95d255219..1f1c7dcbc 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -252,7 +252,7 @@ extern void hfs_mark_mdb_dirty(struct super_block *sb); #define __hfs_u_to_mtime(sec) cpu_to_be32(sec + 2082844800U - sys_tz.tz_minuteswest * 60) #define __hfs_m_to_utime(sec) (be32_to_cpu(sec) - 2082844800U + sys_tz.tz_minuteswest * 60) -#define HFS_I(inode) (list_entry(inode, struct hfs_inode_info, vfs_inode)) +#define HFS_I(inode) (container_of(inode, struct hfs_inode_info, vfs_inode)) #define HFS_SB(sb) ((struct hfs_sb_info *)(sb)->s_fs_info) #define hfs_m_to_utime(time) (struct timespec){ .tv_sec = __hfs_m_to_utime(time) } diff --git a/fs/hfs/super.c b/fs/hfs/super.c index eee7206c3..55c03b9e9 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -14,6 +14,7 @@ #include <linux/module.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/mount.h> #include <linux/init.h> #include <linux/nls.h> diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index b0441d65f..f91a1faf8 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -263,7 +263,7 @@ struct hfsplus_inode_info { static inline struct hfsplus_inode_info *HFSPLUS_I(struct inode *inode) { - return list_entry(inode, struct hfsplus_inode_info, vfs_inode); + return container_of(inode, struct hfsplus_inode_info, vfs_inode); } /* diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 593af2fdc..7302d96ae 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -11,6 +11,7 @@ #include <linux/init.h> #include <linux/pagemap.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/fs.h> #include <linux/slab.h> #include <linux/vfs.h> diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 07d8d8f52..059597b23 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -892,7 +892,7 @@ static const struct inode_operations hostfs_dir_iops = { .setattr = hostfs_setattr, }; -static void *hostfs_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *hostfs_follow_link(struct dentry *dentry, void **cookie) { char *link = __getname(); if (link) { @@ -906,21 +906,18 @@ static void *hostfs_follow_link(struct dentry *dentry, struct nameidata *nd) } if (err < 0) { __putname(link); - link = ERR_PTR(err); + return ERR_PTR(err); } } else { - link = ERR_PTR(-ENOMEM); + return ERR_PTR(-ENOMEM); } - nd_set_link(nd, link); - return NULL; + return *cookie = link; } -static void hostfs_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) +static void hostfs_put_link(struct inode *unused, void *cookie) { - char *s = nd_get_link(nd); - if (!IS_ERR(s)) - __putname(s); + __putname(cookie); } static const struct inode_operations hostfs_link_iops = { diff --git a/fs/hpfs/alloc.c b/fs/hpfs/alloc.c index f005046e1..d6a4b55d2 100644 --- a/fs/hpfs/alloc.c +++ b/fs/hpfs/alloc.c @@ -484,3 +484,98 @@ struct anode *hpfs_alloc_anode(struct super_block *s, secno near, anode_secno *a a->btree.first_free = cpu_to_le16(8); return a; } + +static unsigned find_run(__le32 *bmp, unsigned *idx) +{ + unsigned len; + while (tstbits(bmp, *idx, 1)) { + (*idx)++; + if (unlikely(*idx >= 0x4000)) + return 0; + } + len = 1; + while (!tstbits(bmp, *idx + len, 1)) + len++; + return len; +} + +static int do_trim(struct super_block *s, secno start, unsigned len, secno limit_start, secno limit_end, unsigned minlen, unsigned *result) +{ + int err; + secno end; + if (fatal_signal_pending(current)) + return -EINTR; + end = start + len; + if (start < limit_start) + start = limit_start; + if (end > limit_end) + end = limit_end; + if (start >= end) + return 0; + if (end - start < minlen) + return 0; + err = sb_issue_discard(s, start, end - start, GFP_NOFS, 0); + if (err) + return err; + *result += end - start; + return 0; +} + +int hpfs_trim_fs(struct super_block *s, u64 start, u64 end, u64 minlen, unsigned *result) +{ + int err = 0; + struct hpfs_sb_info *sbi = hpfs_sb(s); + unsigned idx, len, start_bmp, end_bmp; + __le32 *bmp; + struct quad_buffer_head qbh; + + *result = 0; + if (!end || end > sbi->sb_fs_size) + end = sbi->sb_fs_size; + if (start >= sbi->sb_fs_size) + return 0; + if (minlen > 0x4000) + return 0; + if (start < sbi->sb_dirband_start + sbi->sb_dirband_size && end > sbi->sb_dirband_start) { + hpfs_lock(s); + if (s->s_flags & MS_RDONLY) { + err = -EROFS; + goto unlock_1; + } + if (!(bmp = hpfs_map_dnode_bitmap(s, &qbh))) { + err = -EIO; + goto unlock_1; + } + idx = 0; + while ((len = find_run(bmp, &idx)) && !err) { + err = do_trim(s, sbi->sb_dirband_start + idx * 4, len * 4, start, end, minlen, result); + idx += len; + } + hpfs_brelse4(&qbh); +unlock_1: + hpfs_unlock(s); + } + start_bmp = start >> 14; + end_bmp = (end + 0x3fff) >> 14; + while (start_bmp < end_bmp && !err) { + hpfs_lock(s); + if (s->s_flags & MS_RDONLY) { + err = -EROFS; + goto unlock_2; + } + if (!(bmp = hpfs_map_bitmap(s, start_bmp, &qbh, "trim"))) { + err = -EIO; + goto unlock_2; + } + idx = 0; + while ((len = find_run(bmp, &idx)) && !err) { + err = do_trim(s, (start_bmp << 14) + idx, len, start, end, minlen, result); + idx += len; + } + hpfs_brelse4(&qbh); +unlock_2: + hpfs_unlock(s); + start_bmp++; + } + return err; +} diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index 2a8e07425..dc540bfce 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -327,4 +327,5 @@ const struct file_operations hpfs_dir_ops = .iterate = hpfs_readdir, .release = hpfs_dir_release, .fsync = hpfs_file_fsync, + .unlocked_ioctl = hpfs_ioctl, }; diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 6d8cfe9b5..7ca28d604 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -203,6 +203,7 @@ const struct file_operations hpfs_file_ops = .release = hpfs_file_release, .fsync = hpfs_file_fsync, .splice_read = generic_file_splice_read, + .unlocked_ioctl = hpfs_ioctl, }; const struct inode_operations hpfs_file_iops = diff --git a/fs/hpfs/hpfs_fn.h b/fs/hpfs/hpfs_fn.h index b63b75fa0..c4867b511 100644 --- a/fs/hpfs/hpfs_fn.h +++ b/fs/hpfs/hpfs_fn.h @@ -18,6 +18,8 @@ #include <linux/pagemap.h> #include <linux/buffer_head.h> #include <linux/slab.h> +#include <linux/sched.h> +#include <linux/blkdev.h> #include <asm/unaligned.h> #include "hpfs.h" @@ -200,6 +202,7 @@ void hpfs_free_dnode(struct super_block *, secno); struct dnode *hpfs_alloc_dnode(struct super_block *, secno, dnode_secno *, struct quad_buffer_head *); struct fnode *hpfs_alloc_fnode(struct super_block *, secno, fnode_secno *, struct buffer_head **); struct anode *hpfs_alloc_anode(struct super_block *, secno, anode_secno *, struct buffer_head **); +int hpfs_trim_fs(struct super_block *, u64, u64, u64, unsigned *); /* anode.c */ @@ -304,7 +307,7 @@ extern const struct address_space_operations hpfs_symlink_aops; static inline struct hpfs_inode_info *hpfs_i(struct inode *inode) { - return list_entry(inode, struct hpfs_inode_info, vfs_inode); + return container_of(inode, struct hpfs_inode_info, vfs_inode); } static inline struct hpfs_sb_info *hpfs_sb(struct super_block *sb) @@ -318,6 +321,7 @@ __printf(2, 3) void hpfs_error(struct super_block *, const char *, ...); int hpfs_stop_cycles(struct super_block *, int, int *, int *, char *); unsigned hpfs_get_free_dnodes(struct super_block *); +long hpfs_ioctl(struct file *file, unsigned cmd, unsigned long arg); /* * local time (HPFS) to GMT (Unix) diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 8685c6557..68a9bed05 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -199,12 +199,39 @@ static int hpfs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } + +long hpfs_ioctl(struct file *file, unsigned cmd, unsigned long arg) +{ + switch (cmd) { + case FITRIM: { + struct fstrim_range range; + secno n_trimmed; + int r; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (copy_from_user(&range, (struct fstrim_range __user *)arg, sizeof(range))) + return -EFAULT; + r = hpfs_trim_fs(file_inode(file)->i_sb, range.start >> 9, (range.start + range.len) >> 9, (range.minlen + 511) >> 9, &n_trimmed); + if (r) + return r; + range.len = (u64)n_trimmed << 9; + if (copy_to_user((struct fstrim_range __user *)arg, &range, sizeof(range))) + return -EFAULT; + return 0; + } + default: { + return -ENOIOCTLCMD; + } + } +} + + static struct kmem_cache * hpfs_inode_cachep; static struct inode *hpfs_alloc_inode(struct super_block *sb) { struct hpfs_inode_info *ei; - ei = (struct hpfs_inode_info *)kmem_cache_alloc(hpfs_inode_cachep, GFP_NOFS); + ei = kmem_cache_alloc(hpfs_inode_cachep, GFP_NOFS); if (!ei) return NULL; ei->vfs_inode.i_version = 1; diff --git a/fs/hppfs/Makefile b/fs/hppfs/Makefile deleted file mode 100644 index 3a982bd97..000000000 --- a/fs/hppfs/Makefile +++ /dev/null @@ -1,6 +0,0 @@ -# -# Copyright (C) 2002 - 2008 Jeff Dike (jdike@{addtoit,linux.intel}.com) -# Licensed under the GPL -# - -obj-$(CONFIG_HPPFS) += hppfs.o diff --git a/fs/hppfs/hppfs.c b/fs/hppfs/hppfs.c deleted file mode 100644 index fa2bd5366..000000000 --- a/fs/hppfs/hppfs.c +++ /dev/null @@ -1,766 +0,0 @@ -/* - * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) - * Licensed under the GPL - */ - -#include <linux/ctype.h> -#include <linux/dcache.h> -#include <linux/file.h> -#include <linux/fs.h> -#include <linux/init.h> -#include <linux/kernel.h> -#include <linux/list.h> -#include <linux/module.h> -#include <linux/mount.h> -#include <linux/slab.h> -#include <linux/statfs.h> -#include <linux/types.h> -#include <linux/pid_namespace.h> -#include <linux/namei.h> -#include <asm/uaccess.h> -#include <os.h> - -static struct inode *get_inode(struct super_block *, struct dentry *); - -struct hppfs_data { - struct list_head list; - char contents[PAGE_SIZE - sizeof(struct list_head)]; -}; - -struct hppfs_private { - struct file *proc_file; - int host_fd; - loff_t len; - struct hppfs_data *contents; -}; - -struct hppfs_inode_info { - struct dentry *proc_dentry; - struct inode vfs_inode; -}; - -static inline struct hppfs_inode_info *HPPFS_I(struct inode *inode) -{ - return container_of(inode, struct hppfs_inode_info, vfs_inode); -} - -#define HPPFS_SUPER_MAGIC 0xb00000ee - -static const struct super_operations hppfs_sbops; - -static int is_pid(struct dentry *dentry) -{ - struct super_block *sb; - int i; - - sb = dentry->d_sb; - if (dentry->d_parent != sb->s_root) - return 0; - - for (i = 0; i < dentry->d_name.len; i++) { - if (!isdigit(dentry->d_name.name[i])) - return 0; - } - return 1; -} - -static char *dentry_name(struct dentry *dentry, int extra) -{ - struct dentry *parent; - char *root, *name; - const char *seg_name; - int len, seg_len, root_len; - - len = 0; - parent = dentry; - while (parent->d_parent != parent) { - if (is_pid(parent)) - len += strlen("pid") + 1; - else len += parent->d_name.len + 1; - parent = parent->d_parent; - } - - root = "proc"; - root_len = strlen(root); - len += root_len; - name = kmalloc(len + extra + 1, GFP_KERNEL); - if (name == NULL) - return NULL; - - name[len] = '\0'; - parent = dentry; - while (parent->d_parent != parent) { - if (is_pid(parent)) { - seg_name = "pid"; - seg_len = strlen(seg_name); - } - else { - seg_name = parent->d_name.name; - seg_len = parent->d_name.len; - } - - len -= seg_len + 1; - name[len] = '/'; - memcpy(&name[len + 1], seg_name, seg_len); - parent = parent->d_parent; - } - memcpy(name, root, root_len); - return name; -} - -static int file_removed(struct dentry *dentry, const char *file) -{ - char *host_file; - int extra, fd; - - extra = 0; - if (file != NULL) - extra += strlen(file) + 1; - - host_file = dentry_name(dentry, extra + strlen("/remove")); - if (host_file == NULL) { - printk(KERN_ERR "file_removed : allocation failed\n"); - return -ENOMEM; - } - - if (file != NULL) { - strcat(host_file, "/"); - strcat(host_file, file); - } - strcat(host_file, "/remove"); - - fd = os_open_file(host_file, of_read(OPENFLAGS()), 0); - kfree(host_file); - if (fd > 0) { - os_close_file(fd); - return 1; - } - return 0; -} - -static struct dentry *hppfs_lookup(struct inode *ino, struct dentry *dentry, - unsigned int flags) -{ - struct dentry *proc_dentry, *parent; - struct qstr *name = &dentry->d_name; - struct inode *inode; - int err, deleted; - - deleted = file_removed(dentry, NULL); - if (deleted < 0) - return ERR_PTR(deleted); - else if (deleted) - return ERR_PTR(-ENOENT); - - parent = HPPFS_I(ino)->proc_dentry; - mutex_lock(&d_inode(parent)->i_mutex); - proc_dentry = lookup_one_len(name->name, parent, name->len); - mutex_unlock(&d_inode(parent)->i_mutex); - - if (IS_ERR(proc_dentry)) - return proc_dentry; - - err = -ENOMEM; - inode = get_inode(ino->i_sb, proc_dentry); - if (!inode) - goto out; - - d_add(dentry, inode); - return NULL; - - out: - return ERR_PTR(err); -} - -static const struct inode_operations hppfs_file_iops = { -}; - -static ssize_t read_proc(struct file *file, char __user *buf, ssize_t count, - loff_t *ppos, int is_user) -{ - ssize_t (*read)(struct file *, char __user *, size_t, loff_t *); - ssize_t n; - - read = file_inode(file)->i_fop->read; - - if (!is_user) - set_fs(KERNEL_DS); - - n = (*read)(file, buf, count, &file->f_pos); - - if (!is_user) - set_fs(USER_DS); - - if (ppos) - *ppos = file->f_pos; - return n; -} - -static ssize_t hppfs_read_file(int fd, char __user *buf, ssize_t count) -{ - ssize_t n; - int cur, err; - char *new_buf; - - n = -ENOMEM; - new_buf = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (new_buf == NULL) { - printk(KERN_ERR "hppfs_read_file : kmalloc failed\n"); - goto out; - } - n = 0; - while (count > 0) { - cur = min_t(ssize_t, count, PAGE_SIZE); - err = os_read_file(fd, new_buf, cur); - if (err < 0) { - printk(KERN_ERR "hppfs_read : read failed, " - "errno = %d\n", err); - n = err; - goto out_free; - } else if (err == 0) - break; - - if (copy_to_user(buf, new_buf, err)) { - n = -EFAULT; - goto out_free; - } - n += err; - count -= err; - } - out_free: - kfree(new_buf); - out: - return n; -} - -static ssize_t hppfs_read(struct file *file, char __user *buf, size_t count, - loff_t *ppos) -{ - struct hppfs_private *hppfs = file->private_data; - struct hppfs_data *data; - loff_t off; - int err; - - if (hppfs->contents != NULL) { - int rem; - - if (*ppos >= hppfs->len) - return 0; - - data = hppfs->contents; - off = *ppos; - while (off >= sizeof(data->contents)) { - data = list_entry(data->list.next, struct hppfs_data, - list); - off -= sizeof(data->contents); - } - - if (off + count > hppfs->len) - count = hppfs->len - off; - rem = copy_to_user(buf, &data->contents[off], count); - *ppos += count - rem; - if (rem > 0) - return -EFAULT; - } else if (hppfs->host_fd != -1) { - err = os_seek_file(hppfs->host_fd, *ppos); - if (err) { - printk(KERN_ERR "hppfs_read : seek failed, " - "errno = %d\n", err); - return err; - } - err = hppfs_read_file(hppfs->host_fd, buf, count); - if (err < 0) { - printk(KERN_ERR "hppfs_read: read failed: %d\n", err); - return err; - } - count = err; - if (count > 0) - *ppos += count; - } - else count = read_proc(hppfs->proc_file, buf, count, ppos, 1); - - return count; -} - -static ssize_t hppfs_write(struct file *file, const char __user *buf, - size_t len, loff_t *ppos) -{ - struct hppfs_private *data = file->private_data; - struct file *proc_file = data->proc_file; - ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *); - - write = file_inode(proc_file)->i_fop->write; - return (*write)(proc_file, buf, len, ppos); -} - -static int open_host_sock(char *host_file, int *filter_out) -{ - char *end; - int fd; - - end = &host_file[strlen(host_file)]; - strcpy(end, "/rw"); - *filter_out = 1; - fd = os_connect_socket(host_file); - if (fd > 0) - return fd; - - strcpy(end, "/r"); - *filter_out = 0; - fd = os_connect_socket(host_file); - return fd; -} - -static void free_contents(struct hppfs_data *head) -{ - struct hppfs_data *data; - struct list_head *ele, *next; - - if (head == NULL) - return; - - list_for_each_safe(ele, next, &head->list) { - data = list_entry(ele, struct hppfs_data, list); - kfree(data); - } - kfree(head); -} - -static struct hppfs_data *hppfs_get_data(int fd, int filter, - struct file *proc_file, - struct file *hppfs_file, - loff_t *size_out) -{ - struct hppfs_data *data, *new, *head; - int n, err; - - err = -ENOMEM; - data = kmalloc(sizeof(*data), GFP_KERNEL); - if (data == NULL) { - printk(KERN_ERR "hppfs_get_data : head allocation failed\n"); - goto failed; - } - - INIT_LIST_HEAD(&data->list); - - head = data; - *size_out = 0; - - if (filter) { - while ((n = read_proc(proc_file, data->contents, - sizeof(data->contents), NULL, 0)) > 0) - os_write_file(fd, data->contents, n); - err = os_shutdown_socket(fd, 0, 1); - if (err) { - printk(KERN_ERR "hppfs_get_data : failed to shut down " - "socket\n"); - goto failed_free; - } - } - while (1) { - n = os_read_file(fd, data->contents, sizeof(data->contents)); - if (n < 0) { - err = n; - printk(KERN_ERR "hppfs_get_data : read failed, " - "errno = %d\n", err); - goto failed_free; - } else if (n == 0) - break; - - *size_out += n; - - if (n < sizeof(data->contents)) - break; - - new = kmalloc(sizeof(*data), GFP_KERNEL); - if (new == 0) { - printk(KERN_ERR "hppfs_get_data : data allocation " - "failed\n"); - err = -ENOMEM; - goto failed_free; - } - - INIT_LIST_HEAD(&new->list); - list_add(&new->list, &data->list); - data = new; - } - return head; - - failed_free: - free_contents(head); - failed: - return ERR_PTR(err); -} - -static struct hppfs_private *hppfs_data(void) -{ - struct hppfs_private *data; - - data = kmalloc(sizeof(*data), GFP_KERNEL); - if (data == NULL) - return data; - - *data = ((struct hppfs_private ) { .host_fd = -1, - .len = -1, - .contents = NULL } ); - return data; -} - -static int file_mode(int fmode) -{ - if (fmode == (FMODE_READ | FMODE_WRITE)) - return O_RDWR; - if (fmode == FMODE_READ) - return O_RDONLY; - if (fmode == FMODE_WRITE) - return O_WRONLY; - return 0; -} - -static int hppfs_open(struct inode *inode, struct file *file) -{ - const struct cred *cred = file->f_cred; - struct hppfs_private *data; - struct path path; - char *host_file; - int err, fd, type, filter; - - err = -ENOMEM; - data = hppfs_data(); - if (data == NULL) - goto out; - - host_file = dentry_name(file->f_path.dentry, strlen("/rw")); - if (host_file == NULL) - goto out_free2; - - path.mnt = inode->i_sb->s_fs_info; - path.dentry = HPPFS_I(inode)->proc_dentry; - - data->proc_file = dentry_open(&path, file_mode(file->f_mode), cred); - err = PTR_ERR(data->proc_file); - if (IS_ERR(data->proc_file)) - goto out_free1; - - type = os_file_type(host_file); - if (type == OS_TYPE_FILE) { - fd = os_open_file(host_file, of_read(OPENFLAGS()), 0); - if (fd >= 0) - data->host_fd = fd; - else - printk(KERN_ERR "hppfs_open : failed to open '%s', " - "errno = %d\n", host_file, -fd); - - data->contents = NULL; - } else if (type == OS_TYPE_DIR) { - fd = open_host_sock(host_file, &filter); - if (fd > 0) { - data->contents = hppfs_get_data(fd, filter, - data->proc_file, - file, &data->len); - if (!IS_ERR(data->contents)) - data->host_fd = fd; - } else - printk(KERN_ERR "hppfs_open : failed to open a socket " - "in '%s', errno = %d\n", host_file, -fd); - } - kfree(host_file); - - file->private_data = data; - return 0; - - out_free1: - kfree(host_file); - out_free2: - free_contents(data->contents); - kfree(data); - out: - return err; -} - -static int hppfs_dir_open(struct inode *inode, struct file *file) -{ - const struct cred *cred = file->f_cred; - struct hppfs_private *data; - struct path path; - int err; - - err = -ENOMEM; - data = hppfs_data(); - if (data == NULL) - goto out; - - path.mnt = inode->i_sb->s_fs_info; - path.dentry = HPPFS_I(inode)->proc_dentry; - data->proc_file = dentry_open(&path, file_mode(file->f_mode), cred); - err = PTR_ERR(data->proc_file); - if (IS_ERR(data->proc_file)) - goto out_free; - - file->private_data = data; - return 0; - - out_free: - kfree(data); - out: - return err; -} - -static loff_t hppfs_llseek(struct file *file, loff_t off, int where) -{ - struct hppfs_private *data = file->private_data; - struct file *proc_file = data->proc_file; - loff_t (*llseek)(struct file *, loff_t, int); - loff_t ret; - - llseek = file_inode(proc_file)->i_fop->llseek; - if (llseek != NULL) { - ret = (*llseek)(proc_file, off, where); - if (ret < 0) - return ret; - } - - return default_llseek(file, off, where); -} - -static int hppfs_release(struct inode *inode, struct file *file) -{ - struct hppfs_private *data = file->private_data; - struct file *proc_file = data->proc_file; - if (proc_file) - fput(proc_file); - kfree(data); - return 0; -} - -static const struct file_operations hppfs_file_fops = { - .owner = NULL, - .llseek = hppfs_llseek, - .read = hppfs_read, - .write = hppfs_write, - .open = hppfs_open, - .release = hppfs_release, -}; - -struct hppfs_dirent { - struct dir_context ctx; - struct dir_context *caller; - struct dentry *dentry; -}; - -static int hppfs_filldir(struct dir_context *ctx, const char *name, int size, - loff_t offset, u64 inode, unsigned int type) -{ - struct hppfs_dirent *dirent = - container_of(ctx, struct hppfs_dirent, ctx); - - if (file_removed(dirent->dentry, name)) - return 0; - - dirent->caller->pos = dirent->ctx.pos; - return !dir_emit(dirent->caller, name, size, inode, type); -} - -static int hppfs_readdir(struct file *file, struct dir_context *ctx) -{ - struct hppfs_private *data = file->private_data; - struct file *proc_file = data->proc_file; - struct hppfs_dirent d = { - .ctx.actor = hppfs_filldir, - .caller = ctx, - .dentry = file->f_path.dentry - }; - int err; - proc_file->f_pos = ctx->pos; - err = iterate_dir(proc_file, &d.ctx); - ctx->pos = d.ctx.pos; - return err; -} - -static const struct file_operations hppfs_dir_fops = { - .owner = NULL, - .iterate = hppfs_readdir, - .open = hppfs_dir_open, - .llseek = default_llseek, - .release = hppfs_release, -}; - -static int hppfs_statfs(struct dentry *dentry, struct kstatfs *sf) -{ - sf->f_blocks = 0; - sf->f_bfree = 0; - sf->f_bavail = 0; - sf->f_files = 0; - sf->f_ffree = 0; - sf->f_type = HPPFS_SUPER_MAGIC; - return 0; -} - -static struct inode *hppfs_alloc_inode(struct super_block *sb) -{ - struct hppfs_inode_info *hi; - - hi = kmalloc(sizeof(*hi), GFP_KERNEL); - if (!hi) - return NULL; - - hi->proc_dentry = NULL; - inode_init_once(&hi->vfs_inode); - return &hi->vfs_inode; -} - -void hppfs_evict_inode(struct inode *ino) -{ - clear_inode(ino); - dput(HPPFS_I(ino)->proc_dentry); - mntput(ino->i_sb->s_fs_info); -} - -static void hppfs_i_callback(struct rcu_head *head) -{ - struct inode *inode = container_of(head, struct inode, i_rcu); - kfree(HPPFS_I(inode)); -} - -static void hppfs_destroy_inode(struct inode *inode) -{ - call_rcu(&inode->i_rcu, hppfs_i_callback); -} - -static const struct super_operations hppfs_sbops = { - .alloc_inode = hppfs_alloc_inode, - .destroy_inode = hppfs_destroy_inode, - .evict_inode = hppfs_evict_inode, - .statfs = hppfs_statfs, -}; - -static int hppfs_readlink(struct dentry *dentry, char __user *buffer, - int buflen) -{ - struct dentry *proc_dentry = HPPFS_I(d_inode(dentry))->proc_dentry; - return d_inode(proc_dentry)->i_op->readlink(proc_dentry, buffer, - buflen); -} - -static void *hppfs_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - struct dentry *proc_dentry = HPPFS_I(d_inode(dentry))->proc_dentry; - - return d_inode(proc_dentry)->i_op->follow_link(proc_dentry, nd); -} - -static void hppfs_put_link(struct dentry *dentry, struct nameidata *nd, - void *cookie) -{ - struct dentry *proc_dentry = HPPFS_I(d_inode(dentry))->proc_dentry; - - if (d_inode(proc_dentry)->i_op->put_link) - d_inode(proc_dentry)->i_op->put_link(proc_dentry, nd, cookie); -} - -static const struct inode_operations hppfs_dir_iops = { - .lookup = hppfs_lookup, -}; - -static const struct inode_operations hppfs_link_iops = { - .readlink = hppfs_readlink, - .follow_link = hppfs_follow_link, - .put_link = hppfs_put_link, -}; - -static struct inode *get_inode(struct super_block *sb, struct dentry *dentry) -{ - struct inode *proc_ino = d_inode(dentry); - struct inode *inode = new_inode(sb); - - if (!inode) { - dput(dentry); - return NULL; - } - - if (d_is_dir(dentry)) { - inode->i_op = &hppfs_dir_iops; - inode->i_fop = &hppfs_dir_fops; - } else if (d_is_symlink(dentry)) { - inode->i_op = &hppfs_link_iops; - inode->i_fop = &hppfs_file_fops; - } else { - inode->i_op = &hppfs_file_iops; - inode->i_fop = &hppfs_file_fops; - } - - HPPFS_I(inode)->proc_dentry = dentry; - - inode->i_uid = proc_ino->i_uid; - inode->i_gid = proc_ino->i_gid; - inode->i_atime = proc_ino->i_atime; - inode->i_mtime = proc_ino->i_mtime; - inode->i_ctime = proc_ino->i_ctime; - inode->i_ino = proc_ino->i_ino; - inode->i_mode = proc_ino->i_mode; - set_nlink(inode, proc_ino->i_nlink); - inode->i_size = proc_ino->i_size; - inode->i_blocks = proc_ino->i_blocks; - - return inode; -} - -static int hppfs_fill_super(struct super_block *sb, void *d, int silent) -{ - struct inode *root_inode; - struct vfsmount *proc_mnt; - int err = -ENOENT; - - proc_mnt = mntget(task_active_pid_ns(current)->proc_mnt); - if (IS_ERR(proc_mnt)) - goto out; - - sb->s_blocksize = 1024; - sb->s_blocksize_bits = 10; - sb->s_magic = HPPFS_SUPER_MAGIC; - sb->s_op = &hppfs_sbops; - sb->s_fs_info = proc_mnt; - - err = -ENOMEM; - root_inode = get_inode(sb, dget(proc_mnt->mnt_root)); - sb->s_root = d_make_root(root_inode); - if (!sb->s_root) - goto out_mntput; - - return 0; - - out_mntput: - mntput(proc_mnt); - out: - return(err); -} - -static struct dentry *hppfs_read_super(struct file_system_type *type, - int flags, const char *dev_name, - void *data) -{ - return mount_nodev(type, flags, data, hppfs_fill_super); -} - -static struct file_system_type hppfs_type = { - .owner = THIS_MODULE, - .name = "hppfs", - .mount = hppfs_read_super, - .kill_sb = kill_anon_super, - .fs_flags = 0, -}; -MODULE_ALIAS_FS("hppfs"); - -static int __init init_hppfs(void) -{ - return register_filesystem(&hppfs_type); -} - -static void __exit exit_hppfs(void) -{ - unregister_filesystem(&hppfs_type); -} - -module_init(init_hppfs) -module_exit(exit_hppfs) -MODULE_LICENSE("GPL"); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 87724c1d7..973c24ce5 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -130,7 +130,6 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) goto out; ret = 0; - hugetlb_prefault_arch_hook(vma->vm_mm); if (vma->vm_flags & VM_WRITE && inode->i_size < len) inode->i_size = len; out: @@ -1011,6 +1010,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size, inode = hugetlbfs_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0); if (!inode) goto out_dentry; + if (creat_flags == HUGETLB_SHMFS_INODE) + inode->i_flags |= S_PRIVATE; file = ERR_PTR(-ENOMEM); if (hugetlb_reserve_pages(inode, 0, diff --git a/fs/inode.c b/fs/inode.c index 205733701..d30640f7a 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -58,7 +58,6 @@ static struct hlist_head *inode_hashtable __read_mostly; static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock); -EXPORT_SYMBOL(inode_sb_list_lock); /* * Empty aops. Can be used for the cases where the user does not @@ -153,6 +152,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_pipe = NULL; inode->i_bdev = NULL; inode->i_cdev = NULL; + inode->i_link = NULL; inode->i_rdev = 0; inode->dirtied_when = 0; @@ -224,6 +224,7 @@ EXPORT_SYMBOL(free_inode_nonrcu); void __destroy_inode(struct inode *inode) { BUG_ON(inode_has_buffers(inode)); + inode_detach_wb(inode); security_inode_free(inode); fsnotify_inode_delete(inode); locks_free_lock_context(inode->i_flctx); @@ -831,8 +832,6 @@ unsigned int get_next_ino(void) unsigned int *p = &get_cpu_var(last_ino); unsigned int res = *p; -start: - #ifdef CONFIG_SMP if (unlikely((res & (LAST_INO_BATCH-1)) == 0)) { static atomic_t shared_last_ino; @@ -842,8 +841,10 @@ start: } #endif - if (unlikely(!++res)) - goto start; /* never zero */ + res++; + /* get_next_ino should not provide a 0 inode number */ + if (unlikely(!res)) + res++; *p = res; put_cpu_var(last_ino); return res; @@ -1589,36 +1590,47 @@ static int update_time(struct inode *inode, struct timespec *time, int flags) * This function automatically handles read only file systems and media, * as well as the "noatime" flag and inode specific "noatime" markers. */ -void touch_atime(const struct path *path) +bool atime_needs_update(const struct path *path, struct inode *inode) { struct vfsmount *mnt = path->mnt; - struct inode *inode = d_inode(path->dentry); struct timespec now; if (inode->i_flags & S_NOATIME) - return; + return false; if (IS_NOATIME(inode)) - return; + return false; if ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode)) - return; + return false; if (mnt->mnt_flags & MNT_NOATIME) - return; + return false; if ((mnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode)) - return; + return false; now = current_fs_time(inode->i_sb); if (!relatime_need_update(mnt, inode, now)) - return; + return false; if (timespec_equal(&inode->i_atime, &now)) + return false; + + return true; +} + +void touch_atime(const struct path *path) +{ + struct vfsmount *mnt = path->mnt; + struct inode *inode = d_inode(path->dentry); + struct timespec now; + + if (!atime_needs_update(path, inode)) return; if (!sb_start_write_trylock(inode->i_sb)) return; - if (__mnt_want_write(mnt)) + if (__mnt_want_write(mnt) != 0) goto skip_update; /* * File systems can error out when updating inodes if they need to @@ -1629,6 +1641,7 @@ void touch_atime(const struct path *path) * We may also fail on filesystems that have the ability to make parts * of the fs read only, e.g. subvolumes in Btrfs. */ + now = current_fs_time(inode->i_sb); update_time(inode, &now, S_ATIME); __mnt_drop_write(mnt); skip_update: @@ -1665,7 +1678,31 @@ int should_remove_suid(struct dentry *dentry) } EXPORT_SYMBOL(should_remove_suid); -static int __remove_suid(struct dentry *dentry, int kill) +/* + * Return mask of changes for notify_change() that need to be done as a + * response to write or truncate. Return 0 if nothing has to be changed. + * Negative value on error (change should be denied). + */ +int dentry_needs_remove_privs(struct dentry *dentry) +{ + struct inode *inode = d_inode(dentry); + int mask = 0; + int ret; + + if (IS_NOSEC(inode)) + return 0; + + mask = should_remove_suid(dentry); + ret = security_inode_need_killpriv(dentry); + if (ret < 0) + return ret; + if (ret) + mask |= ATTR_KILL_PRIV; + return mask; +} +EXPORT_SYMBOL(dentry_needs_remove_privs); + +static int __remove_privs(struct dentry *dentry, int kill) { struct iattr newattrs; @@ -1677,33 +1714,32 @@ static int __remove_suid(struct dentry *dentry, int kill) return notify_change(dentry, &newattrs, NULL); } -int file_remove_suid(struct file *file) +/* + * Remove special file priviledges (suid, capabilities) when file is written + * to or truncated. + */ +int file_remove_privs(struct file *file) { struct dentry *dentry = file->f_path.dentry; struct inode *inode = d_inode(dentry); - int killsuid; - int killpriv; + int kill; int error = 0; /* Fast path for nothing security related */ if (IS_NOSEC(inode)) return 0; - killsuid = should_remove_suid(dentry); - killpriv = security_inode_need_killpriv(dentry); - - if (killpriv < 0) - return killpriv; - if (killpriv) - error = security_inode_killpriv(dentry); - if (!error && killsuid) - error = __remove_suid(dentry, killsuid); + kill = file_needs_remove_privs(file); + if (kill < 0) + return kill; + if (kill) + error = __remove_privs(dentry, kill); if (!error) inode_has_no_xattr(inode); return error; } -EXPORT_SYMBOL(file_remove_suid); +EXPORT_SYMBOL(file_remove_privs); /** * file_update_time - update mtime and ctime time @@ -1958,9 +1994,8 @@ EXPORT_SYMBOL(inode_dio_wait); * inode is being instantiated). The reason for the cmpxchg() loop * --- which wouldn't be necessary if all code paths which modify * i_flags actually followed this rule, is that there is at least one - * code path which doesn't today --- for example, - * __generic_file_aio_write() calls file_remove_suid() without holding - * i_mutex --- so we use cmpxchg() out of an abundance of caution. + * code path which doesn't today so we use cmpxchg() out of an abundance + * of caution. * * In the long run, i_mutex is overkill, and we should probably look * at using the i_lock spinlock to protect i_flags, and then make sure diff --git a/fs/internal.h b/fs/internal.h index 01dce1d14..4d5af583a 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -107,6 +107,7 @@ extern struct file *do_file_open_root(struct dentry *, struct vfsmount *, extern long do_handle_open(int mountdirfd, struct file_handle __user *ufh, int open_flag); extern int open_check_o_direct(struct file *f); +extern int vfs_open(const struct path *, struct file *, const struct cred *); /* * inode.c diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 112fad9e1..4ff3fad4e 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -371,16 +371,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, */ J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); -retry_alloc: - new_bh = alloc_buffer_head(GFP_NOFS); - if (!new_bh) { - /* - * Failure is not an option, but __GFP_NOFAIL is going - * away; so we retry ourselves here. - */ - congestion_wait(BLK_RW_ASYNC, HZ/50); - goto retry_alloc; - } + new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); /* keep subsequent assertions sane */ atomic_set(&new_bh->b_count, 1); @@ -1144,7 +1135,6 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev, { journal_t *journal = journal_init_common(); struct buffer_head *bh; - char *p; int n; if (!journal) @@ -1157,9 +1147,7 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev, journal->j_blk_offset = start; journal->j_maxlen = len; bdevname(journal->j_dev, journal->j_devname); - p = journal->j_devname; - while ((p = strchr(p, '/'))) - *p = '!'; + strreplace(journal->j_devname, '/', '!'); jbd2_stats_proc_init(journal); n = journal->j_blocksize / sizeof(journal_block_tag_t); journal->j_wbufsize = n; @@ -1211,10 +1199,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode) journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev; journal->j_inode = inode; bdevname(journal->j_dev, journal->j_devname); - p = journal->j_devname; - while ((p = strchr(p, '/'))) - *p = '!'; - p = journal->j_devname + strlen(journal->j_devname); + p = strreplace(journal->j_devname, '/', '!'); sprintf(p, "-%lu", journal->j_inode->i_ino); jbd_debug(1, "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n", @@ -2354,7 +2339,7 @@ static int jbd2_journal_init_journal_head_cache(void) jbd2_journal_head_cache = kmem_cache_create("jbd2_journal_head", sizeof(struct journal_head), 0, /* offset */ - SLAB_TEMPORARY, /* flags */ + SLAB_TEMPORARY | SLAB_DESTROY_BY_RCU, NULL); /* ctor */ retval = 0; if (!jbd2_journal_head_cache) { @@ -2386,10 +2371,8 @@ static struct journal_head *journal_alloc_journal_head(void) if (!ret) { jbd_debug(1, "out of memory for journal_head\n"); pr_notice_ratelimited("ENOMEM in %s, retrying.\n", __func__); - while (!ret) { - yield(); - ret = kmem_cache_zalloc(jbd2_journal_head_cache, GFP_NOFS); - } + ret = kmem_cache_zalloc(jbd2_journal_head_cache, + GFP_NOFS | __GFP_NOFAIL); } return ret; } diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 14214da80..0abf2e7f7 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -141,11 +141,13 @@ static int insert_revoke_hash(journal_t *journal, unsigned long long blocknr, { struct list_head *hash_list; struct jbd2_revoke_record_s *record; + gfp_t gfp_mask = GFP_NOFS; -repeat: - record = kmem_cache_alloc(jbd2_revoke_record_cache, GFP_NOFS); + if (journal_oom_retry) + gfp_mask |= __GFP_NOFAIL; + record = kmem_cache_alloc(jbd2_revoke_record_cache, gfp_mask); if (!record) - goto oom; + return -ENOMEM; record->sequence = seq; record->blocknr = blocknr; @@ -154,13 +156,6 @@ repeat: list_add(&record->hash, hash_list); spin_unlock(&journal->j_revoke_lock); return 0; - -oom: - if (!journal_oom_retry) - return -ENOMEM; - jbd_debug(1, "ENOMEM in %s, retrying\n", __func__); - yield(); - goto repeat; } /* Find a revoke record in the journal's hash table. */ diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index ff2f2e6ad..f3d06174b 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -278,22 +278,16 @@ static int start_this_handle(journal_t *journal, handle_t *handle, alloc_transaction: if (!journal->j_running_transaction) { + /* + * If __GFP_FS is not present, then we may be being called from + * inside the fs writeback layer, so we MUST NOT fail. + */ + if ((gfp_mask & __GFP_FS) == 0) + gfp_mask |= __GFP_NOFAIL; new_transaction = kmem_cache_zalloc(transaction_cache, gfp_mask); - if (!new_transaction) { - /* - * If __GFP_FS is not present, then we may be - * being called from inside the fs writeback - * layer, so we MUST NOT fail. Since - * __GFP_NOFAIL is going away, we will arrange - * to retry the allocation ourselves. - */ - if ((gfp_mask & __GFP_FS) == 0) { - congestion_wait(BLK_RW_ASYNC, HZ/50); - goto alloc_transaction; - } + if (!new_transaction) return -ENOMEM; - } } jbd_debug(3, "New handle %p going live.\n", handle); @@ -761,6 +755,30 @@ static void warn_dirty_buffer(struct buffer_head *bh) bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); } +/* Call t_frozen trigger and copy buffer data into jh->b_frozen_data. */ +static void jbd2_freeze_jh_data(struct journal_head *jh) +{ + struct page *page; + int offset; + char *source; + struct buffer_head *bh = jh2bh(jh); + + J_EXPECT_JH(jh, buffer_uptodate(bh), "Possible IO failure.\n"); + page = bh->b_page; + offset = offset_in_page(bh->b_data); + source = kmap_atomic(page); + /* Fire data frozen trigger just before we copy the data */ + jbd2_buffer_frozen_trigger(jh, source + offset, jh->b_triggers); + memcpy(jh->b_frozen_data, source + offset, bh->b_size); + kunmap_atomic(source); + + /* + * Now that the frozen data is saved off, we need to store any matching + * triggers. + */ + jh->b_frozen_triggers = jh->b_triggers; +} + /* * If the buffer is already part of the current transaction, then there * is nothing we need to do. If it is already part of a prior @@ -780,7 +798,6 @@ do_get_write_access(handle_t *handle, struct journal_head *jh, journal_t *journal; int error; char *frozen_buffer = NULL; - int need_copy = 0; unsigned long start_lock, time_lock; if (is_handle_aborted(handle)) @@ -867,119 +884,96 @@ repeat: jh->b_modified = 0; /* + * If the buffer is not journaled right now, we need to make sure it + * doesn't get written to disk before the caller actually commits the + * new data + */ + if (!jh->b_transaction) { + JBUFFER_TRACE(jh, "no transaction"); + J_ASSERT_JH(jh, !jh->b_next_transaction); + JBUFFER_TRACE(jh, "file as BJ_Reserved"); + /* + * Make sure all stores to jh (b_modified, b_frozen_data) are + * visible before attaching it to the running transaction. + * Paired with barrier in jbd2_write_access_granted() + */ + smp_wmb(); + spin_lock(&journal->j_list_lock); + __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); + spin_unlock(&journal->j_list_lock); + goto done; + } + /* * If there is already a copy-out version of this buffer, then we don't * need to make another one */ if (jh->b_frozen_data) { JBUFFER_TRACE(jh, "has frozen data"); J_ASSERT_JH(jh, jh->b_next_transaction == NULL); - jh->b_next_transaction = transaction; - goto done; + goto attach_next; } - /* Is there data here we need to preserve? */ + JBUFFER_TRACE(jh, "owned by older transaction"); + J_ASSERT_JH(jh, jh->b_next_transaction == NULL); + J_ASSERT_JH(jh, jh->b_transaction == journal->j_committing_transaction); - if (jh->b_transaction && jh->b_transaction != transaction) { - JBUFFER_TRACE(jh, "owned by older transaction"); - J_ASSERT_JH(jh, jh->b_next_transaction == NULL); - J_ASSERT_JH(jh, jh->b_transaction == - journal->j_committing_transaction); + /* + * There is one case we have to be very careful about. If the + * committing transaction is currently writing this buffer out to disk + * and has NOT made a copy-out, then we cannot modify the buffer + * contents at all right now. The essence of copy-out is that it is + * the extra copy, not the primary copy, which gets journaled. If the + * primary copy is already going to disk then we cannot do copy-out + * here. + */ + if (buffer_shadow(bh)) { + JBUFFER_TRACE(jh, "on shadow: sleep"); + jbd_unlock_bh_state(bh); + wait_on_bit_io(&bh->b_state, BH_Shadow, TASK_UNINTERRUPTIBLE); + goto repeat; + } - /* There is one case we have to be very careful about. - * If the committing transaction is currently writing - * this buffer out to disk and has NOT made a copy-out, - * then we cannot modify the buffer contents at all - * right now. The essence of copy-out is that it is the - * extra copy, not the primary copy, which gets - * journaled. If the primary copy is already going to - * disk then we cannot do copy-out here. */ - - if (buffer_shadow(bh)) { - JBUFFER_TRACE(jh, "on shadow: sleep"); + /* + * Only do the copy if the currently-owning transaction still needs it. + * If buffer isn't on BJ_Metadata list, the committing transaction is + * past that stage (here we use the fact that BH_Shadow is set under + * bh_state lock together with refiling to BJ_Shadow list and at this + * point we know the buffer doesn't have BH_Shadow set). + * + * Subtle point, though: if this is a get_undo_access, then we will be + * relying on the frozen_data to contain the new value of the + * committed_data record after the transaction, so we HAVE to force the + * frozen_data copy in that case. + */ + if (jh->b_jlist == BJ_Metadata || force_copy) { + JBUFFER_TRACE(jh, "generate frozen data"); + if (!frozen_buffer) { + JBUFFER_TRACE(jh, "allocate memory for buffer"); jbd_unlock_bh_state(bh); - wait_on_bit_io(&bh->b_state, BH_Shadow, - TASK_UNINTERRUPTIBLE); - goto repeat; - } - - /* - * Only do the copy if the currently-owning transaction still - * needs it. If buffer isn't on BJ_Metadata list, the - * committing transaction is past that stage (here we use the - * fact that BH_Shadow is set under bh_state lock together with - * refiling to BJ_Shadow list and at this point we know the - * buffer doesn't have BH_Shadow set). - * - * Subtle point, though: if this is a get_undo_access, - * then we will be relying on the frozen_data to contain - * the new value of the committed_data record after the - * transaction, so we HAVE to force the frozen_data copy - * in that case. - */ - if (jh->b_jlist == BJ_Metadata || force_copy) { - JBUFFER_TRACE(jh, "generate frozen data"); + frozen_buffer = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS); if (!frozen_buffer) { - JBUFFER_TRACE(jh, "allocate memory for buffer"); - jbd_unlock_bh_state(bh); - frozen_buffer = - jbd2_alloc(jh2bh(jh)->b_size, - GFP_NOFS); - if (!frozen_buffer) { - printk(KERN_ERR - "%s: OOM for frozen_buffer\n", - __func__); - JBUFFER_TRACE(jh, "oom!"); - error = -ENOMEM; - jbd_lock_bh_state(bh); - goto done; - } - goto repeat; + printk(KERN_ERR "%s: OOM for frozen_buffer\n", + __func__); + JBUFFER_TRACE(jh, "oom!"); + error = -ENOMEM; + goto out; } - jh->b_frozen_data = frozen_buffer; - frozen_buffer = NULL; - need_copy = 1; + goto repeat; } - jh->b_next_transaction = transaction; + jh->b_frozen_data = frozen_buffer; + frozen_buffer = NULL; + jbd2_freeze_jh_data(jh); } - - +attach_next: /* - * Finally, if the buffer is not journaled right now, we need to make - * sure it doesn't get written to disk before the caller actually - * commits the new data + * Make sure all stores to jh (b_modified, b_frozen_data) are visible + * before attaching it to the running transaction. Paired with barrier + * in jbd2_write_access_granted() */ - if (!jh->b_transaction) { - JBUFFER_TRACE(jh, "no transaction"); - J_ASSERT_JH(jh, !jh->b_next_transaction); - JBUFFER_TRACE(jh, "file as BJ_Reserved"); - spin_lock(&journal->j_list_lock); - __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); - spin_unlock(&journal->j_list_lock); - } + smp_wmb(); + jh->b_next_transaction = transaction; done: - if (need_copy) { - struct page *page; - int offset; - char *source; - - J_EXPECT_JH(jh, buffer_uptodate(jh2bh(jh)), - "Possible IO failure.\n"); - page = jh2bh(jh)->b_page; - offset = offset_in_page(jh2bh(jh)->b_data); - source = kmap_atomic(page); - /* Fire data frozen trigger just before we copy the data */ - jbd2_buffer_frozen_trigger(jh, source + offset, - jh->b_triggers); - memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); - kunmap_atomic(source); - - /* - * Now that the frozen data is saved off, we need to store - * any matching triggers. - */ - jh->b_frozen_triggers = jh->b_triggers; - } jbd_unlock_bh_state(bh); /* @@ -996,6 +990,55 @@ out: return error; } +/* Fast check whether buffer is already attached to the required transaction */ +static bool jbd2_write_access_granted(handle_t *handle, struct buffer_head *bh) +{ + struct journal_head *jh; + bool ret = false; + + /* Dirty buffers require special handling... */ + if (buffer_dirty(bh)) + return false; + + /* + * RCU protects us from dereferencing freed pages. So the checks we do + * are guaranteed not to oops. However the jh slab object can get freed + * & reallocated while we work with it. So we have to be careful. When + * we see jh attached to the running transaction, we know it must stay + * so until the transaction is committed. Thus jh won't be freed and + * will be attached to the same bh while we run. However it can + * happen jh gets freed, reallocated, and attached to the transaction + * just after we get pointer to it from bh. So we have to be careful + * and recheck jh still belongs to our bh before we return success. + */ + rcu_read_lock(); + if (!buffer_jbd(bh)) + goto out; + /* This should be bh2jh() but that doesn't work with inline functions */ + jh = READ_ONCE(bh->b_private); + if (!jh) + goto out; + if (jh->b_transaction != handle->h_transaction && + jh->b_next_transaction != handle->h_transaction) + goto out; + /* + * There are two reasons for the barrier here: + * 1) Make sure to fetch b_bh after we did previous checks so that we + * detect when jh went through free, realloc, attach to transaction + * while we were checking. Paired with implicit barrier in that path. + * 2) So that access to bh done after jbd2_write_access_granted() + * doesn't get reordered and see inconsistent state of concurrent + * do_get_write_access(). + */ + smp_mb(); + if (unlikely(jh->b_bh != bh)) + goto out; + ret = true; +out: + rcu_read_unlock(); + return ret; +} + /** * int jbd2_journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update. * @handle: transaction to add buffer modifications to @@ -1009,9 +1052,13 @@ out: int jbd2_journal_get_write_access(handle_t *handle, struct buffer_head *bh) { - struct journal_head *jh = jbd2_journal_add_journal_head(bh); + struct journal_head *jh; int rc; + if (jbd2_write_access_granted(handle, bh)) + return 0; + + jh = jbd2_journal_add_journal_head(bh); /* We do not want to get caught playing with fields which the * log thread also manipulates. Make sure that the buffer * completes any outstanding IO before proceeding. */ @@ -1141,11 +1188,14 @@ out: int jbd2_journal_get_undo_access(handle_t *handle, struct buffer_head *bh) { int err; - struct journal_head *jh = jbd2_journal_add_journal_head(bh); + struct journal_head *jh; char *committed_data = NULL; JBUFFER_TRACE(jh, "entry"); + if (jbd2_write_access_granted(handle, bh)) + return 0; + jh = jbd2_journal_add_journal_head(bh); /* * Do this first --- it can drop the journal lock, so we want to * make sure that obtaining the committed_data is done diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 1ba5c9794..811800229 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -354,6 +354,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char ret = -ENOMEM; goto fail; } + inode->i_link = f->target; jffs2_dbg(1, "%s(): symlink's target '%s' cached\n", __func__, (char *)f->target); diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index fe5ea080b..2caf16820 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -272,12 +272,9 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) mutex_lock(&f->sem); ret = jffs2_do_read_inode(c, f, inode->i_ino, &latest_node); + if (ret) + goto error; - if (ret) { - mutex_unlock(&f->sem); - iget_failed(inode); - return ERR_PTR(ret); - } inode->i_mode = jemode_to_cpu(latest_node.mode); i_uid_write(inode, je16_to_cpu(latest_node.uid)); i_gid_write(inode, je16_to_cpu(latest_node.gid)); @@ -294,6 +291,7 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) case S_IFLNK: inode->i_op = &jffs2_symlink_inode_operations; + inode->i_link = f->target; break; case S_IFDIR: diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index d200a9b8f..824e61ede 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -19,7 +19,7 @@ struct kstatfs; struct kvec; -#define JFFS2_INODE_INFO(i) (list_entry(i, struct jffs2_inode_info, vfs_inode)) +#define JFFS2_INODE_INFO(i) (container_of(i, struct jffs2_inode_info, vfs_inode)) #define OFNI_EDONI_2SFFJ(f) (&(f)->vfs_inode) #define JFFS2_SB_INFO(sb) (sb->s_fs_info) #define OFNI_BS_2SFFJ(c) ((struct super_block *)c->os_priv) diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index dddbde4f5..28e0aab42 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -1203,17 +1203,13 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c, JFFS2_ERROR("failed to read from flash: error %d, %zd of %zd bytes read\n", ret, retlen, sizeof(*latest_node)); /* FIXME: If this fails, there seems to be a memory leak. Find it. */ - mutex_unlock(&f->sem); - jffs2_do_clear_inode(c, f); - return ret?ret:-EIO; + return ret ? ret : -EIO; } crc = crc32(0, latest_node, sizeof(*latest_node)-8); if (crc != je32_to_cpu(latest_node->node_crc)) { JFFS2_ERROR("CRC failed for read_inode of inode %u at physical location 0x%x\n", f->inocache->ino, ref_offset(rii.latest_ref)); - mutex_unlock(&f->sem); - jffs2_do_clear_inode(c, f); return -EIO; } @@ -1250,16 +1246,11 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c, * keep in RAM to facilitate quick follow symlink * operation. */ uint32_t csize = je32_to_cpu(latest_node->csize); - if (csize > JFFS2_MAX_NAME_LEN) { - mutex_unlock(&f->sem); - jffs2_do_clear_inode(c, f); + if (csize > JFFS2_MAX_NAME_LEN) return -ENAMETOOLONG; - } f->target = kmalloc(csize + 1, GFP_KERNEL); if (!f->target) { JFFS2_ERROR("can't allocate %u bytes of memory for the symlink target path cache\n", csize); - mutex_unlock(&f->sem); - jffs2_do_clear_inode(c, f); return -ENOMEM; } @@ -1271,8 +1262,6 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c, ret = -EIO; kfree(f->target); f->target = NULL; - mutex_unlock(&f->sem); - jffs2_do_clear_inode(c, f); return ret; } @@ -1289,15 +1278,11 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c, if (f->metadata) { JFFS2_ERROR("Argh. Special inode #%u with mode 0%o had metadata node\n", f->inocache->ino, jemode_to_cpu(latest_node->mode)); - mutex_unlock(&f->sem); - jffs2_do_clear_inode(c, f); return -EIO; } if (!frag_first(&f->fragtree)) { JFFS2_ERROR("Argh. Special inode #%u with mode 0%o has no fragments\n", f->inocache->ino, jemode_to_cpu(latest_node->mode)); - mutex_unlock(&f->sem); - jffs2_do_clear_inode(c, f); return -EIO; } /* ASSERT: f->fraglist != NULL */ @@ -1305,8 +1290,6 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c, JFFS2_ERROR("Argh. Special inode #%u with mode 0x%x had more than one node\n", f->inocache->ino, jemode_to_cpu(latest_node->mode)); /* FIXME: Deal with it - check crc32, check for duplicate node, check times and discard the older one */ - mutex_unlock(&f->sem); - jffs2_do_clear_inode(c, f); return -EIO; } /* OK. We're happy */ @@ -1400,10 +1383,8 @@ int jffs2_do_crccheck_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *i f->inocache = ic; ret = jffs2_do_read_inode_internal(c, f, &n); - if (!ret) { - mutex_unlock(&f->sem); - jffs2_do_clear_inode(c, f); - } + mutex_unlock(&f->sem); + jffs2_do_clear_inode(c, f); jffs2_xattr_do_crccheck_inode(c, ic); kfree (f); return ret; diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c index 1fefa25d0..8ce2f2401 100644 --- a/fs/jffs2/symlink.c +++ b/fs/jffs2/symlink.c @@ -9,58 +9,15 @@ * */ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/kernel.h> -#include <linux/fs.h> -#include <linux/namei.h> #include "nodelist.h" -static void *jffs2_follow_link(struct dentry *dentry, struct nameidata *nd); - const struct inode_operations jffs2_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = jffs2_follow_link, + .follow_link = simple_follow_link, .setattr = jffs2_setattr, .setxattr = jffs2_setxattr, .getxattr = jffs2_getxattr, .listxattr = jffs2_listxattr, .removexattr = jffs2_removexattr }; - -static void *jffs2_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - struct jffs2_inode_info *f = JFFS2_INODE_INFO(d_inode(dentry)); - char *p = (char *)f->target; - - /* - * We don't acquire the f->sem mutex here since the only data we - * use is f->target. - * - * 1. If we are here the inode has already built and f->target has - * to point to the target path. - * 2. Nobody uses f->target (if the inode is symlink's inode). The - * exception is inode freeing function which frees f->target. But - * it can't be called while we are here and before VFS has - * stopped using our f->target string which we provide by means of - * nd_set_link() call. - */ - - if (!p) { - pr_err("%s(): can't find symlink target\n", __func__); - p = ERR_PTR(-EIO); - } - jffs2_dbg(1, "%s(): target path is '%s'\n", - __func__, (char *)f->target); - - nd_set_link(nd, p); - - /* - * We will unlock the f->sem mutex but VFS will use the f->target string. This is safe - * since the only way that may cause f->target to be changed is iput() operation. - * But VFS will not use f->target after iput() has been called. - */ - return NULL; -} - diff --git a/fs/jfs/file.c b/fs/jfs/file.c index e98d39d75..b9dc23cd0 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -76,7 +76,7 @@ static int jfs_open(struct inode *inode, struct file *file) if (ji->active_ag == -1) { struct jfs_sb_info *jfs_sb = JFS_SBI(inode->i_sb); ji->active_ag = BLKTOAG(addressPXD(&ji->ixpxd), jfs_sb); - atomic_inc( &jfs_sb->bmap->db_active[ji->active_ag]); + atomic_inc(&jfs_sb->bmap->db_active[ji->active_ag]); } spin_unlock_irq(&ji->ag_lock); } diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 070dc4b33..41aa3ca6a 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -63,11 +63,12 @@ struct inode *jfs_iget(struct super_block *sb, unsigned long ino) inode->i_mapping->a_ops = &jfs_aops; } else { inode->i_op = &jfs_fast_symlink_inode_operations; + inode->i_link = JFS_IP(inode)->i_inline; /* * The inline data should be null-terminated, but * don't let on-disk corruption crash the kernel */ - JFS_IP(inode)->i_inline[inode->i_size] = '\0'; + inode->i_link[inode->i_size] = '\0'; } } else { inode->i_op = &jfs_file_inode_operations; @@ -133,11 +134,11 @@ int jfs_write_inode(struct inode *inode, struct writeback_control *wbc) * It has been committed since the last change, but was still * on the dirty inode list. */ - if (!test_cflag(COMMIT_Dirty, inode)) { + if (!test_cflag(COMMIT_Dirty, inode)) { /* Make sure committed changes hit the disk */ jfs_flush_journal(JFS_SBI(inode->i_sb)->log, wait); return 0; - } + } if (jfs_commit_inode(inode, wait)) { jfs_err("jfs_write_inode: jfs_commit_inode failed!"); diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c index 93a123289..8db8b7d61 100644 --- a/fs/jfs/ioctl.c +++ b/fs/jfs/ioctl.c @@ -180,9 +180,6 @@ long jfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case JFS_IOC_SETFLAGS32: cmd = JFS_IOC_SETFLAGS; break; - case FITRIM: - cmd = FITRIM; - break; } return jfs_ioctl(filp, cmd, arg); } diff --git a/fs/jfs/jfs_incore.h b/fs/jfs/jfs_incore.h index fa7e795bd..1f26d1910 100644 --- a/fs/jfs/jfs_incore.h +++ b/fs/jfs/jfs_incore.h @@ -206,7 +206,7 @@ struct jfs_sb_info { static inline struct jfs_inode_info *JFS_IP(struct inode *inode) { - return list_entry(inode, struct jfs_inode_info, vfs_inode); + return container_of(inode, struct jfs_inode_info, vfs_inode); } static inline int jfs_dirtable_inline(struct inode *inode) diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 66db7bc0e..a5ac97b9a 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -880,7 +880,6 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, int ssize; /* source pathname size */ struct btstack btstack; struct inode *ip = d_inode(dentry); - unchar *i_fastsymlink; s64 xlen = 0; int bmask = 0, xsize; s64 xaddr; @@ -946,8 +945,8 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, if (ssize <= IDATASIZE) { ip->i_op = &jfs_fast_symlink_inode_operations; - i_fastsymlink = JFS_IP(ip)->i_inline; - memcpy(i_fastsymlink, name, ssize); + ip->i_link = JFS_IP(ip)->i_inline; + memcpy(ip->i_link, name, ssize); ip->i_size = ssize - 1; /* @@ -1161,7 +1160,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, rc = dtModify(tid, new_dir, &new_dname, &ino, old_ip->i_ino, JFS_RENAME); if (rc) - goto out4; + goto out_tx; drop_nlink(new_ip); if (S_ISDIR(new_ip->i_mode)) { drop_nlink(new_ip); @@ -1186,7 +1185,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, if ((new_size = commitZeroLink(tid, new_ip)) < 0) { txAbort(tid, 1); /* Marks FS Dirty */ rc = new_size; - goto out4; + goto out_tx; } tblk = tid_to_tblock(tid); tblk->xflag |= COMMIT_DELETE; @@ -1204,7 +1203,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (rc) { jfs_err("jfs_rename didn't expect dtSearch to fail " "w/rc = %d", rc); - goto out4; + goto out_tx; } ino = old_ip->i_ino; @@ -1212,7 +1211,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (rc) { if (rc == -EIO) jfs_err("jfs_rename: dtInsert returned -EIO"); - goto out4; + goto out_tx; } if (S_ISDIR(old_ip->i_mode)) inc_nlink(new_dir); @@ -1227,7 +1226,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, jfs_err("jfs_rename did not expect dtDelete to return rc = %d", rc); txAbort(tid, 1); /* Marks Filesystem dirty */ - goto out4; + goto out_tx; } if (S_ISDIR(old_ip->i_mode)) { drop_nlink(old_dir); @@ -1286,7 +1285,7 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, rc = txCommit(tid, ipcount, iplist, commit_flag); - out4: + out_tx: txEnd(tid); if (new_ip) mutex_unlock(&JFS_IP(new_ip)->commit_mutex); @@ -1309,13 +1308,6 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, } if (new_ip && (new_ip->i_nlink == 0)) set_cflag(COMMIT_Nolink, new_ip); - out3: - free_UCSname(&new_dname); - out2: - free_UCSname(&old_dname); - out1: - if (new_ip && !S_ISDIR(new_ip->i_mode)) - IWRITE_UNLOCK(new_ip); /* * Truncating the directory index table is not guaranteed. It * may need to be done iteratively @@ -1326,7 +1318,13 @@ static int jfs_rename(struct inode *old_dir, struct dentry *old_dentry, clear_cflag(COMMIT_Stale, old_dir); } - + if (new_ip && !S_ISDIR(new_ip->i_mode)) + IWRITE_UNLOCK(new_ip); + out3: + free_UCSname(&new_dname); + out2: + free_UCSname(&old_dname); + out1: jfs_info("jfs_rename: returning %d", rc); return rc; } diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c index 80f42bcc4..5929e2363 100644 --- a/fs/jfs/symlink.c +++ b/fs/jfs/symlink.c @@ -17,21 +17,13 @@ */ #include <linux/fs.h> -#include <linux/namei.h> #include "jfs_incore.h" #include "jfs_inode.h" #include "jfs_xattr.h" -static void *jfs_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - char *s = JFS_IP(d_inode(dentry))->i_inline; - nd_set_link(nd, s); - return NULL; -} - const struct inode_operations jfs_fast_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = jfs_follow_link, + .follow_link = simple_follow_link, .setattr = jfs_setattr, .setxattr = jfs_setxattr, .getxattr = jfs_getxattr, diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 2bacb9988..7247252ee 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -785,7 +785,6 @@ static unsigned int kernfs_fop_poll(struct file *filp, poll_table *wait) struct kernfs_node *kn = filp->f_path.dentry->d_fsdata; struct kernfs_open_node *on = kn->attr.open; - /* need parent for the kobj, grab both */ if (!kernfs_get_active(kn)) goto trigger; diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index af9fa7499..6762bfbd8 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -76,7 +76,6 @@ extern struct kmem_cache *kernfs_node_cache; /* * inode.c */ -struct inode *kernfs_get_inode(struct super_block *sb, struct kernfs_node *kn); void kernfs_evict_inode(struct inode *inode); int kernfs_iop_permission(struct inode *inode, int mask); int kernfs_iop_setattr(struct dentry *dentry, struct iattr *iattr); diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index 8a198898e..db272528a 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -112,25 +112,18 @@ static int kernfs_getlink(struct dentry *dentry, char *path) return error; } -static void *kernfs_iop_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *kernfs_iop_follow_link(struct dentry *dentry, void **cookie) { int error = -ENOMEM; unsigned long page = get_zeroed_page(GFP_KERNEL); - if (page) { - error = kernfs_getlink(dentry, (char *) page); - if (error < 0) - free_page((unsigned long)page); - } - nd_set_link(nd, error ? ERR_PTR(error) : (char *)page); - return NULL; -} - -static void kernfs_iop_put_link(struct dentry *dentry, struct nameidata *nd, - void *cookie) -{ - char *page = nd_get_link(nd); - if (!IS_ERR(page)) + if (!page) + return ERR_PTR(-ENOMEM); + error = kernfs_getlink(dentry, (char *)page); + if (unlikely(error < 0)) { free_page((unsigned long)page); + return ERR_PTR(error); + } + return *cookie = (char *)page; } const struct inode_operations kernfs_symlink_iops = { @@ -140,7 +133,7 @@ const struct inode_operations kernfs_symlink_iops = { .listxattr = kernfs_iop_listxattr, .readlink = generic_readlink, .follow_link = kernfs_iop_follow_link, - .put_link = kernfs_iop_put_link, + .put_link = free_page_put_link, .setattr = kernfs_iop_setattr, .getattr = kernfs_iop_getattr, .permission = kernfs_iop_permission, diff --git a/fs/libfs.c b/fs/libfs.c index 02813592e..102edfd39 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -20,11 +20,6 @@ #include "internal.h" -static inline int simple_positive(struct dentry *dentry) -{ - return d_really_is_positive(dentry) && !d_unhashed(dentry); -} - int simple_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { @@ -1024,15 +1019,18 @@ int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync) } EXPORT_SYMBOL(noop_fsync); -void kfree_put_link(struct dentry *dentry, struct nameidata *nd, - void *cookie) +void kfree_put_link(struct inode *unused, void *cookie) { - char *s = nd_get_link(nd); - if (!IS_ERR(s)) - kfree(s); + kfree(cookie); } EXPORT_SYMBOL(kfree_put_link); +void free_page_put_link(struct inode *unused, void *cookie) +{ + free_page((unsigned long) cookie); +} +EXPORT_SYMBOL(free_page_put_link); + /* * nop .set_page_dirty method so that people can use .page_mkwrite on * anon inodes. @@ -1094,6 +1092,17 @@ simple_nosetlease(struct file *filp, long arg, struct file_lock **flp, } EXPORT_SYMBOL(simple_nosetlease); +const char *simple_follow_link(struct dentry *dentry, void **cookie) +{ + return d_inode(dentry)->i_link; +} +EXPORT_SYMBOL(simple_follow_link); + +const struct inode_operations simple_symlink_inode_operations = { + .follow_link = simple_follow_link, + .readlink = generic_readlink +}; +EXPORT_SYMBOL(simple_symlink_inode_operations); /* * Operations for a permanently empty directory. diff --git a/fs/locks.c b/fs/locks.c index 653faabb0..d3d558ba4 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -862,12 +862,11 @@ static int posix_locks_deadlock(struct file_lock *caller_fl, * whether or not a lock was successfully freed by testing the return * value for -ENOENT. */ -static int flock_lock_file(struct file *filp, struct file_lock *request) +static int flock_lock_inode(struct inode *inode, struct file_lock *request) { struct file_lock *new_fl = NULL; struct file_lock *fl; struct file_lock_context *ctx; - struct inode *inode = file_inode(filp); int error = 0; bool found = false; LIST_HEAD(dispose); @@ -890,7 +889,7 @@ static int flock_lock_file(struct file *filp, struct file_lock *request) goto find_conflict; list_for_each_entry(fl, &ctx->flc_flock, fl_list) { - if (filp != fl->fl_file) + if (request->fl_file != fl->fl_file) continue; if (request->fl_type == fl->fl_type) goto out; @@ -1164,20 +1163,19 @@ int posix_lock_file(struct file *filp, struct file_lock *fl, EXPORT_SYMBOL(posix_lock_file); /** - * posix_lock_file_wait - Apply a POSIX-style lock to a file - * @filp: The file to apply the lock to + * posix_lock_inode_wait - Apply a POSIX-style lock to a file + * @inode: inode of file to which lock request should be applied * @fl: The lock to be applied * - * Add a POSIX style lock to a file. - * We merge adjacent & overlapping locks whenever possible. - * POSIX locks are sorted by owner task, then by starting address + * Variant of posix_lock_file_wait that does not take a filp, and so can be + * used after the filp has already been torn down. */ -int posix_lock_file_wait(struct file *filp, struct file_lock *fl) +int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl) { int error; might_sleep (); for (;;) { - error = posix_lock_file(filp, fl, NULL); + error = __posix_lock_file(inode, fl, NULL); if (error != FILE_LOCK_DEFERRED) break; error = wait_event_interruptible(fl->fl_wait, !fl->fl_next); @@ -1189,7 +1187,7 @@ int posix_lock_file_wait(struct file *filp, struct file_lock *fl) } return error; } -EXPORT_SYMBOL(posix_lock_file_wait); +EXPORT_SYMBOL(posix_lock_inode_wait); /** * locks_mandatory_locked - Check for an active lock @@ -1851,18 +1849,18 @@ int fcntl_setlease(unsigned int fd, struct file *filp, long arg) } /** - * flock_lock_file_wait - Apply a FLOCK-style lock to a file - * @filp: The file to apply the lock to + * flock_lock_inode_wait - Apply a FLOCK-style lock to a file + * @inode: inode of the file to apply to * @fl: The lock to be applied * - * Add a FLOCK style lock to a file. + * Apply a FLOCK style lock request to an inode. */ -int flock_lock_file_wait(struct file *filp, struct file_lock *fl) +int flock_lock_inode_wait(struct inode *inode, struct file_lock *fl) { int error; might_sleep(); for (;;) { - error = flock_lock_file(filp, fl); + error = flock_lock_inode(inode, fl); if (error != FILE_LOCK_DEFERRED) break; error = wait_event_interruptible(fl->fl_wait, !fl->fl_next); @@ -1874,8 +1872,7 @@ int flock_lock_file_wait(struct file *filp, struct file_lock *fl) } return error; } - -EXPORT_SYMBOL(flock_lock_file_wait); +EXPORT_SYMBOL(flock_lock_inode_wait); /** * sys_flock: - flock() system call. @@ -2401,7 +2398,8 @@ locks_remove_flock(struct file *filp) .fl_type = F_UNLCK, .fl_end = OFFSET_MAX, }; - struct file_lock_context *flctx = file_inode(filp)->i_flctx; + struct inode *inode = file_inode(filp); + struct file_lock_context *flctx = inode->i_flctx; if (list_empty(&flctx->flc_flock)) return; @@ -2409,7 +2407,7 @@ locks_remove_flock(struct file *filp) if (filp->f_op->flock) filp->f_op->flock(filp, F_SETLKW, &fl); else - flock_lock_file(filp, &fl); + flock_lock_inode(inode, &fl); if (fl.fl_ops && fl.fl_ops->fl_release_private) fl.fl_ops->fl_release_private(&fl); diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index 4cf38f118..f9b45d46d 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -779,6 +779,7 @@ fail: const struct inode_operations logfs_symlink_iops = { .readlink = generic_readlink, .follow_link = page_follow_link_light, + .put_link = page_put_link, }; const struct inode_operations logfs_dir_iops = { diff --git a/fs/minix/dir.c b/fs/minix/dir.c index 118e4e7bc..d19ac2581 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -45,11 +45,6 @@ minix_last_byte(struct inode *inode, unsigned long page_nr) return last_byte; } -static inline unsigned long dir_pages(struct inode *inode) -{ - return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; -} - static int dir_commit_chunk(struct page *page, loff_t pos, unsigned len) { struct address_space *mapping = page->mapping; diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 1182d1e26..086cd0a61 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -62,7 +62,7 @@ static struct kmem_cache * minix_inode_cachep; static struct inode *minix_alloc_inode(struct super_block *sb) { struct minix_inode_info *ei; - ei = (struct minix_inode_info *)kmem_cache_alloc(minix_inode_cachep, GFP_KERNEL); + ei = kmem_cache_alloc(minix_inode_cachep, GFP_KERNEL); if (!ei) return NULL; return &ei->vfs_inode; diff --git a/fs/minix/minix.h b/fs/minix/minix.h index 1ebd11854..01ad81dca 100644 --- a/fs/minix/minix.h +++ b/fs/minix/minix.h @@ -84,7 +84,7 @@ static inline struct minix_sb_info *minix_sb(struct super_block *sb) static inline struct minix_inode_info *minix_i(struct inode *inode) { - return list_entry(inode, struct minix_inode_info, vfs_inode); + return container_of(inode, struct minix_inode_info, vfs_inode); } static inline unsigned minix_blocks_needed(unsigned bits, unsigned blocksize) diff --git a/fs/mount.h b/fs/mount.h index 6a61c2b3e..14db05d42 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -88,6 +88,7 @@ static inline int is_mounted(struct vfsmount *mnt) extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *); extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *); +extern int __legitimize_mnt(struct vfsmount *, unsigned); extern bool legitimize_mnt(struct vfsmount *, unsigned); extern void __detach_mounts(struct dentry *dentry); @@ -117,7 +118,6 @@ static inline void unlock_mount_hash(void) } struct proc_mounts { - struct seq_file m; struct mnt_namespace *ns; struct path root; int (*show)(struct seq_file *, struct vfsmount *); @@ -126,8 +126,6 @@ struct proc_mounts { loff_t cached_index; }; -#define proc_mounts(p) (container_of((p), struct proc_mounts, m)) - extern const struct seq_operations mounts_op; extern bool __is_local_mountpoint(struct dentry *dentry); diff --git a/fs/mpage.c b/fs/mpage.c index 3e79220ba..ca0244b69 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -605,6 +605,8 @@ alloc_new: bio_get_nr_vecs(bdev), GFP_NOFS|__GFP_HIGH); if (bio == NULL) goto confused; + + wbc_init_bio(wbc, bio); } /* @@ -612,6 +614,7 @@ alloc_new: * the confused fail path above (OOM) will be very confused when * it finds all bh marked clean (i.e. it will not write anything) */ + wbc_account_io(wbc, page, PAGE_SIZE); length = first_unmapped << blkbits; if (bio_add_page(bio, page, length, 0) < length) { bio = mpage_bio_submit(WRITE, bio); diff --git a/fs/namei.c b/fs/namei.c index fe30d3be4..1c2105ed2 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -492,6 +492,7 @@ void path_put(const struct path *path) } EXPORT_SYMBOL(path_put); +#define EMBEDDED_LEVELS 2 struct nameidata { struct path path; struct qstr last; @@ -501,10 +502,139 @@ struct nameidata { unsigned seq, m_seq; int last_type; unsigned depth; - struct file *base; - char *saved_names[MAX_NESTED_LINKS + 1]; + int total_link_count; + struct saved { + struct path link; + void *cookie; + const char *name; + struct inode *inode; + unsigned seq; + } *stack, internal[EMBEDDED_LEVELS]; + struct filename *name; + struct nameidata *saved; + unsigned root_seq; + int dfd; }; +static void set_nameidata(struct nameidata *p, int dfd, struct filename *name) +{ + struct nameidata *old = current->nameidata; + p->stack = p->internal; + p->dfd = dfd; + p->name = name; + p->total_link_count = old ? old->total_link_count : 0; + p->saved = old; + current->nameidata = p; +} + +static void restore_nameidata(void) +{ + struct nameidata *now = current->nameidata, *old = now->saved; + + current->nameidata = old; + if (old) + old->total_link_count = now->total_link_count; + if (now->stack != now->internal) { + kfree(now->stack); + now->stack = now->internal; + } +} + +static int __nd_alloc_stack(struct nameidata *nd) +{ + struct saved *p; + + if (nd->flags & LOOKUP_RCU) { + p= kmalloc(MAXSYMLINKS * sizeof(struct saved), + GFP_ATOMIC); + if (unlikely(!p)) + return -ECHILD; + } else { + p= kmalloc(MAXSYMLINKS * sizeof(struct saved), + GFP_KERNEL); + if (unlikely(!p)) + return -ENOMEM; + } + memcpy(p, nd->internal, sizeof(nd->internal)); + nd->stack = p; + return 0; +} + +static inline int nd_alloc_stack(struct nameidata *nd) +{ + if (likely(nd->depth != EMBEDDED_LEVELS)) + return 0; + if (likely(nd->stack != nd->internal)) + return 0; + return __nd_alloc_stack(nd); +} + +static void drop_links(struct nameidata *nd) +{ + int i = nd->depth; + while (i--) { + struct saved *last = nd->stack + i; + struct inode *inode = last->inode; + if (last->cookie && inode->i_op->put_link) { + inode->i_op->put_link(inode, last->cookie); + last->cookie = NULL; + } + } +} + +static void terminate_walk(struct nameidata *nd) +{ + drop_links(nd); + if (!(nd->flags & LOOKUP_RCU)) { + int i; + path_put(&nd->path); + for (i = 0; i < nd->depth; i++) + path_put(&nd->stack[i].link); + if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { + path_put(&nd->root); + nd->root.mnt = NULL; + } + } else { + nd->flags &= ~LOOKUP_RCU; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; + rcu_read_unlock(); + } + nd->depth = 0; +} + +/* path_put is needed afterwards regardless of success or failure */ +static bool legitimize_path(struct nameidata *nd, + struct path *path, unsigned seq) +{ + int res = __legitimize_mnt(path->mnt, nd->m_seq); + if (unlikely(res)) { + if (res > 0) + path->mnt = NULL; + path->dentry = NULL; + return false; + } + if (unlikely(!lockref_get_not_dead(&path->dentry->d_lockref))) { + path->dentry = NULL; + return false; + } + return !read_seqcount_retry(&path->dentry->d_seq, seq); +} + +static bool legitimize_links(struct nameidata *nd) +{ + int i; + for (i = 0; i < nd->depth; i++) { + struct saved *last = nd->stack + i; + if (unlikely(!legitimize_path(nd, &last->link, last->seq))) { + drop_links(nd); + nd->depth = i + 1; + return false; + } + } + return true; +} + /* * Path walking has 2 modes, rcu-walk and ref-walk (see * Documentation/filesystems/path-lookup.txt). In situations when we can't @@ -520,35 +650,28 @@ struct nameidata { * unlazy_walk - try to switch to ref-walk mode. * @nd: nameidata pathwalk data * @dentry: child of nd->path.dentry or NULL + * @seq: seq number to check dentry against * Returns: 0 on success, -ECHILD on failure * * unlazy_walk attempts to legitimize the current nd->path, nd->root and dentry * for ref-walk mode. @dentry must be a path found by a do_lookup call on * @nd or NULL. Must be called from rcu-walk context. + * Nothing should touch nameidata between unlazy_walk() failure and + * terminate_walk(). */ -static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) +static int unlazy_walk(struct nameidata *nd, struct dentry *dentry, unsigned seq) { - struct fs_struct *fs = current->fs; struct dentry *parent = nd->path.dentry; BUG_ON(!(nd->flags & LOOKUP_RCU)); - /* - * After legitimizing the bastards, terminate_walk() - * will do the right thing for non-RCU mode, and all our - * subsequent exit cases should rcu_read_unlock() - * before returning. Do vfsmount first; if dentry - * can't be legitimized, just set nd->path.dentry to NULL - * and rely on dput(NULL) being a no-op. - */ - if (!legitimize_mnt(nd->path.mnt, nd->m_seq)) - return -ECHILD; nd->flags &= ~LOOKUP_RCU; - - if (!lockref_get_not_dead(&parent->d_lockref)) { - nd->path.dentry = NULL; - goto out; - } + if (unlikely(!legitimize_links(nd))) + goto out2; + if (unlikely(!legitimize_mnt(nd->path.mnt, nd->m_seq))) + goto out2; + if (unlikely(!lockref_get_not_dead(&parent->d_lockref))) + goto out1; /* * For a negative lookup, the lookup sequence point is the parents @@ -568,7 +691,7 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) } else { if (!lockref_get_not_dead(&dentry->d_lockref)) goto out; - if (read_seqcount_retry(&dentry->d_seq, nd->seq)) + if (read_seqcount_retry(&dentry->d_seq, seq)) goto drop_dentry; } @@ -577,22 +700,24 @@ static int unlazy_walk(struct nameidata *nd, struct dentry *dentry) * still valid and get it if required. */ if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { - spin_lock(&fs->lock); - if (nd->root.mnt != fs->root.mnt || nd->root.dentry != fs->root.dentry) - goto unlock_and_drop_dentry; - path_get(&nd->root); - spin_unlock(&fs->lock); + if (unlikely(!legitimize_path(nd, &nd->root, nd->root_seq))) { + rcu_read_unlock(); + dput(dentry); + return -ECHILD; + } } rcu_read_unlock(); return 0; -unlock_and_drop_dentry: - spin_unlock(&fs->lock); drop_dentry: rcu_read_unlock(); dput(dentry); goto drop_root_mnt; +out2: + nd->path.mnt = NULL; +out1: + nd->path.dentry = NULL; out: rcu_read_unlock(); drop_root_mnt: @@ -601,6 +726,24 @@ drop_root_mnt: return -ECHILD; } +static int unlazy_link(struct nameidata *nd, struct path *link, unsigned seq) +{ + if (unlikely(!legitimize_path(nd, link, seq))) { + drop_links(nd); + nd->depth = 0; + nd->flags &= ~LOOKUP_RCU; + nd->path.mnt = NULL; + nd->path.dentry = NULL; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; + rcu_read_unlock(); + } else if (likely(unlazy_walk(nd, NULL, 0)) == 0) { + return 0; + } + path_put(link); + return -ECHILD; +} + static inline int d_revalidate(struct dentry *dentry, unsigned int flags) { return dentry->d_op->d_revalidate(dentry, flags); @@ -622,26 +765,10 @@ static int complete_walk(struct nameidata *nd) int status; if (nd->flags & LOOKUP_RCU) { - nd->flags &= ~LOOKUP_RCU; if (!(nd->flags & LOOKUP_ROOT)) nd->root.mnt = NULL; - - if (!legitimize_mnt(nd->path.mnt, nd->m_seq)) { - rcu_read_unlock(); - return -ECHILD; - } - if (unlikely(!lockref_get_not_dead(&dentry->d_lockref))) { - rcu_read_unlock(); - mntput(nd->path.mnt); + if (unlikely(unlazy_walk(nd, NULL, 0))) return -ECHILD; - } - if (read_seqcount_retry(&dentry->d_seq, nd->seq)) { - rcu_read_unlock(); - dput(dentry); - mntput(nd->path.mnt); - return -ECHILD; - } - rcu_read_unlock(); } if (likely(!(nd->flags & LOOKUP_JUMPED))) @@ -657,28 +784,24 @@ static int complete_walk(struct nameidata *nd) if (!status) status = -ESTALE; - path_put(&nd->path); return status; } -static __always_inline void set_root(struct nameidata *nd) +static void set_root(struct nameidata *nd) { get_fs_root(current->fs, &nd->root); } -static int link_path_walk(const char *, struct nameidata *); - -static __always_inline unsigned set_root_rcu(struct nameidata *nd) +static void set_root_rcu(struct nameidata *nd) { struct fs_struct *fs = current->fs; - unsigned seq, res; + unsigned seq; do { seq = read_seqcount_begin(&fs->seq); nd->root = fs->root; - res = __read_seqcount_begin(&nd->root.dentry->d_seq); + nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq); } while (read_seqcount_retry(&fs->seq, seq)); - return res; } static void path_put_conditional(struct path *path, struct nameidata *nd) @@ -704,8 +827,9 @@ static inline void path_to_nameidata(const struct path *path, * Helper to directly jump to a known parsed path from ->follow_link, * caller must have taken a reference to path beforehand. */ -void nd_jump_link(struct nameidata *nd, struct path *path) +void nd_jump_link(struct path *path) { + struct nameidata *nd = current->nameidata; path_put(&nd->path); nd->path = *path; @@ -713,24 +837,14 @@ void nd_jump_link(struct nameidata *nd, struct path *path) nd->flags |= LOOKUP_JUMPED; } -void nd_set_link(struct nameidata *nd, char *path) -{ - nd->saved_names[nd->depth] = path; -} -EXPORT_SYMBOL(nd_set_link); - -char *nd_get_link(struct nameidata *nd) -{ - return nd->saved_names[nd->depth]; -} -EXPORT_SYMBOL(nd_get_link); - -static inline void put_link(struct nameidata *nd, struct path *link, void *cookie) +static inline void put_link(struct nameidata *nd) { - struct inode *inode = link->dentry->d_inode; - if (inode->i_op->put_link) - inode->i_op->put_link(link->dentry, nd, cookie); - path_put(link); + struct saved *last = nd->stack + --nd->depth; + struct inode *inode = last->inode; + if (last->cookie && inode->i_op->put_link) + inode->i_op->put_link(inode, last->cookie); + if (!(nd->flags & LOOKUP_RCU)) + path_put(&last->link); } int sysctl_protected_symlinks __read_mostly = 0; @@ -738,7 +852,6 @@ int sysctl_protected_hardlinks __read_mostly = 0; /** * may_follow_link - Check symlink following for unsafe situations - * @link: The path of the symlink * @nd: nameidata pathwalk data * * In the case of the sysctl_protected_symlinks sysctl being enabled, @@ -752,7 +865,7 @@ int sysctl_protected_hardlinks __read_mostly = 0; * * Returns 0 if following the symlink is allowed, -ve on error. */ -static inline int may_follow_link(struct path *link, struct nameidata *nd) +static inline int may_follow_link(struct nameidata *nd) { const struct inode *inode; const struct inode *parent; @@ -761,12 +874,12 @@ static inline int may_follow_link(struct path *link, struct nameidata *nd) return 0; /* Allowed if owner and follower match. */ - inode = link->dentry->d_inode; + inode = nd->stack[0].inode; if (uid_eq(current_cred()->fsuid, inode->i_uid)) return 0; /* Allowed if parent directory not sticky and world-writable. */ - parent = nd->path.dentry->d_inode; + parent = nd->inode; if ((parent->i_mode & (S_ISVTX|S_IWOTH)) != (S_ISVTX|S_IWOTH)) return 0; @@ -774,9 +887,10 @@ static inline int may_follow_link(struct path *link, struct nameidata *nd) if (uid_eq(parent->i_uid, inode->i_uid)) return 0; - audit_log_link_denied("follow_link", link); - path_put_conditional(link, nd); - path_put(&nd->path); + if (nd->flags & LOOKUP_RCU) + return -ECHILD; + + audit_log_link_denied("follow_link", &nd->stack[0].link); return -EACCES; } @@ -849,82 +963,68 @@ static int may_linkat(struct path *link) return -EPERM; } -static __always_inline int -follow_link(struct path *link, struct nameidata *nd, void **p) +static __always_inline +const char *get_link(struct nameidata *nd) { - struct dentry *dentry = link->dentry; + struct saved *last = nd->stack + nd->depth - 1; + struct dentry *dentry = last->link.dentry; + struct inode *inode = last->inode; int error; - char *s; + const char *res; - BUG_ON(nd->flags & LOOKUP_RCU); - - if (link->mnt == nd->path.mnt) - mntget(link->mnt); - - error = -ELOOP; - if (unlikely(current->total_link_count >= 40)) - goto out_put_nd_path; - - cond_resched(); - current->total_link_count++; - - touch_atime(link); - nd_set_link(nd, NULL); + if (!(nd->flags & LOOKUP_RCU)) { + touch_atime(&last->link); + cond_resched(); + } else if (atime_needs_update(&last->link, inode)) { + if (unlikely(unlazy_walk(nd, NULL, 0))) + return ERR_PTR(-ECHILD); + touch_atime(&last->link); + } - error = security_inode_follow_link(link->dentry, nd); - if (error) - goto out_put_nd_path; + error = security_inode_follow_link(dentry, inode, + nd->flags & LOOKUP_RCU); + if (unlikely(error)) + return ERR_PTR(error); nd->last_type = LAST_BIND; - *p = dentry->d_inode->i_op->follow_link(dentry, nd); - error = PTR_ERR(*p); - if (IS_ERR(*p)) - goto out_put_nd_path; - - error = 0; - s = nd_get_link(nd); - if (s) { - if (unlikely(IS_ERR(s))) { - path_put(&nd->path); - put_link(nd, link, *p); - return PTR_ERR(s); + res = inode->i_link; + if (!res) { + if (nd->flags & LOOKUP_RCU) { + if (unlikely(unlazy_walk(nd, NULL, 0))) + return ERR_PTR(-ECHILD); } - if (*s == '/') { + res = inode->i_op->follow_link(dentry, &last->cookie); + if (IS_ERR_OR_NULL(res)) { + last->cookie = NULL; + return res; + } + } + if (*res == '/') { + if (nd->flags & LOOKUP_RCU) { + struct dentry *d; + if (!nd->root.mnt) + set_root_rcu(nd); + nd->path = nd->root; + d = nd->path.dentry; + nd->inode = d->d_inode; + nd->seq = nd->root_seq; + if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq))) + return ERR_PTR(-ECHILD); + } else { if (!nd->root.mnt) set_root(nd); path_put(&nd->path); nd->path = nd->root; path_get(&nd->root); - nd->flags |= LOOKUP_JUMPED; + nd->inode = nd->path.dentry->d_inode; } - nd->inode = nd->path.dentry->d_inode; - error = link_path_walk(s, nd); - if (unlikely(error)) - put_link(nd, link, *p); + nd->flags |= LOOKUP_JUMPED; + while (unlikely(*++res == '/')) + ; } - - return error; - -out_put_nd_path: - *p = NULL; - path_put(&nd->path); - path_put(link); - return error; -} - -static int follow_up_rcu(struct path *path) -{ - struct mount *mnt = real_mount(path->mnt); - struct mount *parent; - struct dentry *mountpoint; - - parent = mnt->mnt_parent; - if (&parent->mnt == path->mnt) - return 0; - mountpoint = mnt->mnt_mountpoint; - path->dentry = mountpoint; - path->mnt = &parent->mnt; - return 1; + if (!*res) + res = NULL; + return res; } /* @@ -965,7 +1065,7 @@ EXPORT_SYMBOL(follow_up); * - return -EISDIR to tell follow_managed() to stop and return the path we * were called with. */ -static int follow_automount(struct path *path, unsigned flags, +static int follow_automount(struct path *path, struct nameidata *nd, bool *need_mntput) { struct vfsmount *mnt; @@ -985,13 +1085,13 @@ static int follow_automount(struct path *path, unsigned flags, * as being automount points. These will need the attentions * of the daemon to instantiate them before they can be used. */ - if (!(flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | - LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && + if (!(nd->flags & (LOOKUP_PARENT | LOOKUP_DIRECTORY | + LOOKUP_OPEN | LOOKUP_CREATE | LOOKUP_AUTOMOUNT)) && path->dentry->d_inode) return -EISDIR; - current->total_link_count++; - if (current->total_link_count >= 40) + nd->total_link_count++; + if (nd->total_link_count >= 40) return -ELOOP; mnt = path->dentry->d_op->d_automount(path); @@ -1005,7 +1105,7 @@ static int follow_automount(struct path *path, unsigned flags, * the path being looked up; if it wasn't then the remainder of * the path is inaccessible and we should say so. */ - if (PTR_ERR(mnt) == -EISDIR && (flags & LOOKUP_PARENT)) + if (PTR_ERR(mnt) == -EISDIR && (nd->flags & LOOKUP_PARENT)) return -EREMOTE; return PTR_ERR(mnt); } @@ -1045,7 +1145,7 @@ static int follow_automount(struct path *path, unsigned flags, * * Serialization is taken care of in namespace.c */ -static int follow_managed(struct path *path, unsigned flags) +static int follow_managed(struct path *path, struct nameidata *nd) { struct vfsmount *mnt = path->mnt; /* held by caller, must be left alone */ unsigned managed; @@ -1089,7 +1189,7 @@ static int follow_managed(struct path *path, unsigned flags) /* Handle an automount point */ if (managed & DCACHE_NEED_AUTOMOUNT) { - ret = follow_automount(path, flags, &need_mntput); + ret = follow_automount(path, nd, &need_mntput); if (ret < 0) break; continue; @@ -1103,7 +1203,11 @@ static int follow_managed(struct path *path, unsigned flags) mntput(path->mnt); if (ret == -EISDIR) ret = 0; - return ret < 0 ? ret : need_mntput; + if (need_mntput) + nd->flags |= LOOKUP_JUMPED; + if (unlikely(ret < 0)) + path_put_conditional(path, nd); + return ret; } int follow_down_one(struct path *path) @@ -1133,7 +1237,7 @@ static inline int managed_dentry_rcu(struct dentry *dentry) * we meet a managed dentry that would need blocking. */ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, - struct inode **inode) + struct inode **inode, unsigned *seqp) { for (;;) { struct mount *mounted; @@ -1160,7 +1264,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, path->mnt = &mounted->mnt; path->dentry = mounted->mnt.mnt_root; nd->flags |= LOOKUP_JUMPED; - nd->seq = read_seqcount_begin(&path->dentry->d_seq); + *seqp = read_seqcount_begin(&path->dentry->d_seq); /* * Update the inode too. We don't need to re-check the * dentry sequence number here after this d_inode read, @@ -1179,10 +1283,8 @@ static int follow_dotdot_rcu(struct nameidata *nd) set_root_rcu(nd); while (1) { - if (nd->path.dentry == nd->root.dentry && - nd->path.mnt == nd->root.mnt) { + if (path_equal(&nd->path, &nd->root)) break; - } if (nd->path.dentry != nd->path.mnt->mnt_root) { struct dentry *old = nd->path.dentry; struct dentry *parent = old->d_parent; @@ -1190,38 +1292,42 @@ static int follow_dotdot_rcu(struct nameidata *nd) inode = parent->d_inode; seq = read_seqcount_begin(&parent->d_seq); - if (read_seqcount_retry(&old->d_seq, nd->seq)) - goto failed; + if (unlikely(read_seqcount_retry(&old->d_seq, nd->seq))) + return -ECHILD; nd->path.dentry = parent; nd->seq = seq; break; + } else { + struct mount *mnt = real_mount(nd->path.mnt); + struct mount *mparent = mnt->mnt_parent; + struct dentry *mountpoint = mnt->mnt_mountpoint; + struct inode *inode2 = mountpoint->d_inode; + unsigned seq = read_seqcount_begin(&mountpoint->d_seq); + if (unlikely(read_seqretry(&mount_lock, nd->m_seq))) + return -ECHILD; + if (&mparent->mnt == nd->path.mnt) + break; + /* we know that mountpoint was pinned */ + nd->path.dentry = mountpoint; + nd->path.mnt = &mparent->mnt; + inode = inode2; + nd->seq = seq; } - if (!follow_up_rcu(&nd->path)) - break; - inode = nd->path.dentry->d_inode; - nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); } - while (d_mountpoint(nd->path.dentry)) { + while (unlikely(d_mountpoint(nd->path.dentry))) { struct mount *mounted; mounted = __lookup_mnt(nd->path.mnt, nd->path.dentry); + if (unlikely(read_seqretry(&mount_lock, nd->m_seq))) + return -ECHILD; if (!mounted) break; nd->path.mnt = &mounted->mnt; nd->path.dentry = mounted->mnt.mnt_root; inode = nd->path.dentry->d_inode; nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); - if (read_seqretry(&mount_lock, nd->m_seq)) - goto failed; } nd->inode = inode; return 0; - -failed: - nd->flags &= ~LOOKUP_RCU; - if (!(nd->flags & LOOKUP_ROOT)) - nd->root.mnt = NULL; - rcu_read_unlock(); - return -ECHILD; } /* @@ -1400,7 +1506,8 @@ static struct dentry *__lookup_hash(struct qstr *name, * It _is_ time-critical. */ static int lookup_fast(struct nameidata *nd, - struct path *path, struct inode **inode) + struct path *path, struct inode **inode, + unsigned *seqp) { struct vfsmount *mnt = nd->path.mnt; struct dentry *dentry, *parent = nd->path.dentry; @@ -1424,7 +1531,7 @@ static int lookup_fast(struct nameidata *nd, * This sequence count validates that the inode matches * the dentry name information from lookup. */ - *inode = dentry->d_inode; + *inode = d_backing_inode(dentry); negative = d_is_negative(dentry); if (read_seqcount_retry(&dentry->d_seq, seq)) return -ECHILD; @@ -1440,8 +1547,8 @@ static int lookup_fast(struct nameidata *nd, */ if (__read_seqcount_retry(&parent->d_seq, nd->seq)) return -ECHILD; - nd->seq = seq; + *seqp = seq; if (unlikely(dentry->d_flags & DCACHE_OP_REVALIDATE)) { status = d_revalidate(dentry, nd->flags); if (unlikely(status <= 0)) { @@ -1452,10 +1559,10 @@ static int lookup_fast(struct nameidata *nd, } path->mnt = mnt; path->dentry = dentry; - if (likely(__follow_mount_rcu(nd, path, inode))) + if (likely(__follow_mount_rcu(nd, path, inode, seqp))) return 0; unlazy: - if (unlazy_walk(nd, dentry)) + if (unlazy_walk(nd, dentry, seq)) return -ECHILD; } else { dentry = __d_lookup(parent, &nd->last); @@ -1482,15 +1589,10 @@ unlazy: } path->mnt = mnt; path->dentry = dentry; - err = follow_managed(path, nd->flags); - if (unlikely(err < 0)) { - path_put_conditional(path, nd); - return err; - } - if (err) - nd->flags |= LOOKUP_JUMPED; - *inode = path->dentry->d_inode; - return 0; + err = follow_managed(path, nd); + if (likely(!err)) + *inode = d_backing_inode(path->dentry); + return err; need_lookup: return 1; @@ -1500,7 +1602,6 @@ need_lookup: static int lookup_slow(struct nameidata *nd, struct path *path) { struct dentry *dentry, *parent; - int err; parent = nd->path.dentry; BUG_ON(nd->inode != parent->d_inode); @@ -1512,14 +1613,7 @@ static int lookup_slow(struct nameidata *nd, struct path *path) return PTR_ERR(dentry); path->mnt = nd->path.mnt; path->dentry = dentry; - err = follow_managed(path, nd->flags); - if (unlikely(err < 0)) { - path_put_conditional(path, nd); - return err; - } - if (err) - nd->flags |= LOOKUP_JUMPED; - return 0; + return follow_managed(path, nd); } static inline int may_lookup(struct nameidata *nd) @@ -1528,7 +1622,7 @@ static inline int may_lookup(struct nameidata *nd) int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK); if (err != -ECHILD) return err; - if (unlazy_walk(nd, NULL)) + if (unlazy_walk(nd, NULL, 0)) return -ECHILD; } return inode_permission(nd->inode, MAY_EXEC); @@ -1538,24 +1632,45 @@ static inline int handle_dots(struct nameidata *nd, int type) { if (type == LAST_DOTDOT) { if (nd->flags & LOOKUP_RCU) { - if (follow_dotdot_rcu(nd)) - return -ECHILD; + return follow_dotdot_rcu(nd); } else follow_dotdot(nd); } return 0; } -static void terminate_walk(struct nameidata *nd) +static int pick_link(struct nameidata *nd, struct path *link, + struct inode *inode, unsigned seq) { + int error; + struct saved *last; + if (unlikely(nd->total_link_count++ >= MAXSYMLINKS)) { + path_to_nameidata(link, nd); + return -ELOOP; + } if (!(nd->flags & LOOKUP_RCU)) { - path_put(&nd->path); - } else { - nd->flags &= ~LOOKUP_RCU; - if (!(nd->flags & LOOKUP_ROOT)) - nd->root.mnt = NULL; - rcu_read_unlock(); + if (link->mnt == nd->path.mnt) + mntget(link->mnt); + } + error = nd_alloc_stack(nd); + if (unlikely(error)) { + if (error == -ECHILD) { + if (unlikely(unlazy_link(nd, link, seq))) + return -ECHILD; + error = nd_alloc_stack(nd); + } + if (error) { + path_put(link); + return error; + } } + + last = nd->stack + nd->depth++; + last->link = *link; + last->cookie = NULL; + last->inode = inode; + last->seq = seq; + return 1; } /* @@ -1564,98 +1679,68 @@ static void terminate_walk(struct nameidata *nd) * so we keep a cache of "no, this doesn't need follow_link" * for the common case. */ -static inline int should_follow_link(struct dentry *dentry, int follow) +static inline int should_follow_link(struct nameidata *nd, struct path *link, + int follow, + struct inode *inode, unsigned seq) { - return unlikely(d_is_symlink(dentry)) ? follow : 0; + if (likely(!d_is_symlink(link->dentry))) + return 0; + if (!follow) + return 0; + return pick_link(nd, link, inode, seq); } -static inline int walk_component(struct nameidata *nd, struct path *path, - int follow) +enum {WALK_GET = 1, WALK_PUT = 2}; + +static int walk_component(struct nameidata *nd, int flags) { + struct path path; struct inode *inode; + unsigned seq; int err; /* * "." and ".." are special - ".." especially so because it has * to be able to know about the current root directory and * parent relationships. */ - if (unlikely(nd->last_type != LAST_NORM)) - return handle_dots(nd, nd->last_type); - err = lookup_fast(nd, path, &inode); + if (unlikely(nd->last_type != LAST_NORM)) { + err = handle_dots(nd, nd->last_type); + if (flags & WALK_PUT) + put_link(nd); + return err; + } + err = lookup_fast(nd, &path, &inode, &seq); if (unlikely(err)) { if (err < 0) - goto out_err; + return err; - err = lookup_slow(nd, path); + err = lookup_slow(nd, &path); if (err < 0) - goto out_err; + return err; - inode = path->dentry->d_inode; + inode = d_backing_inode(path.dentry); + seq = 0; /* we are already out of RCU mode */ err = -ENOENT; - if (d_is_negative(path->dentry)) + if (d_is_negative(path.dentry)) goto out_path_put; } - if (should_follow_link(path->dentry, follow)) { - if (nd->flags & LOOKUP_RCU) { - if (unlikely(nd->path.mnt != path->mnt || - unlazy_walk(nd, path->dentry))) { - err = -ECHILD; - goto out_err; - } - } - BUG_ON(inode != path->dentry->d_inode); - return 1; - } - path_to_nameidata(path, nd); + if (flags & WALK_PUT) + put_link(nd); + err = should_follow_link(nd, &path, flags & WALK_GET, inode, seq); + if (unlikely(err)) + return err; + path_to_nameidata(&path, nd); nd->inode = inode; + nd->seq = seq; return 0; out_path_put: - path_to_nameidata(path, nd); -out_err: - terminate_walk(nd); + path_to_nameidata(&path, nd); return err; } /* - * This limits recursive symlink follows to 8, while - * limiting consecutive symlinks to 40. - * - * Without that kind of total limit, nasty chains of consecutive - * symlinks can cause almost arbitrarily long lookups. - */ -static inline int nested_symlink(struct path *path, struct nameidata *nd) -{ - int res; - - if (unlikely(current->link_count >= MAX_NESTED_LINKS)) { - path_put_conditional(path, nd); - path_put(&nd->path); - return -ELOOP; - } - BUG_ON(nd->depth >= MAX_NESTED_LINKS); - - nd->depth++; - current->link_count++; - - do { - struct path link = *path; - void *cookie; - - res = follow_link(&link, nd, &cookie); - if (res) - break; - res = walk_component(nd, path, LOOKUP_FOLLOW); - put_link(nd, &link, cookie); - } while (res > 0); - - current->link_count--; - nd->depth--; - return res; -} - -/* * We can do the critical dentry name comparison and hashing * operations one word at a time, but we are limited to: * @@ -1781,9 +1866,8 @@ static inline u64 hash_name(const char *name) */ static int link_path_walk(const char *name, struct nameidata *nd) { - struct path next; int err; - + while (*name=='/') name++; if (!*name) @@ -1796,7 +1880,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) err = may_lookup(nd); if (err) - break; + return err; hash_len = hash_name(name); @@ -1818,7 +1902,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) struct qstr this = { { .hash_len = hash_len }, .name = name }; err = parent->d_op->d_hash(parent, &this); if (err < 0) - break; + return err; hash_len = this.hash_len; name = this.name; } @@ -1830,7 +1914,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) name += hashlen_len(hash_len); if (!*name) - return 0; + goto OK; /* * If it wasn't NUL, we know it was '/'. Skip that * slash, and continue until no more slashes. @@ -1838,57 +1922,78 @@ static int link_path_walk(const char *name, struct nameidata *nd) do { name++; } while (unlikely(*name == '/')); - if (!*name) - return 0; - - err = walk_component(nd, &next, LOOKUP_FOLLOW); + if (unlikely(!*name)) { +OK: + /* pathname body, done */ + if (!nd->depth) + return 0; + name = nd->stack[nd->depth - 1].name; + /* trailing symlink, done */ + if (!name) + return 0; + /* last component of nested symlink */ + err = walk_component(nd, WALK_GET | WALK_PUT); + } else { + err = walk_component(nd, WALK_GET); + } if (err < 0) return err; if (err) { - err = nested_symlink(&next, nd); - if (err) - return err; + const char *s = get_link(nd); + + if (unlikely(IS_ERR(s))) + return PTR_ERR(s); + err = 0; + if (unlikely(!s)) { + /* jumped */ + put_link(nd); + } else { + nd->stack[nd->depth - 1].name = name; + name = s; + continue; + } } - if (!d_can_lookup(nd->path.dentry)) { - err = -ENOTDIR; - break; + if (unlikely(!d_can_lookup(nd->path.dentry))) { + if (nd->flags & LOOKUP_RCU) { + if (unlazy_walk(nd, NULL, 0)) + return -ECHILD; + } + return -ENOTDIR; } } - terminate_walk(nd); - return err; } -static int path_init(int dfd, const struct filename *name, unsigned int flags, - struct nameidata *nd) +static const char *path_init(struct nameidata *nd, unsigned flags) { int retval = 0; - const char *s = name->name; + const char *s = nd->name->name; nd->last_type = LAST_ROOT; /* if there are only slashes... */ nd->flags = flags | LOOKUP_JUMPED | LOOKUP_PARENT; nd->depth = 0; - nd->base = NULL; + nd->total_link_count = 0; if (flags & LOOKUP_ROOT) { struct dentry *root = nd->root.dentry; struct inode *inode = root->d_inode; if (*s) { if (!d_can_lookup(root)) - return -ENOTDIR; + return ERR_PTR(-ENOTDIR); retval = inode_permission(inode, MAY_EXEC); if (retval) - return retval; + return ERR_PTR(retval); } nd->path = nd->root; nd->inode = inode; if (flags & LOOKUP_RCU) { rcu_read_lock(); nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); + nd->root_seq = nd->seq; nd->m_seq = read_seqbegin(&mount_lock); } else { path_get(&nd->path); } - goto done; + return s; } nd->root.mnt = NULL; @@ -1897,13 +2002,14 @@ static int path_init(int dfd, const struct filename *name, unsigned int flags, if (*s == '/') { if (flags & LOOKUP_RCU) { rcu_read_lock(); - nd->seq = set_root_rcu(nd); + set_root_rcu(nd); + nd->seq = nd->root_seq; } else { set_root(nd); path_get(&nd->root); } nd->path = nd->root; - } else if (dfd == AT_FDCWD) { + } else if (nd->dfd == AT_FDCWD) { if (flags & LOOKUP_RCU) { struct fs_struct *fs = current->fs; unsigned seq; @@ -1920,180 +2026,205 @@ static int path_init(int dfd, const struct filename *name, unsigned int flags, } } else { /* Caller must check execute permissions on the starting path component */ - struct fd f = fdget_raw(dfd); + struct fd f = fdget_raw(nd->dfd); struct dentry *dentry; if (!f.file) - return -EBADF; + return ERR_PTR(-EBADF); dentry = f.file->f_path.dentry; if (*s) { if (!d_can_lookup(dentry)) { fdput(f); - return -ENOTDIR; + return ERR_PTR(-ENOTDIR); } } nd->path = f.file->f_path; if (flags & LOOKUP_RCU) { - if (f.flags & FDPUT_FPUT) - nd->base = f.file; - nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); rcu_read_lock(); + nd->inode = nd->path.dentry->d_inode; + nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); } else { path_get(&nd->path); - fdput(f); + nd->inode = nd->path.dentry->d_inode; } + fdput(f); + return s; } nd->inode = nd->path.dentry->d_inode; if (!(flags & LOOKUP_RCU)) - goto done; + return s; if (likely(!read_seqcount_retry(&nd->path.dentry->d_seq, nd->seq))) - goto done; + return s; if (!(nd->flags & LOOKUP_ROOT)) nd->root.mnt = NULL; rcu_read_unlock(); - return -ECHILD; -done: - current->total_link_count = 0; - return link_path_walk(s, nd); + return ERR_PTR(-ECHILD); } -static void path_cleanup(struct nameidata *nd) +static const char *trailing_symlink(struct nameidata *nd) { - if (nd->root.mnt && !(nd->flags & LOOKUP_ROOT)) { - path_put(&nd->root); - nd->root.mnt = NULL; - } - if (unlikely(nd->base)) - fput(nd->base); + const char *s; + int error = may_follow_link(nd); + if (unlikely(error)) + return ERR_PTR(error); + nd->flags |= LOOKUP_PARENT; + nd->stack[0].name = NULL; + s = get_link(nd); + return s ? s : ""; } -static inline int lookup_last(struct nameidata *nd, struct path *path) +static inline int lookup_last(struct nameidata *nd) { if (nd->last_type == LAST_NORM && nd->last.name[nd->last.len]) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; nd->flags &= ~LOOKUP_PARENT; - return walk_component(nd, path, nd->flags & LOOKUP_FOLLOW); + return walk_component(nd, + nd->flags & LOOKUP_FOLLOW + ? nd->depth + ? WALK_PUT | WALK_GET + : WALK_GET + : 0); } /* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ -static int path_lookupat(int dfd, const struct filename *name, - unsigned int flags, struct nameidata *nd) +static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path) { - struct path path; + const char *s = path_init(nd, flags); int err; - /* - * Path walking is largely split up into 2 different synchronisation - * schemes, rcu-walk and ref-walk (explained in - * Documentation/filesystems/path-lookup.txt). These share much of the - * path walk code, but some things particularly setup, cleanup, and - * following mounts are sufficiently divergent that functions are - * duplicated. Typically there is a function foo(), and its RCU - * analogue, foo_rcu(). - * - * -ECHILD is the error number of choice (just to avoid clashes) that - * is returned if some aspect of an rcu-walk fails. Such an error must - * be handled by restarting a traditional ref-walk (which will always - * be able to complete). - */ - err = path_init(dfd, name, flags, nd); - if (!err && !(flags & LOOKUP_PARENT)) { - err = lookup_last(nd, &path); - while (err > 0) { - void *cookie; - struct path link = path; - err = may_follow_link(&link, nd); - if (unlikely(err)) - break; - nd->flags |= LOOKUP_PARENT; - err = follow_link(&link, nd, &cookie); - if (err) - break; - err = lookup_last(nd, &path); - put_link(nd, &link, cookie); + if (IS_ERR(s)) + return PTR_ERR(s); + while (!(err = link_path_walk(s, nd)) + && ((err = lookup_last(nd)) > 0)) { + s = trailing_symlink(nd); + if (IS_ERR(s)) { + err = PTR_ERR(s); + break; } } - if (!err) err = complete_walk(nd); - if (!err && nd->flags & LOOKUP_DIRECTORY) { - if (!d_can_lookup(nd->path.dentry)) { - path_put(&nd->path); + if (!err && nd->flags & LOOKUP_DIRECTORY) + if (!d_can_lookup(nd->path.dentry)) err = -ENOTDIR; - } + if (!err) { + *path = nd->path; + nd->path.mnt = NULL; + nd->path.dentry = NULL; } - - path_cleanup(nd); + terminate_walk(nd); return err; } -static int filename_lookup(int dfd, struct filename *name, - unsigned int flags, struct nameidata *nd) +static int filename_lookup(int dfd, struct filename *name, unsigned flags, + struct path *path, struct path *root) { - int retval = path_lookupat(dfd, name, flags | LOOKUP_RCU, nd); + int retval; + struct nameidata nd; + if (IS_ERR(name)) + return PTR_ERR(name); + if (unlikely(root)) { + nd.root = *root; + flags |= LOOKUP_ROOT; + } + set_nameidata(&nd, dfd, name); + retval = path_lookupat(&nd, flags | LOOKUP_RCU, path); if (unlikely(retval == -ECHILD)) - retval = path_lookupat(dfd, name, flags, nd); + retval = path_lookupat(&nd, flags, path); if (unlikely(retval == -ESTALE)) - retval = path_lookupat(dfd, name, flags | LOOKUP_REVAL, nd); + retval = path_lookupat(&nd, flags | LOOKUP_REVAL, path); if (likely(!retval)) - audit_inode(name, nd->path.dentry, flags & LOOKUP_PARENT); + audit_inode(name, path->dentry, flags & LOOKUP_PARENT); + restore_nameidata(); + putname(name); return retval; } +/* Returns 0 and nd will be valid on success; Retuns error, otherwise. */ +static int path_parentat(struct nameidata *nd, unsigned flags, + struct path *parent) +{ + const char *s = path_init(nd, flags); + int err; + if (IS_ERR(s)) + return PTR_ERR(s); + err = link_path_walk(s, nd); + if (!err) + err = complete_walk(nd); + if (!err) { + *parent = nd->path; + nd->path.mnt = NULL; + nd->path.dentry = NULL; + } + terminate_walk(nd); + return err; +} + +static struct filename *filename_parentat(int dfd, struct filename *name, + unsigned int flags, struct path *parent, + struct qstr *last, int *type) +{ + int retval; + struct nameidata nd; + + if (IS_ERR(name)) + return name; + set_nameidata(&nd, dfd, name); + retval = path_parentat(&nd, flags | LOOKUP_RCU, parent); + if (unlikely(retval == -ECHILD)) + retval = path_parentat(&nd, flags, parent); + if (unlikely(retval == -ESTALE)) + retval = path_parentat(&nd, flags | LOOKUP_REVAL, parent); + if (likely(!retval)) { + *last = nd.last; + *type = nd.last_type; + audit_inode(name, parent->dentry, LOOKUP_PARENT); + } else { + putname(name); + name = ERR_PTR(retval); + } + restore_nameidata(); + return name; +} + /* does lookup, returns the object with parent locked */ struct dentry *kern_path_locked(const char *name, struct path *path) { - struct filename *filename = getname_kernel(name); - struct nameidata nd; + struct filename *filename; struct dentry *d; - int err; + struct qstr last; + int type; + filename = filename_parentat(AT_FDCWD, getname_kernel(name), 0, path, + &last, &type); if (IS_ERR(filename)) return ERR_CAST(filename); - - err = filename_lookup(AT_FDCWD, filename, LOOKUP_PARENT, &nd); - if (err) { - d = ERR_PTR(err); - goto out; - } - if (nd.last_type != LAST_NORM) { - path_put(&nd.path); - d = ERR_PTR(-EINVAL); - goto out; + if (unlikely(type != LAST_NORM)) { + path_put(path); + putname(filename); + return ERR_PTR(-EINVAL); } - mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); - d = __lookup_hash(&nd.last, nd.path.dentry, 0); + mutex_lock_nested(&path->dentry->d_inode->i_mutex, I_MUTEX_PARENT); + d = __lookup_hash(&last, path->dentry, 0); if (IS_ERR(d)) { - mutex_unlock(&nd.path.dentry->d_inode->i_mutex); - path_put(&nd.path); - goto out; + mutex_unlock(&path->dentry->d_inode->i_mutex); + path_put(path); } - *path = nd.path; -out: putname(filename); return d; } int kern_path(const char *name, unsigned int flags, struct path *path) { - struct nameidata nd; - struct filename *filename = getname_kernel(name); - int res = PTR_ERR(filename); - - if (!IS_ERR(filename)) { - res = filename_lookup(AT_FDCWD, filename, flags, &nd); - putname(filename); - if (!res) - *path = nd.path; - } - return res; + return filename_lookup(AT_FDCWD, getname_kernel(name), + flags, path, NULL); } EXPORT_SYMBOL(kern_path); @@ -2109,36 +2240,13 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, const char *name, unsigned int flags, struct path *path) { - struct filename *filename = getname_kernel(name); - int err = PTR_ERR(filename); - - BUG_ON(flags & LOOKUP_PARENT); - - /* the first argument of filename_lookup() is ignored with LOOKUP_ROOT */ - if (!IS_ERR(filename)) { - struct nameidata nd; - nd.root.dentry = dentry; - nd.root.mnt = mnt; - err = filename_lookup(AT_FDCWD, filename, - flags | LOOKUP_ROOT, &nd); - if (!err) - *path = nd.path; - putname(filename); - } - return err; + struct path root = {.mnt = mnt, .dentry = dentry}; + /* the first argument of filename_lookup() is ignored with root */ + return filename_lookup(AT_FDCWD, getname_kernel(name), + flags , path, &root); } EXPORT_SYMBOL(vfs_path_lookup); -/* - * Restricted form of lookup. Doesn't follow links, single-component only, - * needs parent already locked. Doesn't follow mounts. - * SMP-safe. - */ -static struct dentry *lookup_hash(struct nameidata *nd) -{ - return __lookup_hash(&nd->last, nd->path.dentry, nd->flags); -} - /** * lookup_one_len - filesystem helper to lookup single pathname component * @name: pathname component to lookup @@ -2193,27 +2301,10 @@ EXPORT_SYMBOL(lookup_one_len); int user_path_at_empty(int dfd, const char __user *name, unsigned flags, struct path *path, int *empty) { - struct nameidata nd; - struct filename *tmp = getname_flags(name, flags, empty); - int err = PTR_ERR(tmp); - if (!IS_ERR(tmp)) { - - BUG_ON(flags & LOOKUP_PARENT); - - err = filename_lookup(dfd, tmp, flags, &nd); - putname(tmp); - if (!err) - *path = nd.path; - } - return err; -} - -int user_path_at(int dfd, const char __user *name, unsigned flags, - struct path *path) -{ - return user_path_at_empty(dfd, name, flags, path, NULL); + return filename_lookup(dfd, getname_flags(name, flags, empty), + flags, path, NULL); } -EXPORT_SYMBOL(user_path_at); +EXPORT_SYMBOL(user_path_at_empty); /* * NB: most callers don't do anything directly with the reference to the @@ -2221,26 +2312,16 @@ EXPORT_SYMBOL(user_path_at); * allocated by getname. So we must hold the reference to it until all * path-walking is complete. */ -static struct filename * -user_path_parent(int dfd, const char __user *path, struct nameidata *nd, +static inline struct filename * +user_path_parent(int dfd, const char __user *path, + struct path *parent, + struct qstr *last, + int *type, unsigned int flags) { - struct filename *s = getname(path); - int error; - /* only LOOKUP_REVAL is allowed in extra flags */ - flags &= LOOKUP_REVAL; - - if (IS_ERR(s)) - return s; - - error = filename_lookup(dfd, s, flags | LOOKUP_PARENT, nd); - if (error) { - putname(s); - return ERR_PTR(error); - } - - return s; + return filename_parentat(dfd, getname(path), flags & LOOKUP_REVAL, + parent, last, type); } /** @@ -2279,10 +2360,8 @@ mountpoint_last(struct nameidata *nd, struct path *path) /* If we're in rcuwalk, drop out of it to handle last component */ if (nd->flags & LOOKUP_RCU) { - if (unlazy_walk(nd, NULL)) { - error = -ECHILD; - goto out; - } + if (unlazy_walk(nd, NULL, 0)) + return -ECHILD; } nd->flags &= ~LOOKUP_PARENT; @@ -2290,7 +2369,7 @@ mountpoint_last(struct nameidata *nd, struct path *path) if (unlikely(nd->last_type != LAST_NORM)) { error = handle_dots(nd, nd->last_type); if (error) - goto out; + return error; dentry = dget(nd->path.dentry); goto done; } @@ -2305,74 +2384,60 @@ mountpoint_last(struct nameidata *nd, struct path *path) */ dentry = d_alloc(dir, &nd->last); if (!dentry) { - error = -ENOMEM; mutex_unlock(&dir->d_inode->i_mutex); - goto out; + return -ENOMEM; } dentry = lookup_real(dir->d_inode, dentry, nd->flags); - error = PTR_ERR(dentry); if (IS_ERR(dentry)) { mutex_unlock(&dir->d_inode->i_mutex); - goto out; + return PTR_ERR(dentry); } } mutex_unlock(&dir->d_inode->i_mutex); done: if (d_is_negative(dentry)) { - error = -ENOENT; dput(dentry); - goto out; + return -ENOENT; } + if (nd->depth) + put_link(nd); path->dentry = dentry; path->mnt = nd->path.mnt; - if (should_follow_link(dentry, nd->flags & LOOKUP_FOLLOW)) - return 1; + error = should_follow_link(nd, path, nd->flags & LOOKUP_FOLLOW, + d_backing_inode(dentry), 0); + if (unlikely(error)) + return error; mntget(path->mnt); follow_mount(path); - error = 0; -out: - terminate_walk(nd); - return error; + return 0; } /** * path_mountpoint - look up a path to be umounted - * @dfd: directory file descriptor to start walk from - * @name: full pathname to walk - * @path: pointer to container for result + * @nameidata: lookup context * @flags: lookup flags + * @path: pointer to container for result * * Look up the given name, but don't attempt to revalidate the last component. * Returns 0 and "path" will be valid on success; Returns error otherwise. */ static int -path_mountpoint(int dfd, const struct filename *name, struct path *path, - unsigned int flags) +path_mountpoint(struct nameidata *nd, unsigned flags, struct path *path) { - struct nameidata nd; + const char *s = path_init(nd, flags); int err; - - err = path_init(dfd, name, flags, &nd); - if (unlikely(err)) - goto out; - - err = mountpoint_last(&nd, path); - while (err > 0) { - void *cookie; - struct path link = *path; - err = may_follow_link(&link, &nd); - if (unlikely(err)) - break; - nd.flags |= LOOKUP_PARENT; - err = follow_link(&link, &nd, &cookie); - if (err) + if (IS_ERR(s)) + return PTR_ERR(s); + while (!(err = link_path_walk(s, nd)) && + (err = mountpoint_last(nd, path)) > 0) { + s = trailing_symlink(nd); + if (IS_ERR(s)) { + err = PTR_ERR(s); break; - err = mountpoint_last(&nd, path); - put_link(&nd, &link, cookie); + } } -out: - path_cleanup(&nd); + terminate_walk(nd); return err; } @@ -2380,16 +2445,19 @@ static int filename_mountpoint(int dfd, struct filename *name, struct path *path, unsigned int flags) { + struct nameidata nd; int error; if (IS_ERR(name)) return PTR_ERR(name); - error = path_mountpoint(dfd, name, path, flags | LOOKUP_RCU); + set_nameidata(&nd, dfd, name); + error = path_mountpoint(&nd, flags | LOOKUP_RCU, path); if (unlikely(error == -ECHILD)) - error = path_mountpoint(dfd, name, path, flags); + error = path_mountpoint(&nd, flags, path); if (unlikely(error == -ESTALE)) - error = path_mountpoint(dfd, name, path, flags | LOOKUP_REVAL); + error = path_mountpoint(&nd, flags | LOOKUP_REVAL, path); if (likely(!error)) audit_inode(name, path->dentry, 0); + restore_nameidata(); putname(name); return error; } @@ -2456,7 +2524,7 @@ EXPORT_SYMBOL(__check_sticky); */ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir) { - struct inode *inode = victim->d_inode; + struct inode *inode = d_backing_inode(victim); int error; if (d_is_negative(victim)) @@ -2922,18 +2990,19 @@ out_dput: /* * Handle the last step of open() */ -static int do_last(struct nameidata *nd, struct path *path, +static int do_last(struct nameidata *nd, struct file *file, const struct open_flags *op, - int *opened, struct filename *name) + int *opened) { struct dentry *dir = nd->path.dentry; int open_flag = op->open_flag; bool will_truncate = (open_flag & O_TRUNC) != 0; bool got_write = false; int acc_mode = op->acc_mode; + unsigned seq; struct inode *inode; - bool symlink_ok = false; struct path save_parent = { .dentry = NULL, .mnt = NULL }; + struct path path; bool retried = false; int error; @@ -2942,7 +3011,7 @@ static int do_last(struct nameidata *nd, struct path *path, if (nd->last_type != LAST_NORM) { error = handle_dots(nd, nd->last_type); - if (error) + if (unlikely(error)) return error; goto finish_open; } @@ -2950,15 +3019,13 @@ static int do_last(struct nameidata *nd, struct path *path, if (!(open_flag & O_CREAT)) { if (nd->last.name[nd->last.len]) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; - if (open_flag & O_PATH && !(nd->flags & LOOKUP_FOLLOW)) - symlink_ok = true; /* we _can_ be in RCU mode here */ - error = lookup_fast(nd, path, &inode); + error = lookup_fast(nd, &path, &inode, &seq); if (likely(!error)) goto finish_lookup; if (error < 0) - goto out; + return error; BUG_ON(nd->inode != dir->d_inode); } else { @@ -2972,11 +3039,10 @@ static int do_last(struct nameidata *nd, struct path *path, if (error) return error; - audit_inode(name, dir, LOOKUP_PARENT); - error = -EISDIR; + audit_inode(nd->name, dir, LOOKUP_PARENT); /* trailing slashes? */ - if (nd->last.name[nd->last.len]) - goto out; + if (unlikely(nd->last.name[nd->last.len])) + return -EISDIR; } retry_lookup: @@ -2991,7 +3057,7 @@ retry_lookup: */ } mutex_lock(&dir->d_inode->i_mutex); - error = lookup_open(nd, path, file, op, got_write, opened); + error = lookup_open(nd, &path, file, op, got_write, opened); mutex_unlock(&dir->d_inode->i_mutex); if (error <= 0) { @@ -3002,7 +3068,7 @@ retry_lookup: !S_ISREG(file_inode(file)->i_mode)) will_truncate = false; - audit_inode(name, file->f_path.dentry, 0); + audit_inode(nd->name, file->f_path.dentry, 0); goto opened; } @@ -3011,15 +3077,15 @@ retry_lookup: open_flag &= ~O_TRUNC; will_truncate = false; acc_mode = MAY_OPEN; - path_to_nameidata(path, nd); + path_to_nameidata(&path, nd); goto finish_open_created; } /* * create/update audit record if it already exists. */ - if (d_is_positive(path->dentry)) - audit_inode(name, path->dentry, 0); + if (d_is_positive(path.dentry)) + audit_inode(nd->name, path.dentry, 0); /* * If atomic_open() acquired write access it is dropped now due to @@ -3031,47 +3097,45 @@ retry_lookup: got_write = false; } - error = -EEXIST; - if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT)) - goto exit_dput; - - error = follow_managed(path, nd->flags); - if (error < 0) - goto exit_dput; + if (unlikely((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT))) { + path_to_nameidata(&path, nd); + return -EEXIST; + } - if (error) - nd->flags |= LOOKUP_JUMPED; + error = follow_managed(&path, nd); + if (unlikely(error < 0)) + return error; BUG_ON(nd->flags & LOOKUP_RCU); - inode = path->dentry->d_inode; - error = -ENOENT; - if (d_is_negative(path->dentry)) { - path_to_nameidata(path, nd); - goto out; + inode = d_backing_inode(path.dentry); + seq = 0; /* out of RCU mode, so the value doesn't matter */ + if (unlikely(d_is_negative(path.dentry))) { + path_to_nameidata(&path, nd); + return -ENOENT; } finish_lookup: - /* we _can_ be in RCU mode here */ - if (should_follow_link(path->dentry, !symlink_ok)) { - if (nd->flags & LOOKUP_RCU) { - if (unlikely(nd->path.mnt != path->mnt || - unlazy_walk(nd, path->dentry))) { - error = -ECHILD; - goto out; - } - } - BUG_ON(inode != path->dentry->d_inode); - return 1; + if (nd->depth) + put_link(nd); + error = should_follow_link(nd, &path, nd->flags & LOOKUP_FOLLOW, + inode, seq); + if (unlikely(error)) + return error; + + if (unlikely(d_is_symlink(path.dentry)) && !(open_flag & O_PATH)) { + path_to_nameidata(&path, nd); + return -ELOOP; } - if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path->mnt) { - path_to_nameidata(path, nd); + if ((nd->flags & LOOKUP_RCU) || nd->path.mnt != path.mnt) { + path_to_nameidata(&path, nd); } else { save_parent.dentry = nd->path.dentry; - save_parent.mnt = mntget(path->mnt); - nd->path.dentry = path->dentry; + save_parent.mnt = mntget(path.mnt); + nd->path.dentry = path.dentry; } nd->inode = inode; + nd->seq = seq; /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */ finish_open: error = complete_walk(nd); @@ -3079,7 +3143,7 @@ finish_open: path_put(&save_parent); return error; } - audit_inode(name, nd->path.dentry, 0); + audit_inode(nd->name, nd->path.dentry, 0); error = -EISDIR; if ((open_flag & O_CREAT) && d_is_dir(nd->path.dentry)) goto out; @@ -3126,12 +3190,8 @@ out: if (got_write) mnt_drop_write(nd->path.mnt); path_put(&save_parent); - terminate_walk(nd); return error; -exit_dput: - path_put_conditional(path, nd); - goto out; exit_fput: fput(file); goto out; @@ -3155,50 +3215,46 @@ stale_open: goto retry_lookup; } -static int do_tmpfile(int dfd, struct filename *pathname, - struct nameidata *nd, int flags, +static int do_tmpfile(struct nameidata *nd, unsigned flags, const struct open_flags *op, struct file *file, int *opened) { static const struct qstr name = QSTR_INIT("/", 1); - struct dentry *dentry, *child; + struct dentry *child; struct inode *dir; - int error = path_lookupat(dfd, pathname, - flags | LOOKUP_DIRECTORY, nd); + struct path path; + int error = path_lookupat(nd, flags | LOOKUP_DIRECTORY, &path); if (unlikely(error)) return error; - error = mnt_want_write(nd->path.mnt); + error = mnt_want_write(path.mnt); if (unlikely(error)) goto out; + dir = path.dentry->d_inode; /* we want directory to be writable */ - error = inode_permission(nd->inode, MAY_WRITE | MAY_EXEC); + error = inode_permission(dir, MAY_WRITE | MAY_EXEC); if (error) goto out2; - dentry = nd->path.dentry; - dir = dentry->d_inode; if (!dir->i_op->tmpfile) { error = -EOPNOTSUPP; goto out2; } - child = d_alloc(dentry, &name); + child = d_alloc(path.dentry, &name); if (unlikely(!child)) { error = -ENOMEM; goto out2; } - nd->flags &= ~LOOKUP_DIRECTORY; - nd->flags |= op->intent; - dput(nd->path.dentry); - nd->path.dentry = child; - error = dir->i_op->tmpfile(dir, nd->path.dentry, op->mode); + dput(path.dentry); + path.dentry = child; + error = dir->i_op->tmpfile(dir, child, op->mode); if (error) goto out2; - audit_inode(pathname, nd->path.dentry, 0); + audit_inode(nd->name, child, 0); /* Don't check for other permissions, the inode was just created */ - error = may_open(&nd->path, MAY_OPEN, op->open_flag); + error = may_open(&path, MAY_OPEN, op->open_flag); if (error) goto out2; - file->f_path.mnt = nd->path.mnt; - error = finish_open(file, nd->path.dentry, NULL, opened); + file->f_path.mnt = path.mnt; + error = finish_open(file, child, NULL, opened); if (error) goto out2; error = open_check_o_direct(file); @@ -3211,17 +3267,17 @@ static int do_tmpfile(int dfd, struct filename *pathname, spin_unlock(&inode->i_lock); } out2: - mnt_drop_write(nd->path.mnt); + mnt_drop_write(path.mnt); out: - path_put(&nd->path); + path_put(&path); return error; } -static struct file *path_openat(int dfd, struct filename *pathname, - struct nameidata *nd, const struct open_flags *op, int flags) +static struct file *path_openat(struct nameidata *nd, + const struct open_flags *op, unsigned flags) { + const char *s; struct file *file; - struct path path; int opened = 0; int error; @@ -3232,37 +3288,25 @@ static struct file *path_openat(int dfd, struct filename *pathname, file->f_flags = op->open_flag; if (unlikely(file->f_flags & __O_TMPFILE)) { - error = do_tmpfile(dfd, pathname, nd, flags, op, file, &opened); + error = do_tmpfile(nd, flags, op, file, &opened); goto out2; } - error = path_init(dfd, pathname, flags, nd); - if (unlikely(error)) - goto out; - - error = do_last(nd, &path, file, op, &opened, pathname); - while (unlikely(error > 0)) { /* trailing symlink */ - struct path link = path; - void *cookie; - if (!(nd->flags & LOOKUP_FOLLOW)) { - path_put_conditional(&path, nd); - path_put(&nd->path); - error = -ELOOP; - break; - } - error = may_follow_link(&link, nd); - if (unlikely(error)) - break; - nd->flags |= LOOKUP_PARENT; + s = path_init(nd, flags); + if (IS_ERR(s)) { + put_filp(file); + return ERR_CAST(s); + } + while (!(error = link_path_walk(s, nd)) && + (error = do_last(nd, file, op, &opened)) > 0) { nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL); - error = follow_link(&link, nd, &cookie); - if (unlikely(error)) + s = trailing_symlink(nd); + if (IS_ERR(s)) { + error = PTR_ERR(s); break; - error = do_last(nd, &path, file, op, &opened, pathname); - put_link(nd, &link, cookie); + } } -out: - path_cleanup(nd); + terminate_walk(nd); out2: if (!(opened & FILE_OPENED)) { BUG_ON(!error); @@ -3287,11 +3331,13 @@ struct file *do_filp_open(int dfd, struct filename *pathname, int flags = op->lookup_flags; struct file *filp; - filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_RCU); + set_nameidata(&nd, dfd, pathname); + filp = path_openat(&nd, op, flags | LOOKUP_RCU); if (unlikely(filp == ERR_PTR(-ECHILD))) - filp = path_openat(dfd, pathname, &nd, op, flags); + filp = path_openat(&nd, op, flags); if (unlikely(filp == ERR_PTR(-ESTALE))) - filp = path_openat(dfd, pathname, &nd, op, flags | LOOKUP_REVAL); + filp = path_openat(&nd, op, flags | LOOKUP_REVAL); + restore_nameidata(); return filp; } @@ -3313,11 +3359,13 @@ struct file *do_file_open_root(struct dentry *dentry, struct vfsmount *mnt, if (unlikely(IS_ERR(filename))) return ERR_CAST(filename); - file = path_openat(-1, filename, &nd, op, flags | LOOKUP_RCU); + set_nameidata(&nd, -1, filename); + file = path_openat(&nd, op, flags | LOOKUP_RCU); if (unlikely(file == ERR_PTR(-ECHILD))) - file = path_openat(-1, filename, &nd, op, flags); + file = path_openat(&nd, op, flags); if (unlikely(file == ERR_PTR(-ESTALE))) - file = path_openat(-1, filename, &nd, op, flags | LOOKUP_REVAL); + file = path_openat(&nd, op, flags | LOOKUP_REVAL); + restore_nameidata(); putname(filename); return file; } @@ -3326,7 +3374,8 @@ static struct dentry *filename_create(int dfd, struct filename *name, struct path *path, unsigned int lookup_flags) { struct dentry *dentry = ERR_PTR(-EEXIST); - struct nameidata nd; + struct qstr last; + int type; int err2; int error; bool is_dir = (lookup_flags & LOOKUP_DIRECTORY); @@ -3337,26 +3386,25 @@ static struct dentry *filename_create(int dfd, struct filename *name, */ lookup_flags &= LOOKUP_REVAL; - error = filename_lookup(dfd, name, LOOKUP_PARENT|lookup_flags, &nd); - if (error) - return ERR_PTR(error); + name = filename_parentat(dfd, name, lookup_flags, path, &last, &type); + if (IS_ERR(name)) + return ERR_CAST(name); /* * Yucky last component or no last component at all? * (foo/., foo/.., /////) */ - if (nd.last_type != LAST_NORM) + if (unlikely(type != LAST_NORM)) goto out; - nd.flags &= ~LOOKUP_PARENT; - nd.flags |= LOOKUP_CREATE | LOOKUP_EXCL; /* don't fail immediately if it's r/o, at least try to report other errors */ - err2 = mnt_want_write(nd.path.mnt); + err2 = mnt_want_write(path->mnt); /* * Do the final lookup. */ - mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); - dentry = lookup_hash(&nd); + lookup_flags |= LOOKUP_CREATE | LOOKUP_EXCL; + mutex_lock_nested(&path->dentry->d_inode->i_mutex, I_MUTEX_PARENT); + dentry = __lookup_hash(&last, path->dentry, lookup_flags); if (IS_ERR(dentry)) goto unlock; @@ -3370,7 +3418,7 @@ static struct dentry *filename_create(int dfd, struct filename *name, * all is fine. Let's be bastards - you had / on the end, you've * been asking for (non-existent) directory. -ENOENT for you. */ - if (unlikely(!is_dir && nd.last.name[nd.last.len])) { + if (unlikely(!is_dir && last.name[last.len])) { error = -ENOENT; goto fail; } @@ -3378,31 +3426,26 @@ static struct dentry *filename_create(int dfd, struct filename *name, error = err2; goto fail; } - *path = nd.path; + putname(name); return dentry; fail: dput(dentry); dentry = ERR_PTR(error); unlock: - mutex_unlock(&nd.path.dentry->d_inode->i_mutex); + mutex_unlock(&path->dentry->d_inode->i_mutex); if (!err2) - mnt_drop_write(nd.path.mnt); + mnt_drop_write(path->mnt); out: - path_put(&nd.path); + path_put(path); + putname(name); return dentry; } struct dentry *kern_path_create(int dfd, const char *pathname, struct path *path, unsigned int lookup_flags) { - struct filename *filename = getname_kernel(pathname); - struct dentry *res; - - if (IS_ERR(filename)) - return ERR_CAST(filename); - res = filename_create(dfd, filename, path, lookup_flags); - putname(filename); - return res; + return filename_create(dfd, getname_kernel(pathname), + path, lookup_flags); } EXPORT_SYMBOL(kern_path_create); @@ -3415,16 +3458,10 @@ void done_path_create(struct path *path, struct dentry *dentry) } EXPORT_SYMBOL(done_path_create); -struct dentry *user_path_create(int dfd, const char __user *pathname, +inline struct dentry *user_path_create(int dfd, const char __user *pathname, struct path *path, unsigned int lookup_flags) { - struct filename *tmp = getname(pathname); - struct dentry *res; - if (IS_ERR(tmp)) - return ERR_CAST(tmp); - res = filename_create(dfd, tmp, path, lookup_flags); - putname(tmp); - return res; + return filename_create(dfd, getname(pathname), path, lookup_flags); } EXPORT_SYMBOL(user_path_create); @@ -3645,14 +3682,17 @@ static long do_rmdir(int dfd, const char __user *pathname) int error = 0; struct filename *name; struct dentry *dentry; - struct nameidata nd; + struct path path; + struct qstr last; + int type; unsigned int lookup_flags = 0; retry: - name = user_path_parent(dfd, pathname, &nd, lookup_flags); + name = user_path_parent(dfd, pathname, + &path, &last, &type, lookup_flags); if (IS_ERR(name)) return PTR_ERR(name); - switch(nd.last_type) { + switch (type) { case LAST_DOTDOT: error = -ENOTEMPTY; goto exit1; @@ -3664,13 +3704,12 @@ retry: goto exit1; } - nd.flags &= ~LOOKUP_PARENT; - error = mnt_want_write(nd.path.mnt); + error = mnt_want_write(path.mnt); if (error) goto exit1; - mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); - dentry = lookup_hash(&nd); + mutex_lock_nested(&path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); + dentry = __lookup_hash(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto exit2; @@ -3678,17 +3717,17 @@ retry: error = -ENOENT; goto exit3; } - error = security_path_rmdir(&nd.path, dentry); + error = security_path_rmdir(&path, dentry); if (error) goto exit3; - error = vfs_rmdir(nd.path.dentry->d_inode, dentry); + error = vfs_rmdir(path.dentry->d_inode, dentry); exit3: dput(dentry); exit2: - mutex_unlock(&nd.path.dentry->d_inode->i_mutex); - mnt_drop_write(nd.path.mnt); + mutex_unlock(&path.dentry->d_inode->i_mutex); + mnt_drop_write(path.mnt); exit1: - path_put(&nd.path); + path_put(&path); putname(name); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; @@ -3771,43 +3810,45 @@ static long do_unlinkat(int dfd, const char __user *pathname) int error; struct filename *name; struct dentry *dentry; - struct nameidata nd; + struct path path; + struct qstr last; + int type; struct inode *inode = NULL; struct inode *delegated_inode = NULL; unsigned int lookup_flags = 0; retry: - name = user_path_parent(dfd, pathname, &nd, lookup_flags); + name = user_path_parent(dfd, pathname, + &path, &last, &type, lookup_flags); if (IS_ERR(name)) return PTR_ERR(name); error = -EISDIR; - if (nd.last_type != LAST_NORM) + if (type != LAST_NORM) goto exit1; - nd.flags &= ~LOOKUP_PARENT; - error = mnt_want_write(nd.path.mnt); + error = mnt_want_write(path.mnt); if (error) goto exit1; retry_deleg: - mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); - dentry = lookup_hash(&nd); + mutex_lock_nested(&path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); + dentry = __lookup_hash(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { /* Why not before? Because we want correct error value */ - if (nd.last.name[nd.last.len]) + if (last.name[last.len]) goto slashes; inode = dentry->d_inode; if (d_is_negative(dentry)) goto slashes; ihold(inode); - error = security_path_unlink(&nd.path, dentry); + error = security_path_unlink(&path, dentry); if (error) goto exit2; - error = vfs_unlink(nd.path.dentry->d_inode, dentry, &delegated_inode); + error = vfs_unlink(path.dentry->d_inode, dentry, &delegated_inode); exit2: dput(dentry); } - mutex_unlock(&nd.path.dentry->d_inode->i_mutex); + mutex_unlock(&path.dentry->d_inode->i_mutex); if (inode) iput(inode); /* truncate the inode here */ inode = NULL; @@ -3816,9 +3857,9 @@ exit2: if (!error) goto retry_deleg; } - mnt_drop_write(nd.path.mnt); + mnt_drop_write(path.mnt); exit1: - path_put(&nd.path); + path_put(&path); putname(name); if (retry_estale(error, lookup_flags)) { lookup_flags |= LOOKUP_REVAL; @@ -4248,14 +4289,15 @@ EXPORT_SYMBOL(vfs_rename); SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, int, newdfd, const char __user *, newname, unsigned int, flags) { - struct dentry *old_dir, *new_dir; struct dentry *old_dentry, *new_dentry; struct dentry *trap; - struct nameidata oldnd, newnd; + struct path old_path, new_path; + struct qstr old_last, new_last; + int old_type, new_type; struct inode *delegated_inode = NULL; struct filename *from; struct filename *to; - unsigned int lookup_flags = 0; + unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET; bool should_retry = false; int error; @@ -4269,47 +4311,45 @@ SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, if ((flags & RENAME_WHITEOUT) && !capable(CAP_MKNOD)) return -EPERM; + if (flags & RENAME_EXCHANGE) + target_flags = 0; + retry: - from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags); + from = user_path_parent(olddfd, oldname, + &old_path, &old_last, &old_type, lookup_flags); if (IS_ERR(from)) { error = PTR_ERR(from); goto exit; } - to = user_path_parent(newdfd, newname, &newnd, lookup_flags); + to = user_path_parent(newdfd, newname, + &new_path, &new_last, &new_type, lookup_flags); if (IS_ERR(to)) { error = PTR_ERR(to); goto exit1; } error = -EXDEV; - if (oldnd.path.mnt != newnd.path.mnt) + if (old_path.mnt != new_path.mnt) goto exit2; - old_dir = oldnd.path.dentry; error = -EBUSY; - if (oldnd.last_type != LAST_NORM) + if (old_type != LAST_NORM) goto exit2; - new_dir = newnd.path.dentry; if (flags & RENAME_NOREPLACE) error = -EEXIST; - if (newnd.last_type != LAST_NORM) + if (new_type != LAST_NORM) goto exit2; - error = mnt_want_write(oldnd.path.mnt); + error = mnt_want_write(old_path.mnt); if (error) goto exit2; - oldnd.flags &= ~LOOKUP_PARENT; - newnd.flags &= ~LOOKUP_PARENT; - if (!(flags & RENAME_EXCHANGE)) - newnd.flags |= LOOKUP_RENAME_TARGET; - retry_deleg: - trap = lock_rename(new_dir, old_dir); + trap = lock_rename(new_path.dentry, old_path.dentry); - old_dentry = lookup_hash(&oldnd); + old_dentry = __lookup_hash(&old_last, old_path.dentry, lookup_flags); error = PTR_ERR(old_dentry); if (IS_ERR(old_dentry)) goto exit3; @@ -4317,7 +4357,7 @@ retry_deleg: error = -ENOENT; if (d_is_negative(old_dentry)) goto exit4; - new_dentry = lookup_hash(&newnd); + new_dentry = __lookup_hash(&new_last, new_path.dentry, lookup_flags | target_flags); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) goto exit4; @@ -4331,16 +4371,16 @@ retry_deleg: if (!d_is_dir(new_dentry)) { error = -ENOTDIR; - if (newnd.last.name[newnd.last.len]) + if (new_last.name[new_last.len]) goto exit5; } } /* unless the source is a directory trailing slashes give -ENOTDIR */ if (!d_is_dir(old_dentry)) { error = -ENOTDIR; - if (oldnd.last.name[oldnd.last.len]) + if (old_last.name[old_last.len]) goto exit5; - if (!(flags & RENAME_EXCHANGE) && newnd.last.name[newnd.last.len]) + if (!(flags & RENAME_EXCHANGE) && new_last.name[new_last.len]) goto exit5; } /* source should not be ancestor of target */ @@ -4353,32 +4393,32 @@ retry_deleg: if (new_dentry == trap) goto exit5; - error = security_path_rename(&oldnd.path, old_dentry, - &newnd.path, new_dentry, flags); + error = security_path_rename(&old_path, old_dentry, + &new_path, new_dentry, flags); if (error) goto exit5; - error = vfs_rename(old_dir->d_inode, old_dentry, - new_dir->d_inode, new_dentry, + error = vfs_rename(old_path.dentry->d_inode, old_dentry, + new_path.dentry->d_inode, new_dentry, &delegated_inode, flags); exit5: dput(new_dentry); exit4: dput(old_dentry); exit3: - unlock_rename(new_dir, old_dir); + unlock_rename(new_path.dentry, old_path.dentry); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) goto retry_deleg; } - mnt_drop_write(oldnd.path.mnt); + mnt_drop_write(old_path.mnt); exit2: if (retry_estale(error, lookup_flags)) should_retry = true; - path_put(&newnd.path); + path_put(&new_path); putname(to); exit1: - path_put(&oldnd.path); + path_put(&old_path); putname(from); if (should_retry) { should_retry = false; @@ -4437,18 +4477,19 @@ EXPORT_SYMBOL(readlink_copy); */ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) { - struct nameidata nd; void *cookie; + struct inode *inode = d_inode(dentry); + const char *link = inode->i_link; int res; - nd.depth = 0; - cookie = dentry->d_inode->i_op->follow_link(dentry, &nd); - if (IS_ERR(cookie)) - return PTR_ERR(cookie); - - res = readlink_copy(buffer, buflen, nd_get_link(&nd)); - if (dentry->d_inode->i_op->put_link) - dentry->d_inode->i_op->put_link(dentry, &nd, cookie); + if (!link) { + link = inode->i_op->follow_link(dentry, &cookie); + if (IS_ERR(link)) + return PTR_ERR(link); + } + res = readlink_copy(buffer, buflen, link); + if (inode->i_op->put_link) + inode->i_op->put_link(inode, cookie); return res; } EXPORT_SYMBOL(generic_readlink); @@ -4480,22 +4521,21 @@ int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) } EXPORT_SYMBOL(page_readlink); -void *page_follow_link_light(struct dentry *dentry, struct nameidata *nd) +const char *page_follow_link_light(struct dentry *dentry, void **cookie) { struct page *page = NULL; - nd_set_link(nd, page_getlink(dentry, &page)); - return page; + char *res = page_getlink(dentry, &page); + if (!IS_ERR(res)) + *cookie = page; + return res; } EXPORT_SYMBOL(page_follow_link_light); -void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) +void page_put_link(struct inode *unused, void *cookie) { struct page *page = cookie; - - if (page) { - kunmap(page); - page_cache_release(page); - } + kunmap(page); + page_cache_release(page); } EXPORT_SYMBOL(page_put_link); diff --git a/fs/namespace.c b/fs/namespace.c index 777b4819c..2b8aa15fd 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -463,7 +463,6 @@ void __mnt_drop_write(struct vfsmount *mnt) mnt_dec_writers(real_mount(mnt)); preempt_enable(); } -EXPORT_SYMBOL_GPL(__mnt_drop_write); /** * mnt_drop_write - give up write access to a mount @@ -591,24 +590,35 @@ static void delayed_free_vfsmnt(struct rcu_head *head) } /* call under rcu_read_lock */ -bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) +int __legitimize_mnt(struct vfsmount *bastard, unsigned seq) { struct mount *mnt; if (read_seqretry(&mount_lock, seq)) - return false; + return 1; if (bastard == NULL) - return true; + return 0; mnt = real_mount(bastard); mnt_add_count(mnt, 1); if (likely(!read_seqretry(&mount_lock, seq))) - return true; + return 0; if (bastard->mnt_flags & MNT_SYNC_UMOUNT) { mnt_add_count(mnt, -1); - return false; + return 1; + } + return -1; +} + +/* call under rcu_read_lock */ +bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) +{ + int res = __legitimize_mnt(bastard, seq); + if (likely(!res)) + return true; + if (unlikely(res < 0)) { + rcu_read_unlock(); + mntput(bastard); + rcu_read_lock(); } - rcu_read_unlock(); - mntput(bastard); - rcu_read_lock(); return false; } @@ -1216,7 +1226,7 @@ EXPORT_SYMBOL(replace_mount_options); /* iterator; we want it to have access to namespace_sem, thus here... */ static void *m_start(struct seq_file *m, loff_t *pos) { - struct proc_mounts *p = proc_mounts(m); + struct proc_mounts *p = m->private; down_read(&namespace_sem); if (p->cached_event == p->ns->event) { @@ -1237,7 +1247,7 @@ static void *m_start(struct seq_file *m, loff_t *pos) static void *m_next(struct seq_file *m, void *v, loff_t *pos) { - struct proc_mounts *p = proc_mounts(m); + struct proc_mounts *p = m->private; p->cached_mount = seq_list_next(v, &p->ns->list, pos); p->cached_index = *pos; @@ -1251,7 +1261,7 @@ static void m_stop(struct seq_file *m, void *v) static int m_show(struct seq_file *m, void *v) { - struct proc_mounts *p = proc_mounts(m); + struct proc_mounts *p = m->private; struct mount *r = list_entry(v, struct mount, mnt_list); return p->show(m, &r->mnt); } @@ -1793,7 +1803,6 @@ int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, } return 0; } -EXPORT_SYMBOL(iterate_mounts); static void cleanup_group_ids(struct mount *mnt, struct mount *end) { diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 80021c709..93575e91a 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -1145,6 +1145,8 @@ static int ncp_rename(struct inode *old_dir, struct dentry *old_dentry, case 0x00: ncp_dbg(1, "renamed %pd -> %pd\n", old_dentry, new_dentry); + ncp_d_prune(old_dentry); + ncp_d_prune(new_dentry); break; case 0x9E: error = -ENAMETOOLONG; diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 8d129bb73..682529c00 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -458,7 +458,7 @@ check_gss_callback_principal(struct nfs_client *clp, struct svc_rqst *rqstp) * pg_authenticate method for nfsv4 callback threads. * * The authflavor has been negotiated, so an incorrect flavor is a server - * bug. Drop packets with incorrect authflavor. + * bug. Deny packets with incorrect authflavor. * * All other checking done after NFS decoding where the nfs_client can be * found in nfs4_callback_compound @@ -468,12 +468,12 @@ static int nfs_callback_authenticate(struct svc_rqst *rqstp) switch (rqstp->rq_authop->flavour) { case RPC_AUTH_NULL: if (rqstp->rq_proc != CB_NULL) - return SVC_DROP; + return SVC_DENIED; break; case RPC_AUTH_GSS: /* No RPC_AUTH_GSS support yet in NFSv4.1 */ if (svc_is_backchannel(rqstp)) - return SVC_DROP; + return SVC_DENIED; } return SVC_OK; } diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 197806fb8..29e3c1b01 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -327,10 +327,8 @@ validate_seqid(struct nfs4_slot_table *tbl, struct cb_sequenceargs * args) dprintk("%s slot table seqid: %u\n", __func__, slot->seq_nr); /* Normal */ - if (likely(args->csa_sequenceid == slot->seq_nr + 1)) { - slot->seq_nr++; + if (likely(args->csa_sequenceid == slot->seq_nr + 1)) goto out_ok; - } /* Replay */ if (args->csa_sequenceid == slot->seq_nr) { @@ -418,6 +416,7 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, struct cb_process_state *cps) { struct nfs4_slot_table *tbl; + struct nfs4_slot *slot; struct nfs_client *clp; int i; __be32 status = htonl(NFS4ERR_BADSESSION); @@ -429,25 +428,32 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, if (!(clp->cl_session->flags & SESSION4_BACK_CHAN)) goto out; + tbl = &clp->cl_session->bc_slot_table; + slot = tbl->slots + args->csa_slotid; spin_lock(&tbl->slot_tbl_lock); /* state manager is resetting the session */ if (test_bit(NFS4_SLOT_TBL_DRAINING, &tbl->slot_tbl_state)) { - spin_unlock(&tbl->slot_tbl_lock); status = htonl(NFS4ERR_DELAY); /* Return NFS4ERR_BADSESSION if we're draining the session * in order to reset it. */ if (test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) status = htonl(NFS4ERR_BADSESSION); - goto out; + goto out_unlock; } - status = validate_seqid(&clp->cl_session->bc_slot_table, args); - spin_unlock(&tbl->slot_tbl_lock); + memcpy(&res->csr_sessionid, &args->csa_sessionid, + sizeof(res->csr_sessionid)); + res->csr_sequenceid = args->csa_sequenceid; + res->csr_slotid = args->csa_slotid; + res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; + res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; + + status = validate_seqid(tbl, args); if (status) - goto out; + goto out_unlock; cps->slotid = args->csa_slotid; @@ -458,15 +464,17 @@ __be32 nfs4_callback_sequence(struct cb_sequenceargs *args, */ if (referring_call_exists(clp, args->csa_nrclists, args->csa_rclists)) { status = htonl(NFS4ERR_DELAY); - goto out; + goto out_unlock; } - memcpy(&res->csr_sessionid, &args->csa_sessionid, - sizeof(res->csr_sessionid)); - res->csr_sequenceid = args->csa_sequenceid; - res->csr_slotid = args->csa_slotid; - res->csr_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; - res->csr_target_highestslotid = NFS41_BC_MAX_CALLBACKS - 1; + /* + * RFC5661 20.9.3 + * If CB_SEQUENCE returns an error, then the state of the slot + * (sequence ID, cached reply) MUST NOT change. + */ + slot->seq_nr++; +out_unlock: + spin_unlock(&tbl->slot_tbl_lock); out: cps->clp = clp; /* put in nfs4_callback_compound */ diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index 19ca95cdf..6b1697a01 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -909,7 +909,7 @@ static __be32 nfs4_callback_compound(struct svc_rqst *rqstp, void *argp, void *r xdr_init_encode(&xdr_out, &rqstp->rq_res, p); status = decode_compound_hdr_arg(&xdr_in, &hdr_arg); - if (status == __constant_htonl(NFS4ERR_RESOURCE)) + if (status == htonl(NFS4ERR_RESOURCE)) return rpc_garbage_args; if (hdr_arg.minorversion == 0) { diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 892aefff3..4a90c9bb3 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -775,7 +775,7 @@ static int nfs_init_server(struct nfs_server *server, server->options = data->options; server->caps |= NFS_CAP_HARDLINKS|NFS_CAP_SYMLINKS|NFS_CAP_FILEID| NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|NFS_CAP_OWNER_GROUP| - NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME|NFS_CAP_CHANGE_ATTR; + NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME; if (data->rsize) server->rsize = nfs_block_size(data->rsize, NULL); @@ -825,7 +825,6 @@ error: * Load up the server record from information gained in an fsinfo record */ static void nfs_server_set_fsinfo(struct nfs_server *server, - struct nfs_fh *mntfh, struct nfs_fsinfo *fsinfo) { unsigned long max_rpc_payload; @@ -901,7 +900,7 @@ int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs if (error < 0) goto out_error; - nfs_server_set_fsinfo(server, mntfh, &fsinfo); + nfs_server_set_fsinfo(server, &fsinfo); /* Get some general file system info */ if (server->namelen == 0) { @@ -1193,8 +1192,6 @@ void nfs_clients_init(struct net *net) } #ifdef CONFIG_PROC_FS -static struct proc_dir_entry *proc_fs_nfs; - static int nfs_server_list_open(struct inode *inode, struct file *file); static void *nfs_server_list_start(struct seq_file *p, loff_t *pos); static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos); @@ -1364,27 +1361,29 @@ static int nfs_volume_list_show(struct seq_file *m, void *v) { struct nfs_server *server; struct nfs_client *clp; - char dev[8], fsid[17]; + char dev[13]; // 8 for 2^24, 1 for ':', 3 for 2^8, 1 for '\0' + char fsid[34]; // 2 * 16 for %llx, 1 for ':', 1 for '\0' struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); /* display header on line 1 */ if (v == &nn->nfs_volume_list) { - seq_puts(m, "NV SERVER PORT DEV FSID FSC\n"); + seq_puts(m, "NV SERVER PORT DEV FSID" + " FSC\n"); return 0; } /* display one transport per line on subsequent lines */ server = list_entry(v, struct nfs_server, master_link); clp = server->nfs_client; - snprintf(dev, 8, "%u:%u", + snprintf(dev, sizeof(dev), "%u:%u", MAJOR(server->s_dev), MINOR(server->s_dev)); - snprintf(fsid, 17, "%llx:%llx", + snprintf(fsid, sizeof(fsid), "%llx:%llx", (unsigned long long) server->fsid.major, (unsigned long long) server->fsid.minor); rcu_read_lock(); - seq_printf(m, "v%u %s %s %-7s %-17s %s\n", + seq_printf(m, "v%u %s %s %-12s %-33s %s\n", clp->rpc_ops->version, rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_ADDR), rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_HEX_PORT), @@ -1434,27 +1433,20 @@ void nfs_fs_proc_net_exit(struct net *net) */ int __init nfs_fs_proc_init(void) { - struct proc_dir_entry *p; - - proc_fs_nfs = proc_mkdir("fs/nfsfs", NULL); - if (!proc_fs_nfs) + if (!proc_mkdir("fs/nfsfs", NULL)) goto error_0; /* a file of servers with which we're dealing */ - p = proc_symlink("servers", proc_fs_nfs, "../../net/nfsfs/servers"); - if (!p) + if (!proc_symlink("fs/nfsfs/servers", NULL, "../../net/nfsfs/servers")) goto error_1; /* a file of volumes that we have mounted */ - p = proc_symlink("volumes", proc_fs_nfs, "../../net/nfsfs/volumes"); - if (!p) - goto error_2; - return 0; + if (!proc_symlink("fs/nfsfs/volumes", NULL, "../../net/nfsfs/volumes")) + goto error_1; -error_2: - remove_proc_entry("servers", proc_fs_nfs); + return 0; error_1: - remove_proc_entry("fs/nfsfs", NULL); + remove_proc_subtree("fs/nfsfs", NULL); error_0: return -ENOMEM; } @@ -1464,9 +1456,7 @@ error_0: */ void nfs_fs_proc_exit(void) { - remove_proc_entry("volumes", proc_fs_nfs); - remove_proc_entry("servers", proc_fs_nfs); - remove_proc_entry("fs/nfsfs", NULL); + remove_proc_subtree("fs/nfsfs", NULL); } #endif /* CONFIG_PROC_FS */ diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index b2c8b31b2..547308a5e 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1470,9 +1470,6 @@ static int nfs_finish_open(struct nfs_open_context *ctx, { int err; - if ((open_flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL)) - *opened |= FILE_CREATED; - err = finish_open(file, dentry, do_open, opened); if (err) goto out; @@ -1771,7 +1768,7 @@ EXPORT_SYMBOL_GPL(nfs_mkdir); static void nfs_dentry_handle_enoent(struct dentry *dentry) { - if (d_really_is_positive(dentry) && !d_unhashed(dentry)) + if (simple_positive(dentry)) d_delete(dentry); } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 8b8d83a52..cc4fa1ed6 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -555,31 +555,22 @@ static int nfs_launder_page(struct page *page) return nfs_wb_page(inode, page); } -#ifdef CONFIG_NFS_SWAP static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, sector_t *span) { - int ret; struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host); *span = sis->pages; - rcu_read_lock(); - ret = xs_swapper(rcu_dereference(clnt->cl_xprt), 1); - rcu_read_unlock(); - - return ret; + return rpc_clnt_swap_activate(clnt); } static void nfs_swap_deactivate(struct file *file) { struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host); - rcu_read_lock(); - xs_swapper(rcu_dereference(clnt->cl_xprt), 0); - rcu_read_unlock(); + rpc_clnt_swap_deactivate(clnt); } -#endif const struct address_space_operations nfs_file_aops = { .readpage = nfs_readpage, @@ -596,10 +587,8 @@ const struct address_space_operations nfs_file_aops = { .launder_page = nfs_launder_page, .is_dirty_writeback = nfs_check_dirty_writeback, .error_remove_page = generic_error_remove_page, -#ifdef CONFIG_NFS_SWAP .swap_activate = nfs_swap_activate, .swap_deactivate = nfs_swap_deactivate, -#endif }; /* diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index a46bf6de9..b34f2e228 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -32,6 +32,7 @@ #include <linux/nfs_fs.h> #include <linux/nfs_page.h> #include <linux/module.h> +#include <linux/backing-dev.h> #include <linux/sunrpc/metrics.h> diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 6f5f0f425..b3289d701 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -20,6 +20,7 @@ #include "../nfs4trace.h" #include "../iostat.h" #include "../nfs.h" +#include "../nfs42.h" #define NFSDBG_FACILITY NFSDBG_PNFS_LD @@ -182,17 +183,14 @@ static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls) static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls) { - struct nfs4_ff_layout_mirror *tmp; int i, j; for (i = 0; i < fls->mirror_array_cnt - 1; i++) { for (j = i + 1; j < fls->mirror_array_cnt; j++) if (fls->mirror_array[i]->efficiency < - fls->mirror_array[j]->efficiency) { - tmp = fls->mirror_array[i]; - fls->mirror_array[i] = fls->mirror_array[j]; - fls->mirror_array[j] = tmp; - } + fls->mirror_array[j]->efficiency) + swap(fls->mirror_array[i], + fls->mirror_array[j]); } } @@ -274,6 +272,7 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, spin_lock_init(&fls->mirror_array[i]->lock); fls->mirror_array[i]->ds_count = ds_count; + fls->mirror_array[i]->lseg = &fls->generic_hdr; /* deviceid */ rc = decode_deviceid(&stream, &devid); @@ -344,6 +343,10 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, fls->mirror_array[i]->gid); } + p = xdr_inline_decode(&stream, 4); + if (p) + fls->flags = be32_to_cpup(p); + ff_layout_sort_mirrors(fls); rc = ff_layout_check_layout(lgr); if (rc) @@ -415,6 +418,146 @@ ff_layout_get_lseg_count(struct nfs4_ff_layout_segment *fls) return 1; } +static void +nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer) +{ + /* first IO request? */ + if (atomic_inc_return(&timer->n_ops) == 1) { + timer->start_time = ktime_get(); + } +} + +static ktime_t +nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer) +{ + ktime_t start, now; + + if (atomic_dec_return(&timer->n_ops) < 0) + WARN_ON_ONCE(1); + + now = ktime_get(); + start = timer->start_time; + timer->start_time = now; + return ktime_sub(now, start); +} + +static ktime_t +nfs4_ff_layout_calc_completion_time(struct rpc_task *task) +{ + return ktime_sub(ktime_get(), task->tk_start); +} + +static bool +nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror, + struct nfs4_ff_layoutstat *layoutstat) +{ + static const ktime_t notime = {0}; + ktime_t now = ktime_get(); + + nfs4_ff_start_busy_timer(&layoutstat->busy_timer); + if (ktime_equal(mirror->start_time, notime)) + mirror->start_time = now; + if (ktime_equal(mirror->last_report_time, notime)) + mirror->last_report_time = now; + if (ktime_to_ms(ktime_sub(now, mirror->last_report_time)) >= + FF_LAYOUTSTATS_REPORT_INTERVAL) { + mirror->last_report_time = now; + return true; + } + + return false; +} + +static void +nfs4_ff_layout_stat_io_update_requested(struct nfs4_ff_layoutstat *layoutstat, + __u64 requested) +{ + struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat; + + iostat->ops_requested++; + iostat->bytes_requested += requested; +} + +static void +nfs4_ff_layout_stat_io_update_completed(struct nfs4_ff_layoutstat *layoutstat, + __u64 requested, + __u64 completed, + ktime_t time_completed) +{ + struct nfs4_ff_io_stat *iostat = &layoutstat->io_stat; + ktime_t timer; + + iostat->ops_completed++; + iostat->bytes_completed += completed; + iostat->bytes_not_delivered += requested - completed; + + timer = nfs4_ff_end_busy_timer(&layoutstat->busy_timer); + iostat->total_busy_time = + ktime_add(iostat->total_busy_time, timer); + iostat->aggregate_completion_time = + ktime_add(iostat->aggregate_completion_time, time_completed); +} + +static void +nfs4_ff_layout_stat_io_start_read(struct nfs4_ff_layout_mirror *mirror, + __u64 requested) +{ + bool report; + + spin_lock(&mirror->lock); + report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat); + nfs4_ff_layout_stat_io_update_requested(&mirror->read_stat, requested); + spin_unlock(&mirror->lock); + + if (report) + pnfs_report_layoutstat(mirror->lseg->pls_layout->plh_inode); +} + +static void +nfs4_ff_layout_stat_io_end_read(struct rpc_task *task, + struct nfs4_ff_layout_mirror *mirror, + __u64 requested, + __u64 completed) +{ + spin_lock(&mirror->lock); + nfs4_ff_layout_stat_io_update_completed(&mirror->read_stat, + requested, completed, + nfs4_ff_layout_calc_completion_time(task)); + spin_unlock(&mirror->lock); +} + +static void +nfs4_ff_layout_stat_io_start_write(struct nfs4_ff_layout_mirror *mirror, + __u64 requested) +{ + bool report; + + spin_lock(&mirror->lock); + report = nfs4_ff_layoutstat_start_io(mirror , &mirror->write_stat); + nfs4_ff_layout_stat_io_update_requested(&mirror->write_stat, requested); + spin_unlock(&mirror->lock); + + if (report) + pnfs_report_layoutstat(mirror->lseg->pls_layout->plh_inode); +} + +static void +nfs4_ff_layout_stat_io_end_write(struct rpc_task *task, + struct nfs4_ff_layout_mirror *mirror, + __u64 requested, + __u64 completed, + enum nfs3_stable_how committed) +{ + if (committed == NFS_UNSTABLE) + requested = completed = 0; + + spin_lock(&mirror->lock); + nfs4_ff_layout_stat_io_update_completed(&mirror->write_stat, + requested, completed, + nfs4_ff_layout_calc_completion_time(task)); + spin_unlock(&mirror->lock); +} + static int ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg, struct nfs_commit_info *cinfo, @@ -879,6 +1022,12 @@ static int ff_layout_read_done_cb(struct rpc_task *task, return 0; } +static bool +ff_layout_need_layoutcommit(struct pnfs_layout_segment *lseg) +{ + return !(FF_LAYOUT_LSEG(lseg)->flags & FF_FLAGS_NO_LAYOUTCOMMIT); +} + /* * We reference the rpc_cred of the first WRITE that triggers the need for * a LAYOUTCOMMIT, and use it to send the layoutcommit compound. @@ -891,6 +1040,9 @@ static int ff_layout_read_done_cb(struct rpc_task *task, static void ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr) { + if (!ff_layout_need_layoutcommit(hdr->lseg)) + return; + pnfs_set_layoutcommit(hdr->inode, hdr->lseg, hdr->mds_offset + hdr->res.count); dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, @@ -909,6 +1061,10 @@ ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx) static int ff_layout_read_prepare_common(struct rpc_task *task, struct nfs_pgio_header *hdr) { + nfs4_ff_layout_stat_io_start_read( + FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), + hdr->args.count); + if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) { rpc_exit(task, -EIO); return -EIO; @@ -962,15 +1118,15 @@ static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data) { struct nfs_pgio_header *hdr = data; - if (ff_layout_read_prepare_common(task, hdr)) - return; - if (ff_layout_setup_sequence(hdr->ds_clp, &hdr->args.seq_args, &hdr->res.seq_res, task)) return; + if (ff_layout_read_prepare_common(task, hdr)) + return; + if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context, hdr->args.lock_context, FMODE_READ) == -EIO) rpc_exit(task, -EIO); /* lost lock, terminate I/O */ @@ -982,6 +1138,10 @@ static void ff_layout_read_call_done(struct rpc_task *task, void *data) dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status); + nfs4_ff_layout_stat_io_end_read(task, + FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), + hdr->args.count, hdr->res.count); + if (test_bit(NFS_IOHDR_REDO, &hdr->flags) && task->tk_status == 0) { nfs4_sequence_done(task, &hdr->res.seq_res); @@ -1074,7 +1234,8 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, return -EAGAIN; } - if (data->verf.committed == NFS_UNSTABLE) + if (data->verf.committed == NFS_UNSTABLE + && ff_layout_need_layoutcommit(data->lseg)) pnfs_set_layoutcommit(data->inode, data->lseg, data->lwb); return 0; @@ -1083,6 +1244,10 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, static int ff_layout_write_prepare_common(struct rpc_task *task, struct nfs_pgio_header *hdr) { + nfs4_ff_layout_stat_io_start_write( + FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), + hdr->args.count); + if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) { rpc_exit(task, -EIO); return -EIO; @@ -1116,15 +1281,15 @@ static void ff_layout_write_prepare_v4(struct rpc_task *task, void *data) { struct nfs_pgio_header *hdr = data; - if (ff_layout_write_prepare_common(task, hdr)) - return; - if (ff_layout_setup_sequence(hdr->ds_clp, &hdr->args.seq_args, &hdr->res.seq_res, task)) return; + if (ff_layout_write_prepare_common(task, hdr)) + return; + if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context, hdr->args.lock_context, FMODE_WRITE) == -EIO) rpc_exit(task, -EIO); /* lost lock, terminate I/O */ @@ -1134,6 +1299,11 @@ static void ff_layout_write_call_done(struct rpc_task *task, void *data) { struct nfs_pgio_header *hdr = data; + nfs4_ff_layout_stat_io_end_write(task, + FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), + hdr->args.count, hdr->res.count, + hdr->res.verf->committed); + if (test_bit(NFS_IOHDR_REDO, &hdr->flags) && task->tk_status == 0) { nfs4_sequence_done(task, &hdr->res.seq_res); @@ -1152,8 +1322,17 @@ static void ff_layout_write_count_stats(struct rpc_task *task, void *data) &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_WRITE]); } +static void ff_layout_commit_prepare_common(struct rpc_task *task, + struct nfs_commit_data *cdata) +{ + nfs4_ff_layout_stat_io_start_write( + FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index), + 0); +} + static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data) { + ff_layout_commit_prepare_common(task, data); rpc_call_start(task); } @@ -1161,10 +1340,30 @@ static void ff_layout_commit_prepare_v4(struct rpc_task *task, void *data) { struct nfs_commit_data *wdata = data; - ff_layout_setup_sequence(wdata->ds_clp, + if (ff_layout_setup_sequence(wdata->ds_clp, &wdata->args.seq_args, &wdata->res.seq_res, - task); + task)) + return; + ff_layout_commit_prepare_common(task, data); +} + +static void ff_layout_commit_done(struct rpc_task *task, void *data) +{ + struct nfs_commit_data *cdata = data; + struct nfs_page *req; + __u64 count = 0; + + if (task->tk_status == 0) { + list_for_each_entry(req, &cdata->pages, wb_list) + count += req->wb_bytes; + } + + nfs4_ff_layout_stat_io_end_write(task, + FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index), + count, count, NFS_FILE_SYNC); + + pnfs_generic_write_commit_done(task, data); } static void ff_layout_commit_count_stats(struct rpc_task *task, void *data) @@ -1205,14 +1404,14 @@ static const struct rpc_call_ops ff_layout_write_call_ops_v4 = { static const struct rpc_call_ops ff_layout_commit_call_ops_v3 = { .rpc_call_prepare = ff_layout_commit_prepare_v3, - .rpc_call_done = pnfs_generic_write_commit_done, + .rpc_call_done = ff_layout_commit_done, .rpc_count_stats = ff_layout_commit_count_stats, .rpc_release = pnfs_generic_commit_release, }; static const struct rpc_call_ops ff_layout_commit_call_ops_v4 = { .rpc_call_prepare = ff_layout_commit_prepare_v4, - .rpc_call_done = pnfs_generic_write_commit_done, + .rpc_call_done = ff_layout_commit_done, .rpc_count_stats = ff_layout_commit_count_stats, .rpc_release = pnfs_generic_commit_release, }; @@ -1256,7 +1455,6 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) fh = nfs4_ff_layout_select_ds_fh(lseg, idx); if (fh) hdr->args.fh = fh; - /* * Note that if we ever decide to split across DSes, * then we may need to handle dense-like offsets. @@ -1385,6 +1583,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) fh = select_ds_fh_from_commit(lseg, data->ds_commit_index); if (fh) data->args.fh = fh; + return nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops, vers == 3 ? &ff_layout_commit_call_ops_v3 : &ff_layout_commit_call_ops_v4, @@ -1488,6 +1687,247 @@ out: dprintk("%s: Return\n", __func__); } +static int +ff_layout_ntop4(const struct sockaddr *sap, char *buf, const size_t buflen) +{ + const struct sockaddr_in *sin = (struct sockaddr_in *)sap; + + return snprintf(buf, buflen, "%pI4", &sin->sin_addr); +} + +static size_t +ff_layout_ntop6_noscopeid(const struct sockaddr *sap, char *buf, + const int buflen) +{ + const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; + const struct in6_addr *addr = &sin6->sin6_addr; + + /* + * RFC 4291, Section 2.2.2 + * + * Shorthanded ANY address + */ + if (ipv6_addr_any(addr)) + return snprintf(buf, buflen, "::"); + + /* + * RFC 4291, Section 2.2.2 + * + * Shorthanded loopback address + */ + if (ipv6_addr_loopback(addr)) + return snprintf(buf, buflen, "::1"); + + /* + * RFC 4291, Section 2.2.3 + * + * Special presentation address format for mapped v4 + * addresses. + */ + if (ipv6_addr_v4mapped(addr)) + return snprintf(buf, buflen, "::ffff:%pI4", + &addr->s6_addr32[3]); + + /* + * RFC 4291, Section 2.2.1 + */ + return snprintf(buf, buflen, "%pI6c", addr); +} + +/* Derived from rpc_sockaddr2uaddr */ +static void +ff_layout_encode_netaddr(struct xdr_stream *xdr, struct nfs4_pnfs_ds_addr *da) +{ + struct sockaddr *sap = (struct sockaddr *)&da->da_addr; + char portbuf[RPCBIND_MAXUADDRPLEN]; + char addrbuf[RPCBIND_MAXUADDRLEN]; + char *netid; + unsigned short port; + int len, netid_len; + __be32 *p; + + switch (sap->sa_family) { + case AF_INET: + if (ff_layout_ntop4(sap, addrbuf, sizeof(addrbuf)) == 0) + return; + port = ntohs(((struct sockaddr_in *)sap)->sin_port); + netid = "tcp"; + netid_len = 3; + break; + case AF_INET6: + if (ff_layout_ntop6_noscopeid(sap, addrbuf, sizeof(addrbuf)) == 0) + return; + port = ntohs(((struct sockaddr_in6 *)sap)->sin6_port); + netid = "tcp6"; + netid_len = 4; + break; + default: + /* we only support tcp and tcp6 */ + WARN_ON_ONCE(1); + return; + } + + snprintf(portbuf, sizeof(portbuf), ".%u.%u", port >> 8, port & 0xff); + len = strlcat(addrbuf, portbuf, sizeof(addrbuf)); + + p = xdr_reserve_space(xdr, 4 + netid_len); + xdr_encode_opaque(p, netid, netid_len); + + p = xdr_reserve_space(xdr, 4 + len); + xdr_encode_opaque(p, addrbuf, len); +} + +static void +ff_layout_encode_nfstime(struct xdr_stream *xdr, + ktime_t t) +{ + struct timespec64 ts; + __be32 *p; + + p = xdr_reserve_space(xdr, 12); + ts = ktime_to_timespec64(t); + p = xdr_encode_hyper(p, ts.tv_sec); + *p++ = cpu_to_be32(ts.tv_nsec); +} + +static void +ff_layout_encode_io_latency(struct xdr_stream *xdr, + struct nfs4_ff_io_stat *stat) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 5 * 8); + p = xdr_encode_hyper(p, stat->ops_requested); + p = xdr_encode_hyper(p, stat->bytes_requested); + p = xdr_encode_hyper(p, stat->ops_completed); + p = xdr_encode_hyper(p, stat->bytes_completed); + p = xdr_encode_hyper(p, stat->bytes_not_delivered); + ff_layout_encode_nfstime(xdr, stat->total_busy_time); + ff_layout_encode_nfstime(xdr, stat->aggregate_completion_time); +} + +static void +ff_layout_encode_layoutstats(struct xdr_stream *xdr, + struct nfs42_layoutstat_args *args, + struct nfs42_layoutstat_devinfo *devinfo) +{ + struct nfs4_ff_layout_mirror *mirror = devinfo->layout_private; + struct nfs4_pnfs_ds_addr *da; + struct nfs4_pnfs_ds *ds = mirror->mirror_ds->ds; + struct nfs_fh *fh = &mirror->fh_versions[0]; + __be32 *p, *start; + + da = list_first_entry(&ds->ds_addrs, struct nfs4_pnfs_ds_addr, da_node); + dprintk("%s: DS %s: encoding address %s\n", + __func__, ds->ds_remotestr, da->da_remotestr); + /* layoutupdate length */ + start = xdr_reserve_space(xdr, 4); + /* netaddr4 */ + ff_layout_encode_netaddr(xdr, da); + /* nfs_fh4 */ + p = xdr_reserve_space(xdr, 4 + fh->size); + xdr_encode_opaque(p, fh->data, fh->size); + /* ff_io_latency4 read */ + spin_lock(&mirror->lock); + ff_layout_encode_io_latency(xdr, &mirror->read_stat.io_stat); + /* ff_io_latency4 write */ + ff_layout_encode_io_latency(xdr, &mirror->write_stat.io_stat); + spin_unlock(&mirror->lock); + /* nfstime4 */ + ff_layout_encode_nfstime(xdr, ktime_sub(ktime_get(), mirror->start_time)); + /* bool */ + p = xdr_reserve_space(xdr, 4); + *p = cpu_to_be32(false); + + *start = cpu_to_be32((xdr->p - start - 1) * 4); +} + +static bool +ff_layout_mirror_prepare_stats(struct nfs42_layoutstat_args *args, + struct pnfs_layout_segment *pls, + int *dev_count, int dev_limit) +{ + struct nfs4_ff_layout_mirror *mirror; + struct nfs4_deviceid_node *dev; + struct nfs42_layoutstat_devinfo *devinfo; + int i; + + for (i = 0; i < FF_LAYOUT_MIRROR_COUNT(pls); i++) { + if (*dev_count >= dev_limit) + break; + mirror = FF_LAYOUT_COMP(pls, i); + if (!mirror || !mirror->mirror_ds) + continue; + dev = FF_LAYOUT_DEVID_NODE(pls, i); + devinfo = &args->devinfo[*dev_count]; + memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE); + devinfo->offset = pls->pls_range.offset; + devinfo->length = pls->pls_range.length; + /* well, we don't really know if IO is continuous or not! */ + devinfo->read_count = mirror->read_stat.io_stat.bytes_completed; + devinfo->read_bytes = mirror->read_stat.io_stat.bytes_completed; + devinfo->write_count = mirror->write_stat.io_stat.bytes_completed; + devinfo->write_bytes = mirror->write_stat.io_stat.bytes_completed; + devinfo->layout_type = LAYOUT_FLEX_FILES; + devinfo->layoutstats_encode = ff_layout_encode_layoutstats; + devinfo->layout_private = mirror; + /* lseg refcount put in cleanup_layoutstats */ + pnfs_get_lseg(pls); + + ++(*dev_count); + } + + return *dev_count < dev_limit; +} + +static int +ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args) +{ + struct pnfs_layout_segment *pls; + int dev_count = 0; + + spin_lock(&args->inode->i_lock); + list_for_each_entry(pls, &NFS_I(args->inode)->layout->plh_segs, pls_list) { + dev_count += FF_LAYOUT_MIRROR_COUNT(pls); + } + spin_unlock(&args->inode->i_lock); + /* For now, send at most PNFS_LAYOUTSTATS_MAXDEV statistics */ + if (dev_count > PNFS_LAYOUTSTATS_MAXDEV) { + dprintk("%s: truncating devinfo to limit (%d:%d)\n", + __func__, dev_count, PNFS_LAYOUTSTATS_MAXDEV); + dev_count = PNFS_LAYOUTSTATS_MAXDEV; + } + args->devinfo = kmalloc(dev_count * sizeof(*args->devinfo), GFP_KERNEL); + if (!args->devinfo) + return -ENOMEM; + + dev_count = 0; + spin_lock(&args->inode->i_lock); + list_for_each_entry(pls, &NFS_I(args->inode)->layout->plh_segs, pls_list) { + if (!ff_layout_mirror_prepare_stats(args, pls, &dev_count, + PNFS_LAYOUTSTATS_MAXDEV)) { + break; + } + } + spin_unlock(&args->inode->i_lock); + args->num_dev = dev_count; + + return 0; +} + +static void +ff_layout_cleanup_layoutstats(struct nfs42_layoutstat_data *data) +{ + struct nfs4_ff_layout_mirror *mirror; + int i; + + for (i = 0; i < data->args.num_dev; i++) { + mirror = data->args.devinfo[i].layout_private; + data->args.devinfo[i].layout_private = NULL; + pnfs_put_lseg(mirror->lseg); + } +} + static struct pnfs_layoutdriver_type flexfilelayout_type = { .id = LAYOUT_FLEX_FILES, .name = "LAYOUT_FLEX_FILES", @@ -1510,6 +1950,8 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = { .alloc_deviceid_node = ff_layout_alloc_deviceid_node, .encode_layoutreturn = ff_layout_encode_layoutreturn, .sync = pnfs_nfs_generic_sync, + .prepare_layoutstats = ff_layout_prepare_layoutstats, + .cleanup_layoutstats = ff_layout_cleanup_layoutstats, }; static int __init nfs4flexfilelayout_init(void) diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h index 070f20445..f92f9a0a8 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -9,12 +9,17 @@ #ifndef FS_NFS_NFS4FLEXFILELAYOUT_H #define FS_NFS_NFS4FLEXFILELAYOUT_H +#define FF_FLAGS_NO_LAYOUTCOMMIT 1 + #include "../pnfs.h" /* XXX: Let's filter out insanely large mirror count for now to avoid oom * due to network error etc. */ #define NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT 4096 +/* LAYOUTSTATS report interval in ms */ +#define FF_LAYOUTSTATS_REPORT_INTERVAL (60000L) + struct nfs4_ff_ds_version { u32 version; u32 minor_version; @@ -41,24 +46,48 @@ struct nfs4_ff_layout_ds_err { struct nfs4_deviceid deviceid; }; +struct nfs4_ff_io_stat { + __u64 ops_requested; + __u64 bytes_requested; + __u64 ops_completed; + __u64 bytes_completed; + __u64 bytes_not_delivered; + ktime_t total_busy_time; + ktime_t aggregate_completion_time; +}; + +struct nfs4_ff_busy_timer { + ktime_t start_time; + atomic_t n_ops; +}; + +struct nfs4_ff_layoutstat { + struct nfs4_ff_io_stat io_stat; + struct nfs4_ff_busy_timer busy_timer; +}; + struct nfs4_ff_layout_mirror { + struct pnfs_layout_segment *lseg; /* back pointer */ u32 ds_count; u32 efficiency; struct nfs4_ff_layout_ds *mirror_ds; u32 fh_versions_cnt; struct nfs_fh *fh_versions; nfs4_stateid stateid; - struct nfs4_string user_name; - struct nfs4_string group_name; u32 uid; u32 gid; struct rpc_cred *cred; spinlock_t lock; + struct nfs4_ff_layoutstat read_stat; + struct nfs4_ff_layoutstat write_stat; + ktime_t start_time; + ktime_t last_report_time; }; struct nfs4_ff_layout_segment { struct pnfs_layout_segment generic_hdr; u64 stripe_unit; + u32 flags; u32 mirror_array_cnt; struct nfs4_ff_layout_mirror **mirror_array; }; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 5d25b9d97..0adc7d245 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -442,8 +442,9 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); if (fattr->valid & NFS_ATTR_FATTR_CHANGE) inode->i_version = fattr->change_attr; - else if (nfs_server_capable(inode, NFS_CAP_CHANGE_ATTR)) - nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR); + else + nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATTR + | NFS_INO_REVAL_PAGECACHE); if (fattr->valid & NFS_ATTR_FATTR_SIZE) inode->i_size = nfs_size_to_loff_t(fattr->size); else @@ -678,6 +679,8 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) if (!err) { generic_fillattr(inode, stat); stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); + if (S_ISDIR(inode->i_mode)) + stat->blksize = NFS_SERVER(inode)->dtsize; } out: trace_nfs_getattr_exit(inode, err); @@ -1689,7 +1692,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfs_force_lookup_revalidate(inode); inode->i_version = fattr->change_attr; } - } else if (server->caps & NFS_CAP_CHANGE_ATTR) + } else nfsi->cache_validity |= save_cache_validity; if (fattr->valid & NFS_ATTR_FATTR_MTIME) { @@ -2008,17 +2011,15 @@ static int __init init_nfs_fs(void) if (err) goto out1; -#ifdef CONFIG_PROC_FS rpc_proc_register(&init_net, &nfs_rpcstat); -#endif - if ((err = register_nfs_fs()) != 0) + + err = register_nfs_fs(); + if (err) goto out0; return 0; out0: -#ifdef CONFIG_PROC_FS rpc_proc_unregister(&init_net, "nfs"); -#endif nfs_destroy_directcache(); out1: nfs_destroy_writepagecache(); @@ -2049,9 +2050,7 @@ static void __exit exit_nfs_fs(void) nfs_destroy_nfspagecache(); nfs_fscache_unregister(); unregister_pernet_subsys(&nfs_net_ops); -#ifdef CONFIG_PROC_FS rpc_proc_unregister(&init_net, "nfs"); -#endif unregister_nfs_fs(); nfs_fs_proc_exit(); nfsiod_stop(); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 9e6475bc5..9b372b845 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -296,6 +296,22 @@ extern struct rpc_procinfo nfs4_procedures[]; #ifdef CONFIG_NFS_V4_SECURITY_LABEL extern struct nfs4_label *nfs4_label_alloc(struct nfs_server *server, gfp_t flags); +static inline struct nfs4_label * +nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src) +{ + if (!dst || !src) + return NULL; + + if (src->len > NFS4_MAXLABELLEN) + return NULL; + + dst->lfs = src->lfs; + dst->pi = src->pi; + dst->len = src->len; + memcpy(dst->label, src->label, src->len); + + return dst; +} static inline void nfs4_label_free(struct nfs4_label *label) { if (label) { @@ -316,6 +332,11 @@ static inline void nfs4_label_free(void *label) {} static inline void nfs_zap_label_cache_locked(struct nfs_inode *nfsi) { } +static inline struct nfs4_label * +nfs4_label_copy(struct nfs4_label *dst, struct nfs4_label *src) +{ + return NULL; +} #endif /* CONFIG_NFS_V4_SECURITY_LABEL */ /* proc.c */ @@ -607,7 +628,7 @@ void nfs_mark_page_unstable(struct page *page) struct inode *inode = page_file_mapping(page)->host; inc_zone_page_state(page, NR_UNSTABLE_NFS); - inc_bdi_stat(inode_to_bdi(inode), BDI_RECLAIMABLE); + inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE); __mark_inode_dirty(inode, I_DIRTY_DATASYNC); } diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h index 7afb8947d..ff66ae700 100644 --- a/fs/nfs/nfs42.h +++ b/fs/nfs/nfs42.h @@ -5,11 +5,18 @@ #ifndef __LINUX_FS_NFS_NFS4_2_H #define __LINUX_FS_NFS_NFS4_2_H +/* + * FIXME: four LAYOUTSTATS calls per compound at most! Do we need to support + * more? Need to consider not to pre-alloc too much for a compound. + */ +#define PNFS_LAYOUTSTATS_MAXDEV (4) + /* nfs4.2proc.c */ int nfs42_proc_allocate(struct file *, loff_t, loff_t); int nfs42_proc_deallocate(struct file *, loff_t, loff_t); loff_t nfs42_proc_llseek(struct file *, loff_t, int); - +int nfs42_proc_layoutstats_generic(struct nfs_server *, + struct nfs42_layoutstat_data *); /* nfs4.2xdr.h */ extern struct rpc_procinfo nfs4_2_procedures[]; diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 3a9e75235..d731bbf97 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -10,6 +10,11 @@ #include <linux/nfs_fs.h> #include "nfs4_fs.h" #include "nfs42.h" +#include "iostat.h" +#include "pnfs.h" +#include "internal.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS static int nfs42_set_rw_stateid(nfs4_stateid *dst, struct file *file, fmode_t fmode) @@ -130,7 +135,7 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) return err; } -loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence) +static loff_t _nfs42_proc_llseek(struct file *filep, loff_t offset, int whence) { struct inode *inode = file_inode(filep); struct nfs42_seek_args args = { @@ -165,3 +170,102 @@ loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence) return vfs_setpos(filep, res.sr_offset, inode->i_sb->s_maxbytes); } + +loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence) +{ + struct nfs_server *server = NFS_SERVER(file_inode(filep)); + struct nfs4_exception exception = { }; + int err; + + do { + err = _nfs42_proc_llseek(filep, offset, whence); + if (err == -ENOTSUPP) + return -EOPNOTSUPP; + err = nfs4_handle_exception(server, err, &exception); + } while (exception.retry); + + return err; +} + + +static void +nfs42_layoutstat_prepare(struct rpc_task *task, void *calldata) +{ + struct nfs42_layoutstat_data *data = calldata; + struct nfs_server *server = NFS_SERVER(data->args.inode); + + nfs41_setup_sequence(nfs4_get_session(server), &data->args.seq_args, + &data->res.seq_res, task); +} + +static void +nfs42_layoutstat_done(struct rpc_task *task, void *calldata) +{ + struct nfs42_layoutstat_data *data = calldata; + + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return; + + switch (task->tk_status) { + case 0: + break; + case -ENOTSUPP: + case -EOPNOTSUPP: + NFS_SERVER(data->inode)->caps &= ~NFS_CAP_LAYOUTSTATS; + default: + dprintk("%s server returns %d\n", __func__, task->tk_status); + } +} + +static void +nfs42_layoutstat_release(void *calldata) +{ + struct nfs42_layoutstat_data *data = calldata; + struct nfs_server *nfss = NFS_SERVER(data->args.inode); + + if (nfss->pnfs_curr_ld->cleanup_layoutstats) + nfss->pnfs_curr_ld->cleanup_layoutstats(data); + + pnfs_put_layout_hdr(NFS_I(data->args.inode)->layout); + smp_mb__before_atomic(); + clear_bit(NFS_INO_LAYOUTSTATS, &NFS_I(data->args.inode)->flags); + smp_mb__after_atomic(); + nfs_iput_and_deactive(data->inode); + kfree(data->args.devinfo); + kfree(data); +} + +static const struct rpc_call_ops nfs42_layoutstat_ops = { + .rpc_call_prepare = nfs42_layoutstat_prepare, + .rpc_call_done = nfs42_layoutstat_done, + .rpc_release = nfs42_layoutstat_release, +}; + +int nfs42_proc_layoutstats_generic(struct nfs_server *server, + struct nfs42_layoutstat_data *data) +{ + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_LAYOUTSTATS], + .rpc_argp = &data->args, + .rpc_resp = &data->res, + }; + struct rpc_task_setup task_setup = { + .rpc_client = server->client, + .rpc_message = &msg, + .callback_ops = &nfs42_layoutstat_ops, + .callback_data = data, + .flags = RPC_TASK_ASYNC, + }; + struct rpc_task *task; + + data->inode = nfs_igrab_and_active(data->args.inode); + if (!data->inode) { + nfs42_layoutstat_release(data); + return -EAGAIN; + } + nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0); + task = rpc_run_task(&task_setup); + if (IS_ERR(task)) + return PTR_ERR(task); + return 0; +} diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 1a25b2724..a6bd27da6 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -4,6 +4,8 @@ #ifndef __LINUX_FS_NFS_NFS4_2XDR_H #define __LINUX_FS_NFS_NFS4_2XDR_H +#include "nfs42.h" + #define encode_fallocate_maxsz (encode_stateid_maxsz + \ 2 /* offset */ + \ 2 /* length */) @@ -22,6 +24,16 @@ 1 /* whence */ + \ 2 /* offset */ + \ 2 /* length */) +#define encode_io_info_maxsz 4 +#define encode_layoutstats_maxsz (op_decode_hdr_maxsz + \ + 2 /* offset */ + \ + 2 /* length */ + \ + encode_stateid_maxsz + \ + encode_io_info_maxsz + \ + encode_io_info_maxsz + \ + 1 /* opaque devaddr4 length */ + \ + XDR_QUADLEN(PNFS_LAYOUTSTATS_MAXSIZE)) +#define decode_layoutstats_maxsz (op_decode_hdr_maxsz) #define NFS4_enc_allocate_sz (compound_encode_hdr_maxsz + \ encode_putfh_maxsz + \ @@ -45,6 +57,14 @@ #define NFS4_dec_seek_sz (compound_decode_hdr_maxsz + \ decode_putfh_maxsz + \ decode_seek_maxsz) +#define NFS4_enc_layoutstats_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + PNFS_LAYOUTSTATS_MAXDEV * encode_layoutstats_maxsz) +#define NFS4_dec_layoutstats_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + PNFS_LAYOUTSTATS_MAXDEV * decode_layoutstats_maxsz) static void encode_fallocate(struct xdr_stream *xdr, @@ -81,6 +101,33 @@ static void encode_seek(struct xdr_stream *xdr, encode_uint32(xdr, args->sa_what); } +static void encode_layoutstats(struct xdr_stream *xdr, + struct nfs42_layoutstat_args *args, + struct nfs42_layoutstat_devinfo *devinfo, + struct compound_hdr *hdr) +{ + __be32 *p; + + encode_op_hdr(xdr, OP_LAYOUTSTATS, decode_layoutstats_maxsz, hdr); + p = reserve_space(xdr, 8 + 8); + p = xdr_encode_hyper(p, devinfo->offset); + p = xdr_encode_hyper(p, devinfo->length); + encode_nfs4_stateid(xdr, &args->stateid); + p = reserve_space(xdr, 4*8 + NFS4_DEVICEID4_SIZE + 4); + p = xdr_encode_hyper(p, devinfo->read_count); + p = xdr_encode_hyper(p, devinfo->read_bytes); + p = xdr_encode_hyper(p, devinfo->write_count); + p = xdr_encode_hyper(p, devinfo->write_bytes); + p = xdr_encode_opaque_fixed(p, devinfo->dev_id.data, + NFS4_DEVICEID4_SIZE); + /* Encode layoutupdate4 */ + *p++ = cpu_to_be32(devinfo->layout_type); + if (devinfo->layoutstats_encode != NULL) + devinfo->layoutstats_encode(xdr, args, devinfo); + else + encode_uint32(xdr, 0); +} + /* * Encode ALLOCATE request */ @@ -137,6 +184,28 @@ static void nfs4_xdr_enc_seek(struct rpc_rqst *req, encode_nops(&hdr); } +/* + * Encode LAYOUTSTATS request + */ +static void nfs4_xdr_enc_layoutstats(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs42_layoutstat_args *args) +{ + int i; + + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->fh, &hdr); + WARN_ON(args->num_dev > PNFS_LAYOUTSTATS_MAXDEV); + for (i = 0; i < args->num_dev; i++) + encode_layoutstats(xdr, args, &args->devinfo[i], &hdr); + encode_nops(&hdr); +} + static int decode_allocate(struct xdr_stream *xdr, struct nfs42_falloc_res *res) { return decode_op_hdr(xdr, OP_ALLOCATE); @@ -169,6 +238,12 @@ out_overflow: return -EIO; } +static int decode_layoutstats(struct xdr_stream *xdr, + struct nfs42_layoutstat_res *res) +{ + return decode_op_hdr(xdr, OP_LAYOUTSTATS); +} + /* * Decode ALLOCATE request */ @@ -246,4 +321,35 @@ static int nfs4_xdr_dec_seek(struct rpc_rqst *rqstp, out: return status; } + +/* + * Decode LAYOUTSTATS request + */ +static int nfs4_xdr_dec_layoutstats(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs42_layoutstat_res *res) +{ + struct compound_hdr hdr; + int status, i; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + WARN_ON(res->num_dev > PNFS_LAYOUTSTATS_MAXDEV); + for (i = 0; i < res->num_dev; i++) { + status = decode_layoutstats(xdr, res); + if (status) + goto out; + } +out: + res->rpc_status = status; + return status; +} + #endif /* __LINUX_FS_NFS_NFS4_2XDR_H */ diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index fdef424b0..ea3bee919 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -233,6 +233,7 @@ extern int nfs4_handle_exception(struct nfs_server *, int, struct nfs4_exception extern int nfs4_call_sync(struct rpc_clnt *, struct nfs_server *, struct rpc_message *, struct nfs4_sequence_args *, struct nfs4_sequence_res *, int); +extern void nfs4_init_sequence(struct nfs4_sequence_args *, struct nfs4_sequence_res *, int); extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *); extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *); extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *, bool); diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index e42be52a8..3aa6a9ba5 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -676,7 +676,6 @@ found: break; } - /* No matching nfs_client found. */ spin_unlock(&nn->nfs_client_lock); dprintk("NFS: <-- %s status = %d\n", __func__, status); nfs_put_client(prev); diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index f58c17b3b..dcd39d4e2 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -41,6 +41,10 @@ nfs4_file_open(struct inode *inode, struct file *filp) dprintk("NFS: open file(%pd2)\n", dentry); + err = nfs_check_flags(openflags); + if (err) + return err; + if ((openflags & O_ACCMODE) == 3) openflags--; diff --git a/fs/nfs/nfs4getroot.c b/fs/nfs/nfs4getroot.c index c0b3a16b4..039b3eb6d 100644 --- a/fs/nfs/nfs4getroot.c +++ b/fs/nfs/nfs4getroot.c @@ -35,13 +35,6 @@ int nfs4_get_rootfh(struct nfs_server *server, struct nfs_fh *mntfh, bool auth_p goto out; } - if (fsinfo.fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) { - printk(KERN_ERR "nfs4_get_rootfh:" - " getroot obtained referral\n"); - ret = -EREMOTE; - goto out; - } - memcpy(&server->fsid, &fsinfo.fattr->fsid, sizeof(server->fsid)); out: nfs_free_fattr(fsinfo.fattr); diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c index 2e1737c40..535dfc69c 100644 --- a/fs/nfs/nfs4idmap.c +++ b/fs/nfs/nfs4idmap.c @@ -494,12 +494,7 @@ nfs_idmap_delete(struct nfs_client *clp) int nfs_idmap_init(void) { - int ret; - ret = nfs_idmap_init_keyring(); - if (ret != 0) - goto out; -out: - return ret; + return nfs_idmap_init_keyring(); } void nfs_idmap_quit(void) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index d3f205126..3acb1eb72 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -356,6 +356,9 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_ case 0: return 0; case -NFS4ERR_OPENMODE: + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_BAD_STATEID: if (inode && nfs4_have_delegation(inode, FMODE_READ)) { nfs4_inode_return_delegation(inode); exception->retry = 1; @@ -367,15 +370,6 @@ int nfs4_handle_exception(struct nfs_server *server, int errorcode, struct nfs4_ if (ret < 0) break; goto wait_on_recovery; - case -NFS4ERR_DELEG_REVOKED: - case -NFS4ERR_ADMIN_REVOKED: - case -NFS4ERR_BAD_STATEID: - if (state == NULL) - break; - ret = nfs4_schedule_stateid_recovery(server, state); - if (ret < 0) - break; - goto wait_on_recovery; case -NFS4ERR_EXPIRED: if (state != NULL) { ret = nfs4_schedule_stateid_recovery(server, state); @@ -473,7 +467,10 @@ static void do_renew_lease(struct nfs_client *clp, unsigned long timestamp) static void renew_lease(const struct nfs_server *server, unsigned long timestamp) { - do_renew_lease(server->nfs_client, timestamp); + struct nfs_client *clp = server->nfs_client; + + if (!nfs4_has_session(clp)) + do_renew_lease(clp, timestamp); } struct nfs4_call_sync_data { @@ -482,8 +479,8 @@ struct nfs4_call_sync_data { struct nfs4_sequence_res *seq_res; }; -static void nfs4_init_sequence(struct nfs4_sequence_args *args, - struct nfs4_sequence_res *res, int cache_reply) +void nfs4_init_sequence(struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, int cache_reply) { args->sa_slot = NULL; args->sa_cache_this = cache_reply; @@ -622,8 +619,7 @@ int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res) clp = session->clp; do_renew_lease(clp, res->sr_timestamp); /* Check sequence flags */ - if (res->sr_status_flags != 0) - nfs4_schedule_lease_recovery(clp); + nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); nfs41_update_target_slotid(slot->table, slot, res); break; case 1: @@ -916,6 +912,7 @@ struct nfs4_opendata { struct nfs_open_confirmres c_res; struct nfs4_string owner_name; struct nfs4_string group_name; + struct nfs4_label *a_label; struct nfs_fattr f_attr; struct nfs4_label *f_label; struct dentry *dir; @@ -1019,6 +1016,10 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, if (IS_ERR(p->f_label)) goto err_free_p; + p->a_label = nfs4_label_alloc(server, gfp_mask); + if (IS_ERR(p->a_label)) + goto err_free_f; + alloc_seqid = server->nfs_client->cl_mvops->alloc_seqid; p->o_arg.seqid = alloc_seqid(&sp->so_seqid, gfp_mask); if (IS_ERR(p->o_arg.seqid)) @@ -1047,7 +1048,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, p->o_arg.server = server; p->o_arg.bitmask = nfs4_bitmask(server, label); p->o_arg.open_bitmap = &nfs4_fattr_bitmap[0]; - p->o_arg.label = label; + p->o_arg.label = nfs4_label_copy(p->a_label, label); p->o_arg.claim = nfs4_map_atomic_open_claim(server, claim); switch (p->o_arg.claim) { case NFS4_OPEN_CLAIM_NULL: @@ -1080,6 +1081,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, return p; err_free_label: + nfs4_label_free(p->a_label); +err_free_f: nfs4_label_free(p->f_label); err_free_p: kfree(p); @@ -1099,6 +1102,7 @@ static void nfs4_opendata_free(struct kref *kref) nfs4_put_open_state(p->state); nfs4_put_state_owner(p->owner); + nfs4_label_free(p->a_label); nfs4_label_free(p->f_label); dput(p->dir); @@ -1556,6 +1560,13 @@ static int nfs4_open_recover_helper(struct nfs4_opendata *opendata, fmode_t fmod struct nfs4_state *newstate; int ret; + if ((opendata->o_arg.claim == NFS4_OPEN_CLAIM_DELEGATE_CUR || + opendata->o_arg.claim == NFS4_OPEN_CLAIM_DELEG_CUR_FH) && + (opendata->o_arg.u.delegation_type & fmode) != fmode) + /* This mode can't have been delegated, so we must have + * a valid open_stateid to cover it - not need to reclaim. + */ + return 0; opendata->o_arg.open_flags = 0; opendata->o_arg.fmode = fmode; opendata->o_arg.share_access = nfs4_map_atomic_open_share( @@ -1687,6 +1698,7 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct "%d.\n", __func__, err); case 0: case -ENOENT: + case -EAGAIN: case -ESTALE: break; case -NFS4ERR_BADSESSION: @@ -3358,6 +3370,8 @@ static int nfs4_proc_lookup_common(struct rpc_clnt **clnt, struct inode *dir, goto out; case -NFS4ERR_MOVED: err = nfs4_get_referral(client, dir, name, fattr, fhandle); + if (err == -NFS4ERR_MOVED) + err = nfs4_handle_exception(NFS_SERVER(dir), err, &exception); goto out; case -NFS4ERR_WRONGSEC: err = -EPERM; @@ -4958,49 +4972,128 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp, memcpy(bootverf->data, verf, sizeof(bootverf->data)); } -static unsigned int -nfs4_init_nonuniform_client_string(struct nfs_client *clp, - char *buf, size_t len) +static int +nfs4_init_nonuniform_client_string(struct nfs_client *clp) { - unsigned int result; + int result; + size_t len; + char *str; + bool retried = false; if (clp->cl_owner_id != NULL) - return strlcpy(buf, clp->cl_owner_id, len); + return 0; +retry: + rcu_read_lock(); + len = 10 + strlen(clp->cl_ipaddr) + 1 + + strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR)) + + 1 + + strlen(rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO)) + + 1; + rcu_read_unlock(); + + if (len > NFS4_OPAQUE_LIMIT + 1) + return -EINVAL; + + /* + * Since this string is allocated at mount time, and held until the + * nfs_client is destroyed, we can use GFP_KERNEL here w/o worrying + * about a memory-reclaim deadlock. + */ + str = kmalloc(len, GFP_KERNEL); + if (!str) + return -ENOMEM; rcu_read_lock(); - result = scnprintf(buf, len, "Linux NFSv4.0 %s/%s %s", - clp->cl_ipaddr, - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_ADDR), - rpc_peeraddr2str(clp->cl_rpcclient, - RPC_DISPLAY_PROTO)); + result = scnprintf(str, len, "Linux NFSv4.0 %s/%s %s", + clp->cl_ipaddr, + rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR), + rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO)); rcu_read_unlock(); - clp->cl_owner_id = kstrdup(buf, GFP_KERNEL); - return result; + + /* Did something change? */ + if (result >= len) { + kfree(str); + if (retried) + return -EINVAL; + retried = true; + goto retry; + } + clp->cl_owner_id = str; + return 0; } -static unsigned int -nfs4_init_uniform_client_string(struct nfs_client *clp, - char *buf, size_t len) +static int +nfs4_init_uniquifier_client_string(struct nfs_client *clp) +{ + int result; + size_t len; + char *str; + + len = 10 + 10 + 1 + 10 + 1 + + strlen(nfs4_client_id_uniquifier) + 1 + + strlen(clp->cl_rpcclient->cl_nodename) + 1; + + if (len > NFS4_OPAQUE_LIMIT + 1) + return -EINVAL; + + /* + * Since this string is allocated at mount time, and held until the + * nfs_client is destroyed, we can use GFP_KERNEL here w/o worrying + * about a memory-reclaim deadlock. + */ + str = kmalloc(len, GFP_KERNEL); + if (!str) + return -ENOMEM; + + result = scnprintf(str, len, "Linux NFSv%u.%u %s/%s", + clp->rpc_ops->version, clp->cl_minorversion, + nfs4_client_id_uniquifier, + clp->cl_rpcclient->cl_nodename); + if (result >= len) { + kfree(str); + return -EINVAL; + } + clp->cl_owner_id = str; + return 0; +} + +static int +nfs4_init_uniform_client_string(struct nfs_client *clp) { - const char *nodename = clp->cl_rpcclient->cl_nodename; - unsigned int result; + int result; + size_t len; + char *str; if (clp->cl_owner_id != NULL) - return strlcpy(buf, clp->cl_owner_id, len); + return 0; if (nfs4_client_id_uniquifier[0] != '\0') - result = scnprintf(buf, len, "Linux NFSv%u.%u %s/%s", - clp->rpc_ops->version, - clp->cl_minorversion, - nfs4_client_id_uniquifier, - nodename); - else - result = scnprintf(buf, len, "Linux NFSv%u.%u %s", - clp->rpc_ops->version, clp->cl_minorversion, - nodename); - clp->cl_owner_id = kstrdup(buf, GFP_KERNEL); - return result; + return nfs4_init_uniquifier_client_string(clp); + + len = 10 + 10 + 1 + 10 + 1 + + strlen(clp->cl_rpcclient->cl_nodename) + 1; + + if (len > NFS4_OPAQUE_LIMIT + 1) + return -EINVAL; + + /* + * Since this string is allocated at mount time, and held until the + * nfs_client is destroyed, we can use GFP_KERNEL here w/o worrying + * about a memory-reclaim deadlock. + */ + str = kmalloc(len, GFP_KERNEL); + if (!str) + return -ENOMEM; + + result = scnprintf(str, len, "Linux NFSv%u.%u %s", + clp->rpc_ops->version, clp->cl_minorversion, + clp->cl_rpcclient->cl_nodename); + if (result >= len) { + kfree(str); + return -EINVAL; + } + clp->cl_owner_id = str; + return 0; } /* @@ -5047,7 +5140,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, struct nfs4_setclientid setclientid = { .sc_verifier = &sc_verifier, .sc_prog = program, - .sc_cb_ident = clp->cl_cb_ident, + .sc_clnt = clp, }; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SETCLIENTID], @@ -5067,16 +5160,15 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, /* nfs_client_id4 */ nfs4_init_boot_verifier(clp, &sc_verifier); + if (test_bit(NFS_CS_MIGRATION, &clp->cl_flags)) - setclientid.sc_name_len = - nfs4_init_uniform_client_string(clp, - setclientid.sc_name, - sizeof(setclientid.sc_name)); + status = nfs4_init_uniform_client_string(clp); else - setclientid.sc_name_len = - nfs4_init_nonuniform_client_string(clp, - setclientid.sc_name, - sizeof(setclientid.sc_name)); + status = nfs4_init_nonuniform_client_string(clp); + + if (status) + goto out; + /* cb_client4 */ setclientid.sc_netid_len = nfs4_init_callback_netid(clp, @@ -5086,9 +5178,9 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, sizeof(setclientid.sc_uaddr), "%s.%u.%u", clp->cl_ipaddr, port >> 8, port & 255); - dprintk("NFS call setclientid auth=%s, '%.*s'\n", + dprintk("NFS call setclientid auth=%s, '%s'\n", clp->cl_rpcclient->cl_auth->au_ops->au_name, - setclientid.sc_name_len, setclientid.sc_name); + clp->cl_owner_id); task = rpc_run_task(&task_setup_data); if (IS_ERR(task)) { status = PTR_ERR(task); @@ -5360,15 +5452,15 @@ static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock * return err; } -static int do_vfs_lock(struct file *file, struct file_lock *fl) +static int do_vfs_lock(struct inode *inode, struct file_lock *fl) { int res = 0; switch (fl->fl_flags & (FL_POSIX|FL_FLOCK)) { case FL_POSIX: - res = posix_lock_file_wait(file, fl); + res = posix_lock_inode_wait(inode, fl); break; case FL_FLOCK: - res = flock_lock_file_wait(file, fl); + res = flock_lock_inode_wait(inode, fl); break; default: BUG(); @@ -5428,7 +5520,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) switch (task->tk_status) { case 0: renew_lease(calldata->server, calldata->timestamp); - do_vfs_lock(calldata->fl.fl_file, &calldata->fl); + do_vfs_lock(calldata->lsp->ls_state->inode, &calldata->fl); if (nfs4_update_lock_stateid(calldata->lsp, &calldata->res.stateid)) break; @@ -5536,7 +5628,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * mutex_lock(&sp->so_delegreturn_mutex); /* Exclude nfs4_reclaim_open_stateid() - note nesting! */ down_read(&nfsi->rwsem); - if (do_vfs_lock(request->fl_file, request) == -ENOENT) { + if (do_vfs_lock(inode, request) == -ENOENT) { up_read(&nfsi->rwsem); mutex_unlock(&sp->so_delegreturn_mutex); goto out; @@ -5677,7 +5769,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) data->timestamp); if (data->arg.new_lock) { data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS); - if (do_vfs_lock(data->fl.fl_file, &data->fl) < 0) { + if (do_vfs_lock(lsp->ls_state->inode, &data->fl) < 0) { rpc_restart_call_prepare(task); break; } @@ -5919,7 +6011,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock if (status != 0) goto out; request->fl_flags |= FL_ACCESS; - status = do_vfs_lock(request->fl_file, request); + status = do_vfs_lock(state->inode, request); if (status < 0) goto out; down_read(&nfsi->rwsem); @@ -5927,7 +6019,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock /* Yes: cache locks! */ /* ...but avoid races with delegation recall... */ request->fl_flags = fl_flags & ~FL_SLEEP; - status = do_vfs_lock(request->fl_file, request); + status = do_vfs_lock(state->inode, request); up_read(&nfsi->rwsem); goto out; } @@ -6849,11 +6941,14 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, }; nfs4_init_boot_verifier(clp, &verifier); - args.id_len = nfs4_init_uniform_client_string(clp, args.id, - sizeof(args.id)); - dprintk("NFS call exchange_id auth=%s, '%.*s'\n", + + status = nfs4_init_uniform_client_string(clp); + if (status) + goto out; + + dprintk("NFS call exchange_id auth=%s, '%s'\n", clp->cl_rpcclient->cl_auth->au_ops->au_name, - args.id_len, args.id); + clp->cl_owner_id); res.server_owner = kzalloc(sizeof(struct nfs41_server_owner), GFP_NOFS); @@ -6888,7 +6983,7 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, /* unsupported! */ WARN_ON_ONCE(1); status = -EINVAL; - goto out_server_scope; + goto out_impl_id; } status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT); @@ -6916,6 +7011,7 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, /* use the most recent implementation id */ kfree(clp->cl_implid); clp->cl_implid = res.impl_id; + res.impl_id = NULL; if (clp->cl_serverscope != NULL && !nfs41_same_server_scope(clp->cl_serverscope, @@ -6929,15 +7025,16 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred, if (clp->cl_serverscope == NULL) { clp->cl_serverscope = res.server_scope; - goto out; + res.server_scope = NULL; } - } else - kfree(res.impl_id); + } -out_server_owner: - kfree(res.server_owner); +out_impl_id: + kfree(res.impl_id); out_server_scope: kfree(res.server_scope); +out_server_owner: + kfree(res.server_owner); out: if (clp->cl_implid != NULL) dprintk("NFS reply exchange_id: Server Implementation ID: " @@ -7487,13 +7584,8 @@ static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) goto out; } ret = rpc_wait_for_completion_task(task); - if (!ret) { - struct nfs4_sequence_res *res = task->tk_msg.rpc_resp; - - if (task->tk_status == 0) - nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); + if (!ret) ret = task->tk_status; - } rpc_put_task(task); out: dprintk("<-- %s status=%d\n", __func__, ret); @@ -7881,16 +7973,17 @@ static void nfs4_layoutreturn_release(void *calldata) { struct nfs4_layoutreturn *lrp = calldata; struct pnfs_layout_hdr *lo = lrp->args.layout; + LIST_HEAD(freeme); dprintk("--> %s\n", __func__); spin_lock(&lo->plh_inode->i_lock); if (lrp->res.lrs_present) pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); + pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range); pnfs_clear_layoutreturn_waitbit(lo); - clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags); - rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq); lo->plh_block_lgets--; spin_unlock(&lo->plh_inode->i_lock); + pnfs_free_lseg_list(&freeme); pnfs_put_layout_hdr(lrp->args.layout); nfs_iput_and_deactive(lrp->inode); kfree(calldata); @@ -8064,9 +8157,8 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync) struct rpc_task *task; int status = 0; - dprintk("NFS: %4d initiating layoutcommit call. sync %d " - "lbw: %llu inode %lu\n", - data->task.tk_pid, sync, + dprintk("NFS: initiating layoutcommit call. sync %d " + "lbw: %llu inode %lu\n", sync, data->args.lastbytewritten, data->args.inode->i_ino); @@ -8505,7 +8597,6 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { .minor_version = 0, .init_caps = NFS_CAP_READDIRPLUS | NFS_CAP_ATOMIC_OPEN - | NFS_CAP_CHANGE_ATTR | NFS_CAP_POSIX_LOCK, .init_client = nfs40_init_client, .shutdown_client = nfs40_shutdown_client, @@ -8531,7 +8622,6 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { .minor_version = 1, .init_caps = NFS_CAP_READDIRPLUS | NFS_CAP_ATOMIC_OPEN - | NFS_CAP_CHANGE_ATTR | NFS_CAP_POSIX_LOCK | NFS_CAP_STATEID_NFSV41 | NFS_CAP_ATOMIC_OPEN_V1, @@ -8554,13 +8644,13 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { .minor_version = 2, .init_caps = NFS_CAP_READDIRPLUS | NFS_CAP_ATOMIC_OPEN - | NFS_CAP_CHANGE_ATTR | NFS_CAP_POSIX_LOCK | NFS_CAP_STATEID_NFSV41 | NFS_CAP_ATOMIC_OPEN_V1 | NFS_CAP_ALLOCATE | NFS_CAP_DEALLOCATE - | NFS_CAP_SEEK, + | NFS_CAP_SEEK + | NFS_CAP_LAYOUTSTATS, .init_client = nfs41_init_client, .shutdown_client = nfs41_shutdown_client, .match_stateid = nfs41_match_stateid, diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index ddef1dc80..f2e2ad894 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -309,7 +309,6 @@ int nfs41_init_clientid(struct nfs_client *clp, struct rpc_cred *cred) if (test_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state)) goto do_confirm; - nfs4_begin_drain_session(clp); status = nfs4_proc_exchange_id(clp, cred); if (status != 0) goto out; @@ -1832,6 +1831,7 @@ static int nfs4_establish_lease(struct nfs_client *clp) clp->cl_mvops->reboot_recovery_ops; int status; + nfs4_begin_drain_session(clp); cred = nfs4_get_clid_cred(clp); if (cred == NULL) return -ENOENT; @@ -2191,25 +2191,35 @@ static void nfs41_handle_server_reboot(struct nfs_client *clp) } } -static void nfs41_handle_state_revoked(struct nfs_client *clp) +static void nfs41_handle_all_state_revoked(struct nfs_client *clp) { nfs4_reset_all_state(clp); dprintk("%s: state revoked on server %s\n", __func__, clp->cl_hostname); } +static void nfs41_handle_some_state_revoked(struct nfs_client *clp) +{ + nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce); + nfs4_schedule_state_manager(clp); + + dprintk("%s: state revoked on server %s\n", __func__, clp->cl_hostname); +} + static void nfs41_handle_recallable_state_revoked(struct nfs_client *clp) { - /* This will need to handle layouts too */ - nfs_expire_all_delegations(clp); + /* FIXME: For now, we destroy all layouts. */ + pnfs_destroy_all_layouts(clp); + /* FIXME: For now, we test all delegations+open state+locks. */ + nfs41_handle_some_state_revoked(clp); dprintk("%s: Recallable state revoked on server %s!\n", __func__, clp->cl_hostname); } static void nfs41_handle_backchannel_fault(struct nfs_client *clp) { - nfs_expire_all_delegations(clp); - if (test_and_set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) == 0) - nfs4_schedule_state_manager(clp); + set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); + nfs4_schedule_state_manager(clp); + dprintk("%s: server %s declared a backchannel fault\n", __func__, clp->cl_hostname); } @@ -2231,10 +2241,11 @@ void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED) nfs41_handle_server_reboot(clp); - if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED | - SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED | + if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED)) + nfs41_handle_all_state_revoked(clp); + if (flags & (SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED | SEQ4_STATUS_ADMIN_STATE_REVOKED)) - nfs41_handle_state_revoked(clp); + nfs41_handle_some_state_revoked(clp); if (flags & SEQ4_STATUS_LEASE_MOVED) nfs4_schedule_lease_moved_recovery(clp); if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 0aea97841..558cd65db 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -139,7 +139,8 @@ static int nfs4_stat_to_errno(int); #define encode_setclientid_maxsz \ (op_encode_hdr_maxsz + \ XDR_QUADLEN(NFS4_VERIFIER_SIZE) + \ - XDR_QUADLEN(NFS4_SETCLIENTID_NAMELEN) + \ + /* client name */ \ + 1 + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ 1 /* sc_prog */ + \ 1 + XDR_QUADLEN(RPCBIND_MAXNETIDLEN) + \ 1 + XDR_QUADLEN(RPCBIND_MAXUADDRLEN) + \ @@ -288,7 +289,8 @@ static int nfs4_stat_to_errno(int); #define encode_exchange_id_maxsz (op_encode_hdr_maxsz + \ encode_verifier_maxsz + \ 1 /* co_ownerid.len */ + \ - XDR_QUADLEN(NFS4_EXCHANGE_ID_LEN) + \ + /* eia_clientowner */ \ + 1 + XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \ 1 /* flags */ + \ 1 /* spa_how */ + \ /* max is SP4_MACH_CRED (for now) */ + \ @@ -1667,13 +1669,14 @@ static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclie encode_op_hdr(xdr, OP_SETCLIENTID, decode_setclientid_maxsz, hdr); encode_nfs4_verifier(xdr, setclientid->sc_verifier); - encode_string(xdr, setclientid->sc_name_len, setclientid->sc_name); + encode_string(xdr, strlen(setclientid->sc_clnt->cl_owner_id), + setclientid->sc_clnt->cl_owner_id); p = reserve_space(xdr, 4); *p = cpu_to_be32(setclientid->sc_prog); encode_string(xdr, setclientid->sc_netid_len, setclientid->sc_netid); encode_string(xdr, setclientid->sc_uaddr_len, setclientid->sc_uaddr); p = reserve_space(xdr, 4); - *p = cpu_to_be32(setclientid->sc_cb_ident); + *p = cpu_to_be32(setclientid->sc_clnt->cl_cb_ident); } static void encode_setclientid_confirm(struct xdr_stream *xdr, const struct nfs4_setclientid_res *arg, struct compound_hdr *hdr) @@ -1747,7 +1750,8 @@ static void encode_exchange_id(struct xdr_stream *xdr, encode_op_hdr(xdr, OP_EXCHANGE_ID, decode_exchange_id_maxsz, hdr); encode_nfs4_verifier(xdr, args->verifier); - encode_string(xdr, args->id_len, args->id); + encode_string(xdr, strlen(args->client->cl_owner_id), + args->client->cl_owner_id); encode_uint32(xdr, args->flags); encode_uint32(xdr, args->state_protect.how); @@ -7427,6 +7431,7 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(SEEK, enc_seek, dec_seek), PROC(ALLOCATE, enc_allocate, dec_allocate), PROC(DEALLOCATE, enc_deallocate, dec_deallocate), + PROC(LAYOUTSTATS, enc_layoutstats, dec_layoutstats), #endif /* CONFIG_NFS_V4_2 */ }; diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 7b4552678..4984bbe55 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -636,9 +636,8 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, hdr->rw_ops->rw_initiate(hdr, &msg, rpc_ops, &task_setup_data, how); - dprintk("NFS: %5u initiated pgio call " + dprintk("NFS: initiated pgio call " "(req %s/%llu, %u bytes @ offset %llu)\n", - hdr->task.tk_pid, hdr->inode->i_sb->s_id, (unsigned long long)NFS_FILEID(hdr->inode), hdr->args.count, @@ -690,8 +689,6 @@ static int nfs_pgio_error(struct nfs_pageio_descriptor *desc, static void nfs_pgio_release(void *calldata) { struct nfs_pgio_header *hdr = calldata; - if (hdr->rw_ops->rw_release) - hdr->rw_ops->rw_release(hdr); nfs_pgio_data_destroy(hdr); hdr->completion_ops->completion(hdr); } @@ -711,7 +708,9 @@ static void nfs_pageio_mirror_init(struct nfs_pgio_mirror *mirror, * nfs_pageio_init - initialise a page io descriptor * @desc: pointer to descriptor * @inode: pointer to inode - * @doio: pointer to io function + * @pg_ops: pointer to pageio operations + * @compl_ops: pointer to pageio completion operations + * @rw_ops: pointer to nfs read/write operations * @bsize: io block size * @io_flags: extra parameters for the io function */ @@ -1101,8 +1100,6 @@ static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc) mirror->pg_base = 0; mirror->pg_recoalesce = 0; - desc->pg_moreio = 0; - while (!list_empty(&head)) { struct nfs_page *req; @@ -1189,6 +1186,7 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, * nfs_pageio_complete_mirror - Complete I/O on the current mirror of an * nfs_pageio_descriptor * @desc: pointer to io descriptor + * @mirror_idx: pointer to mirror index */ static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc, u32 mirror_idx) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index d47c18868..70bf706b1 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -35,6 +35,7 @@ #include "iostat.h" #include "nfs4trace.h" #include "delegation.h" +#include "nfs42.h" #define NFSDBG_FACILITY NFSDBG_PNFS #define PNFS_LAYOUTGET_RETRY_TIMEOUT (120*HZ) @@ -351,7 +352,7 @@ pnfs_layout_need_return(struct pnfs_layout_hdr *lo, { struct pnfs_layout_segment *s; - if (!test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)) + if (!test_and_clear_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)) return false; list_for_each_entry(s, &lo->plh_segs, pls_list) @@ -361,6 +362,18 @@ pnfs_layout_need_return(struct pnfs_layout_hdr *lo, return true; } +static bool +pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo) +{ + if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) + return false; + lo->plh_return_iomode = 0; + lo->plh_block_lgets++; + pnfs_get_layout_hdr(lo); + clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags); + return true; +} + static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg, struct pnfs_layout_hdr *lo, struct inode *inode) { @@ -371,17 +384,16 @@ static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg, if (pnfs_layout_need_return(lo, lseg)) { nfs4_stateid stateid; enum pnfs_iomode iomode; + bool send; stateid = lo->plh_stateid; iomode = lo->plh_return_iomode; - /* decreased in pnfs_send_layoutreturn() */ - lo->plh_block_lgets++; - lo->plh_return_iomode = 0; + send = pnfs_prepare_layoutreturn(lo); spin_unlock(&inode->i_lock); - pnfs_get_layout_hdr(lo); - - /* Send an async layoutreturn so we dont deadlock */ - pnfs_send_layoutreturn(lo, stateid, iomode, false); + if (send) { + /* Send an async layoutreturn so we dont deadlock */ + pnfs_send_layoutreturn(lo, stateid, iomode, false); + } } else spin_unlock(&inode->i_lock); } @@ -410,6 +422,10 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg) pnfs_layoutreturn_before_put_lseg(lseg, lo, inode); if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { + if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) { + spin_unlock(&inode->i_lock); + return; + } pnfs_get_layout_hdr(lo); pnfs_layout_remove_lseg(lo, lseg); spin_unlock(&inode->i_lock); @@ -450,6 +466,8 @@ pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg) test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); if (atomic_dec_and_test(&lseg->pls_refcount)) { struct pnfs_layout_hdr *lo = lseg->pls_layout; + if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) + return; pnfs_get_layout_hdr(lo); pnfs_layout_remove_lseg(lo, lseg); pnfs_free_lseg_async(lseg); @@ -923,6 +941,7 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo) clear_bit_unlock(NFS_LAYOUT_RETURN, &lo->plh_flags); smp_mb__after_atomic(); wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN); + rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq); } static int @@ -977,6 +996,7 @@ _pnfs_return_layout(struct inode *ino) LIST_HEAD(tmp_list); nfs4_stateid stateid; int status = 0, empty; + bool send; dprintk("NFS: %s for inode %lu\n", __func__, ino->i_ino); @@ -1006,17 +1026,18 @@ _pnfs_return_layout(struct inode *ino) /* Don't send a LAYOUTRETURN if list was initially empty */ if (empty) { spin_unlock(&ino->i_lock); - pnfs_put_layout_hdr(lo); dprintk("NFS: %s no layout segments to return\n", __func__); - goto out; + goto out_put_layout_hdr; } set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - lo->plh_block_lgets++; + send = pnfs_prepare_layoutreturn(lo); spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&tmp_list); - - status = pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true); + if (send) + status = pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true); +out_put_layout_hdr: + pnfs_put_layout_hdr(lo); out: dprintk("<-- %s status: %d\n", __func__, status); return status; @@ -1096,13 +1117,9 @@ bool pnfs_roc(struct inode *ino) out_noroc: if (lo) { stateid = lo->plh_stateid; - layoutreturn = - test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, - &lo->plh_flags); - if (layoutreturn) { - lo->plh_block_lgets++; - pnfs_get_layout_hdr(lo); - } + if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, + &lo->plh_flags)) + layoutreturn = pnfs_prepare_layoutreturn(lo); } spin_unlock(&ino->i_lock); if (layoutreturn) { @@ -1145,15 +1162,18 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task) struct pnfs_layout_segment *lseg; nfs4_stateid stateid; u32 current_seqid; - bool found = false, layoutreturn = false; + bool layoutreturn = false; spin_lock(&ino->i_lock); - list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list) - if (test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { - rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL); - found = true; - goto out; - } + list_for_each_entry(lseg, &nfsi->layout->plh_segs, pls_list) { + if (!test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) + continue; + if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) + continue; + rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL); + spin_unlock(&ino->i_lock); + return true; + } lo = nfsi->layout; current_seqid = be32_to_cpu(lo->plh_stateid.seqid); @@ -1161,23 +1181,19 @@ bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task) * a barrier, we choose the worst-case barrier. */ *barrier = current_seqid + atomic_read(&lo->plh_outstanding); -out: - if (!found) { - stateid = lo->plh_stateid; - layoutreturn = - test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, - &lo->plh_flags); - if (layoutreturn) { - lo->plh_block_lgets++; - pnfs_get_layout_hdr(lo); - } - } + stateid = lo->plh_stateid; + if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, + &lo->plh_flags)) + layoutreturn = pnfs_prepare_layoutreturn(lo); + if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) + rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL); + spin_unlock(&ino->i_lock); if (layoutreturn) { - rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL); pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, false); + return true; } - return found; + return false; } /* @@ -1694,7 +1710,6 @@ void pnfs_error_mark_layout_for_return(struct inode *inode, spin_lock(&inode->i_lock); /* set failure bit so that pnfs path will be retried later */ pnfs_layout_set_fail_bit(lo, iomode); - set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags); if (lo->plh_return_iomode == 0) lo->plh_return_iomode = range.iomode; else if (lo->plh_return_iomode != range.iomode) @@ -2206,13 +2221,12 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) if (ld->prepare_layoutcommit) { status = ld->prepare_layoutcommit(&data->args); if (status) { + put_rpccred(data->cred); spin_lock(&inode->i_lock); set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags); if (end_pos > nfsi->layout->plh_lwb) nfsi->layout->plh_lwb = end_pos; - spin_unlock(&inode->i_lock); - put_rpccred(data->cred); - goto clear_layoutcommitting; + goto out_unlock; } } @@ -2250,3 +2264,63 @@ struct nfs4_threshold *pnfs_mdsthreshold_alloc(void) } return thp; } + +#if IS_ENABLED(CONFIG_NFS_V4_2) +int +pnfs_report_layoutstat(struct inode *inode) +{ + struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; + struct nfs_server *server = NFS_SERVER(inode); + struct nfs_inode *nfsi = NFS_I(inode); + struct nfs42_layoutstat_data *data; + struct pnfs_layout_hdr *hdr; + int status = 0; + + if (!pnfs_enabled_sb(server) || !ld->prepare_layoutstats) + goto out; + + if (!nfs_server_capable(inode, NFS_CAP_LAYOUTSTATS)) + goto out; + + if (test_and_set_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags)) + goto out; + + spin_lock(&inode->i_lock); + if (!NFS_I(inode)->layout) { + spin_unlock(&inode->i_lock); + goto out; + } + hdr = NFS_I(inode)->layout; + pnfs_get_layout_hdr(hdr); + spin_unlock(&inode->i_lock); + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) { + status = -ENOMEM; + goto out_put; + } + + data->args.fh = NFS_FH(inode); + data->args.inode = inode; + nfs4_stateid_copy(&data->args.stateid, &hdr->plh_stateid); + status = ld->prepare_layoutstats(&data->args); + if (status) + goto out_free; + + status = nfs42_proc_layoutstats_generic(NFS_SERVER(inode), data); + +out: + dprintk("%s returns %d\n", __func__, status); + return status; + +out_free: + kfree(data); +out_put: + pnfs_put_layout_hdr(hdr); + smp_mb__before_atomic(); + clear_bit(NFS_INO_LAYOUTSTATS, &nfsi->flags); + smp_mb__after_atomic(); + goto out; +} +EXPORT_SYMBOL_GPL(pnfs_report_layoutstat); +#endif diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 1e6308f82..3e6ab7bfb 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -178,6 +178,8 @@ struct pnfs_layoutdriver_type { void (*encode_layoutcommit) (struct pnfs_layout_hdr *lo, struct xdr_stream *xdr, const struct nfs4_layoutcommit_args *args); + int (*prepare_layoutstats) (struct nfs42_layoutstat_args *args); + void (*cleanup_layoutstats) (struct nfs42_layoutstat_data *data); }; struct pnfs_layout_hdr { @@ -290,7 +292,6 @@ int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *); struct nfs4_threshold *pnfs_mdsthreshold_alloc(void); void pnfs_error_mark_layout_for_return(struct inode *inode, struct pnfs_layout_segment *lseg); - /* nfs4_deviceid_flags */ enum { NFS_DEVICEID_INVALID = 0, /* set when MDS clientid recalled */ @@ -689,4 +690,14 @@ static inline void nfs4_pnfs_v3_ds_connect_unload(void) #endif /* CONFIG_NFS_V4_1 */ +#if IS_ENABLED(CONFIG_NFS_V4_2) +int pnfs_report_layoutstat(struct inode *inode); +#else +static inline int +pnfs_report_layoutstat(struct inode *inode) +{ + return 0; +} +#endif + #endif /* FS_NFS_PNFS_H */ diff --git a/fs/nfs/super.c b/fs/nfs/super.c index f175b833b..aa62004f1 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2847,7 +2847,7 @@ static int param_set_portnr(const char *val, const struct kernel_param *kp) *((unsigned int *)kp->arg) = num; return 0; } -static struct kernel_param_ops param_ops_portnr = { +static const struct kernel_param_ops param_ops_portnr = { .set = param_set_portnr, .get = param_get_uint, }; diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c index 2d5620065..b6de433da 100644 --- a/fs/nfs/symlink.c +++ b/fs/nfs/symlink.c @@ -20,7 +20,6 @@ #include <linux/stat.h> #include <linux/mm.h> #include <linux/string.h> -#include <linux/namei.h> /* Symlink caching in the page cache is even more simplistic * and straight-forward than readdir caching. @@ -43,7 +42,7 @@ error: return -EIO; } -static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *nfs_follow_link(struct dentry *dentry, void **cookie) { struct inode *inode = d_inode(dentry); struct page *page; @@ -51,19 +50,13 @@ static void *nfs_follow_link(struct dentry *dentry, struct nameidata *nd) err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping)); if (err) - goto read_failed; + return err; page = read_cache_page(&inode->i_data, 0, (filler_t *)nfs_symlink_filler, inode); - if (IS_ERR(page)) { - err = page; - goto read_failed; - } - nd_set_link(nd, kmap(page)); - return page; - -read_failed: - nd_set_link(nd, err); - return NULL; + if (IS_ERR(page)) + return ERR_CAST(page); + *cookie = page; + return kmap(page); } /* diff --git a/fs/nfs/write.c b/fs/nfs/write.c index daf355642..75a35a1af 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -853,7 +853,8 @@ static void nfs_clear_page_commit(struct page *page) { dec_zone_page_state(page, NR_UNSTABLE_NFS); - dec_bdi_stat(inode_to_bdi(page_file_mapping(page)->host), BDI_RECLAIMABLE); + dec_wb_stat(&inode_to_bdi(page_file_mapping(page)->host)->wb, + WB_RECLAIMABLE); } /* Called holding inode (/cinfo) lock */ @@ -1348,11 +1349,6 @@ void nfs_commit_prepare(struct rpc_task *task, void *calldata) NFS_PROTO(data->inode)->commit_rpc_prepare(task, data); } -static void nfs_writeback_release_common(struct nfs_pgio_header *hdr) -{ - /* do nothing! */ -} - /* * Special version of should_remove_suid() that ignores capabilities. */ @@ -1383,24 +1379,27 @@ static void nfs_writeback_check_extend(struct nfs_pgio_header *hdr, { struct nfs_pgio_args *argp = &hdr->args; struct nfs_pgio_res *resp = &hdr->res; + u64 size = argp->offset + resp->count; if (!(fattr->valid & NFS_ATTR_FATTR_SIZE)) + fattr->size = size; + if (nfs_size_to_loff_t(fattr->size) < i_size_read(hdr->inode)) { + fattr->valid &= ~NFS_ATTR_FATTR_SIZE; return; - if (argp->offset + resp->count != fattr->size) - return; - if (nfs_size_to_loff_t(fattr->size) < i_size_read(hdr->inode)) + } + if (size != fattr->size) return; /* Set attribute barrier */ nfs_fattr_set_barrier(fattr); + /* ...and update size */ + fattr->valid |= NFS_ATTR_FATTR_SIZE; } void nfs_writeback_update_inode(struct nfs_pgio_header *hdr) { - struct nfs_fattr *fattr = hdr->res.fattr; + struct nfs_fattr *fattr = &hdr->fattr; struct inode *inode = hdr->inode; - if (fattr == NULL) - return; spin_lock(&inode->i_lock); nfs_writeback_check_extend(hdr, fattr); nfs_post_op_update_inode_force_wcc_locked(inode, fattr); @@ -1556,7 +1555,7 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, /* Set up the initial task struct. */ nfs_ops->commit_setup(data, &msg); - dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); + dprintk("NFS: initiated commit call\n"); nfs4_state_protect(NFS_SERVER(data->inode)->nfs_client, NFS_SP4_MACH_CRED_COMMIT, &task_setup_data.rpc_client, &msg); @@ -2013,7 +2012,6 @@ static const struct nfs_rw_ops nfs_rw_write_ops = { .rw_mode = FMODE_WRITE, .rw_alloc_header = nfs_writehdr_alloc, .rw_free_header = nfs_writehdr_free, - .rw_release = nfs_writeback_release_common, .rw_done = nfs_writeback_done, .rw_result = nfs_writeback_result, .rw_initiate = nfs_initiate_write, diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index e4b2b4322..f6e7cbaba 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -805,7 +805,7 @@ encode_entry_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, static __be32 compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp, - const char *name, int namlen) + const char *name, int namlen, u64 ino) { struct svc_export *exp; struct dentry *dparent, *dchild; @@ -830,19 +830,21 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp, goto out; if (d_really_is_negative(dchild)) goto out; + if (dchild->d_inode->i_ino != ino) + goto out; rv = fh_compose(fhp, exp, dchild, &cd->fh); out: dput(dchild); return rv; } -static __be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen) +static __be32 *encode_entryplus_baggage(struct nfsd3_readdirres *cd, __be32 *p, const char *name, int namlen, u64 ino) { struct svc_fh *fh = &cd->scratch; __be32 err; fh_init(fh, NFS3_FHSIZE); - err = compose_entry_fh(cd, fh, name, namlen); + err = compose_entry_fh(cd, fh, name, namlen, ino); if (err) { *p++ = 0; *p++ = 0; @@ -927,7 +929,7 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen, p = encode_entry_baggage(cd, p, name, namlen, ino); if (plus) - p = encode_entryplus_baggage(cd, p, name, namlen); + p = encode_entryplus_baggage(cd, p, name, namlen, ino); num_entry_words = p - cd->buffer; } else if (*(page+1) != NULL) { /* temporarily encode entry into next page, then move back to @@ -941,7 +943,7 @@ encode_entry(struct readdir_cd *ccd, const char *name, int namlen, p1 = encode_entry_baggage(cd, p1, name, namlen, ino); if (plus) - p1 = encode_entryplus_baggage(cd, p1, name, namlen); + p1 = encode_entryplus_baggage(cd, p1, name, namlen, ino); /* determine entry word length and lengths to go in pages */ num_entry_words = p1 - tmp; diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 67242bf7c..eb5accf1b 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -52,10 +52,6 @@ #define NFS4_ANYONE_MODE (NFS4_ACE_READ_ATTRIBUTES | NFS4_ACE_READ_ACL | NFS4_ACE_SYNCHRONIZE) #define NFS4_OWNER_MODE (NFS4_ACE_WRITE_ATTRIBUTES | NFS4_ACE_WRITE_ACL) -/* We don't support these bits; insist they be neither allowed nor denied */ -#define NFS4_MASK_UNSUPP (NFS4_ACE_DELETE | NFS4_ACE_WRITE_OWNER \ - | NFS4_ACE_READ_NAMED_ATTRS | NFS4_ACE_WRITE_NAMED_ATTRS) - /* flags used to simulate posix default ACLs */ #define NFS4_INHERITANCE_FLAGS (NFS4_ACE_FILE_INHERIT_ACE \ | NFS4_ACE_DIRECTORY_INHERIT_ACE) @@ -64,9 +60,6 @@ | NFS4_ACE_INHERIT_ONLY_ACE \ | NFS4_ACE_IDENTIFIER_GROUP) -#define MASK_EQUAL(mask1, mask2) \ - ( ((mask1) & NFS4_ACE_MASK_ALL) == ((mask2) & NFS4_ACE_MASK_ALL) ) - static u32 mask_from_posix(unsigned short perm, unsigned int flags) { @@ -126,11 +119,6 @@ low_mode_from_nfs4(u32 perm, unsigned short *mode, unsigned int flags) *mode |= ACL_EXECUTE; } -struct ace_container { - struct nfs4_ace *ace; - struct list_head ace_l; -}; - static short ace2type(struct nfs4_ace *); static void _posix_to_nfsv4_one(struct posix_acl *, struct nfs4_acl *, unsigned int); @@ -384,7 +372,6 @@ pace_gt(struct posix_acl_entry *pace1, struct posix_acl_entry *pace2) static void sort_pacl_range(struct posix_acl *pacl, int start, int end) { int sorted = 0, i; - struct posix_acl_entry tmp; /* We just do a bubble sort; easy to do in place, and we're not * expecting acl's to be long enough to justify anything more. */ @@ -394,9 +381,8 @@ sort_pacl_range(struct posix_acl *pacl, int start, int end) { if (pace_gt(&pacl->a_entries[i], &pacl->a_entries[i+1])) { sorted = 0; - tmp = pacl->a_entries[i]; - pacl->a_entries[i] = pacl->a_entries[i+1]; - pacl->a_entries[i+1] = tmp; + swap(pacl->a_entries[i], + pacl->a_entries[i + 1]); } } } diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 5694cfb7a..a49201835 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -455,6 +455,7 @@ static int decode_cb_sequence4res(struct xdr_stream *xdr, if (unlikely(status || cb->cb_status)) return status; + cb->cb_update_seq_nr = true; return decode_cb_sequence4resok(xdr, cb); } @@ -875,6 +876,8 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) u32 minorversion = clp->cl_minorversion; cb->cb_minorversion = minorversion; + cb->cb_update_seq_nr = false; + cb->cb_status = 0; if (minorversion) { if (!nfsd41_cb_get_slot(clp, task)) return; @@ -891,9 +894,16 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata) clp->cl_minorversion); if (clp->cl_minorversion) { - /* No need for lock, access serialized in nfsd4_cb_prepare */ - if (!task->tk_status) + /* + * No need for lock, access serialized in nfsd4_cb_prepare + * + * RFC5661 20.9.3 + * If CB_SEQUENCE returns an error, then the state of the slot + * (sequence ID, cached reply) MUST NOT change. + */ + if (cb->cb_update_seq_nr) ++clp->cl_cb_session->se_cb_seq_nr; + clear_bit(0, &clp->cl_cb_slot_busy); rpc_wake_up_next(&clp->cl_cb_waitq); dprintk("%s: freed slot, new seqid=%d\n", __func__, @@ -1090,6 +1100,7 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, cb->cb_ops = ops; INIT_WORK(&cb->cb_work, nfsd4_run_cb_work); cb->cb_status = 0; + cb->cb_update_seq_nr = false; cb->cb_need_restart = false; } diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 6904213a4..ebf90e487 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -212,6 +212,7 @@ nfsd4_alloc_layout_stateid(struct nfsd4_compound_state *cstate, BUG_ON(!ls->ls_file); if (nfsd4_layout_setlease(ls)) { + fput(ls->ls_file); put_nfs4_file(fp); kmem_cache_free(nfs4_layout_stateid_cache, ls); return NULL; diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 864e2003e..90cfda753 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -760,8 +760,6 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { __be32 status; - /* no need to check permission - this will be done in nfsd_read() */ - read->rd_filp = NULL; if (read->rd_offset >= OFFSET_MAX) return nfserr_inval; @@ -778,9 +776,9 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); /* check stateid */ - if ((status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), - cstate, &read->rd_stateid, - RD_STATE, &read->rd_filp))) { + status = nfs4_preprocess_stateid_op(rqstp, cstate, &read->rd_stateid, + RD_STATE, &read->rd_filp, &read->rd_tmp_file); + if (status) { dprintk("NFSD: nfsd4_read: couldn't process stateid!\n"); goto out; } @@ -924,8 +922,8 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, int err; if (setattr->sa_iattr.ia_valid & ATTR_SIZE) { - status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate, - &setattr->sa_stateid, WR_STATE, NULL); + status = nfs4_preprocess_stateid_op(rqstp, cstate, + &setattr->sa_stateid, WR_STATE, NULL, NULL); if (status) { dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n"); return status; @@ -986,13 +984,11 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, unsigned long cnt; int nvecs; - /* no need to check permission - this will be done in nfsd_write() */ - if (write->wr_offset >= OFFSET_MAX) return nfserr_inval; - status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), - cstate, stateid, WR_STATE, &filp); + status = nfs4_preprocess_stateid_op(rqstp, cstate, stateid, WR_STATE, + &filp, NULL); if (status) { dprintk("NFSD: nfsd4_write: couldn't process stateid!\n"); return status; @@ -1005,11 +1001,10 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nvecs = fill_in_write_vector(rqstp->rq_vec, write); WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec)); - status = nfsd_write(rqstp, &cstate->current_fh, filp, - write->wr_offset, rqstp->rq_vec, nvecs, - &cnt, &write->wr_how_written); - if (filp) - fput(filp); + status = nfsd_vfs_write(rqstp, &cstate->current_fh, filp, + write->wr_offset, rqstp->rq_vec, nvecs, &cnt, + &write->wr_how_written); + fput(filp); write->wr_bytes_written = cnt; @@ -1023,15 +1018,13 @@ nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, __be32 status = nfserr_notsupp; struct file *file; - status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate, + status = nfs4_preprocess_stateid_op(rqstp, cstate, &fallocate->falloc_stateid, - WR_STATE, &file); + WR_STATE, &file, NULL); if (status != nfs_ok) { dprintk("NFSD: nfsd4_fallocate: couldn't process stateid!\n"); return status; } - if (!file) - return nfserr_bad_stateid; status = nfsd4_vfs_fallocate(rqstp, &cstate->current_fh, file, fallocate->falloc_offset, @@ -1064,15 +1057,13 @@ nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, __be32 status; struct file *file; - status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate, + status = nfs4_preprocess_stateid_op(rqstp, cstate, &seek->seek_stateid, - RD_STATE, &file); + RD_STATE, &file, NULL); if (status) { dprintk("NFSD: nfsd4_seek: couldn't process stateid!\n"); return status; } - if (!file) - return nfserr_bad_stateid; switch (seek->seek_whence) { case NFS4_CONTENT_DATA: @@ -1732,10 +1723,6 @@ encode_op: be32_to_cpu(status)); nfsd4_cstate_clear_replay(cstate); - /* XXX Ugh, we need to get rid of this kind of special case: */ - if (op->opnum == OP_READ && op->u.read.rd_filp) - fput(op->u.read.rd_filp); - nfsd4_increment_op_stats(op->opnum); } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 6e13504f7..95202719a 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -3861,7 +3861,7 @@ static __be32 nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_ol_stateid *stp, struct nfsd4_open *open) { __be32 status; - unsigned char old_deny_bmap; + unsigned char old_deny_bmap = stp->st_deny_bmap; if (!test_access(open->op_share_access, stp)) return nfs4_get_vfs_file(rqstp, fp, cur_fh, stp, open); @@ -3870,7 +3870,6 @@ nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *c spin_lock(&fp->fi_lock); status = nfs4_file_check_deny(fp, open->op_share_deny); if (status == nfs_ok) { - old_deny_bmap = stp->st_deny_bmap; set_deny(open->op_share_deny, stp); fp->fi_share_deny |= (open->op_share_deny & NFS4_SHARE_DENY_BOTH); @@ -4577,6 +4576,9 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, static struct file * nfs4_find_file(struct nfs4_stid *s, int flags) { + if (!s) + return NULL; + switch (s->sc_type) { case NFS4_DELEG_STID: if (WARN_ON_ONCE(!s->sc_file->fi_deleg_file)) @@ -4605,27 +4607,63 @@ nfs4_check_olstateid(struct svc_fh *fhp, struct nfs4_ol_stateid *ols, int flags) return nfs4_check_openmode(ols, flags); } +static __be32 +nfs4_check_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfs4_stid *s, + struct file **filpp, bool *tmp_file, int flags) +{ + int acc = (flags & RD_STATE) ? NFSD_MAY_READ : NFSD_MAY_WRITE; + struct file *file; + __be32 status; + + file = nfs4_find_file(s, flags); + if (file) { + status = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, + acc | NFSD_MAY_OWNER_OVERRIDE); + if (status) { + fput(file); + return status; + } + + *filpp = file; + } else { + status = nfsd_open(rqstp, fhp, S_IFREG, acc, filpp); + if (status) + return status; + + if (tmp_file) + *tmp_file = true; + } + + return 0; +} + /* * Checks for stateid operations */ __be32 -nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate, - stateid_t *stateid, int flags, struct file **filpp) +nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, stateid_t *stateid, + int flags, struct file **filpp, bool *tmp_file) { struct svc_fh *fhp = &cstate->current_fh; struct inode *ino = d_inode(fhp->fh_dentry); + struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); - struct nfs4_stid *s; + struct nfs4_stid *s = NULL; __be32 status; if (filpp) *filpp = NULL; + if (tmp_file) + *tmp_file = false; if (grace_disallows_io(net, ino)) return nfserr_grace; - if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) - return check_special_stateids(net, fhp, stateid, flags); + if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) { + status = check_special_stateids(net, fhp, stateid, flags); + goto done; + } status = nfsd4_lookup_stateid(cstate, stateid, NFS4_DELEG_STID|NFS4_OPEN_STID|NFS4_LOCK_STID, @@ -4653,13 +4691,12 @@ nfs4_preprocess_stateid_op(struct net *net, struct nfsd4_compound_state *cstate, goto out; status = nfs4_check_fh(fhp, s); - if (!status && filpp) { - *filpp = nfs4_find_file(s, flags); - if (!*filpp) - status = nfserr_serverfault; - } +done: + if (!status && filpp) + status = nfs4_check_file(rqstp, fhp, s, filpp, tmp_file, flags); out: - nfs4_put_stid(s); + if (s) + nfs4_put_stid(s); return status; } @@ -5512,7 +5549,7 @@ static __be32 nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct __be32 err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); if (!err) { err = nfserrno(vfs_test_lock(file, lock)); - nfsd_close(file); + fput(file); } return err; } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index d4d84451e..75e0563c0 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -33,6 +33,7 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ +#include <linux/file.h> #include <linux/slab.h> #include <linux/namei.h> #include <linux/statfs.h> @@ -2229,7 +2230,6 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, u32 rdattr_err = 0; __be32 status; int err; - int aclsupport = 0; struct nfs4_acl *acl = NULL; void *context = NULL; int contextlen; @@ -2275,19 +2275,15 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, goto out; fhp = tempfh; } - if (bmval0 & (FATTR4_WORD0_ACL | FATTR4_WORD0_ACLSUPPORT - | FATTR4_WORD0_SUPPORTED_ATTRS)) { + if (bmval0 & FATTR4_WORD0_ACL) { err = nfsd4_get_nfs4_acl(rqstp, dentry, &acl); - aclsupport = (err == 0); - if (bmval0 & FATTR4_WORD0_ACL) { - if (err == -EOPNOTSUPP) - bmval0 &= ~FATTR4_WORD0_ACL; - else if (err == -EINVAL) { - status = nfserr_attrnotsupp; - goto out; - } else if (err != 0) - goto out_nfserr; - } + if (err == -EOPNOTSUPP) + bmval0 &= ~FATTR4_WORD0_ACL; + else if (err == -EINVAL) { + status = nfserr_attrnotsupp; + goto out; + } else if (err != 0) + goto out_nfserr; } #ifdef CONFIG_NFSD_V4_SECURITY_LABEL @@ -2339,7 +2335,7 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, u32 word1 = nfsd_suppattrs1(minorversion); u32 word2 = nfsd_suppattrs2(minorversion); - if (!aclsupport) + if (!IS_POSIXACL(dentry->d_inode)) word0 &= ~FATTR4_WORD0_ACL; if (!contextsupport) word2 &= ~FATTR4_WORD2_SECURITY_LABEL; @@ -2487,7 +2483,7 @@ out_acl: p = xdr_reserve_space(xdr, 4); if (!p) goto out_resource; - *p++ = cpu_to_be32(aclsupport ? + *p++ = cpu_to_be32(IS_POSIXACL(dentry->d_inode) ? ACL4_SUPPORT_ALLOW_ACL|ACL4_SUPPORT_DENY_ACL : 0); } if (bmval0 & FATTR4_WORD0_CANSETTIME) { @@ -3423,52 +3419,51 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, unsigned long maxcount; struct xdr_stream *xdr = &resp->xdr; struct file *file = read->rd_filp; - struct svc_fh *fhp = read->rd_fhp; int starting_len = xdr->buf->len; - struct raparms *ra; + struct raparms *ra = NULL; __be32 *p; - __be32 err; if (nfserr) - return nfserr; + goto out; p = xdr_reserve_space(xdr, 8); /* eof flag and byte count */ if (!p) { WARN_ON_ONCE(test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)); - return nfserr_resource; + nfserr = nfserr_resource; + goto out; } - if (resp->xdr.buf->page_len && test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) { + if (resp->xdr.buf->page_len && + test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) { WARN_ON_ONCE(1); - return nfserr_resource; + nfserr = nfserr_resource; + goto out; } xdr_commit_encode(xdr); maxcount = svc_max_payload(resp->rqstp); - maxcount = min_t(unsigned long, maxcount, (xdr->buf->buflen - xdr->buf->len)); + maxcount = min_t(unsigned long, maxcount, + (xdr->buf->buflen - xdr->buf->len)); maxcount = min_t(unsigned long, maxcount, read->rd_length); - if (read->rd_filp) - err = nfsd_permission(resp->rqstp, fhp->fh_export, - fhp->fh_dentry, - NFSD_MAY_READ|NFSD_MAY_OWNER_OVERRIDE); - else - err = nfsd_get_tmp_read_open(resp->rqstp, read->rd_fhp, - &file, &ra); - if (err) - goto err_truncate; + if (read->rd_tmp_file) + ra = nfsd_init_raparms(file); - if (file->f_op->splice_read && test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) - err = nfsd4_encode_splice_read(resp, read, file, maxcount); + if (file->f_op->splice_read && + test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) + nfserr = nfsd4_encode_splice_read(resp, read, file, maxcount); else - err = nfsd4_encode_readv(resp, read, file, maxcount); + nfserr = nfsd4_encode_readv(resp, read, file, maxcount); - if (!read->rd_filp) - nfsd_put_tmp_read_open(file, ra); + if (ra) + nfsd_put_raparams(file, ra); -err_truncate: - if (err) + if (nfserr) xdr_truncate_encode(xdr, starting_len); - return err; + +out: + if (file) + fput(file); + return nfserr; } static __be32 diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index aecbcd34d..4cd78ef4c 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -59,13 +59,61 @@ static __be32 nfsd_proc_setattr(struct svc_rqst *rqstp, struct nfsd_sattrargs *argp, struct nfsd_attrstat *resp) { + struct iattr *iap = &argp->attrs; + struct svc_fh *fhp; __be32 nfserr; + dprintk("nfsd: SETATTR %s, valid=%x, size=%ld\n", SVCFH_fmt(&argp->fh), argp->attrs.ia_valid, (long) argp->attrs.ia_size); - fh_copy(&resp->fh, &argp->fh); - nfserr = nfsd_setattr(rqstp, &resp->fh, &argp->attrs,0, (time_t)0); + fhp = fh_copy(&resp->fh, &argp->fh); + + /* + * NFSv2 does not differentiate between "set-[ac]time-to-now" + * which only requires access, and "set-[ac]time-to-X" which + * requires ownership. + * So if it looks like it might be "set both to the same time which + * is close to now", and if inode_change_ok fails, then we + * convert to "set to now" instead of "set to explicit time" + * + * We only call inode_change_ok as the last test as technically + * it is not an interface that we should be using. + */ +#define BOTH_TIME_SET (ATTR_ATIME_SET | ATTR_MTIME_SET) +#define MAX_TOUCH_TIME_ERROR (30*60) + if ((iap->ia_valid & BOTH_TIME_SET) == BOTH_TIME_SET && + iap->ia_mtime.tv_sec == iap->ia_atime.tv_sec) { + /* + * Looks probable. + * + * Now just make sure time is in the right ballpark. + * Solaris, at least, doesn't seem to care what the time + * request is. We require it be within 30 minutes of now. + */ + time_t delta = iap->ia_atime.tv_sec - get_seconds(); + struct inode *inode; + + nfserr = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP); + if (nfserr) + goto done; + inode = d_inode(fhp->fh_dentry); + + if (delta < 0) + delta = -delta; + if (delta < MAX_TOUCH_TIME_ERROR && + inode_change_ok(inode, iap) != 0) { + /* + * Turn off ATTR_[AM]TIME_SET but leave ATTR_[AM]TIME. + * This will cause notify_change to set these times + * to "now" + */ + iap->ia_valid &= ~BOTH_TIME_SET; + } + } + + nfserr = nfsd_setattr(rqstp, fhp, iap, 0, (time_t)0); +done: return nfsd_return_attrs(nfserr, resp); } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index dbc4f85a5..4874ce515 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -68,6 +68,7 @@ struct nfsd4_callback { struct nfsd4_callback_ops *cb_ops; struct work_struct cb_work; int cb_status; + bool cb_update_seq_nr; bool cb_need_restart; }; @@ -582,9 +583,9 @@ enum nfsd4_cb_op { struct nfsd4_compound_state; struct nfsd_net; -extern __be32 nfs4_preprocess_stateid_op(struct net *net, - struct nfsd4_compound_state *cstate, - stateid_t *stateid, int flags, struct file **filp); +extern __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, + struct nfsd4_compound_state *cstate, stateid_t *stateid, + int flags, struct file **filp, bool *tmp_file); __be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, stateid_t *stateid, unsigned char typemask, struct nfs4_stid **s, struct nfsd_net *nn); diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 84d770be0..b5e077a6e 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -302,42 +302,6 @@ commit_metadata(struct svc_fh *fhp) static void nfsd_sanitize_attrs(struct inode *inode, struct iattr *iap) { - /* - * NFSv2 does not differentiate between "set-[ac]time-to-now" - * which only requires access, and "set-[ac]time-to-X" which - * requires ownership. - * So if it looks like it might be "set both to the same time which - * is close to now", and if inode_change_ok fails, then we - * convert to "set to now" instead of "set to explicit time" - * - * We only call inode_change_ok as the last test as technically - * it is not an interface that we should be using. - */ -#define BOTH_TIME_SET (ATTR_ATIME_SET | ATTR_MTIME_SET) -#define MAX_TOUCH_TIME_ERROR (30*60) - if ((iap->ia_valid & BOTH_TIME_SET) == BOTH_TIME_SET && - iap->ia_mtime.tv_sec == iap->ia_atime.tv_sec) { - /* - * Looks probable. - * - * Now just make sure time is in the right ballpark. - * Solaris, at least, doesn't seem to care what the time - * request is. We require it be within 30 minutes of now. - */ - time_t delta = iap->ia_atime.tv_sec - get_seconds(); - if (delta < 0) - delta = -delta; - if (delta < MAX_TOUCH_TIME_ERROR && - inode_change_ok(inode, iap) != 0) { - /* - * Turn off ATTR_[AM]TIME_SET but leave ATTR_[AM]TIME. - * This will cause notify_change to set these times - * to "now" - */ - iap->ia_valid &= ~BOTH_TIME_SET; - } - } - /* sanitize the mode change */ if (iap->ia_valid & ATTR_MODE) { iap->ia_mode &= S_IALLUGO; @@ -538,16 +502,11 @@ __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, loff_t len, int flags) { - __be32 err; int error; if (!S_ISREG(file_inode(file)->i_mode)) return nfserr_inval; - err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, NFSD_MAY_WRITE); - if (err) - return err; - error = vfs_fallocate(file, flags, offset, len); if (!error) error = commit_metadata(fhp); @@ -744,7 +703,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, host_err = ima_file_check(file, may_flags, 0); if (host_err) { - nfsd_close(file); + fput(file); goto out_nfserr; } @@ -761,23 +720,12 @@ out: return err; } -/* - * Close a file. - */ -void -nfsd_close(struct file *filp) -{ - fput(filp); -} - -/* - * Obtain the readahead parameters for the file - * specified by (dev, ino). - */ - -static inline struct raparms * -nfsd_get_raparms(dev_t dev, ino_t ino) +struct raparms * +nfsd_init_raparms(struct file *file) { + struct inode *inode = file_inode(file); + dev_t dev = inode->i_sb->s_dev; + ino_t ino = inode->i_ino; struct raparms *ra, **rap, **frap = NULL; int depth = 0; unsigned int hash; @@ -814,9 +762,23 @@ found: ra->p_count++; nfsdstats.ra_depth[depth*10/nfsdstats.ra_size]++; spin_unlock(&rab->pb_lock); + + if (ra->p_set) + file->f_ra = ra->p_ra; return ra; } +void nfsd_put_raparams(struct file *file, struct raparms *ra) +{ + struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex]; + + spin_lock(&rab->pb_lock); + ra->p_ra = file->f_ra; + ra->p_set = 1; + ra->p_count--; + spin_unlock(&rab->pb_lock); +} + /* * Grab and keep cached pages associated with a file in the svc_rqst * so that they can be passed to the network sendmsg/sendpage routines @@ -945,7 +907,7 @@ static int wait_for_concurrent_writes(struct file *file) return err; } -static __be32 +__be32 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, struct kvec *vec, int vlen, unsigned long *cnt, int *stablep) @@ -1009,40 +971,6 @@ out_nfserr: return err; } -__be32 nfsd_get_tmp_read_open(struct svc_rqst *rqstp, struct svc_fh *fhp, - struct file **file, struct raparms **ra) -{ - struct inode *inode; - __be32 err; - - err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, file); - if (err) - return err; - - inode = file_inode(*file); - - /* Get readahead parameters */ - *ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino); - - if (*ra && (*ra)->p_set) - (*file)->f_ra = (*ra)->p_ra; - return nfs_ok; -} - -void nfsd_put_tmp_read_open(struct file *file, struct raparms *ra) -{ - /* Write back readahead params */ - if (ra) { - struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex]; - spin_lock(&rab->pb_lock); - ra->p_ra = file->f_ra; - ra->p_set = 1; - ra->p_count--; - spin_unlock(&rab->pb_lock); - } - nfsd_close(file); -} - /* * Read data from a file. count must contain the requested read count * on entry. On return, *count contains the number of bytes actually read. @@ -1055,13 +983,15 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct raparms *ra; __be32 err; - err = nfsd_get_tmp_read_open(rqstp, fhp, &file, &ra); + err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); if (err) return err; + ra = nfsd_init_raparms(file); err = nfsd_vfs_read(rqstp, file, offset, vec, vlen, count); - - nfsd_put_tmp_read_open(file, ra); + if (ra) + nfsd_put_raparams(file, ra); + fput(file); return err; } @@ -1093,7 +1023,7 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, if (cnt) err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, stablep); - nfsd_close(file); + fput(file); } out: return err; @@ -1138,7 +1068,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, err = nfserr_notsupp; } - nfsd_close(file); + fput(file); out: return err; } @@ -1977,7 +1907,7 @@ nfsd_readdir(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t *offsetp, if (err == nfserr_eof || err == nfserr_toosmall) err = nfs_ok; /* can still be found in ->err */ out_close: - nfsd_close(file); + fput(file); out: return err; } diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 2050cb016..5be875e3e 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -71,11 +71,7 @@ __be32 nfsd_commit(struct svc_rqst *, struct svc_fh *, #endif /* CONFIG_NFSD_V3 */ __be32 nfsd_open(struct svc_rqst *, struct svc_fh *, umode_t, int, struct file **); -void nfsd_close(struct file *); struct raparms; -__be32 nfsd_get_tmp_read_open(struct svc_rqst *, struct svc_fh *, - struct file **, struct raparms **); -void nfsd_put_tmp_read_open(struct file *, struct raparms *); __be32 nfsd_splice_read(struct svc_rqst *, struct file *, loff_t, unsigned long *); __be32 nfsd_readv(struct file *, loff_t, struct kvec *, int, @@ -84,6 +80,10 @@ __be32 nfsd_read(struct svc_rqst *, struct svc_fh *, loff_t, struct kvec *, int, unsigned long *); __be32 nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *, loff_t, struct kvec *,int, unsigned long *, int *); +__be32 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct file *file, loff_t offset, + struct kvec *vec, int vlen, unsigned long *cnt, + int *stablep); __be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *, char *, int *); __be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *, @@ -104,6 +104,9 @@ __be32 nfsd_statfs(struct svc_rqst *, struct svc_fh *, __be32 nfsd_permission(struct svc_rqst *, struct svc_export *, struct dentry *, int); +struct raparms *nfsd_init_raparms(struct file *file); +void nfsd_put_raparams(struct file *file, struct raparms *ra); + static inline int fh_want_write(struct svc_fh *fh) { int ret = mnt_want_write(fh->fh_export->ex_path.mnt); diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 2f8c092be..9f991007a 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -273,6 +273,7 @@ struct nfsd4_read { u32 rd_length; /* request */ int rd_vlen; struct file *rd_filp; + bool rd_tmp_file; struct svc_rqst *rd_rqstp; /* response */ struct svc_fh * rd_fhp; /* response */ diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 0ee0bed36..6b8b92b19 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -61,11 +61,6 @@ static inline void nilfs_put_page(struct page *page) page_cache_release(page); } -static inline unsigned long dir_pages(struct inode *inode) -{ - return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; -} - /* * Return the offset into page `page_nr' of the last valid * byte in that page, plus one. diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 258d9fe25..4a73d6dff 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -307,31 +307,13 @@ static int nilfs_write_end(struct file *file, struct address_space *mapping, static ssize_t nilfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) { - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = file->f_mapping->host; - size_t count = iov_iter_count(iter); - ssize_t size; + struct inode *inode = file_inode(iocb->ki_filp); if (iov_iter_rw(iter) == WRITE) return 0; /* Needs synchronization with the cleaner */ - size = blockdev_direct_IO(iocb, inode, iter, offset, nilfs_get_block); - - /* - * In case of error extending write may have instantiated a few - * blocks outside i_size. Trim these off again. - */ - if (unlikely(iov_iter_rw(iter) == WRITE && size < 0)) { - loff_t isize = i_size_read(inode); - loff_t end = offset + count; - - if (end > isize) - nilfs_write_failed(mapping, end); - } - - return size; + return blockdev_direct_IO(iocb, inode, iter, offset, nilfs_get_block); } const struct address_space_operations nilfs_aops = { diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 9a20e513d..aba43811d 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -1369,7 +1369,6 @@ long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case NILFS_IOCTL_SYNC: case NILFS_IOCTL_RESIZE: case NILFS_IOCTL_SET_ALLOC_RANGE: - case FITRIM: break; default: return -ENOIOCTLCMD; diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 22180836e..37dd6b05b 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -496,8 +496,7 @@ static struct dentry *nilfs_fh_to_dentry(struct super_block *sb, struct fid *fh, { struct nilfs_fid *fid = (struct nilfs_fid *)fh; - if ((fh_len != NILFS_FID_SIZE_NON_CONNECTABLE && - fh_len != NILFS_FID_SIZE_CONNECTABLE) || + if (fh_len < NILFS_FID_SIZE_NON_CONNECTABLE || (fh_type != FILEID_NILFS_WITH_PARENT && fh_type != FILEID_NILFS_WITHOUT_PARENT)) return NULL; @@ -510,7 +509,7 @@ static struct dentry *nilfs_fh_to_parent(struct super_block *sb, struct fid *fh, { struct nilfs_fid *fid = (struct nilfs_fid *)fh; - if (fh_len != NILFS_FID_SIZE_CONNECTABLE || + if (fh_len < NILFS_FID_SIZE_CONNECTABLE || fh_type != FILEID_NILFS_WITH_PARENT) return NULL; diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c index dc3a9efda..42468e5ab 100644 --- a/fs/nilfs2/segbuf.c +++ b/fs/nilfs2/segbuf.c @@ -343,11 +343,6 @@ static void nilfs_end_bio_write(struct bio *bio, int err) const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); struct nilfs_segment_buffer *segbuf = bio->bi_private; - if (err == -EOPNOTSUPP) { - set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); - /* to be detected by nilfs_segbuf_submit_bio() */ - } - if (!uptodate) atomic_inc(&segbuf->sb_err); @@ -374,15 +369,8 @@ static int nilfs_segbuf_submit_bio(struct nilfs_segment_buffer *segbuf, bio->bi_end_io = nilfs_end_bio_write; bio->bi_private = segbuf; - bio_get(bio); submit_bio(mode, bio); segbuf->sb_nbio++; - if (bio_flagged(bio, BIO_EOPNOTSUPP)) { - bio_put(bio); - err = -EOPNOTSUPP; - goto failed; - } - bio_put(bio); wi->bio = NULL; wi->rest_blocks -= wi->end - wi->start; diff --git a/fs/notify/group.c b/fs/notify/group.c index 06ca6bc4a..d16b62cb2 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -22,7 +22,6 @@ #include <linux/srcu.h> #include <linux/rculist.h> #include <linux/wait.h> -#include <linux/module.h> #include <linux/fsnotify_backend.h> #include "fsnotify.h" @@ -73,7 +72,6 @@ void fsnotify_get_group(struct fsnotify_group *group) { atomic_inc(&group->refcnt); } -EXPORT_SYMBOL(fsnotify_get_group); /* * Drop a reference to a group. Free it if it's through. @@ -83,7 +81,6 @@ void fsnotify_put_group(struct fsnotify_group *group) if (atomic_dec_and_test(&group->refcnt)) fsnotify_final_destroy_group(group); } -EXPORT_SYMBOL(fsnotify_put_group); /* * Create a new fsnotify_group and hold a reference for the group returned. @@ -112,7 +109,6 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops) return group; } -EXPORT_SYMBOL(fsnotify_alloc_group); int fsnotify_fasync(int fd, struct file *file, int on) { diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 450648697..5b1e2a497 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -26,7 +26,7 @@ #include <linux/fs.h> /* struct inode */ #include <linux/fsnotify_backend.h> #include <linux/idr.h> -#include <linux/init.h> /* module_init */ +#include <linux/init.h> /* fs_initcall */ #include <linux/inotify.h> #include <linux/kernel.h> /* roundup() */ #include <linux/namei.h> /* LOOKUP_FOLLOW */ @@ -812,4 +812,4 @@ static int __init inotify_user_setup(void) return 0; } -module_init(inotify_user_setup); +fs_initcall(inotify_user_setup); diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 39a838ad2..39ddcaf09 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -109,7 +109,6 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) mark->free_mark(mark); } } -EXPORT_SYMBOL(fsnotify_put_mark); /* Calculate mask of events for a list of marks */ u32 fsnotify_recalc_mask(struct hlist_head *head) @@ -203,7 +202,6 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark, fsnotify_destroy_mark_locked(mark, group); mutex_unlock(&group->mark_mutex); } -EXPORT_SYMBOL(fsnotify_destroy_mark); /* * Destroy all marks in the given list. The marks must be already detached from @@ -378,7 +376,6 @@ err: return ret; } -EXPORT_SYMBOL(fsnotify_add_mark); int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group, struct inode *inode, struct vfsmount *mnt, int allow_dups) @@ -478,7 +475,6 @@ void fsnotify_init_mark(struct fsnotify_mark *mark, atomic_set(&mark->refcnt, 1); mark->free_mark = free_mark; } -EXPORT_SYMBOL(fsnotify_init_mark); static int fsnotify_mark_destroy(void *ignored) { diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 7bb487e66..262561fea 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -382,7 +382,7 @@ static ssize_t ntfs_prepare_file_for_write(struct kiocb *iocb, base_ni = ni; if (NInoAttr(ni)) base_ni = ni->ext.base_ntfs_ino; - err = file_remove_suid(file); + err = file_remove_privs(file); if (unlikely(err)) goto out; /* @@ -525,7 +525,8 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping, } } err = add_to_page_cache_lru(*cached_page, mapping, - index, GFP_KERNEL); + index, + GFP_KERNEL & mapping_gfp_mask(mapping)); if (unlikely(err)) { if (err == -EEXIST) continue; diff --git a/fs/ntfs/inode.h b/fs/ntfs/inode.h index 76b6cfb57..b3c3469de 100644 --- a/fs/ntfs/inode.h +++ b/fs/ntfs/inode.h @@ -239,7 +239,7 @@ typedef struct { */ static inline ntfs_inode *NTFS_I(struct inode *inode) { - return (ntfs_inode *)list_entry(inode, big_ntfs_inode, vfs_inode); + return (ntfs_inode *)container_of(inode, big_ntfs_inode, vfs_inode); } static inline struct inode *VFS_I(ntfs_inode *ni) diff --git a/fs/ntfs/malloc.h b/fs/ntfs/malloc.h index a44b14cbc..ab172e5f5 100644 --- a/fs/ntfs/malloc.h +++ b/fs/ntfs/malloc.h @@ -85,12 +85,7 @@ static inline void *ntfs_malloc_nofs_nofail(unsigned long size) static inline void ntfs_free(void *addr) { - if (!is_vmalloc_addr(addr)) { - kfree(addr); - /* free_page((unsigned long)addr); */ - return; - } - vfree(addr); + kvfree(addr); } #endif /* _LINUX_NTFS_MALLOC_H */ diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c index 0f35b80d1..443abecf0 100644 --- a/fs/ntfs/namei.c +++ b/fs/ntfs/namei.c @@ -35,7 +35,7 @@ * ntfs_lookup - find the inode represented by a dentry in a directory inode * @dir_ino: directory inode in which to look for the inode * @dent: dentry representing the inode to look for - * @nd: lookup nameidata + * @flags: lookup flags * * In short, ntfs_lookup() looks for the inode represented by the dentry @dent * in the directory inode @dir_ino and if found attaches the inode to the diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 2d7f76e52..5997c00a1 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -2925,7 +2925,8 @@ static int __ocfs2_rotate_tree_left(handle_t *handle, struct ocfs2_path *right_path = NULL; struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci); - BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0]))); + if (!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0]))) + return 0; *empty_extent_path = NULL; @@ -4311,13 +4312,13 @@ out: return ret; } -static enum ocfs2_contig_type -ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, +static int ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, struct ocfs2_path *path, struct ocfs2_extent_list *el, int index, - struct ocfs2_extent_rec *split_rec) + struct ocfs2_extent_rec *split_rec, + struct ocfs2_merge_ctxt *ctxt) { - int status; + int status = 0; enum ocfs2_contig_type ret = CONTIG_NONE; u32 left_cpos, right_cpos; struct ocfs2_extent_rec *rec = NULL; @@ -4336,8 +4337,11 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, if (left_cpos != 0) { left_path = ocfs2_new_path_from_path(path); - if (!left_path) + if (!left_path) { + status = -ENOMEM; + mlog_errno(status); goto exit; + } status = ocfs2_find_path(et->et_ci, left_path, left_cpos); @@ -4392,8 +4396,11 @@ ocfs2_figure_merge_contig_type(struct ocfs2_extent_tree *et, goto free_left_path; right_path = ocfs2_new_path_from_path(path); - if (!right_path) + if (!right_path) { + status = -ENOMEM; + mlog_errno(status); goto free_left_path; + } status = ocfs2_find_path(et->et_ci, right_path, right_cpos); if (status) @@ -4433,7 +4440,10 @@ free_right_path: free_left_path: ocfs2_free_path(left_path); exit: - return ret; + if (status == 0) + ctxt->c_contig_type = ret; + + return status; } static void ocfs2_figure_contig_type(struct ocfs2_extent_tree *et, @@ -5039,9 +5049,14 @@ int ocfs2_split_extent(handle_t *handle, goto out; } - ctxt.c_contig_type = ocfs2_figure_merge_contig_type(et, path, el, - split_index, - split_rec); + ret = ocfs2_figure_merge_contig_type(et, path, el, + split_index, + split_rec, + &ctxt); + if (ret) { + mlog_errno(ret); + goto out; + } /* * The core merge / split code wants to know how much room is diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 9ea701270..0f5fd9db8 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -523,7 +523,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits; unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; unsigned long len = bh_result->b_size; - unsigned int clusters_to_alloc = 0; + unsigned int clusters_to_alloc = 0, contig_clusters = 0; cpos = ocfs2_blocks_to_clusters(inode->i_sb, iblock); @@ -560,8 +560,10 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, /* fill hole, allocate blocks can't be larger than the size * of the hole */ clusters_to_alloc = ocfs2_clusters_for_bytes(inode->i_sb, len); - if (clusters_to_alloc > contig_blocks) - clusters_to_alloc = contig_blocks; + contig_clusters = ocfs2_clusters_for_blocks(inode->i_sb, + contig_blocks); + if (clusters_to_alloc > contig_clusters) + clusters_to_alloc = contig_clusters; /* allocate extent and insert them into the extent tree */ ret = ocfs2_extend_allocation(inode, cpos, @@ -619,9 +621,6 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, /* this io's submitter should not have unlocked this before we could */ BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); - if (ocfs2_iocb_is_sem_locked(iocb)) - ocfs2_iocb_clear_sem_locked(iocb); - if (ocfs2_iocb_is_unaligned_aio(iocb)) { ocfs2_iocb_clear_unaligned_aio(iocb); @@ -925,13 +924,23 @@ clean_orphan: int update_isize = written > 0 ? 1 : 0; loff_t end = update_isize ? offset + written : 0; - tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, + tmp_ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (tmp_ret < 0) { + ret = tmp_ret; + mlog_errno(ret); + goto out; + } + + tmp_ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, update_isize, end); if (tmp_ret < 0) { ret = tmp_ret; + mlog_errno(ret); goto out; } + ocfs2_inode_unlock(inode, 1); + tmp_ret = jbd2_journal_force_commit(journal); if (tmp_ret < 0) { ret = tmp_ret; diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index dd59599b0..24e496d6b 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -79,7 +79,6 @@ static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level) enum ocfs2_iocb_lock_bits { OCFS2_IOCB_RW_LOCK = 0, OCFS2_IOCB_RW_LOCK_LEVEL, - OCFS2_IOCB_SEM, OCFS2_IOCB_UNALIGNED_IO, OCFS2_IOCB_NUM_LOCKS }; @@ -88,12 +87,6 @@ enum ocfs2_iocb_lock_bits { clear_bit(OCFS2_IOCB_RW_LOCK, (unsigned long *)&iocb->private) #define ocfs2_iocb_rw_locked_level(iocb) \ test_bit(OCFS2_IOCB_RW_LOCK_LEVEL, (unsigned long *)&iocb->private) -#define ocfs2_iocb_set_sem_locked(iocb) \ - set_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) -#define ocfs2_iocb_clear_sem_locked(iocb) \ - clear_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) -#define ocfs2_iocb_is_sem_locked(iocb) \ - test_bit(OCFS2_IOCB_SEM, (unsigned long *)&iocb->private) #define ocfs2_iocb_set_unaligned_aio(iocb) \ set_bit(OCFS2_IOCB_UNALIGNED_IO, (unsigned long *)&iocb->private) diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c index af7598bff..dfe162f5f 100644 --- a/fs/ocfs2/cluster/masklog.c +++ b/fs/ocfs2/cluster/masklog.c @@ -64,6 +64,40 @@ static ssize_t mlog_mask_store(u64 mask, const char *buf, size_t count) return count; } +void __mlog_printk(const u64 *mask, const char *func, int line, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + const char *level; + const char *prefix = ""; + + if (!__mlog_test_u64(*mask, mlog_and_bits) || + __mlog_test_u64(*mask, mlog_not_bits)) + return; + + if (*mask & ML_ERROR) { + level = KERN_ERR; + prefix = "ERROR: "; + } else if (*mask & ML_NOTICE) { + level = KERN_NOTICE; + } else { + level = KERN_INFO; + } + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + printk("%s(%s,%u,%u):%s:%d %s%pV", + level, current->comm, task_pid_nr(current), + raw_smp_processor_id(), func, line, prefix, &vaf); + + va_end(args); +} +EXPORT_SYMBOL_GPL(__mlog_printk); + struct mlog_attribute { struct attribute attr; u64 mask; diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index 7fdc25a4d..308ea0eb3 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -162,38 +162,20 @@ extern struct mlog_bits mlog_and_bits, mlog_not_bits; #endif -/* - * smp_processor_id() "helpfully" screams when called outside preemptible - * regions in current kernels. sles doesn't have the variants that don't - * scream. just do this instead of trying to guess which we're building - * against.. *sigh*. - */ -#define __mlog_cpu_guess ({ \ - unsigned long _cpu = get_cpu(); \ - put_cpu(); \ - _cpu; \ -}) +__printf(4, 5) +void __mlog_printk(const u64 *m, const char *func, int line, + const char *fmt, ...); -/* In the following two macros, the whitespace after the ',' just - * before ##args is intentional. Otherwise, gcc 2.95 will eat the - * previous token if args expands to nothing. +/* + * Testing before the __mlog_printk call lets the compiler eliminate the + * call completely when (m & ML_ALLOWED_BITS) is 0. */ -#define __mlog_printk(level, fmt, args...) \ - printk(level "(%s,%u,%lu):%s:%d " fmt, current->comm, \ - task_pid_nr(current), __mlog_cpu_guess, \ - __PRETTY_FUNCTION__, __LINE__ , ##args) - -#define mlog(mask, fmt, args...) do { \ - u64 __m = MLOG_MASK_PREFIX | (mask); \ - if ((__m & ML_ALLOWED_BITS) && \ - __mlog_test_u64(__m, mlog_and_bits) && \ - !__mlog_test_u64(__m, mlog_not_bits)) { \ - if (__m & ML_ERROR) \ - __mlog_printk(KERN_ERR, "ERROR: "fmt , ##args); \ - else if (__m & ML_NOTICE) \ - __mlog_printk(KERN_NOTICE, fmt , ##args); \ - else __mlog_printk(KERN_INFO, fmt , ##args); \ - } \ +#define mlog(mask, fmt, ...) \ +do { \ + u64 _m = MLOG_MASK_PREFIX | (mask); \ + if (_m & ML_ALLOWED_BITS) \ + __mlog_printk(&_m, __func__, __LINE__, fmt, \ + ##__VA_ARGS__); \ } while (0) #define mlog_errno(st) ({ \ diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 56c403a56..2d0acd667 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -2204,7 +2204,7 @@ out: kfree(o2net_hand); kfree(o2net_keep_req); kfree(o2net_keep_resp); - + o2net_debugfs_exit(); o2quo_exit(); return -ENOMEM; } diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index ccd4dcfc3..02878a83f 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -1617,7 +1617,7 @@ int __ocfs2_add_entry(handle_t *handle, struct ocfs2_dir_entry *de, *de1; struct ocfs2_dinode *di = (struct ocfs2_dinode *)parent_fe_bh->b_data; struct super_block *sb = dir->i_sb; - int retval, status; + int retval; unsigned int size = sb->s_blocksize; struct buffer_head *insert_bh = lookup->dl_leaf_bh; char *data_start = insert_bh->b_data; @@ -1695,25 +1695,25 @@ int __ocfs2_add_entry(handle_t *handle, } if (insert_bh == parent_fe_bh) - status = ocfs2_journal_access_di(handle, + retval = ocfs2_journal_access_di(handle, INODE_CACHE(dir), insert_bh, OCFS2_JOURNAL_ACCESS_WRITE); else { - status = ocfs2_journal_access_db(handle, + retval = ocfs2_journal_access_db(handle, INODE_CACHE(dir), insert_bh, OCFS2_JOURNAL_ACCESS_WRITE); - if (ocfs2_dir_indexed(dir)) { - status = ocfs2_dx_dir_insert(dir, + if (!retval && ocfs2_dir_indexed(dir)) + retval = ocfs2_dx_dir_insert(dir, handle, lookup); - if (status) { - mlog_errno(status); - goto bail; - } - } + } + + if (retval) { + mlog_errno(retval); + goto bail; } /* By now the buffer is marked for journaling */ @@ -3543,13 +3543,10 @@ static void dx_leaf_sort_swap(void *a, void *b, int size) { struct ocfs2_dx_entry *entry1 = a; struct ocfs2_dx_entry *entry2 = b; - struct ocfs2_dx_entry tmp; BUG_ON(size != sizeof(*entry1)); - tmp = *entry1; - *entry1 = *entry2; - *entry2 = tmp; + swap(*entry1, *entry2); } static int ocfs2_dx_leaf_same_major(struct ocfs2_dx_leaf *dx_leaf) diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index fae17c640..e88ccf8c8 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -1014,7 +1014,6 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm, /* will exit holding res->spinlock, but may drop in function */ void __dlm_wait_on_lockres_flags(struct dlm_lock_resource *res, int flags); -void __dlm_wait_on_lockres_flags_set(struct dlm_lock_resource *res, int flags); /* will exit holding res->spinlock, but may drop in function */ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index d8b670cbd..719f7f4c7 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -37,6 +37,7 @@ #include <linux/falloc.h> #include <linux/quotaops.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <cluster/masklog.h> @@ -2250,7 +2251,7 @@ out: static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { - int direct_io, appending, rw_level, have_alloc_sem = 0; + int direct_io, appending, rw_level; int can_do_direct, has_refcount = 0; ssize_t written = 0; ssize_t ret; @@ -2279,16 +2280,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, mutex_lock(&inode->i_mutex); - ocfs2_iocb_clear_sem_locked(iocb); - relock: - /* to match setattr's i_mutex -> rw_lock ordering */ - if (direct_io) { - have_alloc_sem = 1; - /* communicate with ocfs2_dio_end_io */ - ocfs2_iocb_set_sem_locked(iocb); - } - /* * Concurrent O_DIRECT writes are allowed with * mount_option "coherency=buffered". @@ -2298,7 +2290,7 @@ relock: ret = ocfs2_rw_lock(inode, rw_level); if (ret < 0) { mlog_errno(ret); - goto out_sems; + goto out_mutex; } /* @@ -2347,7 +2339,6 @@ relock: if (direct_io && !can_do_direct) { ocfs2_rw_unlock(inode, rw_level); - have_alloc_sem = 0; rw_level = -1; direct_io = 0; @@ -2416,7 +2407,6 @@ no_sync: */ if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) { rw_level = -1; - have_alloc_sem = 0; unaligned_dio = 0; } @@ -2429,10 +2419,7 @@ out: if (rw_level != -1) ocfs2_rw_unlock(inode, rw_level); -out_sems: - if (have_alloc_sem) - ocfs2_iocb_clear_sem_locked(iocb); - +out_mutex: mutex_unlock(&inode->i_mutex); if (written) @@ -2473,7 +2460,7 @@ bail: static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) { - int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0; + int ret = 0, rw_level = -1, lock_level = 0; struct file *filp = iocb->ki_filp; struct inode *inode = file_inode(filp); @@ -2490,16 +2477,11 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, goto bail; } - ocfs2_iocb_clear_sem_locked(iocb); - /* * buffered reads protect themselves in ->readpage(). O_DIRECT reads * need locks to protect pending reads from racing with truncate. */ if (iocb->ki_flags & IOCB_DIRECT) { - have_alloc_sem = 1; - ocfs2_iocb_set_sem_locked(iocb); - ret = ocfs2_rw_lock(inode, 0); if (ret < 0) { mlog_errno(ret); @@ -2535,13 +2517,9 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, /* see ocfs2_file_write_iter */ if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { rw_level = -1; - have_alloc_sem = 0; } bail: - if (have_alloc_sem) - ocfs2_iocb_clear_sem_locked(iocb); - if (rw_level != -1) ocfs2_rw_unlock(inode, rw_level); diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 53e6c40ed..3cb097ccc 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -980,7 +980,6 @@ long ocfs2_compat_ioctl(struct file *file, unsigned cmd, unsigned long arg) case OCFS2_IOC_GROUP_EXTEND: case OCFS2_IOC_GROUP_ADD: case OCFS2_IOC_GROUP_ADD64: - case FITRIM: break; case OCFS2_IOC_REFLINK: if (copy_from_user(&args, argp, sizeof(args))) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index ff5319282..7c099f703 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -108,7 +108,7 @@ struct ocfs2_replay_map { unsigned char rm_replay_slots[0]; }; -void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state) +static void ocfs2_replay_map_set_state(struct ocfs2_super *osb, int state) { if (!osb->replay_map) return; @@ -153,7 +153,7 @@ int ocfs2_compute_replay_slots(struct ocfs2_super *osb) return 0; } -void ocfs2_queue_replay_slots(struct ocfs2_super *osb, +static void ocfs2_queue_replay_slots(struct ocfs2_super *osb, enum ocfs2_orphan_reco_type orphan_reco_type) { struct ocfs2_replay_map *replay_map = osb->replay_map; @@ -173,7 +173,7 @@ void ocfs2_queue_replay_slots(struct ocfs2_super *osb, replay_map->rm_state = REPLAY_DONE; } -void ocfs2_free_replay_slots(struct ocfs2_super *osb) +static void ocfs2_free_replay_slots(struct ocfs2_super *osb) { struct ocfs2_replay_map *replay_map = osb->replay_map; @@ -571,9 +571,7 @@ static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers, (unsigned long)bh, (unsigned long long)bh->b_blocknr); - /* We aren't guaranteed to have the superblock here - but if we - * don't, it'll just crash. */ - ocfs2_error(bh->b_assoc_map->host->i_sb, + ocfs2_error(bh->b_bdev->bd_super, "JBD2 has aborted our journal, ocfs2 cannot continue\n"); } @@ -775,7 +773,20 @@ void ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh) trace_ocfs2_journal_dirty((unsigned long long)bh->b_blocknr); status = jbd2_journal_dirty_metadata(handle, bh); - BUG_ON(status); + if (status) { + mlog_errno(status); + if (!is_handle_aborted(handle)) { + journal_t *journal = handle->h_transaction->t_journal; + struct super_block *sb = bh->b_bdev->bd_super; + + mlog(ML_ERROR, "jbd2_journal_dirty_metadata failed. " + "Aborting transaction and journal.\n"); + handle->h_err = status; + jbd2_journal_abort_handle(handle); + jbd2_journal_abort(journal, status); + ocfs2_abort(sb, "Journal already aborted.\n"); + } + } } #define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE) @@ -1884,7 +1895,7 @@ static inline unsigned long ocfs2_orphan_scan_timeout(void) * hasn't happened. The node queues a scan and increments the * sequence number in the LVB. */ -void ocfs2_queue_orphan_scan(struct ocfs2_super *osb) +static void ocfs2_queue_orphan_scan(struct ocfs2_super *osb) { struct ocfs2_orphan_scan *os; int status, i; @@ -1933,7 +1944,7 @@ out: } /* Worker task that gets fired every ORPHAN_SCAN_SCHEDULE_TIMEOUT millsec */ -void ocfs2_orphan_scan_work(struct work_struct *work) +static void ocfs2_orphan_scan_work(struct work_struct *work) { struct ocfs2_orphan_scan *os; struct ocfs2_super *osb; @@ -2137,6 +2148,8 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, struct inode *inode = NULL; struct inode *iter; struct ocfs2_inode_info *oi; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di = NULL; trace_ocfs2_recover_orphans(slot); @@ -2157,16 +2170,22 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, iter = oi->ip_next_orphan; oi->ip_next_orphan = NULL; + ret = ocfs2_rw_lock(inode, 1); + if (ret < 0) { + mlog_errno(ret); + goto next; + } /* * We need to take and drop the inode lock to * force read inode from disk. */ - ret = ocfs2_inode_lock(inode, NULL, 0); + ret = ocfs2_inode_lock(inode, &di_bh, 1); if (ret) { mlog_errno(ret); - goto next; + goto unlock_rw; } - ocfs2_inode_unlock(inode, 0); + + di = (struct ocfs2_dinode *)di_bh->b_data; if (inode->i_nlink == 0) { spin_lock(&oi->ip_lock); @@ -2174,43 +2193,30 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, * ocfs2_delete_inode. */ oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED; spin_unlock(&oi->ip_lock); - } else if (orphan_reco_type == ORPHAN_NEED_TRUNCATE) { - struct buffer_head *di_bh = NULL; - - ret = ocfs2_rw_lock(inode, 1); - if (ret) { - mlog_errno(ret); - goto next; - } - - ret = ocfs2_inode_lock(inode, &di_bh, 1); - if (ret < 0) { - ocfs2_rw_unlock(inode, 1); - mlog_errno(ret); - goto next; - } - + } else if ((orphan_reco_type == ORPHAN_NEED_TRUNCATE) && + (di->i_flags & cpu_to_le32(OCFS2_DIO_ORPHANED_FL))) { ret = ocfs2_truncate_file(inode, di_bh, i_size_read(inode)); - ocfs2_inode_unlock(inode, 1); - ocfs2_rw_unlock(inode, 1); - brelse(di_bh); if (ret < 0) { if (ret != -ENOSPC) mlog_errno(ret); - goto next; + goto unlock_inode; } - ret = ocfs2_del_inode_from_orphan(osb, inode, 0, 0); + ret = ocfs2_del_inode_from_orphan(osb, inode, di_bh, 0, 0); if (ret) mlog_errno(ret); wake_up(&OCFS2_I(inode)->append_dio_wq); } /* else if ORPHAN_NO_NEED_TRUNCATE, do nothing */ - +unlock_inode: + ocfs2_inode_unlock(inode, 1); +unlock_rw: + ocfs2_rw_unlock(inode, 1); next: iput(inode); - + brelse(di_bh); + di_bh = NULL; inode = iter; } diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 176fe6afd..6e6abb93f 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1116,8 +1116,6 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, int inode1_is_ancestor, inode2_is_ancestor; struct ocfs2_inode_info *oi1 = OCFS2_I(inode1); struct ocfs2_inode_info *oi2 = OCFS2_I(inode2); - struct buffer_head **tmpbh; - struct inode *tmpinode; trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno, (unsigned long long)oi2->ip_blkno); @@ -1148,13 +1146,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, (oi1->ip_blkno < oi2->ip_blkno && inode2_is_ancestor == 0)) { /* switch id1 and id2 around */ - tmpbh = bh2; - bh2 = bh1; - bh1 = tmpbh; - - tmpinode = inode2; - inode2 = inode1; - inode1 = tmpinode; + swap(bh2, bh1); + swap(inode2, inode1); } /* lock id2 */ status = ocfs2_inode_lock_nested(inode2, bh2, 1, @@ -2670,30 +2663,22 @@ bail: } int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, - struct inode *inode, int update_isize, - loff_t end) + struct inode *inode, struct buffer_head *di_bh, + int update_isize, loff_t end) { struct inode *orphan_dir_inode = NULL; struct buffer_head *orphan_dir_bh = NULL; - struct buffer_head *di_bh = NULL; - struct ocfs2_dinode *di = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; handle_t *handle = NULL; int status = 0; - status = ocfs2_inode_lock(inode, &di_bh, 1); - if (status < 0) { - mlog_errno(status); - goto bail; - } - di = (struct ocfs2_dinode *) di_bh->b_data; - orphan_dir_inode = ocfs2_get_system_file_inode(osb, ORPHAN_DIR_SYSTEM_INODE, le16_to_cpu(di->i_dio_orphaned_slot)); if (!orphan_dir_inode) { status = -ENOENT; mlog_errno(status); - goto bail_unlock_inode; + goto bail; } mutex_lock(&orphan_dir_inode->i_mutex); @@ -2702,7 +2687,7 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, mutex_unlock(&orphan_dir_inode->i_mutex); iput(orphan_dir_inode); mlog_errno(status); - goto bail_unlock_inode; + goto bail; } handle = ocfs2_start_trans(osb, @@ -2749,10 +2734,6 @@ bail_unlock_orphan: brelse(orphan_dir_bh); iput(orphan_dir_inode); -bail_unlock_inode: - ocfs2_inode_unlock(inode, 1); - brelse(di_bh); - bail: return status; } diff --git a/fs/ocfs2/namei.h b/fs/ocfs2/namei.h index 5ddecce17..e173329eb 100644 --- a/fs/ocfs2/namei.h +++ b/fs/ocfs2/namei.h @@ -42,8 +42,8 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, struct inode *inode); int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, - struct inode *inode, int update_isize, - loff_t end); + struct inode *inode, struct buffer_head *di_bh, + int update_isize, loff_t end); int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, struct inode *new_inode, struct dentry *new_dentry); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 460c6c37e..690ddc601 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -717,6 +717,16 @@ static inline u64 ocfs2_clusters_to_blocks(struct super_block *sb, return (u64)clusters << c_to_b_bits; } +static inline u32 ocfs2_clusters_for_blocks(struct super_block *sb, + u64 blocks) +{ + int b_to_c_bits = OCFS2_SB(sb)->s_clustersize_bits - + sb->s_blocksize_bits; + + blocks += (1 << b_to_c_bits) - 1; + return (u32)(blocks >> b_to_c_bits); +} + static inline u32 ocfs2_blocks_to_clusters(struct super_block *sb, u64 blocks) { diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index d8c6af101..b69dd14c0 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -1406,11 +1406,9 @@ static int cmp_refcount_rec_by_cpos(const void *a, const void *b) static void swap_refcount_rec(void *a, void *b, int size) { - struct ocfs2_refcount_rec *l = a, *r = b, tmp; + struct ocfs2_refcount_rec *l = a, *r = b; - tmp = *l; - *l = *r; - *r = tmp; + swap(*l, *r); } /* diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index d03bfbf3d..889f3796a 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -7271,7 +7271,7 @@ static int ocfs2_xattr_security_set(struct dentry *dentry, const char *name, name, value, size, flags); } -int ocfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array, +static int ocfs2_initxattrs(struct inode *inode, const struct xattr *xattr_array, void *fs_info) { const struct xattr *xattr; @@ -54,8 +54,10 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, newattrs.ia_valid |= ATTR_FILE; } - /* Remove suid/sgid on truncate too */ - ret = should_remove_suid(dentry); + /* Remove suid, sgid, and file capabilities on truncate too */ + ret = dentry_needs_remove_privs(dentry); + if (ret < 0) + return ret; if (ret) newattrs.ia_valid |= ret | ATTR_FORCE; @@ -65,7 +67,6 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, mutex_unlock(&dentry->d_inode->i_mutex); return ret; } -EXPORT_SYMBOL(do_truncate); long vfs_truncate(struct path *path, loff_t length) { @@ -371,7 +372,7 @@ retry: if (res) goto out; - inode = path.dentry->d_inode; + inode = d_backing_inode(path.dentry); if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) { /* @@ -680,21 +681,20 @@ int open_check_o_direct(struct file *f) } return 0; } -EXPORT_SYMBOL(open_check_o_direct); static int do_dentry_open(struct file *f, + struct inode *inode, int (*open)(struct inode *, struct file *), const struct cred *cred) { static const struct file_operations empty_fops = {}; - struct inode *inode; int error; f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; path_get(&f->f_path); - inode = f->f_inode = f->f_path.dentry->d_inode; + f->f_inode = inode; f->f_mapping = inode->i_mapping; if (unlikely(f->f_flags & O_PATH)) { @@ -798,7 +798,8 @@ int finish_open(struct file *file, struct dentry *dentry, BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */ file->f_path.dentry = dentry; - error = do_dentry_open(file, open, current_cred()); + error = do_dentry_open(file, d_backing_inode(dentry), open, + current_cred()); if (!error) *opened |= FILE_OPENED; @@ -827,6 +828,34 @@ int finish_no_open(struct file *file, struct dentry *dentry) } EXPORT_SYMBOL(finish_no_open); +char *file_path(struct file *filp, char *buf, int buflen) +{ + return d_path(&filp->f_path, buf, buflen); +} +EXPORT_SYMBOL(file_path); + +/** + * vfs_open - open the file at the given path + * @path: path to open + * @file: newly allocated file with f_flag initialized + * @cred: credentials to use + */ +int vfs_open(const struct path *path, struct file *file, + const struct cred *cred) +{ + struct dentry *dentry = path->dentry; + struct inode *inode = dentry->d_inode; + + file->f_path = *path; + if (dentry->d_flags & DCACHE_OP_SELECT_INODE) { + inode = dentry->d_op->d_select_inode(dentry, file->f_flags); + if (IS_ERR(inode)) + return PTR_ERR(inode); + } + + return do_dentry_open(file, inode, NULL, cred); +} + struct file *dentry_open(const struct path *path, int flags, const struct cred *cred) { @@ -858,26 +887,6 @@ struct file *dentry_open(const struct path *path, int flags, } EXPORT_SYMBOL(dentry_open); -/** - * vfs_open - open the file at the given path - * @path: path to open - * @filp: newly allocated file with f_flag initialized - * @cred: credentials to use - */ -int vfs_open(const struct path *path, struct file *filp, - const struct cred *cred) -{ - struct inode *inode = path->dentry->d_inode; - - if (inode->i_op->dentry_open) - return inode->i_op->dentry_open(path->dentry, filp, cred); - else { - filp->f_path = *path; - return do_dentry_open(filp, NULL, cred); - } -} -EXPORT_SYMBOL(vfs_open); - static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op) { int lookup_flags = 0; diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 04f124884..d9da5a4e9 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -140,11 +140,12 @@ struct ovl_link_data { void *cookie; }; -static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *ovl_follow_link(struct dentry *dentry, void **cookie) { - void *ret; struct dentry *realdentry; struct inode *realinode; + struct ovl_link_data *data = NULL; + const char *ret; realdentry = ovl_dentry_real(dentry); realinode = realdentry->d_inode; @@ -152,28 +153,28 @@ static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd) if (WARN_ON(!realinode->i_op->follow_link)) return ERR_PTR(-EPERM); - ret = realinode->i_op->follow_link(realdentry, nd); - if (IS_ERR(ret)) - return ret; - if (realinode->i_op->put_link) { - struct ovl_link_data *data; - data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL); - if (!data) { - realinode->i_op->put_link(realdentry, nd, ret); + if (!data) return ERR_PTR(-ENOMEM); - } data->realdentry = realdentry; - data->cookie = ret; + } - return data; - } else { - return NULL; + ret = realinode->i_op->follow_link(realdentry, cookie); + if (IS_ERR_OR_NULL(ret)) { + kfree(data); + return ret; } + + if (data) + data->cookie = *cookie; + + *cookie = data; + + return ret; } -static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c) +static void ovl_put_link(struct inode *unused, void *c) { struct inode *realinode; struct ovl_link_data *data = c; @@ -182,7 +183,7 @@ static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c) return; realinode = data->realdentry->d_inode; - realinode->i_op->put_link(data->realdentry, nd, data->cookie); + realinode->i_op->put_link(realinode, data->cookie); kfree(data); } @@ -336,37 +337,33 @@ static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type, return true; } -static int ovl_dentry_open(struct dentry *dentry, struct file *file, - const struct cred *cred) +struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags) { int err; struct path realpath; enum ovl_path_type type; - bool want_write = false; + + if (d_is_dir(dentry)) + return d_backing_inode(dentry); type = ovl_path_real(dentry, &realpath); - if (ovl_open_need_copy_up(file->f_flags, type, realpath.dentry)) { - want_write = true; + if (ovl_open_need_copy_up(file_flags, type, realpath.dentry)) { err = ovl_want_write(dentry); if (err) - goto out; + return ERR_PTR(err); - if (file->f_flags & O_TRUNC) + if (file_flags & O_TRUNC) err = ovl_copy_up_last(dentry, NULL, true); else err = ovl_copy_up(dentry); + ovl_drop_write(dentry); if (err) - goto out_drop_write; + return ERR_PTR(err); ovl_path_upper(dentry, &realpath); } - err = vfs_open(&realpath, file, cred); -out_drop_write: - if (want_write) - ovl_drop_write(dentry); -out: - return err; + return d_backing_inode(realpath.dentry); } static const struct inode_operations ovl_file_inode_operations = { @@ -377,7 +374,6 @@ static const struct inode_operations ovl_file_inode_operations = { .getxattr = ovl_getxattr, .listxattr = ovl_listxattr, .removexattr = ovl_removexattr, - .dentry_open = ovl_dentry_open, }; static const struct inode_operations ovl_symlink_inode_operations = { diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 17ac5afc9..ea5a40b06 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -173,6 +173,7 @@ ssize_t ovl_getxattr(struct dentry *dentry, const char *name, void *value, size_t size); ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); int ovl_removexattr(struct dentry *dentry, const char *name); +struct inode *ovl_d_select_inode(struct dentry *dentry, unsigned file_flags); struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, struct ovl_entry *oe); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index bf8537c7f..7466ff339 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -273,8 +273,56 @@ static void ovl_dentry_release(struct dentry *dentry) } } +static int ovl_dentry_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct ovl_entry *oe = dentry->d_fsdata; + unsigned int i; + int ret = 1; + + for (i = 0; i < oe->numlower; i++) { + struct dentry *d = oe->lowerstack[i].dentry; + + if (d->d_flags & DCACHE_OP_REVALIDATE) { + ret = d->d_op->d_revalidate(d, flags); + if (ret < 0) + return ret; + if (!ret) { + if (!(flags & LOOKUP_RCU)) + d_invalidate(d); + return -ESTALE; + } + } + } + return 1; +} + +static int ovl_dentry_weak_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct ovl_entry *oe = dentry->d_fsdata; + unsigned int i; + int ret = 1; + + for (i = 0; i < oe->numlower; i++) { + struct dentry *d = oe->lowerstack[i].dentry; + + if (d->d_flags & DCACHE_OP_WEAK_REVALIDATE) { + ret = d->d_op->d_weak_revalidate(d, flags); + if (ret <= 0) + break; + } + } + return ret; +} + static const struct dentry_operations ovl_dentry_operations = { .d_release = ovl_dentry_release, + .d_select_inode = ovl_d_select_inode, +}; + +static const struct dentry_operations ovl_reval_dentry_operations = { + .d_release = ovl_dentry_release, + .d_revalidate = ovl_dentry_revalidate, + .d_weak_revalidate = ovl_dentry_weak_revalidate, }; static struct ovl_entry *ovl_alloc_entry(unsigned int numlower) @@ -288,6 +336,20 @@ static struct ovl_entry *ovl_alloc_entry(unsigned int numlower) return oe; } +static bool ovl_dentry_remote(struct dentry *dentry) +{ + return dentry->d_flags & + (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE); +} + +static bool ovl_dentry_weird(struct dentry *dentry) +{ + return dentry->d_flags & (DCACHE_NEED_AUTOMOUNT | + DCACHE_MANAGE_TRANSIT | + DCACHE_OP_HASH | + DCACHE_OP_COMPARE); +} + static inline struct dentry *ovl_lookup_real(struct dentry *dir, struct qstr *name) { @@ -303,6 +365,10 @@ static inline struct dentry *ovl_lookup_real(struct dentry *dir, } else if (!dentry->d_inode) { dput(dentry); dentry = NULL; + } else if (ovl_dentry_weird(dentry)) { + dput(dentry); + /* Don't support traversing automounts and other weirdness */ + dentry = ERR_PTR(-EREMOTE); } return dentry; } @@ -350,6 +416,11 @@ struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, goto out; if (this) { + if (unlikely(ovl_dentry_remote(this))) { + dput(this); + err = -EREMOTE; + goto out; + } if (ovl_is_whiteout(this)) { dput(this); this = NULL; @@ -694,25 +765,6 @@ static void ovl_unescape(char *s) } } -static bool ovl_is_allowed_fs_type(struct dentry *root) -{ - const struct dentry_operations *dop = root->d_op; - - /* - * We don't support: - * - automount filesystems - * - filesystems with revalidate (FIXME for lower layer) - * - filesystems with case insensitive names - */ - if (dop && - (dop->d_manage || dop->d_automount || - dop->d_revalidate || dop->d_weak_revalidate || - dop->d_compare || dop->d_hash)) { - return false; - } - return true; -} - static int ovl_mount_dir_noesc(const char *name, struct path *path) { int err = -EINVAL; @@ -727,7 +779,7 @@ static int ovl_mount_dir_noesc(const char *name, struct path *path) goto out; } err = -EINVAL; - if (!ovl_is_allowed_fs_type(path->dentry)) { + if (ovl_dentry_weird(path->dentry)) { pr_err("overlayfs: filesystem on '%s' not supported\n", name); goto out_put; } @@ -751,13 +803,21 @@ static int ovl_mount_dir(const char *name, struct path *path) if (tmp) { ovl_unescape(tmp); err = ovl_mount_dir_noesc(tmp, path); + + if (!err) + if (ovl_dentry_remote(path->dentry)) { + pr_err("overlayfs: filesystem on '%s' not supported as upperdir\n", + tmp); + path_put(path); + err = -EINVAL; + } kfree(tmp); } return err; } static int ovl_lower_dir(const char *name, struct path *path, long *namelen, - int *stack_depth) + int *stack_depth, bool *remote) { int err; struct kstatfs statfs; @@ -774,6 +834,9 @@ static int ovl_lower_dir(const char *name, struct path *path, long *namelen, *namelen = max(*namelen, statfs.f_namelen); *stack_depth = max(*stack_depth, path->mnt->mnt_sb->s_stack_depth); + if (ovl_dentry_remote(path->dentry)) + *remote = true; + return 0; out_put: @@ -827,6 +890,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) unsigned int numlower; unsigned int stacklen = 0; unsigned int i; + bool remote = false; int err; err = -ENOMEM; @@ -900,7 +964,8 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) lower = lowertmp; for (numlower = 0; numlower < stacklen; numlower++) { err = ovl_lower_dir(lower, &stack[numlower], - &ufs->lower_namelen, &sb->s_stack_depth); + &ufs->lower_namelen, &sb->s_stack_depth, + &remote); if (err) goto out_put_lowerpath; @@ -958,7 +1023,10 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) if (!ufs->upper_mnt) sb->s_flags |= MS_RDONLY; - sb->s_d_op = &ovl_dentry_operations; + if (remote) + sb->s_d_op = &ovl_reval_dentry_operations; + else + sb->s_d_op = &ovl_dentry_operations; err = -ENOMEM; oe = ovl_alloc_entry(numlower); diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 84bb65b83..4fb17ded7 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -547,51 +547,45 @@ posix_acl_create(struct inode *dir, umode_t *mode, struct posix_acl **default_acl, struct posix_acl **acl) { struct posix_acl *p; + struct posix_acl *clone; int ret; + *acl = NULL; + *default_acl = NULL; + if (S_ISLNK(*mode) || !IS_POSIXACL(dir)) - goto no_acl; + return 0; p = get_acl(dir, ACL_TYPE_DEFAULT); - if (IS_ERR(p)) { - if (p == ERR_PTR(-EOPNOTSUPP)) - goto apply_umask; - return PTR_ERR(p); + if (!p || p == ERR_PTR(-EOPNOTSUPP)) { + *mode &= ~current_umask(); + return 0; } + if (IS_ERR(p)) + return PTR_ERR(p); - if (!p) - goto apply_umask; - - *acl = posix_acl_clone(p, GFP_NOFS); - if (!*acl) + clone = posix_acl_clone(p, GFP_NOFS); + if (!clone) goto no_mem; - ret = posix_acl_create_masq(*acl, mode); + ret = posix_acl_create_masq(clone, mode); if (ret < 0) goto no_mem_clone; - if (ret == 0) { - posix_acl_release(*acl); - *acl = NULL; - } + if (ret == 0) + posix_acl_release(clone); + else + *acl = clone; - if (!S_ISDIR(*mode)) { + if (!S_ISDIR(*mode)) posix_acl_release(p); - *default_acl = NULL; - } else { + else *default_acl = p; - } - return 0; -apply_umask: - *mode &= ~current_umask(); -no_acl: - *default_acl = NULL; - *acl = NULL; return 0; no_mem_clone: - posix_acl_release(*acl); + posix_acl_release(clone); no_mem: posix_acl_release(p); return -ENOMEM; diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index 2183fcf41..1ade1206b 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -71,3 +71,13 @@ config PROC_PAGE_MONITOR /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap, /proc/kpagecount, and /proc/kpageflags. Disabling these interfaces will reduce the size of the kernel by approximately 4kb. + +config PROC_CHILDREN + bool "Include /proc/<pid>/task/<tid>/children file" + default n + help + Provides a fast way to retrieve first level children pids of a task. See + <file:Documentation/filesystems/proc.txt> for more information. + + Say Y if you are running any user-space software which takes benefit from + this interface. For example, rkt is such a piece of software. diff --git a/fs/proc/array.c b/fs/proc/array.c index fd02a9ebf..ce065cf31 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -126,6 +126,14 @@ static inline const char *get_task_state(struct task_struct *tsk) { unsigned int state = (tsk->state | tsk->exit_state) & TASK_REPORT; + /* + * Parked tasks do not run; they sit in __kthread_parkme(). + * Without this check, we would report them as running, which is + * clearly wrong, so we report them as sleeping instead. + */ + if (tsk->state == TASK_PARKED) + state = TASK_INTERRUPTIBLE; + BUILD_BUG_ON(1 + ilog2(TASK_REPORT) != ARRAY_SIZE(task_state_array)-1); return task_state_array[fls(state)]; @@ -569,7 +577,7 @@ int proc_pid_statm(struct seq_file *m, struct pid_namespace *ns, return 0; } -#ifdef CONFIG_CHECKPOINT_RESTORE +#ifdef CONFIG_PROC_CHILDREN static struct pid * get_children_pid(struct inode *inode, struct pid *pid_prev, loff_t pos) { @@ -692,4 +700,4 @@ const struct file_operations proc_tid_children_operations = { .llseek = seq_lseek, .release = children_seq_release, }; -#endif /* CONFIG_CHECKPOINT_RESTORE */ +#endif /* CONFIG_PROC_CHILDREN */ diff --git a/fs/proc/base.c b/fs/proc/base.c index c439a9dcc..aa50d1ac2 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -196,18 +196,210 @@ static int proc_root_link(struct dentry *dentry, struct path *path) return result; } -static int proc_pid_cmdline(struct seq_file *m, struct pid_namespace *ns, - struct pid *pid, struct task_struct *task) +static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, + size_t _count, loff_t *pos) { + struct task_struct *tsk; + struct mm_struct *mm; + char *page; + unsigned long count = _count; + unsigned long arg_start, arg_end, env_start, env_end; + unsigned long len1, len2, len; + unsigned long p; + char c; + ssize_t rv; + + BUG_ON(*pos < 0); + + tsk = get_proc_task(file_inode(file)); + if (!tsk) + return -ESRCH; + mm = get_task_mm(tsk); + put_task_struct(tsk); + if (!mm) + return 0; + /* Check if process spawned far enough to have cmdline. */ + if (!mm->env_end) { + rv = 0; + goto out_mmput; + } + + page = (char *)__get_free_page(GFP_TEMPORARY); + if (!page) { + rv = -ENOMEM; + goto out_mmput; + } + + down_read(&mm->mmap_sem); + arg_start = mm->arg_start; + arg_end = mm->arg_end; + env_start = mm->env_start; + env_end = mm->env_end; + up_read(&mm->mmap_sem); + + BUG_ON(arg_start > arg_end); + BUG_ON(env_start > env_end); + + len1 = arg_end - arg_start; + len2 = env_end - env_start; + + /* Empty ARGV. */ + if (len1 == 0) { + rv = 0; + goto out_free_page; + } /* - * Rely on struct seq_operations::show() being called once - * per internal buffer allocation. See single_open(), traverse(). + * Inherently racy -- command line shares address space + * with code and data. */ - BUG_ON(m->size < PAGE_SIZE); - m->count += get_cmdline(task, m->buf, PAGE_SIZE); - return 0; + rv = access_remote_vm(mm, arg_end - 1, &c, 1, 0); + if (rv <= 0) + goto out_free_page; + + rv = 0; + + if (c == '\0') { + /* Command line (set of strings) occupies whole ARGV. */ + if (len1 <= *pos) + goto out_free_page; + + p = arg_start + *pos; + len = len1 - *pos; + while (count > 0 && len > 0) { + unsigned int _count; + int nr_read; + + _count = min3(count, len, PAGE_SIZE); + nr_read = access_remote_vm(mm, p, page, _count, 0); + if (nr_read < 0) + rv = nr_read; + if (nr_read <= 0) + goto out_free_page; + + if (copy_to_user(buf, page, nr_read)) { + rv = -EFAULT; + goto out_free_page; + } + + p += nr_read; + len -= nr_read; + buf += nr_read; + count -= nr_read; + rv += nr_read; + } + } else { + /* + * Command line (1 string) occupies ARGV and maybe + * extends into ENVP. + */ + if (len1 + len2 <= *pos) + goto skip_argv_envp; + if (len1 <= *pos) + goto skip_argv; + + p = arg_start + *pos; + len = len1 - *pos; + while (count > 0 && len > 0) { + unsigned int _count, l; + int nr_read; + bool final; + + _count = min3(count, len, PAGE_SIZE); + nr_read = access_remote_vm(mm, p, page, _count, 0); + if (nr_read < 0) + rv = nr_read; + if (nr_read <= 0) + goto out_free_page; + + /* + * Command line can be shorter than whole ARGV + * even if last "marker" byte says it is not. + */ + final = false; + l = strnlen(page, nr_read); + if (l < nr_read) { + nr_read = l; + final = true; + } + + if (copy_to_user(buf, page, nr_read)) { + rv = -EFAULT; + goto out_free_page; + } + + p += nr_read; + len -= nr_read; + buf += nr_read; + count -= nr_read; + rv += nr_read; + + if (final) + goto out_free_page; + } +skip_argv: + /* + * Command line (1 string) occupies ARGV and + * extends into ENVP. + */ + if (len1 <= *pos) { + p = env_start + *pos - len1; + len = len1 + len2 - *pos; + } else { + p = env_start; + len = len2; + } + while (count > 0 && len > 0) { + unsigned int _count, l; + int nr_read; + bool final; + + _count = min3(count, len, PAGE_SIZE); + nr_read = access_remote_vm(mm, p, page, _count, 0); + if (nr_read < 0) + rv = nr_read; + if (nr_read <= 0) + goto out_free_page; + + /* Find EOS. */ + final = false; + l = strnlen(page, nr_read); + if (l < nr_read) { + nr_read = l; + final = true; + } + + if (copy_to_user(buf, page, nr_read)) { + rv = -EFAULT; + goto out_free_page; + } + + p += nr_read; + len -= nr_read; + buf += nr_read; + count -= nr_read; + rv += nr_read; + + if (final) + goto out_free_page; + } +skip_argv_envp: + ; + } + +out_free_page: + free_page((unsigned long)page); +out_mmput: + mmput(mm); + if (rv > 0) + *pos += rv; + return rv; } +static const struct file_operations proc_pid_cmdline_ops = { + .read = proc_pid_cmdline_read, + .llseek = generic_file_llseek, +}; + static int proc_pid_auxv(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { @@ -304,15 +496,18 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, } #endif -#ifdef CONFIG_SCHEDSTATS +#ifdef CONFIG_SCHED_INFO /* * Provides /proc/PID/schedstat */ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { - seq_printf(m, "%llu %llu %lu\n", - (unsigned long long)tsk_seruntime(task), + if (unlikely(!sched_info_on())) + seq_printf(m, "0 0 0\n"); + else + seq_printf(m, "%llu %llu %lu\n", + (unsigned long long)task->se.sum_exec_runtime, (unsigned long long)task->sched_info.run_delay, task->sched_info.pcount); @@ -1380,7 +1575,7 @@ static int proc_exe_link(struct dentry *dentry, struct path *exe_path) return -ENOENT; } -static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *proc_pid_follow_link(struct dentry *dentry, void **cookie) { struct inode *inode = d_inode(dentry); struct path path; @@ -1394,7 +1589,7 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) if (error) goto out; - nd_jump_link(nd, &path); + nd_jump_link(&path); return NULL; out: return ERR_PTR(error); @@ -1744,7 +1939,7 @@ static int proc_map_files_get_link(struct dentry *dentry, struct path *path) down_read(&mm->mmap_sem); vma = find_exact_vma(mm, vm_start, vm_end); if (vma && vma->vm_file) { - *path = vma_pr_or_file(vma)->f_path; + *path = vma->vm_file->f_path; path_get(path); rc = 0; } @@ -2572,7 +2767,7 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_HAVE_ARCH_TRACEHOOK ONE("syscall", S_IRUSR, proc_pid_syscall), #endif - ONE("cmdline", S_IRUGO, proc_pid_cmdline), + REG("cmdline", S_IRUGO, proc_pid_cmdline_ops), ONE("stat", S_IRUGO, proc_tgid_stat), ONE("statm", S_IRUGO, proc_pid_statm), REG("maps", S_IRUGO, proc_pid_maps_operations), @@ -2600,7 +2795,7 @@ static const struct pid_entry tgid_base_stuff[] = { #ifdef CONFIG_STACKTRACE ONE("stack", S_IRUSR, proc_pid_stack), #endif -#ifdef CONFIG_SCHEDSTATS +#ifdef CONFIG_SCHED_INFO ONE("schedstat", S_IRUGO, proc_pid_schedstat), #endif #ifdef CONFIG_LATENCYTOP @@ -2918,11 +3113,11 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_HAVE_ARCH_TRACEHOOK ONE("syscall", S_IRUSR, proc_pid_syscall), #endif - ONE("cmdline", S_IRUGO, proc_pid_cmdline), + REG("cmdline", S_IRUGO, proc_pid_cmdline_ops), ONE("stat", S_IRUGO, proc_tid_stat), ONE("statm", S_IRUGO, proc_pid_statm), REG("maps", S_IRUGO, proc_tid_maps_operations), -#ifdef CONFIG_CHECKPOINT_RESTORE +#ifdef CONFIG_PROC_CHILDREN REG("children", S_IRUGO, proc_tid_children_operations), #endif #ifdef CONFIG_NUMA @@ -2948,7 +3143,7 @@ static const struct pid_entry tid_base_stuff[] = { #ifdef CONFIG_STACKTRACE ONE("stack", S_IRUSR, proc_pid_stack), #endif -#ifdef CONFIG_SCHEDSTATS +#ifdef CONFIG_SCHED_INFO ONE("schedstat", S_IRUGO, proc_pid_schedstat), #endif #ifdef CONFIG_LATENCYTOP diff --git a/fs/proc/inode.c b/fs/proc/inode.c index e3eb55246..bd95b9fde 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -23,7 +23,6 @@ #include <linux/slab.h> #include <linux/mount.h> #include <linux/magic.h> -#include <linux/namei.h> #include <asm/uaccess.h> @@ -394,16 +393,16 @@ static const struct file_operations proc_reg_file_ops_no_compat = { }; #endif -static void *proc_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *proc_follow_link(struct dentry *dentry, void **cookie) { struct proc_dir_entry *pde = PDE(d_inode(dentry)); if (unlikely(!use_pde(pde))) return ERR_PTR(-EINVAL); - nd_set_link(nd, pde->data); - return pde; + *cookie = pde; + return pde->data; } -static void proc_put_link(struct dentry *dentry, struct nameidata *nd, void *p) +static void proc_put_link(struct inode *unused, void *p) { unuse_pde(p); } diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 91a4e6426..92e6726f6 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -92,7 +92,7 @@ static size_t get_kcore_size(int *nphdr, size_t *elf_buflen) roundup(sizeof(CORE_STR), 4)) + roundup(sizeof(struct elf_prstatus), 4) + roundup(sizeof(struct elf_prpsinfo), 4) + - roundup(sizeof(struct task_struct), 4); + roundup(arch_task_struct_size, 4); *elf_buflen = PAGE_ALIGN(*elf_buflen); return size + *elf_buflen; } @@ -415,7 +415,7 @@ static void elf_kcore_store_hdr(char *bufp, int nphdr, int dataoff) /* set up the task structure */ notes[2].name = CORE_STR; notes[2].type = NT_TASKSTRUCT; - notes[2].datasz = sizeof(struct task_struct); + notes[2].datasz = arch_task_struct_size; notes[2].data = current; nhdr->p_filesz += notesize(¬es[2]); diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index e512642db..f6e8354b8 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -30,7 +30,7 @@ static const struct proc_ns_operations *ns_entries[] = { &mntns_operations, }; -static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *proc_ns_follow_link(struct dentry *dentry, void **cookie) { struct inode *inode = d_inode(dentry); const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops; @@ -45,7 +45,7 @@ static void *proc_ns_follow_link(struct dentry *dentry, struct nameidata *nd) if (ptrace_may_access(task, PTRACE_MODE_READ)) { error = ns_get_path(&ns_path, task, ns_ops); if (!error) - nd_jump_link(nd, &ns_path); + nd_jump_link(&ns_path); } put_task_struct(task); return error; diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index 139718132..f8595e8b5 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -45,10 +45,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region) file = region->vm_file; if (file) { - struct inode *inode; - - file = vmr_pr_or_file(region); - inode = file_inode(file); + struct inode *inode = file_inode(region->vm_file); dev = inode->i_sb->s_dev; ino = inode->i_ino; } @@ -67,7 +64,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region) if (file) { seq_pad(m, ' '); - seq_path(m, &file->f_path, ""); + seq_file_path(m, file, ""); } seq_putc(m, '\n'); diff --git a/fs/proc/self.c b/fs/proc/self.c index 6195b4a7c..113b8d061 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -1,5 +1,4 @@ #include <linux/sched.h> -#include <linux/namei.h> #include <linux/slab.h> #include <linux/pid_namespace.h> #include "internal.h" @@ -19,21 +18,20 @@ static int proc_self_readlink(struct dentry *dentry, char __user *buffer, return readlink_copy(buffer, buflen, tmp); } -static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *proc_self_follow_link(struct dentry *dentry, void **cookie) { struct pid_namespace *ns = dentry->d_sb->s_fs_info; pid_t tgid = task_tgid_nr_ns(current, ns); - char *name = ERR_PTR(-ENOENT); - if (tgid) { - /* 11 for max length of signed int in decimal + NULL term */ - name = kmalloc(12, GFP_KERNEL); - if (!name) - name = ERR_PTR(-ENOMEM); - else - sprintf(name, "%d", tgid); - } - nd_set_link(nd, name); - return NULL; + char *name; + + if (!tgid) + return ERR_PTR(-ENOENT); + /* 11 for max length of signed int in decimal + NULL term */ + name = kmalloc(12, GFP_KERNEL); + if (!name) + return ERR_PTR(-ENOMEM); + sprintf(name, "%d", tgid); + return *cookie = name; } static const struct inode_operations proc_self_inode_operations = { diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 9afa35d04..ca1e09188 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -279,10 +279,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) const char *name = NULL; if (file) { - struct inode *inode; - - file = vma_pr_or_file(vma); - inode = file_inode(file); + struct inode *inode = file_inode(vma->vm_file); dev = inode->i_sb->s_dev; ino = inode->i_ino; pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; @@ -313,7 +310,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) */ if (file) { seq_pad(m, ' '); - seq_path(m, &file->f_path, "\n"); + seq_file_path(m, file, "\n"); goto done; } @@ -1482,7 +1479,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) struct proc_maps_private *proc_priv = &numa_priv->proc_maps; struct vm_area_struct *vma = v; struct numa_maps *md = &numa_priv->md; - struct file *file = vma_pr_or_file(vma); + struct file *file = vma->vm_file; struct mm_struct *mm = vma->vm_mm; struct mm_walk walk = { .hugetlb_entry = gather_hugetlb_stats, @@ -1512,7 +1509,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) if (file) { seq_puts(m, " file="); - seq_path(m, &file->f_path, "\n\t= "); + seq_file_path(m, file, "\n\t= "); } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { seq_puts(m, " heap"); } else { diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 174020784..e0d64c92e 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -160,10 +160,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, file = vma->vm_file; if (file) { - struct inode *inode; - - file = vma_pr_or_file(vma); - inode = file_inode(file); + struct inode *inode = file_inode(vma->vm_file); dev = inode->i_sb->s_dev; ino = inode->i_ino; pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT; @@ -183,7 +180,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, if (file) { seq_pad(m, ' '); - seq_path(m, &file->f_path, ""); + seq_file_path(m, file, ""); } else if (mm) { pid_t tid = pid_of_stack(priv, vma, is_pid); diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index a8371993b..947b0f4fd 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -1,5 +1,4 @@ #include <linux/sched.h> -#include <linux/namei.h> #include <linux/slab.h> #include <linux/pid_namespace.h> #include "internal.h" @@ -20,21 +19,20 @@ static int proc_thread_self_readlink(struct dentry *dentry, char __user *buffer, return readlink_copy(buffer, buflen, tmp); } -static void *proc_thread_self_follow_link(struct dentry *dentry, struct nameidata *nd) +static const char *proc_thread_self_follow_link(struct dentry *dentry, void **cookie) { struct pid_namespace *ns = dentry->d_sb->s_fs_info; pid_t tgid = task_tgid_nr_ns(current, ns); pid_t pid = task_pid_nr_ns(current, ns); - char *name = ERR_PTR(-ENOENT); - if (pid) { - name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF, GFP_KERNEL); - if (!name) - name = ERR_PTR(-ENOMEM); - else - sprintf(name, "%d/task/%d", tgid, pid); - } - nd_set_link(nd, name); - return NULL; + char *name; + + if (!pid) + return ERR_PTR(-ENOENT); + name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF, GFP_KERNEL); + if (!name) + return ERR_PTR(-ENOMEM); + sprintf(name, "%d/task/%d", tgid, pid); + return *cookie = name; } static const struct inode_operations proc_thread_self_inode_operations = { diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 8db932da4..8ebd9a334 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -17,7 +17,8 @@ static unsigned mounts_poll(struct file *file, poll_table *wait) { - struct proc_mounts *p = proc_mounts(file->private_data); + struct seq_file *m = file->private_data; + struct proc_mounts *p = m->private; struct mnt_namespace *ns = p->ns; unsigned res = POLLIN | POLLRDNORM; int event; @@ -25,8 +26,8 @@ static unsigned mounts_poll(struct file *file, poll_table *wait) poll_wait(file, &p->ns->poll, wait); event = ACCESS_ONCE(ns->event); - if (p->m.poll_event != event) { - p->m.poll_event = event; + if (m->poll_event != event) { + m->poll_event = event; res |= POLLERR | POLLPRI; } @@ -92,7 +93,7 @@ static void show_type(struct seq_file *m, struct super_block *sb) static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt) { - struct proc_mounts *p = proc_mounts(m); + struct proc_mounts *p = m->private; struct mount *r = real_mount(mnt); int err = 0; struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; @@ -126,7 +127,7 @@ out: static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt) { - struct proc_mounts *p = proc_mounts(m); + struct proc_mounts *p = m->private; struct mount *r = real_mount(mnt); struct super_block *sb = mnt->mnt_sb; struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; @@ -186,7 +187,7 @@ out: static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt) { - struct proc_mounts *p = proc_mounts(m); + struct proc_mounts *p = m->private; struct mount *r = real_mount(mnt); struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; struct super_block *sb = mnt_path.dentry->d_sb; @@ -236,6 +237,7 @@ static int mounts_open_common(struct inode *inode, struct file *file, struct mnt_namespace *ns = NULL; struct path root; struct proc_mounts *p; + struct seq_file *m; int ret = -EINVAL; if (!task) @@ -260,26 +262,21 @@ static int mounts_open_common(struct inode *inode, struct file *file, task_unlock(task); put_task_struct(task); - ret = -ENOMEM; - p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL); - if (!p) + ret = seq_open_private(file, &mounts_op, sizeof(struct proc_mounts)); + if (ret) goto err_put_path; - file->private_data = &p->m; - ret = seq_open(file, &mounts_op); - if (ret) - goto err_free; + m = file->private_data; + m->poll_event = ns->event; + p = m->private; p->ns = ns; p->root = root; - p->m.poll_event = ns->event; p->show = show; p->cached_event = ~0ULL; return 0; - err_free: - kfree(p); err_put_path: path_put(&root); err_put_ns: @@ -290,10 +287,11 @@ static int mounts_open_common(struct inode *inode, struct file *file, static int mounts_release(struct inode *inode, struct file *file) { - struct proc_mounts *p = proc_mounts(file->private_data); + struct seq_file *m = file->private_data; + struct proc_mounts *p = m->private; path_put(&p->root); put_mnt_ns(p->ns); - return seq_release(inode, file); + return seq_release_private(inode, file); } static int mounts_open(struct inode *inode, struct file *file) diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index c4c9a10c5..791743dee 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -299,7 +299,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, bool compressed; size_t total_len; - if (big_oops_buf) { + if (big_oops_buf && is_locked) { dst = big_oops_buf; hsize = sprintf(dst, "%s#%d Part%u\n", why, oopscount, part); @@ -456,6 +456,12 @@ int pstore_register(struct pstore_info *psi) add_timer(&pstore_timer); } + /* + * Update the module parameter backend, so it is visible + * through /sys/module/pstore/parameters/backend + */ + backend = psi->name; + pr_info("Registered %s as persistent store backend\n", psi->name); return 0; diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 44a549bee..6c26c4daa 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -186,12 +186,34 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, ssize_t size; ssize_t ecc_notice_size; struct ramoops_context *cxt = psi->data; - struct persistent_ram_zone *prz; - int header_length; + struct persistent_ram_zone *prz = NULL; + int header_length = 0; + + /* Ramoops headers provide time stamps for PSTORE_TYPE_DMESG, but + * PSTORE_TYPE_CONSOLE and PSTORE_TYPE_FTRACE don't currently have + * valid time stamps, so it is initialized to zero. + */ + time->tv_sec = 0; + time->tv_nsec = 0; + *compressed = false; + + /* Find the next valid persistent_ram_zone for DMESG */ + while (cxt->dump_read_cnt < cxt->max_dump_cnt && !prz) { + prz = ramoops_get_next_prz(cxt->przs, &cxt->dump_read_cnt, + cxt->max_dump_cnt, id, type, + PSTORE_TYPE_DMESG, 1); + if (!prz_ok(prz)) + continue; + header_length = ramoops_read_kmsg_hdr(persistent_ram_old(prz), + time, compressed); + /* Clear and skip this DMESG record if it has no valid header */ + if (!header_length) { + persistent_ram_free_old(prz); + persistent_ram_zap(prz); + prz = NULL; + } + } - prz = ramoops_get_next_prz(cxt->przs, &cxt->dump_read_cnt, - cxt->max_dump_cnt, id, type, - PSTORE_TYPE_DMESG, 1); if (!prz_ok(prz)) prz = ramoops_get_next_prz(&cxt->cprz, &cxt->console_read_cnt, 1, id, type, PSTORE_TYPE_CONSOLE, 0); @@ -204,13 +226,7 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, if (!prz_ok(prz)) return 0; - if (!persistent_ram_old(prz)) - return 0; - - size = persistent_ram_old_size(prz); - header_length = ramoops_read_kmsg_hdr(persistent_ram_old(prz), time, - compressed); - size -= header_length; + size = persistent_ram_old_size(prz) - header_length; /* ECC correction notice */ ecc_notice_size = persistent_ram_ecc_string(prz, NULL, 0); @@ -394,18 +410,16 @@ static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt, } for (i = 0; i < cxt->max_dump_cnt; i++) { - size_t sz = cxt->record_size; - - cxt->przs[i] = persistent_ram_new(*paddr, sz, 0, + cxt->przs[i] = persistent_ram_new(*paddr, cxt->record_size, 0, &cxt->ecc_info, cxt->memtype); if (IS_ERR(cxt->przs[i])) { err = PTR_ERR(cxt->przs[i]); dev_err(dev, "failed to request mem region (0x%zx@0x%llx): %d\n", - sz, (unsigned long long)*paddr, err); + cxt->record_size, (unsigned long long)*paddr, err); goto fail_prz; } - *paddr += sz; + *paddr += cxt->record_size; } return 0; @@ -608,7 +622,7 @@ static void ramoops_register_dummy(void) dummy_data->mem_size = mem_size; dummy_data->mem_address = mem_address; - dummy_data->mem_type = 0; + dummy_data->mem_type = mem_type; dummy_data->record_size = record_size; dummy_data->console_size = ramoops_console_size; dummy_data->ftrace_size = ramoops_ftrace_size; diff --git a/fs/qnx6/dir.c b/fs/qnx6/dir.c index 8d64bb536..e1f37278c 100644 --- a/fs/qnx6/dir.c +++ b/fs/qnx6/dir.c @@ -32,11 +32,6 @@ static struct page *qnx6_get_page(struct inode *dir, unsigned long n) return page; } -static inline unsigned long dir_pages(struct inode *inode) -{ - return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; -} - static unsigned last_entry(struct inode *inode, unsigned long page_nr) { unsigned long last_byte = inode->i_size; diff --git a/fs/read_write.c b/fs/read_write.c index 8ace6ec15..819ef3faf 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -494,30 +494,6 @@ ssize_t __vfs_write(struct file *file, const char __user *p, size_t count, } EXPORT_SYMBOL(__vfs_write); -vfs_readf_t vfs_readf(struct file *file) -{ - const struct file_operations *fop = file->f_op; - - if (fop->read) - return fop->read; - if (fop->read_iter) - return new_sync_read; - return ERR_PTR(-ENOSYS); -} -EXPORT_SYMBOL(vfs_readf); - -vfs_writef_t vfs_writef(struct file *file) -{ - const struct file_operations *fop = file->f_op; - - if (fop->write) - return fop->write; - if (fop->write_iter) - return new_sync_write; - return ERR_PTR(-ENOSYS); -} -EXPORT_SYMBOL(vfs_writef); - ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos) { mm_segment_t old_fs; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 0111ad046..0e4cf7281 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -21,6 +21,7 @@ #include "xattr.h" #include <linux/init.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/buffer_head.h> #include <linux/exportfs.h> #include <linux/quotaops.h> @@ -588,8 +589,7 @@ static struct kmem_cache *reiserfs_inode_cachep; static struct inode *reiserfs_alloc_inode(struct super_block *sb) { struct reiserfs_inode_info *ei; - ei = (struct reiserfs_inode_info *) - kmem_cache_alloc(reiserfs_inode_cachep, GFP_KERNEL); + ei = kmem_cache_alloc(reiserfs_inode_cachep, GFP_KERNEL); if (!ei) return NULL; atomic_set(&ei->openers, 0); diff --git a/fs/select.c b/fs/select.c index f684c750e..015547330 100644 --- a/fs/select.c +++ b/fs/select.c @@ -189,7 +189,7 @@ static int __pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) * doesn't imply write barrier and the users expect write * barrier semantics on wakeup functions. The following * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up() - * and is paired with set_mb() in poll_schedule_timeout. + * and is paired with smp_store_mb() in poll_schedule_timeout. */ smp_wmb(); pwq->triggered = 1; @@ -244,7 +244,7 @@ int poll_schedule_timeout(struct poll_wqueues *pwq, int state, /* * Prepare for the next iteration. * - * The following set_mb() serves two purposes. First, it's + * The following smp_store_mb() serves two purposes. First, it's * the counterpart rmb of the wmb in pollwake() such that data * written before wake up is always visible after wake up. * Second, the full barrier guarantees that triggered clearing @@ -252,7 +252,7 @@ int poll_schedule_timeout(struct poll_wqueues *pwq, int state, * this problem doesn't exist for the first iteration as * add_wait_queue() has full barrier semantics. */ - set_mb(pwq->triggered, 0); + smp_store_mb(pwq->triggered, 0); return rc; } diff --git a/fs/seq_file.c b/fs/seq_file.c index 555f82155..ce9e39fd5 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -48,18 +48,21 @@ static void *seq_buf_alloc(unsigned long size) * ERR_PTR(error). In the end of sequence they return %NULL. ->show() * returns 0 in case of success and negative number in case of error. * Returning SEQ_SKIP means "discard this element and move on". + * Note: seq_open() will allocate a struct seq_file and store its + * pointer in @file->private_data. This pointer should not be modified. */ int seq_open(struct file *file, const struct seq_operations *op) { - struct seq_file *p = file->private_data; + struct seq_file *p; + + WARN_ON(file->private_data); + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return -ENOMEM; + + file->private_data = p; - if (!p) { - p = kmalloc(sizeof(*p), GFP_KERNEL); - if (!p) - return -ENOMEM; - file->private_data = p; - } - memset(p, 0, sizeof(*p)); mutex_init(&p->lock); p->op = op; #ifdef CONFIG_USER_NS @@ -487,6 +490,20 @@ int seq_path(struct seq_file *m, const struct path *path, const char *esc) } EXPORT_SYMBOL(seq_path); +/** + * seq_file_path - seq_file interface to print a pathname of a file + * @m: the seq_file handle + * @file: the struct file to print + * @esc: set of characters to escape in the output + * + * return the absolute path to the file. + */ +int seq_file_path(struct seq_file *m, struct file *file, const char *esc) +{ + return seq_path(m, &file->f_path, esc); +} +EXPORT_SYMBOL(seq_file_path); + /* * Same as seq_path, but relative to supplied root. */ @@ -538,6 +555,7 @@ int seq_dentry(struct seq_file *m, struct dentry *dentry, const char *esc) return res; } +EXPORT_SYMBOL(seq_dentry); static void *single_start(struct seq_file *p, loff_t *pos) { diff --git a/fs/splice.c b/fs/splice.c index bfb3324fc..5fc1e50a7 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -261,6 +261,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, return ret; } +EXPORT_SYMBOL_GPL(splice_to_pipe); void spd_release_page(struct splice_pipe_desc *spd, unsigned int i) { @@ -359,7 +360,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, break; error = add_to_page_cache_lru(page, mapping, index, - GFP_KERNEL); + GFP_KERNEL & mapping_gfp_mask(mapping)); if (unlikely(error)) { page_cache_release(page); if (error == -EEXIST) @@ -1101,8 +1102,8 @@ EXPORT_SYMBOL(generic_splice_sendpage); /* * Attempt to initiate a splice from pipe to file. */ -long do_splice_from(struct pipe_inode_info *pipe, struct file *out, - loff_t *ppos, size_t len, unsigned int flags) +static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) { ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); @@ -1114,14 +1115,13 @@ long do_splice_from(struct pipe_inode_info *pipe, struct file *out, return splice_write(pipe, out, ppos, len, flags); } -EXPORT_SYMBOL(do_splice_from); /* * Attempt to initiate a splice from a file to a pipe. */ -long do_splice_to(struct file *in, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) +static long do_splice_to(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); @@ -1141,7 +1141,6 @@ long do_splice_to(struct file *in, loff_t *ppos, return splice_read(in, ppos, pipe, len, flags); } -EXPORT_SYMBOL(do_splice_to); /** * splice_direct_to_actor - splices data directly between two non-pipes diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h index 73588e770..d09fcd6fb 100644 --- a/fs/squashfs/squashfs_fs_i.h +++ b/fs/squashfs/squashfs_fs_i.h @@ -49,6 +49,6 @@ struct squashfs_inode_info { static inline struct squashfs_inode_info *squashfs_i(struct inode *inode) { - return list_entry(inode, struct squashfs_inode_info, vfs_inode); + return container_of(inode, struct squashfs_inode_info, vfs_inode); } #endif diff --git a/fs/super.c b/fs/super.c index 6fb221e8d..b61372354 100644 --- a/fs/super.c +++ b/fs/super.c @@ -36,7 +36,7 @@ #include "internal.h" -LIST_HEAD(super_blocks); +static LIST_HEAD(super_blocks); static DEFINE_SPINLOCK(sb_lock); static char *sb_writers_name[SB_FREEZE_LEVELS] = { @@ -842,7 +842,7 @@ int get_anon_bdev(dev_t *p) else if (error) return -EAGAIN; - if (dev == (1 << MINORBITS)) { + if (dev >= (1 << MINORBITS)) { spin_lock(&unnamed_dev_lock); ida_remove(&unnamed_dev_ida, dev); if (unnamed_dev_start > dev) diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 7c2867b44..6c95628ea 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -90,7 +90,7 @@ static ssize_t sysfs_kf_bin_read(struct kernfs_open_file *of, char *buf, return 0; if (size) { - if (pos > size) + if (pos >= size) return 0; if (pos + count > size) count = size - pos; diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index b400c0437..39a019936 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -135,7 +135,7 @@ static int internal_create_group(struct kobject *kobj, int update, * This function creates a group for the first time. It will explicitly * warn and error if any of the attribute files being created already exist. * - * Returns 0 on success or error. + * Returns 0 on success or error code on failure. */ int sysfs_create_group(struct kobject *kobj, const struct attribute_group *grp) @@ -155,7 +155,7 @@ EXPORT_SYMBOL_GPL(sysfs_create_group); * It will explicitly warn and error if any of the attribute files being * created already exist. * - * Returns 0 on success or error code from sysfs_create_group on error. + * Returns 0 on success or error code from sysfs_create_group on failure. */ int sysfs_create_groups(struct kobject *kobj, const struct attribute_group **groups) @@ -193,7 +193,7 @@ EXPORT_SYMBOL_GPL(sysfs_create_groups); * The primary use for this function is to call it after making a change * that affects group visibility. * - * Returns 0 on success or error. + * Returns 0 on success or error code on failure. */ int sysfs_update_group(struct kobject *kobj, const struct attribute_group *grp) diff --git a/fs/sysv/Makefile b/fs/sysv/Makefile index 3591f9d7a..7a75e70a4 100644 --- a/fs/sysv/Makefile +++ b/fs/sysv/Makefile @@ -5,4 +5,4 @@ obj-$(CONFIG_SYSV_FS) += sysv.o sysv-objs := ialloc.o balloc.o inode.o itree.o file.o dir.o \ - namei.o super.o symlink.o + namei.o super.o diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index 8f3555f00..63c1bcb22 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -33,11 +33,6 @@ static inline void dir_put_page(struct page *page) page_cache_release(page); } -static inline unsigned long dir_pages(struct inode *inode) -{ - return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; -} - static int dir_commit_chunk(struct page *page, loff_t pos, unsigned len) { struct address_space *mapping = page->mapping; diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 88956309c..590ad9206 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -166,8 +166,9 @@ void sysv_set_inode(struct inode *inode, dev_t rdev) inode->i_op = &sysv_symlink_inode_operations; inode->i_mapping->a_ops = &sysv_aops; } else { - inode->i_op = &sysv_fast_symlink_inode_operations; - nd_terminate_link(SYSV_I(inode)->i_data, inode->i_size, + inode->i_op = &simple_symlink_inode_operations; + inode->i_link = (char *)SYSV_I(inode)->i_data; + nd_terminate_link(inode->i_link, inode->i_size, sizeof(SYSV_I(inode)->i_data) - 1); } } else diff --git a/fs/sysv/symlink.c b/fs/sysv/symlink.c deleted file mode 100644 index d3fa0d703..000000000 --- a/fs/sysv/symlink.c +++ /dev/null @@ -1,20 +0,0 @@ -/* - * linux/fs/sysv/symlink.c - * - * Handling of System V filesystem fast symlinks extensions. - * Aug 2001, Christoph Hellwig (hch@infradead.org) - */ - -#include "sysv.h" -#include <linux/namei.h> - -static void *sysv_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - nd_set_link(nd, (char *)SYSV_I(d_inode(dentry))->i_data); - return NULL; -} - -const struct inode_operations sysv_fast_symlink_inode_operations = { - .readlink = generic_readlink, - .follow_link = sysv_follow_link, -}; diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h index 69d488986..6c212288a 100644 --- a/fs/sysv/sysv.h +++ b/fs/sysv/sysv.h @@ -73,7 +73,7 @@ struct sysv_inode_info { static inline struct sysv_inode_info *SYSV_I(struct inode *inode) { - return list_entry(inode, struct sysv_inode_info, vfs_inode); + return container_of(inode, struct sysv_inode_info, vfs_inode); } static inline struct sysv_sb_info *SYSV_SB(struct super_block *sb) @@ -161,7 +161,6 @@ extern ino_t sysv_inode_by_name(struct dentry *); extern const struct inode_operations sysv_file_inode_operations; extern const struct inode_operations sysv_dir_inode_operations; -extern const struct inode_operations sysv_fast_symlink_inode_operations; extern const struct file_operations sysv_file_operations; extern const struct file_operations sysv_dir_operations; extern const struct address_space_operations sysv_aops; diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index a43df11a1..cbc8d5d27 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -496,16 +496,11 @@ struct dentry *tracefs_create_instance_dir(const char *name, struct dentry *pare return dentry; } -static inline int tracefs_positive(struct dentry *dentry) -{ - return dentry->d_inode && !d_unhashed(dentry); -} - static int __tracefs_remove(struct dentry *dentry, struct dentry *parent) { int ret = 0; - if (tracefs_positive(dentry)) { + if (simple_positive(dentry)) { if (dentry->d_inode) { dget(dentry); switch (dentry->d_inode->i_mode & S_IFMT) { @@ -582,7 +577,7 @@ void tracefs_remove_recursive(struct dentry *dentry) */ spin_lock(&parent->d_lock); list_for_each_entry(child, &parent->d_subdirs, d_child) { - if (!tracefs_positive(child)) + if (!simple_positive(child)) continue; /* perhaps simple_empty(child) makes more sense */ @@ -603,7 +598,7 @@ void tracefs_remove_recursive(struct dentry *dentry) * from d_subdirs. When releasing the parent->d_lock we can * no longer trust that the next pointer is valid. * Restart the loop. We'll skip this one with the - * tracefs_positive() check. + * simple_positive() check. */ goto loop; } diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 27060fc85..5c27c66c2 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -889,6 +889,7 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry, memcpy(ui->data, symname, len); ((char *)ui->data)[len] = '\0'; + inode->i_link = ui->data; /* * The terminating zero byte is not written to the flash media and it * is put just to make later in-memory string processing simpler. Thus, diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 35efc103c..a3dfe2ae7 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -51,7 +51,6 @@ #include "ubifs.h" #include <linux/mount.h> -#include <linux/namei.h> #include <linux/slab.h> static int read_block(struct inode *inode, void *addr, unsigned int block, @@ -1300,14 +1299,6 @@ static void ubifs_invalidatepage(struct page *page, unsigned int offset, ClearPageChecked(page); } -static void *ubifs_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - struct ubifs_inode *ui = ubifs_inode(d_inode(dentry)); - - nd_set_link(nd, ui->data); - return NULL; -} - int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; @@ -1570,7 +1561,7 @@ const struct inode_operations ubifs_file_inode_operations = { const struct inode_operations ubifs_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = ubifs_follow_link, + .follow_link = simple_follow_link, .setattr = ubifs_setattr, .getattr = ubifs_getattr, .setxattr = ubifs_setxattr, diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 75e6f04bb..9547a2786 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -195,6 +195,7 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) } memcpy(ui->data, ino->data, ui->data_len); ((char *)ui->data)[ui->data_len] = '\0'; + inode->i_link = ui->data; break; case S_IFBLK: case S_IFCHR: @@ -2245,7 +2246,9 @@ static int __init ubifs_init(void) if (!ubifs_inode_slab) return -ENOMEM; - register_shrinker(&ubifs_shrinker_info); + err = register_shrinker(&ubifs_shrinker_info); + if (err) + goto out_slab; err = ubifs_compressors_init(); if (err) @@ -2269,6 +2272,7 @@ out_compr: ubifs_compressors_exit(); out_shrinker: unregister_shrinker(&ubifs_shrinker_info); +out_slab: kmem_cache_destroy(ubifs_inode_slab); return err; } diff --git a/fs/udf/dir.c b/fs/udf/dir.c index 541a12b57..541d9c650 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -168,7 +168,7 @@ static int udf_readdir(struct file *file, struct dir_context *ctx) } flen = udf_get_filename(sb, nameptr, lfi, fname, UDF_NAME_LEN); - if (!flen) + if (flen < 0) continue; tloc = lelb_to_cpu(cfi.icb.extLocation); diff --git a/fs/udf/file.c b/fs/udf/file.c index 7a95b8fed..bddf3d071 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -152,8 +152,6 @@ out: mutex_unlock(&inode->i_mutex); if (retval > 0) { - ssize_t err; - mark_inode_dirty(inode); err = generic_write_sync(file, iocb->ki_pos - retval, retval); if (err < 0) diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 6afac3d56..8d0b3ade0 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1652,17 +1652,9 @@ static int udf_update_inode(struct inode *inode, int do_sync) iinfo->i_ext.i_data, inode->i_sb->s_blocksize - sizeof(struct unallocSpaceEntry)); use->descTag.tagIdent = cpu_to_le16(TAG_IDENT_USE); - use->descTag.tagLocation = - cpu_to_le32(iinfo->i_location.logicalBlockNum); - crclen = sizeof(struct unallocSpaceEntry) + - iinfo->i_lenAlloc - sizeof(struct tag); - use->descTag.descCRCLength = cpu_to_le16(crclen); - use->descTag.descCRC = cpu_to_le16(crc_itu_t(0, (char *)use + - sizeof(struct tag), - crclen)); - use->descTag.tagChecksum = udf_tag_checksum(&use->descTag); + crclen = sizeof(struct unallocSpaceEntry); - goto out; + goto finish; } if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_UID_FORGET)) @@ -1782,6 +1774,8 @@ static int udf_update_inode(struct inode *inode, int do_sync) efe->descTag.tagIdent = cpu_to_le16(TAG_IDENT_EFE); crclen = sizeof(struct extendedFileEntry); } + +finish: if (iinfo->i_strat4096) { fe->icbTag.strategyType = cpu_to_le16(4096); fe->icbTag.strategyParameter = cpu_to_le16(1); @@ -1791,7 +1785,9 @@ static int udf_update_inode(struct inode *inode, int do_sync) fe->icbTag.numEntries = cpu_to_le16(1); } - if (S_ISDIR(inode->i_mode)) + if (iinfo->i_use) + fe->icbTag.fileType = ICBTAG_FILE_TYPE_USE; + else if (S_ISDIR(inode->i_mode)) fe->icbTag.fileType = ICBTAG_FILE_TYPE_DIRECTORY; else if (S_ISREG(inode->i_mode)) fe->icbTag.fileType = ICBTAG_FILE_TYPE_REGULAR; @@ -1828,7 +1824,6 @@ static int udf_update_inode(struct inode *inode, int do_sync) crclen)); fe->descTag.tagChecksum = udf_tag_checksum(&fe->descTag); -out: set_buffer_uptodate(bh); unlock_buffer(bh); diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 5c03f0dfb..c97b5a8d1 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -138,6 +138,25 @@ int udf_write_fi(struct inode *inode, struct fileIdentDesc *cfi, return 0; } +/** + * udf_find_entry - find entry in given directory. + * + * @dir: directory inode to search in + * @child: qstr of the name + * @fibh: buffer head / inode with file identifier descriptor we found + * @cfi: found file identifier descriptor with given name + * + * This function searches in the directory @dir for a file name @child. When + * found, @fibh points to the buffer head(s) (bh is NULL for in ICB + * directories) containing the file identifier descriptor (FID). In that case + * the function returns pointer to the FID in the buffer or inode - but note + * that FID may be split among two buffers (blocks) so accessing it via that + * pointer isn't easily possible. This pointer can be used only as an iterator + * for other directory manipulation functions. For inspection of the FID @cfi + * can be used - the found FID is copied there. + * + * Returns pointer to FID, NULL when nothing found, or error code. + */ static struct fileIdentDesc *udf_find_entry(struct inode *dir, const struct qstr *child, struct udf_fileident_bh *fibh, @@ -167,8 +186,11 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir, fibh->soffset = fibh->eoffset = f_pos & (sb->s_blocksize - 1); if (dinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { if (inode_bmap(dir, f_pos >> sb->s_blocksize_bits, &epos, - &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) + &eloc, &elen, &offset) != (EXT_RECORDED_ALLOCATED >> 30)) { + fi = ERR_PTR(-EIO); goto out_err; + } + block = udf_get_lb_pblock(sb, &eloc, offset); if ((++offset << sb->s_blocksize_bits) < elen) { if (dinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) @@ -179,19 +201,25 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir, offset = 0; fibh->sbh = fibh->ebh = udf_tread(sb, block); - if (!fibh->sbh) + if (!fibh->sbh) { + fi = ERR_PTR(-EIO); goto out_err; + } } fname = kmalloc(UDF_NAME_LEN, GFP_NOFS); - if (!fname) + if (!fname) { + fi = ERR_PTR(-ENOMEM); goto out_err; + } while (f_pos < size) { fi = udf_fileident_read(dir, &f_pos, fibh, cfi, &epos, &eloc, &elen, &offset); - if (!fi) + if (!fi) { + fi = ERR_PTR(-EIO); goto out_err; + } liu = le16_to_cpu(cfi->lengthOfImpUse); lfi = cfi->lengthFileIdent; @@ -234,12 +262,17 @@ static struct fileIdentDesc *udf_find_entry(struct inode *dir, continue; flen = udf_get_filename(sb, nameptr, lfi, fname, UDF_NAME_LEN); - if (flen && udf_match(flen, fname, child->len, child->name)) + if (flen < 0) { + fi = ERR_PTR(flen); + goto out_err; + } + + if (udf_match(flen, fname, child->len, child->name)) goto out_ok; } -out_err: fi = NULL; +out_err: if (fibh->sbh != fibh->ebh) brelse(fibh->ebh); brelse(fibh->sbh); @@ -256,6 +289,7 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry, struct inode *inode = NULL; struct fileIdentDesc cfi; struct udf_fileident_bh fibh; + struct fileIdentDesc *fi; if (dentry->d_name.len > UDF_NAME_LEN - 2) return ERR_PTR(-ENAMETOOLONG); @@ -275,7 +309,11 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry, } else #endif /* UDF_RECOVERY */ - if (udf_find_entry(dir, &dentry->d_name, &fibh, &cfi)) { + fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); + if (IS_ERR(fi)) + return ERR_CAST(fi); + + if (fi) { struct kernel_lb_addr loc; if (fibh.sbh != fibh.ebh) @@ -774,8 +812,11 @@ static int udf_rmdir(struct inode *dir, struct dentry *dentry) retval = -ENOENT; fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); - if (!fi) + if (IS_ERR_OR_NULL(fi)) { + if (fi) + retval = PTR_ERR(fi); goto out; + } retval = -EIO; tloc = lelb_to_cpu(cfi.icb.extLocation); @@ -817,8 +858,12 @@ static int udf_unlink(struct inode *dir, struct dentry *dentry) retval = -ENOENT; fi = udf_find_entry(dir, &dentry->d_name, &fibh, &cfi); - if (!fi) + + if (IS_ERR_OR_NULL(fi)) { + if (fi) + retval = PTR_ERR(fi); goto out; + } retval = -EIO; tloc = lelb_to_cpu(cfi.icb.extLocation); @@ -1049,24 +1094,30 @@ static int udf_rename(struct inode *old_dir, struct dentry *old_dentry, struct udf_inode_info *old_iinfo = UDF_I(old_inode); ofi = udf_find_entry(old_dir, &old_dentry->d_name, &ofibh, &ocfi); - if (ofi) { - if (ofibh.sbh != ofibh.ebh) - brelse(ofibh.ebh); - brelse(ofibh.sbh); + if (IS_ERR(ofi)) { + retval = PTR_ERR(ofi); + goto end_rename; } + + if (ofibh.sbh != ofibh.ebh) + brelse(ofibh.ebh); + + brelse(ofibh.sbh); tloc = lelb_to_cpu(ocfi.icb.extLocation); if (!ofi || udf_get_lb_pblock(old_dir->i_sb, &tloc, 0) != old_inode->i_ino) goto end_rename; nfi = udf_find_entry(new_dir, &new_dentry->d_name, &nfibh, &ncfi); - if (nfi) { - if (!new_inode) { - if (nfibh.sbh != nfibh.ebh) - brelse(nfibh.ebh); - brelse(nfibh.sbh); - nfi = NULL; - } + if (IS_ERR(nfi)) { + retval = PTR_ERR(nfi); + goto end_rename; + } + if (nfi && !new_inode) { + if (nfibh.sbh != nfibh.ebh) + brelse(nfibh.ebh); + brelse(nfibh.sbh); + nfi = NULL; } if (S_ISDIR(old_inode->i_mode)) { int offset = udf_ext0_offset(old_inode); @@ -1221,7 +1272,7 @@ static struct dentry *udf_nfs_get_inode(struct super_block *sb, u32 block, static struct dentry *udf_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { - if ((fh_len != 3 && fh_len != 5) || + if (fh_len < 3 || (fh_type != FILEID_UDF_WITH_PARENT && fh_type != FILEID_UDF_WITHOUT_PARENT)) return NULL; @@ -1233,7 +1284,7 @@ static struct dentry *udf_fh_to_dentry(struct super_block *sb, static struct dentry *udf_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { - if (fh_len != 5 || fh_type != FILEID_UDF_WITH_PARENT) + if (fh_len < 5 || fh_type != FILEID_UDF_WITH_PARENT) return NULL; return udf_nfs_get_inode(sb, fid->udf.parent_block, diff --git a/fs/udf/super.c b/fs/udf/super.c index 6299f3419..b96f190bc 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -927,17 +927,23 @@ static int udf_load_pvoldesc(struct super_block *sb, sector_t block) #endif } - if (!udf_build_ustr(instr, pvoldesc->volIdent, 32)) - if (udf_CS0toUTF8(outstr, instr)) { - strncpy(UDF_SB(sb)->s_volume_ident, outstr->u_name, - outstr->u_len > 31 ? 31 : outstr->u_len); - udf_debug("volIdent[] = '%s'\n", - UDF_SB(sb)->s_volume_ident); - } + if (!udf_build_ustr(instr, pvoldesc->volIdent, 32)) { + ret = udf_CS0toUTF8(outstr, instr); + if (ret < 0) + goto out_bh; + + strncpy(UDF_SB(sb)->s_volume_ident, outstr->u_name, + outstr->u_len > 31 ? 31 : outstr->u_len); + udf_debug("volIdent[] = '%s'\n", UDF_SB(sb)->s_volume_ident); + } - if (!udf_build_ustr(instr, pvoldesc->volSetIdent, 128)) - if (udf_CS0toUTF8(outstr, instr)) - udf_debug("volSetIdent[] = '%s'\n", outstr->u_name); + if (!udf_build_ustr(instr, pvoldesc->volSetIdent, 128)) { + ret = udf_CS0toUTF8(outstr, instr); + if (ret < 0) + goto out_bh; + + udf_debug("volSetIdent[] = '%s'\n", outstr->u_name); + } ret = 0; out_bh: diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c index 8dfbc4025..862535b3b 100644 --- a/fs/udf/symlink.c +++ b/fs/udf/symlink.c @@ -82,6 +82,9 @@ static int udf_pc_to_char(struct super_block *sb, unsigned char *from, comp_len = udf_get_filename(sb, pc->componentIdent, pc->lengthComponentIdent, p, tolen); + if (comp_len < 0) + return comp_len; + p += comp_len; tolen -= comp_len; if (tolen == 0) diff --git a/fs/udf/udf_i.h b/fs/udf/udf_i.h index b5cd8ed2a..b1b9a63d8 100644 --- a/fs/udf/udf_i.h +++ b/fs/udf/udf_i.h @@ -56,7 +56,7 @@ struct udf_inode_info { static inline struct udf_inode_info *UDF_I(struct inode *inode) { - return list_entry(inode, struct udf_inode_info, vfs_inode); + return container_of(inode, struct udf_inode_info, vfs_inode); } #endif /* _UDF_I_H) */ diff --git a/fs/udf/unicode.c b/fs/udf/unicode.c index b84fee372..ab478e62b 100644 --- a/fs/udf/unicode.c +++ b/fs/udf/unicode.c @@ -68,21 +68,16 @@ int udf_build_ustr(struct ustr *dest, dstring *ptr, int size) /* * udf_build_ustr_exact */ -static int udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize) +static void udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize) { - if ((!dest) || (!ptr) || (!exactsize)) - return -1; - memset(dest, 0, sizeof(struct ustr)); dest->u_cmpID = ptr[0]; dest->u_len = exactsize - 1; memcpy(dest->u_name, ptr + 1, exactsize - 1); - - return 0; } /* - * udf_ocu_to_utf8 + * udf_CS0toUTF8 * * PURPOSE * Convert OSTA Compressed Unicode to the UTF-8 equivalent. @@ -94,7 +89,7 @@ static int udf_build_ustr_exact(struct ustr *dest, dstring *ptr, int exactsize) * both of type "struct ustr *" * * POST-CONDITIONS - * <return> Zero on success. + * <return> >= 0 on success. * * HISTORY * November 12, 1997 - Andrew E. Mileski @@ -117,7 +112,7 @@ int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i) memset(utf_o, 0, sizeof(struct ustr)); pr_err("unknown compression code (%d) stri=%s\n", cmp_id, ocu_i->u_name); - return 0; + return -EINVAL; } ocu = ocu_i->u_name; @@ -154,7 +149,7 @@ int udf_CS0toUTF8(struct ustr *utf_o, const struct ustr *ocu_i) /* * - * udf_utf8_to_ocu + * udf_UTF8toCS0 * * PURPOSE * Convert UTF-8 to the OSTA Compressed Unicode equivalent. @@ -270,7 +265,7 @@ static int udf_CS0toNLS(struct nls_table *nls, struct ustr *utf_o, memset(utf_o, 0, sizeof(struct ustr)); pr_err("unknown compression code (%d) stri=%s\n", cmp_id, ocu_i->u_name); - return 0; + return -EINVAL; } ocu = ocu_i->u_name; @@ -338,43 +333,51 @@ int udf_get_filename(struct super_block *sb, uint8_t *sname, int slen, uint8_t *dname, int dlen) { struct ustr *filename, *unifilename; - int len = 0; + int ret; + + if (!slen) + return -EIO; filename = kmalloc(sizeof(struct ustr), GFP_NOFS); if (!filename) - return 0; + return -ENOMEM; unifilename = kmalloc(sizeof(struct ustr), GFP_NOFS); - if (!unifilename) + if (!unifilename) { + ret = -ENOMEM; goto out1; + } - if (udf_build_ustr_exact(unifilename, sname, slen)) - goto out2; - + udf_build_ustr_exact(unifilename, sname, slen); if (UDF_QUERY_FLAG(sb, UDF_FLAG_UTF8)) { - if (!udf_CS0toUTF8(filename, unifilename)) { + ret = udf_CS0toUTF8(filename, unifilename); + if (ret < 0) { udf_debug("Failed in udf_get_filename: sname = %s\n", sname); goto out2; } } else if (UDF_QUERY_FLAG(sb, UDF_FLAG_NLS_MAP)) { - if (!udf_CS0toNLS(UDF_SB(sb)->s_nls_map, filename, - unifilename)) { + ret = udf_CS0toNLS(UDF_SB(sb)->s_nls_map, filename, + unifilename); + if (ret < 0) { udf_debug("Failed in udf_get_filename: sname = %s\n", sname); goto out2; } } else - goto out2; + BUG(); - len = udf_translate_to_linux(dname, dlen, + ret = udf_translate_to_linux(dname, dlen, filename->u_name, filename->u_len, unifilename->u_name, unifilename->u_len); + /* Zero length filename isn't valid... */ + if (ret == 0) + ret = -EINVAL; out2: kfree(unifilename); out1: kfree(filename); - return len; + return ret; } int udf_put_filename(struct super_block *sb, const uint8_t *sname, diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index 1bfe8cabf..74f2e8028 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -65,11 +65,6 @@ static inline void ufs_put_page(struct page *page) page_cache_release(page); } -static inline unsigned long ufs_dir_pages(struct inode *inode) -{ - return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT; -} - ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr) { ino_t res = 0; @@ -87,7 +82,8 @@ ino_t ufs_inode_by_name(struct inode *dir, const struct qstr *qstr) /* Releases the page */ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, - struct page *page, struct inode *inode) + struct page *page, struct inode *inode, + bool update_times) { loff_t pos = page_offset(page) + (char *) de - (char *) page_address(page); @@ -103,7 +99,8 @@ void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, err = ufs_commit_chunk(page, pos, len); ufs_put_page(page); - dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; + if (update_times) + dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(dir); } @@ -256,7 +253,7 @@ struct ufs_dir_entry *ufs_find_entry(struct inode *dir, const struct qstr *qstr, int namelen = qstr->len; unsigned reclen = UFS_DIR_REC_LEN(namelen); unsigned long start, n; - unsigned long npages = ufs_dir_pages(dir); + unsigned long npages = dir_pages(dir); struct page *page = NULL; struct ufs_inode_info *ui = UFS_I(dir); struct ufs_dir_entry *de; @@ -320,7 +317,7 @@ int ufs_add_link(struct dentry *dentry, struct inode *inode) unsigned short rec_len, name_len; struct page *page = NULL; struct ufs_dir_entry *de; - unsigned long npages = ufs_dir_pages(dir); + unsigned long npages = dir_pages(dir); unsigned long n; char *kaddr; loff_t pos; @@ -437,7 +434,7 @@ ufs_readdir(struct file *file, struct dir_context *ctx) struct super_block *sb = inode->i_sb; unsigned int offset = pos & ~PAGE_CACHE_MASK; unsigned long n = pos >> PAGE_CACHE_SHIFT; - unsigned long npages = ufs_dir_pages(inode); + unsigned long npages = dir_pages(inode); unsigned chunk_mask = ~(UFS_SB(sb)->s_uspi->s_dirblksize - 1); int need_revalidate = file->f_version != inode->i_version; unsigned flags = UFS_SB(sb)->s_flags; @@ -608,7 +605,7 @@ int ufs_empty_dir(struct inode * inode) { struct super_block *sb = inode->i_sb; struct page *page = NULL; - unsigned long i, npages = ufs_dir_pages(inode); + unsigned long i, npages = dir_pages(inode); for (i = 0; i < npages; i++) { char *kaddr; diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 2d93ab07d..f913a6924 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -572,9 +572,10 @@ static void ufs_set_inode_ops(struct inode *inode) inode->i_fop = &ufs_dir_operations; inode->i_mapping->a_ops = &ufs_aops; } else if (S_ISLNK(inode->i_mode)) { - if (!inode->i_blocks) + if (!inode->i_blocks) { inode->i_op = &ufs_fast_symlink_inode_operations; - else { + inode->i_link = (char *)UFS_I(inode)->i_u1.i_symlink; + } else { inode->i_op = &ufs_symlink_inode_operations; inode->i_mapping->a_ops = &ufs_aops; } diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 60ee32249..479665543 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -56,11 +56,9 @@ static struct dentry *ufs_lookup(struct inode * dir, struct dentry *dentry, unsi if (dentry->d_name.len > UFS_MAXNAMLEN) return ERR_PTR(-ENAMETOOLONG); - lock_ufs(dir->i_sb); ino = ufs_inode_by_name(dir, &dentry->d_name); if (ino) inode = ufs_iget(dir->i_sb, ino); - unlock_ufs(dir->i_sb); return d_splice_alias(inode, dentry); } @@ -76,24 +74,16 @@ static int ufs_create (struct inode * dir, struct dentry * dentry, umode_t mode, bool excl) { struct inode *inode; - int err; - - UFSD("BEGIN\n"); inode = ufs_new_inode(dir, mode); - err = PTR_ERR(inode); + if (IS_ERR(inode)) + return PTR_ERR(inode); - if (!IS_ERR(inode)) { - inode->i_op = &ufs_file_inode_operations; - inode->i_fop = &ufs_file_operations; - inode->i_mapping->a_ops = &ufs_aops; - mark_inode_dirty(inode); - lock_ufs(dir->i_sb); - err = ufs_add_nondir(dentry, inode); - unlock_ufs(dir->i_sb); - } - UFSD("END: err=%d\n", err); - return err; + inode->i_op = &ufs_file_inode_operations; + inode->i_fop = &ufs_file_operations; + inode->i_mapping->a_ops = &ufs_aops; + mark_inode_dirty(inode); + return ufs_add_nondir(dentry, inode); } static int ufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) @@ -110,9 +100,7 @@ static int ufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev init_special_inode(inode, mode, rdev); ufs_set_inode_dev(inode->i_sb, UFS_I(inode), rdev); mark_inode_dirty(inode); - lock_ufs(dir->i_sb); err = ufs_add_nondir(dentry, inode); - unlock_ufs(dir->i_sb); } return err; } @@ -121,18 +109,17 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry, const char * symname) { struct super_block * sb = dir->i_sb; - int err = -ENAMETOOLONG; + int err; unsigned l = strlen(symname)+1; struct inode * inode; if (l > sb->s_blocksize) - goto out_notlocked; + return -ENAMETOOLONG; - lock_ufs(dir->i_sb); inode = ufs_new_inode(dir, S_IFLNK | S_IRWXUGO); err = PTR_ERR(inode); if (IS_ERR(inode)) - goto out; + return err; if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) { /* slow symlink */ @@ -144,22 +131,19 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry, } else { /* fast symlink */ inode->i_op = &ufs_fast_symlink_inode_operations; - memcpy(UFS_I(inode)->i_u1.i_symlink, symname, l); + inode->i_link = (char *)UFS_I(inode)->i_u1.i_symlink; + memcpy(inode->i_link, symname, l); inode->i_size = l-1; } mark_inode_dirty(inode); - err = ufs_add_nondir(dentry, inode); -out: - unlock_ufs(dir->i_sb); -out_notlocked: - return err; + return ufs_add_nondir(dentry, inode); out_fail: inode_dec_link_count(inode); unlock_new_inode(inode); iput(inode); - goto out; + return err; } static int ufs_link (struct dentry * old_dentry, struct inode * dir, @@ -168,8 +152,6 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir, struct inode *inode = d_inode(old_dentry); int error; - lock_ufs(dir->i_sb); - inode->i_ctime = CURRENT_TIME_SEC; inode_inc_link_count(inode); ihold(inode); @@ -180,7 +162,6 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir, iput(inode); } else d_instantiate(dentry, inode); - unlock_ufs(dir->i_sb); return error; } @@ -189,7 +170,6 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) struct inode * inode; int err; - lock_ufs(dir->i_sb); inode_inc_link_count(dir); inode = ufs_new_inode(dir, S_IFDIR|mode); @@ -210,12 +190,10 @@ static int ufs_mkdir(struct inode * dir, struct dentry * dentry, umode_t mode) err = ufs_add_link(dentry, inode); if (err) goto out_fail; - unlock_ufs(dir->i_sb); unlock_new_inode(inode); d_instantiate(dentry, inode); -out: - return err; + return 0; out_fail: inode_dec_link_count(inode); @@ -224,8 +202,7 @@ out_fail: iput (inode); out_dir: inode_dec_link_count(dir); - unlock_ufs(dir->i_sb); - goto out; + return err; } static int ufs_unlink(struct inode *dir, struct dentry *dentry) @@ -255,7 +232,6 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry) struct inode * inode = d_inode(dentry); int err= -ENOTEMPTY; - lock_ufs(dir->i_sb); if (ufs_empty_dir (inode)) { err = ufs_unlink(dir, dentry); if (!err) { @@ -264,7 +240,6 @@ static int ufs_rmdir (struct inode * dir, struct dentry *dentry) inode_dec_link_count(dir); } } - unlock_ufs(dir->i_sb); return err; } @@ -302,7 +277,7 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, new_de = ufs_find_entry(new_dir, &new_dentry->d_name, &new_page); if (!new_de) goto out_dir; - ufs_set_link(new_dir, new_de, new_page, old_inode); + ufs_set_link(new_dir, new_de, new_page, old_inode, 1); new_inode->i_ctime = CURRENT_TIME_SEC; if (dir_de) drop_nlink(new_inode); @@ -325,7 +300,12 @@ static int ufs_rename(struct inode *old_dir, struct dentry *old_dentry, mark_inode_dirty(old_inode); if (dir_de) { - ufs_set_link(old_inode, dir_de, dir_page, new_dir); + if (old_dir != new_dir) + ufs_set_link(old_inode, dir_de, dir_page, new_dir, 0); + else { + kunmap(dir_page); + page_cache_release(dir_page); + } inode_dec_link_count(old_dir); } return 0; diff --git a/fs/ufs/super.c b/fs/ufs/super.c index dc33f9416..250579a80 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -80,6 +80,7 @@ #include <linux/stat.h> #include <linux/string.h> #include <linux/blkdev.h> +#include <linux/backing-dev.h> #include <linux/init.h> #include <linux/parser.h> #include <linux/buffer_head.h> diff --git a/fs/ufs/symlink.c b/fs/ufs/symlink.c index 5b537e2fd..874480bb4 100644 --- a/fs/ufs/symlink.c +++ b/fs/ufs/symlink.c @@ -25,23 +25,12 @@ * ext2 symlink handling code */ -#include <linux/fs.h> -#include <linux/namei.h> - #include "ufs_fs.h" #include "ufs.h" - -static void *ufs_follow_link(struct dentry *dentry, struct nameidata *nd) -{ - struct ufs_inode_info *p = UFS_I(d_inode(dentry)); - nd_set_link(nd, (char*)p->i_u1.i_symlink); - return NULL; -} - const struct inode_operations ufs_fast_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = ufs_follow_link, + .follow_link = simple_follow_link, .setattr = ufs_setattr, }; diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index cf6368d42..2e31ea2e3 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -106,7 +106,7 @@ extern int ufs_delete_entry(struct inode *, struct ufs_dir_entry *, struct page extern int ufs_empty_dir (struct inode *); extern struct ufs_dir_entry *ufs_dotdot(struct inode *, struct page **); extern void ufs_set_link(struct inode *dir, struct ufs_dir_entry *de, - struct page *page, struct inode *inode); + struct page *page, struct inode *inode, bool update_times); /* file.c */ extern const struct inode_operations ufs_file_inode_operations; diff --git a/fs/xattr.c b/fs/xattr.c index 6bb630399..072fee125 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -207,7 +207,6 @@ vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value, *xattr_value = value; return error; } -EXPORT_SYMBOL(vfs_getxattr_alloc); /* Compare an extended attribute value with the given value */ int vfs_xattr_cmp(struct dentry *dentry, const char *xattr_name, @@ -299,18 +298,18 @@ vfs_removexattr(struct dentry *dentry, const char *name) mutex_lock(&inode->i_mutex); error = security_inode_removexattr(dentry, name); - if (error) { - mutex_unlock(&inode->i_mutex); - return error; - } + if (error) + goto out; error = inode->i_op->removexattr(dentry, name); - mutex_unlock(&inode->i_mutex); if (!error) { fsnotify_xattr(dentry); evm_inode_post_removexattr(dentry, name); } + +out: + mutex_unlock(&inode->i_mutex); return error; } EXPORT_SYMBOL_GPL(vfs_removexattr); diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 516162be1..f9e9ffe6f 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -149,13 +149,27 @@ xfs_alloc_compute_aligned( { xfs_agblock_t bno; xfs_extlen_t len; + xfs_extlen_t diff; /* Trim busy sections out of found extent */ xfs_extent_busy_trim(args, foundbno, foundlen, &bno, &len); + /* + * If we have a largish extent that happens to start before min_agbno, + * see if we can shift it into range... + */ + if (bno < args->min_agbno && bno + len > args->min_agbno) { + diff = args->min_agbno - bno; + if (len > diff) { + bno += diff; + len -= diff; + } + } + if (args->alignment > 1 && len >= args->minlen) { xfs_agblock_t aligned_bno = roundup(bno, args->alignment); - xfs_extlen_t diff = aligned_bno - bno; + + diff = aligned_bno - bno; *resbno = aligned_bno; *reslen = diff >= len ? 0 : len - diff; @@ -795,9 +809,13 @@ xfs_alloc_find_best_extent( * The good extent is closer than this one. */ if (!dir) { + if (*sbnoa > args->max_agbno) + goto out_use_good; if (*sbnoa >= args->agbno + gdiff) goto out_use_good; } else { + if (*sbnoa < args->min_agbno) + goto out_use_good; if (*sbnoa <= args->agbno - gdiff) goto out_use_good; } @@ -884,6 +902,17 @@ xfs_alloc_ag_vextent_near( dofirst = prandom_u32() & 1; #endif + /* handle unitialized agbno range so caller doesn't have to */ + if (!args->min_agbno && !args->max_agbno) + args->max_agbno = args->mp->m_sb.sb_agblocks - 1; + ASSERT(args->min_agbno <= args->max_agbno); + + /* clamp agbno to the range if it's outside */ + if (args->agbno < args->min_agbno) + args->agbno = args->min_agbno; + if (args->agbno > args->max_agbno) + args->agbno = args->max_agbno; + restart: bno_cur_lt = NULL; bno_cur_gt = NULL; @@ -976,6 +1005,8 @@ restart: <bnoa, <lena); if (ltlena < args->minlen) continue; + if (ltbnoa < args->min_agbno || ltbnoa > args->max_agbno) + continue; args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); xfs_alloc_fix_len(args); ASSERT(args->len >= args->minlen); @@ -1096,11 +1127,11 @@ restart: XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); xfs_alloc_compute_aligned(args, ltbno, ltlen, <bnoa, <lena); - if (ltlena >= args->minlen) + if (ltlena >= args->minlen && ltbnoa >= args->min_agbno) break; if ((error = xfs_btree_decrement(bno_cur_lt, 0, &i))) goto error0; - if (!i) { + if (!i || ltbnoa < args->min_agbno) { xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR); bno_cur_lt = NULL; @@ -1112,11 +1143,11 @@ restart: XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); xfs_alloc_compute_aligned(args, gtbno, gtlen, >bnoa, >lena); - if (gtlena >= args->minlen) + if (gtlena >= args->minlen && gtbnoa <= args->max_agbno) break; if ((error = xfs_btree_increment(bno_cur_gt, 0, &i))) goto error0; - if (!i) { + if (!i || gtbnoa > args->max_agbno) { xfs_btree_del_cursor(bno_cur_gt, XFS_BTREE_NOERROR); bno_cur_gt = NULL; @@ -1216,6 +1247,7 @@ restart: ASSERT(ltnew >= ltbno); ASSERT(ltnew + rlen <= ltbnoa + ltlena); ASSERT(ltnew + rlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); + ASSERT(ltnew >= args->min_agbno && ltnew <= args->max_agbno); args->agbno = ltnew; if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur_lt, ltbno, ltlen, @@ -1825,11 +1857,11 @@ xfs_alloc_compute_maxlevels( xfs_extlen_t xfs_alloc_longest_free_extent( struct xfs_mount *mp, - struct xfs_perag *pag) + struct xfs_perag *pag, + xfs_extlen_t need) { - xfs_extlen_t need, delta = 0; + xfs_extlen_t delta = 0; - need = XFS_MIN_FREELIST_PAG(pag, mp); if (need > pag->pagf_flcount) delta = need - pag->pagf_flcount; @@ -1838,131 +1870,150 @@ xfs_alloc_longest_free_extent( return pag->pagf_flcount > 0 || pag->pagf_longest > 0; } +unsigned int +xfs_alloc_min_freelist( + struct xfs_mount *mp, + struct xfs_perag *pag) +{ + unsigned int min_free; + + /* space needed by-bno freespace btree */ + min_free = min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_BNOi] + 1, + mp->m_ag_maxlevels); + /* space needed by-size freespace btree */ + min_free += min_t(unsigned int, pag->pagf_levels[XFS_BTNUM_CNTi] + 1, + mp->m_ag_maxlevels); + + return min_free; +} + +/* + * Check if the operation we are fixing up the freelist for should go ahead or + * not. If we are freeing blocks, we always allow it, otherwise the allocation + * is dependent on whether the size and shape of free space available will + * permit the requested allocation to take place. + */ +static bool +xfs_alloc_space_available( + struct xfs_alloc_arg *args, + xfs_extlen_t min_free, + int flags) +{ + struct xfs_perag *pag = args->pag; + xfs_extlen_t longest; + int available; + + if (flags & XFS_ALLOC_FLAG_FREEING) + return true; + + /* do we have enough contiguous free space for the allocation? */ + longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free); + if ((args->minlen + args->alignment + args->minalignslop - 1) > longest) + return false; + + /* do have enough free space remaining for the allocation? */ + available = (int)(pag->pagf_freeblks + pag->pagf_flcount - + min_free - args->total); + if (available < (int)args->minleft) + return false; + + return true; +} + /* * Decide whether to use this allocation group for this allocation. * If so, fix up the btree freelist's size. */ STATIC int /* error */ xfs_alloc_fix_freelist( - xfs_alloc_arg_t *args, /* allocation argument structure */ - int flags) /* XFS_ALLOC_FLAG_... */ + struct xfs_alloc_arg *args, /* allocation argument structure */ + int flags) /* XFS_ALLOC_FLAG_... */ { - xfs_buf_t *agbp; /* agf buffer pointer */ - xfs_agf_t *agf; /* a.g. freespace structure pointer */ - xfs_buf_t *agflbp;/* agfl buffer pointer */ - xfs_agblock_t bno; /* freelist block */ - xfs_extlen_t delta; /* new blocks needed in freelist */ - int error; /* error result code */ - xfs_extlen_t longest;/* longest extent in allocation group */ - xfs_mount_t *mp; /* file system mount point structure */ - xfs_extlen_t need; /* total blocks needed in freelist */ - xfs_perag_t *pag; /* per-ag information structure */ - xfs_alloc_arg_t targs; /* local allocation arguments */ - xfs_trans_t *tp; /* transaction pointer */ - - mp = args->mp; + struct xfs_mount *mp = args->mp; + struct xfs_perag *pag = args->pag; + struct xfs_trans *tp = args->tp; + struct xfs_buf *agbp = NULL; + struct xfs_buf *agflbp = NULL; + struct xfs_alloc_arg targs; /* local allocation arguments */ + xfs_agblock_t bno; /* freelist block */ + xfs_extlen_t need; /* total blocks needed in freelist */ + int error; - pag = args->pag; - tp = args->tp; if (!pag->pagf_init) { - if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags, - &agbp))) - return error; + error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp); + if (error) + goto out_no_agbp; if (!pag->pagf_init) { ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK); ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); - args->agbp = NULL; - return 0; + goto out_agbp_relse; } - } else - agbp = NULL; + } /* - * If this is a metadata preferred pag and we are user data - * then try somewhere else if we are not being asked to - * try harder at this point + * If this is a metadata preferred pag and we are user data then try + * somewhere else if we are not being asked to try harder at this + * point */ if (pag->pagf_metadata && args->userdata && (flags & XFS_ALLOC_FLAG_TRYLOCK)) { ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); - args->agbp = NULL; - return 0; + goto out_agbp_relse; } - if (!(flags & XFS_ALLOC_FLAG_FREEING)) { - /* - * If it looks like there isn't a long enough extent, or enough - * total blocks, reject it. - */ - need = XFS_MIN_FREELIST_PAG(pag, mp); - longest = xfs_alloc_longest_free_extent(mp, pag); - if ((args->minlen + args->alignment + args->minalignslop - 1) > - longest || - ((int)(pag->pagf_freeblks + pag->pagf_flcount - - need - args->total) < (int)args->minleft)) { - if (agbp) - xfs_trans_brelse(tp, agbp); - args->agbp = NULL; - return 0; - } - } + need = xfs_alloc_min_freelist(mp, pag); + if (!xfs_alloc_space_available(args, need, flags)) + goto out_agbp_relse; /* * Get the a.g. freespace buffer. * Can fail if we're not blocking on locks, and it's held. */ - if (agbp == NULL) { - if ((error = xfs_alloc_read_agf(mp, tp, args->agno, flags, - &agbp))) - return error; - if (agbp == NULL) { + if (!agbp) { + error = xfs_alloc_read_agf(mp, tp, args->agno, flags, &agbp); + if (error) + goto out_no_agbp; + if (!agbp) { ASSERT(flags & XFS_ALLOC_FLAG_TRYLOCK); ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); - args->agbp = NULL; - return 0; - } - } - /* - * Figure out how many blocks we should have in the freelist. - */ - agf = XFS_BUF_TO_AGF(agbp); - need = XFS_MIN_FREELIST(agf, mp); - /* - * If there isn't enough total or single-extent, reject it. - */ - if (!(flags & XFS_ALLOC_FLAG_FREEING)) { - delta = need > be32_to_cpu(agf->agf_flcount) ? - (need - be32_to_cpu(agf->agf_flcount)) : 0; - longest = be32_to_cpu(agf->agf_longest); - longest = (longest > delta) ? (longest - delta) : - (be32_to_cpu(agf->agf_flcount) > 0 || longest > 0); - if ((args->minlen + args->alignment + args->minalignslop - 1) > - longest || - ((int)(be32_to_cpu(agf->agf_freeblks) + - be32_to_cpu(agf->agf_flcount) - need - args->total) < - (int)args->minleft)) { - xfs_trans_brelse(tp, agbp); - args->agbp = NULL; - return 0; + goto out_no_agbp; } } + + /* If there isn't enough total space or single-extent, reject it. */ + need = xfs_alloc_min_freelist(mp, pag); + if (!xfs_alloc_space_available(args, need, flags)) + goto out_agbp_relse; + /* * Make the freelist shorter if it's too long. + * + * Note that from this point onwards, we will always release the agf and + * agfl buffers on error. This handles the case where we error out and + * the buffers are clean or may not have been joined to the transaction + * and hence need to be released manually. If they have been joined to + * the transaction, then xfs_trans_brelse() will handle them + * appropriately based on the recursion count and dirty state of the + * buffer. + * + * XXX (dgc): When we have lots of free space, does this buy us + * anything other than extra overhead when we need to put more blocks + * back on the free list? Maybe we should only do this when space is + * getting low or the AGFL is more than half full? */ - while (be32_to_cpu(agf->agf_flcount) > need) { - xfs_buf_t *bp; + while (pag->pagf_flcount > need) { + struct xfs_buf *bp; error = xfs_alloc_get_freelist(tp, agbp, &bno, 0); if (error) - return error; - if ((error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1))) - return error; + goto out_agbp_relse; + error = xfs_free_ag_extent(tp, agbp, args->agno, bno, 1, 1); + if (error) + goto out_agbp_relse; bp = xfs_btree_get_bufs(mp, tp, args->agno, bno, 0); xfs_trans_binval(tp, bp); } - /* - * Initialize the args structure. - */ + memset(&targs, 0, sizeof(targs)); targs.tp = tp; targs.mp = mp; @@ -1971,21 +2022,20 @@ xfs_alloc_fix_freelist( targs.alignment = targs.minlen = targs.prod = targs.isfl = 1; targs.type = XFS_ALLOCTYPE_THIS_AG; targs.pag = pag; - if ((error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp))) - return error; - /* - * Make the freelist longer if it's too short. - */ - while (be32_to_cpu(agf->agf_flcount) < need) { + error = xfs_alloc_read_agfl(mp, tp, targs.agno, &agflbp); + if (error) + goto out_agbp_relse; + + /* Make the freelist longer if it's too short. */ + while (pag->pagf_flcount < need) { targs.agbno = 0; - targs.maxlen = need - be32_to_cpu(agf->agf_flcount); - /* - * Allocate as many blocks as possible at once. - */ - if ((error = xfs_alloc_ag_vextent(&targs))) { - xfs_trans_brelse(tp, agflbp); - return error; - } + targs.maxlen = need - pag->pagf_flcount; + + /* Allocate as many blocks as possible at once. */ + error = xfs_alloc_ag_vextent(&targs); + if (error) + goto out_agflbp_relse; + /* * Stop if we run out. Won't happen if callers are obeying * the restrictions correctly. Can happen for free calls @@ -1994,9 +2044,7 @@ xfs_alloc_fix_freelist( if (targs.agbno == NULLAGBLOCK) { if (flags & XFS_ALLOC_FLAG_FREEING) break; - xfs_trans_brelse(tp, agflbp); - args->agbp = NULL; - return 0; + goto out_agflbp_relse; } /* * Put each allocated block on the list. @@ -2005,12 +2053,21 @@ xfs_alloc_fix_freelist( error = xfs_alloc_put_freelist(tp, agbp, agflbp, bno, 0); if (error) - return error; + goto out_agflbp_relse; } } xfs_trans_brelse(tp, agflbp); args->agbp = agbp; return 0; + +out_agflbp_relse: + xfs_trans_brelse(tp, agflbp); +out_agbp_relse: + if (agbp) + xfs_trans_brelse(tp, agbp); +out_no_agbp: + args->agbp = NULL; + return error; } /* diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index d1b4b6a5c..ca1c81683 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -112,6 +112,8 @@ typedef struct xfs_alloc_arg { xfs_extlen_t total; /* total blocks needed in xaction */ xfs_extlen_t alignment; /* align answer to multiple of this */ xfs_extlen_t minalignslop; /* slop for minlen+alignment calcs */ + xfs_agblock_t min_agbno; /* set an agbno range for NEAR allocs */ + xfs_agblock_t max_agbno; /* ... */ xfs_extlen_t len; /* output: actual size of extent */ xfs_alloctype_t type; /* allocation type XFS_ALLOCTYPE_... */ xfs_alloctype_t otype; /* original allocation type */ @@ -128,11 +130,9 @@ typedef struct xfs_alloc_arg { #define XFS_ALLOC_USERDATA 1 /* allocation is for user data*/ #define XFS_ALLOC_INITIAL_USER_DATA 2 /* special case start of file */ -/* - * Find the length of the longest extent in an AG. - */ -xfs_extlen_t -xfs_alloc_longest_free_extent(struct xfs_mount *mp, +xfs_extlen_t xfs_alloc_longest_free_extent(struct xfs_mount *mp, + struct xfs_perag *pag, xfs_extlen_t need); +unsigned int xfs_alloc_min_freelist(struct xfs_mount *mp, struct xfs_perag *pag); /* diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index 0a472fbe0..3349c9a1e 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -266,7 +266,7 @@ xfs_attr_set( tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; error = xfs_trans_reserve(args.trans, &tres, args.total, 0); if (error) { - xfs_trans_cancel(args.trans, 0); + xfs_trans_cancel(args.trans); return error; } xfs_ilock(dp, XFS_ILOCK_EXCL); @@ -276,7 +276,7 @@ xfs_attr_set( XFS_QMOPT_RES_REGBLKS); if (error) { xfs_iunlock(dp, XFS_ILOCK_EXCL); - xfs_trans_cancel(args.trans, XFS_TRANS_RELEASE_LOG_RES); + xfs_trans_cancel(args.trans); return error; } @@ -320,8 +320,7 @@ xfs_attr_set( xfs_trans_ichgtime(args.trans, dp, XFS_ICHGTIME_CHG); } - err2 = xfs_trans_commit(args.trans, - XFS_TRANS_RELEASE_LOG_RES); + err2 = xfs_trans_commit(args.trans); xfs_iunlock(dp, XFS_ILOCK_EXCL); return error ? error : err2; @@ -383,16 +382,14 @@ xfs_attr_set( * Commit the last in the sequence of transactions. */ xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE); - error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(args.trans); xfs_iunlock(dp, XFS_ILOCK_EXCL); return error; out: - if (args.trans) { - xfs_trans_cancel(args.trans, - XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); - } + if (args.trans) + xfs_trans_cancel(args.trans); xfs_iunlock(dp, XFS_ILOCK_EXCL); return error; } @@ -462,7 +459,7 @@ xfs_attr_remove( error = xfs_trans_reserve(args.trans, &M_RES(mp)->tr_attrrm, XFS_ATTRRM_SPACE_RES(mp), 0); if (error) { - xfs_trans_cancel(args.trans, 0); + xfs_trans_cancel(args.trans); return error; } @@ -501,16 +498,14 @@ xfs_attr_remove( * Commit the last in the sequence of transactions. */ xfs_trans_log_inode(args.trans, dp, XFS_ILOG_CORE); - error = xfs_trans_commit(args.trans, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(args.trans); xfs_iunlock(dp, XFS_ILOCK_EXCL); return error; out: - if (args.trans) { - xfs_trans_cancel(args.trans, - XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); - } + if (args.trans) + xfs_trans_cancel(args.trans); xfs_iunlock(dp, XFS_ILOCK_EXCL); return error; } diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index f1026e86d..63e05b663 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -1112,7 +1112,6 @@ xfs_bmap_add_attrfork( int committed; /* xaction was committed */ int logflags; /* logging flags */ int error; /* error return value */ - int cancel_flags = 0; ASSERT(XFS_IFORK_Q(ip) == 0); @@ -1124,17 +1123,15 @@ xfs_bmap_add_attrfork( tp->t_flags |= XFS_TRANS_RESERVE; error = xfs_trans_reserve(tp, &M_RES(mp)->tr_addafork, blks, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; xfs_ilock(ip, XFS_ILOCK_EXCL); error = xfs_trans_reserve_quota_nblks(tp, ip, blks, 0, rsvd ? XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_FORCE_RES : XFS_QMOPT_RES_REGBLKS); if (error) goto trans_cancel; - cancel_flags |= XFS_TRANS_ABORT; if (XFS_IFORK_Q(ip)) goto trans_cancel; if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) { @@ -1218,14 +1215,14 @@ xfs_bmap_add_attrfork( error = xfs_bmap_finish(&tp, &flist, &committed); if (error) goto bmap_cancel; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; bmap_cancel: xfs_bmap_cancel(&flist); trans_cancel: - xfs_trans_cancel(tp, cancel_flags); + xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -3521,7 +3518,8 @@ xfs_bmap_longest_free_extent( } } - longest = xfs_alloc_longest_free_extent(mp, pag); + longest = xfs_alloc_longest_free_extent(mp, pag, + xfs_alloc_min_freelist(mp, pag)); if (*blen < longest) *blen = longest; @@ -4424,7 +4422,15 @@ xfs_bmapi_convert_unwritten( error = xfs_bmap_add_extent_unwritten_real(bma->tp, bma->ip, &bma->idx, &bma->cur, mval, bma->firstblock, bma->flist, &tmp_logflags); - bma->logflags |= tmp_logflags; + /* + * Log the inode core unconditionally in the unwritten extent conversion + * path because the conversion might not have done so (e.g., if the + * extent count hasn't changed). We need to make sure the inode is dirty + * in the transaction for the sake of fsync(), even if nothing has + * changed, because fsync() will not force the log for this transaction + * unless it sees the inode pinned. + */ + bma->logflags |= tmp_logflags | XFS_ILOG_CORE; if (error) return error; @@ -5918,7 +5924,7 @@ xfs_bmap_split_extent( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, XFS_DIOSTRAT_SPACE_RES(mp, 0), 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -5936,10 +5942,9 @@ xfs_bmap_split_extent( if (error) goto out; - return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - + return xfs_trans_commit(tp); out: - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_trans_cancel(tp); return error; } diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index 4daaa6623..a0ae57205 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -170,7 +170,7 @@ typedef struct xfs_sb { __uint32_t sb_features_log_incompat; __uint32_t sb_crc; /* superblock crc */ - __uint32_t sb_pad; + xfs_extlen_t sb_spino_align; /* sparse inode chunk alignment */ xfs_ino_t sb_pquotino; /* project quota inode */ xfs_lsn_t sb_lsn; /* last write sequence */ @@ -256,7 +256,7 @@ typedef struct xfs_dsb { __be32 sb_features_log_incompat; __le32 sb_crc; /* superblock crc */ - __be32 sb_pad; + __be32 sb_spino_align; /* sparse inode chunk alignment */ __be64 sb_pquotino; /* project quota inode */ __be64 sb_lsn; /* last write sequence */ @@ -457,8 +457,10 @@ xfs_sb_has_ro_compat_feature( } #define XFS_SB_FEAT_INCOMPAT_FTYPE (1 << 0) /* filetype in dirent */ +#define XFS_SB_FEAT_INCOMPAT_SPINODES (1 << 1) /* sparse inode chunks */ #define XFS_SB_FEAT_INCOMPAT_ALL \ - (XFS_SB_FEAT_INCOMPAT_FTYPE) + (XFS_SB_FEAT_INCOMPAT_FTYPE| \ + XFS_SB_FEAT_INCOMPAT_SPINODES) #define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL static inline bool @@ -506,6 +508,12 @@ static inline int xfs_sb_version_hasfinobt(xfs_sb_t *sbp) (sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_FINOBT); } +static inline bool xfs_sb_version_hassparseinodes(struct xfs_sb *sbp) +{ + return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 && + xfs_sb_has_incompat_feature(sbp, XFS_SB_FEAT_INCOMPAT_SPINODES); +} + /* * end of superblock version macros */ @@ -758,19 +766,6 @@ typedef struct xfs_agfl { #define XFS_AGFL_CRC_OFF offsetof(struct xfs_agfl, agfl_crc) - -#define XFS_AG_MAXLEVELS(mp) ((mp)->m_ag_maxlevels) -#define XFS_MIN_FREELIST_RAW(bl,cl,mp) \ - (MIN(bl + 1, XFS_AG_MAXLEVELS(mp)) + MIN(cl + 1, XFS_AG_MAXLEVELS(mp))) -#define XFS_MIN_FREELIST(a,mp) \ - (XFS_MIN_FREELIST_RAW( \ - be32_to_cpu((a)->agf_levels[XFS_BTNUM_BNOi]), \ - be32_to_cpu((a)->agf_levels[XFS_BTNUM_CNTi]), mp)) -#define XFS_MIN_FREELIST_PAG(pag,mp) \ - (XFS_MIN_FREELIST_RAW( \ - (unsigned int)(pag)->pagf_levels[XFS_BTNUM_BNOi], \ - (unsigned int)(pag)->pagf_levels[XFS_BTNUM_CNTi], mp)) - #define XFS_AGB_TO_FSB(mp,agno,agbno) \ (((xfs_fsblock_t)(agno) << (mp)->m_sb.sb_agblklog) | (agbno)) #define XFS_FSB_TO_AGNO(mp,fsbno) \ @@ -1216,26 +1211,54 @@ typedef __uint64_t xfs_inofree_t; #define XFS_INOBT_ALL_FREE ((xfs_inofree_t)-1) #define XFS_INOBT_MASK(i) ((xfs_inofree_t)1 << (i)) +#define XFS_INOBT_HOLEMASK_FULL 0 /* holemask for full chunk */ +#define XFS_INOBT_HOLEMASK_BITS (NBBY * sizeof(__uint16_t)) +#define XFS_INODES_PER_HOLEMASK_BIT \ + (XFS_INODES_PER_CHUNK / (NBBY * sizeof(__uint16_t))) + static inline xfs_inofree_t xfs_inobt_maskn(int i, int n) { return ((n >= XFS_INODES_PER_CHUNK ? 0 : XFS_INOBT_MASK(n)) - 1) << i; } /* - * Data record structure + * The on-disk inode record structure has two formats. The original "full" + * format uses a 4-byte freecount. The "sparse" format uses a 1-byte freecount + * and replaces the 3 high-order freecount bytes wth the holemask and inode + * count. + * + * The holemask of the sparse record format allows an inode chunk to have holes + * that refer to blocks not owned by the inode record. This facilitates inode + * allocation in the event of severe free space fragmentation. */ typedef struct xfs_inobt_rec { __be32 ir_startino; /* starting inode number */ - __be32 ir_freecount; /* count of free inodes (set bits) */ + union { + struct { + __be32 ir_freecount; /* count of free inodes */ + } f; + struct { + __be16 ir_holemask;/* hole mask for sparse chunks */ + __u8 ir_count; /* total inode count */ + __u8 ir_freecount; /* count of free inodes */ + } sp; + } ir_u; __be64 ir_free; /* free inode mask */ } xfs_inobt_rec_t; typedef struct xfs_inobt_rec_incore { xfs_agino_t ir_startino; /* starting inode number */ - __int32_t ir_freecount; /* count of free inodes (set bits) */ + __uint16_t ir_holemask; /* hole mask for sparse chunks */ + __uint8_t ir_count; /* total inode count */ + __uint8_t ir_freecount; /* count of free inodes (set bits) */ xfs_inofree_t ir_free; /* free inode mask */ } xfs_inobt_rec_incore_t; +static inline bool xfs_inobt_issparse(uint16_t holemask) +{ + /* non-zero holemask represents a sparse rec. */ + return holemask; +} /* * Key structure @@ -1453,8 +1476,8 @@ struct xfs_acl { sizeof(struct xfs_acl_entry) * XFS_ACL_MAX_ENTRIES((mp))) /* On-disk XFS extended attribute names */ -#define SGI_ACL_FILE (unsigned char *)"SGI_ACL_FILE" -#define SGI_ACL_DEFAULT (unsigned char *)"SGI_ACL_DEFAULT" +#define SGI_ACL_FILE "SGI_ACL_FILE" +#define SGI_ACL_DEFAULT "SGI_ACL_DEFAULT" #define SGI_ACL_FILE_SIZE (sizeof(SGI_ACL_FILE)-1) #define SGI_ACL_DEFAULT_SIZE (sizeof(SGI_ACL_DEFAULT)-1) diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 18dc721ca..89689c6a4 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -239,6 +239,7 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_V5SB 0x8000 /* version 5 superblock */ #define XFS_FSOP_GEOM_FLAGS_FTYPE 0x10000 /* inode directory types */ #define XFS_FSOP_GEOM_FLAGS_FINOBT 0x20000 /* free inode btree */ +#define XFS_FSOP_GEOM_FLAGS_SPINODES 0x40000 /* sparse inode chunks */ /* * Minimum and maximum sizes need for growth checks. diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 1c9e75521..66efc7024 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -65,6 +65,8 @@ xfs_inobt_lookup( int *stat) /* success/failure */ { cur->bc_rec.i.ir_startino = ino; + cur->bc_rec.i.ir_holemask = 0; + cur->bc_rec.i.ir_count = 0; cur->bc_rec.i.ir_freecount = 0; cur->bc_rec.i.ir_free = 0; return xfs_btree_lookup(cur, dir, stat); @@ -82,7 +84,14 @@ xfs_inobt_update( union xfs_btree_rec rec; rec.inobt.ir_startino = cpu_to_be32(irec->ir_startino); - rec.inobt.ir_freecount = cpu_to_be32(irec->ir_freecount); + if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) { + rec.inobt.ir_u.sp.ir_holemask = cpu_to_be16(irec->ir_holemask); + rec.inobt.ir_u.sp.ir_count = irec->ir_count; + rec.inobt.ir_u.sp.ir_freecount = irec->ir_freecount; + } else { + /* ir_holemask/ir_count not supported on-disk */ + rec.inobt.ir_u.f.ir_freecount = cpu_to_be32(irec->ir_freecount); + } rec.inobt.ir_free = cpu_to_be64(irec->ir_free); return xfs_btree_update(cur, &rec); } @@ -100,12 +109,27 @@ xfs_inobt_get_rec( int error; error = xfs_btree_get_rec(cur, &rec, stat); - if (!error && *stat == 1) { - irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino); - irec->ir_freecount = be32_to_cpu(rec->inobt.ir_freecount); - irec->ir_free = be64_to_cpu(rec->inobt.ir_free); + if (error || *stat == 0) + return error; + + irec->ir_startino = be32_to_cpu(rec->inobt.ir_startino); + if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) { + irec->ir_holemask = be16_to_cpu(rec->inobt.ir_u.sp.ir_holemask); + irec->ir_count = rec->inobt.ir_u.sp.ir_count; + irec->ir_freecount = rec->inobt.ir_u.sp.ir_freecount; + } else { + /* + * ir_holemask/ir_count not supported on-disk. Fill in hardcoded + * values for full inode chunks. + */ + irec->ir_holemask = XFS_INOBT_HOLEMASK_FULL; + irec->ir_count = XFS_INODES_PER_CHUNK; + irec->ir_freecount = + be32_to_cpu(rec->inobt.ir_u.f.ir_freecount); } - return error; + irec->ir_free = be64_to_cpu(rec->inobt.ir_free); + + return 0; } /* @@ -114,10 +138,14 @@ xfs_inobt_get_rec( STATIC int xfs_inobt_insert_rec( struct xfs_btree_cur *cur, + __uint16_t holemask, + __uint8_t count, __int32_t freecount, xfs_inofree_t free, int *stat) { + cur->bc_rec.i.ir_holemask = holemask; + cur->bc_rec.i.ir_count = count; cur->bc_rec.i.ir_freecount = freecount; cur->bc_rec.i.ir_free = free; return xfs_btree_insert(cur, stat); @@ -154,7 +182,9 @@ xfs_inobt_insert( } ASSERT(i == 0); - error = xfs_inobt_insert_rec(cur, XFS_INODES_PER_CHUNK, + error = xfs_inobt_insert_rec(cur, XFS_INOBT_HOLEMASK_FULL, + XFS_INODES_PER_CHUNK, + XFS_INODES_PER_CHUNK, XFS_INOBT_ALL_FREE, &i); if (error) { xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); @@ -220,6 +250,7 @@ xfs_ialloc_inode_init( struct xfs_mount *mp, struct xfs_trans *tp, struct list_head *buffer_list, + int icount, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_agblock_t length, @@ -275,7 +306,7 @@ xfs_ialloc_inode_init( * they track in the AIL as if they were physically logged. */ if (tp) - xfs_icreate_log(tp, agno, agbno, mp->m_ialloc_inos, + xfs_icreate_log(tp, agno, agbno, icount, mp->m_sb.sb_inodesize, length, gen); } else version = 2; @@ -347,6 +378,214 @@ xfs_ialloc_inode_init( } /* + * Align startino and allocmask for a recently allocated sparse chunk such that + * they are fit for insertion (or merge) into the on-disk inode btrees. + * + * Background: + * + * When enabled, sparse inode support increases the inode alignment from cluster + * size to inode chunk size. This means that the minimum range between two + * non-adjacent inode records in the inobt is large enough for a full inode + * record. This allows for cluster sized, cluster aligned block allocation + * without need to worry about whether the resulting inode record overlaps with + * another record in the tree. Without this basic rule, we would have to deal + * with the consequences of overlap by potentially undoing recent allocations in + * the inode allocation codepath. + * + * Because of this alignment rule (which is enforced on mount), there are two + * inobt possibilities for newly allocated sparse chunks. One is that the + * aligned inode record for the chunk covers a range of inodes not already + * covered in the inobt (i.e., it is safe to insert a new sparse record). The + * other is that a record already exists at the aligned startino that considers + * the newly allocated range as sparse. In the latter case, record content is + * merged in hope that sparse inode chunks fill to full chunks over time. + */ +STATIC void +xfs_align_sparse_ino( + struct xfs_mount *mp, + xfs_agino_t *startino, + uint16_t *allocmask) +{ + xfs_agblock_t agbno; + xfs_agblock_t mod; + int offset; + + agbno = XFS_AGINO_TO_AGBNO(mp, *startino); + mod = agbno % mp->m_sb.sb_inoalignmt; + if (!mod) + return; + + /* calculate the inode offset and align startino */ + offset = mod << mp->m_sb.sb_inopblog; + *startino -= offset; + + /* + * Since startino has been aligned down, left shift allocmask such that + * it continues to represent the same physical inodes relative to the + * new startino. + */ + *allocmask <<= offset / XFS_INODES_PER_HOLEMASK_BIT; +} + +/* + * Determine whether the source inode record can merge into the target. Both + * records must be sparse, the inode ranges must match and there must be no + * allocation overlap between the records. + */ +STATIC bool +__xfs_inobt_can_merge( + struct xfs_inobt_rec_incore *trec, /* tgt record */ + struct xfs_inobt_rec_incore *srec) /* src record */ +{ + uint64_t talloc; + uint64_t salloc; + + /* records must cover the same inode range */ + if (trec->ir_startino != srec->ir_startino) + return false; + + /* both records must be sparse */ + if (!xfs_inobt_issparse(trec->ir_holemask) || + !xfs_inobt_issparse(srec->ir_holemask)) + return false; + + /* both records must track some inodes */ + if (!trec->ir_count || !srec->ir_count) + return false; + + /* can't exceed capacity of a full record */ + if (trec->ir_count + srec->ir_count > XFS_INODES_PER_CHUNK) + return false; + + /* verify there is no allocation overlap */ + talloc = xfs_inobt_irec_to_allocmask(trec); + salloc = xfs_inobt_irec_to_allocmask(srec); + if (talloc & salloc) + return false; + + return true; +} + +/* + * Merge the source inode record into the target. The caller must call + * __xfs_inobt_can_merge() to ensure the merge is valid. + */ +STATIC void +__xfs_inobt_rec_merge( + struct xfs_inobt_rec_incore *trec, /* target */ + struct xfs_inobt_rec_incore *srec) /* src */ +{ + ASSERT(trec->ir_startino == srec->ir_startino); + + /* combine the counts */ + trec->ir_count += srec->ir_count; + trec->ir_freecount += srec->ir_freecount; + + /* + * Merge the holemask and free mask. For both fields, 0 bits refer to + * allocated inodes. We combine the allocated ranges with bitwise AND. + */ + trec->ir_holemask &= srec->ir_holemask; + trec->ir_free &= srec->ir_free; +} + +/* + * Insert a new sparse inode chunk into the associated inode btree. The inode + * record for the sparse chunk is pre-aligned to a startino that should match + * any pre-existing sparse inode record in the tree. This allows sparse chunks + * to fill over time. + * + * This function supports two modes of handling preexisting records depending on + * the merge flag. If merge is true, the provided record is merged with the + * existing record and updated in place. The merged record is returned in nrec. + * If merge is false, an existing record is replaced with the provided record. + * If no preexisting record exists, the provided record is always inserted. + * + * It is considered corruption if a merge is requested and not possible. Given + * the sparse inode alignment constraints, this should never happen. + */ +STATIC int +xfs_inobt_insert_sprec( + struct xfs_mount *mp, + struct xfs_trans *tp, + struct xfs_buf *agbp, + int btnum, + struct xfs_inobt_rec_incore *nrec, /* in/out: new/merged rec. */ + bool merge) /* merge or replace */ +{ + struct xfs_btree_cur *cur; + struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); + xfs_agnumber_t agno = be32_to_cpu(agi->agi_seqno); + int error; + int i; + struct xfs_inobt_rec_incore rec; + + cur = xfs_inobt_init_cursor(mp, tp, agbp, agno, btnum); + + /* the new record is pre-aligned so we know where to look */ + error = xfs_inobt_lookup(cur, nrec->ir_startino, XFS_LOOKUP_EQ, &i); + if (error) + goto error; + /* if nothing there, insert a new record and return */ + if (i == 0) { + error = xfs_inobt_insert_rec(cur, nrec->ir_holemask, + nrec->ir_count, nrec->ir_freecount, + nrec->ir_free, &i); + if (error) + goto error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error); + + goto out; + } + + /* + * A record exists at this startino. Merge or replace the record + * depending on what we've been asked to do. + */ + if (merge) { + error = xfs_inobt_get_rec(cur, &rec, &i); + if (error) + goto error; + XFS_WANT_CORRUPTED_GOTO(mp, i == 1, error); + XFS_WANT_CORRUPTED_GOTO(mp, + rec.ir_startino == nrec->ir_startino, + error); + + /* + * This should never fail. If we have coexisting records that + * cannot merge, something is seriously wrong. + */ + XFS_WANT_CORRUPTED_GOTO(mp, __xfs_inobt_can_merge(nrec, &rec), + error); + + trace_xfs_irec_merge_pre(mp, agno, rec.ir_startino, + rec.ir_holemask, nrec->ir_startino, + nrec->ir_holemask); + + /* merge to nrec to output the updated record */ + __xfs_inobt_rec_merge(nrec, &rec); + + trace_xfs_irec_merge_post(mp, agno, nrec->ir_startino, + nrec->ir_holemask); + + error = xfs_inobt_rec_check_count(mp, nrec); + if (error) + goto error; + } + + error = xfs_inobt_update(cur, nrec); + if (error) + goto error; + +out: + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + return 0; +error: + xfs_btree_del_cursor(cur, XFS_BTREE_ERROR); + return error; +} + +/* * Allocate new inodes in the allocation group specified by agbp. * Return 0 for success, else error code. */ @@ -364,11 +603,22 @@ xfs_ialloc_ag_alloc( xfs_agino_t newlen; /* new number of inodes */ int isaligned = 0; /* inode allocation at stripe unit */ /* boundary */ + uint16_t allocmask = (uint16_t) -1; /* init. to full chunk */ + struct xfs_inobt_rec_incore rec; struct xfs_perag *pag; + int do_sparse = 0; memset(&args, 0, sizeof(args)); args.tp = tp; args.mp = tp->t_mountp; + args.fsbno = NULLFSBLOCK; + +#ifdef DEBUG + /* randomly do sparse inode allocations */ + if (xfs_sb_version_hassparseinodes(&tp->t_mountp->m_sb) && + args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks) + do_sparse = prandom_u32() & 1; +#endif /* * Locking will ensure that we don't have two callers in here @@ -390,6 +640,8 @@ xfs_ialloc_ag_alloc( agno = be32_to_cpu(agi->agi_seqno); args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + args.mp->m_ialloc_blks; + if (do_sparse) + goto sparse_alloc; if (likely(newino != NULLAGINO && (args.agbno < be32_to_cpu(agi->agi_length)))) { args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); @@ -428,8 +680,7 @@ xfs_ialloc_ag_alloc( * subsequent requests. */ args.minalignslop = 0; - } else - args.fsbno = NULLFSBLOCK; + } if (unlikely(args.fsbno == NULLFSBLOCK)) { /* @@ -480,6 +731,47 @@ xfs_ialloc_ag_alloc( return error; } + /* + * Finally, try a sparse allocation if the filesystem supports it and + * the sparse allocation length is smaller than a full chunk. + */ + if (xfs_sb_version_hassparseinodes(&args.mp->m_sb) && + args.mp->m_ialloc_min_blks < args.mp->m_ialloc_blks && + args.fsbno == NULLFSBLOCK) { +sparse_alloc: + args.type = XFS_ALLOCTYPE_NEAR_BNO; + args.agbno = be32_to_cpu(agi->agi_root); + args.fsbno = XFS_AGB_TO_FSB(args.mp, agno, args.agbno); + args.alignment = args.mp->m_sb.sb_spino_align; + args.prod = 1; + + args.minlen = args.mp->m_ialloc_min_blks; + args.maxlen = args.minlen; + + /* + * The inode record will be aligned to full chunk size. We must + * prevent sparse allocation from AG boundaries that result in + * invalid inode records, such as records that start at agbno 0 + * or extend beyond the AG. + * + * Set min agbno to the first aligned, non-zero agbno and max to + * the last aligned agbno that is at least one full chunk from + * the end of the AG. + */ + args.min_agbno = args.mp->m_sb.sb_inoalignmt; + args.max_agbno = round_down(args.mp->m_sb.sb_agblocks, + args.mp->m_sb.sb_inoalignmt) - + args.mp->m_ialloc_blks; + + error = xfs_alloc_vextent(&args); + if (error) + return error; + + newlen = args.len << args.mp->m_sb.sb_inopblog; + ASSERT(newlen <= XFS_INODES_PER_CHUNK); + allocmask = (1 << (newlen / XFS_INODES_PER_HOLEMASK_BIT)) - 1; + } + if (args.fsbno == NULLFSBLOCK) { *alloc = 0; return 0; @@ -495,8 +787,8 @@ xfs_ialloc_ag_alloc( * rather than a linear progression to prevent the next generation * number from being easily guessable. */ - error = xfs_ialloc_inode_init(args.mp, tp, NULL, agno, args.agbno, - args.len, prandom_u32()); + error = xfs_ialloc_inode_init(args.mp, tp, NULL, newlen, agno, + args.agbno, args.len, prandom_u32()); if (error) return error; @@ -504,6 +796,73 @@ xfs_ialloc_ag_alloc( * Convert the results. */ newino = XFS_OFFBNO_TO_AGINO(args.mp, args.agbno, 0); + + if (xfs_inobt_issparse(~allocmask)) { + /* + * We've allocated a sparse chunk. Align the startino and mask. + */ + xfs_align_sparse_ino(args.mp, &newino, &allocmask); + + rec.ir_startino = newino; + rec.ir_holemask = ~allocmask; + rec.ir_count = newlen; + rec.ir_freecount = newlen; + rec.ir_free = XFS_INOBT_ALL_FREE; + + /* + * Insert the sparse record into the inobt and allow for a merge + * if necessary. If a merge does occur, rec is updated to the + * merged record. + */ + error = xfs_inobt_insert_sprec(args.mp, tp, agbp, XFS_BTNUM_INO, + &rec, true); + if (error == -EFSCORRUPTED) { + xfs_alert(args.mp, + "invalid sparse inode record: ino 0x%llx holemask 0x%x count %u", + XFS_AGINO_TO_INO(args.mp, agno, + rec.ir_startino), + rec.ir_holemask, rec.ir_count); + xfs_force_shutdown(args.mp, SHUTDOWN_CORRUPT_INCORE); + } + if (error) + return error; + + /* + * We can't merge the part we've just allocated as for the inobt + * due to finobt semantics. The original record may or may not + * exist independent of whether physical inodes exist in this + * sparse chunk. + * + * We must update the finobt record based on the inobt record. + * rec contains the fully merged and up to date inobt record + * from the previous call. Set merge false to replace any + * existing record with this one. + */ + if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) { + error = xfs_inobt_insert_sprec(args.mp, tp, agbp, + XFS_BTNUM_FINO, &rec, + false); + if (error) + return error; + } + } else { + /* full chunk - insert new records to both btrees */ + error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen, + XFS_BTNUM_INO); + if (error) + return error; + + if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) { + error = xfs_inobt_insert(args.mp, tp, agbp, newino, + newlen, XFS_BTNUM_FINO); + if (error) + return error; + } + } + + /* + * Update AGI counts and newino. + */ be32_add_cpu(&agi->agi_count, newlen); be32_add_cpu(&agi->agi_freecount, newlen); pag = xfs_perag_get(args.mp, agno); @@ -512,20 +871,6 @@ xfs_ialloc_ag_alloc( agi->agi_newino = cpu_to_be32(newino); /* - * Insert records describing the new inode chunk into the btrees. - */ - error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen, - XFS_BTNUM_INO); - if (error) - return error; - - if (xfs_sb_version_hasfinobt(&args.mp->m_sb)) { - error = xfs_inobt_insert(args.mp, tp, agbp, newino, newlen, - XFS_BTNUM_FINO); - if (error) - return error; - } - /* * Log allocation group header fields */ xfs_ialloc_log_agi(tp, agbp, @@ -645,7 +990,7 @@ xfs_ialloc_ag_select( * if we fail allocation due to alignment issues then it is most * likely a real ENOSPC condition. */ - ineed = mp->m_ialloc_blks; + ineed = mp->m_ialloc_min_blks; if (flags && ineed > 1) ineed += xfs_ialloc_cluster_alignment(mp); longest = pag->pagf_longest; @@ -732,6 +1077,27 @@ xfs_ialloc_get_rec( } /* + * Return the offset of the first free inode in the record. If the inode chunk + * is sparsely allocated, we convert the record holemask to inode granularity + * and mask off the unallocated regions from the inode free mask. + */ +STATIC int +xfs_inobt_first_free_inode( + struct xfs_inobt_rec_incore *rec) +{ + xfs_inofree_t realfree; + + /* if there are no holes, return the first available offset */ + if (!xfs_inobt_issparse(rec->ir_holemask)) + return xfs_lowbit64(rec->ir_free); + + realfree = xfs_inobt_irec_to_allocmask(rec); + realfree &= rec->ir_free; + + return xfs_lowbit64(realfree); +} + +/* * Allocate an inode using the inobt-only algorithm. */ STATIC int @@ -961,7 +1327,7 @@ newino: } alloc_inode: - offset = xfs_lowbit64(rec.ir_free); + offset = xfs_inobt_first_free_inode(&rec); ASSERT(offset >= 0); ASSERT(offset < XFS_INODES_PER_CHUNK); ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % @@ -1210,7 +1576,7 @@ xfs_dialloc_ag( if (error) goto error_cur; - offset = xfs_lowbit64(rec.ir_free); + offset = xfs_inobt_first_free_inode(&rec); ASSERT(offset >= 0); ASSERT(offset < XFS_INODES_PER_CHUNK); ASSERT((XFS_AGINO_TO_OFFSET(mp, rec.ir_startino) % @@ -1439,6 +1805,83 @@ out_error: return error; } +/* + * Free the blocks of an inode chunk. We must consider that the inode chunk + * might be sparse and only free the regions that are allocated as part of the + * chunk. + */ +STATIC void +xfs_difree_inode_chunk( + struct xfs_mount *mp, + xfs_agnumber_t agno, + struct xfs_inobt_rec_incore *rec, + struct xfs_bmap_free *flist) +{ + xfs_agblock_t sagbno = XFS_AGINO_TO_AGBNO(mp, rec->ir_startino); + int startidx, endidx; + int nextbit; + xfs_agblock_t agbno; + int contigblk; + DECLARE_BITMAP(holemask, XFS_INOBT_HOLEMASK_BITS); + + if (!xfs_inobt_issparse(rec->ir_holemask)) { + /* not sparse, calculate extent info directly */ + xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, + XFS_AGINO_TO_AGBNO(mp, rec->ir_startino)), + mp->m_ialloc_blks, flist, mp); + return; + } + + /* holemask is only 16-bits (fits in an unsigned long) */ + ASSERT(sizeof(rec->ir_holemask) <= sizeof(holemask[0])); + holemask[0] = rec->ir_holemask; + + /* + * Find contiguous ranges of zeroes (i.e., allocated regions) in the + * holemask and convert the start/end index of each range to an extent. + * We start with the start and end index both pointing at the first 0 in + * the mask. + */ + startidx = endidx = find_first_zero_bit(holemask, + XFS_INOBT_HOLEMASK_BITS); + nextbit = startidx + 1; + while (startidx < XFS_INOBT_HOLEMASK_BITS) { + nextbit = find_next_zero_bit(holemask, XFS_INOBT_HOLEMASK_BITS, + nextbit); + /* + * If the next zero bit is contiguous, update the end index of + * the current range and continue. + */ + if (nextbit != XFS_INOBT_HOLEMASK_BITS && + nextbit == endidx + 1) { + endidx = nextbit; + goto next; + } + + /* + * nextbit is not contiguous with the current end index. Convert + * the current start/end to an extent and add it to the free + * list. + */ + agbno = sagbno + (startidx * XFS_INODES_PER_HOLEMASK_BIT) / + mp->m_sb.sb_inopblock; + contigblk = ((endidx - startidx + 1) * + XFS_INODES_PER_HOLEMASK_BIT) / + mp->m_sb.sb_inopblock; + + ASSERT(agbno % mp->m_sb.sb_spino_align == 0); + ASSERT(contigblk % mp->m_sb.sb_spino_align == 0); + xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, agbno), contigblk, + flist, mp); + + /* reset range to current bit and carry on... */ + startidx = endidx = nextbit; + +next: + nextbit++; + } +} + STATIC int xfs_difree_inobt( struct xfs_mount *mp, @@ -1446,8 +1889,7 @@ xfs_difree_inobt( struct xfs_buf *agbp, xfs_agino_t agino, struct xfs_bmap_free *flist, - int *deleted, - xfs_ino_t *first_ino, + struct xfs_icluster *xic, struct xfs_inobt_rec_incore *orec) { struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); @@ -1501,20 +1943,23 @@ xfs_difree_inobt( rec.ir_freecount++; /* - * When an inode cluster is free, it becomes eligible for removal + * When an inode chunk is free, it becomes eligible for removal. Don't + * remove the chunk if the block size is large enough for multiple inode + * chunks (that might not be free). */ if (!(mp->m_flags & XFS_MOUNT_IKEEP) && - (rec.ir_freecount == mp->m_ialloc_inos)) { - - *deleted = 1; - *first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino); + rec.ir_free == XFS_INOBT_ALL_FREE && + mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK) { + xic->deleted = 1; + xic->first_ino = XFS_AGINO_TO_INO(mp, agno, rec.ir_startino); + xic->alloc = xfs_inobt_irec_to_allocmask(&rec); /* * Remove the inode cluster from the AGI B+Tree, adjust the * AGI and Superblock inode counts, and mark the disk space * to be freed when the transaction is committed. */ - ilen = mp->m_ialloc_inos; + ilen = rec.ir_freecount; be32_add_cpu(&agi->agi_count, -ilen); be32_add_cpu(&agi->agi_freecount, -(ilen - 1)); xfs_ialloc_log_agi(tp, agbp, XFS_AGI_COUNT | XFS_AGI_FREECOUNT); @@ -1530,11 +1975,9 @@ xfs_difree_inobt( goto error0; } - xfs_bmap_add_free(XFS_AGB_TO_FSB(mp, agno, - XFS_AGINO_TO_AGBNO(mp, rec.ir_startino)), - mp->m_ialloc_blks, flist, mp); + xfs_difree_inode_chunk(mp, agno, &rec, flist); } else { - *deleted = 0; + xic->deleted = 0; error = xfs_inobt_update(cur, &rec); if (error) { @@ -1599,7 +2042,9 @@ xfs_difree_finobt( */ XFS_WANT_CORRUPTED_GOTO(mp, ibtrec->ir_freecount == 1, error); - error = xfs_inobt_insert_rec(cur, ibtrec->ir_freecount, + error = xfs_inobt_insert_rec(cur, ibtrec->ir_holemask, + ibtrec->ir_count, + ibtrec->ir_freecount, ibtrec->ir_free, &i); if (error) goto error; @@ -1634,8 +2079,13 @@ xfs_difree_finobt( * free inode. Hence, if all of the inodes are free and we aren't * keeping inode chunks permanently on disk, remove the record. * Otherwise, update the record with the new information. + * + * Note that we currently can't free chunks when the block size is large + * enough for multiple chunks. Leave the finobt record to remain in sync + * with the inobt. */ - if (rec.ir_freecount == mp->m_ialloc_inos && + if (rec.ir_free == XFS_INOBT_ALL_FREE && + mp->m_sb.sb_inopblock <= XFS_INODES_PER_CHUNK && !(mp->m_flags & XFS_MOUNT_IKEEP)) { error = xfs_btree_delete(cur, &i); if (error) @@ -1671,8 +2121,7 @@ xfs_difree( struct xfs_trans *tp, /* transaction pointer */ xfs_ino_t inode, /* inode to be freed */ struct xfs_bmap_free *flist, /* extents to free */ - int *deleted,/* set if inode cluster was deleted */ - xfs_ino_t *first_ino)/* first inode in deleted cluster */ + struct xfs_icluster *xic) /* cluster info if deleted */ { /* REFERENCED */ xfs_agblock_t agbno; /* block number containing inode */ @@ -1723,8 +2172,7 @@ xfs_difree( /* * Fix up the inode allocation btree. */ - error = xfs_difree_inobt(mp, tp, agbp, agino, flist, deleted, first_ino, - &rec); + error = xfs_difree_inobt(mp, tp, agbp, agino, flist, xic, &rec); if (error) goto error0; diff --git a/fs/xfs/libxfs/xfs_ialloc.h b/fs/xfs/libxfs/xfs_ialloc.h index 100007d56..6e450df29 100644 --- a/fs/xfs/libxfs/xfs_ialloc.h +++ b/fs/xfs/libxfs/xfs_ialloc.h @@ -28,6 +28,13 @@ struct xfs_btree_cur; /* Move inodes in clusters of this size */ #define XFS_INODE_BIG_CLUSTER_SIZE 8192 +struct xfs_icluster { + bool deleted; /* record is deleted */ + xfs_ino_t first_ino; /* first inode number */ + uint64_t alloc; /* inode phys. allocation bitmap for + * sparse chunks */ +}; + /* Calculate and return the number of filesystem blocks per inode cluster */ static inline int xfs_icluster_size_fsb( @@ -44,8 +51,7 @@ xfs_icluster_size_fsb( static inline struct xfs_dinode * xfs_make_iptr(struct xfs_mount *mp, struct xfs_buf *b, int o) { - return (struct xfs_dinode *) - (xfs_buf_offset(b, o << (mp)->m_sb.sb_inodelog)); + return xfs_buf_offset(b, o << (mp)->m_sb.sb_inodelog); } /* @@ -90,8 +96,7 @@ xfs_difree( struct xfs_trans *tp, /* transaction pointer */ xfs_ino_t inode, /* inode to be freed */ struct xfs_bmap_free *flist, /* extents to free */ - int *deleted, /* set if inode cluster was deleted */ - xfs_ino_t *first_ino); /* first inode in deleted cluster */ + struct xfs_icluster *ifree); /* cluster info if deleted */ /* * Return the location of the inode in imap, for mapping it into a buffer. @@ -156,7 +161,7 @@ int xfs_inobt_get_rec(struct xfs_btree_cur *cur, * Inode chunk initialisation routine */ int xfs_ialloc_inode_init(struct xfs_mount *mp, struct xfs_trans *tp, - struct list_head *buffer_list, + struct list_head *buffer_list, int icount, xfs_agnumber_t agno, xfs_agblock_t agbno, xfs_agblock_t length, unsigned int gen); diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index 964c465ca..674ad8f76 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -167,7 +167,16 @@ xfs_inobt_init_rec_from_cur( union xfs_btree_rec *rec) { rec->inobt.ir_startino = cpu_to_be32(cur->bc_rec.i.ir_startino); - rec->inobt.ir_freecount = cpu_to_be32(cur->bc_rec.i.ir_freecount); + if (xfs_sb_version_hassparseinodes(&cur->bc_mp->m_sb)) { + rec->inobt.ir_u.sp.ir_holemask = + cpu_to_be16(cur->bc_rec.i.ir_holemask); + rec->inobt.ir_u.sp.ir_count = cur->bc_rec.i.ir_count; + rec->inobt.ir_u.sp.ir_freecount = cur->bc_rec.i.ir_freecount; + } else { + /* ir_holemask/ir_count not supported on-disk */ + rec->inobt.ir_u.f.ir_freecount = + cpu_to_be32(cur->bc_rec.i.ir_freecount); + } rec->inobt.ir_free = cpu_to_be64(cur->bc_rec.i.ir_free); } @@ -418,3 +427,85 @@ xfs_inobt_maxrecs( return blocklen / sizeof(xfs_inobt_rec_t); return blocklen / (sizeof(xfs_inobt_key_t) + sizeof(xfs_inobt_ptr_t)); } + +/* + * Convert the inode record holemask to an inode allocation bitmap. The inode + * allocation bitmap is inode granularity and specifies whether an inode is + * physically allocated on disk (not whether the inode is considered allocated + * or free by the fs). + * + * A bit value of 1 means the inode is allocated, a value of 0 means it is free. + */ +uint64_t +xfs_inobt_irec_to_allocmask( + struct xfs_inobt_rec_incore *rec) +{ + uint64_t bitmap = 0; + uint64_t inodespbit; + int nextbit; + uint allocbitmap; + + /* + * The holemask has 16-bits for a 64 inode record. Therefore each + * holemask bit represents multiple inodes. Create a mask of bits to set + * in the allocmask for each holemask bit. + */ + inodespbit = (1 << XFS_INODES_PER_HOLEMASK_BIT) - 1; + + /* + * Allocated inodes are represented by 0 bits in holemask. Invert the 0 + * bits to 1 and convert to a uint so we can use xfs_next_bit(). Mask + * anything beyond the 16 holemask bits since this casts to a larger + * type. + */ + allocbitmap = ~rec->ir_holemask & ((1 << XFS_INOBT_HOLEMASK_BITS) - 1); + + /* + * allocbitmap is the inverted holemask so every set bit represents + * allocated inodes. To expand from 16-bit holemask granularity to + * 64-bit (e.g., bit-per-inode), set inodespbit bits in the target + * bitmap for every holemask bit. + */ + nextbit = xfs_next_bit(&allocbitmap, 1, 0); + while (nextbit != -1) { + ASSERT(nextbit < (sizeof(rec->ir_holemask) * NBBY)); + + bitmap |= (inodespbit << + (nextbit * XFS_INODES_PER_HOLEMASK_BIT)); + + nextbit = xfs_next_bit(&allocbitmap, 1, nextbit + 1); + } + + return bitmap; +} + +#if defined(DEBUG) || defined(XFS_WARN) +/* + * Verify that an in-core inode record has a valid inode count. + */ +int +xfs_inobt_rec_check_count( + struct xfs_mount *mp, + struct xfs_inobt_rec_incore *rec) +{ + int inocount = 0; + int nextbit = 0; + uint64_t allocbmap; + int wordsz; + + wordsz = sizeof(allocbmap) / sizeof(unsigned int); + allocbmap = xfs_inobt_irec_to_allocmask(rec); + + nextbit = xfs_next_bit((uint *) &allocbmap, wordsz, nextbit); + while (nextbit != -1) { + inocount++; + nextbit = xfs_next_bit((uint *) &allocbmap, wordsz, + nextbit + 1); + } + + if (inocount != rec->ir_count) + return -EFSCORRUPTED; + + return 0; +} +#endif /* DEBUG */ diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.h b/fs/xfs/libxfs/xfs_ialloc_btree.h index d7ebea72c..bd8845321 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.h +++ b/fs/xfs/libxfs/xfs_ialloc_btree.h @@ -62,4 +62,14 @@ extern struct xfs_btree_cur *xfs_inobt_init_cursor(struct xfs_mount *, xfs_btnum_t); extern int xfs_inobt_maxrecs(struct xfs_mount *, int, int); +/* ir_holemask to inode allocation bitmap conversion */ +uint64_t xfs_inobt_irec_to_allocmask(struct xfs_inobt_rec_incore *); + +#if defined(DEBUG) || defined(XFS_WARN) +int xfs_inobt_rec_check_count(struct xfs_mount *, + struct xfs_inobt_rec_incore *); +#else +#define xfs_inobt_rec_check_count(mp, rec) 0 +#endif /* DEBUG */ + #endif /* __XFS_IALLOC_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 002b6b3a1..6526e7696 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -46,8 +46,7 @@ xfs_inobp_check( j = mp->m_inode_cluster_size >> mp->m_sb.sb_inodelog; for (i = 0; i < j; i++) { - dip = (xfs_dinode_t *)xfs_buf_offset(bp, - i * mp->m_sb.sb_inodesize); + dip = xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize); if (!dip->di_next_unlinked) { xfs_alert(mp, "Detected bogus zero next_unlinked field in inode %d buffer 0x%llx.", @@ -86,8 +85,7 @@ xfs_inode_buf_verify( int di_ok; xfs_dinode_t *dip; - dip = (struct xfs_dinode *)xfs_buf_offset(bp, - (i << mp->m_sb.sb_inodelog)); + dip = xfs_buf_offset(bp, (i << mp->m_sb.sb_inodelog)); di_ok = dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) && XFS_DINODE_GOOD_VERSION(dip->di_version); if (unlikely(XFS_TEST_ERROR(!di_ok, mp, @@ -186,7 +184,7 @@ xfs_imap_to_bp( } *bpp = bp; - *dipp = (struct xfs_dinode *)xfs_buf_offset(bp, imap->im_boffset); + *dipp = xfs_buf_offset(bp, imap->im_boffset); return 0; } diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index dc4bfc5d8..df9851c46 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -174,6 +174,27 @@ xfs_mount_validate_sb( return -EFSCORRUPTED; } + /* + * Full inode chunks must be aligned to inode chunk size when + * sparse inodes are enabled to support the sparse chunk + * allocation algorithm and prevent overlapping inode records. + */ + if (xfs_sb_version_hassparseinodes(sbp)) { + uint32_t align; + + xfs_alert(mp, + "EXPERIMENTAL sparse inode feature enabled. Use at your own risk!"); + + align = XFS_INODES_PER_CHUNK * sbp->sb_inodesize + >> sbp->sb_blocklog; + if (sbp->sb_inoalignmt != align) { + xfs_warn(mp, +"Inode block alignment (%u) must match chunk size (%u) for sparse inodes.", + sbp->sb_inoalignmt, align); + return -EINVAL; + } + } + if (unlikely( sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { xfs_warn(mp, @@ -374,7 +395,7 @@ __xfs_sb_from_disk( be32_to_cpu(from->sb_features_log_incompat); /* crc is only used on disk, not in memory; just init to 0 here. */ to->sb_crc = 0; - to->sb_pad = 0; + to->sb_spino_align = be32_to_cpu(from->sb_spino_align); to->sb_pquotino = be64_to_cpu(from->sb_pquotino); to->sb_lsn = be64_to_cpu(from->sb_lsn); /* Convert on-disk flags to in-memory flags? */ @@ -516,7 +537,7 @@ xfs_sb_to_disk( cpu_to_be32(from->sb_features_incompat); to->sb_features_log_incompat = cpu_to_be32(from->sb_features_log_incompat); - to->sb_pad = 0; + to->sb_spino_align = cpu_to_be32(from->sb_spino_align); to->sb_lsn = cpu_to_be64(from->sb_lsn); } } @@ -689,6 +710,11 @@ xfs_sb_mount_common( mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK, sbp->sb_inopblock); mp->m_ialloc_blks = mp->m_ialloc_inos >> sbp->sb_inopblog; + + if (sbp->sb_spino_align) + mp->m_ialloc_min_blks = sbp->sb_spino_align; + else + mp->m_ialloc_min_blks = mp->m_ialloc_blks; } /* @@ -792,12 +818,12 @@ xfs_sync_sb( tp = _xfs_trans_alloc(mp, XFS_TRANS_SB_CHANGE, KM_SLEEP); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_sb, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } xfs_log_sb(tp); if (wait) xfs_trans_set_sync(tp); - return xfs_trans_commit(tp, 0); + return xfs_trans_commit(tp); } diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 8dda4b321..5be529707 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -182,12 +182,6 @@ int xfs_log_calc_minimum_size(struct xfs_mount *); #define XFS_TRANS_FREEZE_PROT 0x40 /* Transaction has elevated writer count in superblock */ /* - * Values for call flags parameter. - */ -#define XFS_TRANS_RELEASE_LOG_RES 0x4 -#define XFS_TRANS_ABORT 0x8 - -/* * Field values for xfs_trans_mod_sb. */ #define XFS_TRANS_SB_ICOUNT 0x00000001 diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h index 2d5bdfce6..797815012 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.h +++ b/fs/xfs/libxfs/xfs_trans_resv.h @@ -73,9 +73,9 @@ struct xfs_trans_resv { * 2 trees * (2 blocks/level * max depth - 1) * block size */ #define XFS_ALLOCFREE_LOG_RES(mp,nx) \ - ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * XFS_AG_MAXLEVELS(mp) - 1))) + ((nx) * (2 * XFS_FSB_TO_B((mp), 2 * (mp)->m_ag_maxlevels - 1))) #define XFS_ALLOCFREE_LOG_COUNT(mp,nx) \ - ((nx) * (2 * (2 * XFS_AG_MAXLEVELS(mp) - 1))) + ((nx) * (2 * (2 * (mp)->m_ag_maxlevels - 1))) /* * Per-directory log reservation for any directory change. diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h index bf9c45793..41e0428d8 100644 --- a/fs/xfs/libxfs/xfs_trans_space.h +++ b/fs/xfs/libxfs/xfs_trans_space.h @@ -67,7 +67,7 @@ #define XFS_DIOSTRAT_SPACE_RES(mp, v) \ (XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK) + (v)) #define XFS_GROWFS_SPACE_RES(mp) \ - (2 * XFS_AG_MAXLEVELS(mp)) + (2 * (mp)->m_ag_maxlevels) #define XFS_GROWFSRT_SPACE_RES(mp,b) \ ((b) + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK)) #define XFS_LINK_SPACE_RES(mp,nl) \ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index a56960dd1..3859f5e27 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -109,7 +109,7 @@ xfs_setfilesize_trans_alloc( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -145,7 +145,7 @@ xfs_setfilesize( isize = xfs_new_eof(ip, offset + size); if (!isize) { xfs_iunlock(ip, XFS_ILOCK_EXCL); - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return 0; } @@ -155,7 +155,7 @@ xfs_setfilesize( xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - return xfs_trans_commit(tp, 0); + return xfs_trans_commit(tp); } STATIC int @@ -356,7 +356,6 @@ xfs_end_bio( { xfs_ioend_t *ioend = bio->bi_private; - ASSERT(atomic_read(&bio->bi_cnt) >= 1); ioend->io_error = test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : error; /* Toss bio and pass work off to an xfsdatad thread */ @@ -1349,7 +1348,7 @@ __xfs_get_blocks( sector_t iblock, struct buffer_head *bh_result, int create, - int direct) + bool direct) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -1414,6 +1413,7 @@ __xfs_get_blocks( if (error) return error; new = 1; + } else { /* * Delalloc reservations do not require a transaction, @@ -1508,49 +1508,29 @@ xfs_get_blocks( struct buffer_head *bh_result, int create) { - return __xfs_get_blocks(inode, iblock, bh_result, create, 0); + return __xfs_get_blocks(inode, iblock, bh_result, create, false); } -STATIC int +int xfs_get_blocks_direct( struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { - return __xfs_get_blocks(inode, iblock, bh_result, create, 1); + return __xfs_get_blocks(inode, iblock, bh_result, create, true); } -/* - * Complete a direct I/O write request. - * - * The ioend structure is passed from __xfs_get_blocks() to tell us what to do. - * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite - * wholly within the EOF and so there is nothing for us to do. Note that in this - * case the completion can be called in interrupt context, whereas if we have an - * ioend we will always be called in task context (i.e. from a workqueue). - */ -STATIC void -xfs_end_io_direct_write( - struct kiocb *iocb, +static void +__xfs_end_io_direct_write( + struct inode *inode, + struct xfs_ioend *ioend, loff_t offset, - ssize_t size, - void *private) + ssize_t size) { - struct inode *inode = file_inode(iocb->ki_filp); - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - struct xfs_ioend *ioend = private; - - trace_xfs_gbmap_direct_endio(ip, offset, size, - ioend ? ioend->io_type : 0, NULL); + struct xfs_mount *mp = XFS_I(inode)->i_mount; - if (!ioend) { - ASSERT(offset + size <= i_size_read(inode)); - return; - } - - if (XFS_FORCED_SHUTDOWN(mp)) + if (XFS_FORCED_SHUTDOWN(mp) || ioend->io_error) goto out_end_io; /* @@ -1587,10 +1567,10 @@ xfs_end_io_direct_write( * here can result in EOF moving backwards and Bad Things Happen when * that occurs. */ - spin_lock(&ip->i_flags_lock); + spin_lock(&XFS_I(inode)->i_flags_lock); if (offset + size > i_size_read(inode)) i_size_write(inode, offset + size); - spin_unlock(&ip->i_flags_lock); + spin_unlock(&XFS_I(inode)->i_flags_lock); /* * If we are doing an append IO that needs to update the EOF on disk, @@ -1607,6 +1587,98 @@ out_end_io: return; } +/* + * Complete a direct I/O write request. + * + * The ioend structure is passed from __xfs_get_blocks() to tell us what to do. + * If no ioend exists (i.e. @private == NULL) then the write IO is an overwrite + * wholly within the EOF and so there is nothing for us to do. Note that in this + * case the completion can be called in interrupt context, whereas if we have an + * ioend we will always be called in task context (i.e. from a workqueue). + */ +STATIC void +xfs_end_io_direct_write( + struct kiocb *iocb, + loff_t offset, + ssize_t size, + void *private) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct xfs_ioend *ioend = private; + + trace_xfs_gbmap_direct_endio(XFS_I(inode), offset, size, + ioend ? ioend->io_type : 0, NULL); + + if (!ioend) { + ASSERT(offset + size <= i_size_read(inode)); + return; + } + + __xfs_end_io_direct_write(inode, ioend, offset, size); +} + +/* + * For DAX we need a mapping buffer callback for unwritten extent conversion + * when page faults allocate blocks and then zero them. Note that in this + * case the mapping indicated by the ioend may extend beyond EOF. We most + * definitely do not want to extend EOF here, so we trim back the ioend size to + * EOF. + */ +#ifdef CONFIG_FS_DAX +void +xfs_end_io_dax_write( + struct buffer_head *bh, + int uptodate) +{ + struct xfs_ioend *ioend = bh->b_private; + struct inode *inode = ioend->io_inode; + ssize_t size = ioend->io_size; + + ASSERT(IS_DAX(ioend->io_inode)); + + /* if there was an error zeroing, then don't convert it */ + if (!uptodate) + ioend->io_error = -EIO; + + /* + * Trim update to EOF, so we don't extend EOF during unwritten extent + * conversion of partial EOF blocks. + */ + spin_lock(&XFS_I(inode)->i_flags_lock); + if (ioend->io_offset + size > i_size_read(inode)) + size = i_size_read(inode) - ioend->io_offset; + spin_unlock(&XFS_I(inode)->i_flags_lock); + + __xfs_end_io_direct_write(inode, ioend, ioend->io_offset, size); + +} +#else +void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate) { } +#endif + +static inline ssize_t +xfs_vm_do_dio( + struct inode *inode, + struct kiocb *iocb, + struct iov_iter *iter, + loff_t offset, + void (*endio)(struct kiocb *iocb, + loff_t offset, + ssize_t size, + void *private), + int flags) +{ + struct block_device *bdev; + + if (IS_DAX(inode)) + return dax_do_io(iocb, inode, iter, offset, + xfs_get_blocks_direct, endio, 0); + + bdev = xfs_find_bdev_for_inode(inode); + return __blockdev_direct_IO(iocb, inode, bdev, iter, offset, + xfs_get_blocks_direct, endio, NULL, flags); +} + STATIC ssize_t xfs_vm_direct_IO( struct kiocb *iocb, @@ -1614,16 +1686,11 @@ xfs_vm_direct_IO( loff_t offset) { struct inode *inode = iocb->ki_filp->f_mapping->host; - struct block_device *bdev = xfs_find_bdev_for_inode(inode); - if (iov_iter_rw(iter) == WRITE) { - return __blockdev_direct_IO(iocb, inode, bdev, iter, offset, - xfs_get_blocks_direct, - xfs_end_io_direct_write, NULL, - DIO_ASYNC_EXTEND); - } - return __blockdev_direct_IO(iocb, inode, bdev, iter, offset, - xfs_get_blocks_direct, NULL, NULL, 0); + if (iov_iter_rw(iter) == WRITE) + return xfs_vm_do_dio(inode, iocb, iter, offset, + xfs_end_io_direct_write, DIO_ASYNC_EXTEND); + return xfs_vm_do_dio(inode, iocb, iter, offset, NULL, 0); } /* @@ -1874,6 +1941,7 @@ xfs_vm_set_page_dirty( loff_t end_offset; loff_t offset; int newly_dirty; + struct mem_cgroup *memcg; if (unlikely(!mapping)) return !TestSetPageDirty(page); @@ -1893,6 +1961,11 @@ xfs_vm_set_page_dirty( offset += 1 << inode->i_blkbits; } while (bh != head); } + /* + * Use mem_group_begin_page_stat() to keep PageDirty synchronized with + * per-memcg dirty page counters. + */ + memcg = mem_cgroup_begin_page_stat(page); newly_dirty = !TestSetPageDirty(page); spin_unlock(&mapping->private_lock); @@ -1903,13 +1976,15 @@ xfs_vm_set_page_dirty( spin_lock_irqsave(&mapping->tree_lock, flags); if (page->mapping) { /* Race with truncate? */ WARN_ON_ONCE(!PageUptodate(page)); - account_page_dirtied(page, mapping); + account_page_dirtied(page, mapping, memcg); radix_tree_tag_set(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); } spin_unlock_irqrestore(&mapping->tree_lock, flags); - __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } + mem_cgroup_end_page_stat(memcg); + if (newly_dirty) + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); return newly_dirty; } diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index ac644e013..86afd1ac7 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -53,7 +53,12 @@ typedef struct xfs_ioend { } xfs_ioend_t; extern const struct address_space_operations xfs_address_space_operations; -extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int); + +int xfs_get_blocks(struct inode *inode, sector_t offset, + struct buffer_head *map_bh, int create); +int xfs_get_blocks_direct(struct inode *inode, sector_t offset, + struct buffer_head *map_bh, int create); +void xfs_end_io_dax_write(struct buffer_head *bh, int uptodate); extern void xfs_count_page_state(struct page *, int *, int *); diff --git a/fs/xfs/xfs_attr_inactive.c b/fs/xfs/xfs_attr_inactive.c index 73e75a87a..2bb959ada 100644 --- a/fs/xfs/xfs_attr_inactive.c +++ b/fs/xfs/xfs_attr_inactive.c @@ -394,7 +394,6 @@ xfs_attr_inactive( { struct xfs_trans *trans; struct xfs_mount *mp; - int cancel_flags = 0; int lock_mode = XFS_ILOCK_SHARED; int error = 0; @@ -423,7 +422,6 @@ xfs_attr_inactive( goto out_cancel; lock_mode = XFS_ILOCK_EXCL; - cancel_flags = XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT; xfs_ilock(dp, lock_mode); if (!XFS_IFORK_Q(dp)) @@ -455,12 +453,12 @@ xfs_attr_inactive( /* Reset the attribute fork - this also destroys the in-core fork */ xfs_attr_fork_remove(dp, trans); - error = xfs_trans_commit(trans, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(trans); xfs_iunlock(dp, lock_mode); return error; out_cancel: - xfs_trans_cancel(trans, cancel_flags); + xfs_trans_cancel(trans); out_destroy_fork: /* kill the in-core attr fork before we drop the inode lock */ if (dp->i_afp) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index a52bbd3ab..0f34886cf 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -75,28 +75,20 @@ xfs_bmap_finish( xfs_efi_log_item_t *efi; /* extent free intention */ int error; /* error return value */ xfs_bmap_free_item_t *free; /* free extent item */ - struct xfs_trans_res tres; /* new log reservation */ xfs_mount_t *mp; /* filesystem mount structure */ xfs_bmap_free_item_t *next; /* next item on free list */ - xfs_trans_t *ntp; /* new transaction pointer */ ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); if (flist->xbf_count == 0) { *committed = 0; return 0; } - ntp = *tp; - efi = xfs_trans_get_efi(ntp, flist->xbf_count); + efi = xfs_trans_get_efi(*tp, flist->xbf_count); for (free = flist->xbf_first; free; free = free->xbfi_next) - xfs_trans_log_efi_extent(ntp, efi, free->xbfi_startblock, + xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock, free->xbfi_blockcount); - tres.tr_logres = ntp->t_log_res; - tres.tr_logcount = ntp->t_log_count; - tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; - ntp = xfs_trans_dup(*tp); - error = xfs_trans_commit(*tp, 0); - *tp = ntp; + error = xfs_trans_roll(tp, NULL); *committed = 1; /* * We have a new transaction, so we should return committed=1, @@ -105,19 +97,10 @@ xfs_bmap_finish( if (error) return error; - /* - * transaction commit worked ok so we can drop the extra ticket - * reference that we gained in xfs_trans_dup() - */ - xfs_log_ticket_put(ntp->t_ticket); - - error = xfs_trans_reserve(ntp, &tres, 0, 0); - if (error) - return error; - efd = xfs_trans_get_efd(ntp, efi, flist->xbf_count); + efd = xfs_trans_get_efd(*tp, efi, flist->xbf_count); for (free = flist->xbf_first; free != NULL; free = next) { next = free->xbfi_next; - if ((error = xfs_free_extent(ntp, free->xbfi_startblock, + if ((error = xfs_free_extent(*tp, free->xbfi_startblock, free->xbfi_blockcount))) { /* * The bmap free list will be cleaned up at a @@ -127,7 +110,7 @@ xfs_bmap_finish( * happens, since this transaction may not be * dirty yet. */ - mp = ntp->t_mountp; + mp = (*tp)->t_mountp; if (!XFS_FORCED_SHUTDOWN(mp)) xfs_force_shutdown(mp, (error == -EFSCORRUPTED) ? @@ -135,7 +118,7 @@ xfs_bmap_finish( SHUTDOWN_META_IO_ERROR); return error; } - xfs_trans_log_efd_extent(ntp, efd, free->xbfi_startblock, + xfs_trans_log_efd_extent(*tp, efd, free->xbfi_startblock, free->xbfi_blockcount); xfs_bmap_del_free(flist, NULL, free); } @@ -878,7 +861,7 @@ xfs_free_eofblocks( if (need_iolock) { if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return -EAGAIN; } } @@ -886,7 +869,7 @@ xfs_free_eofblocks( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); if (error) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); if (need_iolock) xfs_iunlock(ip, XFS_IOLOCK_EXCL); return error; @@ -908,12 +891,9 @@ xfs_free_eofblocks( * If we get an error at this point we simply don't * bother truncating the file. */ - xfs_trans_cancel(tp, - (XFS_TRANS_RELEASE_LOG_RES | - XFS_TRANS_ABORT)); + xfs_trans_cancel(tp); } else { - error = xfs_trans_commit(tp, - XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (!error) xfs_inode_clear_eofblocks_tag(ip); } @@ -1026,7 +1006,7 @@ xfs_alloc_file_space( * Free the transaction structure. */ ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); break; } xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -1053,7 +1033,7 @@ xfs_alloc_file_space( goto error0; } - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); if (error) { break; @@ -1077,7 +1057,7 @@ error0: /* Cancel bmap, unlock inode, unreserve quota blocks, cancel trans */ xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag); error1: /* Just cancel transaction */ - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -1133,14 +1113,29 @@ xfs_zero_remaining_bytes( break; ASSERT(imap.br_blockcount >= 1); ASSERT(imap.br_startoff == offset_fsb); + ASSERT(imap.br_startblock != DELAYSTARTBLOCK); + + if (imap.br_startblock == HOLESTARTBLOCK || + imap.br_state == XFS_EXT_UNWRITTEN) { + /* skip the entire extent */ + lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + + imap.br_blockcount) - 1; + continue; + } + lastoffset = XFS_FSB_TO_B(mp, imap.br_startoff + 1) - 1; if (lastoffset > endoff) lastoffset = endoff; - if (imap.br_startblock == HOLESTARTBLOCK) - continue; - ASSERT(imap.br_startblock != DELAYSTARTBLOCK); - if (imap.br_state == XFS_EXT_UNWRITTEN) + + /* DAX can just zero the backing device directly */ + if (IS_DAX(VFS_I(ip))) { + error = dax_zero_page_range(VFS_I(ip), offset, + lastoffset - offset + 1, + xfs_get_blocks_direct); + if (error) + return error; continue; + } error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp, @@ -1289,7 +1284,7 @@ xfs_free_file_space( * Free the transaction structure. */ ASSERT(error == -ENOSPC || XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); break; } xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -1320,7 +1315,7 @@ xfs_free_file_space( goto error0; } - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); } @@ -1330,7 +1325,7 @@ xfs_free_file_space( error0: xfs_bmap_cancel(&free_list); error1: - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); goto out; } @@ -1462,7 +1457,7 @@ xfs_shift_file_space( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, XFS_DIOSTRAT_SPACE_RES(mp, 0), 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); break; } @@ -1492,13 +1487,13 @@ xfs_shift_file_space( if (error) goto out; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); } return error; out: - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_trans_cancel(tp); return error; } @@ -1718,7 +1713,7 @@ xfs_swap_extents( tp = xfs_trans_alloc(mp, XFS_TRANS_SWAPEXT); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); goto out_unlock; } @@ -1901,7 +1896,7 @@ xfs_swap_extents( if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); trace_xfs_swap_extent_after(ip, 0); trace_xfs_swap_extent_after(tip, 1); @@ -1915,6 +1910,6 @@ out_unlock: goto out; out_trans_cancel: - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); goto out; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 1790b00be..a4b7d92e9 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1419,9 +1419,9 @@ xfs_buf_submit_wait( return error; } -xfs_caddr_t +void * xfs_buf_offset( - xfs_buf_t *bp, + struct xfs_buf *bp, size_t offset) { struct page *page; @@ -1431,7 +1431,7 @@ xfs_buf_offset( offset += bp->b_offset; page = bp->b_pages[offset >> PAGE_SHIFT]; - return (xfs_caddr_t)page_address(page) + (offset & (PAGE_SIZE-1)); + return page_address(page) + (offset & (PAGE_SIZE-1)); } /* diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 75ff5d5a7..331c1ccf8 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -299,7 +299,7 @@ extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO) /* Buffer Utility Routines */ -extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t); +extern void *xfs_buf_offset(struct xfs_buf *, size_t); /* Delayed Write Buffer Routines */ extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *); diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 02c01bbbc..4143dc75d 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -568,8 +568,6 @@ xfs_qm_dqread( struct xfs_buf *bp; struct xfs_trans *tp = NULL; int error; - int cancelflags = 0; - dqp = kmem_zone_zalloc(xfs_qm_dqzone, KM_SLEEP); @@ -617,7 +615,6 @@ xfs_qm_dqread( XFS_QM_DQALLOC_SPACE_RES(mp), 0); if (error) goto error1; - cancelflags = XFS_TRANS_RELEASE_LOG_RES; } /* @@ -632,7 +629,6 @@ xfs_qm_dqread( * allocate (ENOENT). */ trace_xfs_dqread_fail(dqp); - cancelflags |= XFS_TRANS_ABORT; goto error1; } @@ -670,7 +666,7 @@ xfs_qm_dqread( xfs_trans_brelse(tp, bp); if (tp) { - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) goto error0; } @@ -680,7 +676,7 @@ xfs_qm_dqread( error1: if (tp) - xfs_trans_cancel(tp, cancelflags); + xfs_trans_cancel(tp); error0: xfs_qm_dqdestroy(dqp); *O_dqpp = NULL; diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 338e50bbf..74d0e5966 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -127,7 +127,7 @@ xfs_error_report( struct xfs_mount *mp, const char *filename, int linenum, - inst_t *ra) + void *ra) { if (level <= xfs_error_level) { xfs_alert_tag(mp, XFS_PTAG_ERROR_REPORT, @@ -146,7 +146,7 @@ xfs_corruption_error( void *p, const char *filename, int linenum, - inst_t *ra) + void *ra) { if (level <= xfs_error_level) xfs_hex_dump(p, 64); diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index c0394ed12..4ed3042a0 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -21,10 +21,10 @@ struct xfs_mount; extern void xfs_error_report(const char *tag, int level, struct xfs_mount *mp, - const char *filename, int linenum, inst_t *ra); + const char *filename, int linenum, void *ra); extern void xfs_corruption_error(const char *tag, int level, struct xfs_mount *mp, void *p, const char *filename, - int linenum, inst_t *ra); + int linenum, void *ra); extern void xfs_verifier_error(struct xfs_buf *bp); #define XFS_ERROR_REPORT(e, lvl, mp) \ diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index cb7fe64cd..adc8f8fdd 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -239,7 +239,7 @@ xfs_efi_init( xfs_log_item_init(mp, &efip->efi_item, XFS_LI_EFI, &xfs_efi_item_ops); efip->efi_format.efi_nextents = nextents; - efip->efi_format.efi_id = (__psint_t)(void*)efip; + efip->efi_format.efi_id = (uintptr_t)(void *)efip; atomic_set(&efip->efi_next_extent, 0); atomic_set(&efip->efi_refcount, 2); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 3b7591224..db4acc1c3 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -41,6 +41,7 @@ #include <linux/dcache.h> #include <linux/falloc.h> #include <linux/pagevec.h> +#include <linux/backing-dev.h> static const struct vm_operations_struct xfs_file_vm_ops; @@ -79,14 +80,15 @@ xfs_rw_ilock_demote( } /* - * xfs_iozero + * xfs_iozero clears the specified range supplied via the page cache (except in + * the DAX case). Writes through the page cache will allocate blocks over holes, + * though the callers usually map the holes first and avoid them. If a block is + * not completely zeroed, then it will be read from disk before being partially + * zeroed. * - * xfs_iozero clears the specified range of buffer supplied, - * and marks all the affected blocks as valid and modified. If - * an affected block is not allocated, it will be allocated. If - * an affected block is not completely overwritten, and is not - * valid before the operation, it will be read from disk before - * being partially zeroed. + * In the DAX case, we can just directly write to the underlying pages. This + * will not allocate blocks, but will avoid holes and unwritten extents and so + * not do unnecessary work. */ int xfs_iozero( @@ -96,7 +98,8 @@ xfs_iozero( { struct page *page; struct address_space *mapping; - int status; + int status = 0; + mapping = VFS_I(ip)->i_mapping; do { @@ -108,20 +111,27 @@ xfs_iozero( if (bytes > count) bytes = count; - status = pagecache_write_begin(NULL, mapping, pos, bytes, - AOP_FLAG_UNINTERRUPTIBLE, - &page, &fsdata); - if (status) - break; + if (IS_DAX(VFS_I(ip))) { + status = dax_zero_page_range(VFS_I(ip), pos, bytes, + xfs_get_blocks_direct); + if (status) + break; + } else { + status = pagecache_write_begin(NULL, mapping, pos, bytes, + AOP_FLAG_UNINTERRUPTIBLE, + &page, &fsdata); + if (status) + break; - zero_user(page, offset, bytes); + zero_user(page, offset, bytes); - status = pagecache_write_end(NULL, mapping, pos, bytes, bytes, - page, fsdata); - WARN_ON(status <= 0); /* can't return less than zero! */ + status = pagecache_write_end(NULL, mapping, pos, bytes, + bytes, page, fsdata); + WARN_ON(status <= 0); /* can't return less than zero! */ + status = 0; + } pos += bytes; count -= bytes; - status = 0; } while (count); return status; @@ -138,7 +148,7 @@ xfs_update_prealloc_flags( tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID); error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -160,7 +170,7 @@ xfs_update_prealloc_flags( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); if (flags & XFS_PREALLOC_SYNC) xfs_trans_set_sync(tp); - return xfs_trans_commit(tp, 0); + return xfs_trans_commit(tp); } /* @@ -284,7 +294,7 @@ xfs_file_read_iter( if (file->f_mode & FMODE_NOCMTIME) ioflags |= XFS_IO_INVIS; - if (unlikely(ioflags & XFS_IO_ISDIRECT)) { + if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) { xfs_buftarg_t *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; @@ -378,7 +388,11 @@ xfs_file_splice_read( trace_xfs_file_splice_read(ip, count, *ppos, ioflags); - ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); + /* for dax, we need to avoid the page cache */ + if (IS_DAX(VFS_I(ip))) + ret = default_file_splice_read(infilp, ppos, pipe, count, flags); + else + ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); if (ret > 0) XFS_STATS_ADD(xs_read_bytes, ret); @@ -563,6 +577,13 @@ restart: if (error) return error; + /* For changing security info in file_remove_privs() we need i_mutex */ + if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) { + xfs_rw_iunlock(ip, *iolock); + *iolock = XFS_IOLOCK_EXCL; + xfs_rw_ilock(ip, *iolock); + goto restart; + } /* * If the offset is beyond the size of the file, we need to zero any * blocks that fall between the existing EOF and the start of this @@ -623,7 +644,9 @@ restart: * setgid bits if the process is not being run by root. This keeps * people from modifying setuid and setgid binaries. */ - return file_remove_suid(file); + if (!IS_NOSEC(inode)) + return file_remove_privs(file); + return 0; } /* @@ -672,7 +695,7 @@ xfs_file_dio_aio_write( mp->m_rtdev_targp : mp->m_ddev_targp; /* DIO must be aligned to device logical sector size */ - if ((pos | count) & target->bt_logical_sectormask) + if (!IS_DAX(inode) && ((pos | count) & target->bt_logical_sectormask)) return -EINVAL; /* "unaligned" here means not aligned to a filesystem block */ @@ -758,8 +781,11 @@ xfs_file_dio_aio_write( out: xfs_rw_iunlock(ip, iolock); - /* No fallback to buffered IO on errors for XFS. */ - ASSERT(ret < 0 || ret == count); + /* + * No fallback to buffered IO on errors for XFS. DAX can result in + * partial writes, but direct IO will either complete fully or fail. + */ + ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip))); return ret; } @@ -842,7 +868,7 @@ xfs_file_write_iter( if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - if (unlikely(iocb->ki_flags & IOCB_DIRECT)) + if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode)) ret = xfs_file_dio_aio_write(iocb, from); else ret = xfs_file_buffered_aio_write(iocb, from); @@ -1063,17 +1089,6 @@ xfs_file_readdir( return xfs_readdir(ip, ctx, bufsize); } -STATIC int -xfs_file_mmap( - struct file *filp, - struct vm_area_struct *vma) -{ - vma->vm_ops = &xfs_file_vm_ops; - - file_accessed(filp); - return 0; -} - /* * This type is designed to indicate the type of offset we would like * to search from page cache for xfs_seek_hole_data(). @@ -1454,48 +1469,92 @@ xfs_file_llseek( * ordering of: * * mmap_sem (MM) - * i_mmap_lock (XFS - truncate serialisation) - * page_lock (MM) - * i_lock (XFS - extent map serialisation) + * sb_start_pagefault(vfs, freeze) + * i_mmap_lock (XFS - truncate serialisation) + * page_lock (MM) + * i_lock (XFS - extent map serialisation) + */ + +/* + * mmap()d file has taken write protection fault and is being made writable. We + * can set the page state up correctly for a writable page, which means we can + * do correct delalloc accounting (ENOSPC checking!) and unwritten extent + * mapping. */ STATIC int -xfs_filemap_fault( +xfs_filemap_page_mkwrite( struct vm_area_struct *vma, struct vm_fault *vmf) { - struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host); - int error; + struct inode *inode = file_inode(vma->vm_file); + int ret; - trace_xfs_filemap_fault(ip); + trace_xfs_filemap_page_mkwrite(XFS_I(inode)); - xfs_ilock(ip, XFS_MMAPLOCK_SHARED); - error = filemap_fault(vma, vmf); - xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); + sb_start_pagefault(inode->i_sb); + file_update_time(vma->vm_file); + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - return error; + if (IS_DAX(inode)) { + ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct, + xfs_end_io_dax_write); + } else { + ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks); + ret = block_page_mkwrite_return(ret); + } + + xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + sb_end_pagefault(inode->i_sb); + + return ret; } -/* - * mmap()d file has taken write protection fault and is being made writable. We - * can set the page state up correctly for a writable page, which means we can - * do correct delalloc accounting (ENOSPC checking!) and unwritten extent - * mapping. - */ STATIC int -xfs_filemap_page_mkwrite( +xfs_filemap_fault( struct vm_area_struct *vma, struct vm_fault *vmf) { - struct xfs_inode *ip = XFS_I(vma->vm_file->f_mapping->host); - int error; + struct inode *inode = file_inode(vma->vm_file); + int ret; - trace_xfs_filemap_page_mkwrite(ip); + trace_xfs_filemap_fault(XFS_I(inode)); - xfs_ilock(ip, XFS_MMAPLOCK_SHARED); - error = block_page_mkwrite(vma, vmf, xfs_get_blocks); - xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); + /* DAX can shortcut the normal fault path on write faults! */ + if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(inode)) + return xfs_filemap_page_mkwrite(vma, vmf); - return error; + xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + if (IS_DAX(inode)) { + /* + * we do not want to trigger unwritten extent conversion on read + * faults - that is unnecessary overhead and would also require + * changes to xfs_get_blocks_direct() to map unwritten extent + * ioend for conversion on read-only mappings. + */ + ret = __dax_fault(vma, vmf, xfs_get_blocks_direct, NULL); + } else + ret = filemap_fault(vma, vmf); + xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + + return ret; +} + +static const struct vm_operations_struct xfs_file_vm_ops = { + .fault = xfs_filemap_fault, + .map_pages = filemap_map_pages, + .page_mkwrite = xfs_filemap_page_mkwrite, +}; + +STATIC int +xfs_file_mmap( + struct file *filp, + struct vm_area_struct *vma) +{ + file_accessed(filp); + vma->vm_ops = &xfs_file_vm_ops; + if (IS_DAX(file_inode(filp))) + vma->vm_flags |= VM_MIXEDMAP; + return 0; } const struct file_operations xfs_file_operations = { @@ -1526,9 +1585,3 @@ const struct file_operations xfs_dir_file_operations = { #endif .fsync = xfs_dir_fsync, }; - -static const struct vm_operations_struct xfs_file_vm_ops = { - .fault = xfs_filemap_fault, - .map_pages = filemap_map_pages, - .page_mkwrite = xfs_filemap_page_mkwrite, -}; diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index da82f1cb4..c4c130f9b 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -196,7 +196,8 @@ xfs_filestream_pick_ag( goto next_ag; } - longest = xfs_alloc_longest_free_extent(mp, pag); + longest = xfs_alloc_longest_free_extent(mp, pag, + xfs_alloc_min_freelist(mp, pag)); if (((minlen && longest >= minlen) || (!minlen && pag->pagf_freeblks >= minfree)) && (!pag->pagf_metadata || !(flags & XFS_PICK_USERDATA) || diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index cb7e8a29d..9b3438a76 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -101,7 +101,9 @@ xfs_fs_geometry( (xfs_sb_version_hasftype(&mp->m_sb) ? XFS_FSOP_GEOM_FLAGS_FTYPE : 0) | (xfs_sb_version_hasfinobt(&mp->m_sb) ? - XFS_FSOP_GEOM_FLAGS_FINOBT : 0); + XFS_FSOP_GEOM_FLAGS_FINOBT : 0) | + (xfs_sb_version_hassparseinodes(&mp->m_sb) ? + XFS_FSOP_GEOM_FLAGS_SPINODES : 0); geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ? mp->m_sb.sb_logsectsize : BBSIZE; geo->rtsectsize = mp->m_sb.sb_blocksize; @@ -201,7 +203,7 @@ xfs_growfs_data_private( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_growdata, XFS_GROWFS_SPACE_RES(mp), 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -489,7 +491,7 @@ xfs_growfs_data_private( if (dpct) xfs_trans_mod_sb(tp, XFS_TRANS_SB_IMAXPCT, dpct); xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); if (error) return error; @@ -557,7 +559,7 @@ xfs_growfs_data_private( return saved_error ? saved_error : error; error0: - xfs_trans_cancel(tp, XFS_TRANS_ABORT); + xfs_trans_cancel(tp); return error; } diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 539a85fdd..3da9f4da4 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -905,7 +905,6 @@ xfs_dir_ialloc( { xfs_trans_t *tp; - xfs_trans_t *ntp; xfs_inode_t *ip; xfs_buf_t *ialloc_context = NULL; int code; @@ -954,8 +953,6 @@ xfs_dir_ialloc( * to succeed the second time. */ if (ialloc_context) { - struct xfs_trans_res tres; - /* * Normally, xfs_trans_commit releases all the locks. * We call bhold to hang on to the ialloc_context across @@ -964,12 +961,6 @@ xfs_dir_ialloc( * allocation group. */ xfs_trans_bhold(tp, ialloc_context); - /* - * Save the log reservation so we can use - * them in the next transaction. - */ - tres.tr_logres = xfs_trans_get_log_res(tp); - tres.tr_logcount = xfs_trans_get_log_count(tp); /* * We want the quota changes to be associated with the next @@ -985,35 +976,9 @@ xfs_dir_ialloc( tp->t_flags &= ~(XFS_TRANS_DQ_DIRTY); } - ntp = xfs_trans_dup(tp); - code = xfs_trans_commit(tp, 0); - tp = ntp; - if (committed != NULL) { + code = xfs_trans_roll(&tp, 0); + if (committed != NULL) *committed = 1; - } - /* - * If we get an error during the commit processing, - * release the buffer that is still held and return - * to the caller. - */ - if (code) { - xfs_buf_relse(ialloc_context); - if (dqinfo) { - tp->t_dqinfo = dqinfo; - xfs_trans_free_dqinfo(tp); - } - *tpp = ntp; - *ipp = NULL; - return code; - } - - /* - * transaction commit worked ok so we can drop the extra ticket - * reference that we gained in xfs_trans_dup() - */ - xfs_log_ticket_put(tp->t_ticket); - tres.tr_logflags = XFS_TRANS_PERM_LOG_RES; - code = xfs_trans_reserve(tp, &tres, 0, 0); /* * Re-attach the quota info that we detached from prev trx. @@ -1025,7 +990,7 @@ xfs_dir_ialloc( if (code) { xfs_buf_relse(ialloc_context); - *tpp = ntp; + *tpp = tp; *ipp = NULL; return code; } @@ -1127,7 +1092,6 @@ xfs_create( xfs_bmap_free_t free_list; xfs_fsblock_t first_block; bool unlock_dp_on_error = false; - uint cancel_flags; int committed; prid_t prid; struct xfs_dquot *udqp = NULL; @@ -1164,8 +1128,6 @@ xfs_create( tp = xfs_trans_alloc(mp, XFS_TRANS_CREATE); } - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; - /* * Initially assume that the file does not exist and * reserve the resources for that case. If that is not @@ -1183,10 +1145,9 @@ xfs_create( resblks = 0; error = xfs_trans_reserve(tp, tres, 0, 0); } - if (error) { - cancel_flags = 0; + if (error) goto out_trans_cancel; - } + xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); unlock_dp_on_error = true; @@ -1217,7 +1178,7 @@ xfs_create( if (error) { if (error == -ENOSPC) goto out_trans_cancel; - goto out_trans_abort; + goto out_trans_cancel; } /* @@ -1235,7 +1196,7 @@ xfs_create( resblks - XFS_IALLOC_SPACE_RES(mp) : 0); if (error) { ASSERT(error != -ENOSPC); - goto out_trans_abort; + goto out_trans_cancel; } xfs_trans_ichgtime(tp, dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, dp, XFS_ILOG_CORE); @@ -1269,7 +1230,7 @@ xfs_create( if (error) goto out_bmap_cancel; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) goto out_release_inode; @@ -1282,10 +1243,8 @@ xfs_create( out_bmap_cancel: xfs_bmap_cancel(&free_list); - out_trans_abort: - cancel_flags |= XFS_TRANS_ABORT; out_trans_cancel: - xfs_trans_cancel(tp, cancel_flags); + xfs_trans_cancel(tp); out_release_inode: /* * Wait until after the current transaction is aborted to finish the @@ -1317,7 +1276,6 @@ xfs_create_tmpfile( struct xfs_inode *ip = NULL; struct xfs_trans *tp = NULL; int error; - uint cancel_flags = XFS_TRANS_RELEASE_LOG_RES; prid_t prid; struct xfs_dquot *udqp = NULL; struct xfs_dquot *gdqp = NULL; @@ -1350,10 +1308,8 @@ xfs_create_tmpfile( resblks = 0; error = xfs_trans_reserve(tp, tres, 0, 0); } - if (error) { - cancel_flags = 0; + if (error) goto out_trans_cancel; - } error = xfs_trans_reserve_quota(tp, mp, udqp, gdqp, pdqp, resblks, 1, 0); @@ -1365,7 +1321,7 @@ xfs_create_tmpfile( if (error) { if (error == -ENOSPC) goto out_trans_cancel; - goto out_trans_abort; + goto out_trans_cancel; } if (mp->m_flags & XFS_MOUNT_WSYNC) @@ -1381,9 +1337,9 @@ xfs_create_tmpfile( ip->i_d.di_nlink--; error = xfs_iunlink(tp, ip); if (error) - goto out_trans_abort; + goto out_trans_cancel; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) goto out_release_inode; @@ -1394,10 +1350,8 @@ xfs_create_tmpfile( *ipp = ip; return 0; - out_trans_abort: - cancel_flags |= XFS_TRANS_ABORT; out_trans_cancel: - xfs_trans_cancel(tp, cancel_flags); + xfs_trans_cancel(tp); out_release_inode: /* * Wait until after the current transaction is aborted to finish the @@ -1427,7 +1381,6 @@ xfs_link( int error; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; - int cancel_flags; int committed; int resblks; @@ -1447,17 +1400,14 @@ xfs_link( goto std_return; tp = xfs_trans_alloc(mp, XFS_TRANS_LINK); - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; resblks = XFS_LINK_SPACE_RES(mp, target_name->len); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, resblks, 0); if (error == -ENOSPC) { resblks = 0; error = xfs_trans_reserve(tp, &M_RES(mp)->tr_link, 0, 0); } - if (error) { - cancel_flags = 0; + if (error) goto error_return; - } xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL); @@ -1486,19 +1436,19 @@ xfs_link( if (sip->i_d.di_nlink == 0) { error = xfs_iunlink_remove(tp, sip); if (error) - goto abort_return; + goto error_return; } error = xfs_dir_createname(tp, tdp, target_name, sip->i_ino, &first_block, &free_list, resblks); if (error) - goto abort_return; + goto error_return; xfs_trans_ichgtime(tp, tdp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE); error = xfs_bumplink(tp, sip); if (error) - goto abort_return; + goto error_return; /* * If this is a synchronous mount, make sure that the @@ -1512,15 +1462,13 @@ xfs_link( error = xfs_bmap_finish (&tp, &free_list, &committed); if (error) { xfs_bmap_cancel(&free_list); - goto abort_return; + goto error_return; } - return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + return xfs_trans_commit(tp); - abort_return: - cancel_flags |= XFS_TRANS_ABORT; error_return: - xfs_trans_cancel(tp, cancel_flags); + xfs_trans_cancel(tp); std_return: return error; } @@ -1555,7 +1503,6 @@ xfs_itruncate_extents( { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp = *tpp; - struct xfs_trans *ntp; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; xfs_fileoff_t first_unmap_block; @@ -1613,29 +1560,7 @@ xfs_itruncate_extents( if (error) goto out_bmap_cancel; - if (committed) { - /* - * Mark the inode dirty so it will be logged and - * moved forward in the log as part of every commit. - */ - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - } - - ntp = xfs_trans_dup(tp); - error = xfs_trans_commit(tp, 0); - tp = ntp; - - xfs_trans_ijoin(tp, ip, 0); - - if (error) - goto out; - - /* - * Transaction commit worked ok so we can drop the extra ticket - * reference that we gained in xfs_trans_dup() - */ - xfs_log_ticket_put(tp->t_ticket); - error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); + error = xfs_trans_roll(&tp, ip); if (error) goto out; } @@ -1756,7 +1681,7 @@ xfs_inactive_truncate( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); if (error) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -1777,7 +1702,7 @@ xfs_inactive_truncate( ASSERT(ip->i_d.di_nextents == 0); - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) goto error_unlock; @@ -1785,7 +1710,7 @@ xfs_inactive_truncate( return 0; error_trans_cancel: - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_trans_cancel(tp); error_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; @@ -1835,7 +1760,7 @@ xfs_inactive_ifree( } else { ASSERT(XFS_FORCED_SHUTDOWN(mp)); } - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES); + xfs_trans_cancel(tp); return error; } @@ -1855,7 +1780,7 @@ xfs_inactive_ifree( __func__, error); xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); } - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -1874,7 +1799,7 @@ xfs_inactive_ifree( if (error) xfs_notice(mp, "%s: xfs_bmap_finish returned error %d", __func__, error); - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) xfs_notice(mp, "%s: xfs_trans_commit returned error %d", __func__, error); @@ -2235,28 +2160,42 @@ xfs_iunlink_remove( */ STATIC int xfs_ifree_cluster( - xfs_inode_t *free_ip, - xfs_trans_t *tp, - xfs_ino_t inum) + xfs_inode_t *free_ip, + xfs_trans_t *tp, + struct xfs_icluster *xic) { xfs_mount_t *mp = free_ip->i_mount; int blks_per_cluster; int inodes_per_cluster; int nbufs; int i, j; + int ioffset; xfs_daddr_t blkno; xfs_buf_t *bp; xfs_inode_t *ip; xfs_inode_log_item_t *iip; xfs_log_item_t *lip; struct xfs_perag *pag; + xfs_ino_t inum; + inum = xic->first_ino; pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, inum)); blks_per_cluster = xfs_icluster_size_fsb(mp); inodes_per_cluster = blks_per_cluster << mp->m_sb.sb_inopblog; nbufs = mp->m_ialloc_blks / blks_per_cluster; for (j = 0; j < nbufs; j++, inum += inodes_per_cluster) { + /* + * The allocation bitmap tells us which inodes of the chunk were + * physically allocated. Skip the cluster if an inode falls into + * a sparse region. + */ + ioffset = inum - xic->first_ino; + if ((xic->alloc & XFS_INOBT_MASK(ioffset)) == 0) { + ASSERT(do_mod(ioffset, inodes_per_cluster) == 0); + continue; + } + blkno = XFS_AGB_TO_DADDR(mp, XFS_INO_TO_AGNO(mp, inum), XFS_INO_TO_AGBNO(mp, inum)); @@ -2414,8 +2353,7 @@ xfs_ifree( xfs_bmap_free_t *flist) { int error; - int delete; - xfs_ino_t first_ino; + struct xfs_icluster xic = { 0 }; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(ip->i_d.di_nlink == 0); @@ -2431,7 +2369,7 @@ xfs_ifree( if (error) return error; - error = xfs_difree(tp, ip->i_ino, flist, &delete, &first_ino); + error = xfs_difree(tp, ip->i_ino, flist, &xic); if (error) return error; @@ -2448,8 +2386,8 @@ xfs_ifree( ip->i_d.di_gen++; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - if (delete) - error = xfs_ifree_cluster(ip, tp, first_ino); + if (xic.deleted) + error = xfs_ifree_cluster(ip, tp, &xic); return error; } @@ -2536,7 +2474,6 @@ xfs_remove( int error = 0; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; - int cancel_flags; int committed; uint resblks; @@ -2557,7 +2494,6 @@ xfs_remove( tp = xfs_trans_alloc(mp, XFS_TRANS_RMDIR); else tp = xfs_trans_alloc(mp, XFS_TRANS_REMOVE); - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; /* * We try to get the real space reservation first, @@ -2576,7 +2512,6 @@ xfs_remove( } if (error) { ASSERT(error != -ENOSPC); - cancel_flags = 0; goto out_trans_cancel; } @@ -2588,7 +2523,6 @@ xfs_remove( /* * If we're removing a directory perform some additional validation. */ - cancel_flags |= XFS_TRANS_ABORT; if (is_dir) { ASSERT(ip->i_d.di_nlink >= 2); if (ip->i_d.di_nlink != 2) { @@ -2644,7 +2578,7 @@ xfs_remove( if (error) goto out_bmap_cancel; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) goto std_return; @@ -2656,7 +2590,7 @@ xfs_remove( out_bmap_cancel: xfs_bmap_cancel(&free_list); out_trans_cancel: - xfs_trans_cancel(tp, cancel_flags); + xfs_trans_cancel(tp); std_return: return error; } @@ -2730,11 +2664,11 @@ xfs_finish_rename( error = xfs_bmap_finish(&tp, free_list, &committed); if (error) { xfs_bmap_cancel(free_list); - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + xfs_trans_cancel(tp); return error; } - return xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + return xfs_trans_commit(tp); } /* @@ -2855,7 +2789,7 @@ xfs_cross_rename( out_trans_abort: xfs_bmap_cancel(free_list); - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); + xfs_trans_cancel(tp); return error; } @@ -2915,7 +2849,6 @@ xfs_rename( int num_inodes = __XFS_SORT_INODES; bool new_parent = (src_dp != target_dp); bool src_is_directory = S_ISDIR(src_ip->i_d.di_mode); - int cancel_flags = 0; int spaceres; int error; @@ -2951,7 +2884,6 @@ xfs_rename( } if (error) goto out_trans_cancel; - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; /* * Attach the dquots to the inodes @@ -3022,10 +2954,8 @@ xfs_rename( error = xfs_dir_createname(tp, target_dp, target_name, src_ip->i_ino, &first_block, &free_list, spaceres); - if (error == -ENOSPC) - goto out_bmap_cancel; if (error) - goto out_trans_abort; + goto out_bmap_cancel; xfs_trans_ichgtime(tp, target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); @@ -3033,7 +2963,7 @@ xfs_rename( if (new_parent && src_is_directory) { error = xfs_bumplink(tp, target_dp); if (error) - goto out_trans_abort; + goto out_bmap_cancel; } } else { /* target_ip != NULL */ /* @@ -3065,7 +2995,7 @@ xfs_rename( src_ip->i_ino, &first_block, &free_list, spaceres); if (error) - goto out_trans_abort; + goto out_bmap_cancel; xfs_trans_ichgtime(tp, target_dp, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); @@ -3076,7 +3006,7 @@ xfs_rename( */ error = xfs_droplink(tp, target_ip); if (error) - goto out_trans_abort; + goto out_bmap_cancel; if (src_is_directory) { /* @@ -3084,7 +3014,7 @@ xfs_rename( */ error = xfs_droplink(tp, target_ip); if (error) - goto out_trans_abort; + goto out_bmap_cancel; } } /* target_ip != NULL */ @@ -3101,7 +3031,7 @@ xfs_rename( &first_block, &free_list, spaceres); ASSERT(error != -EEXIST); if (error) - goto out_trans_abort; + goto out_bmap_cancel; } /* @@ -3127,7 +3057,7 @@ xfs_rename( */ error = xfs_droplink(tp, src_dp); if (error) - goto out_trans_abort; + goto out_bmap_cancel; } /* @@ -3142,7 +3072,7 @@ xfs_rename( error = xfs_dir_removename(tp, src_dp, src_name, src_ip->i_ino, &first_block, &free_list, spaceres); if (error) - goto out_trans_abort; + goto out_bmap_cancel; /* * For whiteouts, we need to bump the link count on the whiteout inode. @@ -3156,10 +3086,10 @@ xfs_rename( ASSERT(VFS_I(wip)->i_nlink == 0 && wip->i_d.di_nlink == 0); error = xfs_bumplink(tp, wip); if (error) - goto out_trans_abort; + goto out_bmap_cancel; error = xfs_iunlink_remove(tp, wip); if (error) - goto out_trans_abort; + goto out_bmap_cancel; xfs_trans_log_inode(tp, wip, XFS_ILOG_CORE); /* @@ -3180,12 +3110,10 @@ xfs_rename( IRELE(wip); return error; -out_trans_abort: - cancel_flags |= XFS_TRANS_ABORT; out_bmap_cancel: xfs_bmap_cancel(&free_list); out_trans_cancel: - xfs_trans_cancel(tp, cancel_flags); + xfs_trans_cancel(tp); if (wip) IRELE(wip); return error; @@ -3464,7 +3392,7 @@ xfs_iflush_int( ASSERT(ip->i_d.di_version > 1); /* set *dip = inode's place in the buffer */ - dip = (xfs_dinode_t *)xfs_buf_offset(bp, ip->i_imap.im_boffset); + dip = xfs_buf_offset(bp, ip->i_imap.im_boffset); if (XFS_TEST_ERROR(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC), mp, XFS_ERRTAG_IFLUSH_1, XFS_RANDOM_IFLUSH_1)) { diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 87f67c6b6..ea7d85af5 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -336,7 +336,7 @@ xfs_set_dmattrs( tp = xfs_trans_alloc(mp, XFS_TRANS_SET_DMATTRS); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -346,7 +346,7 @@ xfs_set_dmattrs( ip->i_d.di_dmstate = state; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); return error; } @@ -1076,7 +1076,7 @@ xfs_ioctl_setattr_get_trans( return tp; out_cancel: - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return ERR_PTR(error); } @@ -1253,7 +1253,7 @@ xfs_ioctl_setattr( else ip->i_d.di_extsize = 0; - code = xfs_trans_commit(tp, 0); + code = xfs_trans_commit(tp); /* * Release any dquot(s) the inode had kept before chown. @@ -1265,7 +1265,7 @@ xfs_ioctl_setattr( return code; error_trans_cancel: - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); error_free_dquots: xfs_qm_dqrele(udqp); xfs_qm_dqrele(pdqp); @@ -1338,11 +1338,11 @@ xfs_ioc_setxflags( error = xfs_ioctl_setattr_xflags(tp, ip, &fa); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); goto out_drop_write; } - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); out_drop_write: mnt_drop_write_file(filp); return error; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 38e633bad..1f8603317 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -183,7 +183,7 @@ xfs_iomap_write_direct( * Check for running out of space, note: need lock to return */ if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -213,7 +213,7 @@ xfs_iomap_write_direct( error = xfs_bmap_finish(&tp, &free_list, &committed); if (error) goto out_bmap_cancel; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) goto out_unlock; @@ -236,7 +236,7 @@ out_bmap_cancel: xfs_bmap_cancel(&free_list); xfs_trans_unreserve_quota_nblks(tp, ip, (long)qblocks, 0, quota_flag); out_trans_cancel: - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_trans_cancel(tp); goto out_unlock; } @@ -690,7 +690,7 @@ xfs_iomap_write_allocate( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, nres, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -760,7 +760,7 @@ xfs_iomap_write_allocate( if (error) goto trans_cancel; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) goto error0; @@ -791,7 +791,7 @@ xfs_iomap_write_allocate( trans_cancel: xfs_bmap_cancel(&free_list); - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_trans_cancel(tp); error0: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; @@ -853,7 +853,7 @@ xfs_iomap_write_unwritten( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -890,7 +890,7 @@ xfs_iomap_write_unwritten( if (error) goto error_on_bmapi_transaction; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); if (error) return error; @@ -914,7 +914,7 @@ xfs_iomap_write_unwritten( error_on_bmapi_transaction: xfs_bmap_cancel(&free_list); - xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT)); + xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index f4cd7204e..766b23f86 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -41,7 +41,6 @@ #include <linux/capability.h> #include <linux/xattr.h> -#include <linux/namei.h> #include <linux/posix_acl.h> #include <linux/security.h> #include <linux/fiemap.h> @@ -414,10 +413,10 @@ xfs_vn_rename( * we need to be very careful about how much stack we use. * uio is kmalloced for this reason... */ -STATIC void * +STATIC const char * xfs_vn_follow_link( struct dentry *dentry, - struct nameidata *nd) + void **cookie) { char *link; int error = -ENOMEM; @@ -430,14 +429,12 @@ xfs_vn_follow_link( if (unlikely(error)) goto out_kfree; - nd_set_link(nd, link); - return NULL; + return *cookie = link; out_kfree: kfree(link); out_err: - nd_set_link(nd, ERR_PTR(error)); - return NULL; + return ERR_PTR(error); } STATIC int @@ -702,7 +699,7 @@ xfs_setattr_nonsize( if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -733,7 +730,7 @@ xfs_setattr_nonsize( return 0; out_trans_cancel: - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); out_dqrele: xfs_qm_dqrele(udqp); @@ -755,7 +752,6 @@ xfs_setattr_size( struct xfs_trans *tp; int error; uint lock_flags = 0; - uint commit_flags = 0; bool did_zeroing = false; trace_xfs_setattr(ip); @@ -851,7 +847,11 @@ xfs_setattr_size( * to hope that the caller sees ENOMEM and retries the truncate * operation. */ - error = block_truncate_page(inode->i_mapping, newsize, xfs_get_blocks); + if (IS_DAX(inode)) + error = dax_truncate_page(inode, newsize, xfs_get_blocks_direct); + else + error = block_truncate_page(inode->i_mapping, newsize, + xfs_get_blocks); if (error) return error; truncate_setsize(inode, newsize); @@ -861,7 +861,6 @@ xfs_setattr_size( if (error) goto out_trans_cancel; - commit_flags = XFS_TRANS_RELEASE_LOG_RES; lock_flags |= XFS_ILOCK_EXCL; xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); @@ -901,7 +900,7 @@ xfs_setattr_size( if (newsize <= oldsize) { error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, newsize); if (error) - goto out_trans_abort; + goto out_trans_cancel; /* * Truncated "down", so we're removing references to old data @@ -928,16 +927,14 @@ xfs_setattr_size( if (mp->m_flags & XFS_MOUNT_WSYNC) xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); out_unlock: if (lock_flags) xfs_iunlock(ip, lock_flags); return error; -out_trans_abort: - commit_flags |= XFS_TRANS_ABORT; out_trans_cancel: - xfs_trans_cancel(tp, commit_flags); + xfs_trans_cancel(tp); goto out_unlock; } @@ -984,7 +981,7 @@ xfs_vn_update_time( tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_fsyncts, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -1006,7 +1003,7 @@ xfs_vn_update_time( } xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); xfs_trans_log_inode(tp, ip, XFS_ILOG_TIMESTAMP); - return xfs_trans_commit(tp, 0); + return xfs_trans_commit(tp); } #define XFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) @@ -1191,22 +1188,22 @@ xfs_diflags_to_iflags( struct inode *inode, struct xfs_inode *ip) { - if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE) + uint16_t flags = ip->i_d.di_flags; + + inode->i_flags &= ~(S_IMMUTABLE | S_APPEND | S_SYNC | + S_NOATIME | S_DAX); + + if (flags & XFS_DIFLAG_IMMUTABLE) inode->i_flags |= S_IMMUTABLE; - else - inode->i_flags &= ~S_IMMUTABLE; - if (ip->i_d.di_flags & XFS_DIFLAG_APPEND) + if (flags & XFS_DIFLAG_APPEND) inode->i_flags |= S_APPEND; - else - inode->i_flags &= ~S_APPEND; - if (ip->i_d.di_flags & XFS_DIFLAG_SYNC) + if (flags & XFS_DIFLAG_SYNC) inode->i_flags |= S_SYNC; - else - inode->i_flags &= ~S_SYNC; - if (ip->i_d.di_flags & XFS_DIFLAG_NOATIME) + if (flags & XFS_DIFLAG_NOATIME) inode->i_flags |= S_NOATIME; - else - inode->i_flags &= ~S_NOATIME; + /* XXX: Also needs an on-disk per inode flag! */ + if (ip->i_mount->m_flags & XFS_MOUNT_DAX) + inode->i_flags |= S_DAX; } /* diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 80429891d..f41b0c3fd 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -252,7 +252,7 @@ xfs_bulkstat_grab_ichunk( } irec->ir_free |= xfs_inobt_maskn(0, idx); - *icount = XFS_INODES_PER_CHUNK - irec->ir_freecount; + *icount = irec->ir_count - irec->ir_freecount; } return 0; @@ -415,6 +415,8 @@ xfs_bulkstat( goto del_cursor; if (icount) { irbp->ir_startino = r.ir_startino; + irbp->ir_holemask = r.ir_holemask; + irbp->ir_count = r.ir_count; irbp->ir_freecount = r.ir_freecount; irbp->ir_free = r.ir_free; irbp++; @@ -447,13 +449,15 @@ xfs_bulkstat( * If this chunk has any allocated inodes, save it. * Also start read-ahead now for this chunk. */ - if (r.ir_freecount < XFS_INODES_PER_CHUNK) { + if (r.ir_freecount < r.ir_count) { xfs_bulkstat_ichunk_ra(mp, agno, &r); irbp->ir_startino = r.ir_startino; + irbp->ir_holemask = r.ir_holemask; + irbp->ir_count = r.ir_count; irbp->ir_freecount = r.ir_freecount; irbp->ir_free = r.ir_free; irbp++; - icount += XFS_INODES_PER_CHUNK - r.ir_freecount; + icount += r.ir_count - r.ir_freecount; } error = xfs_btree_increment(cur, 0, &stat); if (error || stat == 0) { @@ -599,8 +603,7 @@ xfs_inumbers( agino = r.ir_startino + XFS_INODES_PER_CHUNK - 1; buffer[bufidx].xi_startino = XFS_AGINO_TO_INO(mp, agno, r.ir_startino); - buffer[bufidx].xi_alloccount = - XFS_INODES_PER_CHUNK - r.ir_freecount; + buffer[bufidx].xi_alloccount = r.ir_count - r.ir_freecount; buffer[bufidx].xi_allocmask = ~r.ir_free; if (++bufidx == bcount) { long written; diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 7c7842c85..85f883dd6 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -32,26 +32,12 @@ typedef unsigned int __uint32_t; typedef signed long long int __int64_t; typedef unsigned long long int __uint64_t; -typedef __uint32_t inst_t; /* an instruction */ - typedef __s64 xfs_off_t; /* <file offset> type */ typedef unsigned long long xfs_ino_t; /* <inode> type */ typedef __s64 xfs_daddr_t; /* <disk address> type */ -typedef char * xfs_caddr_t; /* <core address> type */ typedef __u32 xfs_dev_t; typedef __u32 xfs_nlink_t; -/* __psint_t is the same size as a pointer */ -#if (BITS_PER_LONG == 32) -typedef __int32_t __psint_t; -typedef __uint32_t __psunsigned_t; -#elif (BITS_PER_LONG == 64) -typedef __int64_t __psint_t; -typedef __uint64_t __psunsigned_t; -#else -#error BITS_PER_LONG must be 32 or 64 -#endif - #include "xfs_types.h" #include "kmem.h" diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index bcc7cfabb..08d4fe46f 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -109,7 +109,7 @@ xlog_ungrant_log_space( STATIC void xlog_verify_dest_ptr( struct xlog *log, - char *ptr); + void *ptr); STATIC void xlog_verify_grant_tail( struct xlog *log); @@ -513,7 +513,7 @@ xfs_log_done( struct xfs_mount *mp, struct xlog_ticket *ticket, struct xlog_in_core **iclog, - uint flags) + bool regrant) { struct xlog *log = mp->m_log; xfs_lsn_t lsn = 0; @@ -526,14 +526,11 @@ xfs_log_done( (((ticket->t_flags & XLOG_TIC_INITED) == 0) && (xlog_commit_record(log, ticket, iclog, &lsn)))) { lsn = (xfs_lsn_t) -1; - if (ticket->t_flags & XLOG_TIC_PERM_RESERV) { - flags |= XFS_LOG_REL_PERM_RESERV; - } + regrant = false; } - if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) == 0 || - (flags & XFS_LOG_REL_PERM_RESERV)) { + if (!regrant) { trace_xfs_log_done_nonperm(log, ticket); /* @@ -541,7 +538,6 @@ xfs_log_done( * request has been made to release a permanent reservation. */ xlog_ungrant_log_space(log, ticket); - xfs_log_ticket_put(ticket); } else { trace_xfs_log_done_perm(log, ticket); @@ -553,6 +549,7 @@ xfs_log_done( ticket->t_flags |= XLOG_TIC_INITED; } + xfs_log_ticket_put(ticket); return lsn; } @@ -1447,7 +1444,7 @@ xlog_alloc_log( iclog->ic_bp = bp; iclog->ic_data = bp->b_addr; #ifdef DEBUG - log->l_iclog_bak[i] = (xfs_caddr_t)&(iclog->ic_header); + log->l_iclog_bak[i] = &iclog->ic_header; #endif head = &iclog->ic_header; memset(head, 0, sizeof(xlog_rec_header_t)); @@ -1602,7 +1599,7 @@ xlog_pack_data( int i, j, k; int size = iclog->ic_offset + roundoff; __be32 cycle_lsn; - xfs_caddr_t dp; + char *dp; cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn); @@ -3664,7 +3661,7 @@ xlog_ticket_alloc( void xlog_verify_dest_ptr( struct xlog *log, - char *ptr) + void *ptr) { int i; int good_ptr = 0; @@ -3767,9 +3764,8 @@ xlog_verify_iclog( xlog_op_header_t *ophead; xlog_in_core_t *icptr; xlog_in_core_2_t *xhdr; - xfs_caddr_t ptr; - xfs_caddr_t base_ptr; - __psint_t field_offset; + void *base_ptr, *ptr, *p; + ptrdiff_t field_offset; __uint8_t clientid; int len, i, j, k, op_len; int idx; @@ -3788,9 +3784,9 @@ xlog_verify_iclog( if (iclog->ic_header.h_magicno != cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) xfs_emerg(log->l_mp, "%s: invalid magic num", __func__); - ptr = (xfs_caddr_t) &iclog->ic_header; - for (ptr += BBSIZE; ptr < ((xfs_caddr_t)&iclog->ic_header) + count; - ptr += BBSIZE) { + base_ptr = ptr = &iclog->ic_header; + p = &iclog->ic_header; + for (ptr += BBSIZE; ptr < base_ptr + count; ptr += BBSIZE) { if (*(__be32 *)ptr == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) xfs_emerg(log->l_mp, "%s: unexpected magic num", __func__); @@ -3798,20 +3794,19 @@ xlog_verify_iclog( /* check fields */ len = be32_to_cpu(iclog->ic_header.h_num_logops); - ptr = iclog->ic_datap; - base_ptr = ptr; - ophead = (xlog_op_header_t *)ptr; + base_ptr = ptr = iclog->ic_datap; + ophead = ptr; xhdr = iclog->ic_data; for (i = 0; i < len; i++) { - ophead = (xlog_op_header_t *)ptr; + ophead = ptr; /* clientid is only 1 byte */ - field_offset = (__psint_t) - ((xfs_caddr_t)&(ophead->oh_clientid) - base_ptr); + p = &ophead->oh_clientid; + field_offset = p - base_ptr; if (!syncing || (field_offset & 0x1ff)) { clientid = ophead->oh_clientid; } else { - idx = BTOBBT((xfs_caddr_t)&(ophead->oh_clientid) - iclog->ic_datap); + idx = BTOBBT((char *)&ophead->oh_clientid - iclog->ic_datap); if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); @@ -3829,13 +3824,13 @@ xlog_verify_iclog( (unsigned long)field_offset); /* check length */ - field_offset = (__psint_t) - ((xfs_caddr_t)&(ophead->oh_len) - base_ptr); + p = &ophead->oh_len; + field_offset = p - base_ptr; if (!syncing || (field_offset & 0x1ff)) { op_len = be32_to_cpu(ophead->oh_len); } else { - idx = BTOBBT((__psint_t)&ophead->oh_len - - (__psint_t)iclog->ic_datap); + idx = BTOBBT((uintptr_t)&ophead->oh_len - + (uintptr_t)iclog->ic_datap); if (idx >= (XLOG_HEADER_CYCLE_SIZE / BBSIZE)) { j = idx / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); k = idx % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 84e0deb95..fa27aaec7 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -111,15 +111,6 @@ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2) #define XFS_LSN_CMP(x,y) _lsn_cmp(x,y) /* - * Macros, structures, prototypes for interface to the log manager. - */ - -/* - * Flags to xfs_log_done() - */ -#define XFS_LOG_REL_PERM_RESERV 0x1 - -/* * Flags to xfs_log_force() * * XFS_LOG_SYNC: Synchronous force in-core log to disk @@ -138,7 +129,7 @@ struct xfs_log_callback; xfs_lsn_t xfs_log_done(struct xfs_mount *mp, struct xlog_ticket *ticket, struct xlog_in_core **iclog, - uint flags); + bool regrant); int _xfs_log_force(struct xfs_mount *mp, uint flags, int *log_forced); @@ -183,7 +174,7 @@ struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket); void xfs_log_ticket_put(struct xlog_ticket *ticket); void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp, - xfs_lsn_t *commit_lsn, int flags); + xfs_lsn_t *commit_lsn, bool regrant); bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip); void xfs_log_work_queue(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 45cc0ce18..abc2ccbff 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -624,7 +624,7 @@ restart: spin_unlock(&cil->xc_push_lock); /* xfs_log_done always frees the ticket on error. */ - commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, 0); + commit_lsn = xfs_log_done(log->l_mp, tic, &commit_iclog, false); if (commit_lsn == -1) goto out_abort; @@ -773,14 +773,10 @@ xfs_log_commit_cil( struct xfs_mount *mp, struct xfs_trans *tp, xfs_lsn_t *commit_lsn, - int flags) + bool regrant) { struct xlog *log = mp->m_log; struct xfs_cil *cil = log->l_cilp; - int log_flags = 0; - - if (flags & XFS_TRANS_RELEASE_LOG_RES) - log_flags = XFS_LOG_REL_PERM_RESERV; /* lock out background commit */ down_read(&cil->xc_ctx_lock); @@ -795,7 +791,7 @@ xfs_log_commit_cil( if (commit_lsn) *commit_lsn = tp->t_commit_lsn; - xfs_log_done(mp, tp->t_ticket, NULL, log_flags); + xfs_log_done(mp, tp->t_ticket, NULL, regrant); xfs_trans_unreserve_and_mod_sb(tp); /* @@ -809,7 +805,7 @@ xfs_log_commit_cil( * the log items. This affects (at least) processing of stale buffers, * inodes and EFIs. */ - xfs_trans_free_items(tp, tp->t_commit_lsn, 0); + xfs_trans_free_items(tp, tp->t_commit_lsn, false); xlog_cil_push_background(log); diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index db7cbdeb2..1c87c8abf 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -409,7 +409,7 @@ struct xlog { /* The following field are used for debugging; need to hold icloglock */ #ifdef DEBUG - char *l_iclog_bak[XLOG_MAX_ICLOGS]; + void *l_iclog_bak[XLOG_MAX_ICLOGS]; #endif }; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index a5d03396d..480ebba84 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -147,7 +147,7 @@ xlog_put_bp( * Return the address of the start of the given block number's data * in a log buffer. The buffer covers a log sector-aligned region. */ -STATIC xfs_caddr_t +STATIC char * xlog_align( struct xlog *log, xfs_daddr_t blk_no, @@ -203,7 +203,7 @@ xlog_bread( xfs_daddr_t blk_no, int nbblks, struct xfs_buf *bp, - xfs_caddr_t *offset) + char **offset) { int error; @@ -225,9 +225,9 @@ xlog_bread_offset( xfs_daddr_t blk_no, /* block to read from */ int nbblks, /* blocks to read */ struct xfs_buf *bp, - xfs_caddr_t offset) + char *offset) { - xfs_caddr_t orig_offset = bp->b_addr; + char *orig_offset = bp->b_addr; int orig_len = BBTOB(bp->b_length); int error, error2; @@ -396,7 +396,7 @@ xlog_find_cycle_start( xfs_daddr_t *last_blk, uint cycle) { - xfs_caddr_t offset; + char *offset; xfs_daddr_t mid_blk; xfs_daddr_t end_blk; uint mid_cycle; @@ -443,7 +443,7 @@ xlog_find_verify_cycle( uint cycle; xfs_buf_t *bp; xfs_daddr_t bufblks; - xfs_caddr_t buf = NULL; + char *buf = NULL; int error = 0; /* @@ -509,7 +509,7 @@ xlog_find_verify_log_record( { xfs_daddr_t i; xfs_buf_t *bp; - xfs_caddr_t offset = NULL; + char *offset = NULL; xlog_rec_header_t *head = NULL; int error = 0; int smallmem = 0; @@ -616,7 +616,7 @@ xlog_find_head( xfs_daddr_t *return_head_blk) { xfs_buf_t *bp; - xfs_caddr_t offset; + char *offset; xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk; int num_scan_bblks; uint first_half_cycle, last_half_cycle; @@ -891,7 +891,7 @@ xlog_find_tail( { xlog_rec_header_t *rhead; xlog_op_header_t *op_head; - xfs_caddr_t offset = NULL; + char *offset = NULL; xfs_buf_t *bp; int error, i, found; xfs_daddr_t umount_data_blk; @@ -1099,7 +1099,7 @@ xlog_find_zeroed( xfs_daddr_t *blk_no) { xfs_buf_t *bp; - xfs_caddr_t offset; + char *offset; uint first_cycle, last_cycle; xfs_daddr_t new_blk, last_blk, start_blk; xfs_daddr_t num_scan_bblks; @@ -1199,7 +1199,7 @@ bp_err: STATIC void xlog_add_record( struct xlog *log, - xfs_caddr_t buf, + char *buf, int cycle, int block, int tail_cycle, @@ -1227,7 +1227,7 @@ xlog_write_log_records( int tail_cycle, int tail_block) { - xfs_caddr_t offset; + char *offset; xfs_buf_t *bp; int balign, ealign; int sectbb = log->l_sectBBsize; @@ -1789,8 +1789,7 @@ xlog_recover_do_inode_buffer( return -EFSCORRUPTED; } - buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp, - next_unlinked_offset); + buffer_nextp = xfs_buf_offset(bp, next_unlinked_offset); *buffer_nextp = *logged_nextp; /* @@ -1798,7 +1797,7 @@ xlog_recover_do_inode_buffer( * have to leave the inode in a consistent state for whoever * reads it next.... */ - xfs_dinode_calc_crc(mp, (struct xfs_dinode *) + xfs_dinode_calc_crc(mp, xfs_buf_offset(bp, i * mp->m_sb.sb_inodesize)); } @@ -2508,8 +2507,8 @@ xlog_recover_inode_pass2( xfs_buf_t *bp; xfs_dinode_t *dip; int len; - xfs_caddr_t src; - xfs_caddr_t dest; + char *src; + char *dest; int error; int attr_index; uint fields; @@ -2551,7 +2550,7 @@ xlog_recover_inode_pass2( goto out_release; } ASSERT(in_f->ilf_fields & XFS_ILOG_CORE); - dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset); + dip = xfs_buf_offset(bp, in_f->ilf_boffset); /* * Make sure the place we're flushing out to really looks @@ -2890,7 +2889,7 @@ xlog_recover_dquot_pass2( return error; ASSERT(bp); - ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset); + ddq = xfs_buf_offset(bp, dq_f->qlf_boffset); /* * If the dquot has an LSN in it, recover the dquot only if it's less @@ -3073,12 +3072,22 @@ xlog_recover_do_icreate_pass2( return -EINVAL; } - /* existing allocation is fixed value */ - ASSERT(count == mp->m_ialloc_inos); - ASSERT(length == mp->m_ialloc_blks); - if (count != mp->m_ialloc_inos || - length != mp->m_ialloc_blks) { - xfs_warn(log->l_mp, "xlog_recover_do_icreate_trans: bad count 2"); + /* + * The inode chunk is either full or sparse and we only support + * m_ialloc_min_blks sized sparse allocations at this time. + */ + if (length != mp->m_ialloc_blks && + length != mp->m_ialloc_min_blks) { + xfs_warn(log->l_mp, + "%s: unsupported chunk length", __FUNCTION__); + return -EINVAL; + } + + /* verify inode count is consistent with extent length */ + if ((count >> mp->m_sb.sb_inopblog) != length) { + xfs_warn(log->l_mp, + "%s: inconsistent inode count and chunk length", + __FUNCTION__); return -EINVAL; } @@ -3096,8 +3105,8 @@ xlog_recover_do_icreate_pass2( XFS_AGB_TO_DADDR(mp, agno, agbno), length, 0)) return 0; - xfs_ialloc_inode_init(mp, NULL, buffer_list, agno, agbno, length, - be32_to_cpu(icl->icl_gen)); + xfs_ialloc_inode_init(mp, NULL, buffer_list, count, agno, agbno, length, + be32_to_cpu(icl->icl_gen)); return 0; } @@ -3369,17 +3378,17 @@ STATIC int xlog_recover_add_to_cont_trans( struct xlog *log, struct xlog_recover *trans, - xfs_caddr_t dp, + char *dp, int len) { xlog_recover_item_t *item; - xfs_caddr_t ptr, old_ptr; + char *ptr, *old_ptr; int old_len; if (list_empty(&trans->r_itemq)) { /* finish copying rest of trans header */ xlog_recover_add_item(&trans->r_itemq); - ptr = (xfs_caddr_t) &trans->r_theader + + ptr = (char *)&trans->r_theader + sizeof(xfs_trans_header_t) - len; memcpy(ptr, dp, len); return 0; @@ -3415,12 +3424,12 @@ STATIC int xlog_recover_add_to_trans( struct xlog *log, struct xlog_recover *trans, - xfs_caddr_t dp, + char *dp, int len) { xfs_inode_log_format_t *in_f; /* any will do */ xlog_recover_item_t *item; - xfs_caddr_t ptr; + char *ptr; if (!len) return 0; @@ -3509,7 +3518,7 @@ STATIC int xlog_recovery_process_trans( struct xlog *log, struct xlog_recover *trans, - xfs_caddr_t dp, + char *dp, unsigned int len, unsigned int flags, int pass) @@ -3616,8 +3625,8 @@ xlog_recover_process_ophdr( struct hlist_head rhash[], struct xlog_rec_header *rhead, struct xlog_op_header *ohead, - xfs_caddr_t dp, - xfs_caddr_t end, + char *dp, + char *end, int pass) { struct xlog_recover *trans; @@ -3666,11 +3675,11 @@ xlog_recover_process_data( struct xlog *log, struct hlist_head rhash[], struct xlog_rec_header *rhead, - xfs_caddr_t dp, + char *dp, int pass) { struct xlog_op_header *ohead; - xfs_caddr_t end; + char *end; int num_logops; int error; @@ -3756,11 +3765,11 @@ xlog_recover_process_efi( } set_bit(XFS_EFI_RECOVERED, &efip->efi_flags); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); return error; abort_error: - xfs_trans_cancel(tp, XFS_TRANS_ABORT); + xfs_trans_cancel(tp); return error; } @@ -3862,13 +3871,13 @@ xlog_recover_clear_agi_bucket( xfs_trans_log_buf(tp, agibp, offset, (offset + sizeof(xfs_agino_t) - 1)); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); if (error) goto out_error; return; out_abort: - xfs_trans_cancel(tp, XFS_TRANS_ABORT); + xfs_trans_cancel(tp); out_error: xfs_warn(mp, "%s: failed to clear agi %d. Continuing.", __func__, agno); return; @@ -4015,7 +4024,7 @@ xlog_recover_process_iunlinks( STATIC int xlog_unpack_data_crc( struct xlog_rec_header *rhead, - xfs_caddr_t dp, + char *dp, struct xlog *log) { __le32 crc; @@ -4045,7 +4054,7 @@ xlog_unpack_data_crc( STATIC int xlog_unpack_data( struct xlog_rec_header *rhead, - xfs_caddr_t dp, + char *dp, struct xlog *log) { int i, j, k; @@ -4127,7 +4136,7 @@ xlog_do_recovery_pass( { xlog_rec_header_t *rhead; xfs_daddr_t blk_no; - xfs_caddr_t offset; + char *offset; xfs_buf_t *hbp, *dbp; int error = 0, h_size; int bblks, split_bblks; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 6f23fbdfb..461e791ef 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -725,6 +725,22 @@ xfs_mountfs( } /* + * If enabled, sparse inode chunk alignment is expected to match the + * cluster size. Full inode chunk alignment must match the chunk size, + * but that is checked on sb read verification... + */ + if (xfs_sb_version_hassparseinodes(&mp->m_sb) && + mp->m_sb.sb_spino_align != + XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)) { + xfs_warn(mp, + "Sparse inode block alignment (%u) must match cluster size (%llu).", + mp->m_sb.sb_spino_align, + XFS_B_TO_FSBT(mp, mp->m_inode_cluster_size)); + error = -EINVAL; + goto out_remove_uuid; + } + + /* * Set inode alignment fields */ xfs_set_inoalignment(mp); diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 8c995a2cc..7999e91cd 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -101,6 +101,8 @@ typedef struct xfs_mount { __uint64_t m_flags; /* global mount flags */ int m_ialloc_inos; /* inodes in inode allocation */ int m_ialloc_blks; /* blocks in inode allocation */ + int m_ialloc_min_blks;/* min blocks in sparse inode + * allocation */ int m_inoalign_mask;/* mask sb_inoalignmt if used */ uint m_qflags; /* quota status flags */ struct xfs_trans_resv m_resv; /* precomputed res values */ @@ -179,6 +181,8 @@ typedef struct xfs_mount { allocator */ #define XFS_MOUNT_NOATTR2 (1ULL << 25) /* disable use of attr2 format */ +#define XFS_MOUNT_DAX (1ULL << 62) /* TEST ONLY! */ + /* * Default minimum read and write sizes. diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 981a657ec..ab4a6066f 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -306,7 +306,7 @@ xfs_fs_commit_blocks( tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_NOT_SIZE); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_ichange, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); goto out_drop_iolock; } @@ -321,7 +321,7 @@ xfs_fs_commit_blocks( } xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); out_drop_iolock: xfs_iunlock(ip, XFS_IOLOCK_EXCL); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 5538468c7..eac9549ef 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -756,7 +756,7 @@ xfs_qm_qino_alloc( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_create, XFS_QM_QINOCREATE_SPACE_RES(mp), 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -764,8 +764,7 @@ xfs_qm_qino_alloc( error = xfs_dir_ialloc(&tp, NULL, S_IFREG, 1, 0, 0, 1, ip, &committed); if (error) { - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | - XFS_TRANS_ABORT); + xfs_trans_cancel(tp); return error; } } @@ -796,7 +795,7 @@ xfs_qm_qino_alloc( spin_unlock(&mp->m_sb_lock); xfs_log_sb(tp); - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); xfs_alert(mp, "%s failed (error %d)!", __func__, error); diff --git a/fs/xfs/xfs_qm_syscalls.c b/fs/xfs/xfs_qm_syscalls.c index 9a25c9275..3640c6e89 100644 --- a/fs/xfs/xfs_qm_syscalls.c +++ b/fs/xfs/xfs_qm_syscalls.c @@ -239,7 +239,7 @@ xfs_qm_scall_trunc_qfile( tp = xfs_trans_alloc(mp, XFS_TRANS_TRUNCATE_FILE); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); xfs_iunlock(ip, XFS_IOLOCK_EXCL); goto out_put; } @@ -252,15 +252,14 @@ xfs_qm_scall_trunc_qfile( error = xfs_itruncate_extents(&tp, ip, XFS_DATA_FORK, 0); if (error) { - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | - XFS_TRANS_ABORT); + xfs_trans_cancel(tp); goto out_unlock; } ASSERT(ip->i_d.di_nextents == 0); xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_MOD | XFS_ICHGTIME_CHG); - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); out_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); @@ -437,7 +436,7 @@ xfs_qm_scall_setqlim( tp = xfs_trans_alloc(mp, XFS_TRANS_QM_SETQLIM); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_setqlim, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); goto out_rele; } @@ -548,7 +547,7 @@ xfs_qm_scall_setqlim( dqp->dq_flags |= XFS_DQ_DIRTY; xfs_trans_log_dquot(tp, dqp); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); out_rele: xfs_qm_dqrele(dqp); @@ -571,7 +570,7 @@ xfs_qm_log_quotaoff_end( error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_equotaoff, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -585,8 +584,7 @@ xfs_qm_log_quotaoff_end( * We don't care about quotoff's performance. */ xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0); - return error; + return xfs_trans_commit(tp); } @@ -605,7 +603,7 @@ xfs_qm_log_quotaoff( tp = xfs_trans_alloc(mp, XFS_TRANS_QM_QUOTAOFF); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_qm_quotaoff, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); goto out; } @@ -624,7 +622,7 @@ xfs_qm_log_quotaoff( * We don't care about quotoff's performance. */ xfs_trans_set_sync(tp); - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); if (error) goto out; diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index 5376dd406..ce6506ada 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -55,7 +55,6 @@ struct xfs_trans; typedef struct xfs_dqtrx { struct xfs_dquot *qt_dquot; /* the dquot this refers to */ ulong qt_blk_res; /* blks reserved on a dquot */ - ulong qt_blk_res_used; /* blks used from the reservation */ ulong qt_ino_res; /* inode reserved on a dquot */ ulong qt_ino_res_used; /* inodes used from the reservation */ long qt_bcount_delta; /* dquot blk count changes */ diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index f2079b691..f4e8c06ee 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -780,7 +780,6 @@ xfs_growfs_rt_alloc( * Allocate space to the file, as necessary. */ while (oblocks < nblocks) { - int cancelflags = 0; xfs_trans_t *tp; tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFSRT_ALLOC); @@ -792,7 +791,6 @@ xfs_growfs_rt_alloc( resblks, 0); if (error) goto error_cancel; - cancelflags = XFS_TRANS_RELEASE_LOG_RES; /* * Lock the inode. */ @@ -804,7 +802,6 @@ xfs_growfs_rt_alloc( * Allocate blocks to the bitmap file. */ nmap = 1; - cancelflags |= XFS_TRANS_ABORT; error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks, XFS_BMAPI_METADATA, &firstblock, resblks, &map, &nmap, &flist); @@ -818,14 +815,13 @@ xfs_growfs_rt_alloc( error = xfs_bmap_finish(&tp, &flist, &committed); if (error) goto error_cancel; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) goto error; /* * Now we need to clear the allocated blocks. * Do this one block per transaction, to keep it simple. */ - cancelflags = 0; for (bno = map.br_startoff, fsbno = map.br_startblock; bno < map.br_startoff + map.br_blockcount; bno++, fsbno++) { @@ -851,7 +847,7 @@ xfs_growfs_rt_alloc( if (bp == NULL) { error = -EIO; error_cancel: - xfs_trans_cancel(tp, cancelflags); + xfs_trans_cancel(tp); goto error; } memset(bp->b_addr, 0, mp->m_sb.sb_blocksize); @@ -859,7 +855,7 @@ error_cancel: /* * Commit the transaction. */ - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); if (error) goto error; } @@ -973,7 +969,6 @@ xfs_growfs_rt( bmbno < nrbmblocks; bmbno++) { xfs_trans_t *tp; - int cancelflags = 0; *nmp = *mp; nsbp = &nmp->m_sb; @@ -1015,7 +1010,6 @@ xfs_growfs_rt( mp->m_rbmip->i_d.di_size = nsbp->sb_rbmblocks * nsbp->sb_blocksize; xfs_trans_log_inode(tp, mp->m_rbmip, XFS_ILOG_CORE); - cancelflags |= XFS_TRANS_ABORT; /* * Get the summary inode into the transaction. */ @@ -1062,7 +1056,7 @@ xfs_growfs_rt( nsbp->sb_rextents - sbp->sb_rextents, &bp, &sumbno); if (error) { error_cancel: - xfs_trans_cancel(tp, cancelflags); + xfs_trans_cancel(tp); break; } /* @@ -1076,7 +1070,7 @@ error_cancel: mp->m_rsumlevels = nrsumlevels; mp->m_rsumsize = nrsumsize; - error = xfs_trans_commit(tp, 0); + error = xfs_trans_commit(tp); if (error) break; } diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 858e1e62b..1fb16562c 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -112,6 +112,8 @@ static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */ #define MNTOPT_DISCARD "discard" /* Discard unused blocks */ #define MNTOPT_NODISCARD "nodiscard" /* Do not discard unused blocks */ +#define MNTOPT_DAX "dax" /* Enable direct access to bdev pages */ + /* * Table driven mount option parser. * @@ -363,6 +365,10 @@ xfs_parseargs( mp->m_flags |= XFS_MOUNT_DISCARD; } else if (!strcmp(this_char, MNTOPT_NODISCARD)) { mp->m_flags &= ~XFS_MOUNT_DISCARD; +#ifdef CONFIG_FS_DAX + } else if (!strcmp(this_char, MNTOPT_DAX)) { + mp->m_flags |= XFS_MOUNT_DAX; +#endif } else { xfs_warn(mp, "unknown mount option [%s].", this_char); return -EINVAL; @@ -452,8 +458,8 @@ done: } struct proc_xfs_info { - int flag; - char *str; + uint64_t flag; + char *str; }; STATIC int @@ -474,6 +480,7 @@ xfs_showargs( { XFS_MOUNT_GRPID, "," MNTOPT_GRPID }, { XFS_MOUNT_DISCARD, "," MNTOPT_DISCARD }, { XFS_MOUNT_SMALL_INUMS, "," MNTOPT_32BITINODE }, + { XFS_MOUNT_DAX, "," MNTOPT_DAX }, { 0, NULL } }; static struct proc_xfs_info xfs_info_unset[] = { @@ -1507,6 +1514,20 @@ xfs_fs_fill_super( if (XFS_SB_VERSION_NUM(&mp->m_sb) == XFS_SB_VERSION_5) sb->s_flags |= MS_I_VERSION; + if (mp->m_flags & XFS_MOUNT_DAX) { + xfs_warn(mp, + "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); + if (sb->s_blocksize != PAGE_SIZE) { + xfs_alert(mp, + "Filesystem block size invalid for DAX Turning DAX off."); + mp->m_flags &= ~XFS_MOUNT_DAX; + } else if (!sb->s_bdev->bd_disk->fops->direct_access) { + xfs_alert(mp, + "Block device does not support DAX Turning DAX off."); + mp->m_flags &= ~XFS_MOUNT_DAX; + } + } + error = xfs_mountfs(mp); if (error) goto out_filestream_unmount; diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 40c076523..4be27b021 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -178,7 +178,6 @@ xfs_symlink( struct xfs_bmap_free free_list; xfs_fsblock_t first_block; bool unlock_dp_on_error = false; - uint cancel_flags; int committed; xfs_fileoff_t first_fsb; xfs_filblks_t fs_blocks; @@ -224,7 +223,6 @@ xfs_symlink( return error; tp = xfs_trans_alloc(mp, XFS_TRANS_SYMLINK); - cancel_flags = XFS_TRANS_RELEASE_LOG_RES; /* * The symlink will fit into the inode data fork? * There can't be any attributes so we get the whole variable part. @@ -239,10 +237,8 @@ xfs_symlink( resblks = 0; error = xfs_trans_reserve(tp, &M_RES(mp)->tr_symlink, 0, 0); } - if (error) { - cancel_flags = 0; + if (error) goto out_trans_cancel; - } xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); unlock_dp_on_error = true; @@ -394,7 +390,7 @@ xfs_symlink( if (error) goto out_bmap_cancel; - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) goto out_release_inode; @@ -407,9 +403,8 @@ xfs_symlink( out_bmap_cancel: xfs_bmap_cancel(&free_list); - cancel_flags |= XFS_TRANS_ABORT; out_trans_cancel: - xfs_trans_cancel(tp, cancel_flags); + xfs_trans_cancel(tp); out_release_inode: /* * Wait until after the current transaction is aborted to finish the @@ -464,7 +459,7 @@ xfs_inactive_symlink_rmt( tp = xfs_trans_alloc(mp, XFS_TRANS_INACTIVE); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); if (error) { - xfs_trans_cancel(tp, 0); + xfs_trans_cancel(tp); return error; } @@ -533,7 +528,7 @@ xfs_inactive_symlink_rmt( /* * Commit the transaction containing extent freeing and EFDs. */ - error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); + error = xfs_trans_commit(tp); if (error) { ASSERT(XFS_FORCED_SHUTDOWN(mp)); goto error_unlock; @@ -552,7 +547,7 @@ xfs_inactive_symlink_rmt( error_bmap_cancel: xfs_bmap_cancel(&free_list); error_trans_cancel: - xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT); + xfs_trans_cancel(tp); error_unlock: xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 615781bf4..8d916d33d 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -738,6 +738,53 @@ TRACE_EVENT(xfs_iomap_prealloc_size, __entry->blocks, __entry->shift, __entry->writeio_blocks) ) +TRACE_EVENT(xfs_irec_merge_pre, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino, + uint16_t holemask, xfs_agino_t nagino, uint16_t nholemask), + TP_ARGS(mp, agno, agino, holemask, nagino, nholemask), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + __field(uint16_t, holemask) + __field(xfs_agino_t, nagino) + __field(uint16_t, nholemask) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agino = agino; + __entry->holemask = holemask; + __entry->nagino = nagino; + __entry->nholemask = holemask; + ), + TP_printk("dev %d:%d agno %d inobt (%u:0x%x) new (%u:0x%x)", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->agno, + __entry->agino, __entry->holemask, __entry->nagino, + __entry->nholemask) +) + +TRACE_EVENT(xfs_irec_merge_post, + TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_agino_t agino, + uint16_t holemask), + TP_ARGS(mp, agno, agino, holemask), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_agnumber_t, agno) + __field(xfs_agino_t, agino) + __field(uint16_t, holemask) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->agno = agno; + __entry->agino = agino; + __entry->holemask = holemask; + ), + TP_printk("dev %d:%d agno %d inobt (%u:0x%x)", MAJOR(__entry->dev), + MINOR(__entry->dev), __entry->agno, __entry->agino, + __entry->holemask) +) + #define DEFINE_IREF_EVENT(name) \ DEFINE_EVENT(xfs_iref_class, name, \ TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \ diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 220ef2c90..0582a2710 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -113,7 +113,7 @@ xfs_trans_free( * blocks. Locks and log items, however, are no inherited. They must * be added to the new transaction explicitly. */ -xfs_trans_t * +STATIC xfs_trans_t * xfs_trans_dup( xfs_trans_t *tp) { @@ -251,14 +251,7 @@ xfs_trans_reserve( */ undo_log: if (resp->tr_logres > 0) { - int log_flags; - - if (resp->tr_logflags & XFS_TRANS_PERM_LOG_RES) { - log_flags = XFS_LOG_REL_PERM_RESERV; - } else { - log_flags = 0; - } - xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, log_flags); + xfs_log_done(tp->t_mountp, tp->t_ticket, NULL, false); tp->t_ticket = NULL; tp->t_log_res = 0; tp->t_flags &= ~XFS_TRANS_PERM_LOG_RES; @@ -744,7 +737,7 @@ void xfs_trans_free_items( struct xfs_trans *tp, xfs_lsn_t commit_lsn, - int flags) + bool abort) { struct xfs_log_item_desc *lidp, *next; @@ -755,7 +748,7 @@ xfs_trans_free_items( if (commit_lsn != NULLCOMMITLSN) lip->li_ops->iop_committing(lip, commit_lsn); - if (flags & XFS_TRANS_ABORT) + if (abort) lip->li_flags |= XFS_LI_ABORTED; lip->li_ops->iop_unlock(lip); @@ -892,27 +885,17 @@ xfs_trans_committed_bulk( * have already been unlocked as if the commit had succeeded. * Do not reference the transaction structure after this call. */ -int -xfs_trans_commit( +static int +__xfs_trans_commit( struct xfs_trans *tp, - uint flags) + bool regrant) { struct xfs_mount *mp = tp->t_mountp; xfs_lsn_t commit_lsn = -1; int error = 0; - int log_flags = 0; int sync = tp->t_flags & XFS_TRANS_SYNC; /* - * Determine whether this commit is releasing a permanent - * log reservation or not. - */ - if (flags & XFS_TRANS_RELEASE_LOG_RES) { - ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); - log_flags = XFS_LOG_REL_PERM_RESERV; - } - - /* * If there is nothing to be logged by the transaction, * then unlock all of the items associated with the * transaction and free the transaction structure. @@ -936,7 +919,7 @@ xfs_trans_commit( xfs_trans_apply_sb_deltas(tp); xfs_trans_apply_dquot_deltas(tp); - xfs_log_commit_cil(mp, tp, &commit_lsn, flags); + xfs_log_commit_cil(mp, tp, &commit_lsn, regrant); current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); xfs_trans_free(tp); @@ -964,18 +947,25 @@ out_unreserve: */ xfs_trans_unreserve_and_mod_dquots(tp); if (tp->t_ticket) { - commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, log_flags); + commit_lsn = xfs_log_done(mp, tp->t_ticket, NULL, regrant); if (commit_lsn == -1 && !error) error = -EIO; } current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); - xfs_trans_free_items(tp, NULLCOMMITLSN, error ? XFS_TRANS_ABORT : 0); + xfs_trans_free_items(tp, NULLCOMMITLSN, !!error); xfs_trans_free(tp); XFS_STATS_INC(xs_trans_empty); return error; } +int +xfs_trans_commit( + struct xfs_trans *tp) +{ + return __xfs_trans_commit(tp, false); +} + /* * Unlock all of the transaction's items and free the transaction. * The transaction must not have modified any of its items, because @@ -986,29 +976,22 @@ out_unreserve: */ void xfs_trans_cancel( - xfs_trans_t *tp, - int flags) + struct xfs_trans *tp) { - int log_flags; - xfs_mount_t *mp = tp->t_mountp; + struct xfs_mount *mp = tp->t_mountp; + bool dirty = (tp->t_flags & XFS_TRANS_DIRTY); /* - * See if the caller is being too lazy to figure out if - * the transaction really needs an abort. - */ - if ((flags & XFS_TRANS_ABORT) && !(tp->t_flags & XFS_TRANS_DIRTY)) - flags &= ~XFS_TRANS_ABORT; - /* * See if the caller is relying on us to shut down the * filesystem. This happens in paths where we detect * corruption and decide to give up. */ - if ((tp->t_flags & XFS_TRANS_DIRTY) && !XFS_FORCED_SHUTDOWN(mp)) { + if (dirty && !XFS_FORCED_SHUTDOWN(mp)) { XFS_ERROR_REPORT("xfs_trans_cancel", XFS_ERRLEVEL_LOW, mp); xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); } #ifdef DEBUG - if (!(flags & XFS_TRANS_ABORT) && !XFS_FORCED_SHUTDOWN(mp)) { + if (!dirty && !XFS_FORCED_SHUTDOWN(mp)) { struct xfs_log_item_desc *lidp; list_for_each_entry(lidp, &tp->t_items, lid_trans) @@ -1018,27 +1001,20 @@ xfs_trans_cancel( xfs_trans_unreserve_and_mod_sb(tp); xfs_trans_unreserve_and_mod_dquots(tp); - if (tp->t_ticket) { - if (flags & XFS_TRANS_RELEASE_LOG_RES) { - ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); - log_flags = XFS_LOG_REL_PERM_RESERV; - } else { - log_flags = 0; - } - xfs_log_done(mp, tp->t_ticket, NULL, log_flags); - } + if (tp->t_ticket) + xfs_log_done(mp, tp->t_ticket, NULL, false); /* mark this thread as no longer being in a transaction */ current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS); - xfs_trans_free_items(tp, NULLCOMMITLSN, flags); + xfs_trans_free_items(tp, NULLCOMMITLSN, dirty); xfs_trans_free(tp); } /* * Roll from one trans in the sequence of PERMANENT transactions to * the next: permanent transactions are only flushed out when - * committed with XFS_TRANS_RELEASE_LOG_RES, but we still want as soon + * committed with xfs_trans_commit(), but we still want as soon * as possible to let chunks of it go to the log. So we commit the * chunk we've been working on and get a new transaction to continue. */ @@ -1055,7 +1031,8 @@ xfs_trans_roll( * Ensure that the inode is always logged. */ trans = *tpp; - xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE); + if (dp) + xfs_trans_log_inode(trans, dp, XFS_ILOG_CORE); /* * Copy the critical parameters from one trans to the next. @@ -1071,20 +1048,13 @@ xfs_trans_roll( * is in progress. The caller takes the responsibility to cancel * the duplicate transaction that gets returned. */ - error = xfs_trans_commit(trans, 0); + error = __xfs_trans_commit(trans, true); if (error) return error; trans = *tpp; /* - * transaction commit worked ok so we can drop the extra ticket - * reference that we gained in xfs_trans_dup() - */ - xfs_log_ticket_put(trans->t_ticket); - - - /* * Reserve space in the log for th next transaction. * This also pushes items in the "AIL", the list of logged items, * out to disk if they are taking up space at the tail of the log @@ -1100,6 +1070,7 @@ xfs_trans_roll( if (error) return error; - xfs_trans_ijoin(trans, dp, 0); + if (dp) + xfs_trans_ijoin(trans, dp, 0); return 0; } diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index b5bc1ab3c..3b21b4e5e 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -133,8 +133,6 @@ typedef struct xfs_trans { * XFS transaction mechanism exported interfaces that are * actually macros. */ -#define xfs_trans_get_log_res(tp) ((tp)->t_log_res) -#define xfs_trans_get_log_count(tp) ((tp)->t_log_count) #define xfs_trans_get_block_res(tp) ((tp)->t_blk_res) #define xfs_trans_set_sync(tp) ((tp)->t_flags |= XFS_TRANS_SYNC) @@ -153,7 +151,6 @@ typedef struct xfs_trans { */ xfs_trans_t *xfs_trans_alloc(struct xfs_mount *, uint); xfs_trans_t *_xfs_trans_alloc(struct xfs_mount *, uint, xfs_km_flags_t); -xfs_trans_t *xfs_trans_dup(xfs_trans_t *); int xfs_trans_reserve(struct xfs_trans *, struct xfs_trans_res *, uint, uint); void xfs_trans_mod_sb(xfs_trans_t *, uint, int64_t); @@ -228,9 +225,9 @@ void xfs_trans_log_efd_extent(xfs_trans_t *, struct xfs_efd_log_item *, xfs_fsblock_t, xfs_extlen_t); -int xfs_trans_commit(xfs_trans_t *, uint flags); +int xfs_trans_commit(struct xfs_trans *); int xfs_trans_roll(struct xfs_trans **, struct xfs_inode *); -void xfs_trans_cancel(xfs_trans_t *, int); +void xfs_trans_cancel(xfs_trans_t *); int xfs_trans_ail_init(struct xfs_mount *); void xfs_trans_ail_destroy(struct xfs_mount *); diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 573aefb5a..1098cf490 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -159,7 +159,7 @@ xfs_trans_ail_cursor_next( { struct xfs_log_item *lip = cur->item; - if ((__psint_t)lip & 1) + if ((uintptr_t)lip & 1) lip = xfs_ail_min(ailp); if (lip) cur->item = xfs_ail_next(ailp, lip); @@ -196,7 +196,7 @@ xfs_trans_ail_cursor_clear( list_for_each_entry(cur, &ailp->xa_cursors, list) { if (cur->item == lip) cur->item = (struct xfs_log_item *) - ((__psint_t)cur->item | 1); + ((uintptr_t)cur->item | 1); } } @@ -287,7 +287,7 @@ xfs_ail_splice( * find the place in the AIL where the items belong. */ lip = cur ? cur->item : NULL; - if (!lip || (__psint_t) lip & 1) + if (!lip || (uintptr_t)lip & 1) lip = __xfs_trans_ail_cursor_last(ailp, lsn); /* diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index 76a16df55..ce78534a0 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -90,8 +90,9 @@ xfs_trans_dup_dqinfo( xfs_trans_t *ntp) { xfs_dqtrx_t *oq, *nq; - int i,j; + int i, j; xfs_dqtrx_t *oqa, *nqa; + ulong blk_res_used; if (!otp->t_dqinfo) return; @@ -102,18 +103,23 @@ xfs_trans_dup_dqinfo( * Because the quota blk reservation is carried forward, * it is also necessary to carry forward the DQ_DIRTY flag. */ - if(otp->t_flags & XFS_TRANS_DQ_DIRTY) + if (otp->t_flags & XFS_TRANS_DQ_DIRTY) ntp->t_flags |= XFS_TRANS_DQ_DIRTY; for (j = 0; j < XFS_QM_TRANS_DQTYPES; j++) { oqa = otp->t_dqinfo->dqs[j]; nqa = ntp->t_dqinfo->dqs[j]; for (i = 0; i < XFS_QM_TRANS_MAXDQS; i++) { + blk_res_used = 0; + if (oqa[i].qt_dquot == NULL) break; oq = &oqa[i]; nq = &nqa[i]; + if (oq->qt_blk_res && oq->qt_bcount_delta > 0) + blk_res_used = oq->qt_bcount_delta; + nq->qt_dquot = oq->qt_dquot; nq->qt_bcount_delta = nq->qt_icount_delta = 0; nq->qt_rtbcount_delta = 0; @@ -121,8 +127,8 @@ xfs_trans_dup_dqinfo( /* * Transfer whatever is left of the reservations. */ - nq->qt_blk_res = oq->qt_blk_res - oq->qt_blk_res_used; - oq->qt_blk_res = oq->qt_blk_res_used; + nq->qt_blk_res = oq->qt_blk_res - blk_res_used; + oq->qt_blk_res = blk_res_used; nq->qt_rtblk_res = oq->qt_rtblk_res - oq->qt_rtblk_res_used; @@ -239,10 +245,6 @@ xfs_trans_mod_dquot( * disk blocks used. */ case XFS_TRANS_DQ_BCOUNT: - if (qtrx->qt_blk_res && delta > 0) { - qtrx->qt_blk_res_used += (ulong)delta; - ASSERT(qtrx->qt_blk_res >= qtrx->qt_blk_res_used); - } qtrx->qt_bcount_delta += delta; break; @@ -423,15 +425,19 @@ xfs_trans_apply_dquot_deltas( * reservation that a transaction structure knows of. */ if (qtrx->qt_blk_res != 0) { - if (qtrx->qt_blk_res != qtrx->qt_blk_res_used) { - if (qtrx->qt_blk_res > - qtrx->qt_blk_res_used) + ulong blk_res_used = 0; + + if (qtrx->qt_bcount_delta > 0) + blk_res_used = qtrx->qt_bcount_delta; + + if (qtrx->qt_blk_res != blk_res_used) { + if (qtrx->qt_blk_res > blk_res_used) dqp->q_res_bcount -= (xfs_qcnt_t) (qtrx->qt_blk_res - - qtrx->qt_blk_res_used); + blk_res_used); else dqp->q_res_bcount -= (xfs_qcnt_t) - (qtrx->qt_blk_res_used - + (blk_res_used - qtrx->qt_blk_res); } } else { diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index bd1281862..1b7362945 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -30,7 +30,7 @@ void xfs_trans_init(struct xfs_mount *); void xfs_trans_add_item(struct xfs_trans *, struct xfs_log_item *); void xfs_trans_del_item(struct xfs_log_item *); void xfs_trans_free_items(struct xfs_trans *tp, xfs_lsn_t commit_lsn, - int flags); + bool abort); void xfs_trans_unreserve_and_mod_sb(struct xfs_trans *tp); void xfs_trans_committed_bulk(struct xfs_ail *ailp, struct xfs_log_vec *lv, |