diff options
author | André Fabian Silva Delgado <emulatorman@parabola.nu> | 2016-03-25 03:53:42 -0300 |
---|---|---|
committer | André Fabian Silva Delgado <emulatorman@parabola.nu> | 2016-03-25 03:53:42 -0300 |
commit | 03dd4cb26d967f9588437b0fc9cc0e8353322bb7 (patch) | |
tree | fa581f6dc1c0596391690d1f67eceef3af8246dc /fs | |
parent | d4e493caf788ef44982e131ff9c786546904d934 (diff) |
Linux-libre 4.5-gnu
Diffstat (limited to 'fs')
600 files changed, 15088 insertions, 39036 deletions
diff --git a/fs/9p/acl.c b/fs/9p/acl.c index a7e28890f..9da967f38 100644 --- a/fs/9p/acl.c +++ b/fs/9p/acl.c @@ -67,8 +67,8 @@ int v9fs_get_acl(struct inode *inode, struct p9_fid *fid) return 0; } /* get the default/access acl values and cache them */ - dacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_DEFAULT); - pacl = __v9fs_get_acl(fid, POSIX_ACL_XATTR_ACCESS); + dacl = __v9fs_get_acl(fid, XATTR_NAME_POSIX_ACL_DEFAULT); + pacl = __v9fs_get_acl(fid, XATTR_NAME_POSIX_ACL_ACCESS); if (!IS_ERR(dacl) && !IS_ERR(pacl)) { set_cached_acl(inode, ACL_TYPE_DEFAULT, dacl); @@ -133,10 +133,10 @@ static int v9fs_set_acl(struct p9_fid *fid, int type, struct posix_acl *acl) goto err_free_out; switch (type) { case ACL_TYPE_ACCESS: - name = POSIX_ACL_XATTR_ACCESS; + name = XATTR_NAME_POSIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: - name = POSIX_ACL_XATTR_DEFAULT; + name = XATTR_NAME_POSIX_ACL_DEFAULT; break; default: BUG(); @@ -220,15 +220,12 @@ static int v9fs_xattr_get_acl(const struct xattr_handler *handler, struct posix_acl *acl; int error; - if (strcmp(name, "") != 0) - return -EINVAL; - v9ses = v9fs_dentry2v9ses(dentry); /* * We allow set/get/list of acl when access=client is not specified */ if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) - return v9fs_xattr_get(dentry, handler->prefix, buffer, size); + return v9fs_xattr_get(dentry, handler->name, buffer, size); acl = v9fs_get_cached_acl(d_inode(dentry), handler->flags); if (IS_ERR(acl)) @@ -250,16 +247,13 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler, struct v9fs_session_info *v9ses; struct inode *inode = d_inode(dentry); - if (strcmp(name, "") != 0) - return -EINVAL; - v9ses = v9fs_dentry2v9ses(dentry); /* * set the attribute on the remote. Without even looking at the * xattr value. We leave it to the server to validate */ if ((v9ses->flags & V9FS_ACCESS_MASK) != V9FS_ACCESS_CLIENT) - return v9fs_xattr_set(dentry, handler->prefix, value, size, + return v9fs_xattr_set(dentry, handler->name, value, size, flags); if (S_ISLNK(inode->i_mode)) @@ -319,7 +313,7 @@ static int v9fs_xattr_set_acl(const struct xattr_handler *handler, default: BUG(); } - retval = v9fs_xattr_set(dentry, handler->prefix, value, size, flags); + retval = v9fs_xattr_set(dentry, handler->name, value, size, flags); if (!retval) set_cached_acl(inode, handler->flags, acl); err_out: @@ -328,14 +322,14 @@ err_out: } const struct xattr_handler v9fs_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, + .name = XATTR_NAME_POSIX_ACL_ACCESS, .flags = ACL_TYPE_ACCESS, .get = v9fs_xattr_get_acl, .set = v9fs_xattr_set_acl, }; const struct xattr_handler v9fs_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, + .name = XATTR_NAME_POSIX_ACL_DEFAULT, .flags = ACL_TYPE_DEFAULT, .get = v9fs_xattr_get_acl, .set = v9fs_xattr_set_acl, diff --git a/fs/9p/cache.c b/fs/9p/cache.c index a69260f27..103ca5e12 100644 --- a/fs/9p/cache.c +++ b/fs/9p/cache.c @@ -243,14 +243,14 @@ void v9fs_cache_inode_set_cookie(struct inode *inode, struct file *filp) if (!v9inode->fscache) return; - spin_lock(&v9inode->fscache_lock); + mutex_lock(&v9inode->fscache_lock); if ((filp->f_flags & O_ACCMODE) != O_RDONLY) v9fs_cache_inode_flush_cookie(inode); else v9fs_cache_inode_get_cookie(inode); - spin_unlock(&v9inode->fscache_lock); + mutex_unlock(&v9inode->fscache_lock); } void v9fs_cache_inode_reset_cookie(struct inode *inode) @@ -264,7 +264,7 @@ void v9fs_cache_inode_reset_cookie(struct inode *inode) old = v9inode->fscache; - spin_lock(&v9inode->fscache_lock); + mutex_lock(&v9inode->fscache_lock); fscache_relinquish_cookie(v9inode->fscache, 1); v9ses = v9fs_inode2v9ses(inode); @@ -274,7 +274,7 @@ void v9fs_cache_inode_reset_cookie(struct inode *inode) p9_debug(P9_DEBUG_FSC, "inode %p revalidating cookie old %p new %p\n", inode, old, v9inode->fscache); - spin_unlock(&v9inode->fscache_lock); + mutex_unlock(&v9inode->fscache_lock); } int __v9fs_fscache_release_page(struct page *page, gfp_t gfp) diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 6caca0250..072e75995 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -575,7 +575,7 @@ static int v9fs_init_inode_cache(void) v9fs_inode_cache = kmem_cache_create("v9fs_inode_cache", sizeof(struct v9fs_inode), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), v9fs_inode_init_once); if (!v9fs_inode_cache) return -ENOMEM; diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index 0923f2cf3..687705038 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -123,7 +123,7 @@ struct v9fs_session_info { struct v9fs_inode { #ifdef CONFIG_9P_FSCACHE - spinlock_t fscache_lock; + struct mutex fscache_lock; struct fscache_cookie *fscache; #endif struct p9_qid qid; diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 7bf835f85..eadc894fa 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -449,14 +449,14 @@ static int v9fs_file_fsync(struct file *filp, loff_t start, loff_t end, if (retval) return retval; - mutex_lock(&inode->i_mutex); + inode_lock(inode); p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync); fid = filp->private_data; v9fs_blank_wstat(&wstat); retval = p9_client_wstat(fid, &wstat); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return retval; } @@ -472,13 +472,13 @@ int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end, if (retval) return retval; - mutex_lock(&inode->i_mutex); + inode_lock(inode); p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync); fid = filp->private_data; retval = p9_client_fsync(fid, datasync); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return retval; } diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 511078586..3a08b3e6f 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -244,7 +244,7 @@ struct inode *v9fs_alloc_inode(struct super_block *sb) return NULL; #ifdef CONFIG_9P_FSCACHE v9inode->fscache = NULL; - spin_lock_init(&v9inode->fscache_lock); + mutex_init(&v9inode->fscache_lock); #endif v9inode->writeback_fid = NULL; v9inode->cache_validity = 0; @@ -1223,18 +1223,26 @@ ino_t v9fs_qid2ino(struct p9_qid *qid) } /** - * v9fs_vfs_follow_link - follow a symlink path + * v9fs_vfs_get_link - follow a symlink path * @dentry: dentry for symlink - * @cookie: place to pass the data to put_link() + * @inode: inode for symlink + * @done: delayed call for when we are done with the return value */ -static const char *v9fs_vfs_follow_link(struct dentry *dentry, void **cookie) +static const char *v9fs_vfs_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { - struct v9fs_session_info *v9ses = v9fs_dentry2v9ses(dentry); - struct p9_fid *fid = v9fs_fid_lookup(dentry); + struct v9fs_session_info *v9ses; + struct p9_fid *fid; struct p9_wstat *st; char *res; + if (!dentry) + return ERR_PTR(-ECHILD); + + v9ses = v9fs_dentry2v9ses(dentry); + fid = v9fs_fid_lookup(dentry); p9_debug(P9_DEBUG_VFS, "%pd\n", dentry); if (IS_ERR(fid)) @@ -1259,7 +1267,8 @@ static const char *v9fs_vfs_follow_link(struct dentry *dentry, void **cookie) p9stat_free(st); kfree(st); - return *cookie = res; + set_delayed_call(done, kfree_link, res); + return res; } /** @@ -1452,8 +1461,7 @@ static const struct inode_operations v9fs_file_inode_operations = { static const struct inode_operations v9fs_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = v9fs_vfs_follow_link, - .put_link = kfree_put_link, + .get_link = v9fs_vfs_get_link, .getattr = v9fs_vfs_getattr, .setattr = v9fs_vfs_setattr, }; diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index cb899af1b..a34702c99 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -899,26 +899,34 @@ error: } /** - * v9fs_vfs_follow_link_dotl - follow a symlink path + * v9fs_vfs_get_link_dotl - follow a symlink path * @dentry: dentry for symlink - * @cookie: place to pass the data to put_link() + * @inode: inode for symlink + * @done: destructor for return value */ static const char * -v9fs_vfs_follow_link_dotl(struct dentry *dentry, void **cookie) +v9fs_vfs_get_link_dotl(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { - struct p9_fid *fid = v9fs_fid_lookup(dentry); + struct p9_fid *fid; char *target; int retval; + if (!dentry) + return ERR_PTR(-ECHILD); + p9_debug(P9_DEBUG_VFS, "%pd\n", dentry); + fid = v9fs_fid_lookup(dentry); if (IS_ERR(fid)) return ERR_CAST(fid); retval = p9_client_readlink(fid, &target); if (retval) return ERR_PTR(retval); - return *cookie = target; + set_delayed_call(done, kfree_link, target); + return target; } int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) @@ -984,8 +992,7 @@ const struct inode_operations v9fs_file_inode_operations_dotl = { const struct inode_operations v9fs_symlink_inode_operations_dotl = { .readlink = generic_readlink, - .follow_link = v9fs_vfs_follow_link_dotl, - .put_link = kfree_put_link, + .get_link = v9fs_vfs_get_link_dotl, .getattr = v9fs_vfs_getattr_dotl, .setattr = v9fs_vfs_setattr_dotl, .setxattr = generic_setxattr, diff --git a/fs/9p/xattr.c b/fs/9p/xattr.c index e3d026ac3..9dd9b47a6 100644 --- a/fs/9p/xattr.c +++ b/fs/9p/xattr.c @@ -143,8 +143,6 @@ static int v9fs_xattr_handler_get(const struct xattr_handler *handler, { const char *full_name = xattr_full_name(handler, name); - if (strcmp(name, "") == 0) - return -EINVAL; return v9fs_xattr_get(dentry, full_name, buffer, size); } @@ -154,8 +152,6 @@ static int v9fs_xattr_handler_set(const struct xattr_handler *handler, { const char *full_name = xattr_full_name(handler, name); - if (strcmp(name, "") == 0) - return -EINVAL; return v9fs_xattr_set(dentry, full_name, value, size, flags); } diff --git a/fs/Kconfig b/fs/Kconfig index bec7b6beb..dc04844aa 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -50,7 +50,8 @@ config FS_DAX_PMD bool default FS_DAX depends on FS_DAX - depends on BROKEN + depends on ZONE_DEVICE + depends on TRANSPARENT_HUGEPAGE endif # BLOCK @@ -73,6 +74,16 @@ config FILE_LOCKING for filesystems like NFS and for the flock() system call. Disabling this option saves about 11k. +config MANDATORY_FILE_LOCKING + bool "Enable Mandatory file locking" + depends on FILE_LOCKING + default y + help + This option enables files appropriately marked files on appropriely + mounted filesystems to support mandatory locking. + + To the best of my knowledge this is dead code that no one cares about. + source "fs/notify/Kconfig" source "fs/quota/Kconfig" @@ -222,7 +233,6 @@ source "fs/pstore/Kconfig" source "fs/sysv/Kconfig" source "fs/ufs/Kconfig" source "fs/exofs/Kconfig" -source "fs/aufs/Kconfig" endif # MISC_FILESYSTEMS diff --git a/fs/Makefile b/fs/Makefile index 0c61756a2..b8da1c2c3 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -127,4 +127,3 @@ obj-y += exofs/ # Multiple modules obj-$(CONFIG_CEPH_FS) += ceph/ obj-$(CONFIG_PSTORE) += pstore/ obj-$(CONFIG_EFIVAR_FS) += efivarfs/ -obj-$(CONFIG_AUFS_FS) += aufs/ diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h index 24575d9d8..fadf408bd 100644 --- a/fs/adfs/adfs.h +++ b/fs/adfs/adfs.h @@ -44,24 +44,24 @@ struct adfs_dir_ops; */ struct adfs_sb_info { union { struct { - struct adfs_discmap *s_map; /* bh list containing map */ - struct adfs_dir_ops *s_dir; /* directory operations */ + struct adfs_discmap *s_map; /* bh list containing map */ + const struct adfs_dir_ops *s_dir; /* directory operations */ }; - struct rcu_head rcu; /* used only at shutdown time */ + struct rcu_head rcu; /* used only at shutdown time */ }; - kuid_t s_uid; /* owner uid */ - kgid_t s_gid; /* owner gid */ - umode_t s_owner_mask; /* ADFS owner perm -> unix perm */ - umode_t s_other_mask; /* ADFS other perm -> unix perm */ + kuid_t s_uid; /* owner uid */ + kgid_t s_gid; /* owner gid */ + umode_t s_owner_mask; /* ADFS owner perm -> unix perm */ + umode_t s_other_mask; /* ADFS other perm -> unix perm */ int s_ftsuffix; /* ,xyz hex filetype suffix option */ - __u32 s_ids_per_zone; /* max. no ids in one zone */ - __u32 s_idlen; /* length of ID in map */ - __u32 s_map_size; /* sector size of a map */ - unsigned long s_size; /* total size (in blocks) of this fs */ - signed int s_map2blk; /* shift left by this for map->sector */ - unsigned int s_log2sharesize;/* log2 share size */ - __le32 s_version; /* disc format version */ + __u32 s_ids_per_zone; /* max. no ids in one zone */ + __u32 s_idlen; /* length of ID in map */ + __u32 s_map_size; /* sector size of a map */ + unsigned long s_size; /* total size (in blocks) of this fs */ + signed int s_map2blk; /* shift left by this for map->sector*/ + unsigned int s_log2sharesize;/* log2 share size */ + __le32 s_version; /* disc format version */ unsigned int s_namelen; /* maximum number of characters in name */ }; @@ -168,8 +168,8 @@ void __adfs_error(struct super_block *sb, const char *function, extern const struct inode_operations adfs_dir_inode_operations; extern const struct file_operations adfs_dir_operations; extern const struct dentry_operations adfs_dentry_operations; -extern struct adfs_dir_ops adfs_f_dir_ops; -extern struct adfs_dir_ops adfs_fplus_dir_ops; +extern const struct adfs_dir_ops adfs_f_dir_ops; +extern const struct adfs_dir_ops adfs_fplus_dir_ops; extern int adfs_dir_update(struct super_block *sb, struct object_info *obj, int wait); diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index 51c279a29..fd4cf2c48 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c @@ -21,7 +21,7 @@ adfs_readdir(struct file *file, struct dir_context *ctx) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; - struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir; + const struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir; struct object_info obj; struct adfs_dir dir; int ret = 0; @@ -69,7 +69,7 @@ adfs_dir_update(struct super_block *sb, struct object_info *obj, int wait) { int ret = -EINVAL; #ifdef CONFIG_ADFS_FS_RW - struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir; + const struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir; struct adfs_dir dir; printk(KERN_INFO "adfs_dir_update: object %06X in dir %06X\n", @@ -129,7 +129,7 @@ static int adfs_dir_lookup_byname(struct inode *inode, struct qstr *name, struct object_info *obj) { struct super_block *sb = inode->i_sb; - struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir; + const struct adfs_dir_ops *ops = ADFS_SB(sb)->s_dir; struct adfs_dir dir; int ret; diff --git a/fs/adfs/dir_f.c b/fs/adfs/dir_f.c index 4bbe853ee..0fbfd0b04 100644 --- a/fs/adfs/dir_f.c +++ b/fs/adfs/dir_f.c @@ -476,7 +476,7 @@ adfs_f_free(struct adfs_dir *dir) dir->sb = NULL; } -struct adfs_dir_ops adfs_f_dir_ops = { +const struct adfs_dir_ops adfs_f_dir_ops = { .read = adfs_f_read, .setpos = adfs_f_setpos, .getnext = adfs_f_getnext, diff --git a/fs/adfs/dir_fplus.c b/fs/adfs/dir_fplus.c index 82d14cdf7..c92cfb638 100644 --- a/fs/adfs/dir_fplus.c +++ b/fs/adfs/dir_fplus.c @@ -256,7 +256,7 @@ adfs_fplus_free(struct adfs_dir *dir) dir->sb = NULL; } -struct adfs_dir_ops adfs_fplus_dir_ops = { +const struct adfs_dir_ops adfs_fplus_dir_ops = { .read = adfs_fplus_read, .setpos = adfs_fplus_setpos, .getnext = adfs_fplus_getnext, diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 4d4a0df83..c9fdfb112 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -271,7 +271,7 @@ static int __init init_inodecache(void) adfs_inode_cachep = kmem_cache_create("adfs_inode_cache", sizeof(struct adfs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (adfs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/affs/affs.h b/fs/affs/affs.h index c69a87eaf..cc2b2efc9 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -138,7 +138,7 @@ extern int affs_remove_hash(struct inode *dir, struct buffer_head *rem_bh); extern int affs_remove_header(struct dentry *dentry); extern u32 affs_checksum_block(struct super_block *sb, struct buffer_head *bh); extern void affs_fix_checksum(struct super_block *sb, struct buffer_head *bh); -extern void secs_to_datestamp(time_t secs, struct affs_date *ds); +extern void secs_to_datestamp(time64_t secs, struct affs_date *ds); extern umode_t prot_to_mode(u32 prot); extern void mode_to_prot(struct inode *inode); __printf(3, 4) diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index 5fa92bc79..d6c7a51c9 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -8,6 +8,7 @@ * Please send bug reports to: hjw@zvw.de */ +#include <linux/math64.h> #include "affs.h" /* @@ -366,22 +367,22 @@ affs_fix_checksum(struct super_block *sb, struct buffer_head *bh) } void -secs_to_datestamp(time_t secs, struct affs_date *ds) +secs_to_datestamp(time64_t secs, struct affs_date *ds) { u32 days; u32 minute; + s32 rem; secs -= sys_tz.tz_minuteswest * 60 + ((8 * 365 + 2) * 24 * 60 * 60); if (secs < 0) secs = 0; - days = secs / 86400; - secs -= days * 86400; - minute = secs / 60; - secs -= minute * 60; + days = div_s64_rem(secs, 86400, &rem); + minute = rem / 60; + rem -= minute * 60; ds->days = cpu_to_be32(days); ds->mins = cpu_to_be32(minute); - ds->ticks = cpu_to_be32(secs * 50); + ds->ticks = cpu_to_be32(rem * 50); } umode_t diff --git a/fs/affs/file.c b/fs/affs/file.c index 659c579c4..22fc7c802 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -33,11 +33,11 @@ affs_file_release(struct inode *inode, struct file *filp) inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt)); if (atomic_dec_and_test(&AFFS_I(inode)->i_opencnt)) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (inode->i_size != AFFS_I(inode)->mmu_private) affs_truncate(inode); affs_free_prealloc(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } return 0; @@ -511,8 +511,6 @@ affs_do_readpage_ofs(struct page *page, unsigned to) pr_debug("%s(%lu, %ld, 0, %d)\n", __func__, inode->i_ino, page->index, to); BUG_ON(to > PAGE_CACHE_SIZE); - kmap(page); - data = page_address(page); bsize = AFFS_SB(sb)->s_data_blksize; tmp = page->index << PAGE_CACHE_SHIFT; bidx = tmp / bsize; @@ -524,14 +522,15 @@ affs_do_readpage_ofs(struct page *page, unsigned to) return PTR_ERR(bh); tmp = min(bsize - boff, to - pos); BUG_ON(pos + tmp > to || tmp > bsize); + data = kmap_atomic(page); memcpy(data + pos, AFFS_DATA(bh) + boff, tmp); + kunmap_atomic(data); affs_brelse(bh); bidx++; pos += tmp; boff = 0; } flush_dcache_page(page); - kunmap(page); return 0; } @@ -958,12 +957,12 @@ int affs_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) if (err) return err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = write_inode_now(inode, 0); err = sync_blockdev(inode->i_sb->s_bdev); if (!ret) ret = err; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } const struct file_operations affs_file_operations = { diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 173495005..0fdb0f5b2 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -140,6 +140,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino) break; case ST_SOFTLINK: inode->i_mode |= S_IFLNK; + inode_nohighmem(inode); inode->i_op = &affs_symlink_inode_operations; inode->i_data.a_ops = &affs_symlink_aops; break; diff --git a/fs/affs/namei.c b/fs/affs/namei.c index 181e05b46..00d3002a6 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -344,6 +344,7 @@ affs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) return -ENOSPC; inode->i_op = &affs_symlink_inode_operations; + inode_nohighmem(inode); inode->i_data.a_ops = &affs_symlink_aops; inode->i_mode = S_IFLNK | 0777; mode_to_prot(inode); diff --git a/fs/affs/super.c b/fs/affs/super.c index 5b50c4ca4..2a6713b6b 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -32,7 +32,7 @@ affs_commit_super(struct super_block *sb, int wait) struct affs_root_tail *tail = AFFS_ROOT_TAIL(sb, bh); lock_buffer(bh); - secs_to_datestamp(get_seconds(), &tail->disk_change); + secs_to_datestamp(ktime_get_real_seconds(), &tail->disk_change); affs_fix_checksum(sb, bh); unlock_buffer(bh); @@ -132,7 +132,7 @@ static int __init init_inodecache(void) affs_inode_cachep = kmem_cache_create("affs_inode_cache", sizeof(struct affs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (affs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c index ea5b69a18..69b03dbb7 100644 --- a/fs/affs/symlink.c +++ b/fs/affs/symlink.c @@ -14,13 +14,13 @@ static int affs_symlink_readpage(struct file *file, struct page *page) { struct buffer_head *bh; struct inode *inode = page->mapping->host; - char *link = kmap(page); + char *link = page_address(page); struct slink_front *lf; int i, j; char c; char lc; - pr_debug("follow_link(ino=%lu)\n", inode->i_ino); + pr_debug("get_link(ino=%lu)\n", inode->i_ino); bh = affs_bread(inode->i_sb, inode->i_ino); if (!bh) @@ -57,12 +57,10 @@ static int affs_symlink_readpage(struct file *file, struct page *page) link[i] = '\0'; affs_brelse(bh); SetPageUptodate(page); - kunmap(page); unlock_page(page); return 0; fail: SetPageError(page); - kunmap(page); unlock_page(page); return -EIO; } @@ -73,7 +71,6 @@ const struct address_space_operations affs_symlink_aops = { const struct inode_operations affs_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, + .get_link = page_get_link, .setattr = affs_notify_change, }; diff --git a/fs/afs/flock.c b/fs/afs/flock.c index 4baf1d2b3..d91a9c9cf 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -483,7 +483,7 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl) fl->fl_type = F_UNLCK; - mutex_lock(&vnode->vfs_inode.i_mutex); + inode_lock(&vnode->vfs_inode); /* check local lock records first */ ret = 0; @@ -505,7 +505,7 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl) } error: - mutex_unlock(&vnode->vfs_inode.i_mutex); + inode_unlock(&vnode->vfs_inode); _leave(" = %d [%hd]", ret, fl->fl_type); return ret; } diff --git a/fs/afs/inode.c b/fs/afs/inode.c index e06f5a233..86cc7264c 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -56,6 +56,7 @@ static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key) case AFS_FTYPE_SYMLINK: inode->i_mode = S_IFLNK | vnode->status.mode; inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); break; default: printk("kAFS: AFS vnode with undefined type\n"); diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 24a905b07..2853b4095 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -230,14 +230,9 @@ static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf, if (size <= 1 || size >= PAGE_SIZE) return -EINVAL; - kbuf = kmalloc(size + 1, GFP_KERNEL); - if (!kbuf) - return -ENOMEM; - - ret = -EFAULT; - if (copy_from_user(kbuf, buf, size) != 0) - goto done; - kbuf[size] = 0; + kbuf = memdup_user_nul(buf, size); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); /* trim to first NL */ name = memchr(kbuf, '\n', size); @@ -315,15 +310,9 @@ static ssize_t afs_proc_rootcell_write(struct file *file, if (size <= 1 || size >= PAGE_SIZE) return -EINVAL; - ret = -ENOMEM; - kbuf = kmalloc(size + 1, GFP_KERNEL); - if (!kbuf) - goto nomem; - - ret = -EFAULT; - if (copy_from_user(kbuf, buf, size) != 0) - goto infault; - kbuf[size] = 0; + kbuf = memdup_user_nul(buf, size); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); /* trim to first NL */ s = memchr(kbuf, '\n', size); @@ -337,9 +326,7 @@ static ssize_t afs_proc_rootcell_write(struct file *file, if (ret >= 0) ret = size; /* consume everything, always */ -infault: kfree(kbuf); -nomem: _leave(" = %d", ret); return ret; } diff --git a/fs/afs/super.c b/fs/afs/super.c index 1fb4a5129..81afefe7d 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -91,7 +91,7 @@ int __init afs_fs_init(void) afs_inode_cachep = kmem_cache_create("afs_inode_cache", sizeof(struct afs_vnode), 0, - SLAB_HWCACHE_ALIGN, + SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, afs_i_init_once); if (!afs_inode_cachep) { printk(KERN_NOTICE "kAFS: Failed to allocate inode cache\n"); diff --git a/fs/afs/write.c b/fs/afs/write.c index 0714abcd7..dfef94f70 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -693,7 +693,7 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync) ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* use a writeback record as a marker in the queue - when this reaches * the front of the queue, all the outstanding writes are either @@ -735,7 +735,7 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync) afs_put_writeback(wb); _leave(" = %d", ret); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -195,7 +195,7 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de struct timespec now; unsigned int ia_valid = attr->ia_valid; - WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex)); + WARN_ON_ONCE(!inode_is_locked(inode)); if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)) { if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) diff --git a/fs/aufs/Kconfig b/fs/aufs/Kconfig deleted file mode 100644 index 63560ceda..000000000 --- a/fs/aufs/Kconfig +++ /dev/null @@ -1,185 +0,0 @@ -config AUFS_FS - tristate "Aufs (Advanced multi layered unification filesystem) support" - help - Aufs is a stackable unification filesystem such as Unionfs, - which unifies several directories and provides a merged single - directory. - In the early days, aufs was entirely re-designed and - re-implemented Unionfs Version 1.x series. Introducing many - original ideas, approaches and improvements, it becomes totally - different from Unionfs while keeping the basic features. - -if AUFS_FS -choice - prompt "Maximum number of branches" - default AUFS_BRANCH_MAX_127 - help - Specifies the maximum number of branches (or member directories) - in a single aufs. The larger value consumes more system - resources and has a minor impact to performance. -config AUFS_BRANCH_MAX_127 - bool "127" - help - Specifies the maximum number of branches (or member directories) - in a single aufs. The larger value consumes more system - resources and has a minor impact to performance. -config AUFS_BRANCH_MAX_511 - bool "511" - help - Specifies the maximum number of branches (or member directories) - in a single aufs. The larger value consumes more system - resources and has a minor impact to performance. -config AUFS_BRANCH_MAX_1023 - bool "1023" - help - Specifies the maximum number of branches (or member directories) - in a single aufs. The larger value consumes more system - resources and has a minor impact to performance. -config AUFS_BRANCH_MAX_32767 - bool "32767" - help - Specifies the maximum number of branches (or member directories) - in a single aufs. The larger value consumes more system - resources and has a minor impact to performance. -endchoice - -config AUFS_SBILIST - bool - depends on AUFS_MAGIC_SYSRQ || PROC_FS - default y - help - Automatic configuration for internal use. - When aufs supports Magic SysRq or /proc, enabled automatically. - -config AUFS_HNOTIFY - bool "Detect direct branch access (bypassing aufs)" - help - If you want to modify files on branches directly, eg. bypassing aufs, - and want aufs to detect the changes of them fully, then enable this - option and use 'udba=notify' mount option. - Currently there is only one available configuration, "fsnotify". - It will have a negative impact to the performance. - See detail in aufs.5. - -choice - prompt "method" if AUFS_HNOTIFY - default AUFS_HFSNOTIFY -config AUFS_HFSNOTIFY - bool "fsnotify" - select FSNOTIFY -endchoice - -config AUFS_EXPORT - bool "NFS-exportable aufs" - depends on EXPORTFS - help - If you want to export your mounted aufs via NFS, then enable this - option. There are several requirements for this configuration. - See detail in aufs.5. - -config AUFS_INO_T_64 - bool - depends on AUFS_EXPORT - depends on 64BIT && !(ALPHA || S390) - default y - help - Automatic configuration for internal use. - /* typedef unsigned long/int __kernel_ino_t */ - /* alpha and s390x are int */ - -config AUFS_XATTR - bool "support for XATTR/EA (including Security Labels)" - help - If your branch fs supports XATTR/EA and you want to make them - available in aufs too, then enable this opsion and specify the - branch attributes for EA. - See detail in aufs.5. - -config AUFS_FHSM - bool "File-based Hierarchical Storage Management" - help - Hierarchical Storage Management (or HSM) is a well-known feature - in the storage world. Aufs provides this feature as file-based. - with multiple branches. - These multiple branches are prioritized, ie. the topmost one - should be the fastest drive and be used heavily. - -config AUFS_RDU - bool "Readdir in userspace" - help - Aufs has two methods to provide a merged view for a directory, - by a user-space library and by kernel-space natively. The latter - is always enabled but sometimes large and slow. - If you enable this option, install the library in aufs2-util - package, and set some environment variables for your readdir(3), - then the work will be handled in user-space which generally - shows better performance in most cases. - See detail in aufs.5. - -config AUFS_SHWH - bool "Show whiteouts" - help - If you want to make the whiteouts in aufs visible, then enable - this option and specify 'shwh' mount option. Although it may - sounds like philosophy or something, but in technically it - simply shows the name of whiteout with keeping its behaviour. - -config AUFS_BR_RAMFS - bool "Ramfs (initramfs/rootfs) as an aufs branch" - help - If you want to use ramfs as an aufs branch fs, then enable this - option. Generally tmpfs is recommended. - Aufs prohibited them to be a branch fs by default, because - initramfs becomes unusable after switch_root or something - generally. If you sets initramfs as an aufs branch and boot your - system by switch_root, you will meet a problem easily since the - files in initramfs may be inaccessible. - Unless you are going to use ramfs as an aufs branch fs without - switch_root or something, leave it N. - -config AUFS_BR_FUSE - bool "Fuse fs as an aufs branch" - depends on FUSE_FS - select AUFS_POLL - help - If you want to use fuse-based userspace filesystem as an aufs - branch fs, then enable this option. - It implements the internal poll(2) operation which is - implemented by fuse only (curretnly). - -config AUFS_POLL - bool - help - Automatic configuration for internal use. - -config AUFS_BR_HFSPLUS - bool "Hfsplus as an aufs branch" - depends on HFSPLUS_FS - default y - help - If you want to use hfsplus fs as an aufs branch fs, then enable - this option. This option introduces a small overhead at - copying-up a file on hfsplus. - -config AUFS_BDEV_LOOP - bool - depends on BLK_DEV_LOOP - default y - help - Automatic configuration for internal use. - Convert =[ym] into =y. - -config AUFS_DEBUG - bool "Debug aufs" - help - Enable this to compile aufs internal debug code. - It will have a negative impact to the performance. - -config AUFS_MAGIC_SYSRQ - bool - depends on AUFS_DEBUG && MAGIC_SYSRQ - default y - help - Automatic configuration for internal use. - When aufs supports Magic SysRq, enabled automatically. -endif diff --git a/fs/aufs/Makefile b/fs/aufs/Makefile deleted file mode 100644 index c7efb62b5..000000000 --- a/fs/aufs/Makefile +++ /dev/null @@ -1,36 +0,0 @@ - -include ${srctree}/${src}/magic.mk - -# cf. include/linux/kernel.h -# enable pr_debug -ccflags-y += -DDEBUG -# sparse requires the full pathname -ccflags-y += -include ${srctree}/include/uapi/linux/aufs_type.h - -obj-$(CONFIG_AUFS_FS) += aufs.o -aufs-y := module.o sbinfo.o super.o branch.o xino.o sysaufs.o opts.o \ - wkq.o vfsub.o dcsub.o \ - cpup.o whout.o wbr_policy.o \ - dinfo.o dentry.o \ - dynop.o \ - finfo.o file.o f_op.o \ - dir.o vdir.o \ - iinfo.o inode.o i_op.o i_op_add.o i_op_del.o i_op_ren.o \ - mvdown.o ioctl.o - -# all are boolean -aufs-$(CONFIG_PROC_FS) += procfs.o plink.o -aufs-$(CONFIG_SYSFS) += sysfs.o -aufs-$(CONFIG_DEBUG_FS) += dbgaufs.o -aufs-$(CONFIG_AUFS_BDEV_LOOP) += loop.o -aufs-$(CONFIG_AUFS_HNOTIFY) += hnotify.o -aufs-$(CONFIG_AUFS_HFSNOTIFY) += hfsnotify.o -aufs-$(CONFIG_AUFS_EXPORT) += export.o -aufs-$(CONFIG_AUFS_XATTR) += xattr.o -aufs-$(CONFIG_FS_POSIX_ACL) += posix_acl.o -aufs-$(CONFIG_AUFS_FHSM) += fhsm.o -aufs-$(CONFIG_AUFS_POLL) += poll.o -aufs-$(CONFIG_AUFS_RDU) += rdu.o -aufs-$(CONFIG_AUFS_BR_HFSPLUS) += hfsplus.o -aufs-$(CONFIG_AUFS_DEBUG) += debug.o -aufs-$(CONFIG_AUFS_MAGIC_SYSRQ) += sysrq.o diff --git a/fs/aufs/aufs.h b/fs/aufs/aufs.h deleted file mode 100644 index 49f43b433..000000000 --- a/fs/aufs/aufs.h +++ /dev/null @@ -1,46 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * all header files - */ - -#ifndef __AUFS_H__ -#define __AUFS_H__ - -#ifdef __KERNEL__ - -#define AuStub(type, name, body, ...) \ - static inline type name(__VA_ARGS__) { body; } - -#define AuStubVoid(name, ...) \ - AuStub(void, name, , __VA_ARGS__) -#define AuStubInt0(name, ...) \ - AuStub(int, name, return 0, __VA_ARGS__) - -#include "debug.h" - -#include "branch.h" -#include "cpup.h" -#include "dcsub.h" -#include "dbgaufs.h" -#include "dentry.h" -#include "dir.h" -#include "dynop.h" -#include "file.h" -#include "fstype.h" -#include "inode.h" -#include "loop.h" -#include "module.h" -#include "opts.h" -#include "rwsem.h" -#include "spl.h" -#include "super.h" -#include "sysaufs.h" -#include "vfsub.h" -#include "whout.h" -#include "wkq.h" - -#endif /* __KERNEL__ */ -#endif /* __AUFS_H__ */ diff --git a/fs/aufs/branch.c b/fs/aufs/branch.c deleted file mode 100644 index 1ab5e1f2d..000000000 --- a/fs/aufs/branch.c +++ /dev/null @@ -1,1394 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * branch management - */ - -#include <linux/compat.h> -#include <linux/statfs.h> -#include "aufs.h" - -/* - * free a single branch - */ -static void au_br_do_free(struct au_branch *br) -{ - int i; - struct au_wbr *wbr; - struct au_dykey **key; - - au_hnotify_fin_br(br); - - if (br->br_xino.xi_file) - fput(br->br_xino.xi_file); - mutex_destroy(&br->br_xino.xi_nondir_mtx); - - AuDebugOn(atomic_read(&br->br_count)); - - wbr = br->br_wbr; - if (wbr) { - for (i = 0; i < AuBrWh_Last; i++) - dput(wbr->wbr_wh[i]); - AuDebugOn(atomic_read(&wbr->wbr_wh_running)); - AuRwDestroy(&wbr->wbr_wh_rwsem); - } - - if (br->br_fhsm) { - au_br_fhsm_fin(br->br_fhsm); - kfree(br->br_fhsm); - } - - key = br->br_dykey; - for (i = 0; i < AuBrDynOp; i++, key++) - if (*key) - au_dy_put(*key); - else - break; - - /* recursive lock, s_umount of branch's */ - lockdep_off(); - path_put(&br->br_path); - lockdep_on(); - kfree(wbr); - kfree(br); -} - -/* - * frees all branches - */ -void au_br_free(struct au_sbinfo *sbinfo) -{ - aufs_bindex_t bmax; - struct au_branch **br; - - AuRwMustWriteLock(&sbinfo->si_rwsem); - - bmax = sbinfo->si_bend + 1; - br = sbinfo->si_branch; - while (bmax--) - au_br_do_free(*br++); -} - -/* - * find the index of a branch which is specified by @br_id. - */ -int au_br_index(struct super_block *sb, aufs_bindex_t br_id) -{ - aufs_bindex_t bindex, bend; - - bend = au_sbend(sb); - for (bindex = 0; bindex <= bend; bindex++) - if (au_sbr_id(sb, bindex) == br_id) - return bindex; - return -1; -} - -/* ---------------------------------------------------------------------- */ - -/* - * add a branch - */ - -static int test_overlap(struct super_block *sb, struct dentry *h_adding, - struct dentry *h_root) -{ - if (unlikely(h_adding == h_root - || au_test_loopback_overlap(sb, h_adding))) - return 1; - if (h_adding->d_sb != h_root->d_sb) - return 0; - return au_test_subdir(h_adding, h_root) - || au_test_subdir(h_root, h_adding); -} - -/* - * returns a newly allocated branch. @new_nbranch is a number of branches - * after adding a branch. - */ -static struct au_branch *au_br_alloc(struct super_block *sb, int new_nbranch, - int perm) -{ - struct au_branch *add_branch; - struct dentry *root; - struct inode *inode; - int err; - - err = -ENOMEM; - root = sb->s_root; - add_branch = kzalloc(sizeof(*add_branch), GFP_NOFS); - if (unlikely(!add_branch)) - goto out; - - err = au_hnotify_init_br(add_branch, perm); - if (unlikely(err)) - goto out_br; - - if (au_br_writable(perm)) { - /* may be freed separately at changing the branch permission */ - add_branch->br_wbr = kzalloc(sizeof(*add_branch->br_wbr), - GFP_NOFS); - if (unlikely(!add_branch->br_wbr)) - goto out_hnotify; - } - - if (au_br_fhsm(perm)) { - err = au_fhsm_br_alloc(add_branch); - if (unlikely(err)) - goto out_wbr; - } - - err = au_sbr_realloc(au_sbi(sb), new_nbranch); - if (!err) - err = au_di_realloc(au_di(root), new_nbranch); - if (!err) { - inode = d_inode(root); - err = au_ii_realloc(au_ii(inode), new_nbranch); - } - if (!err) - return add_branch; /* success */ - -out_wbr: - kfree(add_branch->br_wbr); -out_hnotify: - au_hnotify_fin_br(add_branch); -out_br: - kfree(add_branch); -out: - return ERR_PTR(err); -} - -/* - * test if the branch permission is legal or not. - */ -static int test_br(struct inode *inode, int brperm, char *path) -{ - int err; - - err = (au_br_writable(brperm) && IS_RDONLY(inode)); - if (!err) - goto out; - - err = -EINVAL; - pr_err("write permission for readonly mount or inode, %s\n", path); - -out: - return err; -} - -/* - * returns: - * 0: success, the caller will add it - * plus: success, it is already unified, the caller should ignore it - * minus: error - */ -static int test_add(struct super_block *sb, struct au_opt_add *add, int remount) -{ - int err; - aufs_bindex_t bend, bindex; - struct dentry *root, *h_dentry; - struct inode *inode, *h_inode; - - root = sb->s_root; - bend = au_sbend(sb); - if (unlikely(bend >= 0 - && au_find_dbindex(root, add->path.dentry) >= 0)) { - err = 1; - if (!remount) { - err = -EINVAL; - pr_err("%s duplicated\n", add->pathname); - } - goto out; - } - - err = -ENOSPC; /* -E2BIG; */ - if (unlikely(AUFS_BRANCH_MAX <= add->bindex - || AUFS_BRANCH_MAX - 1 <= bend)) { - pr_err("number of branches exceeded %s\n", add->pathname); - goto out; - } - - err = -EDOM; - if (unlikely(add->bindex < 0 || bend + 1 < add->bindex)) { - pr_err("bad index %d\n", add->bindex); - goto out; - } - - inode = d_inode(add->path.dentry); - err = -ENOENT; - if (unlikely(!inode->i_nlink)) { - pr_err("no existence %s\n", add->pathname); - goto out; - } - - err = -EINVAL; - if (unlikely(inode->i_sb == sb)) { - pr_err("%s must be outside\n", add->pathname); - goto out; - } - - if (unlikely(au_test_fs_unsuppoted(inode->i_sb))) { - pr_err("unsupported filesystem, %s (%s)\n", - add->pathname, au_sbtype(inode->i_sb)); - goto out; - } - - if (unlikely(inode->i_sb->s_stack_depth)) { - pr_err("already stacked, %s (%s)\n", - add->pathname, au_sbtype(inode->i_sb)); - goto out; - } - - err = test_br(d_inode(add->path.dentry), add->perm, add->pathname); - if (unlikely(err)) - goto out; - - if (bend < 0) - return 0; /* success */ - - err = -EINVAL; - for (bindex = 0; bindex <= bend; bindex++) - if (unlikely(test_overlap(sb, add->path.dentry, - au_h_dptr(root, bindex)))) { - pr_err("%s is overlapped\n", add->pathname); - goto out; - } - - err = 0; - if (au_opt_test(au_mntflags(sb), WARN_PERM)) { - h_dentry = au_h_dptr(root, 0); - h_inode = d_inode(h_dentry); - if ((h_inode->i_mode & S_IALLUGO) != (inode->i_mode & S_IALLUGO) - || !uid_eq(h_inode->i_uid, inode->i_uid) - || !gid_eq(h_inode->i_gid, inode->i_gid)) - pr_warn("uid/gid/perm %s %u/%u/0%o, %u/%u/0%o\n", - add->pathname, - i_uid_read(inode), i_gid_read(inode), - (inode->i_mode & S_IALLUGO), - i_uid_read(h_inode), i_gid_read(h_inode), - (h_inode->i_mode & S_IALLUGO)); - } - -out: - return err; -} - -/* - * initialize or clean the whiteouts for an adding branch - */ -static int au_br_init_wh(struct super_block *sb, struct au_branch *br, - int new_perm) -{ - int err, old_perm; - aufs_bindex_t bindex; - struct mutex *h_mtx; - struct au_wbr *wbr; - struct au_hinode *hdir; - struct dentry *h_dentry; - - err = vfsub_mnt_want_write(au_br_mnt(br)); - if (unlikely(err)) - goto out; - - wbr = br->br_wbr; - old_perm = br->br_perm; - br->br_perm = new_perm; - hdir = NULL; - h_mtx = NULL; - bindex = au_br_index(sb, br->br_id); - if (0 <= bindex) { - hdir = au_hi(d_inode(sb->s_root), bindex); - au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT); - } else { - h_dentry = au_br_dentry(br); - h_mtx = &d_inode(h_dentry)->i_mutex; - mutex_lock_nested(h_mtx, AuLsc_I_PARENT); - } - if (!wbr) - err = au_wh_init(br, sb); - else { - wbr_wh_write_lock(wbr); - err = au_wh_init(br, sb); - wbr_wh_write_unlock(wbr); - } - if (hdir) - au_hn_imtx_unlock(hdir); - else - mutex_unlock(h_mtx); - vfsub_mnt_drop_write(au_br_mnt(br)); - br->br_perm = old_perm; - - if (!err && wbr && !au_br_writable(new_perm)) { - kfree(wbr); - br->br_wbr = NULL; - } - -out: - return err; -} - -static int au_wbr_init(struct au_branch *br, struct super_block *sb, - int perm) -{ - int err; - struct kstatfs kst; - struct au_wbr *wbr; - - wbr = br->br_wbr; - au_rw_init(&wbr->wbr_wh_rwsem); - atomic_set(&wbr->wbr_wh_running, 0); - - /* - * a limit for rmdir/rename a dir - * cf. AUFS_MAX_NAMELEN in include/uapi/linux/aufs_type.h - */ - err = vfs_statfs(&br->br_path, &kst); - if (unlikely(err)) - goto out; - err = -EINVAL; - if (kst.f_namelen >= NAME_MAX) - err = au_br_init_wh(sb, br, perm); - else - pr_err("%pd(%s), unsupported namelen %ld\n", - au_br_dentry(br), - au_sbtype(au_br_dentry(br)->d_sb), kst.f_namelen); - -out: - return err; -} - -/* initialize a new branch */ -static int au_br_init(struct au_branch *br, struct super_block *sb, - struct au_opt_add *add) -{ - int err; - struct inode *h_inode; - - err = 0; - mutex_init(&br->br_xino.xi_nondir_mtx); - br->br_perm = add->perm; - br->br_path = add->path; /* set first, path_get() later */ - spin_lock_init(&br->br_dykey_lock); - atomic_set(&br->br_count, 0); - atomic_set(&br->br_xino_running, 0); - br->br_id = au_new_br_id(sb); - AuDebugOn(br->br_id < 0); - - if (au_br_writable(add->perm)) { - err = au_wbr_init(br, sb, add->perm); - if (unlikely(err)) - goto out_err; - } - - if (au_opt_test(au_mntflags(sb), XINO)) { - h_inode = d_inode(add->path.dentry); - err = au_xino_br(sb, br, h_inode->i_ino, - au_sbr(sb, 0)->br_xino.xi_file, /*do_test*/1); - if (unlikely(err)) { - AuDebugOn(br->br_xino.xi_file); - goto out_err; - } - } - - sysaufs_br_init(br); - path_get(&br->br_path); - goto out; /* success */ - -out_err: - memset(&br->br_path, 0, sizeof(br->br_path)); -out: - return err; -} - -static void au_br_do_add_brp(struct au_sbinfo *sbinfo, aufs_bindex_t bindex, - struct au_branch *br, aufs_bindex_t bend, - aufs_bindex_t amount) -{ - struct au_branch **brp; - - AuRwMustWriteLock(&sbinfo->si_rwsem); - - brp = sbinfo->si_branch + bindex; - memmove(brp + 1, brp, sizeof(*brp) * amount); - *brp = br; - sbinfo->si_bend++; - if (unlikely(bend < 0)) - sbinfo->si_bend = 0; -} - -static void au_br_do_add_hdp(struct au_dinfo *dinfo, aufs_bindex_t bindex, - aufs_bindex_t bend, aufs_bindex_t amount) -{ - struct au_hdentry *hdp; - - AuRwMustWriteLock(&dinfo->di_rwsem); - - hdp = dinfo->di_hdentry + bindex; - memmove(hdp + 1, hdp, sizeof(*hdp) * amount); - au_h_dentry_init(hdp); - dinfo->di_bend++; - if (unlikely(bend < 0)) - dinfo->di_bstart = 0; -} - -static void au_br_do_add_hip(struct au_iinfo *iinfo, aufs_bindex_t bindex, - aufs_bindex_t bend, aufs_bindex_t amount) -{ - struct au_hinode *hip; - - AuRwMustWriteLock(&iinfo->ii_rwsem); - - hip = iinfo->ii_hinode + bindex; - memmove(hip + 1, hip, sizeof(*hip) * amount); - hip->hi_inode = NULL; - au_hn_init(hip); - iinfo->ii_bend++; - if (unlikely(bend < 0)) - iinfo->ii_bstart = 0; -} - -static void au_br_do_add(struct super_block *sb, struct au_branch *br, - aufs_bindex_t bindex) -{ - struct dentry *root, *h_dentry; - struct inode *root_inode, *h_inode; - aufs_bindex_t bend, amount; - - root = sb->s_root; - root_inode = d_inode(root); - bend = au_sbend(sb); - amount = bend + 1 - bindex; - h_dentry = au_br_dentry(br); - au_sbilist_lock(); - au_br_do_add_brp(au_sbi(sb), bindex, br, bend, amount); - au_br_do_add_hdp(au_di(root), bindex, bend, amount); - au_br_do_add_hip(au_ii(root_inode), bindex, bend, amount); - au_set_h_dptr(root, bindex, dget(h_dentry)); - h_inode = d_inode(h_dentry); - au_set_h_iptr(root_inode, bindex, au_igrab(h_inode), /*flags*/0); - au_sbilist_unlock(); -} - -int au_br_add(struct super_block *sb, struct au_opt_add *add, int remount) -{ - int err; - aufs_bindex_t bend, add_bindex; - struct dentry *root, *h_dentry; - struct inode *root_inode; - struct au_branch *add_branch; - - root = sb->s_root; - root_inode = d_inode(root); - IMustLock(root_inode); - err = test_add(sb, add, remount); - if (unlikely(err < 0)) - goto out; - if (err) { - err = 0; - goto out; /* success */ - } - - bend = au_sbend(sb); - add_branch = au_br_alloc(sb, bend + 2, add->perm); - err = PTR_ERR(add_branch); - if (IS_ERR(add_branch)) - goto out; - - err = au_br_init(add_branch, sb, add); - if (unlikely(err)) { - au_br_do_free(add_branch); - goto out; - } - - add_bindex = add->bindex; - if (!remount) - au_br_do_add(sb, add_branch, add_bindex); - else { - sysaufs_brs_del(sb, add_bindex); - au_br_do_add(sb, add_branch, add_bindex); - sysaufs_brs_add(sb, add_bindex); - } - - h_dentry = add->path.dentry; - if (!add_bindex) { - au_cpup_attr_all(root_inode, /*force*/1); - sb->s_maxbytes = h_dentry->d_sb->s_maxbytes; - } else - au_add_nlink(root_inode, d_inode(h_dentry)); - - /* - * this test/set prevents aufs from handling unnecesary notify events - * of xino files, in case of re-adding a writable branch which was - * once detached from aufs. - */ - if (au_xino_brid(sb) < 0 - && au_br_writable(add_branch->br_perm) - && !au_test_fs_bad_xino(h_dentry->d_sb) - && add_branch->br_xino.xi_file - && add_branch->br_xino.xi_file->f_path.dentry->d_parent == h_dentry) - au_xino_brid_set(sb, add_branch->br_id); - -out: - return err; -} - -/* ---------------------------------------------------------------------- */ - -static unsigned long long au_farray_cb(struct super_block *sb, void *a, - unsigned long long max __maybe_unused, - void *arg) -{ - unsigned long long n; - struct file **p, *f; - struct au_sphlhead *files; - struct au_finfo *finfo; - - n = 0; - p = a; - files = &au_sbi(sb)->si_files; - spin_lock(&files->spin); - hlist_for_each_entry(finfo, &files->head, fi_hlist) { - f = finfo->fi_file; - if (file_count(f) - && !special_file(file_inode(f)->i_mode)) { - get_file(f); - *p++ = f; - n++; - AuDebugOn(n > max); - } - } - spin_unlock(&files->spin); - - return n; -} - -static struct file **au_farray_alloc(struct super_block *sb, - unsigned long long *max) -{ - *max = atomic_long_read(&au_sbi(sb)->si_nfiles); - return au_array_alloc(max, au_farray_cb, sb, /*arg*/NULL); -} - -static void au_farray_free(struct file **a, unsigned long long max) -{ - unsigned long long ull; - - for (ull = 0; ull < max; ull++) - if (a[ull]) - fput(a[ull]); - kvfree(a); -} - -/* ---------------------------------------------------------------------- */ - -/* - * delete a branch - */ - -/* to show the line number, do not make it inlined function */ -#define AuVerbose(do_info, fmt, ...) do { \ - if (do_info) \ - pr_info(fmt, ##__VA_ARGS__); \ -} while (0) - -static int au_test_ibusy(struct inode *inode, aufs_bindex_t bstart, - aufs_bindex_t bend) -{ - return (inode && !S_ISDIR(inode->i_mode)) || bstart == bend; -} - -static int au_test_dbusy(struct dentry *dentry, aufs_bindex_t bstart, - aufs_bindex_t bend) -{ - return au_test_ibusy(d_inode(dentry), bstart, bend); -} - -/* - * test if the branch is deletable or not. - */ -static int test_dentry_busy(struct dentry *root, aufs_bindex_t bindex, - unsigned int sigen, const unsigned int verbose) -{ - int err, i, j, ndentry; - aufs_bindex_t bstart, bend; - struct au_dcsub_pages dpages; - struct au_dpage *dpage; - struct dentry *d; - - err = au_dpages_init(&dpages, GFP_NOFS); - if (unlikely(err)) - goto out; - err = au_dcsub_pages(&dpages, root, NULL, NULL); - if (unlikely(err)) - goto out_dpages; - - for (i = 0; !err && i < dpages.ndpage; i++) { - dpage = dpages.dpages + i; - ndentry = dpage->ndentry; - for (j = 0; !err && j < ndentry; j++) { - d = dpage->dentries[j]; - AuDebugOn(au_dcount(d) <= 0); - if (!au_digen_test(d, sigen)) { - di_read_lock_child(d, AuLock_IR); - if (unlikely(au_dbrange_test(d))) { - di_read_unlock(d, AuLock_IR); - continue; - } - } else { - di_write_lock_child(d); - if (unlikely(au_dbrange_test(d))) { - di_write_unlock(d); - continue; - } - err = au_reval_dpath(d, sigen); - if (!err) - di_downgrade_lock(d, AuLock_IR); - else { - di_write_unlock(d); - break; - } - } - - /* AuDbgDentry(d); */ - bstart = au_dbstart(d); - bend = au_dbend(d); - if (bstart <= bindex - && bindex <= bend - && au_h_dptr(d, bindex) - && au_test_dbusy(d, bstart, bend)) { - err = -EBUSY; - AuVerbose(verbose, "busy %pd\n", d); - AuDbgDentry(d); - } - di_read_unlock(d, AuLock_IR); - } - } - -out_dpages: - au_dpages_free(&dpages); -out: - return err; -} - -static int test_inode_busy(struct super_block *sb, aufs_bindex_t bindex, - unsigned int sigen, const unsigned int verbose) -{ - int err; - unsigned long long max, ull; - struct inode *i, **array; - aufs_bindex_t bstart, bend; - - array = au_iarray_alloc(sb, &max); - err = PTR_ERR(array); - if (IS_ERR(array)) - goto out; - - err = 0; - AuDbg("b%d\n", bindex); - for (ull = 0; !err && ull < max; ull++) { - i = array[ull]; - if (unlikely(!i)) - break; - if (i->i_ino == AUFS_ROOT_INO) - continue; - - /* AuDbgInode(i); */ - if (au_iigen(i, NULL) == sigen) - ii_read_lock_child(i); - else { - ii_write_lock_child(i); - err = au_refresh_hinode_self(i); - au_iigen_dec(i); - if (!err) - ii_downgrade_lock(i); - else { - ii_write_unlock(i); - break; - } - } - - bstart = au_ibstart(i); - bend = au_ibend(i); - if (bstart <= bindex - && bindex <= bend - && au_h_iptr(i, bindex) - && au_test_ibusy(i, bstart, bend)) { - err = -EBUSY; - AuVerbose(verbose, "busy i%lu\n", i->i_ino); - AuDbgInode(i); - } - ii_read_unlock(i); - } - au_iarray_free(array, max); - -out: - return err; -} - -static int test_children_busy(struct dentry *root, aufs_bindex_t bindex, - const unsigned int verbose) -{ - int err; - unsigned int sigen; - - sigen = au_sigen(root->d_sb); - DiMustNoWaiters(root); - IiMustNoWaiters(d_inode(root)); - di_write_unlock(root); - err = test_dentry_busy(root, bindex, sigen, verbose); - if (!err) - err = test_inode_busy(root->d_sb, bindex, sigen, verbose); - di_write_lock_child(root); /* aufs_write_lock() calls ..._child() */ - - return err; -} - -static int test_dir_busy(struct file *file, aufs_bindex_t br_id, - struct file **to_free, int *idx) -{ - int err; - unsigned char matched, root; - aufs_bindex_t bindex, bend; - struct au_fidir *fidir; - struct au_hfile *hfile; - - err = 0; - root = IS_ROOT(file->f_path.dentry); - if (root) { - get_file(file); - to_free[*idx] = file; - (*idx)++; - goto out; - } - - matched = 0; - fidir = au_fi(file)->fi_hdir; - AuDebugOn(!fidir); - bend = au_fbend_dir(file); - for (bindex = au_fbstart(file); bindex <= bend; bindex++) { - hfile = fidir->fd_hfile + bindex; - if (!hfile->hf_file) - continue; - - if (hfile->hf_br->br_id == br_id) { - matched = 1; - break; - } - } - if (matched) - err = -EBUSY; - -out: - return err; -} - -static int test_file_busy(struct super_block *sb, aufs_bindex_t br_id, - struct file **to_free, int opened) -{ - int err, idx; - unsigned long long ull, max; - aufs_bindex_t bstart; - struct file *file, **array; - struct dentry *root; - struct au_hfile *hfile; - - array = au_farray_alloc(sb, &max); - err = PTR_ERR(array); - if (IS_ERR(array)) - goto out; - - err = 0; - idx = 0; - root = sb->s_root; - di_write_unlock(root); - for (ull = 0; ull < max; ull++) { - file = array[ull]; - if (unlikely(!file)) - break; - - /* AuDbg("%pD\n", file); */ - fi_read_lock(file); - bstart = au_fbstart(file); - if (!d_is_dir(file->f_path.dentry)) { - hfile = &au_fi(file)->fi_htop; - if (hfile->hf_br->br_id == br_id) - err = -EBUSY; - } else - err = test_dir_busy(file, br_id, to_free, &idx); - fi_read_unlock(file); - if (unlikely(err)) - break; - } - di_write_lock_child(root); - au_farray_free(array, max); - AuDebugOn(idx > opened); - -out: - return err; -} - -static void br_del_file(struct file **to_free, unsigned long long opened, - aufs_bindex_t br_id) -{ - unsigned long long ull; - aufs_bindex_t bindex, bstart, bend, bfound; - struct file *file; - struct au_fidir *fidir; - struct au_hfile *hfile; - - for (ull = 0; ull < opened; ull++) { - file = to_free[ull]; - if (unlikely(!file)) - break; - - /* AuDbg("%pD\n", file); */ - AuDebugOn(!d_is_dir(file->f_path.dentry)); - bfound = -1; - fidir = au_fi(file)->fi_hdir; - AuDebugOn(!fidir); - fi_write_lock(file); - bstart = au_fbstart(file); - bend = au_fbend_dir(file); - for (bindex = bstart; bindex <= bend; bindex++) { - hfile = fidir->fd_hfile + bindex; - if (!hfile->hf_file) - continue; - - if (hfile->hf_br->br_id == br_id) { - bfound = bindex; - break; - } - } - AuDebugOn(bfound < 0); - au_set_h_fptr(file, bfound, NULL); - if (bfound == bstart) { - for (bstart++; bstart <= bend; bstart++) - if (au_hf_dir(file, bstart)) { - au_set_fbstart(file, bstart); - break; - } - } - fi_write_unlock(file); - } -} - -static void au_br_do_del_brp(struct au_sbinfo *sbinfo, - const aufs_bindex_t bindex, - const aufs_bindex_t bend) -{ - struct au_branch **brp, **p; - - AuRwMustWriteLock(&sbinfo->si_rwsem); - - brp = sbinfo->si_branch + bindex; - if (bindex < bend) - memmove(brp, brp + 1, sizeof(*brp) * (bend - bindex)); - sbinfo->si_branch[0 + bend] = NULL; - sbinfo->si_bend--; - - p = krealloc(sbinfo->si_branch, sizeof(*p) * bend, AuGFP_SBILIST); - if (p) - sbinfo->si_branch = p; - /* harmless error */ -} - -static void au_br_do_del_hdp(struct au_dinfo *dinfo, const aufs_bindex_t bindex, - const aufs_bindex_t bend) -{ - struct au_hdentry *hdp, *p; - - AuRwMustWriteLock(&dinfo->di_rwsem); - - hdp = dinfo->di_hdentry; - if (bindex < bend) - memmove(hdp + bindex, hdp + bindex + 1, - sizeof(*hdp) * (bend - bindex)); - hdp[0 + bend].hd_dentry = NULL; - dinfo->di_bend--; - - p = krealloc(hdp, sizeof(*p) * bend, AuGFP_SBILIST); - if (p) - dinfo->di_hdentry = p; - /* harmless error */ -} - -static void au_br_do_del_hip(struct au_iinfo *iinfo, const aufs_bindex_t bindex, - const aufs_bindex_t bend) -{ - struct au_hinode *hip, *p; - - AuRwMustWriteLock(&iinfo->ii_rwsem); - - hip = iinfo->ii_hinode + bindex; - if (bindex < bend) - memmove(hip, hip + 1, sizeof(*hip) * (bend - bindex)); - iinfo->ii_hinode[0 + bend].hi_inode = NULL; - au_hn_init(iinfo->ii_hinode + bend); - iinfo->ii_bend--; - - p = krealloc(iinfo->ii_hinode, sizeof(*p) * bend, AuGFP_SBILIST); - if (p) - iinfo->ii_hinode = p; - /* harmless error */ -} - -static void au_br_do_del(struct super_block *sb, aufs_bindex_t bindex, - struct au_branch *br) -{ - aufs_bindex_t bend; - struct au_sbinfo *sbinfo; - struct dentry *root, *h_root; - struct inode *inode, *h_inode; - struct au_hinode *hinode; - - SiMustWriteLock(sb); - - root = sb->s_root; - inode = d_inode(root); - sbinfo = au_sbi(sb); - bend = sbinfo->si_bend; - - h_root = au_h_dptr(root, bindex); - hinode = au_hi(inode, bindex); - h_inode = au_igrab(hinode->hi_inode); - au_hiput(hinode); - - au_sbilist_lock(); - au_br_do_del_brp(sbinfo, bindex, bend); - au_br_do_del_hdp(au_di(root), bindex, bend); - au_br_do_del_hip(au_ii(inode), bindex, bend); - au_sbilist_unlock(); - - dput(h_root); - iput(h_inode); - au_br_do_free(br); -} - -static unsigned long long empty_cb(struct super_block *sb, void *array, - unsigned long long max, void *arg) -{ - return max; -} - -int au_br_del(struct super_block *sb, struct au_opt_del *del, int remount) -{ - int err, rerr, i; - unsigned long long opened; - unsigned int mnt_flags; - aufs_bindex_t bindex, bend, br_id; - unsigned char do_wh, verbose; - struct au_branch *br; - struct au_wbr *wbr; - struct dentry *root; - struct file **to_free; - - err = 0; - opened = 0; - to_free = NULL; - root = sb->s_root; - bindex = au_find_dbindex(root, del->h_path.dentry); - if (bindex < 0) { - if (remount) - goto out; /* success */ - err = -ENOENT; - pr_err("%s no such branch\n", del->pathname); - goto out; - } - AuDbg("bindex b%d\n", bindex); - - err = -EBUSY; - mnt_flags = au_mntflags(sb); - verbose = !!au_opt_test(mnt_flags, VERBOSE); - bend = au_sbend(sb); - if (unlikely(!bend)) { - AuVerbose(verbose, "no more branches left\n"); - goto out; - } - br = au_sbr(sb, bindex); - AuDebugOn(!path_equal(&br->br_path, &del->h_path)); - - br_id = br->br_id; - opened = atomic_read(&br->br_count); - if (unlikely(opened)) { - to_free = au_array_alloc(&opened, empty_cb, sb, NULL); - err = PTR_ERR(to_free); - if (IS_ERR(to_free)) - goto out; - - err = test_file_busy(sb, br_id, to_free, opened); - if (unlikely(err)) { - AuVerbose(verbose, "%llu file(s) opened\n", opened); - goto out; - } - } - - wbr = br->br_wbr; - do_wh = wbr && (wbr->wbr_whbase || wbr->wbr_plink || wbr->wbr_orph); - if (do_wh) { - /* instead of WbrWhMustWriteLock(wbr) */ - SiMustWriteLock(sb); - for (i = 0; i < AuBrWh_Last; i++) { - dput(wbr->wbr_wh[i]); - wbr->wbr_wh[i] = NULL; - } - } - - err = test_children_busy(root, bindex, verbose); - if (unlikely(err)) { - if (do_wh) - goto out_wh; - goto out; - } - - err = 0; - if (to_free) { - /* - * now we confirmed the branch is deletable. - * let's free the remaining opened dirs on the branch. - */ - di_write_unlock(root); - br_del_file(to_free, opened, br_id); - di_write_lock_child(root); - } - - if (!remount) - au_br_do_del(sb, bindex, br); - else { - sysaufs_brs_del(sb, bindex); - au_br_do_del(sb, bindex, br); - sysaufs_brs_add(sb, bindex); - } - - if (!bindex) { - au_cpup_attr_all(d_inode(root), /*force*/1); - sb->s_maxbytes = au_sbr_sb(sb, 0)->s_maxbytes; - } else - au_sub_nlink(d_inode(root), d_inode(del->h_path.dentry)); - if (au_opt_test(mnt_flags, PLINK)) - au_plink_half_refresh(sb, br_id); - - if (au_xino_brid(sb) == br_id) - au_xino_brid_set(sb, -1); - goto out; /* success */ - -out_wh: - /* revert */ - rerr = au_br_init_wh(sb, br, br->br_perm); - if (rerr) - pr_warn("failed re-creating base whiteout, %s. (%d)\n", - del->pathname, rerr); -out: - if (to_free) - au_farray_free(to_free, opened); - return err; -} - -/* ---------------------------------------------------------------------- */ - -static int au_ibusy(struct super_block *sb, struct aufs_ibusy __user *arg) -{ - int err; - aufs_bindex_t bstart, bend; - struct aufs_ibusy ibusy; - struct inode *inode, *h_inode; - - err = -EPERM; - if (unlikely(!capable(CAP_SYS_ADMIN))) - goto out; - - err = copy_from_user(&ibusy, arg, sizeof(ibusy)); - if (!err) - err = !access_ok(VERIFY_WRITE, &arg->h_ino, sizeof(arg->h_ino)); - if (unlikely(err)) { - err = -EFAULT; - AuTraceErr(err); - goto out; - } - - err = -EINVAL; - si_read_lock(sb, AuLock_FLUSH); - if (unlikely(ibusy.bindex < 0 || ibusy.bindex > au_sbend(sb))) - goto out_unlock; - - err = 0; - ibusy.h_ino = 0; /* invalid */ - inode = ilookup(sb, ibusy.ino); - if (!inode - || inode->i_ino == AUFS_ROOT_INO - || is_bad_inode(inode)) - goto out_unlock; - - ii_read_lock_child(inode); - bstart = au_ibstart(inode); - bend = au_ibend(inode); - if (bstart <= ibusy.bindex && ibusy.bindex <= bend) { - h_inode = au_h_iptr(inode, ibusy.bindex); - if (h_inode && au_test_ibusy(inode, bstart, bend)) - ibusy.h_ino = h_inode->i_ino; - } - ii_read_unlock(inode); - iput(inode); - -out_unlock: - si_read_unlock(sb); - if (!err) { - err = __put_user(ibusy.h_ino, &arg->h_ino); - if (unlikely(err)) { - err = -EFAULT; - AuTraceErr(err); - } - } -out: - return err; -} - -long au_ibusy_ioctl(struct file *file, unsigned long arg) -{ - return au_ibusy(file->f_path.dentry->d_sb, (void __user *)arg); -} - -#ifdef CONFIG_COMPAT -long au_ibusy_compat_ioctl(struct file *file, unsigned long arg) -{ - return au_ibusy(file->f_path.dentry->d_sb, compat_ptr(arg)); -} -#endif - -/* ---------------------------------------------------------------------- */ - -/* - * change a branch permission - */ - -static void au_warn_ima(void) -{ -#ifdef CONFIG_IMA - /* since it doesn't support mark_files_ro() */ - AuWarn1("RW -> RO makes IMA to produce wrong message\n"); -#endif -} - -static int do_need_sigen_inc(int a, int b) -{ - return au_br_whable(a) && !au_br_whable(b); -} - -static int need_sigen_inc(int old, int new) -{ - return do_need_sigen_inc(old, new) - || do_need_sigen_inc(new, old); -} - -static int au_br_mod_files_ro(struct super_block *sb, aufs_bindex_t bindex) -{ - int err, do_warn; - unsigned int mnt_flags; - unsigned long long ull, max; - aufs_bindex_t br_id; - unsigned char verbose, writer; - struct file *file, *hf, **array; - struct au_hfile *hfile; - - mnt_flags = au_mntflags(sb); - verbose = !!au_opt_test(mnt_flags, VERBOSE); - - array = au_farray_alloc(sb, &max); - err = PTR_ERR(array); - if (IS_ERR(array)) - goto out; - - do_warn = 0; - br_id = au_sbr_id(sb, bindex); - for (ull = 0; ull < max; ull++) { - file = array[ull]; - if (unlikely(!file)) - break; - - /* AuDbg("%pD\n", file); */ - fi_read_lock(file); - if (unlikely(au_test_mmapped(file))) { - err = -EBUSY; - AuVerbose(verbose, "mmapped %pD\n", file); - AuDbgFile(file); - FiMustNoWaiters(file); - fi_read_unlock(file); - goto out_array; - } - - hfile = &au_fi(file)->fi_htop; - hf = hfile->hf_file; - if (!d_is_reg(file->f_path.dentry) - || !(file->f_mode & FMODE_WRITE) - || hfile->hf_br->br_id != br_id - || !(hf->f_mode & FMODE_WRITE)) - array[ull] = NULL; - else { - do_warn = 1; - get_file(file); - } - - FiMustNoWaiters(file); - fi_read_unlock(file); - fput(file); - } - - err = 0; - if (do_warn) - au_warn_ima(); - - for (ull = 0; ull < max; ull++) { - file = array[ull]; - if (!file) - continue; - - /* todo: already flushed? */ - /* - * fs/super.c:mark_files_ro() is gone, but aufs keeps its - * approach which resets f_mode and calls mnt_drop_write() and - * file_release_write() for each file, because the branch - * attribute in aufs world is totally different from the native - * fs rw/ro mode. - */ - /* fi_read_lock(file); */ - hfile = &au_fi(file)->fi_htop; - hf = hfile->hf_file; - /* fi_read_unlock(file); */ - spin_lock(&hf->f_lock); - writer = !!(hf->f_mode & FMODE_WRITER); - hf->f_mode &= ~(FMODE_WRITE | FMODE_WRITER); - spin_unlock(&hf->f_lock); - if (writer) { - put_write_access(file_inode(hf)); - __mnt_drop_write(hf->f_path.mnt); - } - } - -out_array: - au_farray_free(array, max); -out: - AuTraceErr(err); - return err; -} - -int au_br_mod(struct super_block *sb, struct au_opt_mod *mod, int remount, - int *do_refresh) -{ - int err, rerr; - aufs_bindex_t bindex; - struct dentry *root; - struct au_branch *br; - struct au_br_fhsm *bf; - - root = sb->s_root; - bindex = au_find_dbindex(root, mod->h_root); - if (bindex < 0) { - if (remount) - return 0; /* success */ - err = -ENOENT; - pr_err("%s no such branch\n", mod->path); - goto out; - } - AuDbg("bindex b%d\n", bindex); - - err = test_br(d_inode(mod->h_root), mod->perm, mod->path); - if (unlikely(err)) - goto out; - - br = au_sbr(sb, bindex); - AuDebugOn(mod->h_root != au_br_dentry(br)); - if (br->br_perm == mod->perm) - return 0; /* success */ - - /* pre-allocate for non-fhsm --> fhsm */ - bf = NULL; - if (!au_br_fhsm(br->br_perm) && au_br_fhsm(mod->perm)) { - err = au_fhsm_br_alloc(br); - if (unlikely(err)) - goto out; - bf = br->br_fhsm; - br->br_fhsm = NULL; - } - - if (au_br_writable(br->br_perm)) { - /* remove whiteout base */ - err = au_br_init_wh(sb, br, mod->perm); - if (unlikely(err)) - goto out_bf; - - if (!au_br_writable(mod->perm)) { - /* rw --> ro, file might be mmapped */ - DiMustNoWaiters(root); - IiMustNoWaiters(d_inode(root)); - di_write_unlock(root); - err = au_br_mod_files_ro(sb, bindex); - /* aufs_write_lock() calls ..._child() */ - di_write_lock_child(root); - - if (unlikely(err)) { - rerr = -ENOMEM; - br->br_wbr = kzalloc(sizeof(*br->br_wbr), - GFP_NOFS); - if (br->br_wbr) - rerr = au_wbr_init(br, sb, br->br_perm); - if (unlikely(rerr)) { - AuIOErr("nested error %d (%d)\n", - rerr, err); - br->br_perm = mod->perm; - } - } - } - } else if (au_br_writable(mod->perm)) { - /* ro --> rw */ - err = -ENOMEM; - br->br_wbr = kzalloc(sizeof(*br->br_wbr), GFP_NOFS); - if (br->br_wbr) { - err = au_wbr_init(br, sb, mod->perm); - if (unlikely(err)) { - kfree(br->br_wbr); - br->br_wbr = NULL; - } - } - } - if (unlikely(err)) - goto out_bf; - - if (au_br_fhsm(br->br_perm)) { - if (!au_br_fhsm(mod->perm)) { - /* fhsm --> non-fhsm */ - au_br_fhsm_fin(br->br_fhsm); - kfree(br->br_fhsm); - br->br_fhsm = NULL; - } - } else if (au_br_fhsm(mod->perm)) - /* non-fhsm --> fhsm */ - br->br_fhsm = bf; - - *do_refresh |= need_sigen_inc(br->br_perm, mod->perm); - br->br_perm = mod->perm; - goto out; /* success */ - -out_bf: - kfree(bf); -out: - AuTraceErr(err); - return err; -} - -/* ---------------------------------------------------------------------- */ - -int au_br_stfs(struct au_branch *br, struct aufs_stfs *stfs) -{ - int err; - struct kstatfs kstfs; - - err = vfs_statfs(&br->br_path, &kstfs); - if (!err) { - stfs->f_blocks = kstfs.f_blocks; - stfs->f_bavail = kstfs.f_bavail; - stfs->f_files = kstfs.f_files; - stfs->f_ffree = kstfs.f_ffree; - } - - return err; -} diff --git a/fs/aufs/branch.h b/fs/aufs/branch.h deleted file mode 100644 index 4c52ae166..000000000 --- a/fs/aufs/branch.h +++ /dev/null @@ -1,266 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * branch filesystems and xino for them - */ - -#ifndef __AUFS_BRANCH_H__ -#define __AUFS_BRANCH_H__ - -#ifdef __KERNEL__ - -#include <linux/mount.h> -#include "dynop.h" -#include "rwsem.h" -#include "super.h" - -/* ---------------------------------------------------------------------- */ - -/* a xino file */ -struct au_xino_file { - struct file *xi_file; - struct mutex xi_nondir_mtx; - - /* todo: make xino files an array to support huge inode number */ - -#ifdef CONFIG_DEBUG_FS - struct dentry *xi_dbgaufs; -#endif -}; - -/* File-based Hierarchical Storage Management */ -struct au_br_fhsm { -#ifdef CONFIG_AUFS_FHSM - struct mutex bf_lock; - unsigned long bf_jiffy; - struct aufs_stfs bf_stfs; - int bf_readable; -#endif -}; - -/* members for writable branch only */ -enum {AuBrWh_BASE, AuBrWh_PLINK, AuBrWh_ORPH, AuBrWh_Last}; -struct au_wbr { - struct au_rwsem wbr_wh_rwsem; - struct dentry *wbr_wh[AuBrWh_Last]; - atomic_t wbr_wh_running; -#define wbr_whbase wbr_wh[AuBrWh_BASE] /* whiteout base */ -#define wbr_plink wbr_wh[AuBrWh_PLINK] /* pseudo-link dir */ -#define wbr_orph wbr_wh[AuBrWh_ORPH] /* dir for orphans */ - - /* mfs mode */ - unsigned long long wbr_bytes; -}; - -/* ext2 has 3 types of operations at least, ext3 has 4 */ -#define AuBrDynOp (AuDyLast * 4) - -#ifdef CONFIG_AUFS_HFSNOTIFY -/* support for asynchronous destruction */ -struct au_br_hfsnotify { - struct fsnotify_group *hfsn_group; -}; -#endif - -/* sysfs entries */ -struct au_brsysfs { - char name[16]; - struct attribute attr; -}; - -enum { - AuBrSysfs_BR, - AuBrSysfs_BRID, - AuBrSysfs_Last -}; - -/* protected by superblock rwsem */ -struct au_branch { - struct au_xino_file br_xino; - - aufs_bindex_t br_id; - - int br_perm; - struct path br_path; - spinlock_t br_dykey_lock; - struct au_dykey *br_dykey[AuBrDynOp]; - atomic_t br_count; - - struct au_wbr *br_wbr; - struct au_br_fhsm *br_fhsm; - - /* xino truncation */ - atomic_t br_xino_running; - -#ifdef CONFIG_AUFS_HFSNOTIFY - struct au_br_hfsnotify *br_hfsn; -#endif - -#ifdef CONFIG_SYSFS - /* entries under sysfs per mount-point */ - struct au_brsysfs br_sysfs[AuBrSysfs_Last]; -#endif -}; - -/* ---------------------------------------------------------------------- */ - -static inline struct vfsmount *au_br_mnt(struct au_branch *br) -{ - return br->br_path.mnt; -} - -static inline struct dentry *au_br_dentry(struct au_branch *br) -{ - return br->br_path.dentry; -} - -static inline struct super_block *au_br_sb(struct au_branch *br) -{ - return au_br_mnt(br)->mnt_sb; -} - -static inline int au_br_rdonly(struct au_branch *br) -{ - return ((au_br_sb(br)->s_flags & MS_RDONLY) - || !au_br_writable(br->br_perm)) - ? -EROFS : 0; -} - -static inline int au_br_hnotifyable(int brperm __maybe_unused) -{ -#ifdef CONFIG_AUFS_HNOTIFY - return !(brperm & AuBrPerm_RR); -#else - return 0; -#endif -} - -static inline int au_br_test_oflag(int oflag, struct au_branch *br) -{ - int err, exec_flag; - - err = 0; - exec_flag = oflag & __FMODE_EXEC; - if (unlikely(exec_flag && (au_br_mnt(br)->mnt_flags & MNT_NOEXEC))) - err = -EACCES; - - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* branch.c */ -struct au_sbinfo; -void au_br_free(struct au_sbinfo *sinfo); -int au_br_index(struct super_block *sb, aufs_bindex_t br_id); -struct au_opt_add; -int au_br_add(struct super_block *sb, struct au_opt_add *add, int remount); -struct au_opt_del; -int au_br_del(struct super_block *sb, struct au_opt_del *del, int remount); -long au_ibusy_ioctl(struct file *file, unsigned long arg); -#ifdef CONFIG_COMPAT -long au_ibusy_compat_ioctl(struct file *file, unsigned long arg); -#endif -struct au_opt_mod; -int au_br_mod(struct super_block *sb, struct au_opt_mod *mod, int remount, - int *do_refresh); -struct aufs_stfs; -int au_br_stfs(struct au_branch *br, struct aufs_stfs *stfs); - -/* xino.c */ -static const loff_t au_loff_max = LLONG_MAX; - -int au_xib_trunc(struct super_block *sb); -ssize_t xino_fread(vfs_readf_t func, struct file *file, void *buf, size_t size, - loff_t *pos); -ssize_t xino_fwrite(vfs_writef_t func, struct file *file, void *buf, - size_t size, loff_t *pos); -struct file *au_xino_create2(struct file *base_file, struct file *copy_src); -struct file *au_xino_create(struct super_block *sb, char *fname, int silent); -ino_t au_xino_new_ino(struct super_block *sb); -void au_xino_delete_inode(struct inode *inode, const int unlinked); -int au_xino_write(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, - ino_t ino); -int au_xino_read(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, - ino_t *ino); -int au_xino_br(struct super_block *sb, struct au_branch *br, ino_t hino, - struct file *base_file, int do_test); -int au_xino_trunc(struct super_block *sb, aufs_bindex_t bindex); - -struct au_opt_xino; -int au_xino_set(struct super_block *sb, struct au_opt_xino *xino, int remount); -void au_xino_clr(struct super_block *sb); -struct file *au_xino_def(struct super_block *sb); -int au_xino_path(struct seq_file *seq, struct file *file); - -/* ---------------------------------------------------------------------- */ - -/* Superblock to branch */ -static inline -aufs_bindex_t au_sbr_id(struct super_block *sb, aufs_bindex_t bindex) -{ - return au_sbr(sb, bindex)->br_id; -} - -static inline -struct vfsmount *au_sbr_mnt(struct super_block *sb, aufs_bindex_t bindex) -{ - return au_br_mnt(au_sbr(sb, bindex)); -} - -static inline -struct super_block *au_sbr_sb(struct super_block *sb, aufs_bindex_t bindex) -{ - return au_br_sb(au_sbr(sb, bindex)); -} - -static inline void au_sbr_put(struct super_block *sb, aufs_bindex_t bindex) -{ - atomic_dec(&au_sbr(sb, bindex)->br_count); -} - -static inline int au_sbr_perm(struct super_block *sb, aufs_bindex_t bindex) -{ - return au_sbr(sb, bindex)->br_perm; -} - -static inline int au_sbr_whable(struct super_block *sb, aufs_bindex_t bindex) -{ - return au_br_whable(au_sbr_perm(sb, bindex)); -} - -/* ---------------------------------------------------------------------- */ - -/* - * wbr_wh_read_lock, wbr_wh_write_lock - * wbr_wh_read_unlock, wbr_wh_write_unlock, wbr_wh_downgrade_lock - */ -AuSimpleRwsemFuncs(wbr_wh, struct au_wbr *wbr, &wbr->wbr_wh_rwsem); - -#define WbrWhMustNoWaiters(wbr) AuRwMustNoWaiters(&wbr->wbr_wh_rwsem) -#define WbrWhMustAnyLock(wbr) AuRwMustAnyLock(&wbr->wbr_wh_rwsem) -#define WbrWhMustWriteLock(wbr) AuRwMustWriteLock(&wbr->wbr_wh_rwsem) - -/* ---------------------------------------------------------------------- */ - -#ifdef CONFIG_AUFS_FHSM -static inline void au_br_fhsm_init(struct au_br_fhsm *brfhsm) -{ - mutex_init(&brfhsm->bf_lock); - brfhsm->bf_jiffy = 0; - brfhsm->bf_readable = 0; -} - -static inline void au_br_fhsm_fin(struct au_br_fhsm *brfhsm) -{ - mutex_destroy(&brfhsm->bf_lock); -} -#else -AuStubVoid(au_br_fhsm_init, struct au_br_fhsm *brfhsm) -AuStubVoid(au_br_fhsm_fin, struct au_br_fhsm *brfhsm) -#endif - -#endif /* __KERNEL__ */ -#endif /* __AUFS_BRANCH_H__ */ diff --git a/fs/aufs/cpup.c b/fs/aufs/cpup.c deleted file mode 100644 index cadb3adb7..000000000 --- a/fs/aufs/cpup.c +++ /dev/null @@ -1,1366 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * copy-up functions, see wbr_policy.c for copy-down - */ - -#include <linux/fs_stack.h> -#include <linux/mm.h> -#include <linux/task_work.h> -#include "aufs.h" - -void au_cpup_attr_flags(struct inode *dst, unsigned int iflags) -{ - const unsigned int mask = S_DEAD | S_SWAPFILE | S_PRIVATE - | S_NOATIME | S_NOCMTIME | S_AUTOMOUNT; - - BUILD_BUG_ON(sizeof(iflags) != sizeof(dst->i_flags)); - - dst->i_flags |= iflags & ~mask; - if (au_test_fs_notime(dst->i_sb)) - dst->i_flags |= S_NOATIME | S_NOCMTIME; -} - -void au_cpup_attr_timesizes(struct inode *inode) -{ - struct inode *h_inode; - - h_inode = au_h_iptr(inode, au_ibstart(inode)); - fsstack_copy_attr_times(inode, h_inode); - fsstack_copy_inode_size(inode, h_inode); -} - -void au_cpup_attr_nlink(struct inode *inode, int force) -{ - struct inode *h_inode; - struct super_block *sb; - aufs_bindex_t bindex, bend; - - sb = inode->i_sb; - bindex = au_ibstart(inode); - h_inode = au_h_iptr(inode, bindex); - if (!force - && !S_ISDIR(h_inode->i_mode) - && au_opt_test(au_mntflags(sb), PLINK) - && au_plink_test(inode)) - return; - - /* - * 0 can happen in revalidating. - * h_inode->i_mutex may not be held here, but it is harmless since once - * i_nlink reaches 0, it will never become positive except O_TMPFILE - * case. - * todo: O_TMPFILE+linkat(AT_SYMLINK_FOLLOW) bypassing aufs may cause - * the incorrect link count. - */ - set_nlink(inode, h_inode->i_nlink); - - /* - * fewer nlink makes find(1) noisy, but larger nlink doesn't. - * it may includes whplink directory. - */ - if (S_ISDIR(h_inode->i_mode)) { - bend = au_ibend(inode); - for (bindex++; bindex <= bend; bindex++) { - h_inode = au_h_iptr(inode, bindex); - if (h_inode) - au_add_nlink(inode, h_inode); - } - } -} - -void au_cpup_attr_changeable(struct inode *inode) -{ - struct inode *h_inode; - - h_inode = au_h_iptr(inode, au_ibstart(inode)); - inode->i_mode = h_inode->i_mode; - inode->i_uid = h_inode->i_uid; - inode->i_gid = h_inode->i_gid; - au_cpup_attr_timesizes(inode); - au_cpup_attr_flags(inode, h_inode->i_flags); -} - -void au_cpup_igen(struct inode *inode, struct inode *h_inode) -{ - struct au_iinfo *iinfo = au_ii(inode); - - IiMustWriteLock(inode); - - iinfo->ii_higen = h_inode->i_generation; - iinfo->ii_hsb1 = h_inode->i_sb; -} - -void au_cpup_attr_all(struct inode *inode, int force) -{ - struct inode *h_inode; - - h_inode = au_h_iptr(inode, au_ibstart(inode)); - au_cpup_attr_changeable(inode); - if (inode->i_nlink > 0) - au_cpup_attr_nlink(inode, force); - inode->i_rdev = h_inode->i_rdev; - inode->i_blkbits = h_inode->i_blkbits; - au_cpup_igen(inode, h_inode); -} - -/* ---------------------------------------------------------------------- */ - -/* Note: dt_dentry and dt_h_dentry are not dget/dput-ed */ - -/* keep the timestamps of the parent dir when cpup */ -void au_dtime_store(struct au_dtime *dt, struct dentry *dentry, - struct path *h_path) -{ - struct inode *h_inode; - - dt->dt_dentry = dentry; - dt->dt_h_path = *h_path; - h_inode = d_inode(h_path->dentry); - dt->dt_atime = h_inode->i_atime; - dt->dt_mtime = h_inode->i_mtime; - /* smp_mb(); */ -} - -void au_dtime_revert(struct au_dtime *dt) -{ - struct iattr attr; - int err; - - attr.ia_atime = dt->dt_atime; - attr.ia_mtime = dt->dt_mtime; - attr.ia_valid = ATTR_FORCE | ATTR_MTIME | ATTR_MTIME_SET - | ATTR_ATIME | ATTR_ATIME_SET; - - /* no delegation since this is a directory */ - err = vfsub_notify_change(&dt->dt_h_path, &attr, /*delegated*/NULL); - if (unlikely(err)) - pr_warn("restoring timestamps failed(%d). ignored\n", err); -} - -/* ---------------------------------------------------------------------- */ - -/* internal use only */ -struct au_cpup_reg_attr { - int valid; - struct kstat st; - unsigned int iflags; /* inode->i_flags */ -}; - -static noinline_for_stack -int cpup_iattr(struct dentry *dst, aufs_bindex_t bindex, struct dentry *h_src, - struct au_cpup_reg_attr *h_src_attr) -{ - int err, sbits, icex; - unsigned int mnt_flags; - unsigned char verbose; - struct iattr ia; - struct path h_path; - struct inode *h_isrc, *h_idst; - struct kstat *h_st; - struct au_branch *br; - - h_path.dentry = au_h_dptr(dst, bindex); - h_idst = d_inode(h_path.dentry); - br = au_sbr(dst->d_sb, bindex); - h_path.mnt = au_br_mnt(br); - h_isrc = d_inode(h_src); - ia.ia_valid = ATTR_FORCE | ATTR_UID | ATTR_GID - | ATTR_ATIME | ATTR_MTIME - | ATTR_ATIME_SET | ATTR_MTIME_SET; - if (h_src_attr && h_src_attr->valid) { - h_st = &h_src_attr->st; - ia.ia_uid = h_st->uid; - ia.ia_gid = h_st->gid; - ia.ia_atime = h_st->atime; - ia.ia_mtime = h_st->mtime; - if (h_idst->i_mode != h_st->mode - && !S_ISLNK(h_idst->i_mode)) { - ia.ia_valid |= ATTR_MODE; - ia.ia_mode = h_st->mode; - } - sbits = !!(h_st->mode & (S_ISUID | S_ISGID)); - au_cpup_attr_flags(h_idst, h_src_attr->iflags); - } else { - ia.ia_uid = h_isrc->i_uid; - ia.ia_gid = h_isrc->i_gid; - ia.ia_atime = h_isrc->i_atime; - ia.ia_mtime = h_isrc->i_mtime; - if (h_idst->i_mode != h_isrc->i_mode - && !S_ISLNK(h_idst->i_mode)) { - ia.ia_valid |= ATTR_MODE; - ia.ia_mode = h_isrc->i_mode; - } - sbits = !!(h_isrc->i_mode & (S_ISUID | S_ISGID)); - au_cpup_attr_flags(h_idst, h_isrc->i_flags); - } - /* no delegation since it is just created */ - err = vfsub_notify_change(&h_path, &ia, /*delegated*/NULL); - - /* is this nfs only? */ - if (!err && sbits && au_test_nfs(h_path.dentry->d_sb)) { - ia.ia_valid = ATTR_FORCE | ATTR_MODE; - ia.ia_mode = h_isrc->i_mode; - err = vfsub_notify_change(&h_path, &ia, /*delegated*/NULL); - } - - icex = br->br_perm & AuBrAttr_ICEX; - if (!err) { - mnt_flags = au_mntflags(dst->d_sb); - verbose = !!au_opt_test(mnt_flags, VERBOSE); - err = au_cpup_xattr(h_path.dentry, h_src, icex, verbose); - } - - return err; -} - -/* ---------------------------------------------------------------------- */ - -static int au_do_copy_file(struct file *dst, struct file *src, loff_t len, - char *buf, unsigned long blksize) -{ - int err; - size_t sz, rbytes, wbytes; - unsigned char all_zero; - char *p, *zp; - struct mutex *h_mtx; - /* reduce stack usage */ - struct iattr *ia; - - zp = page_address(ZERO_PAGE(0)); - if (unlikely(!zp)) - return -ENOMEM; /* possible? */ - - err = 0; - all_zero = 0; - while (len) { - AuDbg("len %lld\n", len); - sz = blksize; - if (len < blksize) - sz = len; - - rbytes = 0; - /* todo: signal_pending? */ - while (!rbytes || err == -EAGAIN || err == -EINTR) { - rbytes = vfsub_read_k(src, buf, sz, &src->f_pos); - err = rbytes; - } - if (unlikely(err < 0)) - break; - - all_zero = 0; - if (len >= rbytes && rbytes == blksize) - all_zero = !memcmp(buf, zp, rbytes); - if (!all_zero) { - wbytes = rbytes; - p = buf; - while (wbytes) { - size_t b; - - b = vfsub_write_k(dst, p, wbytes, &dst->f_pos); - err = b; - /* todo: signal_pending? */ - if (unlikely(err == -EAGAIN || err == -EINTR)) - continue; - if (unlikely(err < 0)) - break; - wbytes -= b; - p += b; - } - if (unlikely(err < 0)) - break; - } else { - loff_t res; - - AuLabel(hole); - res = vfsub_llseek(dst, rbytes, SEEK_CUR); - err = res; - if (unlikely(res < 0)) - break; - } - len -= rbytes; - err = 0; - } - - /* the last block may be a hole */ - if (!err && all_zero) { - AuLabel(last hole); - - err = 1; - if (au_test_nfs(dst->f_path.dentry->d_sb)) { - /* nfs requires this step to make last hole */ - /* is this only nfs? */ - do { - /* todo: signal_pending? */ - err = vfsub_write_k(dst, "\0", 1, &dst->f_pos); - } while (err == -EAGAIN || err == -EINTR); - if (err == 1) - dst->f_pos--; - } - - if (err == 1) { - ia = (void *)buf; - ia->ia_size = dst->f_pos; - ia->ia_valid = ATTR_SIZE | ATTR_FILE; - ia->ia_file = dst; - h_mtx = &file_inode(dst)->i_mutex; - mutex_lock_nested(h_mtx, AuLsc_I_CHILD2); - /* no delegation since it is just created */ - err = vfsub_notify_change(&dst->f_path, ia, - /*delegated*/NULL); - mutex_unlock(h_mtx); - } - } - - return err; -} - -int au_copy_file(struct file *dst, struct file *src, loff_t len) -{ - int err; - unsigned long blksize; - unsigned char do_kfree; - char *buf; - - err = -ENOMEM; - blksize = dst->f_path.dentry->d_sb->s_blocksize; - if (!blksize || PAGE_SIZE < blksize) - blksize = PAGE_SIZE; - AuDbg("blksize %lu\n", blksize); - do_kfree = (blksize != PAGE_SIZE && blksize >= sizeof(struct iattr *)); - if (do_kfree) - buf = kmalloc(blksize, GFP_NOFS); - else - buf = (void *)__get_free_page(GFP_NOFS); - if (unlikely(!buf)) - goto out; - - if (len > (1 << 22)) - AuDbg("copying a large file %lld\n", (long long)len); - - src->f_pos = 0; - dst->f_pos = 0; - err = au_do_copy_file(dst, src, len, buf, blksize); - if (do_kfree) - kfree(buf); - else - free_page((unsigned long)buf); - -out: - return err; -} - -/* - * to support a sparse file which is opened with O_APPEND, - * we need to close the file. - */ -static int au_cp_regular(struct au_cp_generic *cpg) -{ - int err, i; - enum { SRC, DST }; - struct { - aufs_bindex_t bindex; - unsigned int flags; - struct dentry *dentry; - int force_wr; - struct file *file; - void *label; - } *f, file[] = { - { - .bindex = cpg->bsrc, - .flags = O_RDONLY | O_NOATIME | O_LARGEFILE, - .label = &&out - }, - { - .bindex = cpg->bdst, - .flags = O_WRONLY | O_NOATIME | O_LARGEFILE, - .force_wr = !!au_ftest_cpup(cpg->flags, RWDST), - .label = &&out_src - } - }; - struct super_block *sb; - struct task_struct *tsk = current; - - /* bsrc branch can be ro/rw. */ - sb = cpg->dentry->d_sb; - f = file; - for (i = 0; i < 2; i++, f++) { - f->dentry = au_h_dptr(cpg->dentry, f->bindex); - f->file = au_h_open(cpg->dentry, f->bindex, f->flags, - /*file*/NULL, f->force_wr); - err = PTR_ERR(f->file); - if (IS_ERR(f->file)) - goto *f->label; - } - - /* try stopping to update while we copyup */ - IMustLock(d_inode(file[SRC].dentry)); - err = au_copy_file(file[DST].file, file[SRC].file, cpg->len); - - /* i wonder if we had O_NO_DELAY_FPUT flag */ - if (tsk->flags & PF_KTHREAD) - __fput_sync(file[DST].file); - else { - WARN(1, "%pD\nPlease report this warning to aufs-users ML", - file[DST].file); - fput(file[DST].file); - /* - * too bad. - * we have to call both since we don't know which place the file - * was added to. - */ - task_work_run(); - flush_delayed_fput(); - } - au_sbr_put(sb, file[DST].bindex); - -out_src: - fput(file[SRC].file); - au_sbr_put(sb, file[SRC].bindex); -out: - return err; -} - -static int au_do_cpup_regular(struct au_cp_generic *cpg, - struct au_cpup_reg_attr *h_src_attr) -{ - int err, rerr; - loff_t l; - struct path h_path; - struct inode *h_src_inode, *h_dst_inode; - - err = 0; - h_src_inode = au_h_iptr(d_inode(cpg->dentry), cpg->bsrc); - l = i_size_read(h_src_inode); - if (cpg->len == -1 || l < cpg->len) - cpg->len = l; - if (cpg->len) { - /* try stopping to update while we are referencing */ - mutex_lock_nested(&h_src_inode->i_mutex, AuLsc_I_CHILD); - au_pin_hdir_unlock(cpg->pin); - - h_path.dentry = au_h_dptr(cpg->dentry, cpg->bsrc); - h_path.mnt = au_sbr_mnt(cpg->dentry->d_sb, cpg->bsrc); - h_src_attr->iflags = h_src_inode->i_flags; - if (!au_test_nfs(h_src_inode->i_sb)) - err = vfs_getattr(&h_path, &h_src_attr->st); - else { - mutex_unlock(&h_src_inode->i_mutex); - err = vfs_getattr(&h_path, &h_src_attr->st); - mutex_lock_nested(&h_src_inode->i_mutex, AuLsc_I_CHILD); - } - if (unlikely(err)) { - mutex_unlock(&h_src_inode->i_mutex); - goto out; - } - h_src_attr->valid = 1; - err = au_cp_regular(cpg); - mutex_unlock(&h_src_inode->i_mutex); - rerr = au_pin_hdir_relock(cpg->pin); - if (!err && rerr) - err = rerr; - } - if (!err && (h_src_inode->i_state & I_LINKABLE)) { - h_path.dentry = au_h_dptr(cpg->dentry, cpg->bdst); - h_dst_inode = d_inode(h_path.dentry); - spin_lock(&h_dst_inode->i_lock); - h_dst_inode->i_state |= I_LINKABLE; - spin_unlock(&h_dst_inode->i_lock); - } - -out: - return err; -} - -static int au_do_cpup_symlink(struct path *h_path, struct dentry *h_src, - struct inode *h_dir) -{ - int err, symlen; - mm_segment_t old_fs; - union { - char *k; - char __user *u; - } sym; - struct inode *h_inode = d_inode(h_src); - const struct inode_operations *h_iop = h_inode->i_op; - - err = -ENOSYS; - if (unlikely(!h_iop->readlink)) - goto out; - - err = -ENOMEM; - sym.k = (void *)__get_free_page(GFP_NOFS); - if (unlikely(!sym.k)) - goto out; - - /* unnecessary to support mmap_sem since symlink is not mmap-able */ - old_fs = get_fs(); - set_fs(KERNEL_DS); - symlen = h_iop->readlink(h_src, sym.u, PATH_MAX); - err = symlen; - set_fs(old_fs); - - if (symlen > 0) { - sym.k[symlen] = 0; - err = vfsub_symlink(h_dir, h_path, sym.k); - } - free_page((unsigned long)sym.k); - -out: - return err; -} - -/* - * regardless 'acl' option, reset all ACL. - * All ACL will be copied up later from the original entry on the lower branch. - */ -static int au_reset_acl(struct inode *h_dir, struct path *h_path, umode_t mode) -{ - int err; - struct dentry *h_dentry; - struct inode *h_inode; - - h_dentry = h_path->dentry; - h_inode = d_inode(h_dentry); - /* forget_all_cached_acls(h_inode)); */ - err = vfsub_removexattr(h_dentry, XATTR_NAME_POSIX_ACL_ACCESS); - AuTraceErr(err); - if (err == -EOPNOTSUPP) - err = 0; - if (!err) - err = vfsub_acl_chmod(h_inode, mode); - - AuTraceErr(err); - return err; -} - -static int au_do_cpup_dir(struct au_cp_generic *cpg, struct dentry *dst_parent, - struct inode *h_dir, struct path *h_path) -{ - int err; - struct inode *dir, *inode; - - err = vfsub_removexattr(h_path->dentry, XATTR_NAME_POSIX_ACL_DEFAULT); - AuTraceErr(err); - if (err == -EOPNOTSUPP) - err = 0; - if (unlikely(err)) - goto out; - - /* - * strange behaviour from the users view, - * particularry setattr case - */ - dir = d_inode(dst_parent); - if (au_ibstart(dir) == cpg->bdst) - au_cpup_attr_nlink(dir, /*force*/1); - inode = d_inode(cpg->dentry); - au_cpup_attr_nlink(inode, /*force*/1); - -out: - return err; -} - -static noinline_for_stack -int cpup_entry(struct au_cp_generic *cpg, struct dentry *dst_parent, - struct au_cpup_reg_attr *h_src_attr) -{ - int err; - umode_t mode; - unsigned int mnt_flags; - unsigned char isdir, isreg, force; - const unsigned char do_dt = !!au_ftest_cpup(cpg->flags, DTIME); - struct au_dtime dt; - struct path h_path; - struct dentry *h_src, *h_dst, *h_parent; - struct inode *h_inode, *h_dir; - struct super_block *sb; - - /* bsrc branch can be ro/rw. */ - h_src = au_h_dptr(cpg->dentry, cpg->bsrc); - h_inode = d_inode(h_src); - AuDebugOn(h_inode != au_h_iptr(d_inode(cpg->dentry), cpg->bsrc)); - - /* try stopping to be referenced while we are creating */ - h_dst = au_h_dptr(cpg->dentry, cpg->bdst); - if (au_ftest_cpup(cpg->flags, RENAME)) - AuDebugOn(strncmp(h_dst->d_name.name, AUFS_WH_PFX, - AUFS_WH_PFX_LEN)); - h_parent = h_dst->d_parent; /* dir inode is locked */ - h_dir = d_inode(h_parent); - IMustLock(h_dir); - AuDebugOn(h_parent != h_dst->d_parent); - - sb = cpg->dentry->d_sb; - h_path.mnt = au_sbr_mnt(sb, cpg->bdst); - if (do_dt) { - h_path.dentry = h_parent; - au_dtime_store(&dt, dst_parent, &h_path); - } - h_path.dentry = h_dst; - - isreg = 0; - isdir = 0; - mode = h_inode->i_mode; - switch (mode & S_IFMT) { - case S_IFREG: - isreg = 1; - err = vfsub_create(h_dir, &h_path, S_IRUSR | S_IWUSR, - /*want_excl*/true); - if (!err) - err = au_do_cpup_regular(cpg, h_src_attr); - break; - case S_IFDIR: - isdir = 1; - err = vfsub_mkdir(h_dir, &h_path, mode); - if (!err) - err = au_do_cpup_dir(cpg, dst_parent, h_dir, &h_path); - break; - case S_IFLNK: - err = au_do_cpup_symlink(&h_path, h_src, h_dir); - break; - case S_IFCHR: - case S_IFBLK: - AuDebugOn(!capable(CAP_MKNOD)); - /*FALLTHROUGH*/ - case S_IFIFO: - case S_IFSOCK: - err = vfsub_mknod(h_dir, &h_path, mode, h_inode->i_rdev); - break; - default: - AuIOErr("Unknown inode type 0%o\n", mode); - err = -EIO; - } - if (!err) - err = au_reset_acl(h_dir, &h_path, mode); - - mnt_flags = au_mntflags(sb); - if (!au_opt_test(mnt_flags, UDBA_NONE) - && !isdir - && au_opt_test(mnt_flags, XINO) - && (h_inode->i_nlink == 1 - || (h_inode->i_state & I_LINKABLE)) - /* todo: unnecessary? */ - /* && d_inode(cpg->dentry)->i_nlink == 1 */ - && cpg->bdst < cpg->bsrc - && !au_ftest_cpup(cpg->flags, KEEPLINO)) - au_xino_write(sb, cpg->bsrc, h_inode->i_ino, /*ino*/0); - /* ignore this error */ - - if (!err) { - force = 0; - if (isreg) { - force = !!cpg->len; - if (cpg->len == -1) - force = !!i_size_read(h_inode); - } - au_fhsm_wrote(sb, cpg->bdst, force); - } - - if (do_dt) - au_dtime_revert(&dt); - return err; -} - -static int au_do_ren_after_cpup(struct au_cp_generic *cpg, struct path *h_path) -{ - int err; - struct dentry *dentry, *h_dentry, *h_parent, *parent; - struct inode *h_dir; - aufs_bindex_t bdst; - - dentry = cpg->dentry; - bdst = cpg->bdst; - h_dentry = au_h_dptr(dentry, bdst); - if (!au_ftest_cpup(cpg->flags, OVERWRITE)) { - dget(h_dentry); - au_set_h_dptr(dentry, bdst, NULL); - err = au_lkup_neg(dentry, bdst, /*wh*/0); - if (!err) - h_path->dentry = dget(au_h_dptr(dentry, bdst)); - au_set_h_dptr(dentry, bdst, h_dentry); - } else { - err = 0; - parent = dget_parent(dentry); - h_parent = au_h_dptr(parent, bdst); - dput(parent); - h_path->dentry = vfsub_lkup_one(&dentry->d_name, h_parent); - if (IS_ERR(h_path->dentry)) - err = PTR_ERR(h_path->dentry); - } - if (unlikely(err)) - goto out; - - h_parent = h_dentry->d_parent; /* dir inode is locked */ - h_dir = d_inode(h_parent); - IMustLock(h_dir); - AuDbg("%pd %pd\n", h_dentry, h_path->dentry); - /* no delegation since it is just created */ - err = vfsub_rename(h_dir, h_dentry, h_dir, h_path, /*delegated*/NULL); - dput(h_path->dentry); - -out: - return err; -} - -/* - * copyup the @dentry from @bsrc to @bdst. - * the caller must set the both of lower dentries. - * @len is for truncating when it is -1 copyup the entire file. - * in link/rename cases, @dst_parent may be different from the real one. - * basic->bsrc can be larger than basic->bdst. - */ -static int au_cpup_single(struct au_cp_generic *cpg, struct dentry *dst_parent) -{ - int err, rerr; - aufs_bindex_t old_ibstart; - unsigned char isdir, plink; - struct dentry *h_src, *h_dst, *h_parent; - struct inode *dst_inode, *h_dir, *inode, *delegated, *src_inode; - struct super_block *sb; - struct au_branch *br; - /* to reuduce stack size */ - struct { - struct au_dtime dt; - struct path h_path; - struct au_cpup_reg_attr h_src_attr; - } *a; - - err = -ENOMEM; - a = kmalloc(sizeof(*a), GFP_NOFS); - if (unlikely(!a)) - goto out; - a->h_src_attr.valid = 0; - - sb = cpg->dentry->d_sb; - br = au_sbr(sb, cpg->bdst); - a->h_path.mnt = au_br_mnt(br); - h_dst = au_h_dptr(cpg->dentry, cpg->bdst); - h_parent = h_dst->d_parent; /* dir inode is locked */ - h_dir = d_inode(h_parent); - IMustLock(h_dir); - - h_src = au_h_dptr(cpg->dentry, cpg->bsrc); - inode = d_inode(cpg->dentry); - - if (!dst_parent) - dst_parent = dget_parent(cpg->dentry); - else - dget(dst_parent); - - plink = !!au_opt_test(au_mntflags(sb), PLINK); - dst_inode = au_h_iptr(inode, cpg->bdst); - if (dst_inode) { - if (unlikely(!plink)) { - err = -EIO; - AuIOErr("hi%lu(i%lu) exists on b%d " - "but plink is disabled\n", - dst_inode->i_ino, inode->i_ino, cpg->bdst); - goto out_parent; - } - - if (dst_inode->i_nlink) { - const int do_dt = au_ftest_cpup(cpg->flags, DTIME); - - h_src = au_plink_lkup(inode, cpg->bdst); - err = PTR_ERR(h_src); - if (IS_ERR(h_src)) - goto out_parent; - if (unlikely(d_is_negative(h_src))) { - err = -EIO; - AuIOErr("i%lu exists on b%d " - "but not pseudo-linked\n", - inode->i_ino, cpg->bdst); - dput(h_src); - goto out_parent; - } - - if (do_dt) { - a->h_path.dentry = h_parent; - au_dtime_store(&a->dt, dst_parent, &a->h_path); - } - - a->h_path.dentry = h_dst; - delegated = NULL; - err = vfsub_link(h_src, h_dir, &a->h_path, &delegated); - if (!err && au_ftest_cpup(cpg->flags, RENAME)) - err = au_do_ren_after_cpup(cpg, &a->h_path); - if (do_dt) - au_dtime_revert(&a->dt); - if (unlikely(err == -EWOULDBLOCK)) { - pr_warn("cannot retry for NFSv4 delegation" - " for an internal link\n"); - iput(delegated); - } - dput(h_src); - goto out_parent; - } else - /* todo: cpup_wh_file? */ - /* udba work */ - au_update_ibrange(inode, /*do_put_zero*/1); - } - - isdir = S_ISDIR(inode->i_mode); - old_ibstart = au_ibstart(inode); - err = cpup_entry(cpg, dst_parent, &a->h_src_attr); - if (unlikely(err)) - goto out_rev; - dst_inode = d_inode(h_dst); - mutex_lock_nested(&dst_inode->i_mutex, AuLsc_I_CHILD2); - /* todo: necessary? */ - /* au_pin_hdir_unlock(cpg->pin); */ - - err = cpup_iattr(cpg->dentry, cpg->bdst, h_src, &a->h_src_attr); - if (unlikely(err)) { - /* todo: necessary? */ - /* au_pin_hdir_relock(cpg->pin); */ /* ignore an error */ - mutex_unlock(&dst_inode->i_mutex); - goto out_rev; - } - - if (cpg->bdst < old_ibstart) { - if (S_ISREG(inode->i_mode)) { - err = au_dy_iaop(inode, cpg->bdst, dst_inode); - if (unlikely(err)) { - /* ignore an error */ - /* au_pin_hdir_relock(cpg->pin); */ - mutex_unlock(&dst_inode->i_mutex); - goto out_rev; - } - } - au_set_ibstart(inode, cpg->bdst); - } else - au_set_ibend(inode, cpg->bdst); - au_set_h_iptr(inode, cpg->bdst, au_igrab(dst_inode), - au_hi_flags(inode, isdir)); - - /* todo: necessary? */ - /* err = au_pin_hdir_relock(cpg->pin); */ - mutex_unlock(&dst_inode->i_mutex); - if (unlikely(err)) - goto out_rev; - - src_inode = d_inode(h_src); - if (!isdir - && (src_inode->i_nlink > 1 - || src_inode->i_state & I_LINKABLE) - && plink) - au_plink_append(inode, cpg->bdst, h_dst); - - if (au_ftest_cpup(cpg->flags, RENAME)) { - a->h_path.dentry = h_dst; - err = au_do_ren_after_cpup(cpg, &a->h_path); - } - if (!err) - goto out_parent; /* success */ - - /* revert */ -out_rev: - a->h_path.dentry = h_parent; - au_dtime_store(&a->dt, dst_parent, &a->h_path); - a->h_path.dentry = h_dst; - rerr = 0; - if (d_is_positive(h_dst)) { - if (!isdir) { - /* no delegation since it is just created */ - rerr = vfsub_unlink(h_dir, &a->h_path, - /*delegated*/NULL, /*force*/0); - } else - rerr = vfsub_rmdir(h_dir, &a->h_path); - } - au_dtime_revert(&a->dt); - if (rerr) { - AuIOErr("failed removing broken entry(%d, %d)\n", err, rerr); - err = -EIO; - } -out_parent: - dput(dst_parent); - kfree(a); -out: - return err; -} - -#if 0 /* reserved */ -struct au_cpup_single_args { - int *errp; - struct au_cp_generic *cpg; - struct dentry *dst_parent; -}; - -static void au_call_cpup_single(void *args) -{ - struct au_cpup_single_args *a = args; - - au_pin_hdir_acquire_nest(a->cpg->pin); - *a->errp = au_cpup_single(a->cpg, a->dst_parent); - au_pin_hdir_release(a->cpg->pin); -} -#endif - -/* - * prevent SIGXFSZ in copy-up. - * testing CAP_MKNOD is for generic fs, - * but CAP_FSETID is for xfs only, currently. - */ -static int au_cpup_sio_test(struct au_pin *pin, umode_t mode) -{ - int do_sio; - struct super_block *sb; - struct inode *h_dir; - - do_sio = 0; - sb = au_pinned_parent(pin)->d_sb; - if (!au_wkq_test() - && (!au_sbi(sb)->si_plink_maint_pid - || au_plink_maint(sb, AuLock_NOPLM))) { - switch (mode & S_IFMT) { - case S_IFREG: - /* no condition about RLIMIT_FSIZE and the file size */ - do_sio = 1; - break; - case S_IFCHR: - case S_IFBLK: - do_sio = !capable(CAP_MKNOD); - break; - } - if (!do_sio) - do_sio = ((mode & (S_ISUID | S_ISGID)) - && !capable(CAP_FSETID)); - /* this workaround may be removed in the future */ - if (!do_sio) { - h_dir = au_pinned_h_dir(pin); - do_sio = h_dir->i_mode & S_ISVTX; - } - } - - return do_sio; -} - -#if 0 /* reserved */ -int au_sio_cpup_single(struct au_cp_generic *cpg, struct dentry *dst_parent) -{ - int err, wkq_err; - struct dentry *h_dentry; - - h_dentry = au_h_dptr(cpg->dentry, cpg->bsrc); - if (!au_cpup_sio_test(pin, d_inode(h_dentry)->i_mode)) - err = au_cpup_single(cpg, dst_parent); - else { - struct au_cpup_single_args args = { - .errp = &err, - .cpg = cpg, - .dst_parent = dst_parent - }; - wkq_err = au_wkq_wait(au_call_cpup_single, &args); - if (unlikely(wkq_err)) - err = wkq_err; - } - - return err; -} -#endif - -/* - * copyup the @dentry from the first active lower branch to @bdst, - * using au_cpup_single(). - */ -static int au_cpup_simple(struct au_cp_generic *cpg) -{ - int err; - unsigned int flags_orig; - struct dentry *dentry; - - AuDebugOn(cpg->bsrc < 0); - - dentry = cpg->dentry; - DiMustWriteLock(dentry); - - err = au_lkup_neg(dentry, cpg->bdst, /*wh*/1); - if (!err) { - flags_orig = cpg->flags; - au_fset_cpup(cpg->flags, RENAME); - err = au_cpup_single(cpg, NULL); - cpg->flags = flags_orig; - if (!err) - return 0; /* success */ - - /* revert */ - au_set_h_dptr(dentry, cpg->bdst, NULL); - au_set_dbstart(dentry, cpg->bsrc); - } - - return err; -} - -struct au_cpup_simple_args { - int *errp; - struct au_cp_generic *cpg; -}; - -static void au_call_cpup_simple(void *args) -{ - struct au_cpup_simple_args *a = args; - - au_pin_hdir_acquire_nest(a->cpg->pin); - *a->errp = au_cpup_simple(a->cpg); - au_pin_hdir_release(a->cpg->pin); -} - -static int au_do_sio_cpup_simple(struct au_cp_generic *cpg) -{ - int err, wkq_err; - struct dentry *dentry, *parent; - struct file *h_file; - struct inode *h_dir; - - dentry = cpg->dentry; - h_file = NULL; - if (au_ftest_cpup(cpg->flags, HOPEN)) { - AuDebugOn(cpg->bsrc < 0); - h_file = au_h_open_pre(dentry, cpg->bsrc, /*force_wr*/0); - err = PTR_ERR(h_file); - if (IS_ERR(h_file)) - goto out; - } - - parent = dget_parent(dentry); - h_dir = au_h_iptr(d_inode(parent), cpg->bdst); - if (!au_test_h_perm_sio(h_dir, MAY_EXEC | MAY_WRITE) - && !au_cpup_sio_test(cpg->pin, d_inode(dentry)->i_mode)) - err = au_cpup_simple(cpg); - else { - struct au_cpup_simple_args args = { - .errp = &err, - .cpg = cpg - }; - wkq_err = au_wkq_wait(au_call_cpup_simple, &args); - if (unlikely(wkq_err)) - err = wkq_err; - } - - dput(parent); - if (h_file) - au_h_open_post(dentry, cpg->bsrc, h_file); - -out: - return err; -} - -int au_sio_cpup_simple(struct au_cp_generic *cpg) -{ - aufs_bindex_t bsrc, bend; - struct dentry *dentry, *h_dentry; - - if (cpg->bsrc < 0) { - dentry = cpg->dentry; - bend = au_dbend(dentry); - for (bsrc = cpg->bdst + 1; bsrc <= bend; bsrc++) { - h_dentry = au_h_dptr(dentry, bsrc); - if (h_dentry) { - AuDebugOn(d_is_negative(h_dentry)); - break; - } - } - AuDebugOn(bsrc > bend); - cpg->bsrc = bsrc; - } - AuDebugOn(cpg->bsrc <= cpg->bdst); - return au_do_sio_cpup_simple(cpg); -} - -int au_sio_cpdown_simple(struct au_cp_generic *cpg) -{ - AuDebugOn(cpg->bdst <= cpg->bsrc); - return au_do_sio_cpup_simple(cpg); -} - -/* ---------------------------------------------------------------------- */ - -/* - * copyup the deleted file for writing. - */ -static int au_do_cpup_wh(struct au_cp_generic *cpg, struct dentry *wh_dentry, - struct file *file) -{ - int err; - unsigned int flags_orig; - aufs_bindex_t bsrc_orig; - struct dentry *h_d_dst, *h_d_start; - struct au_dinfo *dinfo; - struct au_hdentry *hdp; - - dinfo = au_di(cpg->dentry); - AuRwMustWriteLock(&dinfo->di_rwsem); - - bsrc_orig = cpg->bsrc; - cpg->bsrc = dinfo->di_bstart; - hdp = dinfo->di_hdentry; - h_d_dst = hdp[0 + cpg->bdst].hd_dentry; - dinfo->di_bstart = cpg->bdst; - hdp[0 + cpg->bdst].hd_dentry = wh_dentry; - h_d_start = NULL; - if (file) { - h_d_start = hdp[0 + cpg->bsrc].hd_dentry; - hdp[0 + cpg->bsrc].hd_dentry = au_hf_top(file)->f_path.dentry; - } - flags_orig = cpg->flags; - cpg->flags = !AuCpup_DTIME; - err = au_cpup_single(cpg, /*h_parent*/NULL); - cpg->flags = flags_orig; - if (file) { - if (!err) - err = au_reopen_nondir(file); - hdp[0 + cpg->bsrc].hd_dentry = h_d_start; - } - hdp[0 + cpg->bdst].hd_dentry = h_d_dst; - dinfo->di_bstart = cpg->bsrc; - cpg->bsrc = bsrc_orig; - - return err; -} - -static int au_cpup_wh(struct au_cp_generic *cpg, struct file *file) -{ - int err; - aufs_bindex_t bdst; - struct au_dtime dt; - struct dentry *dentry, *parent, *h_parent, *wh_dentry; - struct au_branch *br; - struct path h_path; - - dentry = cpg->dentry; - bdst = cpg->bdst; - br = au_sbr(dentry->d_sb, bdst); - parent = dget_parent(dentry); - h_parent = au_h_dptr(parent, bdst); - wh_dentry = au_whtmp_lkup(h_parent, br, &dentry->d_name); - err = PTR_ERR(wh_dentry); - if (IS_ERR(wh_dentry)) - goto out; - - h_path.dentry = h_parent; - h_path.mnt = au_br_mnt(br); - au_dtime_store(&dt, parent, &h_path); - err = au_do_cpup_wh(cpg, wh_dentry, file); - if (unlikely(err)) - goto out_wh; - - dget(wh_dentry); - h_path.dentry = wh_dentry; - if (!d_is_dir(wh_dentry)) { - /* no delegation since it is just created */ - err = vfsub_unlink(d_inode(h_parent), &h_path, - /*delegated*/NULL, /*force*/0); - } else - err = vfsub_rmdir(d_inode(h_parent), &h_path); - if (unlikely(err)) { - AuIOErr("failed remove copied-up tmp file %pd(%d)\n", - wh_dentry, err); - err = -EIO; - } - au_dtime_revert(&dt); - au_set_hi_wh(d_inode(dentry), bdst, wh_dentry); - -out_wh: - dput(wh_dentry); -out: - dput(parent); - return err; -} - -struct au_cpup_wh_args { - int *errp; - struct au_cp_generic *cpg; - struct file *file; -}; - -static void au_call_cpup_wh(void *args) -{ - struct au_cpup_wh_args *a = args; - - au_pin_hdir_acquire_nest(a->cpg->pin); - *a->errp = au_cpup_wh(a->cpg, a->file); - au_pin_hdir_release(a->cpg->pin); -} - -int au_sio_cpup_wh(struct au_cp_generic *cpg, struct file *file) -{ - int err, wkq_err; - aufs_bindex_t bdst; - struct dentry *dentry, *parent, *h_orph, *h_parent; - struct inode *dir, *h_dir, *h_tmpdir; - struct au_wbr *wbr; - struct au_pin wh_pin, *pin_orig; - - dentry = cpg->dentry; - bdst = cpg->bdst; - parent = dget_parent(dentry); - dir = d_inode(parent); - h_orph = NULL; - h_parent = NULL; - h_dir = au_igrab(au_h_iptr(dir, bdst)); - h_tmpdir = h_dir; - pin_orig = NULL; - if (!h_dir->i_nlink) { - wbr = au_sbr(dentry->d_sb, bdst)->br_wbr; - h_orph = wbr->wbr_orph; - - h_parent = dget(au_h_dptr(parent, bdst)); - au_set_h_dptr(parent, bdst, dget(h_orph)); - h_tmpdir = d_inode(h_orph); - au_set_h_iptr(dir, bdst, au_igrab(h_tmpdir), /*flags*/0); - - mutex_lock_nested(&h_tmpdir->i_mutex, AuLsc_I_PARENT3); - /* todo: au_h_open_pre()? */ - - pin_orig = cpg->pin; - au_pin_init(&wh_pin, dentry, bdst, AuLsc_DI_PARENT, - AuLsc_I_PARENT3, cpg->pin->udba, AuPin_DI_LOCKED); - cpg->pin = &wh_pin; - } - - if (!au_test_h_perm_sio(h_tmpdir, MAY_EXEC | MAY_WRITE) - && !au_cpup_sio_test(cpg->pin, d_inode(dentry)->i_mode)) - err = au_cpup_wh(cpg, file); - else { - struct au_cpup_wh_args args = { - .errp = &err, - .cpg = cpg, - .file = file - }; - wkq_err = au_wkq_wait(au_call_cpup_wh, &args); - if (unlikely(wkq_err)) - err = wkq_err; - } - - if (h_orph) { - mutex_unlock(&h_tmpdir->i_mutex); - /* todo: au_h_open_post()? */ - au_set_h_iptr(dir, bdst, au_igrab(h_dir), /*flags*/0); - au_set_h_dptr(parent, bdst, h_parent); - AuDebugOn(!pin_orig); - cpg->pin = pin_orig; - } - iput(h_dir); - dput(parent); - - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* - * generic routine for both of copy-up and copy-down. - */ -/* cf. revalidate function in file.c */ -int au_cp_dirs(struct dentry *dentry, aufs_bindex_t bdst, - int (*cp)(struct dentry *dentry, aufs_bindex_t bdst, - struct au_pin *pin, - struct dentry *h_parent, void *arg), - void *arg) -{ - int err; - struct au_pin pin; - struct dentry *d, *parent, *h_parent, *real_parent, *h_dentry; - - err = 0; - parent = dget_parent(dentry); - if (IS_ROOT(parent)) - goto out; - - au_pin_init(&pin, dentry, bdst, AuLsc_DI_PARENT2, AuLsc_I_PARENT2, - au_opt_udba(dentry->d_sb), AuPin_MNT_WRITE); - - /* do not use au_dpage */ - real_parent = parent; - while (1) { - dput(parent); - parent = dget_parent(dentry); - h_parent = au_h_dptr(parent, bdst); - if (h_parent) - goto out; /* success */ - - /* find top dir which is necessary to cpup */ - do { - d = parent; - dput(parent); - parent = dget_parent(d); - di_read_lock_parent3(parent, !AuLock_IR); - h_parent = au_h_dptr(parent, bdst); - di_read_unlock(parent, !AuLock_IR); - } while (!h_parent); - - if (d != real_parent) - di_write_lock_child3(d); - - /* somebody else might create while we were sleeping */ - h_dentry = au_h_dptr(d, bdst); - if (!h_dentry || d_is_negative(h_dentry)) { - if (h_dentry) - au_update_dbstart(d); - - au_pin_set_dentry(&pin, d); - err = au_do_pin(&pin); - if (!err) { - err = cp(d, bdst, &pin, h_parent, arg); - au_unpin(&pin); - } - } - - if (d != real_parent) - di_write_unlock(d); - if (unlikely(err)) - break; - } - -out: - dput(parent); - return err; -} - -static int au_cpup_dir(struct dentry *dentry, aufs_bindex_t bdst, - struct au_pin *pin, - struct dentry *h_parent __maybe_unused, - void *arg __maybe_unused) -{ - struct au_cp_generic cpg = { - .dentry = dentry, - .bdst = bdst, - .bsrc = -1, - .len = 0, - .pin = pin, - .flags = AuCpup_DTIME - }; - return au_sio_cpup_simple(&cpg); -} - -int au_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst) -{ - return au_cp_dirs(dentry, bdst, au_cpup_dir, NULL); -} - -int au_test_and_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst) -{ - int err; - struct dentry *parent; - struct inode *dir; - - parent = dget_parent(dentry); - dir = d_inode(parent); - err = 0; - if (au_h_iptr(dir, bdst)) - goto out; - - di_read_unlock(parent, AuLock_IR); - di_write_lock_parent(parent); - /* someone else might change our inode while we were sleeping */ - if (!au_h_iptr(dir, bdst)) - err = au_cpup_dirs(dentry, bdst); - di_downgrade_lock(parent, AuLock_IR); - -out: - dput(parent); - return err; -} diff --git a/fs/aufs/cpup.h b/fs/aufs/cpup.h deleted file mode 100644 index ccba2c427..000000000 --- a/fs/aufs/cpup.h +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * copy-up/down functions - */ - -#ifndef __AUFS_CPUP_H__ -#define __AUFS_CPUP_H__ - -#ifdef __KERNEL__ - -#include <linux/path.h> - -struct inode; -struct file; -struct au_pin; - -void au_cpup_attr_flags(struct inode *dst, unsigned int iflags); -void au_cpup_attr_timesizes(struct inode *inode); -void au_cpup_attr_nlink(struct inode *inode, int force); -void au_cpup_attr_changeable(struct inode *inode); -void au_cpup_igen(struct inode *inode, struct inode *h_inode); -void au_cpup_attr_all(struct inode *inode, int force); - -/* ---------------------------------------------------------------------- */ - -struct au_cp_generic { - struct dentry *dentry; - aufs_bindex_t bdst, bsrc; - loff_t len; - struct au_pin *pin; - unsigned int flags; -}; - -/* cpup flags */ -#define AuCpup_DTIME 1 /* do dtime_store/revert */ -#define AuCpup_KEEPLINO (1 << 1) /* do not clear the lower xino, - for link(2) */ -#define AuCpup_RENAME (1 << 2) /* rename after cpup */ -#define AuCpup_HOPEN (1 << 3) /* call h_open_pre/post() in - cpup */ -#define AuCpup_OVERWRITE (1 << 4) /* allow overwriting the - existing entry */ -#define AuCpup_RWDST (1 << 5) /* force write target even if - the branch is marked as RO */ - -#define au_ftest_cpup(flags, name) ((flags) & AuCpup_##name) -#define au_fset_cpup(flags, name) \ - do { (flags) |= AuCpup_##name; } while (0) -#define au_fclr_cpup(flags, name) \ - do { (flags) &= ~AuCpup_##name; } while (0) - -int au_copy_file(struct file *dst, struct file *src, loff_t len); -int au_sio_cpup_simple(struct au_cp_generic *cpg); -int au_sio_cpdown_simple(struct au_cp_generic *cpg); -int au_sio_cpup_wh(struct au_cp_generic *cpg, struct file *file); - -int au_cp_dirs(struct dentry *dentry, aufs_bindex_t bdst, - int (*cp)(struct dentry *dentry, aufs_bindex_t bdst, - struct au_pin *pin, - struct dentry *h_parent, void *arg), - void *arg); -int au_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst); -int au_test_and_cpup_dirs(struct dentry *dentry, aufs_bindex_t bdst); - -/* ---------------------------------------------------------------------- */ - -/* keep timestamps when copyup */ -struct au_dtime { - struct dentry *dt_dentry; - struct path dt_h_path; - struct timespec dt_atime, dt_mtime; -}; -void au_dtime_store(struct au_dtime *dt, struct dentry *dentry, - struct path *h_path); -void au_dtime_revert(struct au_dtime *dt); - -#endif /* __KERNEL__ */ -#endif /* __AUFS_CPUP_H__ */ diff --git a/fs/aufs/dbgaufs.c b/fs/aufs/dbgaufs.c deleted file mode 100644 index 0aefb5ed8..000000000 --- a/fs/aufs/dbgaufs.c +++ /dev/null @@ -1,419 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * debugfs interface - */ - -#include <linux/debugfs.h> -#include "aufs.h" - -#ifndef CONFIG_SYSFS -#error DEBUG_FS depends upon SYSFS -#endif - -static struct dentry *dbgaufs; -static const mode_t dbgaufs_mode = S_IRUSR | S_IRGRP | S_IROTH; - -/* 20 is max digits length of ulong 64 */ -struct dbgaufs_arg { - int n; - char a[20 * 4]; -}; - -/* - * common function for all XINO files - */ -static int dbgaufs_xi_release(struct inode *inode __maybe_unused, - struct file *file) -{ - kfree(file->private_data); - return 0; -} - -static int dbgaufs_xi_open(struct file *xf, struct file *file, int do_fcnt) -{ - int err; - struct kstat st; - struct dbgaufs_arg *p; - - err = -ENOMEM; - p = kmalloc(sizeof(*p), GFP_NOFS); - if (unlikely(!p)) - goto out; - - err = 0; - p->n = 0; - file->private_data = p; - if (!xf) - goto out; - - err = vfs_getattr(&xf->f_path, &st); - if (!err) { - if (do_fcnt) - p->n = snprintf - (p->a, sizeof(p->a), "%ld, %llux%lu %lld\n", - (long)file_count(xf), st.blocks, st.blksize, - (long long)st.size); - else - p->n = snprintf(p->a, sizeof(p->a), "%llux%lu %lld\n", - st.blocks, st.blksize, - (long long)st.size); - AuDebugOn(p->n >= sizeof(p->a)); - } else { - p->n = snprintf(p->a, sizeof(p->a), "err %d\n", err); - err = 0; - } - -out: - return err; - -} - -static ssize_t dbgaufs_xi_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) -{ - struct dbgaufs_arg *p; - - p = file->private_data; - return simple_read_from_buffer(buf, count, ppos, p->a, p->n); -} - -/* ---------------------------------------------------------------------- */ - -struct dbgaufs_plink_arg { - int n; - char a[]; -}; - -static int dbgaufs_plink_release(struct inode *inode __maybe_unused, - struct file *file) -{ - free_page((unsigned long)file->private_data); - return 0; -} - -static int dbgaufs_plink_open(struct inode *inode, struct file *file) -{ - int err, i, limit; - unsigned long n, sum; - struct dbgaufs_plink_arg *p; - struct au_sbinfo *sbinfo; - struct super_block *sb; - struct au_sphlhead *sphl; - - err = -ENOMEM; - p = (void *)get_zeroed_page(GFP_NOFS); - if (unlikely(!p)) - goto out; - - err = -EFBIG; - sbinfo = inode->i_private; - sb = sbinfo->si_sb; - si_noflush_read_lock(sb); - if (au_opt_test(au_mntflags(sb), PLINK)) { - limit = PAGE_SIZE - sizeof(p->n); - - /* the number of buckets */ - n = snprintf(p->a + p->n, limit, "%d\n", AuPlink_NHASH); - p->n += n; - limit -= n; - - sum = 0; - for (i = 0, sphl = sbinfo->si_plink; - i < AuPlink_NHASH; - i++, sphl++) { - n = au_sphl_count(sphl); - sum += n; - - n = snprintf(p->a + p->n, limit, "%lu ", n); - p->n += n; - limit -= n; - if (unlikely(limit <= 0)) - goto out_free; - } - p->a[p->n - 1] = '\n'; - - /* the sum of plinks */ - n = snprintf(p->a + p->n, limit, "%lu\n", sum); - p->n += n; - limit -= n; - if (unlikely(limit <= 0)) - goto out_free; - } else { -#define str "1\n0\n0\n" - p->n = sizeof(str) - 1; - strcpy(p->a, str); -#undef str - } - si_read_unlock(sb); - - err = 0; - file->private_data = p; - goto out; /* success */ - -out_free: - free_page((unsigned long)p); -out: - return err; -} - -static ssize_t dbgaufs_plink_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) -{ - struct dbgaufs_plink_arg *p; - - p = file->private_data; - return simple_read_from_buffer(buf, count, ppos, p->a, p->n); -} - -static const struct file_operations dbgaufs_plink_fop = { - .owner = THIS_MODULE, - .open = dbgaufs_plink_open, - .release = dbgaufs_plink_release, - .read = dbgaufs_plink_read -}; - -/* ---------------------------------------------------------------------- */ - -static int dbgaufs_xib_open(struct inode *inode, struct file *file) -{ - int err; - struct au_sbinfo *sbinfo; - struct super_block *sb; - - sbinfo = inode->i_private; - sb = sbinfo->si_sb; - si_noflush_read_lock(sb); - err = dbgaufs_xi_open(sbinfo->si_xib, file, /*do_fcnt*/0); - si_read_unlock(sb); - return err; -} - -static const struct file_operations dbgaufs_xib_fop = { - .owner = THIS_MODULE, - .open = dbgaufs_xib_open, - .release = dbgaufs_xi_release, - .read = dbgaufs_xi_read -}; - -/* ---------------------------------------------------------------------- */ - -#define DbgaufsXi_PREFIX "xi" - -static int dbgaufs_xino_open(struct inode *inode, struct file *file) -{ - int err; - long l; - struct au_sbinfo *sbinfo; - struct super_block *sb; - struct file *xf; - struct qstr *name; - - err = -ENOENT; - xf = NULL; - name = &file->f_path.dentry->d_name; - if (unlikely(name->len < sizeof(DbgaufsXi_PREFIX) - || memcmp(name->name, DbgaufsXi_PREFIX, - sizeof(DbgaufsXi_PREFIX) - 1))) - goto out; - err = kstrtol(name->name + sizeof(DbgaufsXi_PREFIX) - 1, 10, &l); - if (unlikely(err)) - goto out; - - sbinfo = inode->i_private; - sb = sbinfo->si_sb; - si_noflush_read_lock(sb); - if (l <= au_sbend(sb)) { - xf = au_sbr(sb, (aufs_bindex_t)l)->br_xino.xi_file; - err = dbgaufs_xi_open(xf, file, /*do_fcnt*/1); - } else - err = -ENOENT; - si_read_unlock(sb); - -out: - return err; -} - -static const struct file_operations dbgaufs_xino_fop = { - .owner = THIS_MODULE, - .open = dbgaufs_xino_open, - .release = dbgaufs_xi_release, - .read = dbgaufs_xi_read -}; - -void dbgaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex) -{ - aufs_bindex_t bend; - struct au_branch *br; - struct au_xino_file *xi; - - if (!au_sbi(sb)->si_dbgaufs) - return; - - bend = au_sbend(sb); - for (; bindex <= bend; bindex++) { - br = au_sbr(sb, bindex); - xi = &br->br_xino; - debugfs_remove(xi->xi_dbgaufs); - xi->xi_dbgaufs = NULL; - } -} - -void dbgaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex) -{ - struct au_sbinfo *sbinfo; - struct dentry *parent; - struct au_branch *br; - struct au_xino_file *xi; - aufs_bindex_t bend; - char name[sizeof(DbgaufsXi_PREFIX) + 5]; /* "xi" bindex NULL */ - - sbinfo = au_sbi(sb); - parent = sbinfo->si_dbgaufs; - if (!parent) - return; - - bend = au_sbend(sb); - for (; bindex <= bend; bindex++) { - snprintf(name, sizeof(name), DbgaufsXi_PREFIX "%d", bindex); - br = au_sbr(sb, bindex); - xi = &br->br_xino; - AuDebugOn(xi->xi_dbgaufs); - xi->xi_dbgaufs = debugfs_create_file(name, dbgaufs_mode, parent, - sbinfo, &dbgaufs_xino_fop); - /* ignore an error */ - if (unlikely(!xi->xi_dbgaufs)) - AuWarn1("failed %s under debugfs\n", name); - } -} - -/* ---------------------------------------------------------------------- */ - -#ifdef CONFIG_AUFS_EXPORT -static int dbgaufs_xigen_open(struct inode *inode, struct file *file) -{ - int err; - struct au_sbinfo *sbinfo; - struct super_block *sb; - - sbinfo = inode->i_private; - sb = sbinfo->si_sb; - si_noflush_read_lock(sb); - err = dbgaufs_xi_open(sbinfo->si_xigen, file, /*do_fcnt*/0); - si_read_unlock(sb); - return err; -} - -static const struct file_operations dbgaufs_xigen_fop = { - .owner = THIS_MODULE, - .open = dbgaufs_xigen_open, - .release = dbgaufs_xi_release, - .read = dbgaufs_xi_read -}; - -static int dbgaufs_xigen_init(struct au_sbinfo *sbinfo) -{ - int err; - - /* - * This function is a dynamic '__init' function actually, - * so the tiny check for si_rwsem is unnecessary. - */ - /* AuRwMustWriteLock(&sbinfo->si_rwsem); */ - - err = -EIO; - sbinfo->si_dbgaufs_xigen = debugfs_create_file - ("xigen", dbgaufs_mode, sbinfo->si_dbgaufs, sbinfo, - &dbgaufs_xigen_fop); - if (sbinfo->si_dbgaufs_xigen) - err = 0; - - return err; -} -#else -static int dbgaufs_xigen_init(struct au_sbinfo *sbinfo) -{ - return 0; -} -#endif /* CONFIG_AUFS_EXPORT */ - -/* ---------------------------------------------------------------------- */ - -void dbgaufs_si_fin(struct au_sbinfo *sbinfo) -{ - /* - * This function is a dynamic '__fin' function actually, - * so the tiny check for si_rwsem is unnecessary. - */ - /* AuRwMustWriteLock(&sbinfo->si_rwsem); */ - - debugfs_remove_recursive(sbinfo->si_dbgaufs); - sbinfo->si_dbgaufs = NULL; - kobject_put(&sbinfo->si_kobj); -} - -int dbgaufs_si_init(struct au_sbinfo *sbinfo) -{ - int err; - char name[SysaufsSiNameLen]; - - /* - * This function is a dynamic '__init' function actually, - * so the tiny check for si_rwsem is unnecessary. - */ - /* AuRwMustWriteLock(&sbinfo->si_rwsem); */ - - err = -ENOENT; - if (!dbgaufs) { - AuErr1("/debug/aufs is uninitialized\n"); - goto out; - } - - err = -EIO; - sysaufs_name(sbinfo, name); - sbinfo->si_dbgaufs = debugfs_create_dir(name, dbgaufs); - if (unlikely(!sbinfo->si_dbgaufs)) - goto out; - kobject_get(&sbinfo->si_kobj); - - sbinfo->si_dbgaufs_xib = debugfs_create_file - ("xib", dbgaufs_mode, sbinfo->si_dbgaufs, sbinfo, - &dbgaufs_xib_fop); - if (unlikely(!sbinfo->si_dbgaufs_xib)) - goto out_dir; - - sbinfo->si_dbgaufs_plink = debugfs_create_file - ("plink", dbgaufs_mode, sbinfo->si_dbgaufs, sbinfo, - &dbgaufs_plink_fop); - if (unlikely(!sbinfo->si_dbgaufs_plink)) - goto out_dir; - - err = dbgaufs_xigen_init(sbinfo); - if (!err) - goto out; /* success */ - -out_dir: - dbgaufs_si_fin(sbinfo); -out: - return err; -} - -/* ---------------------------------------------------------------------- */ - -void dbgaufs_fin(void) -{ - debugfs_remove(dbgaufs); -} - -int __init dbgaufs_init(void) -{ - int err; - - err = -EIO; - dbgaufs = debugfs_create_dir(AUFS_NAME, NULL); - if (dbgaufs) - err = 0; - return err; -} diff --git a/fs/aufs/dbgaufs.h b/fs/aufs/dbgaufs.h deleted file mode 100644 index 81f272e42..000000000 --- a/fs/aufs/dbgaufs.h +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * debugfs interface - */ - -#ifndef __DBGAUFS_H__ -#define __DBGAUFS_H__ - -#ifdef __KERNEL__ - -struct super_block; -struct au_sbinfo; - -#ifdef CONFIG_DEBUG_FS -/* dbgaufs.c */ -void dbgaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex); -void dbgaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex); -void dbgaufs_si_fin(struct au_sbinfo *sbinfo); -int dbgaufs_si_init(struct au_sbinfo *sbinfo); -void dbgaufs_fin(void); -int __init dbgaufs_init(void); -#else -AuStubVoid(dbgaufs_brs_del, struct super_block *sb, aufs_bindex_t bindex) -AuStubVoid(dbgaufs_brs_add, struct super_block *sb, aufs_bindex_t bindex) -AuStubVoid(dbgaufs_si_fin, struct au_sbinfo *sbinfo) -AuStubInt0(dbgaufs_si_init, struct au_sbinfo *sbinfo) -AuStubVoid(dbgaufs_fin, void) -AuStubInt0(__init dbgaufs_init, void) -#endif /* CONFIG_DEBUG_FS */ - -#endif /* __KERNEL__ */ -#endif /* __DBGAUFS_H__ */ diff --git a/fs/aufs/dcsub.c b/fs/aufs/dcsub.c deleted file mode 100644 index e72accebb..000000000 --- a/fs/aufs/dcsub.c +++ /dev/null @@ -1,211 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * sub-routines for dentry cache - */ - -#include "aufs.h" - -static void au_dpage_free(struct au_dpage *dpage) -{ - int i; - struct dentry **p; - - p = dpage->dentries; - for (i = 0; i < dpage->ndentry; i++) - dput(*p++); - free_page((unsigned long)dpage->dentries); -} - -int au_dpages_init(struct au_dcsub_pages *dpages, gfp_t gfp) -{ - int err; - void *p; - - err = -ENOMEM; - dpages->dpages = kmalloc(sizeof(*dpages->dpages), gfp); - if (unlikely(!dpages->dpages)) - goto out; - - p = (void *)__get_free_page(gfp); - if (unlikely(!p)) - goto out_dpages; - - dpages->dpages[0].ndentry = 0; - dpages->dpages[0].dentries = p; - dpages->ndpage = 1; - return 0; /* success */ - -out_dpages: - kfree(dpages->dpages); -out: - return err; -} - -void au_dpages_free(struct au_dcsub_pages *dpages) -{ - int i; - struct au_dpage *p; - - p = dpages->dpages; - for (i = 0; i < dpages->ndpage; i++) - au_dpage_free(p++); - kfree(dpages->dpages); -} - -static int au_dpages_append(struct au_dcsub_pages *dpages, - struct dentry *dentry, gfp_t gfp) -{ - int err, sz; - struct au_dpage *dpage; - void *p; - - dpage = dpages->dpages + dpages->ndpage - 1; - sz = PAGE_SIZE / sizeof(dentry); - if (unlikely(dpage->ndentry >= sz)) { - AuLabel(new dpage); - err = -ENOMEM; - sz = dpages->ndpage * sizeof(*dpages->dpages); - p = au_kzrealloc(dpages->dpages, sz, - sz + sizeof(*dpages->dpages), gfp); - if (unlikely(!p)) - goto out; - - dpages->dpages = p; - dpage = dpages->dpages + dpages->ndpage; - p = (void *)__get_free_page(gfp); - if (unlikely(!p)) - goto out; - - dpage->ndentry = 0; - dpage->dentries = p; - dpages->ndpage++; - } - - AuDebugOn(au_dcount(dentry) <= 0); - dpage->dentries[dpage->ndentry++] = dget_dlock(dentry); - return 0; /* success */ - -out: - return err; -} - -/* todo: BAD approach */ -/* copied from linux/fs/dcache.c */ -enum d_walk_ret { - D_WALK_CONTINUE, - D_WALK_QUIT, - D_WALK_NORETRY, - D_WALK_SKIP, -}; - -extern void d_walk(struct dentry *parent, void *data, - enum d_walk_ret (*enter)(void *, struct dentry *), - void (*finish)(void *)); - -struct ac_dpages_arg { - int err; - struct au_dcsub_pages *dpages; - struct super_block *sb; - au_dpages_test test; - void *arg; -}; - -static enum d_walk_ret au_call_dpages_append(void *_arg, struct dentry *dentry) -{ - enum d_walk_ret ret; - struct ac_dpages_arg *arg = _arg; - - ret = D_WALK_CONTINUE; - if (dentry->d_sb == arg->sb - && !IS_ROOT(dentry) - && au_dcount(dentry) > 0 - && au_di(dentry) - && (!arg->test || arg->test(dentry, arg->arg))) { - arg->err = au_dpages_append(arg->dpages, dentry, GFP_ATOMIC); - if (unlikely(arg->err)) - ret = D_WALK_QUIT; - } - - return ret; -} - -int au_dcsub_pages(struct au_dcsub_pages *dpages, struct dentry *root, - au_dpages_test test, void *arg) -{ - struct ac_dpages_arg args = { - .err = 0, - .dpages = dpages, - .sb = root->d_sb, - .test = test, - .arg = arg - }; - - d_walk(root, &args, au_call_dpages_append, NULL); - - return args.err; -} - -int au_dcsub_pages_rev(struct au_dcsub_pages *dpages, struct dentry *dentry, - int do_include, au_dpages_test test, void *arg) -{ - int err; - - err = 0; - write_seqlock(&rename_lock); - spin_lock(&dentry->d_lock); - if (do_include - && au_dcount(dentry) > 0 - && (!test || test(dentry, arg))) - err = au_dpages_append(dpages, dentry, GFP_ATOMIC); - spin_unlock(&dentry->d_lock); - if (unlikely(err)) - goto out; - - /* - * RCU for vfsmount is unnecessary since this is a traverse in a single - * mount - */ - while (!IS_ROOT(dentry)) { - dentry = dentry->d_parent; /* rename_lock is locked */ - spin_lock(&dentry->d_lock); - if (au_dcount(dentry) > 0 - && (!test || test(dentry, arg))) - err = au_dpages_append(dpages, dentry, GFP_ATOMIC); - spin_unlock(&dentry->d_lock); - if (unlikely(err)) - break; - } - -out: - write_sequnlock(&rename_lock); - return err; -} - -static inline int au_dcsub_dpages_aufs(struct dentry *dentry, void *arg) -{ - return au_di(dentry) && dentry->d_sb == arg; -} - -int au_dcsub_pages_rev_aufs(struct au_dcsub_pages *dpages, - struct dentry *dentry, int do_include) -{ - return au_dcsub_pages_rev(dpages, dentry, do_include, - au_dcsub_dpages_aufs, dentry->d_sb); -} - -int au_test_subdir(struct dentry *d1, struct dentry *d2) -{ - struct path path[2] = { - { - .dentry = d1 - }, - { - .dentry = d2 - } - }; - - return path_is_under(path + 0, path + 1); -} diff --git a/fs/aufs/dcsub.h b/fs/aufs/dcsub.h deleted file mode 100644 index 5d2cf661d..000000000 --- a/fs/aufs/dcsub.h +++ /dev/null @@ -1,123 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * sub-routines for dentry cache - */ - -#ifndef __AUFS_DCSUB_H__ -#define __AUFS_DCSUB_H__ - -#ifdef __KERNEL__ - -#include <linux/dcache.h> -#include <linux/fs.h> - -struct au_dpage { - int ndentry; - struct dentry **dentries; -}; - -struct au_dcsub_pages { - int ndpage; - struct au_dpage *dpages; -}; - -/* ---------------------------------------------------------------------- */ - -/* dcsub.c */ -int au_dpages_init(struct au_dcsub_pages *dpages, gfp_t gfp); -void au_dpages_free(struct au_dcsub_pages *dpages); -typedef int (*au_dpages_test)(struct dentry *dentry, void *arg); -int au_dcsub_pages(struct au_dcsub_pages *dpages, struct dentry *root, - au_dpages_test test, void *arg); -int au_dcsub_pages_rev(struct au_dcsub_pages *dpages, struct dentry *dentry, - int do_include, au_dpages_test test, void *arg); -int au_dcsub_pages_rev_aufs(struct au_dcsub_pages *dpages, - struct dentry *dentry, int do_include); -int au_test_subdir(struct dentry *d1, struct dentry *d2); - -/* ---------------------------------------------------------------------- */ - -/* - * todo: in linux-3.13, several similar (but faster) helpers are added to - * include/linux/dcache.h. Try them (in the future). - */ - -static inline int au_d_hashed_positive(struct dentry *d) -{ - int err; - struct inode *inode = d_inode(d); - - err = 0; - if (unlikely(d_unhashed(d) - || d_is_negative(d) - || !inode->i_nlink)) - err = -ENOENT; - return err; -} - -static inline int au_d_linkable(struct dentry *d) -{ - int err; - struct inode *inode = d_inode(d); - - err = au_d_hashed_positive(d); - if (err - && d_is_positive(d) - && (inode->i_state & I_LINKABLE)) - err = 0; - return err; -} - -static inline int au_d_alive(struct dentry *d) -{ - int err; - struct inode *inode; - - err = 0; - if (!IS_ROOT(d)) - err = au_d_hashed_positive(d); - else { - inode = d_inode(d); - if (unlikely(d_unlinked(d) - || d_is_negative(d) - || !inode->i_nlink)) - err = -ENOENT; - } - return err; -} - -static inline int au_alive_dir(struct dentry *d) -{ - int err; - - err = au_d_alive(d); - if (unlikely(err || IS_DEADDIR(d_inode(d)))) - err = -ENOENT; - return err; -} - -static inline int au_qstreq(struct qstr *a, struct qstr *b) -{ - return a->len == b->len - && !memcmp(a->name, b->name, a->len); -} - -/* - * by the commit - * 360f547 2015-01-25 dcache: let the dentry count go down to zero without - * taking d_lock - * the type of d_lockref.count became int, but the inlined function d_count() - * still returns unsigned int. - * I don't know why. Maybe it is for every d_count() users? - * Anyway au_dcount() lives on. - */ -static inline int au_dcount(struct dentry *d) -{ - return (int)d_count(d); -} - -#endif /* __KERNEL__ */ -#endif /* __AUFS_DCSUB_H__ */ diff --git a/fs/aufs/debug.c b/fs/aufs/debug.c deleted file mode 100644 index 4529831a9..000000000 --- a/fs/aufs/debug.c +++ /dev/null @@ -1,425 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * debug print functions - */ - -#include "aufs.h" - -/* Returns 0, or -errno. arg is in kp->arg. */ -static int param_atomic_t_set(const char *val, const struct kernel_param *kp) -{ - int err, n; - - err = kstrtoint(val, 0, &n); - if (!err) { - if (n > 0) - au_debug_on(); - else - au_debug_off(); - } - return err; -} - -/* Returns length written or -errno. Buffer is 4k (ie. be short!) */ -static int param_atomic_t_get(char *buffer, const struct kernel_param *kp) -{ - atomic_t *a; - - a = kp->arg; - return sprintf(buffer, "%d", atomic_read(a)); -} - -static struct kernel_param_ops param_ops_atomic_t = { - .set = param_atomic_t_set, - .get = param_atomic_t_get - /* void (*free)(void *arg) */ -}; - -atomic_t aufs_debug = ATOMIC_INIT(0); -MODULE_PARM_DESC(debug, "debug print"); -module_param_named(debug, aufs_debug, atomic_t, S_IRUGO | S_IWUSR | S_IWGRP); - -DEFINE_MUTEX(au_dbg_mtx); /* just to serialize the dbg msgs */ -char *au_plevel = KERN_DEBUG; -#define dpri(fmt, ...) do { \ - if ((au_plevel \ - && strcmp(au_plevel, KERN_DEBUG)) \ - || au_debug_test()) \ - printk("%s" fmt, au_plevel, ##__VA_ARGS__); \ -} while (0) - -/* ---------------------------------------------------------------------- */ - -void au_dpri_whlist(struct au_nhash *whlist) -{ - unsigned long ul, n; - struct hlist_head *head; - struct au_vdir_wh *pos; - - n = whlist->nh_num; - head = whlist->nh_head; - for (ul = 0; ul < n; ul++) { - hlist_for_each_entry(pos, head, wh_hash) - dpri("b%d, %.*s, %d\n", - pos->wh_bindex, - pos->wh_str.len, pos->wh_str.name, - pos->wh_str.len); - head++; - } -} - -void au_dpri_vdir(struct au_vdir *vdir) -{ - unsigned long ul; - union au_vdir_deblk_p p; - unsigned char *o; - - if (!vdir || IS_ERR(vdir)) { - dpri("err %ld\n", PTR_ERR(vdir)); - return; - } - - dpri("deblk %u, nblk %lu, deblk %p, last{%lu, %p}, ver %lu\n", - vdir->vd_deblk_sz, vdir->vd_nblk, vdir->vd_deblk, - vdir->vd_last.ul, vdir->vd_last.p.deblk, vdir->vd_version); - for (ul = 0; ul < vdir->vd_nblk; ul++) { - p.deblk = vdir->vd_deblk[ul]; - o = p.deblk; - dpri("[%lu]: %p\n", ul, o); - } -} - -static int do_pri_inode(aufs_bindex_t bindex, struct inode *inode, int hn, - struct dentry *wh) -{ - char *n = NULL; - int l = 0; - - if (!inode || IS_ERR(inode)) { - dpri("i%d: err %ld\n", bindex, PTR_ERR(inode)); - return -1; - } - - /* the type of i_blocks depends upon CONFIG_LBDAF */ - BUILD_BUG_ON(sizeof(inode->i_blocks) != sizeof(unsigned long) - && sizeof(inode->i_blocks) != sizeof(u64)); - if (wh) { - n = (void *)wh->d_name.name; - l = wh->d_name.len; - } - - dpri("i%d: %p, i%lu, %s, cnt %d, nl %u, 0%o, sz %llu, blk %llu," - " hn %d, ct %lld, np %lu, st 0x%lx, f 0x%x, v %llu, g %x%s%.*s\n", - bindex, inode, - inode->i_ino, inode->i_sb ? au_sbtype(inode->i_sb) : "??", - atomic_read(&inode->i_count), inode->i_nlink, inode->i_mode, - i_size_read(inode), (unsigned long long)inode->i_blocks, - hn, (long long)timespec_to_ns(&inode->i_ctime) & 0x0ffff, - inode->i_mapping ? inode->i_mapping->nrpages : 0, - inode->i_state, inode->i_flags, inode->i_version, - inode->i_generation, - l ? ", wh " : "", l, n); - return 0; -} - -void au_dpri_inode(struct inode *inode) -{ - struct au_iinfo *iinfo; - aufs_bindex_t bindex; - int err, hn; - - err = do_pri_inode(-1, inode, -1, NULL); - if (err || !au_test_aufs(inode->i_sb)) - return; - - iinfo = au_ii(inode); - if (!iinfo) - return; - dpri("i-1: bstart %d, bend %d, gen %d\n", - iinfo->ii_bstart, iinfo->ii_bend, au_iigen(inode, NULL)); - if (iinfo->ii_bstart < 0) - return; - hn = 0; - for (bindex = iinfo->ii_bstart; bindex <= iinfo->ii_bend; bindex++) { - hn = !!au_hn(iinfo->ii_hinode + bindex); - do_pri_inode(bindex, iinfo->ii_hinode[0 + bindex].hi_inode, hn, - iinfo->ii_hinode[0 + bindex].hi_whdentry); - } -} - -void au_dpri_dalias(struct inode *inode) -{ - struct dentry *d; - - spin_lock(&inode->i_lock); - hlist_for_each_entry(d, &inode->i_dentry, d_u.d_alias) - au_dpri_dentry(d); - spin_unlock(&inode->i_lock); -} - -static int do_pri_dentry(aufs_bindex_t bindex, struct dentry *dentry) -{ - struct dentry *wh = NULL; - int hn; - struct au_iinfo *iinfo; - - if (!dentry || IS_ERR(dentry)) { - dpri("d%d: err %ld\n", bindex, PTR_ERR(dentry)); - return -1; - } - /* do not call dget_parent() here */ - /* note: access d_xxx without d_lock */ - dpri("d%d: %p, %pd2?, %s, cnt %d, flags 0x%x, %shashed\n", - bindex, dentry, dentry, - dentry->d_sb ? au_sbtype(dentry->d_sb) : "??", - au_dcount(dentry), dentry->d_flags, - d_unhashed(dentry) ? "un" : ""); - hn = -1; - if (bindex >= 0 - && d_is_positive(dentry) - && au_test_aufs(dentry->d_sb)) { - iinfo = au_ii(d_inode(dentry)); - if (iinfo) { - hn = !!au_hn(iinfo->ii_hinode + bindex); - wh = iinfo->ii_hinode[0 + bindex].hi_whdentry; - } - } - do_pri_inode(bindex, d_inode(dentry), hn, wh); - return 0; -} - -void au_dpri_dentry(struct dentry *dentry) -{ - struct au_dinfo *dinfo; - aufs_bindex_t bindex; - int err; - struct au_hdentry *hdp; - - err = do_pri_dentry(-1, dentry); - if (err || !au_test_aufs(dentry->d_sb)) - return; - - dinfo = au_di(dentry); - if (!dinfo) - return; - dpri("d-1: bstart %d, bend %d, bwh %d, bdiropq %d, gen %d, tmp %d\n", - dinfo->di_bstart, dinfo->di_bend, - dinfo->di_bwh, dinfo->di_bdiropq, au_digen(dentry), - dinfo->di_tmpfile); - if (dinfo->di_bstart < 0) - return; - hdp = dinfo->di_hdentry; - for (bindex = dinfo->di_bstart; bindex <= dinfo->di_bend; bindex++) - do_pri_dentry(bindex, hdp[0 + bindex].hd_dentry); -} - -static int do_pri_file(aufs_bindex_t bindex, struct file *file) -{ - char a[32]; - - if (!file || IS_ERR(file)) { - dpri("f%d: err %ld\n", bindex, PTR_ERR(file)); - return -1; - } - a[0] = 0; - if (bindex < 0 - && !IS_ERR_OR_NULL(file->f_path.dentry) - && au_test_aufs(file->f_path.dentry->d_sb) - && au_fi(file)) - snprintf(a, sizeof(a), ", gen %d, mmapped %d", - au_figen(file), atomic_read(&au_fi(file)->fi_mmapped)); - dpri("f%d: mode 0x%x, flags 0%o, cnt %ld, v %llu, pos %llu%s\n", - bindex, file->f_mode, file->f_flags, (long)file_count(file), - file->f_version, file->f_pos, a); - if (!IS_ERR_OR_NULL(file->f_path.dentry)) - do_pri_dentry(bindex, file->f_path.dentry); - return 0; -} - -void au_dpri_file(struct file *file) -{ - struct au_finfo *finfo; - struct au_fidir *fidir; - struct au_hfile *hfile; - aufs_bindex_t bindex; - int err; - - err = do_pri_file(-1, file); - if (err - || IS_ERR_OR_NULL(file->f_path.dentry) - || !au_test_aufs(file->f_path.dentry->d_sb)) - return; - - finfo = au_fi(file); - if (!finfo) - return; - if (finfo->fi_btop < 0) - return; - fidir = finfo->fi_hdir; - if (!fidir) - do_pri_file(finfo->fi_btop, finfo->fi_htop.hf_file); - else - for (bindex = finfo->fi_btop; - bindex >= 0 && bindex <= fidir->fd_bbot; - bindex++) { - hfile = fidir->fd_hfile + bindex; - do_pri_file(bindex, hfile ? hfile->hf_file : NULL); - } -} - -static int do_pri_br(aufs_bindex_t bindex, struct au_branch *br) -{ - struct vfsmount *mnt; - struct super_block *sb; - - if (!br || IS_ERR(br)) - goto out; - mnt = au_br_mnt(br); - if (!mnt || IS_ERR(mnt)) - goto out; - sb = mnt->mnt_sb; - if (!sb || IS_ERR(sb)) - goto out; - - dpri("s%d: {perm 0x%x, id %d, cnt %d, wbr %p}, " - "%s, dev 0x%02x%02x, flags 0x%lx, cnt %d, active %d, " - "xino %d\n", - bindex, br->br_perm, br->br_id, atomic_read(&br->br_count), - br->br_wbr, au_sbtype(sb), MAJOR(sb->s_dev), MINOR(sb->s_dev), - sb->s_flags, sb->s_count, - atomic_read(&sb->s_active), !!br->br_xino.xi_file); - return 0; - -out: - dpri("s%d: err %ld\n", bindex, PTR_ERR(br)); - return -1; -} - -void au_dpri_sb(struct super_block *sb) -{ - struct au_sbinfo *sbinfo; - aufs_bindex_t bindex; - int err; - /* to reuduce stack size */ - struct { - struct vfsmount mnt; - struct au_branch fake; - } *a; - - /* this function can be called from magic sysrq */ - a = kzalloc(sizeof(*a), GFP_ATOMIC); - if (unlikely(!a)) { - dpri("no memory\n"); - return; - } - - a->mnt.mnt_sb = sb; - a->fake.br_path.mnt = &a->mnt; - atomic_set(&a->fake.br_count, 0); - smp_mb(); /* atomic_set */ - err = do_pri_br(-1, &a->fake); - kfree(a); - dpri("dev 0x%x\n", sb->s_dev); - if (err || !au_test_aufs(sb)) - return; - - sbinfo = au_sbi(sb); - if (!sbinfo) - return; - dpri("nw %d, gen %u, kobj %d\n", - atomic_read(&sbinfo->si_nowait.nw_len), sbinfo->si_generation, - atomic_read(&sbinfo->si_kobj.kref.refcount)); - for (bindex = 0; bindex <= sbinfo->si_bend; bindex++) - do_pri_br(bindex, sbinfo->si_branch[0 + bindex]); -} - -/* ---------------------------------------------------------------------- */ - -void __au_dbg_verify_dinode(struct dentry *dentry, const char *func, int line) -{ - struct inode *h_inode, *inode = d_inode(dentry); - struct dentry *h_dentry; - aufs_bindex_t bindex, bend, bi; - - if (!inode /* || au_di(dentry)->di_lsc == AuLsc_DI_TMP */) - return; - - bend = au_dbend(dentry); - bi = au_ibend(inode); - if (bi < bend) - bend = bi; - bindex = au_dbstart(dentry); - bi = au_ibstart(inode); - if (bi > bindex) - bindex = bi; - - for (; bindex <= bend; bindex++) { - h_dentry = au_h_dptr(dentry, bindex); - if (!h_dentry) - continue; - h_inode = au_h_iptr(inode, bindex); - if (unlikely(h_inode != d_inode(h_dentry))) { - au_debug_on(); - AuDbg("b%d, %s:%d\n", bindex, func, line); - AuDbgDentry(dentry); - AuDbgInode(inode); - au_debug_off(); - BUG(); - } - } -} - -void au_dbg_verify_gen(struct dentry *parent, unsigned int sigen) -{ - int err, i, j; - struct au_dcsub_pages dpages; - struct au_dpage *dpage; - struct dentry **dentries; - - err = au_dpages_init(&dpages, GFP_NOFS); - AuDebugOn(err); - err = au_dcsub_pages_rev_aufs(&dpages, parent, /*do_include*/1); - AuDebugOn(err); - for (i = dpages.ndpage - 1; !err && i >= 0; i--) { - dpage = dpages.dpages + i; - dentries = dpage->dentries; - for (j = dpage->ndentry - 1; !err && j >= 0; j--) - AuDebugOn(au_digen_test(dentries[j], sigen)); - } - au_dpages_free(&dpages); -} - -void au_dbg_verify_kthread(void) -{ - if (au_wkq_test()) { - au_dbg_blocked(); - /* - * It may be recursive, but udba=notify between two aufs mounts, - * where a single ro branch is shared, is not a problem. - */ - /* WARN_ON(1); */ - } -} - -/* ---------------------------------------------------------------------- */ - -int __init au_debug_init(void) -{ - aufs_bindex_t bindex; - struct au_vdir_destr destr; - - bindex = -1; - AuDebugOn(bindex >= 0); - - destr.len = -1; - AuDebugOn(destr.len < NAME_MAX); - -#ifdef CONFIG_4KSTACKS - pr_warn("CONFIG_4KSTACKS is defined.\n"); -#endif - - return 0; -} diff --git a/fs/aufs/debug.h b/fs/aufs/debug.h deleted file mode 100644 index 0567f31d0..000000000 --- a/fs/aufs/debug.h +++ /dev/null @@ -1,212 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * debug print functions - */ - -#ifndef __AUFS_DEBUG_H__ -#define __AUFS_DEBUG_H__ - -#ifdef __KERNEL__ - -#include <linux/atomic.h> -#include <linux/module.h> -#include <linux/kallsyms.h> -#include <linux/sysrq.h> - -#ifdef CONFIG_AUFS_DEBUG -#define AuDebugOn(a) BUG_ON(a) - -/* module parameter */ -extern atomic_t aufs_debug; -static inline void au_debug_on(void) -{ - atomic_inc(&aufs_debug); -} -static inline void au_debug_off(void) -{ - atomic_dec_if_positive(&aufs_debug); -} - -static inline int au_debug_test(void) -{ - return atomic_read(&aufs_debug) > 0; -} -#else -#define AuDebugOn(a) do {} while (0) -AuStubVoid(au_debug_on, void) -AuStubVoid(au_debug_off, void) -AuStubInt0(au_debug_test, void) -#endif /* CONFIG_AUFS_DEBUG */ - -#define param_check_atomic_t(name, p) __param_check(name, p, atomic_t) - -/* ---------------------------------------------------------------------- */ - -/* debug print */ - -#define AuDbg(fmt, ...) do { \ - if (au_debug_test()) \ - pr_debug("DEBUG: " fmt, ##__VA_ARGS__); \ -} while (0) -#define AuLabel(l) AuDbg(#l "\n") -#define AuIOErr(fmt, ...) pr_err("I/O Error, " fmt, ##__VA_ARGS__) -#define AuWarn1(fmt, ...) do { \ - static unsigned char _c; \ - if (!_c++) \ - pr_warn(fmt, ##__VA_ARGS__); \ -} while (0) - -#define AuErr1(fmt, ...) do { \ - static unsigned char _c; \ - if (!_c++) \ - pr_err(fmt, ##__VA_ARGS__); \ -} while (0) - -#define AuIOErr1(fmt, ...) do { \ - static unsigned char _c; \ - if (!_c++) \ - AuIOErr(fmt, ##__VA_ARGS__); \ -} while (0) - -#define AuUnsupportMsg "This operation is not supported." \ - " Please report this application to aufs-users ML." -#define AuUnsupport(fmt, ...) do { \ - pr_err(AuUnsupportMsg "\n" fmt, ##__VA_ARGS__); \ - dump_stack(); \ -} while (0) - -#define AuTraceErr(e) do { \ - if (unlikely((e) < 0)) \ - AuDbg("err %d\n", (int)(e)); \ -} while (0) - -#define AuTraceErrPtr(p) do { \ - if (IS_ERR(p)) \ - AuDbg("err %ld\n", PTR_ERR(p)); \ -} while (0) - -/* dirty macros for debug print, use with "%.*s" and caution */ -#define AuLNPair(qstr) (qstr)->len, (qstr)->name - -/* ---------------------------------------------------------------------- */ - -struct dentry; -#ifdef CONFIG_AUFS_DEBUG -extern struct mutex au_dbg_mtx; -extern char *au_plevel; -struct au_nhash; -void au_dpri_whlist(struct au_nhash *whlist); -struct au_vdir; -void au_dpri_vdir(struct au_vdir *vdir); -struct inode; -void au_dpri_inode(struct inode *inode); -void au_dpri_dalias(struct inode *inode); -void au_dpri_dentry(struct dentry *dentry); -struct file; -void au_dpri_file(struct file *filp); -struct super_block; -void au_dpri_sb(struct super_block *sb); - -#define au_dbg_verify_dinode(d) __au_dbg_verify_dinode(d, __func__, __LINE__) -void __au_dbg_verify_dinode(struct dentry *dentry, const char *func, int line); -void au_dbg_verify_gen(struct dentry *parent, unsigned int sigen); -void au_dbg_verify_kthread(void); - -int __init au_debug_init(void); - -#define AuDbgWhlist(w) do { \ - mutex_lock(&au_dbg_mtx); \ - AuDbg(#w "\n"); \ - au_dpri_whlist(w); \ - mutex_unlock(&au_dbg_mtx); \ -} while (0) - -#define AuDbgVdir(v) do { \ - mutex_lock(&au_dbg_mtx); \ - AuDbg(#v "\n"); \ - au_dpri_vdir(v); \ - mutex_unlock(&au_dbg_mtx); \ -} while (0) - -#define AuDbgInode(i) do { \ - mutex_lock(&au_dbg_mtx); \ - AuDbg(#i "\n"); \ - au_dpri_inode(i); \ - mutex_unlock(&au_dbg_mtx); \ -} while (0) - -#define AuDbgDAlias(i) do { \ - mutex_lock(&au_dbg_mtx); \ - AuDbg(#i "\n"); \ - au_dpri_dalias(i); \ - mutex_unlock(&au_dbg_mtx); \ -} while (0) - -#define AuDbgDentry(d) do { \ - mutex_lock(&au_dbg_mtx); \ - AuDbg(#d "\n"); \ - au_dpri_dentry(d); \ - mutex_unlock(&au_dbg_mtx); \ -} while (0) - -#define AuDbgFile(f) do { \ - mutex_lock(&au_dbg_mtx); \ - AuDbg(#f "\n"); \ - au_dpri_file(f); \ - mutex_unlock(&au_dbg_mtx); \ -} while (0) - -#define AuDbgSb(sb) do { \ - mutex_lock(&au_dbg_mtx); \ - AuDbg(#sb "\n"); \ - au_dpri_sb(sb); \ - mutex_unlock(&au_dbg_mtx); \ -} while (0) - -#define AuDbgSym(addr) do { \ - char sym[KSYM_SYMBOL_LEN]; \ - sprint_symbol(sym, (unsigned long)addr); \ - AuDbg("%s\n", sym); \ -} while (0) -#else -AuStubVoid(au_dbg_verify_dinode, struct dentry *dentry) -AuStubVoid(au_dbg_verify_gen, struct dentry *parent, unsigned int sigen) -AuStubVoid(au_dbg_verify_kthread, void) -AuStubInt0(__init au_debug_init, void) - -#define AuDbgWhlist(w) do {} while (0) -#define AuDbgVdir(v) do {} while (0) -#define AuDbgInode(i) do {} while (0) -#define AuDbgDAlias(i) do {} while (0) -#define AuDbgDentry(d) do {} while (0) -#define AuDbgFile(f) do {} while (0) -#define AuDbgSb(sb) do {} while (0) -#define AuDbgSym(addr) do {} while (0) -#endif /* CONFIG_AUFS_DEBUG */ - -/* ---------------------------------------------------------------------- */ - -#ifdef CONFIG_AUFS_MAGIC_SYSRQ -int __init au_sysrq_init(void); -void au_sysrq_fin(void); - -#ifdef CONFIG_HW_CONSOLE -#define au_dbg_blocked() do { \ - WARN_ON(1); \ - handle_sysrq('w'); \ -} while (0) -#else -AuStubVoid(au_dbg_blocked, void) -#endif - -#else -AuStubInt0(__init au_sysrq_init, void) -AuStubVoid(au_sysrq_fin, void) -AuStubVoid(au_dbg_blocked, void) -#endif /* CONFIG_AUFS_MAGIC_SYSRQ */ - -#endif /* __KERNEL__ */ -#endif /* __AUFS_DEBUG_H__ */ diff --git a/fs/aufs/dentry.c b/fs/aufs/dentry.c deleted file mode 100644 index e47a7e6c4..000000000 --- a/fs/aufs/dentry.c +++ /dev/null @@ -1,1123 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * lookup and dentry operations - */ - -#include <linux/namei.h> -#include "aufs.h" - -#define AuLkup_ALLOW_NEG 1 -#define AuLkup_IGNORE_PERM (1 << 1) -#define au_ftest_lkup(flags, name) ((flags) & AuLkup_##name) -#define au_fset_lkup(flags, name) \ - do { (flags) |= AuLkup_##name; } while (0) -#define au_fclr_lkup(flags, name) \ - do { (flags) &= ~AuLkup_##name; } while (0) - -struct au_do_lookup_args { - unsigned int flags; - mode_t type; -}; - -/* - * returns positive/negative dentry, NULL or an error. - * NULL means whiteout-ed or not-found. - */ -static struct dentry* -au_do_lookup(struct dentry *h_parent, struct dentry *dentry, - aufs_bindex_t bindex, struct qstr *wh_name, - struct au_do_lookup_args *args) -{ - struct dentry *h_dentry; - struct inode *h_inode; - struct au_branch *br; - int wh_found, opq; - unsigned char wh_able; - const unsigned char allow_neg = !!au_ftest_lkup(args->flags, ALLOW_NEG); - const unsigned char ignore_perm = !!au_ftest_lkup(args->flags, - IGNORE_PERM); - - wh_found = 0; - br = au_sbr(dentry->d_sb, bindex); - wh_able = !!au_br_whable(br->br_perm); - if (wh_able) - wh_found = au_wh_test(h_parent, wh_name, /*try_sio*/0); - h_dentry = ERR_PTR(wh_found); - if (!wh_found) - goto real_lookup; - if (unlikely(wh_found < 0)) - goto out; - - /* We found a whiteout */ - /* au_set_dbend(dentry, bindex); */ - au_set_dbwh(dentry, bindex); - if (!allow_neg) - return NULL; /* success */ - -real_lookup: - if (!ignore_perm) - h_dentry = vfsub_lkup_one(&dentry->d_name, h_parent); - else - h_dentry = au_sio_lkup_one(&dentry->d_name, h_parent); - if (IS_ERR(h_dentry)) { - if (PTR_ERR(h_dentry) == -ENAMETOOLONG - && !allow_neg) - h_dentry = NULL; - goto out; - } - - h_inode = d_inode(h_dentry); - if (d_is_negative(h_dentry)) { - if (!allow_neg) - goto out_neg; - } else if (wh_found - || (args->type && args->type != (h_inode->i_mode & S_IFMT))) - goto out_neg; - - if (au_dbend(dentry) <= bindex) - au_set_dbend(dentry, bindex); - if (au_dbstart(dentry) < 0 || bindex < au_dbstart(dentry)) - au_set_dbstart(dentry, bindex); - au_set_h_dptr(dentry, bindex, h_dentry); - - if (!d_is_dir(h_dentry) - || !wh_able - || (d_really_is_positive(dentry) && !d_is_dir(dentry))) - goto out; /* success */ - - mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); - opq = au_diropq_test(h_dentry); - mutex_unlock(&h_inode->i_mutex); - if (opq > 0) - au_set_dbdiropq(dentry, bindex); - else if (unlikely(opq < 0)) { - au_set_h_dptr(dentry, bindex, NULL); - h_dentry = ERR_PTR(opq); - } - goto out; - -out_neg: - dput(h_dentry); - h_dentry = NULL; -out: - return h_dentry; -} - -static int au_test_shwh(struct super_block *sb, const struct qstr *name) -{ - if (unlikely(!au_opt_test(au_mntflags(sb), SHWH) - && !strncmp(name->name, AUFS_WH_PFX, AUFS_WH_PFX_LEN))) - return -EPERM; - return 0; -} - -/* - * returns the number of lower positive dentries, - * otherwise an error. - * can be called at unlinking with @type is zero. - */ -int au_lkup_dentry(struct dentry *dentry, aufs_bindex_t bstart, mode_t type) -{ - int npositive, err; - aufs_bindex_t bindex, btail, bdiropq; - unsigned char isdir, dirperm1; - struct qstr whname; - struct au_do_lookup_args args = { - .flags = 0, - .type = type - }; - const struct qstr *name = &dentry->d_name; - struct dentry *parent; - struct super_block *sb; - - sb = dentry->d_sb; - err = au_test_shwh(sb, name); - if (unlikely(err)) - goto out; - - err = au_wh_name_alloc(&whname, name); - if (unlikely(err)) - goto out; - - isdir = !!d_is_dir(dentry); - if (!type) - au_fset_lkup(args.flags, ALLOW_NEG); - dirperm1 = !!au_opt_test(au_mntflags(sb), DIRPERM1); - - npositive = 0; - parent = dget_parent(dentry); - btail = au_dbtaildir(parent); - for (bindex = bstart; bindex <= btail; bindex++) { - struct dentry *h_parent, *h_dentry; - struct inode *h_inode, *h_dir; - - h_dentry = au_h_dptr(dentry, bindex); - if (h_dentry) { - if (d_is_positive(h_dentry)) - npositive++; - if (type != S_IFDIR) - break; - continue; - } - h_parent = au_h_dptr(parent, bindex); - if (!h_parent || !d_is_dir(h_parent)) - continue; - - h_dir = d_inode(h_parent); - mutex_lock_nested(&h_dir->i_mutex, AuLsc_I_PARENT); - h_dentry = au_do_lookup(h_parent, dentry, bindex, &whname, - &args); - mutex_unlock(&h_dir->i_mutex); - err = PTR_ERR(h_dentry); - if (IS_ERR(h_dentry)) - goto out_parent; - if (h_dentry) - au_fclr_lkup(args.flags, ALLOW_NEG); - if (dirperm1) - au_fset_lkup(args.flags, IGNORE_PERM); - - if (au_dbwh(dentry) == bindex) - break; - if (!h_dentry) - continue; - if (d_is_negative(h_dentry)) - continue; - h_inode = d_inode(h_dentry); - npositive++; - if (!args.type) - args.type = h_inode->i_mode & S_IFMT; - if (args.type != S_IFDIR) - break; - else if (isdir) { - /* the type of lower may be different */ - bdiropq = au_dbdiropq(dentry); - if (bdiropq >= 0 && bdiropq <= bindex) - break; - } - } - - if (npositive) { - AuLabel(positive); - au_update_dbstart(dentry); - } - err = npositive; - if (unlikely(!au_opt_test(au_mntflags(sb), UDBA_NONE) - && au_dbstart(dentry) < 0)) { - err = -EIO; - AuIOErr("both of real entry and whiteout found, %pd, err %d\n", - dentry, err); - } - -out_parent: - dput(parent); - kfree(whname.name); -out: - return err; -} - -struct dentry *au_sio_lkup_one(struct qstr *name, struct dentry *parent) -{ - struct dentry *dentry; - int wkq_err; - - if (!au_test_h_perm_sio(d_inode(parent), MAY_EXEC)) - dentry = vfsub_lkup_one(name, parent); - else { - struct vfsub_lkup_one_args args = { - .errp = &dentry, - .name = name, - .parent = parent - }; - - wkq_err = au_wkq_wait(vfsub_call_lkup_one, &args); - if (unlikely(wkq_err)) - dentry = ERR_PTR(wkq_err); - } - - return dentry; -} - -/* - * lookup @dentry on @bindex which should be negative. - */ -int au_lkup_neg(struct dentry *dentry, aufs_bindex_t bindex, int wh) -{ - int err; - struct dentry *parent, *h_parent, *h_dentry; - struct au_branch *br; - - parent = dget_parent(dentry); - h_parent = au_h_dptr(parent, bindex); - br = au_sbr(dentry->d_sb, bindex); - if (wh) - h_dentry = au_whtmp_lkup(h_parent, br, &dentry->d_name); - else - h_dentry = au_sio_lkup_one(&dentry->d_name, h_parent); - err = PTR_ERR(h_dentry); - if (IS_ERR(h_dentry)) - goto out; - if (unlikely(d_is_positive(h_dentry))) { - err = -EIO; - AuIOErr("%pd should be negative on b%d.\n", h_dentry, bindex); - dput(h_dentry); - goto out; - } - - err = 0; - if (bindex < au_dbstart(dentry)) - au_set_dbstart(dentry, bindex); - if (au_dbend(dentry) < bindex) - au_set_dbend(dentry, bindex); - au_set_h_dptr(dentry, bindex, h_dentry); - -out: - dput(parent); - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* subset of struct inode */ -struct au_iattr { - unsigned long i_ino; - /* unsigned int i_nlink; */ - kuid_t i_uid; - kgid_t i_gid; - u64 i_version; -/* - loff_t i_size; - blkcnt_t i_blocks; -*/ - umode_t i_mode; -}; - -static void au_iattr_save(struct au_iattr *ia, struct inode *h_inode) -{ - ia->i_ino = h_inode->i_ino; - /* ia->i_nlink = h_inode->i_nlink; */ - ia->i_uid = h_inode->i_uid; - ia->i_gid = h_inode->i_gid; - ia->i_version = h_inode->i_version; -/* - ia->i_size = h_inode->i_size; - ia->i_blocks = h_inode->i_blocks; -*/ - ia->i_mode = (h_inode->i_mode & S_IFMT); -} - -static int au_iattr_test(struct au_iattr *ia, struct inode *h_inode) -{ - return ia->i_ino != h_inode->i_ino - /* || ia->i_nlink != h_inode->i_nlink */ - || !uid_eq(ia->i_uid, h_inode->i_uid) - || !gid_eq(ia->i_gid, h_inode->i_gid) - || ia->i_version != h_inode->i_version -/* - || ia->i_size != h_inode->i_size - || ia->i_blocks != h_inode->i_blocks -*/ - || ia->i_mode != (h_inode->i_mode & S_IFMT); -} - -static int au_h_verify_dentry(struct dentry *h_dentry, struct dentry *h_parent, - struct au_branch *br) -{ - int err; - struct au_iattr ia; - struct inode *h_inode; - struct dentry *h_d; - struct super_block *h_sb; - - err = 0; - memset(&ia, -1, sizeof(ia)); - h_sb = h_dentry->d_sb; - h_inode = NULL; - if (d_is_positive(h_dentry)) { - h_inode = d_inode(h_dentry); - au_iattr_save(&ia, h_inode); - } else if (au_test_nfs(h_sb) || au_test_fuse(h_sb)) - /* nfs d_revalidate may return 0 for negative dentry */ - /* fuse d_revalidate always return 0 for negative dentry */ - goto out; - - /* main purpose is namei.c:cached_lookup() and d_revalidate */ - h_d = vfsub_lkup_one(&h_dentry->d_name, h_parent); - err = PTR_ERR(h_d); - if (IS_ERR(h_d)) - goto out; - - err = 0; - if (unlikely(h_d != h_dentry - || d_inode(h_d) != h_inode - || (h_inode && au_iattr_test(&ia, h_inode)))) - err = au_busy_or_stale(); - dput(h_d); - -out: - AuTraceErr(err); - return err; -} - -int au_h_verify(struct dentry *h_dentry, unsigned int udba, struct inode *h_dir, - struct dentry *h_parent, struct au_branch *br) -{ - int err; - - err = 0; - if (udba == AuOpt_UDBA_REVAL - && !au_test_fs_remote(h_dentry->d_sb)) { - IMustLock(h_dir); - err = (d_inode(h_dentry->d_parent) != h_dir); - } else if (udba != AuOpt_UDBA_NONE) - err = au_h_verify_dentry(h_dentry, h_parent, br); - - return err; -} - -/* ---------------------------------------------------------------------- */ - -static int au_do_refresh_hdentry(struct dentry *dentry, struct dentry *parent) -{ - int err; - aufs_bindex_t new_bindex, bindex, bend, bwh, bdiropq; - struct au_hdentry tmp, *p, *q; - struct au_dinfo *dinfo; - struct super_block *sb; - - DiMustWriteLock(dentry); - - sb = dentry->d_sb; - dinfo = au_di(dentry); - bend = dinfo->di_bend; - bwh = dinfo->di_bwh; - bdiropq = dinfo->di_bdiropq; - p = dinfo->di_hdentry + dinfo->di_bstart; - for (bindex = dinfo->di_bstart; bindex <= bend; bindex++, p++) { - if (!p->hd_dentry) - continue; - - new_bindex = au_br_index(sb, p->hd_id); - if (new_bindex == bindex) - continue; - - if (dinfo->di_bwh == bindex) - bwh = new_bindex; - if (dinfo->di_bdiropq == bindex) - bdiropq = new_bindex; - if (new_bindex < 0) { - au_hdput(p); - p->hd_dentry = NULL; - continue; - } - - /* swap two lower dentries, and loop again */ - q = dinfo->di_hdentry + new_bindex; - tmp = *q; - *q = *p; - *p = tmp; - if (tmp.hd_dentry) { - bindex--; - p--; - } - } - - dinfo->di_bwh = -1; - if (bwh >= 0 && bwh <= au_sbend(sb) && au_sbr_whable(sb, bwh)) - dinfo->di_bwh = bwh; - - dinfo->di_bdiropq = -1; - if (bdiropq >= 0 - && bdiropq <= au_sbend(sb) - && au_sbr_whable(sb, bdiropq)) - dinfo->di_bdiropq = bdiropq; - - err = -EIO; - dinfo->di_bstart = -1; - dinfo->di_bend = -1; - bend = au_dbend(parent); - p = dinfo->di_hdentry; - for (bindex = 0; bindex <= bend; bindex++, p++) - if (p->hd_dentry) { - dinfo->di_bstart = bindex; - break; - } - - if (dinfo->di_bstart >= 0) { - p = dinfo->di_hdentry + bend; - for (bindex = bend; bindex >= 0; bindex--, p--) - if (p->hd_dentry) { - dinfo->di_bend = bindex; - err = 0; - break; - } - } - - return err; -} - -static void au_do_hide(struct dentry *dentry) -{ - struct inode *inode; - - if (d_really_is_positive(dentry)) { - inode = d_inode(dentry); - if (!d_is_dir(dentry)) { - if (inode->i_nlink && !d_unhashed(dentry)) - drop_nlink(inode); - } else { - clear_nlink(inode); - /* stop next lookup */ - inode->i_flags |= S_DEAD; - } - smp_mb(); /* necessary? */ - } - d_drop(dentry); -} - -static int au_hide_children(struct dentry *parent) -{ - int err, i, j, ndentry; - struct au_dcsub_pages dpages; - struct au_dpage *dpage; - struct dentry *dentry; - - err = au_dpages_init(&dpages, GFP_NOFS); - if (unlikely(err)) - goto out; - err = au_dcsub_pages(&dpages, parent, NULL, NULL); - if (unlikely(err)) - goto out_dpages; - - /* in reverse order */ - for (i = dpages.ndpage - 1; i >= 0; i--) { - dpage = dpages.dpages + i; - ndentry = dpage->ndentry; - for (j = ndentry - 1; j >= 0; j--) { - dentry = dpage->dentries[j]; - if (dentry != parent) - au_do_hide(dentry); - } - } - -out_dpages: - au_dpages_free(&dpages); -out: - return err; -} - -static void au_hide(struct dentry *dentry) -{ - int err; - - AuDbgDentry(dentry); - if (d_is_dir(dentry)) { - /* shrink_dcache_parent(dentry); */ - err = au_hide_children(dentry); - if (unlikely(err)) - AuIOErr("%pd, failed hiding children, ignored %d\n", - dentry, err); - } - au_do_hide(dentry); -} - -/* - * By adding a dirty branch, a cached dentry may be affected in various ways. - * - * a dirty branch is added - * - on the top of layers - * - in the middle of layers - * - to the bottom of layers - * - * on the added branch there exists - * - a whiteout - * - a diropq - * - a same named entry - * + exist - * * negative --> positive - * * positive --> positive - * - type is unchanged - * - type is changed - * + doesn't exist - * * negative --> negative - * * positive --> negative (rejected by au_br_del() for non-dir case) - * - none - */ -static int au_refresh_by_dinfo(struct dentry *dentry, struct au_dinfo *dinfo, - struct au_dinfo *tmp) -{ - int err; - aufs_bindex_t bindex, bend; - struct { - struct dentry *dentry; - struct inode *inode; - mode_t mode; - } orig_h, tmp_h = { - .dentry = NULL - }; - struct au_hdentry *hd; - struct inode *inode, *h_inode; - struct dentry *h_dentry; - - err = 0; - AuDebugOn(dinfo->di_bstart < 0); - orig_h.mode = 0; - orig_h.dentry = dinfo->di_hdentry[dinfo->di_bstart].hd_dentry; - orig_h.inode = NULL; - if (d_is_positive(orig_h.dentry)) { - orig_h.inode = d_inode(orig_h.dentry); - orig_h.mode = orig_h.inode->i_mode & S_IFMT; - } - if (tmp->di_bstart >= 0) { - tmp_h.dentry = tmp->di_hdentry[tmp->di_bstart].hd_dentry; - if (d_is_positive(tmp_h.dentry)) { - tmp_h.inode = d_inode(tmp_h.dentry); - tmp_h.mode = tmp_h.inode->i_mode & S_IFMT; - } - } - - inode = NULL; - if (d_really_is_positive(dentry)) - inode = d_inode(dentry); - if (!orig_h.inode) { - AuDbg("nagative originally\n"); - if (inode) { - au_hide(dentry); - goto out; - } - AuDebugOn(inode); - AuDebugOn(dinfo->di_bstart != dinfo->di_bend); - AuDebugOn(dinfo->di_bdiropq != -1); - - if (!tmp_h.inode) { - AuDbg("negative --> negative\n"); - /* should have only one negative lower */ - if (tmp->di_bstart >= 0 - && tmp->di_bstart < dinfo->di_bstart) { - AuDebugOn(tmp->di_bstart != tmp->di_bend); - AuDebugOn(dinfo->di_bstart != dinfo->di_bend); - au_set_h_dptr(dentry, dinfo->di_bstart, NULL); - au_di_cp(dinfo, tmp); - hd = tmp->di_hdentry + tmp->di_bstart; - au_set_h_dptr(dentry, tmp->di_bstart, - dget(hd->hd_dentry)); - } - au_dbg_verify_dinode(dentry); - } else { - AuDbg("negative --> positive\n"); - /* - * similar to the behaviour of creating with bypassing - * aufs. - * unhash it in order to force an error in the - * succeeding create operation. - * we should not set S_DEAD here. - */ - d_drop(dentry); - /* au_di_swap(tmp, dinfo); */ - au_dbg_verify_dinode(dentry); - } - } else { - AuDbg("positive originally\n"); - /* inode may be NULL */ - AuDebugOn(inode && (inode->i_mode & S_IFMT) != orig_h.mode); - if (!tmp_h.inode) { - AuDbg("positive --> negative\n"); - /* or bypassing aufs */ - au_hide(dentry); - if (tmp->di_bwh >= 0 && tmp->di_bwh <= dinfo->di_bstart) - dinfo->di_bwh = tmp->di_bwh; - if (inode) - err = au_refresh_hinode_self(inode); - au_dbg_verify_dinode(dentry); - } else if (orig_h.mode == tmp_h.mode) { - AuDbg("positive --> positive, same type\n"); - if (!S_ISDIR(orig_h.mode) - && dinfo->di_bstart > tmp->di_bstart) { - /* - * similar to the behaviour of removing and - * creating. - */ - au_hide(dentry); - if (inode) - err = au_refresh_hinode_self(inode); - au_dbg_verify_dinode(dentry); - } else { - /* fill empty slots */ - if (dinfo->di_bstart > tmp->di_bstart) - dinfo->di_bstart = tmp->di_bstart; - if (dinfo->di_bend < tmp->di_bend) - dinfo->di_bend = tmp->di_bend; - dinfo->di_bwh = tmp->di_bwh; - dinfo->di_bdiropq = tmp->di_bdiropq; - hd = tmp->di_hdentry; - bend = dinfo->di_bend; - for (bindex = tmp->di_bstart; bindex <= bend; - bindex++) { - if (au_h_dptr(dentry, bindex)) - continue; - h_dentry = hd[bindex].hd_dentry; - if (!h_dentry) - continue; - AuDebugOn(d_is_negative(h_dentry)); - h_inode = d_inode(h_dentry); - AuDebugOn(orig_h.mode - != (h_inode->i_mode - & S_IFMT)); - au_set_h_dptr(dentry, bindex, - dget(h_dentry)); - } - err = au_refresh_hinode(inode, dentry); - au_dbg_verify_dinode(dentry); - } - } else { - AuDbg("positive --> positive, different type\n"); - /* similar to the behaviour of removing and creating */ - au_hide(dentry); - if (inode) - err = au_refresh_hinode_self(inode); - au_dbg_verify_dinode(dentry); - } - } - -out: - return err; -} - -void au_refresh_dop(struct dentry *dentry, int force_reval) -{ - const struct dentry_operations *dop - = force_reval ? &aufs_dop : dentry->d_sb->s_d_op; - static const unsigned int mask - = DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE; - - BUILD_BUG_ON(sizeof(mask) != sizeof(dentry->d_flags)); - - if (dentry->d_op == dop) - return; - - AuDbg("%pd\n", dentry); - spin_lock(&dentry->d_lock); - if (dop == &aufs_dop) - dentry->d_flags |= mask; - else - dentry->d_flags &= ~mask; - dentry->d_op = dop; - spin_unlock(&dentry->d_lock); -} - -int au_refresh_dentry(struct dentry *dentry, struct dentry *parent) -{ - int err, ebrange; - unsigned int sigen; - struct au_dinfo *dinfo, *tmp; - struct super_block *sb; - struct inode *inode; - - DiMustWriteLock(dentry); - AuDebugOn(IS_ROOT(dentry)); - AuDebugOn(d_really_is_negative(parent)); - - sb = dentry->d_sb; - sigen = au_sigen(sb); - err = au_digen_test(parent, sigen); - if (unlikely(err)) - goto out; - - dinfo = au_di(dentry); - err = au_di_realloc(dinfo, au_sbend(sb) + 1); - if (unlikely(err)) - goto out; - ebrange = au_dbrange_test(dentry); - if (!ebrange) - ebrange = au_do_refresh_hdentry(dentry, parent); - - if (d_unhashed(dentry) || ebrange /* || dinfo->di_tmpfile */) { - AuDebugOn(au_dbstart(dentry) < 0 && au_dbend(dentry) >= 0); - if (d_really_is_positive(dentry)) { - inode = d_inode(dentry); - err = au_refresh_hinode_self(inode); - } - au_dbg_verify_dinode(dentry); - if (!err) - goto out_dgen; /* success */ - goto out; - } - - /* temporary dinfo */ - AuDbgDentry(dentry); - err = -ENOMEM; - tmp = au_di_alloc(sb, AuLsc_DI_TMP); - if (unlikely(!tmp)) - goto out; - au_di_swap(tmp, dinfo); - /* returns the number of positive dentries */ - /* - * if current working dir is removed, it returns an error. - * but the dentry is legal. - */ - err = au_lkup_dentry(dentry, /*bstart*/0, /*type*/0); - AuDbgDentry(dentry); - au_di_swap(tmp, dinfo); - if (err == -ENOENT) - err = 0; - if (err >= 0) { - /* compare/refresh by dinfo */ - AuDbgDentry(dentry); - err = au_refresh_by_dinfo(dentry, dinfo, tmp); - au_dbg_verify_dinode(dentry); - AuTraceErr(err); - } - au_rw_write_unlock(&tmp->di_rwsem); - au_di_free(tmp); - if (unlikely(err)) - goto out; - -out_dgen: - au_update_digen(dentry); -out: - if (unlikely(err && !(dentry->d_flags & DCACHE_NFSFS_RENAMED))) { - AuIOErr("failed refreshing %pd, %d\n", dentry, err); - AuDbgDentry(dentry); - } - AuTraceErr(err); - return err; -} - -static int au_do_h_d_reval(struct dentry *h_dentry, unsigned int flags, - struct dentry *dentry, aufs_bindex_t bindex) -{ - int err, valid; - - err = 0; - if (!(h_dentry->d_flags & DCACHE_OP_REVALIDATE)) - goto out; - - AuDbg("b%d\n", bindex); - /* - * gave up supporting LOOKUP_CREATE/OPEN for lower fs, - * due to whiteout and branch permission. - */ - flags &= ~(/*LOOKUP_PARENT |*/ LOOKUP_OPEN | LOOKUP_CREATE - | LOOKUP_FOLLOW | LOOKUP_EXCL); - /* it may return tri-state */ - valid = h_dentry->d_op->d_revalidate(h_dentry, flags); - - if (unlikely(valid < 0)) - err = valid; - else if (!valid) - err = -EINVAL; - -out: - AuTraceErr(err); - return err; -} - -/* todo: remove this */ -static int h_d_revalidate(struct dentry *dentry, struct inode *inode, - unsigned int flags, int do_udba) -{ - int err; - umode_t mode, h_mode; - aufs_bindex_t bindex, btail, bstart, ibs, ibe; - unsigned char plus, unhashed, is_root, h_plus, h_nfs, tmpfile; - struct inode *h_inode, *h_cached_inode; - struct dentry *h_dentry; - struct qstr *name, *h_name; - - err = 0; - plus = 0; - mode = 0; - ibs = -1; - ibe = -1; - unhashed = !!d_unhashed(dentry); - is_root = !!IS_ROOT(dentry); - name = &dentry->d_name; - tmpfile = au_di(dentry)->di_tmpfile; - - /* - * Theoretically, REVAL test should be unnecessary in case of - * {FS,I}NOTIFY. - * But {fs,i}notify doesn't fire some necessary events, - * IN_ATTRIB for atime/nlink/pageio - * Let's do REVAL test too. - */ - if (do_udba && inode) { - mode = (inode->i_mode & S_IFMT); - plus = (inode->i_nlink > 0); - ibs = au_ibstart(inode); - ibe = au_ibend(inode); - } - - bstart = au_dbstart(dentry); - btail = bstart; - if (inode && S_ISDIR(inode->i_mode)) - btail = au_dbtaildir(dentry); - for (bindex = bstart; bindex <= btail; bindex++) { - h_dentry = au_h_dptr(dentry, bindex); - if (!h_dentry) - continue; - - AuDbg("b%d, %pd\n", bindex, h_dentry); - h_nfs = !!au_test_nfs(h_dentry->d_sb); - spin_lock(&h_dentry->d_lock); - h_name = &h_dentry->d_name; - if (unlikely(do_udba - && !is_root - && ((!h_nfs - && (unhashed != !!d_unhashed(h_dentry) - || (!tmpfile - && !au_qstreq(name, h_name)) - )) - || (h_nfs - && !(flags & LOOKUP_OPEN) - && (h_dentry->d_flags - & DCACHE_NFSFS_RENAMED))) - )) { - int h_unhashed; - - h_unhashed = d_unhashed(h_dentry); - spin_unlock(&h_dentry->d_lock); - AuDbg("unhash 0x%x 0x%x, %pd %pd\n", - unhashed, h_unhashed, dentry, h_dentry); - goto err; - } - spin_unlock(&h_dentry->d_lock); - - err = au_do_h_d_reval(h_dentry, flags, dentry, bindex); - if (unlikely(err)) - /* do not goto err, to keep the errno */ - break; - - /* todo: plink too? */ - if (!do_udba) - continue; - - /* UDBA tests */ - if (unlikely(!!inode != d_is_positive(h_dentry))) - goto err; - - h_inode = NULL; - if (d_is_positive(h_dentry)) - h_inode = d_inode(h_dentry); - h_plus = plus; - h_mode = mode; - h_cached_inode = h_inode; - if (h_inode) { - h_mode = (h_inode->i_mode & S_IFMT); - h_plus = (h_inode->i_nlink > 0); - } - if (inode && ibs <= bindex && bindex <= ibe) - h_cached_inode = au_h_iptr(inode, bindex); - - if (!h_nfs) { - if (unlikely(plus != h_plus && !tmpfile)) - goto err; - } else { - if (unlikely(!(h_dentry->d_flags & DCACHE_NFSFS_RENAMED) - && !is_root - && !IS_ROOT(h_dentry) - && unhashed != d_unhashed(h_dentry))) - goto err; - } - if (unlikely(mode != h_mode - || h_cached_inode != h_inode)) - goto err; - continue; - -err: - err = -EINVAL; - break; - } - - AuTraceErr(err); - return err; -} - -/* todo: consolidate with do_refresh() and au_reval_for_attr() */ -static int simple_reval_dpath(struct dentry *dentry, unsigned int sigen) -{ - int err; - struct dentry *parent; - - if (!au_digen_test(dentry, sigen)) - return 0; - - parent = dget_parent(dentry); - di_read_lock_parent(parent, AuLock_IR); - AuDebugOn(au_digen_test(parent, sigen)); - au_dbg_verify_gen(parent, sigen); - err = au_refresh_dentry(dentry, parent); - di_read_unlock(parent, AuLock_IR); - dput(parent); - AuTraceErr(err); - return err; -} - -int au_reval_dpath(struct dentry *dentry, unsigned int sigen) -{ - int err; - struct dentry *d, *parent; - - if (!au_ftest_si(au_sbi(dentry->d_sb), FAILED_REFRESH_DIR)) - return simple_reval_dpath(dentry, sigen); - - /* slow loop, keep it simple and stupid */ - /* cf: au_cpup_dirs() */ - err = 0; - parent = NULL; - while (au_digen_test(dentry, sigen)) { - d = dentry; - while (1) { - dput(parent); - parent = dget_parent(d); - if (!au_digen_test(parent, sigen)) - break; - d = parent; - } - - if (d != dentry) - di_write_lock_child2(d); - - /* someone might update our dentry while we were sleeping */ - if (au_digen_test(d, sigen)) { - /* - * todo: consolidate with simple_reval_dpath(), - * do_refresh() and au_reval_for_attr(). - */ - di_read_lock_parent(parent, AuLock_IR); - err = au_refresh_dentry(d, parent); - di_read_unlock(parent, AuLock_IR); - } - - if (d != dentry) - di_write_unlock(d); - dput(parent); - if (unlikely(err)) - break; - } - - return err; -} - -/* - * if valid returns 1, otherwise 0. - */ -static int aufs_d_revalidate(struct dentry *dentry, unsigned int flags) -{ - int valid, err; - unsigned int sigen; - unsigned char do_udba; - struct super_block *sb; - struct inode *inode; - - /* todo: support rcu-walk? */ - if (flags & LOOKUP_RCU) - return -ECHILD; - - valid = 0; - if (unlikely(!au_di(dentry))) - goto out; - - valid = 1; - sb = dentry->d_sb; - /* - * todo: very ugly - * i_mutex of parent dir may be held, - * but we should not return 'invalid' due to busy. - */ - err = aufs_read_lock(dentry, AuLock_FLUSH | AuLock_DW | AuLock_NOPLM); - if (unlikely(err)) { - valid = err; - AuTraceErr(err); - goto out; - } - inode = NULL; - if (d_really_is_positive(dentry)) - inode = d_inode(dentry); - if (unlikely(inode && is_bad_inode(inode))) { - err = -EINVAL; - AuTraceErr(err); - goto out_dgrade; - } - if (unlikely(au_dbrange_test(dentry))) { - err = -EINVAL; - AuTraceErr(err); - goto out_dgrade; - } - - sigen = au_sigen(sb); - if (au_digen_test(dentry, sigen)) { - AuDebugOn(IS_ROOT(dentry)); - err = au_reval_dpath(dentry, sigen); - if (unlikely(err)) { - AuTraceErr(err); - goto out_dgrade; - } - } - di_downgrade_lock(dentry, AuLock_IR); - - err = -EINVAL; - if (!(flags & (LOOKUP_OPEN | LOOKUP_EMPTY)) - && inode - && !(inode->i_state && I_LINKABLE) - && (IS_DEADDIR(inode) || !inode->i_nlink)) { - AuTraceErr(err); - goto out_inval; - } - - do_udba = !au_opt_test(au_mntflags(sb), UDBA_NONE); - if (do_udba && inode) { - aufs_bindex_t bstart = au_ibstart(inode); - struct inode *h_inode; - - if (bstart >= 0) { - h_inode = au_h_iptr(inode, bstart); - if (h_inode && au_test_higen(inode, h_inode)) { - AuTraceErr(err); - goto out_inval; - } - } - } - - err = h_d_revalidate(dentry, inode, flags, do_udba); - if (unlikely(!err && do_udba && au_dbstart(dentry) < 0)) { - err = -EIO; - AuDbg("both of real entry and whiteout found, %p, err %d\n", - dentry, err); - } - goto out_inval; - -out_dgrade: - di_downgrade_lock(dentry, AuLock_IR); -out_inval: - aufs_read_unlock(dentry, AuLock_IR); - AuTraceErr(err); - valid = !err; -out: - if (!valid) { - AuDbg("%pd invalid, %d\n", dentry, valid); - d_drop(dentry); - } - return valid; -} - -static void aufs_d_release(struct dentry *dentry) -{ - if (au_di(dentry)) { - au_di_fin(dentry); - au_hn_di_reinit(dentry); - } -} - -const struct dentry_operations aufs_dop = { - .d_revalidate = aufs_d_revalidate, - .d_weak_revalidate = aufs_d_revalidate, - .d_release = aufs_d_release -}; - -/* aufs_dop without d_revalidate */ -const struct dentry_operations aufs_dop_noreval = { - .d_release = aufs_d_release -}; diff --git a/fs/aufs/dentry.h b/fs/aufs/dentry.h deleted file mode 100644 index c794adf59..000000000 --- a/fs/aufs/dentry.h +++ /dev/null @@ -1,221 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * lookup and dentry operations - */ - -#ifndef __AUFS_DENTRY_H__ -#define __AUFS_DENTRY_H__ - -#ifdef __KERNEL__ - -#include <linux/dcache.h> -#include "rwsem.h" - -struct au_hdentry { - struct dentry *hd_dentry; - aufs_bindex_t hd_id; -}; - -struct au_dinfo { - atomic_t di_generation; - - struct au_rwsem di_rwsem; - aufs_bindex_t di_bstart, di_bend, di_bwh, di_bdiropq; - unsigned char di_tmpfile; /* to allow the different name */ - struct au_hdentry *di_hdentry; -} ____cacheline_aligned_in_smp; - -/* ---------------------------------------------------------------------- */ - -/* dentry.c */ -extern const struct dentry_operations aufs_dop, aufs_dop_noreval; -struct au_branch; -struct dentry *au_sio_lkup_one(struct qstr *name, struct dentry *parent); -int au_h_verify(struct dentry *h_dentry, unsigned int udba, struct inode *h_dir, - struct dentry *h_parent, struct au_branch *br); - -int au_lkup_dentry(struct dentry *dentry, aufs_bindex_t bstart, mode_t type); -int au_lkup_neg(struct dentry *dentry, aufs_bindex_t bindex, int wh); -int au_refresh_dentry(struct dentry *dentry, struct dentry *parent); -int au_reval_dpath(struct dentry *dentry, unsigned int sigen); -void au_refresh_dop(struct dentry *dentry, int force_reval); - -/* dinfo.c */ -void au_di_init_once(void *_di); -struct au_dinfo *au_di_alloc(struct super_block *sb, unsigned int lsc); -void au_di_free(struct au_dinfo *dinfo); -void au_di_swap(struct au_dinfo *a, struct au_dinfo *b); -void au_di_cp(struct au_dinfo *dst, struct au_dinfo *src); -int au_di_init(struct dentry *dentry); -void au_di_fin(struct dentry *dentry); -int au_di_realloc(struct au_dinfo *dinfo, int nbr); - -void di_read_lock(struct dentry *d, int flags, unsigned int lsc); -void di_read_unlock(struct dentry *d, int flags); -void di_downgrade_lock(struct dentry *d, int flags); -void di_write_lock(struct dentry *d, unsigned int lsc); -void di_write_unlock(struct dentry *d); -void di_write_lock2_child(struct dentry *d1, struct dentry *d2, int isdir); -void di_write_lock2_parent(struct dentry *d1, struct dentry *d2, int isdir); -void di_write_unlock2(struct dentry *d1, struct dentry *d2); - -struct dentry *au_h_dptr(struct dentry *dentry, aufs_bindex_t bindex); -struct dentry *au_h_d_alias(struct dentry *dentry, aufs_bindex_t bindex); -aufs_bindex_t au_dbtail(struct dentry *dentry); -aufs_bindex_t au_dbtaildir(struct dentry *dentry); - -void au_set_h_dptr(struct dentry *dentry, aufs_bindex_t bindex, - struct dentry *h_dentry); -int au_digen_test(struct dentry *dentry, unsigned int sigen); -int au_dbrange_test(struct dentry *dentry); -void au_update_digen(struct dentry *dentry); -void au_update_dbrange(struct dentry *dentry, int do_put_zero); -void au_update_dbstart(struct dentry *dentry); -void au_update_dbend(struct dentry *dentry); -int au_find_dbindex(struct dentry *dentry, struct dentry *h_dentry); - -/* ---------------------------------------------------------------------- */ - -static inline struct au_dinfo *au_di(struct dentry *dentry) -{ - return dentry->d_fsdata; -} - -/* ---------------------------------------------------------------------- */ - -/* lock subclass for dinfo */ -enum { - AuLsc_DI_CHILD, /* child first */ - AuLsc_DI_CHILD2, /* rename(2), link(2), and cpup at hnotify */ - AuLsc_DI_CHILD3, /* copyup dirs */ - AuLsc_DI_PARENT, - AuLsc_DI_PARENT2, - AuLsc_DI_PARENT3, - AuLsc_DI_TMP /* temp for replacing dinfo */ -}; - -/* - * di_read_lock_child, di_write_lock_child, - * di_read_lock_child2, di_write_lock_child2, - * di_read_lock_child3, di_write_lock_child3, - * di_read_lock_parent, di_write_lock_parent, - * di_read_lock_parent2, di_write_lock_parent2, - * di_read_lock_parent3, di_write_lock_parent3, - */ -#define AuReadLockFunc(name, lsc) \ -static inline void di_read_lock_##name(struct dentry *d, int flags) \ -{ di_read_lock(d, flags, AuLsc_DI_##lsc); } - -#define AuWriteLockFunc(name, lsc) \ -static inline void di_write_lock_##name(struct dentry *d) \ -{ di_write_lock(d, AuLsc_DI_##lsc); } - -#define AuRWLockFuncs(name, lsc) \ - AuReadLockFunc(name, lsc) \ - AuWriteLockFunc(name, lsc) - -AuRWLockFuncs(child, CHILD); -AuRWLockFuncs(child2, CHILD2); -AuRWLockFuncs(child3, CHILD3); -AuRWLockFuncs(parent, PARENT); -AuRWLockFuncs(parent2, PARENT2); -AuRWLockFuncs(parent3, PARENT3); - -#undef AuReadLockFunc -#undef AuWriteLockFunc -#undef AuRWLockFuncs - -#define DiMustNoWaiters(d) AuRwMustNoWaiters(&au_di(d)->di_rwsem) -#define DiMustAnyLock(d) AuRwMustAnyLock(&au_di(d)->di_rwsem) -#define DiMustWriteLock(d) AuRwMustWriteLock(&au_di(d)->di_rwsem) - -/* ---------------------------------------------------------------------- */ - -/* todo: memory barrier? */ -static inline unsigned int au_digen(struct dentry *d) -{ - return atomic_read(&au_di(d)->di_generation); -} - -static inline void au_h_dentry_init(struct au_hdentry *hdentry) -{ - hdentry->hd_dentry = NULL; -} - -static inline void au_hdput(struct au_hdentry *hd) -{ - if (hd) - dput(hd->hd_dentry); -} - -static inline aufs_bindex_t au_dbstart(struct dentry *dentry) -{ - DiMustAnyLock(dentry); - return au_di(dentry)->di_bstart; -} - -static inline aufs_bindex_t au_dbend(struct dentry *dentry) -{ - DiMustAnyLock(dentry); - return au_di(dentry)->di_bend; -} - -static inline aufs_bindex_t au_dbwh(struct dentry *dentry) -{ - DiMustAnyLock(dentry); - return au_di(dentry)->di_bwh; -} - -static inline aufs_bindex_t au_dbdiropq(struct dentry *dentry) -{ - DiMustAnyLock(dentry); - return au_di(dentry)->di_bdiropq; -} - -/* todo: hard/soft set? */ -static inline void au_set_dbstart(struct dentry *dentry, aufs_bindex_t bindex) -{ - DiMustWriteLock(dentry); - au_di(dentry)->di_bstart = bindex; -} - -static inline void au_set_dbend(struct dentry *dentry, aufs_bindex_t bindex) -{ - DiMustWriteLock(dentry); - au_di(dentry)->di_bend = bindex; -} - -static inline void au_set_dbwh(struct dentry *dentry, aufs_bindex_t bindex) -{ - DiMustWriteLock(dentry); - /* dbwh can be outside of bstart - bend range */ - au_di(dentry)->di_bwh = bindex; -} - -static inline void au_set_dbdiropq(struct dentry *dentry, aufs_bindex_t bindex) -{ - DiMustWriteLock(dentry); - au_di(dentry)->di_bdiropq = bindex; -} - -/* ---------------------------------------------------------------------- */ - -#ifdef CONFIG_AUFS_HNOTIFY -static inline void au_digen_dec(struct dentry *d) -{ - atomic_dec(&au_di(d)->di_generation); -} - -static inline void au_hn_di_reinit(struct dentry *dentry) -{ - dentry->d_fsdata = NULL; -} -#else -AuStubVoid(au_hn_di_reinit, struct dentry *dentry __maybe_unused) -#endif /* CONFIG_AUFS_HNOTIFY */ - -#endif /* __KERNEL__ */ -#endif /* __AUFS_DENTRY_H__ */ diff --git a/fs/aufs/dinfo.c b/fs/aufs/dinfo.c deleted file mode 100644 index ad6d045c4..000000000 --- a/fs/aufs/dinfo.c +++ /dev/null @@ -1,537 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * dentry private data - */ - -#include "aufs.h" - -void au_di_init_once(void *_dinfo) -{ - struct au_dinfo *dinfo = _dinfo; - static struct lock_class_key aufs_di; - - au_rw_init(&dinfo->di_rwsem); - au_rw_class(&dinfo->di_rwsem, &aufs_di); -} - -struct au_dinfo *au_di_alloc(struct super_block *sb, unsigned int lsc) -{ - struct au_dinfo *dinfo; - int nbr, i; - - dinfo = au_cache_alloc_dinfo(); - if (unlikely(!dinfo)) - goto out; - - nbr = au_sbend(sb) + 1; - if (nbr <= 0) - nbr = 1; - dinfo->di_hdentry = kcalloc(nbr, sizeof(*dinfo->di_hdentry), GFP_NOFS); - if (dinfo->di_hdentry) { - au_rw_write_lock_nested(&dinfo->di_rwsem, lsc); - dinfo->di_bstart = -1; - dinfo->di_bend = -1; - dinfo->di_bwh = -1; - dinfo->di_bdiropq = -1; - dinfo->di_tmpfile = 0; - for (i = 0; i < nbr; i++) - dinfo->di_hdentry[i].hd_id = -1; - goto out; - } - - au_cache_free_dinfo(dinfo); - dinfo = NULL; - -out: - return dinfo; -} - -void au_di_free(struct au_dinfo *dinfo) -{ - struct au_hdentry *p; - aufs_bindex_t bend, bindex; - - /* dentry may not be revalidated */ - bindex = dinfo->di_bstart; - if (bindex >= 0) { - bend = dinfo->di_bend; - p = dinfo->di_hdentry + bindex; - while (bindex++ <= bend) - au_hdput(p++); - } - kfree(dinfo->di_hdentry); - au_cache_free_dinfo(dinfo); -} - -void au_di_swap(struct au_dinfo *a, struct au_dinfo *b) -{ - struct au_hdentry *p; - aufs_bindex_t bi; - - AuRwMustWriteLock(&a->di_rwsem); - AuRwMustWriteLock(&b->di_rwsem); - -#define DiSwap(v, name) \ - do { \ - v = a->di_##name; \ - a->di_##name = b->di_##name; \ - b->di_##name = v; \ - } while (0) - - DiSwap(p, hdentry); - DiSwap(bi, bstart); - DiSwap(bi, bend); - DiSwap(bi, bwh); - DiSwap(bi, bdiropq); - /* smp_mb(); */ - -#undef DiSwap -} - -void au_di_cp(struct au_dinfo *dst, struct au_dinfo *src) -{ - AuRwMustWriteLock(&dst->di_rwsem); - AuRwMustWriteLock(&src->di_rwsem); - - dst->di_bstart = src->di_bstart; - dst->di_bend = src->di_bend; - dst->di_bwh = src->di_bwh; - dst->di_bdiropq = src->di_bdiropq; - /* smp_mb(); */ -} - -int au_di_init(struct dentry *dentry) -{ - int err; - struct super_block *sb; - struct au_dinfo *dinfo; - - err = 0; - sb = dentry->d_sb; - dinfo = au_di_alloc(sb, AuLsc_DI_CHILD); - if (dinfo) { - atomic_set(&dinfo->di_generation, au_sigen(sb)); - /* smp_mb(); */ /* atomic_set */ - dentry->d_fsdata = dinfo; - } else - err = -ENOMEM; - - return err; -} - -void au_di_fin(struct dentry *dentry) -{ - struct au_dinfo *dinfo; - - dinfo = au_di(dentry); - AuRwDestroy(&dinfo->di_rwsem); - au_di_free(dinfo); -} - -int au_di_realloc(struct au_dinfo *dinfo, int nbr) -{ - int err, sz; - struct au_hdentry *hdp; - - AuRwMustWriteLock(&dinfo->di_rwsem); - - err = -ENOMEM; - sz = sizeof(*hdp) * (dinfo->di_bend + 1); - if (!sz) - sz = sizeof(*hdp); - hdp = au_kzrealloc(dinfo->di_hdentry, sz, sizeof(*hdp) * nbr, GFP_NOFS); - if (hdp) { - dinfo->di_hdentry = hdp; - err = 0; - } - - return err; -} - -/* ---------------------------------------------------------------------- */ - -static void do_ii_write_lock(struct inode *inode, unsigned int lsc) -{ - switch (lsc) { - case AuLsc_DI_CHILD: - ii_write_lock_child(inode); - break; - case AuLsc_DI_CHILD2: - ii_write_lock_child2(inode); - break; - case AuLsc_DI_CHILD3: - ii_write_lock_child3(inode); - break; - case AuLsc_DI_PARENT: - ii_write_lock_parent(inode); - break; - case AuLsc_DI_PARENT2: - ii_write_lock_parent2(inode); - break; - case AuLsc_DI_PARENT3: - ii_write_lock_parent3(inode); - break; - default: - BUG(); - } -} - -static void do_ii_read_lock(struct inode *inode, unsigned int lsc) -{ - switch (lsc) { - case AuLsc_DI_CHILD: - ii_read_lock_child(inode); - break; - case AuLsc_DI_CHILD2: - ii_read_lock_child2(inode); - break; - case AuLsc_DI_CHILD3: - ii_read_lock_child3(inode); - break; - case AuLsc_DI_PARENT: - ii_read_lock_parent(inode); - break; - case AuLsc_DI_PARENT2: - ii_read_lock_parent2(inode); - break; - case AuLsc_DI_PARENT3: - ii_read_lock_parent3(inode); - break; - default: - BUG(); - } -} - -void di_read_lock(struct dentry *d, int flags, unsigned int lsc) -{ - struct inode *inode; - - au_rw_read_lock_nested(&au_di(d)->di_rwsem, lsc); - if (d_really_is_positive(d)) { - inode = d_inode(d); - if (au_ftest_lock(flags, IW)) - do_ii_write_lock(inode, lsc); - else if (au_ftest_lock(flags, IR)) - do_ii_read_lock(inode, lsc); - } -} - -void di_read_unlock(struct dentry *d, int flags) -{ - struct inode *inode; - - if (d_really_is_positive(d)) { - inode = d_inode(d); - if (au_ftest_lock(flags, IW)) { - au_dbg_verify_dinode(d); - ii_write_unlock(inode); - } else if (au_ftest_lock(flags, IR)) { - au_dbg_verify_dinode(d); - ii_read_unlock(inode); - } - } - au_rw_read_unlock(&au_di(d)->di_rwsem); -} - -void di_downgrade_lock(struct dentry *d, int flags) -{ - if (d_really_is_positive(d) && au_ftest_lock(flags, IR)) - ii_downgrade_lock(d_inode(d)); - au_rw_dgrade_lock(&au_di(d)->di_rwsem); -} - -void di_write_lock(struct dentry *d, unsigned int lsc) -{ - au_rw_write_lock_nested(&au_di(d)->di_rwsem, lsc); - if (d_really_is_positive(d)) - do_ii_write_lock(d_inode(d), lsc); -} - -void di_write_unlock(struct dentry *d) -{ - au_dbg_verify_dinode(d); - if (d_really_is_positive(d)) - ii_write_unlock(d_inode(d)); - au_rw_write_unlock(&au_di(d)->di_rwsem); -} - -void di_write_lock2_child(struct dentry *d1, struct dentry *d2, int isdir) -{ - AuDebugOn(d1 == d2 - || d_inode(d1) == d_inode(d2) - || d1->d_sb != d2->d_sb); - - if (isdir && au_test_subdir(d1, d2)) { - di_write_lock_child(d1); - di_write_lock_child2(d2); - } else { - /* there should be no races */ - di_write_lock_child(d2); - di_write_lock_child2(d1); - } -} - -void di_write_lock2_parent(struct dentry *d1, struct dentry *d2, int isdir) -{ - AuDebugOn(d1 == d2 - || d_inode(d1) == d_inode(d2) - || d1->d_sb != d2->d_sb); - - if (isdir && au_test_subdir(d1, d2)) { - di_write_lock_parent(d1); - di_write_lock_parent2(d2); - } else { - /* there should be no races */ - di_write_lock_parent(d2); - di_write_lock_parent2(d1); - } -} - -void di_write_unlock2(struct dentry *d1, struct dentry *d2) -{ - di_write_unlock(d1); - if (d_inode(d1) == d_inode(d2)) - au_rw_write_unlock(&au_di(d2)->di_rwsem); - else - di_write_unlock(d2); -} - -/* ---------------------------------------------------------------------- */ - -struct dentry *au_h_dptr(struct dentry *dentry, aufs_bindex_t bindex) -{ - struct dentry *d; - - DiMustAnyLock(dentry); - - if (au_dbstart(dentry) < 0 || bindex < au_dbstart(dentry)) - return NULL; - AuDebugOn(bindex < 0); - d = au_di(dentry)->di_hdentry[0 + bindex].hd_dentry; - AuDebugOn(d && au_dcount(d) <= 0); - return d; -} - -/* - * extended version of au_h_dptr(). - * returns a hashed and positive (or linkable) h_dentry in bindex, NULL, or - * error. - */ -struct dentry *au_h_d_alias(struct dentry *dentry, aufs_bindex_t bindex) -{ - struct dentry *h_dentry; - struct inode *inode, *h_inode; - - AuDebugOn(d_really_is_negative(dentry)); - - h_dentry = NULL; - if (au_dbstart(dentry) <= bindex - && bindex <= au_dbend(dentry)) - h_dentry = au_h_dptr(dentry, bindex); - if (h_dentry && !au_d_linkable(h_dentry)) { - dget(h_dentry); - goto out; /* success */ - } - - inode = d_inode(dentry); - AuDebugOn(bindex < au_ibstart(inode)); - AuDebugOn(au_ibend(inode) < bindex); - h_inode = au_h_iptr(inode, bindex); - h_dentry = d_find_alias(h_inode); - if (h_dentry) { - if (!IS_ERR(h_dentry)) { - if (!au_d_linkable(h_dentry)) - goto out; /* success */ - dput(h_dentry); - } else - goto out; - } - - if (au_opt_test(au_mntflags(dentry->d_sb), PLINK)) { - h_dentry = au_plink_lkup(inode, bindex); - AuDebugOn(!h_dentry); - if (!IS_ERR(h_dentry)) { - if (!au_d_hashed_positive(h_dentry)) - goto out; /* success */ - dput(h_dentry); - h_dentry = NULL; - } - } - -out: - AuDbgDentry(h_dentry); - return h_dentry; -} - -aufs_bindex_t au_dbtail(struct dentry *dentry) -{ - aufs_bindex_t bend, bwh; - - bend = au_dbend(dentry); - if (0 <= bend) { - bwh = au_dbwh(dentry); - if (!bwh) - return bwh; - if (0 < bwh && bwh < bend) - return bwh - 1; - } - return bend; -} - -aufs_bindex_t au_dbtaildir(struct dentry *dentry) -{ - aufs_bindex_t bend, bopq; - - bend = au_dbtail(dentry); - if (0 <= bend) { - bopq = au_dbdiropq(dentry); - if (0 <= bopq && bopq < bend) - bend = bopq; - } - return bend; -} - -/* ---------------------------------------------------------------------- */ - -void au_set_h_dptr(struct dentry *dentry, aufs_bindex_t bindex, - struct dentry *h_dentry) -{ - struct au_hdentry *hd = au_di(dentry)->di_hdentry + bindex; - struct au_branch *br; - - DiMustWriteLock(dentry); - - au_hdput(hd); - hd->hd_dentry = h_dentry; - if (h_dentry) { - br = au_sbr(dentry->d_sb, bindex); - hd->hd_id = br->br_id; - } -} - -int au_dbrange_test(struct dentry *dentry) -{ - int err; - aufs_bindex_t bstart, bend; - - err = 0; - bstart = au_dbstart(dentry); - bend = au_dbend(dentry); - if (bstart >= 0) - AuDebugOn(bend < 0 && bstart > bend); - else { - err = -EIO; - AuDebugOn(bend >= 0); - } - - return err; -} - -int au_digen_test(struct dentry *dentry, unsigned int sigen) -{ - int err; - - err = 0; - if (unlikely(au_digen(dentry) != sigen - || au_iigen_test(d_inode(dentry), sigen))) - err = -EIO; - - return err; -} - -void au_update_digen(struct dentry *dentry) -{ - atomic_set(&au_di(dentry)->di_generation, au_sigen(dentry->d_sb)); - /* smp_mb(); */ /* atomic_set */ -} - -void au_update_dbrange(struct dentry *dentry, int do_put_zero) -{ - struct au_dinfo *dinfo; - struct dentry *h_d; - struct au_hdentry *hdp; - - DiMustWriteLock(dentry); - - dinfo = au_di(dentry); - if (!dinfo || dinfo->di_bstart < 0) - return; - - hdp = dinfo->di_hdentry; - if (do_put_zero) { - aufs_bindex_t bindex, bend; - - bend = dinfo->di_bend; - for (bindex = dinfo->di_bstart; bindex <= bend; bindex++) { - h_d = hdp[0 + bindex].hd_dentry; - if (h_d && d_is_negative(h_d)) - au_set_h_dptr(dentry, bindex, NULL); - } - } - - dinfo->di_bstart = -1; - while (++dinfo->di_bstart <= dinfo->di_bend) - if (hdp[0 + dinfo->di_bstart].hd_dentry) - break; - if (dinfo->di_bstart > dinfo->di_bend) { - dinfo->di_bstart = -1; - dinfo->di_bend = -1; - return; - } - - dinfo->di_bend++; - while (0 <= --dinfo->di_bend) - if (hdp[0 + dinfo->di_bend].hd_dentry) - break; - AuDebugOn(dinfo->di_bstart > dinfo->di_bend || dinfo->di_bend < 0); -} - -void au_update_dbstart(struct dentry *dentry) -{ - aufs_bindex_t bindex, bend; - struct dentry *h_dentry; - - bend = au_dbend(dentry); - for (bindex = au_dbstart(dentry); bindex <= bend; bindex++) { - h_dentry = au_h_dptr(dentry, bindex); - if (!h_dentry) - continue; - if (d_is_positive(h_dentry)) { - au_set_dbstart(dentry, bindex); - return; - } - au_set_h_dptr(dentry, bindex, NULL); - } -} - -void au_update_dbend(struct dentry *dentry) -{ - aufs_bindex_t bindex, bstart; - struct dentry *h_dentry; - - bstart = au_dbstart(dentry); - for (bindex = au_dbend(dentry); bindex >= bstart; bindex--) { - h_dentry = au_h_dptr(dentry, bindex); - if (!h_dentry) - continue; - if (d_is_positive(h_dentry)) { - au_set_dbend(dentry, bindex); - return; - } - au_set_h_dptr(dentry, bindex, NULL); - } -} - -int au_find_dbindex(struct dentry *dentry, struct dentry *h_dentry) -{ - aufs_bindex_t bindex, bend; - - bend = au_dbend(dentry); - for (bindex = au_dbstart(dentry); bindex <= bend; bindex++) - if (au_h_dptr(dentry, bindex) == h_dentry) - return bindex; - return -1; -} diff --git a/fs/aufs/dir.c b/fs/aufs/dir.c deleted file mode 100644 index a994e0862..000000000 --- a/fs/aufs/dir.c +++ /dev/null @@ -1,745 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * directory operations - */ - -#include <linux/fs_stack.h> -#include "aufs.h" - -void au_add_nlink(struct inode *dir, struct inode *h_dir) -{ - unsigned int nlink; - - AuDebugOn(!S_ISDIR(dir->i_mode) || !S_ISDIR(h_dir->i_mode)); - - nlink = dir->i_nlink; - nlink += h_dir->i_nlink - 2; - if (h_dir->i_nlink < 2) - nlink += 2; - smp_mb(); /* for i_nlink */ - /* 0 can happen in revaliding */ - set_nlink(dir, nlink); -} - -void au_sub_nlink(struct inode *dir, struct inode *h_dir) -{ - unsigned int nlink; - - AuDebugOn(!S_ISDIR(dir->i_mode) || !S_ISDIR(h_dir->i_mode)); - - nlink = dir->i_nlink; - nlink -= h_dir->i_nlink - 2; - if (h_dir->i_nlink < 2) - nlink -= 2; - smp_mb(); /* for i_nlink */ - /* nlink == 0 means the branch-fs is broken */ - set_nlink(dir, nlink); -} - -loff_t au_dir_size(struct file *file, struct dentry *dentry) -{ - loff_t sz; - aufs_bindex_t bindex, bend; - struct file *h_file; - struct dentry *h_dentry; - - sz = 0; - if (file) { - AuDebugOn(!d_is_dir(file->f_path.dentry)); - - bend = au_fbend_dir(file); - for (bindex = au_fbstart(file); - bindex <= bend && sz < KMALLOC_MAX_SIZE; - bindex++) { - h_file = au_hf_dir(file, bindex); - if (h_file && file_inode(h_file)) - sz += vfsub_f_size_read(h_file); - } - } else { - AuDebugOn(!dentry); - AuDebugOn(!d_is_dir(dentry)); - - bend = au_dbtaildir(dentry); - for (bindex = au_dbstart(dentry); - bindex <= bend && sz < KMALLOC_MAX_SIZE; - bindex++) { - h_dentry = au_h_dptr(dentry, bindex); - if (h_dentry && d_is_positive(h_dentry)) - sz += i_size_read(d_inode(h_dentry)); - } - } - if (sz < KMALLOC_MAX_SIZE) - sz = roundup_pow_of_two(sz); - if (sz > KMALLOC_MAX_SIZE) - sz = KMALLOC_MAX_SIZE; - else if (sz < NAME_MAX) { - BUILD_BUG_ON(AUFS_RDBLK_DEF < NAME_MAX); - sz = AUFS_RDBLK_DEF; - } - return sz; -} - -struct au_dir_ts_arg { - struct dentry *dentry; - aufs_bindex_t brid; -}; - -static void au_do_dir_ts(void *arg) -{ - struct au_dir_ts_arg *a = arg; - struct au_dtime dt; - struct path h_path; - struct inode *dir, *h_dir; - struct super_block *sb; - struct au_branch *br; - struct au_hinode *hdir; - int err; - aufs_bindex_t bstart, bindex; - - sb = a->dentry->d_sb; - if (d_really_is_negative(a->dentry)) - goto out; - /* no dir->i_mutex lock */ - aufs_read_lock(a->dentry, AuLock_DW); /* noflush */ - - dir = d_inode(a->dentry); - bstart = au_ibstart(dir); - bindex = au_br_index(sb, a->brid); - if (bindex < bstart) - goto out_unlock; - - br = au_sbr(sb, bindex); - h_path.dentry = au_h_dptr(a->dentry, bindex); - if (!h_path.dentry) - goto out_unlock; - h_path.mnt = au_br_mnt(br); - au_dtime_store(&dt, a->dentry, &h_path); - - br = au_sbr(sb, bstart); - if (!au_br_writable(br->br_perm)) - goto out_unlock; - h_path.dentry = au_h_dptr(a->dentry, bstart); - h_path.mnt = au_br_mnt(br); - err = vfsub_mnt_want_write(h_path.mnt); - if (err) - goto out_unlock; - hdir = au_hi(dir, bstart); - au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT); - h_dir = au_h_iptr(dir, bstart); - if (h_dir->i_nlink - && timespec_compare(&h_dir->i_mtime, &dt.dt_mtime) < 0) { - dt.dt_h_path = h_path; - au_dtime_revert(&dt); - } - au_hn_imtx_unlock(hdir); - vfsub_mnt_drop_write(h_path.mnt); - au_cpup_attr_timesizes(dir); - -out_unlock: - aufs_read_unlock(a->dentry, AuLock_DW); -out: - dput(a->dentry); - au_nwt_done(&au_sbi(sb)->si_nowait); - kfree(arg); -} - -void au_dir_ts(struct inode *dir, aufs_bindex_t bindex) -{ - int perm, wkq_err; - aufs_bindex_t bstart; - struct au_dir_ts_arg *arg; - struct dentry *dentry; - struct super_block *sb; - - IMustLock(dir); - - dentry = d_find_any_alias(dir); - AuDebugOn(!dentry); - sb = dentry->d_sb; - bstart = au_ibstart(dir); - if (bstart == bindex) { - au_cpup_attr_timesizes(dir); - goto out; - } - - perm = au_sbr_perm(sb, bstart); - if (!au_br_writable(perm)) - goto out; - - arg = kmalloc(sizeof(*arg), GFP_NOFS); - if (!arg) - goto out; - - arg->dentry = dget(dentry); /* will be dput-ted by au_do_dir_ts() */ - arg->brid = au_sbr_id(sb, bindex); - wkq_err = au_wkq_nowait(au_do_dir_ts, arg, sb, /*flags*/0); - if (unlikely(wkq_err)) { - pr_err("wkq %d\n", wkq_err); - dput(dentry); - kfree(arg); - } - -out: - dput(dentry); -} - -/* ---------------------------------------------------------------------- */ - -static int reopen_dir(struct file *file) -{ - int err; - unsigned int flags; - aufs_bindex_t bindex, btail, bstart; - struct dentry *dentry, *h_dentry; - struct file *h_file; - - /* open all lower dirs */ - dentry = file->f_path.dentry; - bstart = au_dbstart(dentry); - for (bindex = au_fbstart(file); bindex < bstart; bindex++) - au_set_h_fptr(file, bindex, NULL); - au_set_fbstart(file, bstart); - - btail = au_dbtaildir(dentry); - for (bindex = au_fbend_dir(file); btail < bindex; bindex--) - au_set_h_fptr(file, bindex, NULL); - au_set_fbend_dir(file, btail); - - flags = vfsub_file_flags(file); - for (bindex = bstart; bindex <= btail; bindex++) { - h_dentry = au_h_dptr(dentry, bindex); - if (!h_dentry) - continue; - h_file = au_hf_dir(file, bindex); - if (h_file) - continue; - - h_file = au_h_open(dentry, bindex, flags, file, /*force_wr*/0); - err = PTR_ERR(h_file); - if (IS_ERR(h_file)) - goto out; /* close all? */ - au_set_h_fptr(file, bindex, h_file); - } - au_update_figen(file); - /* todo: necessary? */ - /* file->f_ra = h_file->f_ra; */ - err = 0; - -out: - return err; -} - -static int do_open_dir(struct file *file, int flags, struct file *h_file) -{ - int err; - aufs_bindex_t bindex, btail; - struct dentry *dentry, *h_dentry; - struct vfsmount *mnt; - - FiMustWriteLock(file); - AuDebugOn(h_file); - - err = 0; - mnt = file->f_path.mnt; - dentry = file->f_path.dentry; - file->f_version = d_inode(dentry)->i_version; - bindex = au_dbstart(dentry); - au_set_fbstart(file, bindex); - btail = au_dbtaildir(dentry); - au_set_fbend_dir(file, btail); - for (; !err && bindex <= btail; bindex++) { - h_dentry = au_h_dptr(dentry, bindex); - if (!h_dentry) - continue; - - err = vfsub_test_mntns(mnt, h_dentry->d_sb); - if (unlikely(err)) - break; - h_file = au_h_open(dentry, bindex, flags, file, /*force_wr*/0); - if (IS_ERR(h_file)) { - err = PTR_ERR(h_file); - break; - } - au_set_h_fptr(file, bindex, h_file); - } - au_update_figen(file); - /* todo: necessary? */ - /* file->f_ra = h_file->f_ra; */ - if (!err) - return 0; /* success */ - - /* close all */ - for (bindex = au_fbstart(file); bindex <= btail; bindex++) - au_set_h_fptr(file, bindex, NULL); - au_set_fbstart(file, -1); - au_set_fbend_dir(file, -1); - - return err; -} - -static int aufs_open_dir(struct inode *inode __maybe_unused, - struct file *file) -{ - int err; - struct super_block *sb; - struct au_fidir *fidir; - - err = -ENOMEM; - sb = file->f_path.dentry->d_sb; - si_read_lock(sb, AuLock_FLUSH); - fidir = au_fidir_alloc(sb); - if (fidir) { - struct au_do_open_args args = { - .open = do_open_dir, - .fidir = fidir - }; - err = au_do_open(file, &args); - if (unlikely(err)) - kfree(fidir); - } - si_read_unlock(sb); - return err; -} - -static int aufs_release_dir(struct inode *inode __maybe_unused, - struct file *file) -{ - struct au_vdir *vdir_cache; - struct au_finfo *finfo; - struct au_fidir *fidir; - aufs_bindex_t bindex, bend; - - finfo = au_fi(file); - fidir = finfo->fi_hdir; - if (fidir) { - au_sphl_del(&finfo->fi_hlist, - &au_sbi(file->f_path.dentry->d_sb)->si_files); - vdir_cache = fidir->fd_vdir_cache; /* lock-free */ - if (vdir_cache) - au_vdir_free(vdir_cache); - - bindex = finfo->fi_btop; - if (bindex >= 0) { - /* - * calls fput() instead of filp_close(), - * since no dnotify or lock for the lower file. - */ - bend = fidir->fd_bbot; - for (; bindex <= bend; bindex++) - au_set_h_fptr(file, bindex, NULL); - } - kfree(fidir); - finfo->fi_hdir = NULL; - } - au_finfo_fin(file); - return 0; -} - -/* ---------------------------------------------------------------------- */ - -static int au_do_flush_dir(struct file *file, fl_owner_t id) -{ - int err; - aufs_bindex_t bindex, bend; - struct file *h_file; - - err = 0; - bend = au_fbend_dir(file); - for (bindex = au_fbstart(file); !err && bindex <= bend; bindex++) { - h_file = au_hf_dir(file, bindex); - if (h_file) - err = vfsub_flush(h_file, id); - } - return err; -} - -static int aufs_flush_dir(struct file *file, fl_owner_t id) -{ - return au_do_flush(file, id, au_do_flush_dir); -} - -/* ---------------------------------------------------------------------- */ - -static int au_do_fsync_dir_no_file(struct dentry *dentry, int datasync) -{ - int err; - aufs_bindex_t bend, bindex; - struct inode *inode; - struct super_block *sb; - - err = 0; - sb = dentry->d_sb; - inode = d_inode(dentry); - IMustLock(inode); - bend = au_dbend(dentry); - for (bindex = au_dbstart(dentry); !err && bindex <= bend; bindex++) { - struct path h_path; - - if (au_test_ro(sb, bindex, inode)) - continue; - h_path.dentry = au_h_dptr(dentry, bindex); - if (!h_path.dentry) - continue; - - h_path.mnt = au_sbr_mnt(sb, bindex); - err = vfsub_fsync(NULL, &h_path, datasync); - } - - return err; -} - -static int au_do_fsync_dir(struct file *file, int datasync) -{ - int err; - aufs_bindex_t bend, bindex; - struct file *h_file; - struct super_block *sb; - struct inode *inode; - - err = au_reval_and_lock_fdi(file, reopen_dir, /*wlock*/1); - if (unlikely(err)) - goto out; - - inode = file_inode(file); - sb = inode->i_sb; - bend = au_fbend_dir(file); - for (bindex = au_fbstart(file); !err && bindex <= bend; bindex++) { - h_file = au_hf_dir(file, bindex); - if (!h_file || au_test_ro(sb, bindex, inode)) - continue; - - err = vfsub_fsync(h_file, &h_file->f_path, datasync); - } - -out: - return err; -} - -/* - * @file may be NULL - */ -static int aufs_fsync_dir(struct file *file, loff_t start, loff_t end, - int datasync) -{ - int err; - struct dentry *dentry; - struct inode *inode; - struct super_block *sb; - struct mutex *mtx; - - err = 0; - dentry = file->f_path.dentry; - inode = d_inode(dentry); - mtx = &inode->i_mutex; - mutex_lock(mtx); - sb = dentry->d_sb; - si_noflush_read_lock(sb); - if (file) - err = au_do_fsync_dir(file, datasync); - else { - di_write_lock_child(dentry); - err = au_do_fsync_dir_no_file(dentry, datasync); - } - au_cpup_attr_timesizes(inode); - di_write_unlock(dentry); - if (file) - fi_write_unlock(file); - - si_read_unlock(sb); - mutex_unlock(mtx); - return err; -} - -/* ---------------------------------------------------------------------- */ - -static int aufs_iterate(struct file *file, struct dir_context *ctx) -{ - int err; - struct dentry *dentry; - struct inode *inode, *h_inode; - struct super_block *sb; - - AuDbg("%pD, ctx{%pf, %llu}\n", file, ctx->actor, ctx->pos); - - dentry = file->f_path.dentry; - inode = d_inode(dentry); - IMustLock(inode); - - sb = dentry->d_sb; - si_read_lock(sb, AuLock_FLUSH); - err = au_reval_and_lock_fdi(file, reopen_dir, /*wlock*/1); - if (unlikely(err)) - goto out; - err = au_alive_dir(dentry); - if (!err) - err = au_vdir_init(file); - di_downgrade_lock(dentry, AuLock_IR); - if (unlikely(err)) - goto out_unlock; - - h_inode = au_h_iptr(inode, au_ibstart(inode)); - if (!au_test_nfsd()) { - err = au_vdir_fill_de(file, ctx); - fsstack_copy_attr_atime(inode, h_inode); - } else { - /* - * nfsd filldir may call lookup_one_len(), vfs_getattr(), - * encode_fh() and others. - */ - atomic_inc(&h_inode->i_count); - di_read_unlock(dentry, AuLock_IR); - si_read_unlock(sb); - err = au_vdir_fill_de(file, ctx); - fsstack_copy_attr_atime(inode, h_inode); - fi_write_unlock(file); - iput(h_inode); - - AuTraceErr(err); - return err; - } - -out_unlock: - di_read_unlock(dentry, AuLock_IR); - fi_write_unlock(file); -out: - si_read_unlock(sb); - return err; -} - -/* ---------------------------------------------------------------------- */ - -#define AuTestEmpty_WHONLY 1 -#define AuTestEmpty_CALLED (1 << 1) -#define AuTestEmpty_SHWH (1 << 2) -#define au_ftest_testempty(flags, name) ((flags) & AuTestEmpty_##name) -#define au_fset_testempty(flags, name) \ - do { (flags) |= AuTestEmpty_##name; } while (0) -#define au_fclr_testempty(flags, name) \ - do { (flags) &= ~AuTestEmpty_##name; } while (0) - -#ifndef CONFIG_AUFS_SHWH -#undef AuTestEmpty_SHWH -#define AuTestEmpty_SHWH 0 -#endif - -struct test_empty_arg { - struct dir_context ctx; - struct au_nhash *whlist; - unsigned int flags; - int err; - aufs_bindex_t bindex; -}; - -static int test_empty_cb(struct dir_context *ctx, const char *__name, - int namelen, loff_t offset __maybe_unused, u64 ino, - unsigned int d_type) -{ - struct test_empty_arg *arg = container_of(ctx, struct test_empty_arg, - ctx); - char *name = (void *)__name; - - arg->err = 0; - au_fset_testempty(arg->flags, CALLED); - /* smp_mb(); */ - if (name[0] == '.' - && (namelen == 1 || (name[1] == '.' && namelen == 2))) - goto out; /* success */ - - if (namelen <= AUFS_WH_PFX_LEN - || memcmp(name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) { - if (au_ftest_testempty(arg->flags, WHONLY) - && !au_nhash_test_known_wh(arg->whlist, name, namelen)) - arg->err = -ENOTEMPTY; - goto out; - } - - name += AUFS_WH_PFX_LEN; - namelen -= AUFS_WH_PFX_LEN; - if (!au_nhash_test_known_wh(arg->whlist, name, namelen)) - arg->err = au_nhash_append_wh - (arg->whlist, name, namelen, ino, d_type, arg->bindex, - au_ftest_testempty(arg->flags, SHWH)); - -out: - /* smp_mb(); */ - AuTraceErr(arg->err); - return arg->err; -} - -static int do_test_empty(struct dentry *dentry, struct test_empty_arg *arg) -{ - int err; - struct file *h_file; - - h_file = au_h_open(dentry, arg->bindex, - O_RDONLY | O_NONBLOCK | O_DIRECTORY | O_LARGEFILE, - /*file*/NULL, /*force_wr*/0); - err = PTR_ERR(h_file); - if (IS_ERR(h_file)) - goto out; - - err = 0; - if (!au_opt_test(au_mntflags(dentry->d_sb), UDBA_NONE) - && !file_inode(h_file)->i_nlink) - goto out_put; - - do { - arg->err = 0; - au_fclr_testempty(arg->flags, CALLED); - /* smp_mb(); */ - err = vfsub_iterate_dir(h_file, &arg->ctx); - if (err >= 0) - err = arg->err; - } while (!err && au_ftest_testempty(arg->flags, CALLED)); - -out_put: - fput(h_file); - au_sbr_put(dentry->d_sb, arg->bindex); -out: - return err; -} - -struct do_test_empty_args { - int *errp; - struct dentry *dentry; - struct test_empty_arg *arg; -}; - -static void call_do_test_empty(void *args) -{ - struct do_test_empty_args *a = args; - *a->errp = do_test_empty(a->dentry, a->arg); -} - -static int sio_test_empty(struct dentry *dentry, struct test_empty_arg *arg) -{ - int err, wkq_err; - struct dentry *h_dentry; - struct inode *h_inode; - - h_dentry = au_h_dptr(dentry, arg->bindex); - h_inode = d_inode(h_dentry); - /* todo: i_mode changes anytime? */ - mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); - err = au_test_h_perm_sio(h_inode, MAY_EXEC | MAY_READ); - mutex_unlock(&h_inode->i_mutex); - if (!err) - err = do_test_empty(dentry, arg); - else { - struct do_test_empty_args args = { - .errp = &err, - .dentry = dentry, - .arg = arg - }; - unsigned int flags = arg->flags; - - wkq_err = au_wkq_wait(call_do_test_empty, &args); - if (unlikely(wkq_err)) - err = wkq_err; - arg->flags = flags; - } - - return err; -} - -int au_test_empty_lower(struct dentry *dentry) -{ - int err; - unsigned int rdhash; - aufs_bindex_t bindex, bstart, btail; - struct au_nhash whlist; - struct test_empty_arg arg = { - .ctx = { - .actor = test_empty_cb - } - }; - int (*test_empty)(struct dentry *dentry, struct test_empty_arg *arg); - - SiMustAnyLock(dentry->d_sb); - - rdhash = au_sbi(dentry->d_sb)->si_rdhash; - if (!rdhash) - rdhash = au_rdhash_est(au_dir_size(/*file*/NULL, dentry)); - err = au_nhash_alloc(&whlist, rdhash, GFP_NOFS); - if (unlikely(err)) - goto out; - - arg.flags = 0; - arg.whlist = &whlist; - bstart = au_dbstart(dentry); - if (au_opt_test(au_mntflags(dentry->d_sb), SHWH)) - au_fset_testempty(arg.flags, SHWH); - test_empty = do_test_empty; - if (au_opt_test(au_mntflags(dentry->d_sb), DIRPERM1)) - test_empty = sio_test_empty; - arg.bindex = bstart; - err = test_empty(dentry, &arg); - if (unlikely(err)) - goto out_whlist; - - au_fset_testempty(arg.flags, WHONLY); - btail = au_dbtaildir(dentry); - for (bindex = bstart + 1; !err && bindex <= btail; bindex++) { - struct dentry *h_dentry; - - h_dentry = au_h_dptr(dentry, bindex); - if (h_dentry && d_is_positive(h_dentry)) { - arg.bindex = bindex; - err = test_empty(dentry, &arg); - } - } - -out_whlist: - au_nhash_wh_free(&whlist); -out: - return err; -} - -int au_test_empty(struct dentry *dentry, struct au_nhash *whlist) -{ - int err; - struct test_empty_arg arg = { - .ctx = { - .actor = test_empty_cb - } - }; - aufs_bindex_t bindex, btail; - - err = 0; - arg.whlist = whlist; - arg.flags = AuTestEmpty_WHONLY; - if (au_opt_test(au_mntflags(dentry->d_sb), SHWH)) - au_fset_testempty(arg.flags, SHWH); - btail = au_dbtaildir(dentry); - for (bindex = au_dbstart(dentry); !err && bindex <= btail; bindex++) { - struct dentry *h_dentry; - - h_dentry = au_h_dptr(dentry, bindex); - if (h_dentry && d_is_positive(h_dentry)) { - arg.bindex = bindex; - err = sio_test_empty(dentry, &arg); - } - } - - return err; -} - -/* ---------------------------------------------------------------------- */ - -const struct file_operations aufs_dir_fop = { - .owner = THIS_MODULE, - .llseek = default_llseek, - .read = generic_read_dir, - .iterate = aufs_iterate, - .unlocked_ioctl = aufs_ioctl_dir, -#ifdef CONFIG_COMPAT - .compat_ioctl = aufs_compat_ioctl_dir, -#endif - .open = aufs_open_dir, - .release = aufs_release_dir, - .flush = aufs_flush_dir, - .fsync = aufs_fsync_dir -}; diff --git a/fs/aufs/dir.h b/fs/aufs/dir.h deleted file mode 100644 index b0a79d722..000000000 --- a/fs/aufs/dir.h +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * directory operations - */ - -#ifndef __AUFS_DIR_H__ -#define __AUFS_DIR_H__ - -#ifdef __KERNEL__ - -#include <linux/fs.h> - -/* ---------------------------------------------------------------------- */ - -/* need to be faster and smaller */ - -struct au_nhash { - unsigned int nh_num; - struct hlist_head *nh_head; -}; - -struct au_vdir_destr { - unsigned char len; - unsigned char name[0]; -} __packed; - -struct au_vdir_dehstr { - struct hlist_node hash; - struct au_vdir_destr *str; -} ____cacheline_aligned_in_smp; - -struct au_vdir_de { - ino_t de_ino; - unsigned char de_type; - /* caution: packed */ - struct au_vdir_destr de_str; -} __packed; - -struct au_vdir_wh { - struct hlist_node wh_hash; -#ifdef CONFIG_AUFS_SHWH - ino_t wh_ino; - aufs_bindex_t wh_bindex; - unsigned char wh_type; -#else - aufs_bindex_t wh_bindex; -#endif - /* caution: packed */ - struct au_vdir_destr wh_str; -} __packed; - -union au_vdir_deblk_p { - unsigned char *deblk; - struct au_vdir_de *de; -}; - -struct au_vdir { - unsigned char **vd_deblk; - unsigned long vd_nblk; - struct { - unsigned long ul; - union au_vdir_deblk_p p; - } vd_last; - - unsigned long vd_version; - unsigned int vd_deblk_sz; - unsigned long vd_jiffy; -} ____cacheline_aligned_in_smp; - -/* ---------------------------------------------------------------------- */ - -/* dir.c */ -extern const struct file_operations aufs_dir_fop; -void au_add_nlink(struct inode *dir, struct inode *h_dir); -void au_sub_nlink(struct inode *dir, struct inode *h_dir); -loff_t au_dir_size(struct file *file, struct dentry *dentry); -void au_dir_ts(struct inode *dir, aufs_bindex_t bsrc); -int au_test_empty_lower(struct dentry *dentry); -int au_test_empty(struct dentry *dentry, struct au_nhash *whlist); - -/* vdir.c */ -unsigned int au_rdhash_est(loff_t sz); -int au_nhash_alloc(struct au_nhash *nhash, unsigned int num_hash, gfp_t gfp); -void au_nhash_wh_free(struct au_nhash *whlist); -int au_nhash_test_longer_wh(struct au_nhash *whlist, aufs_bindex_t btgt, - int limit); -int au_nhash_test_known_wh(struct au_nhash *whlist, char *name, int nlen); -int au_nhash_append_wh(struct au_nhash *whlist, char *name, int nlen, ino_t ino, - unsigned int d_type, aufs_bindex_t bindex, - unsigned char shwh); -void au_vdir_free(struct au_vdir *vdir); -int au_vdir_init(struct file *file); -int au_vdir_fill_de(struct file *file, struct dir_context *ctx); - -/* ioctl.c */ -long aufs_ioctl_dir(struct file *file, unsigned int cmd, unsigned long arg); - -#ifdef CONFIG_AUFS_RDU -/* rdu.c */ -long au_rdu_ioctl(struct file *file, unsigned int cmd, unsigned long arg); -#ifdef CONFIG_COMPAT -long au_rdu_compat_ioctl(struct file *file, unsigned int cmd, - unsigned long arg); -#endif -#else -AuStub(long, au_rdu_ioctl, return -EINVAL, struct file *file, - unsigned int cmd, unsigned long arg) -#ifdef CONFIG_COMPAT -AuStub(long, au_rdu_compat_ioctl, return -EINVAL, struct file *file, - unsigned int cmd, unsigned long arg) -#endif -#endif - -#endif /* __KERNEL__ */ -#endif /* __AUFS_DIR_H__ */ diff --git a/fs/aufs/dynop.c b/fs/aufs/dynop.c deleted file mode 100644 index 53a8b55d8..000000000 --- a/fs/aufs/dynop.c +++ /dev/null @@ -1,356 +0,0 @@ -/* - * Copyright (C) 2010-2016 Junjiro R. Okajima - */ - -/* - * dynamically customizable operations for regular files - */ - -#include "aufs.h" - -#define DyPrSym(key) AuDbgSym(key->dk_op.dy_hop) - -/* - * How large will these lists be? - * Usually just a few elements, 20-30 at most for each, I guess. - */ -static struct au_splhead dynop[AuDyLast]; - -static struct au_dykey *dy_gfind_get(struct au_splhead *spl, const void *h_op) -{ - struct au_dykey *key, *tmp; - struct list_head *head; - - key = NULL; - head = &spl->head; - rcu_read_lock(); - list_for_each_entry_rcu(tmp, head, dk_list) - if (tmp->dk_op.dy_hop == h_op) { - key = tmp; - kref_get(&key->dk_kref); - break; - } - rcu_read_unlock(); - - return key; -} - -static struct au_dykey *dy_bradd(struct au_branch *br, struct au_dykey *key) -{ - struct au_dykey **k, *found; - const void *h_op = key->dk_op.dy_hop; - int i; - - found = NULL; - k = br->br_dykey; - for (i = 0; i < AuBrDynOp; i++) - if (k[i]) { - if (k[i]->dk_op.dy_hop == h_op) { - found = k[i]; - break; - } - } else - break; - if (!found) { - spin_lock(&br->br_dykey_lock); - for (; i < AuBrDynOp; i++) - if (k[i]) { - if (k[i]->dk_op.dy_hop == h_op) { - found = k[i]; - break; - } - } else { - k[i] = key; - break; - } - spin_unlock(&br->br_dykey_lock); - BUG_ON(i == AuBrDynOp); /* expand the array */ - } - - return found; -} - -/* kref_get() if @key is already added */ -static struct au_dykey *dy_gadd(struct au_splhead *spl, struct au_dykey *key) -{ - struct au_dykey *tmp, *found; - struct list_head *head; - const void *h_op = key->dk_op.dy_hop; - - found = NULL; - head = &spl->head; - spin_lock(&spl->spin); - list_for_each_entry(tmp, head, dk_list) - if (tmp->dk_op.dy_hop == h_op) { - kref_get(&tmp->dk_kref); - found = tmp; - break; - } - if (!found) - list_add_rcu(&key->dk_list, head); - spin_unlock(&spl->spin); - - if (!found) - DyPrSym(key); - return found; -} - -static void dy_free_rcu(struct rcu_head *rcu) -{ - struct au_dykey *key; - - key = container_of(rcu, struct au_dykey, dk_rcu); - DyPrSym(key); - kfree(key); -} - -static void dy_free(struct kref *kref) -{ - struct au_dykey *key; - struct au_splhead *spl; - - key = container_of(kref, struct au_dykey, dk_kref); - spl = dynop + key->dk_op.dy_type; - au_spl_del_rcu(&key->dk_list, spl); - call_rcu(&key->dk_rcu, dy_free_rcu); -} - -void au_dy_put(struct au_dykey *key) -{ - kref_put(&key->dk_kref, dy_free); -} - -/* ---------------------------------------------------------------------- */ - -#define DyDbgSize(cnt, op) AuDebugOn(cnt != sizeof(op)/sizeof(void *)) - -#ifdef CONFIG_AUFS_DEBUG -#define DyDbgDeclare(cnt) unsigned int cnt = 0 -#define DyDbgInc(cnt) do { cnt++; } while (0) -#else -#define DyDbgDeclare(cnt) do {} while (0) -#define DyDbgInc(cnt) do {} while (0) -#endif - -#define DySet(func, dst, src, h_op, h_sb) do { \ - DyDbgInc(cnt); \ - if (h_op->func) { \ - if (src.func) \ - dst.func = src.func; \ - else \ - AuDbg("%s %s\n", au_sbtype(h_sb), #func); \ - } \ -} while (0) - -#define DySetForce(func, dst, src) do { \ - AuDebugOn(!src.func); \ - DyDbgInc(cnt); \ - dst.func = src.func; \ -} while (0) - -#define DySetAop(func) \ - DySet(func, dyaop->da_op, aufs_aop, h_aop, h_sb) -#define DySetAopForce(func) \ - DySetForce(func, dyaop->da_op, aufs_aop) - -static void dy_aop(struct au_dykey *key, const void *h_op, - struct super_block *h_sb __maybe_unused) -{ - struct au_dyaop *dyaop = (void *)key; - const struct address_space_operations *h_aop = h_op; - DyDbgDeclare(cnt); - - AuDbg("%s\n", au_sbtype(h_sb)); - - DySetAop(writepage); - DySetAopForce(readpage); /* force */ - DySetAop(writepages); - DySetAop(set_page_dirty); - DySetAop(readpages); - DySetAop(write_begin); - DySetAop(write_end); - DySetAop(bmap); - DySetAop(invalidatepage); - DySetAop(releasepage); - DySetAop(freepage); - /* this one will be changed according to an aufs mount option */ - DySetAop(direct_IO); - DySetAop(migratepage); - DySetAop(launder_page); - DySetAop(is_partially_uptodate); - DySetAop(is_dirty_writeback); - DySetAop(error_remove_page); - DySetAop(swap_activate); - DySetAop(swap_deactivate); - - DyDbgSize(cnt, *h_aop); -} - -/* ---------------------------------------------------------------------- */ - -static void dy_bug(struct kref *kref) -{ - BUG(); -} - -static struct au_dykey *dy_get(struct au_dynop *op, struct au_branch *br) -{ - struct au_dykey *key, *old; - struct au_splhead *spl; - struct op { - unsigned int sz; - void (*set)(struct au_dykey *key, const void *h_op, - struct super_block *h_sb __maybe_unused); - }; - static const struct op a[] = { - [AuDy_AOP] = { - .sz = sizeof(struct au_dyaop), - .set = dy_aop - } - }; - const struct op *p; - - spl = dynop + op->dy_type; - key = dy_gfind_get(spl, op->dy_hop); - if (key) - goto out_add; /* success */ - - p = a + op->dy_type; - key = kzalloc(p->sz, GFP_NOFS); - if (unlikely(!key)) { - key = ERR_PTR(-ENOMEM); - goto out; - } - - key->dk_op.dy_hop = op->dy_hop; - kref_init(&key->dk_kref); - p->set(key, op->dy_hop, au_br_sb(br)); - old = dy_gadd(spl, key); - if (old) { - kfree(key); - key = old; - } - -out_add: - old = dy_bradd(br, key); - if (old) - /* its ref-count should never be zero here */ - kref_put(&key->dk_kref, dy_bug); -out: - return key; -} - -/* ---------------------------------------------------------------------- */ -/* - * Aufs prohibits O_DIRECT by defaut even if the branch supports it. - * This behaviour is necessary to return an error from open(O_DIRECT) instead - * of the succeeding I/O. The dio mount option enables O_DIRECT and makes - * open(O_DIRECT) always succeed, but the succeeding I/O may return an error. - * See the aufs manual in detail. - */ -static void dy_adx(struct au_dyaop *dyaop, int do_dx) -{ - if (!do_dx) - dyaop->da_op.direct_IO = NULL; - else - dyaop->da_op.direct_IO = aufs_aop.direct_IO; -} - -static struct au_dyaop *dy_aget(struct au_branch *br, - const struct address_space_operations *h_aop, - int do_dx) -{ - struct au_dyaop *dyaop; - struct au_dynop op; - - op.dy_type = AuDy_AOP; - op.dy_haop = h_aop; - dyaop = (void *)dy_get(&op, br); - if (IS_ERR(dyaop)) - goto out; - dy_adx(dyaop, do_dx); - -out: - return dyaop; -} - -int au_dy_iaop(struct inode *inode, aufs_bindex_t bindex, - struct inode *h_inode) -{ - int err, do_dx; - struct super_block *sb; - struct au_branch *br; - struct au_dyaop *dyaop; - - AuDebugOn(!S_ISREG(h_inode->i_mode)); - IiMustWriteLock(inode); - - sb = inode->i_sb; - br = au_sbr(sb, bindex); - do_dx = !!au_opt_test(au_mntflags(sb), DIO); - dyaop = dy_aget(br, h_inode->i_mapping->a_ops, do_dx); - err = PTR_ERR(dyaop); - if (IS_ERR(dyaop)) - /* unnecessary to call dy_fput() */ - goto out; - - err = 0; - inode->i_mapping->a_ops = &dyaop->da_op; - -out: - return err; -} - -/* - * Is it safe to replace a_ops during the inode/file is in operation? - * Yes, I hope so. - */ -int au_dy_irefresh(struct inode *inode) -{ - int err; - aufs_bindex_t bstart; - struct inode *h_inode; - - err = 0; - if (S_ISREG(inode->i_mode)) { - bstart = au_ibstart(inode); - h_inode = au_h_iptr(inode, bstart); - err = au_dy_iaop(inode, bstart, h_inode); - } - return err; -} - -void au_dy_arefresh(int do_dx) -{ - struct au_splhead *spl; - struct list_head *head; - struct au_dykey *key; - - spl = dynop + AuDy_AOP; - head = &spl->head; - spin_lock(&spl->spin); - list_for_each_entry(key, head, dk_list) - dy_adx((void *)key, do_dx); - spin_unlock(&spl->spin); -} - -/* ---------------------------------------------------------------------- */ - -void __init au_dy_init(void) -{ - int i; - - /* make sure that 'struct au_dykey *' can be any type */ - BUILD_BUG_ON(offsetof(struct au_dyaop, da_key)); - - for (i = 0; i < AuDyLast; i++) - au_spl_init(dynop + i); -} - -void au_dy_fin(void) -{ - int i; - - for (i = 0; i < AuDyLast; i++) - WARN_ON(!list_empty(&dynop[i].head)); -} diff --git a/fs/aufs/dynop.h b/fs/aufs/dynop.h deleted file mode 100644 index 8680bfc53..000000000 --- a/fs/aufs/dynop.h +++ /dev/null @@ -1,61 +0,0 @@ -/* - * Copyright (C) 2010-2016 Junjiro R. Okajima - */ - -/* - * dynamically customizable operations (for regular files only) - */ - -#ifndef __AUFS_DYNOP_H__ -#define __AUFS_DYNOP_H__ - -#ifdef __KERNEL__ - -#include <linux/fs.h> -#include <linux/kref.h> - -enum {AuDy_AOP, AuDyLast}; - -struct au_dynop { - int dy_type; - union { - const void *dy_hop; - const struct address_space_operations *dy_haop; - }; -}; - -struct au_dykey { - union { - struct list_head dk_list; - struct rcu_head dk_rcu; - }; - struct au_dynop dk_op; - - /* - * during I am in the branch local array, kref is gotten. when the - * branch is removed, kref is put. - */ - struct kref dk_kref; -}; - -/* stop unioning since their sizes are very different from each other */ -struct au_dyaop { - struct au_dykey da_key; - struct address_space_operations da_op; /* not const */ -}; - -/* ---------------------------------------------------------------------- */ - -/* dynop.c */ -struct au_branch; -void au_dy_put(struct au_dykey *key); -int au_dy_iaop(struct inode *inode, aufs_bindex_t bindex, - struct inode *h_inode); -int au_dy_irefresh(struct inode *inode); -void au_dy_arefresh(int do_dio); - -void __init au_dy_init(void); -void au_dy_fin(void); - -#endif /* __KERNEL__ */ -#endif /* __AUFS_DYNOP_H__ */ diff --git a/fs/aufs/export.c b/fs/aufs/export.c deleted file mode 100644 index 7f6fec61f..000000000 --- a/fs/aufs/export.c +++ /dev/null @@ -1,819 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * export via nfs - */ - -#include <linux/exportfs.h> -#include <linux/fs_struct.h> -#include <linux/namei.h> -#include <linux/nsproxy.h> -#include <linux/random.h> -#include <linux/writeback.h> -#include "../fs/mount.h" -#include "aufs.h" - -union conv { -#ifdef CONFIG_AUFS_INO_T_64 - __u32 a[2]; -#else - __u32 a[1]; -#endif - ino_t ino; -}; - -static ino_t decode_ino(__u32 *a) -{ - union conv u; - - BUILD_BUG_ON(sizeof(u.ino) != sizeof(u.a)); - u.a[0] = a[0]; -#ifdef CONFIG_AUFS_INO_T_64 - u.a[1] = a[1]; -#endif - return u.ino; -} - -static void encode_ino(__u32 *a, ino_t ino) -{ - union conv u; - - u.ino = ino; - a[0] = u.a[0]; -#ifdef CONFIG_AUFS_INO_T_64 - a[1] = u.a[1]; -#endif -} - -/* NFS file handle */ -enum { - Fh_br_id, - Fh_sigen, -#ifdef CONFIG_AUFS_INO_T_64 - /* support 64bit inode number */ - Fh_ino1, - Fh_ino2, - Fh_dir_ino1, - Fh_dir_ino2, -#else - Fh_ino1, - Fh_dir_ino1, -#endif - Fh_igen, - Fh_h_type, - Fh_tail, - - Fh_ino = Fh_ino1, - Fh_dir_ino = Fh_dir_ino1 -}; - -static int au_test_anon(struct dentry *dentry) -{ - /* note: read d_flags without d_lock */ - return !!(dentry->d_flags & DCACHE_DISCONNECTED); -} - -int au_test_nfsd(void) -{ - int ret; - struct task_struct *tsk = current; - char comm[sizeof(tsk->comm)]; - - ret = 0; - if (tsk->flags & PF_KTHREAD) { - get_task_comm(comm, tsk); - ret = !strcmp(comm, "nfsd"); - } - - return ret; -} - -/* ---------------------------------------------------------------------- */ -/* inode generation external table */ - -void au_xigen_inc(struct inode *inode) -{ - loff_t pos; - ssize_t sz; - __u32 igen; - struct super_block *sb; - struct au_sbinfo *sbinfo; - - sb = inode->i_sb; - AuDebugOn(!au_opt_test(au_mntflags(sb), XINO)); - - sbinfo = au_sbi(sb); - pos = inode->i_ino; - pos *= sizeof(igen); - igen = inode->i_generation + 1; - sz = xino_fwrite(sbinfo->si_xwrite, sbinfo->si_xigen, &igen, - sizeof(igen), &pos); - if (sz == sizeof(igen)) - return; /* success */ - - if (unlikely(sz >= 0)) - AuIOErr("xigen error (%zd)\n", sz); -} - -int au_xigen_new(struct inode *inode) -{ - int err; - loff_t pos; - ssize_t sz; - struct super_block *sb; - struct au_sbinfo *sbinfo; - struct file *file; - - err = 0; - /* todo: dirty, at mount time */ - if (inode->i_ino == AUFS_ROOT_INO) - goto out; - sb = inode->i_sb; - SiMustAnyLock(sb); - if (unlikely(!au_opt_test(au_mntflags(sb), XINO))) - goto out; - - err = -EFBIG; - pos = inode->i_ino; - if (unlikely(au_loff_max / sizeof(inode->i_generation) - 1 < pos)) { - AuIOErr1("too large i%lld\n", pos); - goto out; - } - pos *= sizeof(inode->i_generation); - - err = 0; - sbinfo = au_sbi(sb); - file = sbinfo->si_xigen; - BUG_ON(!file); - - if (vfsub_f_size_read(file) - < pos + sizeof(inode->i_generation)) { - inode->i_generation = atomic_inc_return(&sbinfo->si_xigen_next); - sz = xino_fwrite(sbinfo->si_xwrite, file, &inode->i_generation, - sizeof(inode->i_generation), &pos); - } else - sz = xino_fread(sbinfo->si_xread, file, &inode->i_generation, - sizeof(inode->i_generation), &pos); - if (sz == sizeof(inode->i_generation)) - goto out; /* success */ - - err = sz; - if (unlikely(sz >= 0)) { - err = -EIO; - AuIOErr("xigen error (%zd)\n", sz); - } - -out: - return err; -} - -int au_xigen_set(struct super_block *sb, struct file *base) -{ - int err; - struct au_sbinfo *sbinfo; - struct file *file; - - SiMustWriteLock(sb); - - sbinfo = au_sbi(sb); - file = au_xino_create2(base, sbinfo->si_xigen); - err = PTR_ERR(file); - if (IS_ERR(file)) - goto out; - err = 0; - if (sbinfo->si_xigen) - fput(sbinfo->si_xigen); - sbinfo->si_xigen = file; - -out: - return err; -} - -void au_xigen_clr(struct super_block *sb) -{ - struct au_sbinfo *sbinfo; - - SiMustWriteLock(sb); - - sbinfo = au_sbi(sb); - if (sbinfo->si_xigen) { - fput(sbinfo->si_xigen); - sbinfo->si_xigen = NULL; - } -} - -/* ---------------------------------------------------------------------- */ - -static struct dentry *decode_by_ino(struct super_block *sb, ino_t ino, - ino_t dir_ino) -{ - struct dentry *dentry, *d; - struct inode *inode; - unsigned int sigen; - - dentry = NULL; - inode = ilookup(sb, ino); - if (!inode) - goto out; - - dentry = ERR_PTR(-ESTALE); - sigen = au_sigen(sb); - if (unlikely(is_bad_inode(inode) - || IS_DEADDIR(inode) - || sigen != au_iigen(inode, NULL))) - goto out_iput; - - dentry = NULL; - if (!dir_ino || S_ISDIR(inode->i_mode)) - dentry = d_find_alias(inode); - else { - spin_lock(&inode->i_lock); - hlist_for_each_entry(d, &inode->i_dentry, d_u.d_alias) { - spin_lock(&d->d_lock); - if (!au_test_anon(d) - && d_inode(d->d_parent)->i_ino == dir_ino) { - dentry = dget_dlock(d); - spin_unlock(&d->d_lock); - break; - } - spin_unlock(&d->d_lock); - } - spin_unlock(&inode->i_lock); - } - if (unlikely(dentry && au_digen_test(dentry, sigen))) { - /* need to refresh */ - dput(dentry); - dentry = NULL; - } - -out_iput: - iput(inode); -out: - AuTraceErrPtr(dentry); - return dentry; -} - -/* ---------------------------------------------------------------------- */ - -/* todo: dirty? */ -/* if exportfs_decode_fh() passed vfsmount*, we could be happy */ - -struct au_compare_mnt_args { - /* input */ - struct super_block *sb; - - /* output */ - struct vfsmount *mnt; -}; - -static int au_compare_mnt(struct vfsmount *mnt, void *arg) -{ - struct au_compare_mnt_args *a = arg; - - if (mnt->mnt_sb != a->sb) - return 0; - a->mnt = mntget(mnt); - return 1; -} - -static struct vfsmount *au_mnt_get(struct super_block *sb) -{ - int err; - struct path root; - struct au_compare_mnt_args args = { - .sb = sb - }; - - get_fs_root(current->fs, &root); - rcu_read_lock(); - err = iterate_mounts(au_compare_mnt, &args, root.mnt); - rcu_read_unlock(); - path_put(&root); - AuDebugOn(!err); - AuDebugOn(!args.mnt); - return args.mnt; -} - -struct au_nfsd_si_lock { - unsigned int sigen; - aufs_bindex_t bindex, br_id; - unsigned char force_lock; -}; - -static int si_nfsd_read_lock(struct super_block *sb, - struct au_nfsd_si_lock *nsi_lock) -{ - int err; - aufs_bindex_t bindex; - - si_read_lock(sb, AuLock_FLUSH); - - /* branch id may be wrapped around */ - err = 0; - bindex = au_br_index(sb, nsi_lock->br_id); - if (bindex >= 0 && nsi_lock->sigen + AUFS_BRANCH_MAX > au_sigen(sb)) - goto out; /* success */ - - err = -ESTALE; - bindex = -1; - if (!nsi_lock->force_lock) - si_read_unlock(sb); - -out: - nsi_lock->bindex = bindex; - return err; -} - -struct find_name_by_ino { - struct dir_context ctx; - int called, found; - ino_t ino; - char *name; - int namelen; -}; - -static int -find_name_by_ino(struct dir_context *ctx, const char *name, int namelen, - loff_t offset, u64 ino, unsigned int d_type) -{ - struct find_name_by_ino *a = container_of(ctx, struct find_name_by_ino, - ctx); - - a->called++; - if (a->ino != ino) - return 0; - - memcpy(a->name, name, namelen); - a->namelen = namelen; - a->found = 1; - return 1; -} - -static struct dentry *au_lkup_by_ino(struct path *path, ino_t ino, - struct au_nfsd_si_lock *nsi_lock) -{ - struct dentry *dentry, *parent; - struct file *file; - struct inode *dir; - struct find_name_by_ino arg = { - .ctx = { - .actor = find_name_by_ino - } - }; - int err; - - parent = path->dentry; - if (nsi_lock) - si_read_unlock(parent->d_sb); - file = vfsub_dentry_open(path, au_dir_roflags); - dentry = (void *)file; - if (IS_ERR(file)) - goto out; - - dentry = ERR_PTR(-ENOMEM); - arg.name = (void *)__get_free_page(GFP_NOFS); - if (unlikely(!arg.name)) - goto out_file; - arg.ino = ino; - arg.found = 0; - do { - arg.called = 0; - /* smp_mb(); */ - err = vfsub_iterate_dir(file, &arg.ctx); - } while (!err && !arg.found && arg.called); - dentry = ERR_PTR(err); - if (unlikely(err)) - goto out_name; - /* instead of ENOENT */ - dentry = ERR_PTR(-ESTALE); - if (!arg.found) - goto out_name; - - /* do not call vfsub_lkup_one() */ - dir = d_inode(parent); - mutex_lock(&dir->i_mutex); - dentry = vfsub_lookup_one_len(arg.name, parent, arg.namelen); - mutex_unlock(&dir->i_mutex); - AuTraceErrPtr(dentry); - if (IS_ERR(dentry)) - goto out_name; - AuDebugOn(au_test_anon(dentry)); - if (unlikely(d_really_is_negative(dentry))) { - dput(dentry); - dentry = ERR_PTR(-ENOENT); - } - -out_name: - free_page((unsigned long)arg.name); -out_file: - fput(file); -out: - if (unlikely(nsi_lock - && si_nfsd_read_lock(parent->d_sb, nsi_lock) < 0)) - if (!IS_ERR(dentry)) { - dput(dentry); - dentry = ERR_PTR(-ESTALE); - } - AuTraceErrPtr(dentry); - return dentry; -} - -static struct dentry *decode_by_dir_ino(struct super_block *sb, ino_t ino, - ino_t dir_ino, - struct au_nfsd_si_lock *nsi_lock) -{ - struct dentry *dentry; - struct path path; - - if (dir_ino != AUFS_ROOT_INO) { - path.dentry = decode_by_ino(sb, dir_ino, 0); - dentry = path.dentry; - if (!path.dentry || IS_ERR(path.dentry)) - goto out; - AuDebugOn(au_test_anon(path.dentry)); - } else - path.dentry = dget(sb->s_root); - - path.mnt = au_mnt_get(sb); - dentry = au_lkup_by_ino(&path, ino, nsi_lock); - path_put(&path); - -out: - AuTraceErrPtr(dentry); - return dentry; -} - -/* ---------------------------------------------------------------------- */ - -static int h_acceptable(void *expv, struct dentry *dentry) -{ - return 1; -} - -static char *au_build_path(struct dentry *h_parent, struct path *h_rootpath, - char *buf, int len, struct super_block *sb) -{ - char *p; - int n; - struct path path; - - p = d_path(h_rootpath, buf, len); - if (IS_ERR(p)) - goto out; - n = strlen(p); - - path.mnt = h_rootpath->mnt; - path.dentry = h_parent; - p = d_path(&path, buf, len); - if (IS_ERR(p)) - goto out; - if (n != 1) - p += n; - - path.mnt = au_mnt_get(sb); - path.dentry = sb->s_root; - p = d_path(&path, buf, len - strlen(p)); - mntput(path.mnt); - if (IS_ERR(p)) - goto out; - if (n != 1) - p[strlen(p)] = '/'; - -out: - AuTraceErrPtr(p); - return p; -} - -static -struct dentry *decode_by_path(struct super_block *sb, ino_t ino, __u32 *fh, - int fh_len, struct au_nfsd_si_lock *nsi_lock) -{ - struct dentry *dentry, *h_parent, *root; - struct super_block *h_sb; - char *pathname, *p; - struct vfsmount *h_mnt; - struct au_branch *br; - int err; - struct path path; - - br = au_sbr(sb, nsi_lock->bindex); - h_mnt = au_br_mnt(br); - h_sb = h_mnt->mnt_sb; - /* todo: call lower fh_to_dentry()? fh_to_parent()? */ - h_parent = exportfs_decode_fh(h_mnt, (void *)(fh + Fh_tail), - fh_len - Fh_tail, fh[Fh_h_type], - h_acceptable, /*context*/NULL); - dentry = h_parent; - if (unlikely(!h_parent || IS_ERR(h_parent))) { - AuWarn1("%s decode_fh failed, %ld\n", - au_sbtype(h_sb), PTR_ERR(h_parent)); - goto out; - } - dentry = NULL; - if (unlikely(au_test_anon(h_parent))) { - AuWarn1("%s decode_fh returned a disconnected dentry\n", - au_sbtype(h_sb)); - goto out_h_parent; - } - - dentry = ERR_PTR(-ENOMEM); - pathname = (void *)__get_free_page(GFP_NOFS); - if (unlikely(!pathname)) - goto out_h_parent; - - root = sb->s_root; - path.mnt = h_mnt; - di_read_lock_parent(root, !AuLock_IR); - path.dentry = au_h_dptr(root, nsi_lock->bindex); - di_read_unlock(root, !AuLock_IR); - p = au_build_path(h_parent, &path, pathname, PAGE_SIZE, sb); - dentry = (void *)p; - if (IS_ERR(p)) - goto out_pathname; - - si_read_unlock(sb); - err = vfsub_kern_path(p, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &path); - dentry = ERR_PTR(err); - if (unlikely(err)) - goto out_relock; - - dentry = ERR_PTR(-ENOENT); - AuDebugOn(au_test_anon(path.dentry)); - if (unlikely(d_really_is_negative(path.dentry))) - goto out_path; - - if (ino != d_inode(path.dentry)->i_ino) - dentry = au_lkup_by_ino(&path, ino, /*nsi_lock*/NULL); - else - dentry = dget(path.dentry); - -out_path: - path_put(&path); -out_relock: - if (unlikely(si_nfsd_read_lock(sb, nsi_lock) < 0)) - if (!IS_ERR(dentry)) { - dput(dentry); - dentry = ERR_PTR(-ESTALE); - } -out_pathname: - free_page((unsigned long)pathname); -out_h_parent: - dput(h_parent); -out: - AuTraceErrPtr(dentry); - return dentry; -} - -/* ---------------------------------------------------------------------- */ - -static struct dentry * -aufs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, - int fh_type) -{ - struct dentry *dentry; - __u32 *fh = fid->raw; - struct au_branch *br; - ino_t ino, dir_ino; - struct au_nfsd_si_lock nsi_lock = { - .force_lock = 0 - }; - - dentry = ERR_PTR(-ESTALE); - /* it should never happen, but the file handle is unreliable */ - if (unlikely(fh_len < Fh_tail)) - goto out; - nsi_lock.sigen = fh[Fh_sigen]; - nsi_lock.br_id = fh[Fh_br_id]; - - /* branch id may be wrapped around */ - br = NULL; - if (unlikely(si_nfsd_read_lock(sb, &nsi_lock))) - goto out; - nsi_lock.force_lock = 1; - - /* is this inode still cached? */ - ino = decode_ino(fh + Fh_ino); - /* it should never happen */ - if (unlikely(ino == AUFS_ROOT_INO)) - goto out_unlock; - - dir_ino = decode_ino(fh + Fh_dir_ino); - dentry = decode_by_ino(sb, ino, dir_ino); - if (IS_ERR(dentry)) - goto out_unlock; - if (dentry) - goto accept; - - /* is the parent dir cached? */ - br = au_sbr(sb, nsi_lock.bindex); - atomic_inc(&br->br_count); - dentry = decode_by_dir_ino(sb, ino, dir_ino, &nsi_lock); - if (IS_ERR(dentry)) - goto out_unlock; - if (dentry) - goto accept; - - /* lookup path */ - dentry = decode_by_path(sb, ino, fh, fh_len, &nsi_lock); - if (IS_ERR(dentry)) - goto out_unlock; - if (unlikely(!dentry)) - /* todo?: make it ESTALE */ - goto out_unlock; - -accept: - if (!au_digen_test(dentry, au_sigen(sb)) - && d_inode(dentry)->i_generation == fh[Fh_igen]) - goto out_unlock; /* success */ - - dput(dentry); - dentry = ERR_PTR(-ESTALE); -out_unlock: - if (br) - atomic_dec(&br->br_count); - si_read_unlock(sb); -out: - AuTraceErrPtr(dentry); - return dentry; -} - -#if 0 /* reserved for future use */ -/* support subtreecheck option */ -static struct dentry *aufs_fh_to_parent(struct super_block *sb, struct fid *fid, - int fh_len, int fh_type) -{ - struct dentry *parent; - __u32 *fh = fid->raw; - ino_t dir_ino; - - dir_ino = decode_ino(fh + Fh_dir_ino); - parent = decode_by_ino(sb, dir_ino, 0); - if (IS_ERR(parent)) - goto out; - if (!parent) - parent = decode_by_path(sb, au_br_index(sb, fh[Fh_br_id]), - dir_ino, fh, fh_len); - -out: - AuTraceErrPtr(parent); - return parent; -} -#endif - -/* ---------------------------------------------------------------------- */ - -static int aufs_encode_fh(struct inode *inode, __u32 *fh, int *max_len, - struct inode *dir) -{ - int err; - aufs_bindex_t bindex; - struct super_block *sb, *h_sb; - struct dentry *dentry, *parent, *h_parent; - struct inode *h_dir; - struct au_branch *br; - - err = -ENOSPC; - if (unlikely(*max_len <= Fh_tail)) { - AuWarn1("NFSv2 client (max_len %d)?\n", *max_len); - goto out; - } - - err = FILEID_ROOT; - if (inode->i_ino == AUFS_ROOT_INO) { - AuDebugOn(inode->i_ino != AUFS_ROOT_INO); - goto out; - } - - h_parent = NULL; - sb = inode->i_sb; - err = si_read_lock(sb, AuLock_FLUSH); - if (unlikely(err)) - goto out; - -#ifdef CONFIG_AUFS_DEBUG - if (unlikely(!au_opt_test(au_mntflags(sb), XINO))) - AuWarn1("NFS-exporting requires xino\n"); -#endif - err = -EIO; - parent = NULL; - ii_read_lock_child(inode); - bindex = au_ibstart(inode); - if (!dir) { - dentry = d_find_any_alias(inode); - if (unlikely(!dentry)) - goto out_unlock; - AuDebugOn(au_test_anon(dentry)); - parent = dget_parent(dentry); - dput(dentry); - if (unlikely(!parent)) - goto out_unlock; - if (d_really_is_positive(parent)) - dir = d_inode(parent); - } - - ii_read_lock_parent(dir); - h_dir = au_h_iptr(dir, bindex); - ii_read_unlock(dir); - if (unlikely(!h_dir)) - goto out_parent; - h_parent = d_find_any_alias(h_dir); - if (unlikely(!h_parent)) - goto out_hparent; - - err = -EPERM; - br = au_sbr(sb, bindex); - h_sb = au_br_sb(br); - if (unlikely(!h_sb->s_export_op)) { - AuErr1("%s branch is not exportable\n", au_sbtype(h_sb)); - goto out_hparent; - } - - fh[Fh_br_id] = br->br_id; - fh[Fh_sigen] = au_sigen(sb); - encode_ino(fh + Fh_ino, inode->i_ino); - encode_ino(fh + Fh_dir_ino, dir->i_ino); - fh[Fh_igen] = inode->i_generation; - - *max_len -= Fh_tail; - fh[Fh_h_type] = exportfs_encode_fh(h_parent, (void *)(fh + Fh_tail), - max_len, - /*connectable or subtreecheck*/0); - err = fh[Fh_h_type]; - *max_len += Fh_tail; - /* todo: macros? */ - if (err != FILEID_INVALID) - err = 99; - else - AuWarn1("%s encode_fh failed\n", au_sbtype(h_sb)); - -out_hparent: - dput(h_parent); -out_parent: - dput(parent); -out_unlock: - ii_read_unlock(inode); - si_read_unlock(sb); -out: - if (unlikely(err < 0)) - err = FILEID_INVALID; - return err; -} - -/* ---------------------------------------------------------------------- */ - -static int aufs_commit_metadata(struct inode *inode) -{ - int err; - aufs_bindex_t bindex; - struct super_block *sb; - struct inode *h_inode; - int (*f)(struct inode *inode); - - sb = inode->i_sb; - si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW); - ii_write_lock_child(inode); - bindex = au_ibstart(inode); - AuDebugOn(bindex < 0); - h_inode = au_h_iptr(inode, bindex); - - f = h_inode->i_sb->s_export_op->commit_metadata; - if (f) - err = f(h_inode); - else { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = 0 /* metadata only */ - }; - - err = sync_inode(h_inode, &wbc); - } - - au_cpup_attr_timesizes(inode); - ii_write_unlock(inode); - si_read_unlock(sb); - return err; -} - -/* ---------------------------------------------------------------------- */ - -static struct export_operations aufs_export_op = { - .fh_to_dentry = aufs_fh_to_dentry, - /* .fh_to_parent = aufs_fh_to_parent, */ - .encode_fh = aufs_encode_fh, - .commit_metadata = aufs_commit_metadata -}; - -void au_export_init(struct super_block *sb) -{ - struct au_sbinfo *sbinfo; - __u32 u; - - sb->s_export_op = &aufs_export_op; - sbinfo = au_sbi(sb); - sbinfo->si_xigen = NULL; - get_random_bytes(&u, sizeof(u)); - BUILD_BUG_ON(sizeof(u) != sizeof(int)); - atomic_set(&sbinfo->si_xigen_next, u); -} diff --git a/fs/aufs/f_op.c b/fs/aufs/f_op.c deleted file mode 100644 index 145dec870..000000000 --- a/fs/aufs/f_op.c +++ /dev/null @@ -1,759 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * file and vm operations - */ - -#include <linux/aio.h> -#include <linux/fs_stack.h> -#include <linux/mman.h> -#include <linux/security.h> -#include "aufs.h" - -int au_do_open_nondir(struct file *file, int flags, struct file *h_file) -{ - int err; - aufs_bindex_t bindex; - struct dentry *dentry, *h_dentry; - struct au_finfo *finfo; - struct inode *h_inode; - - FiMustWriteLock(file); - - err = 0; - dentry = file->f_path.dentry; - AuDebugOn(IS_ERR_OR_NULL(dentry)); - finfo = au_fi(file); - memset(&finfo->fi_htop, 0, sizeof(finfo->fi_htop)); - atomic_set(&finfo->fi_mmapped, 0); - bindex = au_dbstart(dentry); - if (!h_file) { - h_dentry = au_h_dptr(dentry, bindex); - err = vfsub_test_mntns(file->f_path.mnt, h_dentry->d_sb); - if (unlikely(err)) - goto out; - h_file = au_h_open(dentry, bindex, flags, file, /*force_wr*/0); - } else { - h_dentry = h_file->f_path.dentry; - err = vfsub_test_mntns(file->f_path.mnt, h_dentry->d_sb); - if (unlikely(err)) - goto out; - get_file(h_file); - } - if (IS_ERR(h_file)) - err = PTR_ERR(h_file); - else { - if ((flags & __O_TMPFILE) - && !(flags & O_EXCL)) { - h_inode = file_inode(h_file); - spin_lock(&h_inode->i_lock); - h_inode->i_state |= I_LINKABLE; - spin_unlock(&h_inode->i_lock); - } - au_set_fbstart(file, bindex); - au_set_h_fptr(file, bindex, h_file); - au_update_figen(file); - /* todo: necessary? */ - /* file->f_ra = h_file->f_ra; */ - } - -out: - return err; -} - -static int aufs_open_nondir(struct inode *inode __maybe_unused, - struct file *file) -{ - int err; - struct super_block *sb; - struct au_do_open_args args = { - .open = au_do_open_nondir - }; - - AuDbg("%pD, f_flags 0x%x, f_mode 0x%x\n", - file, vfsub_file_flags(file), file->f_mode); - - sb = file->f_path.dentry->d_sb; - si_read_lock(sb, AuLock_FLUSH); - err = au_do_open(file, &args); - si_read_unlock(sb); - return err; -} - -int aufs_release_nondir(struct inode *inode __maybe_unused, struct file *file) -{ - struct au_finfo *finfo; - aufs_bindex_t bindex; - - finfo = au_fi(file); - au_sphl_del(&finfo->fi_hlist, - &au_sbi(file->f_path.dentry->d_sb)->si_files); - bindex = finfo->fi_btop; - if (bindex >= 0) - au_set_h_fptr(file, bindex, NULL); - - au_finfo_fin(file); - return 0; -} - -/* ---------------------------------------------------------------------- */ - -static int au_do_flush_nondir(struct file *file, fl_owner_t id) -{ - int err; - struct file *h_file; - - err = 0; - h_file = au_hf_top(file); - if (h_file) - err = vfsub_flush(h_file, id); - return err; -} - -static int aufs_flush_nondir(struct file *file, fl_owner_t id) -{ - return au_do_flush(file, id, au_do_flush_nondir); -} - -/* ---------------------------------------------------------------------- */ -/* - * read and write functions acquire [fdi]_rwsem once, but release before - * mmap_sem. This is because to stop a race condition between mmap(2). - * Releasing these aufs-rwsem should be safe, no branch-mamagement (by keeping - * si_rwsem), no harmful copy-up should happen. Actually copy-up may happen in - * read functions after [fdi]_rwsem are released, but it should be harmless. - */ - -/* Callers should call au_read_post() or fput() in the end */ -struct file *au_read_pre(struct file *file, int keep_fi) -{ - struct file *h_file; - int err; - - err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/0); - if (!err) { - di_read_unlock(file->f_path.dentry, AuLock_IR); - h_file = au_hf_top(file); - get_file(h_file); - if (!keep_fi) - fi_read_unlock(file); - } else - h_file = ERR_PTR(err); - - return h_file; -} - -static void au_read_post(struct inode *inode, struct file *h_file) -{ - /* update without lock, I don't think it a problem */ - fsstack_copy_attr_atime(inode, file_inode(h_file)); - fput(h_file); -} - -struct au_write_pre { - blkcnt_t blks; - aufs_bindex_t bstart; -}; - -/* - * return with iinfo is write-locked - * callers should call au_write_post() or iinfo_write_unlock() + fput() in the - * end - */ -static struct file *au_write_pre(struct file *file, int do_ready, - struct au_write_pre *wpre) -{ - struct file *h_file; - struct dentry *dentry; - int err; - struct au_pin pin; - - err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1); - h_file = ERR_PTR(err); - if (unlikely(err)) - goto out; - - dentry = file->f_path.dentry; - if (do_ready) { - err = au_ready_to_write(file, -1, &pin); - if (unlikely(err)) { - h_file = ERR_PTR(err); - di_write_unlock(dentry); - goto out_fi; - } - } - - di_downgrade_lock(dentry, /*flags*/0); - if (wpre) - wpre->bstart = au_fbstart(file); - h_file = au_hf_top(file); - get_file(h_file); - if (wpre) - wpre->blks = file_inode(h_file)->i_blocks; - if (do_ready) - au_unpin(&pin); - di_read_unlock(dentry, /*flags*/0); - -out_fi: - fi_write_unlock(file); -out: - return h_file; -} - -static void au_write_post(struct inode *inode, struct file *h_file, - struct au_write_pre *wpre, ssize_t written) -{ - struct inode *h_inode; - - au_cpup_attr_timesizes(inode); - AuDebugOn(au_ibstart(inode) != wpre->bstart); - h_inode = file_inode(h_file); - inode->i_mode = h_inode->i_mode; - ii_write_unlock(inode); - fput(h_file); - - /* AuDbg("blks %llu, %llu\n", (u64)blks, (u64)h_inode->i_blocks); */ - if (written > 0) - au_fhsm_wrote(inode->i_sb, wpre->bstart, - /*force*/h_inode->i_blocks > wpre->blks); -} - -static ssize_t aufs_read(struct file *file, char __user *buf, size_t count, - loff_t *ppos) -{ - ssize_t err; - struct inode *inode; - struct file *h_file; - struct super_block *sb; - - inode = file_inode(file); - sb = inode->i_sb; - si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW); - - h_file = au_read_pre(file, /*keep_fi*/0); - err = PTR_ERR(h_file); - if (IS_ERR(h_file)) - goto out; - - /* filedata may be obsoleted by concurrent copyup, but no problem */ - err = vfsub_read_u(h_file, buf, count, ppos); - /* todo: necessary? */ - /* file->f_ra = h_file->f_ra; */ - au_read_post(inode, h_file); - -out: - si_read_unlock(sb); - return err; -} - -/* - * todo: very ugly - * it locks both of i_mutex and si_rwsem for read in safe. - * if the plink maintenance mode continues forever (that is the problem), - * may loop forever. - */ -static void au_mtx_and_read_lock(struct inode *inode) -{ - int err; - struct super_block *sb = inode->i_sb; - - while (1) { - mutex_lock(&inode->i_mutex); - err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM); - if (!err) - break; - mutex_unlock(&inode->i_mutex); - si_read_lock(sb, AuLock_NOPLMW); - si_read_unlock(sb); - } -} - -static ssize_t aufs_write(struct file *file, const char __user *ubuf, - size_t count, loff_t *ppos) -{ - ssize_t err; - struct au_write_pre wpre; - struct inode *inode; - struct file *h_file; - char __user *buf = (char __user *)ubuf; - - inode = file_inode(file); - au_mtx_and_read_lock(inode); - - h_file = au_write_pre(file, /*do_ready*/1, &wpre); - err = PTR_ERR(h_file); - if (IS_ERR(h_file)) - goto out; - - err = vfsub_write_u(h_file, buf, count, ppos); - au_write_post(inode, h_file, &wpre, err); - -out: - si_read_unlock(inode->i_sb); - mutex_unlock(&inode->i_mutex); - return err; -} - -static ssize_t au_do_iter(struct file *h_file, int rw, struct kiocb *kio, - struct iov_iter *iov_iter) -{ - ssize_t err; - struct file *file; - ssize_t (*iter)(struct kiocb *, struct iov_iter *); - - err = security_file_permission(h_file, rw); - if (unlikely(err)) - goto out; - - err = -ENOSYS; - iter = NULL; - if (rw == MAY_READ) - iter = h_file->f_op->read_iter; - else if (rw == MAY_WRITE) - iter = h_file->f_op->write_iter; - - file = kio->ki_filp; - kio->ki_filp = h_file; - if (iter) { - lockdep_off(); - err = iter(kio, iov_iter); - lockdep_on(); - } else - /* currently there is no such fs */ - WARN_ON_ONCE(1); - kio->ki_filp = file; - -out: - return err; -} - -static ssize_t aufs_read_iter(struct kiocb *kio, struct iov_iter *iov_iter) -{ - ssize_t err; - struct file *file, *h_file; - struct inode *inode; - struct super_block *sb; - - file = kio->ki_filp; - inode = file_inode(file); - sb = inode->i_sb; - si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW); - - h_file = au_read_pre(file, /*keep_fi*/0); - err = PTR_ERR(h_file); - if (IS_ERR(h_file)) - goto out; - - err = au_do_iter(h_file, MAY_READ, kio, iov_iter); - /* todo: necessary? */ - /* file->f_ra = h_file->f_ra; */ - au_read_post(inode, h_file); - -out: - si_read_unlock(sb); - return err; -} - -static ssize_t aufs_write_iter(struct kiocb *kio, struct iov_iter *iov_iter) -{ - ssize_t err; - struct au_write_pre wpre; - struct inode *inode; - struct file *file, *h_file; - - file = kio->ki_filp; - inode = file_inode(file); - au_mtx_and_read_lock(inode); - - h_file = au_write_pre(file, /*do_ready*/1, &wpre); - err = PTR_ERR(h_file); - if (IS_ERR(h_file)) - goto out; - - err = au_do_iter(h_file, MAY_WRITE, kio, iov_iter); - au_write_post(inode, h_file, &wpre, err); - -out: - si_read_unlock(inode->i_sb); - mutex_unlock(&inode->i_mutex); - return err; -} - -static ssize_t aufs_splice_read(struct file *file, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) -{ - ssize_t err; - struct file *h_file; - struct inode *inode; - struct super_block *sb; - - inode = file_inode(file); - sb = inode->i_sb; - si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW); - - h_file = au_read_pre(file, /*keep_fi*/1); - err = PTR_ERR(h_file); - if (IS_ERR(h_file)) - goto out; - - if (au_test_loopback_kthread()) { - au_warn_loopback(h_file->f_path.dentry->d_sb); - if (file->f_mapping != h_file->f_mapping) { - file->f_mapping = h_file->f_mapping; - smp_mb(); /* unnecessary? */ - } - } - fi_read_unlock(file); - - err = vfsub_splice_to(h_file, ppos, pipe, len, flags); - /* todo: necessasry? */ - /* file->f_ra = h_file->f_ra; */ - au_read_post(inode, h_file); - -out: - si_read_unlock(sb); - return err; -} - -static ssize_t -aufs_splice_write(struct pipe_inode_info *pipe, struct file *file, loff_t *ppos, - size_t len, unsigned int flags) -{ - ssize_t err; - struct au_write_pre wpre; - struct inode *inode; - struct file *h_file; - - inode = file_inode(file); - au_mtx_and_read_lock(inode); - - h_file = au_write_pre(file, /*do_ready*/1, &wpre); - err = PTR_ERR(h_file); - if (IS_ERR(h_file)) - goto out; - - err = vfsub_splice_from(pipe, h_file, ppos, len, flags); - au_write_post(inode, h_file, &wpre, err); - -out: - si_read_unlock(inode->i_sb); - mutex_unlock(&inode->i_mutex); - return err; -} - -static long aufs_fallocate(struct file *file, int mode, loff_t offset, - loff_t len) -{ - long err; - struct au_write_pre wpre; - struct inode *inode; - struct file *h_file; - - inode = file_inode(file); - au_mtx_and_read_lock(inode); - - h_file = au_write_pre(file, /*do_ready*/1, &wpre); - err = PTR_ERR(h_file); - if (IS_ERR(h_file)) - goto out; - - lockdep_off(); - err = vfs_fallocate(h_file, mode, offset, len); - lockdep_on(); - au_write_post(inode, h_file, &wpre, /*written*/1); - -out: - si_read_unlock(inode->i_sb); - mutex_unlock(&inode->i_mutex); - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* - * The locking order around current->mmap_sem. - * - in most and regular cases - * file I/O syscall -- aufs_read() or something - * -- si_rwsem for read -- mmap_sem - * (Note that [fdi]i_rwsem are released before mmap_sem). - * - in mmap case - * mmap(2) -- mmap_sem -- aufs_mmap() -- si_rwsem for read -- [fdi]i_rwsem - * This AB-BA order is definitly bad, but is not a problem since "si_rwsem for - * read" allows muliple processes to acquire it and [fdi]i_rwsem are not held in - * file I/O. Aufs needs to stop lockdep in aufs_mmap() though. - * It means that when aufs acquires si_rwsem for write, the process should never - * acquire mmap_sem. - * - * Actually aufs_iterate() holds [fdi]i_rwsem before mmap_sem, but this is not a - * problem either since any directory is not able to be mmap-ed. - * The similar scenario is applied to aufs_readlink() too. - */ - -#if 0 /* stop calling security_file_mmap() */ -/* cf. linux/include/linux/mman.h: calc_vm_prot_bits() */ -#define AuConv_VM_PROT(f, b) _calc_vm_trans(f, VM_##b, PROT_##b) - -static unsigned long au_arch_prot_conv(unsigned long flags) -{ - /* currently ppc64 only */ -#ifdef CONFIG_PPC64 - /* cf. linux/arch/powerpc/include/asm/mman.h */ - AuDebugOn(arch_calc_vm_prot_bits(-1) != VM_SAO); - return AuConv_VM_PROT(flags, SAO); -#else - AuDebugOn(arch_calc_vm_prot_bits(-1)); - return 0; -#endif -} - -static unsigned long au_prot_conv(unsigned long flags) -{ - return AuConv_VM_PROT(flags, READ) - | AuConv_VM_PROT(flags, WRITE) - | AuConv_VM_PROT(flags, EXEC) - | au_arch_prot_conv(flags); -} - -/* cf. linux/include/linux/mman.h: calc_vm_flag_bits() */ -#define AuConv_VM_MAP(f, b) _calc_vm_trans(f, VM_##b, MAP_##b) - -static unsigned long au_flag_conv(unsigned long flags) -{ - return AuConv_VM_MAP(flags, GROWSDOWN) - | AuConv_VM_MAP(flags, DENYWRITE) - | AuConv_VM_MAP(flags, LOCKED); -} -#endif - -static int aufs_mmap(struct file *file, struct vm_area_struct *vma) -{ - int err; - const unsigned char wlock - = (file->f_mode & FMODE_WRITE) && (vma->vm_flags & VM_SHARED); - struct super_block *sb; - struct file *h_file; - struct inode *inode; - - AuDbgVmRegion(file, vma); - - inode = file_inode(file); - sb = inode->i_sb; - lockdep_off(); - si_read_lock(sb, AuLock_NOPLMW); - - h_file = au_write_pre(file, wlock, /*wpre*/NULL); - lockdep_on(); - err = PTR_ERR(h_file); - if (IS_ERR(h_file)) - goto out; - - err = 0; - au_set_mmapped(file); - au_vm_file_reset(vma, h_file); - /* - * we cannot call security_mmap_file() here since it may acquire - * mmap_sem or i_mutex. - * - * err = security_mmap_file(h_file, au_prot_conv(vma->vm_flags), - * au_flag_conv(vma->vm_flags)); - */ - if (!err) - err = h_file->f_op->mmap(h_file, vma); - if (!err) { - au_vm_prfile_set(vma, file); - fsstack_copy_attr_atime(inode, file_inode(h_file)); - goto out_fput; /* success */ - } - au_unset_mmapped(file); - au_vm_file_reset(vma, file); - -out_fput: - lockdep_off(); - ii_write_unlock(inode); - lockdep_on(); - fput(h_file); -out: - lockdep_off(); - si_read_unlock(sb); - lockdep_on(); - AuTraceErr(err); - return err; -} - -/* ---------------------------------------------------------------------- */ - -static int aufs_fsync_nondir(struct file *file, loff_t start, loff_t end, - int datasync) -{ - int err; - struct au_write_pre wpre; - struct inode *inode; - struct file *h_file; - - err = 0; /* -EBADF; */ /* posix? */ - if (unlikely(!(file->f_mode & FMODE_WRITE))) - goto out; - - inode = file_inode(file); - au_mtx_and_read_lock(inode); - - h_file = au_write_pre(file, /*do_ready*/1, &wpre); - err = PTR_ERR(h_file); - if (IS_ERR(h_file)) - goto out_unlock; - - err = vfsub_fsync(h_file, &h_file->f_path, datasync); - au_write_post(inode, h_file, &wpre, /*written*/0); - -out_unlock: - si_read_unlock(inode->i_sb); - mutex_unlock(&inode->i_mutex); -out: - return err; -} - -/* no one supports this operation, currently */ -#if 0 -static int aufs_aio_fsync_nondir(struct kiocb *kio, int datasync) -{ - int err; - struct au_write_pre wpre; - struct inode *inode; - struct file *file, *h_file; - - err = 0; /* -EBADF; */ /* posix? */ - if (unlikely(!(file->f_mode & FMODE_WRITE))) - goto out; - - file = kio->ki_filp; - inode = file_inode(file); - au_mtx_and_read_lock(inode); - - h_file = au_write_pre(file, /*do_ready*/1, &wpre); - err = PTR_ERR(h_file); - if (IS_ERR(h_file)) - goto out_unlock; - - err = -ENOSYS; - h_file = au_hf_top(file); - if (h_file->f_op->aio_fsync) { - struct mutex *h_mtx; - - h_mtx = &file_inode(h_file)->i_mutex; - if (!is_sync_kiocb(kio)) { - get_file(h_file); - fput(file); - } - kio->ki_filp = h_file; - err = h_file->f_op->aio_fsync(kio, datasync); - mutex_lock_nested(h_mtx, AuLsc_I_CHILD); - if (!err) - vfsub_update_h_iattr(&h_file->f_path, /*did*/NULL); - /*ignore*/ - mutex_unlock(h_mtx); - } - au_write_post(inode, h_file, &wpre, /*written*/0); - -out_unlock: - si_read_unlock(inode->sb); - mutex_unlock(&inode->i_mutex); -out: - return err; -} -#endif - -static int aufs_fasync(int fd, struct file *file, int flag) -{ - int err; - struct file *h_file; - struct super_block *sb; - - sb = file->f_path.dentry->d_sb; - si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW); - - h_file = au_read_pre(file, /*keep_fi*/0); - err = PTR_ERR(h_file); - if (IS_ERR(h_file)) - goto out; - - if (h_file->f_op->fasync) - err = h_file->f_op->fasync(fd, h_file, flag); - fput(h_file); /* instead of au_read_post() */ - -out: - si_read_unlock(sb); - return err; -} - -static int aufs_setfl(struct file *file, unsigned long arg) -{ - int err; - struct file *h_file; - struct super_block *sb; - - sb = file->f_path.dentry->d_sb; - si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW); - - h_file = au_read_pre(file, /*keep_fi*/0); - err = PTR_ERR(h_file); - if (IS_ERR(h_file)) - goto out; - - arg |= vfsub_file_flags(file) & FASYNC; /* stop calling h_file->fasync */ - err = setfl(/*unused fd*/-1, h_file, arg); - fput(h_file); /* instead of au_read_post() */ - -out: - si_read_unlock(sb); - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* no one supports this operation, currently */ -#if 0 -static ssize_t aufs_sendpage(struct file *file, struct page *page, int offset, - size_t len, loff_t *pos, int more) -{ -} -#endif - -/* ---------------------------------------------------------------------- */ - -const struct file_operations aufs_file_fop = { - .owner = THIS_MODULE, - - .llseek = default_llseek, - - .read = aufs_read, - .write = aufs_write, - .read_iter = aufs_read_iter, - .write_iter = aufs_write_iter, - -#ifdef CONFIG_AUFS_POLL - .poll = aufs_poll, -#endif - .unlocked_ioctl = aufs_ioctl_nondir, -#ifdef CONFIG_COMPAT - .compat_ioctl = aufs_compat_ioctl_nondir, -#endif - .mmap = aufs_mmap, - .open = aufs_open_nondir, - .flush = aufs_flush_nondir, - .release = aufs_release_nondir, - .fsync = aufs_fsync_nondir, - /* .aio_fsync = aufs_aio_fsync_nondir, */ - .fasync = aufs_fasync, - /* .sendpage = aufs_sendpage, */ - .setfl = aufs_setfl, - .splice_write = aufs_splice_write, - .splice_read = aufs_splice_read, -#if 0 - .aio_splice_write = aufs_aio_splice_write, - .aio_splice_read = aufs_aio_splice_read, -#endif - .fallocate = aufs_fallocate -}; diff --git a/fs/aufs/fhsm.c b/fs/aufs/fhsm.c deleted file mode 100644 index db079d6ee..000000000 --- a/fs/aufs/fhsm.c +++ /dev/null @@ -1,412 +0,0 @@ -/* - * Copyright (C) 2011-2016 Junjiro R. Okajima - */ - -/* - * File-based Hierarchy Storage Management - */ - -#include <linux/anon_inodes.h> -#include <linux/poll.h> -#include <linux/seq_file.h> -#include <linux/statfs.h> -#include "aufs.h" - -static aufs_bindex_t au_fhsm_bottom(struct super_block *sb) -{ - struct au_sbinfo *sbinfo; - struct au_fhsm *fhsm; - - SiMustAnyLock(sb); - - sbinfo = au_sbi(sb); - fhsm = &sbinfo->si_fhsm; - AuDebugOn(!fhsm); - return fhsm->fhsm_bottom; -} - -void au_fhsm_set_bottom(struct super_block *sb, aufs_bindex_t bindex) -{ - struct au_sbinfo *sbinfo; - struct au_fhsm *fhsm; - - SiMustWriteLock(sb); - - sbinfo = au_sbi(sb); - fhsm = &sbinfo->si_fhsm; - AuDebugOn(!fhsm); - fhsm->fhsm_bottom = bindex; -} - -/* ---------------------------------------------------------------------- */ - -static int au_fhsm_test_jiffy(struct au_sbinfo *sbinfo, struct au_branch *br) -{ - struct au_br_fhsm *bf; - - bf = br->br_fhsm; - MtxMustLock(&bf->bf_lock); - - return !bf->bf_readable - || time_after(jiffies, - bf->bf_jiffy + sbinfo->si_fhsm.fhsm_expire); -} - -/* ---------------------------------------------------------------------- */ - -static void au_fhsm_notify(struct super_block *sb, int val) -{ - struct au_sbinfo *sbinfo; - struct au_fhsm *fhsm; - - SiMustAnyLock(sb); - - sbinfo = au_sbi(sb); - fhsm = &sbinfo->si_fhsm; - if (au_fhsm_pid(fhsm) - && atomic_read(&fhsm->fhsm_readable) != -1) { - atomic_set(&fhsm->fhsm_readable, val); - if (val) - wake_up(&fhsm->fhsm_wqh); - } -} - -static int au_fhsm_stfs(struct super_block *sb, aufs_bindex_t bindex, - struct aufs_stfs *rstfs, int do_lock, int do_notify) -{ - int err; - struct au_branch *br; - struct au_br_fhsm *bf; - - br = au_sbr(sb, bindex); - AuDebugOn(au_br_rdonly(br)); - bf = br->br_fhsm; - AuDebugOn(!bf); - - if (do_lock) - mutex_lock(&bf->bf_lock); - else - MtxMustLock(&bf->bf_lock); - - /* sb->s_root for NFS is unreliable */ - err = au_br_stfs(br, &bf->bf_stfs); - if (unlikely(err)) { - AuErr1("FHSM failed (%d), b%d, ignored.\n", bindex, err); - goto out; - } - - bf->bf_jiffy = jiffies; - bf->bf_readable = 1; - if (do_notify) - au_fhsm_notify(sb, /*val*/1); - if (rstfs) - *rstfs = bf->bf_stfs; - -out: - if (do_lock) - mutex_unlock(&bf->bf_lock); - au_fhsm_notify(sb, /*val*/1); - - return err; -} - -void au_fhsm_wrote(struct super_block *sb, aufs_bindex_t bindex, int force) -{ - int err; - struct au_sbinfo *sbinfo; - struct au_fhsm *fhsm; - struct au_branch *br; - struct au_br_fhsm *bf; - - AuDbg("b%d, force %d\n", bindex, force); - SiMustAnyLock(sb); - - sbinfo = au_sbi(sb); - fhsm = &sbinfo->si_fhsm; - if (!au_ftest_si(sbinfo, FHSM) - || fhsm->fhsm_bottom == bindex) - return; - - br = au_sbr(sb, bindex); - bf = br->br_fhsm; - AuDebugOn(!bf); - mutex_lock(&bf->bf_lock); - if (force - || au_fhsm_pid(fhsm) - || au_fhsm_test_jiffy(sbinfo, br)) - err = au_fhsm_stfs(sb, bindex, /*rstfs*/NULL, /*do_lock*/0, - /*do_notify*/1); - mutex_unlock(&bf->bf_lock); -} - -void au_fhsm_wrote_all(struct super_block *sb, int force) -{ - aufs_bindex_t bindex, bend; - struct au_branch *br; - - /* exclude the bottom */ - bend = au_fhsm_bottom(sb); - for (bindex = 0; bindex < bend; bindex++) { - br = au_sbr(sb, bindex); - if (au_br_fhsm(br->br_perm)) - au_fhsm_wrote(sb, bindex, force); - } -} - -/* ---------------------------------------------------------------------- */ - -static unsigned int au_fhsm_poll(struct file *file, - struct poll_table_struct *wait) -{ - unsigned int mask; - struct au_sbinfo *sbinfo; - struct au_fhsm *fhsm; - - mask = 0; - sbinfo = file->private_data; - fhsm = &sbinfo->si_fhsm; - poll_wait(file, &fhsm->fhsm_wqh, wait); - if (atomic_read(&fhsm->fhsm_readable)) - mask = POLLIN /* | POLLRDNORM */; - - AuTraceErr((int)mask); - return mask; -} - -static int au_fhsm_do_read_one(struct aufs_stbr __user *stbr, - struct aufs_stfs *stfs, __s16 brid) -{ - int err; - - err = copy_to_user(&stbr->stfs, stfs, sizeof(*stfs)); - if (!err) - err = __put_user(brid, &stbr->brid); - if (unlikely(err)) - err = -EFAULT; - - return err; -} - -static ssize_t au_fhsm_do_read(struct super_block *sb, - struct aufs_stbr __user *stbr, size_t count) -{ - ssize_t err; - int nstbr; - aufs_bindex_t bindex, bend; - struct au_branch *br; - struct au_br_fhsm *bf; - - /* except the bottom branch */ - err = 0; - nstbr = 0; - bend = au_fhsm_bottom(sb); - for (bindex = 0; !err && bindex < bend; bindex++) { - br = au_sbr(sb, bindex); - if (!au_br_fhsm(br->br_perm)) - continue; - - bf = br->br_fhsm; - mutex_lock(&bf->bf_lock); - if (bf->bf_readable) { - err = -EFAULT; - if (count >= sizeof(*stbr)) - err = au_fhsm_do_read_one(stbr++, &bf->bf_stfs, - br->br_id); - if (!err) { - bf->bf_readable = 0; - count -= sizeof(*stbr); - nstbr++; - } - } - mutex_unlock(&bf->bf_lock); - } - if (!err) - err = sizeof(*stbr) * nstbr; - - return err; -} - -static ssize_t au_fhsm_read(struct file *file, char __user *buf, size_t count, - loff_t *pos) -{ - ssize_t err; - int readable; - aufs_bindex_t nfhsm, bindex, bend; - struct au_sbinfo *sbinfo; - struct au_fhsm *fhsm; - struct au_branch *br; - struct super_block *sb; - - err = 0; - sbinfo = file->private_data; - fhsm = &sbinfo->si_fhsm; -need_data: - spin_lock_irq(&fhsm->fhsm_wqh.lock); - if (!atomic_read(&fhsm->fhsm_readable)) { - if (vfsub_file_flags(file) & O_NONBLOCK) - err = -EAGAIN; - else - err = wait_event_interruptible_locked_irq - (fhsm->fhsm_wqh, - atomic_read(&fhsm->fhsm_readable)); - } - spin_unlock_irq(&fhsm->fhsm_wqh.lock); - if (unlikely(err)) - goto out; - - /* sb may already be dead */ - au_rw_read_lock(&sbinfo->si_rwsem); - readable = atomic_read(&fhsm->fhsm_readable); - if (readable > 0) { - sb = sbinfo->si_sb; - AuDebugOn(!sb); - /* exclude the bottom branch */ - nfhsm = 0; - bend = au_fhsm_bottom(sb); - for (bindex = 0; bindex < bend; bindex++) { - br = au_sbr(sb, bindex); - if (au_br_fhsm(br->br_perm)) - nfhsm++; - } - err = -EMSGSIZE; - if (nfhsm * sizeof(struct aufs_stbr) <= count) { - atomic_set(&fhsm->fhsm_readable, 0); - err = au_fhsm_do_read(sbinfo->si_sb, (void __user *)buf, - count); - } - } - au_rw_read_unlock(&sbinfo->si_rwsem); - if (!readable) - goto need_data; - -out: - return err; -} - -static int au_fhsm_release(struct inode *inode, struct file *file) -{ - struct au_sbinfo *sbinfo; - struct au_fhsm *fhsm; - - /* sb may already be dead */ - sbinfo = file->private_data; - fhsm = &sbinfo->si_fhsm; - spin_lock(&fhsm->fhsm_spin); - fhsm->fhsm_pid = 0; - spin_unlock(&fhsm->fhsm_spin); - kobject_put(&sbinfo->si_kobj); - - return 0; -} - -static const struct file_operations au_fhsm_fops = { - .owner = THIS_MODULE, - .llseek = noop_llseek, - .read = au_fhsm_read, - .poll = au_fhsm_poll, - .release = au_fhsm_release -}; - -int au_fhsm_fd(struct super_block *sb, int oflags) -{ - int err, fd; - struct au_sbinfo *sbinfo; - struct au_fhsm *fhsm; - - err = -EPERM; - if (unlikely(!capable(CAP_SYS_ADMIN))) - goto out; - - err = -EINVAL; - if (unlikely(oflags & ~(O_CLOEXEC | O_NONBLOCK))) - goto out; - - err = 0; - sbinfo = au_sbi(sb); - fhsm = &sbinfo->si_fhsm; - spin_lock(&fhsm->fhsm_spin); - if (!fhsm->fhsm_pid) - fhsm->fhsm_pid = current->pid; - else - err = -EBUSY; - spin_unlock(&fhsm->fhsm_spin); - if (unlikely(err)) - goto out; - - oflags |= O_RDONLY; - /* oflags |= FMODE_NONOTIFY; */ - fd = anon_inode_getfd("[aufs_fhsm]", &au_fhsm_fops, sbinfo, oflags); - err = fd; - if (unlikely(fd < 0)) - goto out_pid; - - /* succeed reglardless 'fhsm' status */ - kobject_get(&sbinfo->si_kobj); - si_noflush_read_lock(sb); - if (au_ftest_si(sbinfo, FHSM)) - au_fhsm_wrote_all(sb, /*force*/0); - si_read_unlock(sb); - goto out; /* success */ - -out_pid: - spin_lock(&fhsm->fhsm_spin); - fhsm->fhsm_pid = 0; - spin_unlock(&fhsm->fhsm_spin); -out: - AuTraceErr(err); - return err; -} - -/* ---------------------------------------------------------------------- */ - -int au_fhsm_br_alloc(struct au_branch *br) -{ - int err; - - err = 0; - br->br_fhsm = kmalloc(sizeof(*br->br_fhsm), GFP_NOFS); - if (br->br_fhsm) - au_br_fhsm_init(br->br_fhsm); - else - err = -ENOMEM; - - return err; -} - -/* ---------------------------------------------------------------------- */ - -void au_fhsm_fin(struct super_block *sb) -{ - au_fhsm_notify(sb, /*val*/-1); -} - -void au_fhsm_init(struct au_sbinfo *sbinfo) -{ - struct au_fhsm *fhsm; - - fhsm = &sbinfo->si_fhsm; - spin_lock_init(&fhsm->fhsm_spin); - init_waitqueue_head(&fhsm->fhsm_wqh); - atomic_set(&fhsm->fhsm_readable, 0); - fhsm->fhsm_expire - = msecs_to_jiffies(AUFS_FHSM_CACHE_DEF_SEC * MSEC_PER_SEC); - fhsm->fhsm_bottom = -1; -} - -void au_fhsm_set(struct au_sbinfo *sbinfo, unsigned int sec) -{ - sbinfo->si_fhsm.fhsm_expire - = msecs_to_jiffies(sec * MSEC_PER_SEC); -} - -void au_fhsm_show(struct seq_file *seq, struct au_sbinfo *sbinfo) -{ - unsigned int u; - - if (!au_ftest_si(sbinfo, FHSM)) - return; - - u = jiffies_to_msecs(sbinfo->si_fhsm.fhsm_expire) / MSEC_PER_SEC; - if (u != AUFS_FHSM_CACHE_DEF_SEC) - seq_printf(seq, ",fhsm_sec=%u", u); -} diff --git a/fs/aufs/file.c b/fs/aufs/file.c deleted file mode 100644 index 6b8a66b4a..000000000 --- a/fs/aufs/file.c +++ /dev/null @@ -1,831 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * handling file/dir, and address_space operation - */ - -#ifdef CONFIG_AUFS_DEBUG -#include <linux/migrate.h> -#endif -#include <linux/pagemap.h> -#include "aufs.h" - -/* drop flags for writing */ -unsigned int au_file_roflags(unsigned int flags) -{ - flags &= ~(O_WRONLY | O_RDWR | O_APPEND | O_CREAT | O_TRUNC); - flags |= O_RDONLY | O_NOATIME; - return flags; -} - -/* common functions to regular file and dir */ -struct file *au_h_open(struct dentry *dentry, aufs_bindex_t bindex, int flags, - struct file *file, int force_wr) -{ - struct file *h_file; - struct dentry *h_dentry; - struct inode *h_inode; - struct super_block *sb; - struct au_branch *br; - struct path h_path; - int err; - - /* a race condition can happen between open and unlink/rmdir */ - h_file = ERR_PTR(-ENOENT); - h_dentry = au_h_dptr(dentry, bindex); - if (au_test_nfsd() && (!h_dentry || d_is_negative(h_dentry))) - goto out; - h_inode = d_inode(h_dentry); - spin_lock(&h_dentry->d_lock); - err = (!d_unhashed(dentry) && d_unlinked(h_dentry)) - /* || !d_inode(dentry)->i_nlink */ - ; - spin_unlock(&h_dentry->d_lock); - if (unlikely(err)) - goto out; - - sb = dentry->d_sb; - br = au_sbr(sb, bindex); - err = au_br_test_oflag(flags, br); - h_file = ERR_PTR(err); - if (unlikely(err)) - goto out; - - /* drop flags for writing */ - if (au_test_ro(sb, bindex, d_inode(dentry))) { - if (force_wr && !(flags & O_WRONLY)) - force_wr = 0; - flags = au_file_roflags(flags); - if (force_wr) { - h_file = ERR_PTR(-EROFS); - flags = au_file_roflags(flags); - if (unlikely(vfsub_native_ro(h_inode) - || IS_APPEND(h_inode))) - goto out; - flags &= ~O_ACCMODE; - flags |= O_WRONLY; - } - } - flags &= ~O_CREAT; - atomic_inc(&br->br_count); - h_path.dentry = h_dentry; - h_path.mnt = au_br_mnt(br); - h_file = vfsub_dentry_open(&h_path, flags); - if (IS_ERR(h_file)) - goto out_br; - - if (flags & __FMODE_EXEC) { - err = deny_write_access(h_file); - if (unlikely(err)) { - fput(h_file); - h_file = ERR_PTR(err); - goto out_br; - } - } - fsnotify_open(h_file); - goto out; /* success */ - -out_br: - atomic_dec(&br->br_count); -out: - return h_file; -} - -static int au_cmoo(struct dentry *dentry) -{ - int err, cmoo; - unsigned int udba; - struct path h_path; - struct au_pin pin; - struct au_cp_generic cpg = { - .dentry = dentry, - .bdst = -1, - .bsrc = -1, - .len = -1, - .pin = &pin, - .flags = AuCpup_DTIME | AuCpup_HOPEN - }; - struct inode *delegated; - struct super_block *sb; - struct au_sbinfo *sbinfo; - struct au_fhsm *fhsm; - pid_t pid; - struct au_branch *br; - struct dentry *parent; - struct au_hinode *hdir; - - DiMustWriteLock(dentry); - IiMustWriteLock(d_inode(dentry)); - - err = 0; - if (IS_ROOT(dentry)) - goto out; - cpg.bsrc = au_dbstart(dentry); - if (!cpg.bsrc) - goto out; - - sb = dentry->d_sb; - sbinfo = au_sbi(sb); - fhsm = &sbinfo->si_fhsm; - pid = au_fhsm_pid(fhsm); - if (pid - && (current->pid == pid - || current->real_parent->pid == pid)) - goto out; - - br = au_sbr(sb, cpg.bsrc); - cmoo = au_br_cmoo(br->br_perm); - if (!cmoo) - goto out; - if (!d_is_reg(dentry)) - cmoo &= AuBrAttr_COO_ALL; - if (!cmoo) - goto out; - - parent = dget_parent(dentry); - di_write_lock_parent(parent); - err = au_wbr_do_copyup_bu(dentry, cpg.bsrc - 1); - cpg.bdst = err; - if (unlikely(err < 0)) { - err = 0; /* there is no upper writable branch */ - goto out_dgrade; - } - AuDbg("bsrc %d, bdst %d\n", cpg.bsrc, cpg.bdst); - - /* do not respect the coo attrib for the target branch */ - err = au_cpup_dirs(dentry, cpg.bdst); - if (unlikely(err)) - goto out_dgrade; - - di_downgrade_lock(parent, AuLock_IR); - udba = au_opt_udba(sb); - err = au_pin(&pin, dentry, cpg.bdst, udba, - AuPin_DI_LOCKED | AuPin_MNT_WRITE); - if (unlikely(err)) - goto out_parent; - - err = au_sio_cpup_simple(&cpg); - au_unpin(&pin); - if (unlikely(err)) - goto out_parent; - if (!(cmoo & AuBrWAttr_MOO)) - goto out_parent; /* success */ - - err = au_pin(&pin, dentry, cpg.bsrc, udba, - AuPin_DI_LOCKED | AuPin_MNT_WRITE); - if (unlikely(err)) - goto out_parent; - - h_path.mnt = au_br_mnt(br); - h_path.dentry = au_h_dptr(dentry, cpg.bsrc); - hdir = au_hi(d_inode(parent), cpg.bsrc); - delegated = NULL; - err = vfsub_unlink(hdir->hi_inode, &h_path, &delegated, /*force*/1); - au_unpin(&pin); - /* todo: keep h_dentry or not? */ - if (unlikely(err == -EWOULDBLOCK)) { - pr_warn("cannot retry for NFSv4 delegation" - " for an internal unlink\n"); - iput(delegated); - } - if (unlikely(err)) { - pr_err("unlink %pd after coo failed (%d), ignored\n", - dentry, err); - err = 0; - } - goto out_parent; /* success */ - -out_dgrade: - di_downgrade_lock(parent, AuLock_IR); -out_parent: - di_read_unlock(parent, AuLock_IR); - dput(parent); -out: - AuTraceErr(err); - return err; -} - -int au_do_open(struct file *file, struct au_do_open_args *args) -{ - int err, no_lock = args->no_lock; - struct dentry *dentry; - struct au_finfo *finfo; - - if (!no_lock) - err = au_finfo_init(file, args->fidir); - else { - lockdep_off(); - err = au_finfo_init(file, args->fidir); - lockdep_on(); - } - if (unlikely(err)) - goto out; - - dentry = file->f_path.dentry; - AuDebugOn(IS_ERR_OR_NULL(dentry)); - if (!no_lock) { - di_write_lock_child(dentry); - err = au_cmoo(dentry); - di_downgrade_lock(dentry, AuLock_IR); - if (!err) - err = args->open(file, vfsub_file_flags(file), NULL); - di_read_unlock(dentry, AuLock_IR); - } else { - err = au_cmoo(dentry); - if (!err) - err = args->open(file, vfsub_file_flags(file), - args->h_file); - if (!err && au_fbstart(file) != au_dbstart(dentry)) - /* - * cmoo happens after h_file was opened. - * need to refresh file later. - */ - atomic_dec(&au_fi(file)->fi_generation); - } - - finfo = au_fi(file); - if (!err) { - finfo->fi_file = file; - au_sphl_add(&finfo->fi_hlist, - &au_sbi(file->f_path.dentry->d_sb)->si_files); - } - if (!no_lock) - fi_write_unlock(file); - else { - lockdep_off(); - fi_write_unlock(file); - lockdep_on(); - } - if (unlikely(err)) { - finfo->fi_hdir = NULL; - au_finfo_fin(file); - } - -out: - return err; -} - -int au_reopen_nondir(struct file *file) -{ - int err; - aufs_bindex_t bstart; - struct dentry *dentry; - struct file *h_file, *h_file_tmp; - - dentry = file->f_path.dentry; - bstart = au_dbstart(dentry); - h_file_tmp = NULL; - if (au_fbstart(file) == bstart) { - h_file = au_hf_top(file); - if (file->f_mode == h_file->f_mode) - return 0; /* success */ - h_file_tmp = h_file; - get_file(h_file_tmp); - au_set_h_fptr(file, bstart, NULL); - } - AuDebugOn(au_fi(file)->fi_hdir); - /* - * it can happen - * file exists on both of rw and ro - * open --> dbstart and fbstart are both 0 - * prepend a branch as rw, "rw" become ro - * remove rw/file - * delete the top branch, "rw" becomes rw again - * --> dbstart is 1, fbstart is still 0 - * write --> fbstart is 0 but dbstart is 1 - */ - /* AuDebugOn(au_fbstart(file) < bstart); */ - - h_file = au_h_open(dentry, bstart, vfsub_file_flags(file) & ~O_TRUNC, - file, /*force_wr*/0); - err = PTR_ERR(h_file); - if (IS_ERR(h_file)) { - if (h_file_tmp) { - atomic_inc(&au_sbr(dentry->d_sb, bstart)->br_count); - au_set_h_fptr(file, bstart, h_file_tmp); - h_file_tmp = NULL; - } - goto out; /* todo: close all? */ - } - - err = 0; - au_set_fbstart(file, bstart); - au_set_h_fptr(file, bstart, h_file); - au_update_figen(file); - /* todo: necessary? */ - /* file->f_ra = h_file->f_ra; */ - -out: - if (h_file_tmp) - fput(h_file_tmp); - return err; -} - -/* ---------------------------------------------------------------------- */ - -static int au_reopen_wh(struct file *file, aufs_bindex_t btgt, - struct dentry *hi_wh) -{ - int err; - aufs_bindex_t bstart; - struct au_dinfo *dinfo; - struct dentry *h_dentry; - struct au_hdentry *hdp; - - dinfo = au_di(file->f_path.dentry); - AuRwMustWriteLock(&dinfo->di_rwsem); - - bstart = dinfo->di_bstart; - dinfo->di_bstart = btgt; - hdp = dinfo->di_hdentry; - h_dentry = hdp[0 + btgt].hd_dentry; - hdp[0 + btgt].hd_dentry = hi_wh; - err = au_reopen_nondir(file); - hdp[0 + btgt].hd_dentry = h_dentry; - dinfo->di_bstart = bstart; - - return err; -} - -static int au_ready_to_write_wh(struct file *file, loff_t len, - aufs_bindex_t bcpup, struct au_pin *pin) -{ - int err; - struct inode *inode, *h_inode; - struct dentry *h_dentry, *hi_wh; - struct au_cp_generic cpg = { - .dentry = file->f_path.dentry, - .bdst = bcpup, - .bsrc = -1, - .len = len, - .pin = pin - }; - - au_update_dbstart(cpg.dentry); - inode = d_inode(cpg.dentry); - h_inode = NULL; - if (au_dbstart(cpg.dentry) <= bcpup - && au_dbend(cpg.dentry) >= bcpup) { - h_dentry = au_h_dptr(cpg.dentry, bcpup); - if (h_dentry && d_is_positive(h_dentry)) - h_inode = d_inode(h_dentry); - } - hi_wh = au_hi_wh(inode, bcpup); - if (!hi_wh && !h_inode) - err = au_sio_cpup_wh(&cpg, file); - else - /* already copied-up after unlink */ - err = au_reopen_wh(file, bcpup, hi_wh); - - if (!err - && (inode->i_nlink > 1 - || (inode->i_state & I_LINKABLE)) - && au_opt_test(au_mntflags(cpg.dentry->d_sb), PLINK)) - au_plink_append(inode, bcpup, au_h_dptr(cpg.dentry, bcpup)); - - return err; -} - -/* - * prepare the @file for writing. - */ -int au_ready_to_write(struct file *file, loff_t len, struct au_pin *pin) -{ - int err; - aufs_bindex_t dbstart; - struct dentry *parent; - struct inode *inode; - struct super_block *sb; - struct file *h_file; - struct au_cp_generic cpg = { - .dentry = file->f_path.dentry, - .bdst = -1, - .bsrc = -1, - .len = len, - .pin = pin, - .flags = AuCpup_DTIME - }; - - sb = cpg.dentry->d_sb; - inode = d_inode(cpg.dentry); - cpg.bsrc = au_fbstart(file); - err = au_test_ro(sb, cpg.bsrc, inode); - if (!err && (au_hf_top(file)->f_mode & FMODE_WRITE)) { - err = au_pin(pin, cpg.dentry, cpg.bsrc, AuOpt_UDBA_NONE, - /*flags*/0); - goto out; - } - - /* need to cpup or reopen */ - parent = dget_parent(cpg.dentry); - di_write_lock_parent(parent); - err = AuWbrCopyup(au_sbi(sb), cpg.dentry); - cpg.bdst = err; - if (unlikely(err < 0)) - goto out_dgrade; - err = 0; - - if (!d_unhashed(cpg.dentry) && !au_h_dptr(parent, cpg.bdst)) { - err = au_cpup_dirs(cpg.dentry, cpg.bdst); - if (unlikely(err)) - goto out_dgrade; - } - - err = au_pin(pin, cpg.dentry, cpg.bdst, AuOpt_UDBA_NONE, - AuPin_DI_LOCKED | AuPin_MNT_WRITE); - if (unlikely(err)) - goto out_dgrade; - - dbstart = au_dbstart(cpg.dentry); - if (dbstart <= cpg.bdst) - cpg.bsrc = cpg.bdst; - - if (dbstart <= cpg.bdst /* just reopen */ - || !d_unhashed(cpg.dentry) /* copyup and reopen */ - ) { - h_file = au_h_open_pre(cpg.dentry, cpg.bsrc, /*force_wr*/0); - if (IS_ERR(h_file)) - err = PTR_ERR(h_file); - else { - di_downgrade_lock(parent, AuLock_IR); - if (dbstart > cpg.bdst) - err = au_sio_cpup_simple(&cpg); - if (!err) - err = au_reopen_nondir(file); - au_h_open_post(cpg.dentry, cpg.bsrc, h_file); - } - } else { /* copyup as wh and reopen */ - /* - * since writable hfsplus branch is not supported, - * h_open_pre/post() are unnecessary. - */ - err = au_ready_to_write_wh(file, len, cpg.bdst, pin); - di_downgrade_lock(parent, AuLock_IR); - } - - if (!err) { - au_pin_set_parent_lflag(pin, /*lflag*/0); - goto out_dput; /* success */ - } - au_unpin(pin); - goto out_unlock; - -out_dgrade: - di_downgrade_lock(parent, AuLock_IR); -out_unlock: - di_read_unlock(parent, AuLock_IR); -out_dput: - dput(parent); -out: - return err; -} - -/* ---------------------------------------------------------------------- */ - -int au_do_flush(struct file *file, fl_owner_t id, - int (*flush)(struct file *file, fl_owner_t id)) -{ - int err; - struct super_block *sb; - struct inode *inode; - - inode = file_inode(file); - sb = inode->i_sb; - si_noflush_read_lock(sb); - fi_read_lock(file); - ii_read_lock_child(inode); - - err = flush(file, id); - au_cpup_attr_timesizes(inode); - - ii_read_unlock(inode); - fi_read_unlock(file); - si_read_unlock(sb); - return err; -} - -/* ---------------------------------------------------------------------- */ - -static int au_file_refresh_by_inode(struct file *file, int *need_reopen) -{ - int err; - struct au_pin pin; - struct au_finfo *finfo; - struct dentry *parent, *hi_wh; - struct inode *inode; - struct super_block *sb; - struct au_cp_generic cpg = { - .dentry = file->f_path.dentry, - .bdst = -1, - .bsrc = -1, - .len = -1, - .pin = &pin, - .flags = AuCpup_DTIME - }; - - FiMustWriteLock(file); - - err = 0; - finfo = au_fi(file); - sb = cpg.dentry->d_sb; - inode = d_inode(cpg.dentry); - cpg.bdst = au_ibstart(inode); - if (cpg.bdst == finfo->fi_btop || IS_ROOT(cpg.dentry)) - goto out; - - parent = dget_parent(cpg.dentry); - if (au_test_ro(sb, cpg.bdst, inode)) { - di_read_lock_parent(parent, !AuLock_IR); - err = AuWbrCopyup(au_sbi(sb), cpg.dentry); - cpg.bdst = err; - di_read_unlock(parent, !AuLock_IR); - if (unlikely(err < 0)) - goto out_parent; - err = 0; - } - - di_read_lock_parent(parent, AuLock_IR); - hi_wh = au_hi_wh(inode, cpg.bdst); - if (!S_ISDIR(inode->i_mode) - && au_opt_test(au_mntflags(sb), PLINK) - && au_plink_test(inode) - && !d_unhashed(cpg.dentry) - && cpg.bdst < au_dbstart(cpg.dentry)) { - err = au_test_and_cpup_dirs(cpg.dentry, cpg.bdst); - if (unlikely(err)) - goto out_unlock; - - /* always superio. */ - err = au_pin(&pin, cpg.dentry, cpg.bdst, AuOpt_UDBA_NONE, - AuPin_DI_LOCKED | AuPin_MNT_WRITE); - if (!err) { - err = au_sio_cpup_simple(&cpg); - au_unpin(&pin); - } - } else if (hi_wh) { - /* already copied-up after unlink */ - err = au_reopen_wh(file, cpg.bdst, hi_wh); - *need_reopen = 0; - } - -out_unlock: - di_read_unlock(parent, AuLock_IR); -out_parent: - dput(parent); -out: - return err; -} - -static void au_do_refresh_dir(struct file *file) -{ - aufs_bindex_t bindex, bend, new_bindex, brid; - struct au_hfile *p, tmp, *q; - struct au_finfo *finfo; - struct super_block *sb; - struct au_fidir *fidir; - - FiMustWriteLock(file); - - sb = file->f_path.dentry->d_sb; - finfo = au_fi(file); - fidir = finfo->fi_hdir; - AuDebugOn(!fidir); - p = fidir->fd_hfile + finfo->fi_btop; - brid = p->hf_br->br_id; - bend = fidir->fd_bbot; - for (bindex = finfo->fi_btop; bindex <= bend; bindex++, p++) { - if (!p->hf_file) - continue; - - new_bindex = au_br_index(sb, p->hf_br->br_id); - if (new_bindex == bindex) - continue; - if (new_bindex < 0) { - au_set_h_fptr(file, bindex, NULL); - continue; - } - - /* swap two lower inode, and loop again */ - q = fidir->fd_hfile + new_bindex; - tmp = *q; - *q = *p; - *p = tmp; - if (tmp.hf_file) { - bindex--; - p--; - } - } - - p = fidir->fd_hfile; - if (!au_test_mmapped(file) && !d_unlinked(file->f_path.dentry)) { - bend = au_sbend(sb); - for (finfo->fi_btop = 0; finfo->fi_btop <= bend; - finfo->fi_btop++, p++) - if (p->hf_file) { - if (file_inode(p->hf_file)) - break; - au_hfput(p, file); - } - } else { - bend = au_br_index(sb, brid); - for (finfo->fi_btop = 0; finfo->fi_btop < bend; - finfo->fi_btop++, p++) - if (p->hf_file) - au_hfput(p, file); - bend = au_sbend(sb); - } - - p = fidir->fd_hfile + bend; - for (fidir->fd_bbot = bend; fidir->fd_bbot >= finfo->fi_btop; - fidir->fd_bbot--, p--) - if (p->hf_file) { - if (file_inode(p->hf_file)) - break; - au_hfput(p, file); - } - AuDebugOn(fidir->fd_bbot < finfo->fi_btop); -} - -/* - * after branch manipulating, refresh the file. - */ -static int refresh_file(struct file *file, int (*reopen)(struct file *file)) -{ - int err, need_reopen; - aufs_bindex_t bend, bindex; - struct dentry *dentry; - struct au_finfo *finfo; - struct au_hfile *hfile; - - dentry = file->f_path.dentry; - finfo = au_fi(file); - if (!finfo->fi_hdir) { - hfile = &finfo->fi_htop; - AuDebugOn(!hfile->hf_file); - bindex = au_br_index(dentry->d_sb, hfile->hf_br->br_id); - AuDebugOn(bindex < 0); - if (bindex != finfo->fi_btop) - au_set_fbstart(file, bindex); - } else { - err = au_fidir_realloc(finfo, au_sbend(dentry->d_sb) + 1); - if (unlikely(err)) - goto out; - au_do_refresh_dir(file); - } - - err = 0; - need_reopen = 1; - if (!au_test_mmapped(file)) - err = au_file_refresh_by_inode(file, &need_reopen); - if (!err && need_reopen && !d_unlinked(dentry)) - err = reopen(file); - if (!err) { - au_update_figen(file); - goto out; /* success */ - } - - /* error, close all lower files */ - if (finfo->fi_hdir) { - bend = au_fbend_dir(file); - for (bindex = au_fbstart(file); bindex <= bend; bindex++) - au_set_h_fptr(file, bindex, NULL); - } - -out: - return err; -} - -/* common function to regular file and dir */ -int au_reval_and_lock_fdi(struct file *file, int (*reopen)(struct file *file), - int wlock) -{ - int err; - unsigned int sigen, figen; - aufs_bindex_t bstart; - unsigned char pseudo_link; - struct dentry *dentry; - struct inode *inode; - - err = 0; - dentry = file->f_path.dentry; - inode = d_inode(dentry); - sigen = au_sigen(dentry->d_sb); - fi_write_lock(file); - figen = au_figen(file); - di_write_lock_child(dentry); - bstart = au_dbstart(dentry); - pseudo_link = (bstart != au_ibstart(inode)); - if (sigen == figen && !pseudo_link && au_fbstart(file) == bstart) { - if (!wlock) { - di_downgrade_lock(dentry, AuLock_IR); - fi_downgrade_lock(file); - } - goto out; /* success */ - } - - AuDbg("sigen %d, figen %d\n", sigen, figen); - if (au_digen_test(dentry, sigen)) { - err = au_reval_dpath(dentry, sigen); - AuDebugOn(!err && au_digen_test(dentry, sigen)); - } - - if (!err) - err = refresh_file(file, reopen); - if (!err) { - if (!wlock) { - di_downgrade_lock(dentry, AuLock_IR); - fi_downgrade_lock(file); - } - } else { - di_write_unlock(dentry); - fi_write_unlock(file); - } - -out: - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* cf. aufs_nopage() */ -/* for madvise(2) */ -static int aufs_readpage(struct file *file __maybe_unused, struct page *page) -{ - unlock_page(page); - return 0; -} - -/* it will never be called, but necessary to support O_DIRECT */ -static ssize_t aufs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) -{ BUG(); return 0; } - -/* they will never be called. */ -#ifdef CONFIG_AUFS_DEBUG -static int aufs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) -{ AuUnsupport(); return 0; } -static int aufs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) -{ AuUnsupport(); return 0; } -static int aufs_writepage(struct page *page, struct writeback_control *wbc) -{ AuUnsupport(); return 0; } - -static int aufs_set_page_dirty(struct page *page) -{ AuUnsupport(); return 0; } -static void aufs_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) -{ AuUnsupport(); } -static int aufs_releasepage(struct page *page, gfp_t gfp) -{ AuUnsupport(); return 0; } -#if 0 /* called by memory compaction regardless file */ -static int aufs_migratepage(struct address_space *mapping, struct page *newpage, - struct page *page, enum migrate_mode mode) -{ AuUnsupport(); return 0; } -#endif -static int aufs_launder_page(struct page *page) -{ AuUnsupport(); return 0; } -static int aufs_is_partially_uptodate(struct page *page, - unsigned long from, - unsigned long count) -{ AuUnsupport(); return 0; } -static void aufs_is_dirty_writeback(struct page *page, bool *dirty, - bool *writeback) -{ AuUnsupport(); } -static int aufs_error_remove_page(struct address_space *mapping, - struct page *page) -{ AuUnsupport(); return 0; } -static int aufs_swap_activate(struct swap_info_struct *sis, struct file *file, - sector_t *span) -{ AuUnsupport(); return 0; } -static void aufs_swap_deactivate(struct file *file) -{ AuUnsupport(); } -#endif /* CONFIG_AUFS_DEBUG */ - -const struct address_space_operations aufs_aop = { - .readpage = aufs_readpage, - .direct_IO = aufs_direct_IO, -#ifdef CONFIG_AUFS_DEBUG - .writepage = aufs_writepage, - /* no writepages, because of writepage */ - .set_page_dirty = aufs_set_page_dirty, - /* no readpages, because of readpage */ - .write_begin = aufs_write_begin, - .write_end = aufs_write_end, - /* no bmap, no block device */ - .invalidatepage = aufs_invalidatepage, - .releasepage = aufs_releasepage, - /* is fallback_migrate_page ok? */ - /* .migratepage = aufs_migratepage, */ - .launder_page = aufs_launder_page, - .is_partially_uptodate = aufs_is_partially_uptodate, - .is_dirty_writeback = aufs_is_dirty_writeback, - .error_remove_page = aufs_error_remove_page, - .swap_activate = aufs_swap_activate, - .swap_deactivate = aufs_swap_deactivate -#endif /* CONFIG_AUFS_DEBUG */ -}; diff --git a/fs/aufs/file.h b/fs/aufs/file.h deleted file mode 100644 index 27d802487..000000000 --- a/fs/aufs/file.h +++ /dev/null @@ -1,278 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * file operations - */ - -#ifndef __AUFS_FILE_H__ -#define __AUFS_FILE_H__ - -#ifdef __KERNEL__ - -#include <linux/file.h> -#include <linux/fs.h> -#include <linux/poll.h> -#include "rwsem.h" - -struct au_branch; -struct au_hfile { - struct file *hf_file; - struct au_branch *hf_br; -}; - -struct au_vdir; -struct au_fidir { - aufs_bindex_t fd_bbot; - aufs_bindex_t fd_nent; - struct au_vdir *fd_vdir_cache; - struct au_hfile fd_hfile[]; -}; - -static inline int au_fidir_sz(int nent) -{ - AuDebugOn(nent < 0); - return sizeof(struct au_fidir) + sizeof(struct au_hfile) * nent; -} - -struct au_finfo { - atomic_t fi_generation; - - struct au_rwsem fi_rwsem; - aufs_bindex_t fi_btop; - - /* do not union them */ - struct { /* for non-dir */ - struct au_hfile fi_htop; - atomic_t fi_mmapped; - }; - struct au_fidir *fi_hdir; /* for dir only */ - - struct hlist_node fi_hlist; - struct file *fi_file; /* very ugly */ -} ____cacheline_aligned_in_smp; - -/* ---------------------------------------------------------------------- */ - -/* file.c */ -extern const struct address_space_operations aufs_aop; -unsigned int au_file_roflags(unsigned int flags); -struct file *au_h_open(struct dentry *dentry, aufs_bindex_t bindex, int flags, - struct file *file, int force_wr); -struct au_do_open_args { - int no_lock; - int (*open)(struct file *file, int flags, - struct file *h_file); - struct au_fidir *fidir; - struct file *h_file; -}; -int au_do_open(struct file *file, struct au_do_open_args *args); -int au_reopen_nondir(struct file *file); -struct au_pin; -int au_ready_to_write(struct file *file, loff_t len, struct au_pin *pin); -int au_reval_and_lock_fdi(struct file *file, int (*reopen)(struct file *file), - int wlock); -int au_do_flush(struct file *file, fl_owner_t id, - int (*flush)(struct file *file, fl_owner_t id)); - -/* poll.c */ -#ifdef CONFIG_AUFS_POLL -unsigned int aufs_poll(struct file *file, poll_table *wait); -#endif - -#ifdef CONFIG_AUFS_BR_HFSPLUS -/* hfsplus.c */ -struct file *au_h_open_pre(struct dentry *dentry, aufs_bindex_t bindex, - int force_wr); -void au_h_open_post(struct dentry *dentry, aufs_bindex_t bindex, - struct file *h_file); -#else -AuStub(struct file *, au_h_open_pre, return NULL, struct dentry *dentry, - aufs_bindex_t bindex, int force_wr) -AuStubVoid(au_h_open_post, struct dentry *dentry, aufs_bindex_t bindex, - struct file *h_file); -#endif - -/* f_op.c */ -extern const struct file_operations aufs_file_fop; -int au_do_open_nondir(struct file *file, int flags, struct file *h_file); -int aufs_release_nondir(struct inode *inode __maybe_unused, struct file *file); -struct file *au_read_pre(struct file *file, int keep_fi); - -/* finfo.c */ -void au_hfput(struct au_hfile *hf, struct file *file); -void au_set_h_fptr(struct file *file, aufs_bindex_t bindex, - struct file *h_file); - -void au_update_figen(struct file *file); -struct au_fidir *au_fidir_alloc(struct super_block *sb); -int au_fidir_realloc(struct au_finfo *finfo, int nbr); - -void au_fi_init_once(void *_fi); -void au_finfo_fin(struct file *file); -int au_finfo_init(struct file *file, struct au_fidir *fidir); - -/* ioctl.c */ -long aufs_ioctl_nondir(struct file *file, unsigned int cmd, unsigned long arg); -#ifdef CONFIG_COMPAT -long aufs_compat_ioctl_dir(struct file *file, unsigned int cmd, - unsigned long arg); -long aufs_compat_ioctl_nondir(struct file *file, unsigned int cmd, - unsigned long arg); -#endif - -/* ---------------------------------------------------------------------- */ - -static inline struct au_finfo *au_fi(struct file *file) -{ - return file->private_data; -} - -/* ---------------------------------------------------------------------- */ - -/* - * fi_read_lock, fi_write_lock, - * fi_read_unlock, fi_write_unlock, fi_downgrade_lock - */ -AuSimpleRwsemFuncs(fi, struct file *f, &au_fi(f)->fi_rwsem); - -#define FiMustNoWaiters(f) AuRwMustNoWaiters(&au_fi(f)->fi_rwsem) -#define FiMustAnyLock(f) AuRwMustAnyLock(&au_fi(f)->fi_rwsem) -#define FiMustWriteLock(f) AuRwMustWriteLock(&au_fi(f)->fi_rwsem) - -/* ---------------------------------------------------------------------- */ - -/* todo: hard/soft set? */ -static inline aufs_bindex_t au_fbstart(struct file *file) -{ - FiMustAnyLock(file); - return au_fi(file)->fi_btop; -} - -static inline aufs_bindex_t au_fbend_dir(struct file *file) -{ - FiMustAnyLock(file); - AuDebugOn(!au_fi(file)->fi_hdir); - return au_fi(file)->fi_hdir->fd_bbot; -} - -static inline struct au_vdir *au_fvdir_cache(struct file *file) -{ - FiMustAnyLock(file); - AuDebugOn(!au_fi(file)->fi_hdir); - return au_fi(file)->fi_hdir->fd_vdir_cache; -} - -static inline void au_set_fbstart(struct file *file, aufs_bindex_t bindex) -{ - FiMustWriteLock(file); - au_fi(file)->fi_btop = bindex; -} - -static inline void au_set_fbend_dir(struct file *file, aufs_bindex_t bindex) -{ - FiMustWriteLock(file); - AuDebugOn(!au_fi(file)->fi_hdir); - au_fi(file)->fi_hdir->fd_bbot = bindex; -} - -static inline void au_set_fvdir_cache(struct file *file, - struct au_vdir *vdir_cache) -{ - FiMustWriteLock(file); - AuDebugOn(!au_fi(file)->fi_hdir); - au_fi(file)->fi_hdir->fd_vdir_cache = vdir_cache; -} - -static inline struct file *au_hf_top(struct file *file) -{ - FiMustAnyLock(file); - AuDebugOn(au_fi(file)->fi_hdir); - return au_fi(file)->fi_htop.hf_file; -} - -static inline struct file *au_hf_dir(struct file *file, aufs_bindex_t bindex) -{ - FiMustAnyLock(file); - AuDebugOn(!au_fi(file)->fi_hdir); - return au_fi(file)->fi_hdir->fd_hfile[0 + bindex].hf_file; -} - -/* todo: memory barrier? */ -static inline unsigned int au_figen(struct file *f) -{ - return atomic_read(&au_fi(f)->fi_generation); -} - -static inline void au_set_mmapped(struct file *f) -{ - if (atomic_inc_return(&au_fi(f)->fi_mmapped)) - return; - pr_warn("fi_mmapped wrapped around\n"); - while (!atomic_inc_return(&au_fi(f)->fi_mmapped)) - ; -} - -static inline void au_unset_mmapped(struct file *f) -{ - atomic_dec(&au_fi(f)->fi_mmapped); -} - -static inline int au_test_mmapped(struct file *f) -{ - return atomic_read(&au_fi(f)->fi_mmapped); -} - -/* customize vma->vm_file */ - -static inline void au_do_vm_file_reset(struct vm_area_struct *vma, - struct file *file) -{ - struct file *f; - - f = vma->vm_file; - get_file(file); - vma->vm_file = file; - fput(f); -} - -#ifdef CONFIG_MMU -#define AuDbgVmRegion(file, vma) do {} while (0) - -static inline void au_vm_file_reset(struct vm_area_struct *vma, - struct file *file) -{ - au_do_vm_file_reset(vma, file); -} -#else -#define AuDbgVmRegion(file, vma) \ - AuDebugOn((vma)->vm_region && (vma)->vm_region->vm_file != (file)) - -static inline void au_vm_file_reset(struct vm_area_struct *vma, - struct file *file) -{ - struct file *f; - - au_do_vm_file_reset(vma, file); - f = vma->vm_region->vm_file; - get_file(file); - vma->vm_region->vm_file = file; - fput(f); -} -#endif /* CONFIG_MMU */ - -/* handle vma->vm_prfile */ -static inline void au_vm_prfile_set(struct vm_area_struct *vma, - struct file *file) -{ - get_file(file); - vma->vm_prfile = file; -#ifndef CONFIG_MMU - get_file(file); - vma->vm_region->vm_prfile = file; -#endif -} - -#endif /* __KERNEL__ */ -#endif /* __AUFS_FILE_H__ */ diff --git a/fs/aufs/finfo.c b/fs/aufs/finfo.c deleted file mode 100644 index b5eb55dfb..000000000 --- a/fs/aufs/finfo.c +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * file private data - */ - -#include "aufs.h" - -void au_hfput(struct au_hfile *hf, struct file *file) -{ - /* todo: direct access f_flags */ - if (vfsub_file_flags(file) & __FMODE_EXEC) - allow_write_access(hf->hf_file); - fput(hf->hf_file); - hf->hf_file = NULL; - atomic_dec(&hf->hf_br->br_count); - hf->hf_br = NULL; -} - -void au_set_h_fptr(struct file *file, aufs_bindex_t bindex, struct file *val) -{ - struct au_finfo *finfo = au_fi(file); - struct au_hfile *hf; - struct au_fidir *fidir; - - fidir = finfo->fi_hdir; - if (!fidir) { - AuDebugOn(finfo->fi_btop != bindex); - hf = &finfo->fi_htop; - } else - hf = fidir->fd_hfile + bindex; - - if (hf && hf->hf_file) - au_hfput(hf, file); - if (val) { - FiMustWriteLock(file); - AuDebugOn(IS_ERR_OR_NULL(file->f_path.dentry)); - hf->hf_file = val; - hf->hf_br = au_sbr(file->f_path.dentry->d_sb, bindex); - } -} - -void au_update_figen(struct file *file) -{ - atomic_set(&au_fi(file)->fi_generation, au_digen(file->f_path.dentry)); - /* smp_mb(); */ /* atomic_set */ -} - -/* ---------------------------------------------------------------------- */ - -struct au_fidir *au_fidir_alloc(struct super_block *sb) -{ - struct au_fidir *fidir; - int nbr; - - nbr = au_sbend(sb) + 1; - if (nbr < 2) - nbr = 2; /* initial allocate for 2 branches */ - fidir = kzalloc(au_fidir_sz(nbr), GFP_NOFS); - if (fidir) { - fidir->fd_bbot = -1; - fidir->fd_nent = nbr; - } - - return fidir; -} - -int au_fidir_realloc(struct au_finfo *finfo, int nbr) -{ - int err; - struct au_fidir *fidir, *p; - - AuRwMustWriteLock(&finfo->fi_rwsem); - fidir = finfo->fi_hdir; - AuDebugOn(!fidir); - - err = -ENOMEM; - p = au_kzrealloc(fidir, au_fidir_sz(fidir->fd_nent), au_fidir_sz(nbr), - GFP_NOFS); - if (p) { - p->fd_nent = nbr; - finfo->fi_hdir = p; - err = 0; - } - - return err; -} - -/* ---------------------------------------------------------------------- */ - -void au_finfo_fin(struct file *file) -{ - struct au_finfo *finfo; - - au_nfiles_dec(file->f_path.dentry->d_sb); - - finfo = au_fi(file); - AuDebugOn(finfo->fi_hdir); - AuRwDestroy(&finfo->fi_rwsem); - au_cache_free_finfo(finfo); -} - -void au_fi_init_once(void *_finfo) -{ - struct au_finfo *finfo = _finfo; - static struct lock_class_key aufs_fi; - - au_rw_init(&finfo->fi_rwsem); - au_rw_class(&finfo->fi_rwsem, &aufs_fi); -} - -int au_finfo_init(struct file *file, struct au_fidir *fidir) -{ - int err; - struct au_finfo *finfo; - struct dentry *dentry; - - err = -ENOMEM; - dentry = file->f_path.dentry; - finfo = au_cache_alloc_finfo(); - if (unlikely(!finfo)) - goto out; - - err = 0; - au_nfiles_inc(dentry->d_sb); - /* verbose coding for lock class name */ - if (!fidir) - au_rw_class(&finfo->fi_rwsem, au_lc_key + AuLcNonDir_FIINFO); - else - au_rw_class(&finfo->fi_rwsem, au_lc_key + AuLcDir_FIINFO); - au_rw_write_lock(&finfo->fi_rwsem); - finfo->fi_btop = -1; - finfo->fi_hdir = fidir; - atomic_set(&finfo->fi_generation, au_digen(dentry)); - /* smp_mb(); */ /* atomic_set */ - - file->private_data = finfo; - -out: - return err; -} diff --git a/fs/aufs/fstype.h b/fs/aufs/fstype.h deleted file mode 100644 index 725b2ffff..000000000 --- a/fs/aufs/fstype.h +++ /dev/null @@ -1,387 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * judging filesystem type - */ - -#ifndef __AUFS_FSTYPE_H__ -#define __AUFS_FSTYPE_H__ - -#ifdef __KERNEL__ - -#include <linux/fs.h> -#include <linux/magic.h> -#include <linux/nfs_fs.h> -#include <linux/romfs_fs.h> - -static inline int au_test_aufs(struct super_block *sb) -{ - return sb->s_magic == AUFS_SUPER_MAGIC; -} - -static inline const char *au_sbtype(struct super_block *sb) -{ - return sb->s_type->name; -} - -static inline int au_test_iso9660(struct super_block *sb __maybe_unused) -{ -#if defined(CONFIG_ISO9660_FS) || defined(CONFIG_ISO9660_FS_MODULE) - return sb->s_magic == ISOFS_SUPER_MAGIC; -#else - return 0; -#endif -} - -static inline int au_test_romfs(struct super_block *sb __maybe_unused) -{ -#if defined(CONFIG_ROMFS_FS) || defined(CONFIG_ROMFS_FS_MODULE) - return sb->s_magic == ROMFS_MAGIC; -#else - return 0; -#endif -} - -static inline int au_test_cramfs(struct super_block *sb __maybe_unused) -{ -#if defined(CONFIG_CRAMFS) || defined(CONFIG_CRAMFS_MODULE) - return sb->s_magic == CRAMFS_MAGIC; -#endif - return 0; -} - -static inline int au_test_nfs(struct super_block *sb __maybe_unused) -{ -#if defined(CONFIG_NFS_FS) || defined(CONFIG_NFS_FS_MODULE) - return sb->s_magic == NFS_SUPER_MAGIC; -#else - return 0; -#endif -} - -static inline int au_test_fuse(struct super_block *sb __maybe_unused) -{ -#if defined(CONFIG_FUSE_FS) || defined(CONFIG_FUSE_FS_MODULE) - return sb->s_magic == FUSE_SUPER_MAGIC; -#else - return 0; -#endif -} - -static inline int au_test_xfs(struct super_block *sb __maybe_unused) -{ -#if defined(CONFIG_XFS_FS) || defined(CONFIG_XFS_FS_MODULE) - return sb->s_magic == XFS_SB_MAGIC; -#else - return 0; -#endif -} - -static inline int au_test_tmpfs(struct super_block *sb __maybe_unused) -{ -#ifdef CONFIG_TMPFS - return sb->s_magic == TMPFS_MAGIC; -#else - return 0; -#endif -} - -static inline int au_test_ecryptfs(struct super_block *sb __maybe_unused) -{ -#if defined(CONFIG_ECRYPT_FS) || defined(CONFIG_ECRYPT_FS_MODULE) - return !strcmp(au_sbtype(sb), "ecryptfs"); -#else - return 0; -#endif -} - -static inline int au_test_ramfs(struct super_block *sb) -{ - return sb->s_magic == RAMFS_MAGIC; -} - -static inline int au_test_ubifs(struct super_block *sb __maybe_unused) -{ -#if defined(CONFIG_UBIFS_FS) || defined(CONFIG_UBIFS_FS_MODULE) - return sb->s_magic == UBIFS_SUPER_MAGIC; -#else - return 0; -#endif -} - -static inline int au_test_procfs(struct super_block *sb __maybe_unused) -{ -#ifdef CONFIG_PROC_FS - return sb->s_magic == PROC_SUPER_MAGIC; -#else - return 0; -#endif -} - -static inline int au_test_sysfs(struct super_block *sb __maybe_unused) -{ -#ifdef CONFIG_SYSFS - return sb->s_magic == SYSFS_MAGIC; -#else - return 0; -#endif -} - -static inline int au_test_configfs(struct super_block *sb __maybe_unused) -{ -#if defined(CONFIG_CONFIGFS_FS) || defined(CONFIG_CONFIGFS_FS_MODULE) - return sb->s_magic == CONFIGFS_MAGIC; -#else - return 0; -#endif -} - -static inline int au_test_minix(struct super_block *sb __maybe_unused) -{ -#if defined(CONFIG_MINIX_FS) || defined(CONFIG_MINIX_FS_MODULE) - return sb->s_magic == MINIX3_SUPER_MAGIC - || sb->s_magic == MINIX2_SUPER_MAGIC - || sb->s_magic == MINIX2_SUPER_MAGIC2 - || sb->s_magic == MINIX_SUPER_MAGIC - || sb->s_magic == MINIX_SUPER_MAGIC2; -#else - return 0; -#endif -} - -static inline int au_test_fat(struct super_block *sb __maybe_unused) -{ -#if defined(CONFIG_FAT_FS) || defined(CONFIG_FAT_FS_MODULE) - return sb->s_magic == MSDOS_SUPER_MAGIC; -#else - return 0; -#endif -} - -static inline int au_test_msdos(struct super_block *sb) -{ - return au_test_fat(sb); -} - -static inline int au_test_vfat(struct super_block *sb) -{ - return au_test_fat(sb); -} - -static inline int au_test_securityfs(struct super_block *sb __maybe_unused) -{ -#ifdef CONFIG_SECURITYFS - return sb->s_magic == SECURITYFS_MAGIC; -#else - return 0; -#endif -} - -static inline int au_test_squashfs(struct super_block *sb __maybe_unused) -{ -#if defined(CONFIG_SQUASHFS) || defined(CONFIG_SQUASHFS_MODULE) - return sb->s_magic == SQUASHFS_MAGIC; -#else - return 0; -#endif -} - -static inline int au_test_btrfs(struct super_block *sb __maybe_unused) -{ -#if defined(CONFIG_BTRFS_FS) || defined(CONFIG_BTRFS_FS_MODULE) - return sb->s_magic == BTRFS_SUPER_MAGIC; -#else - return 0; -#endif -} - -static inline int au_test_xenfs(struct super_block *sb __maybe_unused) -{ -#if defined(CONFIG_XENFS) || defined(CONFIG_XENFS_MODULE) - return sb->s_magic == XENFS_SUPER_MAGIC; -#else - return 0; -#endif -} - -static inline int au_test_debugfs(struct super_block *sb __maybe_unused) -{ -#ifdef CONFIG_DEBUG_FS - return sb->s_magic == DEBUGFS_MAGIC; -#else - return 0; -#endif -} - -static inline int au_test_nilfs(struct super_block *sb __maybe_unused) -{ -#if defined(CONFIG_NILFS) || defined(CONFIG_NILFS_MODULE) - return sb->s_magic == NILFS_SUPER_MAGIC; -#else - return 0; -#endif -} - -static inline int au_test_hfsplus(struct super_block *sb __maybe_unused) -{ -#if defined(CONFIG_HFSPLUS_FS) || defined(CONFIG_HFSPLUS_FS_MODULE) - return sb->s_magic == HFSPLUS_SUPER_MAGIC; -#else - return 0; -#endif -} - -/* ---------------------------------------------------------------------- */ -/* - * they can't be an aufs branch. - */ -static inline int au_test_fs_unsuppoted(struct super_block *sb) -{ - return -#ifndef CONFIG_AUFS_BR_RAMFS - au_test_ramfs(sb) || -#endif - au_test_procfs(sb) - || au_test_sysfs(sb) - || au_test_configfs(sb) - || au_test_debugfs(sb) - || au_test_securityfs(sb) - || au_test_xenfs(sb) - || au_test_ecryptfs(sb) - /* || !strcmp(au_sbtype(sb), "unionfs") */ - || au_test_aufs(sb); /* will be supported in next version */ -} - -static inline int au_test_fs_remote(struct super_block *sb) -{ - return !au_test_tmpfs(sb) -#ifdef CONFIG_AUFS_BR_RAMFS - && !au_test_ramfs(sb) -#endif - && !(sb->s_type->fs_flags & FS_REQUIRES_DEV); -} - -/* ---------------------------------------------------------------------- */ - -/* - * Note: these functions (below) are created after reading ->getattr() in all - * filesystems under linux/fs. it means we have to do so in every update... - */ - -/* - * some filesystems require getattr to refresh the inode attributes before - * referencing. - * in most cases, we can rely on the inode attribute in NFS (or every remote fs) - * and leave the work for d_revalidate() - */ -static inline int au_test_fs_refresh_iattr(struct super_block *sb) -{ - return au_test_nfs(sb) - || au_test_fuse(sb) - /* || au_test_btrfs(sb) */ /* untested */ - ; -} - -/* - * filesystems which don't maintain i_size or i_blocks. - */ -static inline int au_test_fs_bad_iattr_size(struct super_block *sb) -{ - return au_test_xfs(sb) - || au_test_btrfs(sb) - || au_test_ubifs(sb) - || au_test_hfsplus(sb) /* maintained, but incorrect */ - /* || au_test_minix(sb) */ /* untested */ - ; -} - -/* - * filesystems which don't store the correct value in some of their inode - * attributes. - */ -static inline int au_test_fs_bad_iattr(struct super_block *sb) -{ - return au_test_fs_bad_iattr_size(sb) - || au_test_fat(sb) - || au_test_msdos(sb) - || au_test_vfat(sb); -} - -/* they don't check i_nlink in link(2) */ -static inline int au_test_fs_no_limit_nlink(struct super_block *sb) -{ - return au_test_tmpfs(sb) -#ifdef CONFIG_AUFS_BR_RAMFS - || au_test_ramfs(sb) -#endif - || au_test_ubifs(sb) - || au_test_hfsplus(sb); -} - -/* - * filesystems which sets S_NOATIME and S_NOCMTIME. - */ -static inline int au_test_fs_notime(struct super_block *sb) -{ - return au_test_nfs(sb) - || au_test_fuse(sb) - || au_test_ubifs(sb) - ; -} - -/* temporary support for i#1 in cramfs */ -static inline int au_test_fs_unique_ino(struct inode *inode) -{ - if (au_test_cramfs(inode->i_sb)) - return inode->i_ino != 1; - return 1; -} - -/* ---------------------------------------------------------------------- */ - -/* - * the filesystem where the xino files placed must support i/o after unlink and - * maintain i_size and i_blocks. - */ -static inline int au_test_fs_bad_xino(struct super_block *sb) -{ - return au_test_fs_remote(sb) - || au_test_fs_bad_iattr_size(sb) - /* don't want unnecessary work for xino */ - || au_test_aufs(sb) - || au_test_ecryptfs(sb) - || au_test_nilfs(sb); -} - -static inline int au_test_fs_trunc_xino(struct super_block *sb) -{ - return au_test_tmpfs(sb) - || au_test_ramfs(sb); -} - -/* - * test if the @sb is real-readonly. - */ -static inline int au_test_fs_rr(struct super_block *sb) -{ - return au_test_squashfs(sb) - || au_test_iso9660(sb) - || au_test_cramfs(sb) - || au_test_romfs(sb); -} - -/* - * test if the @inode is nfs with 'noacl' option - * NFS always sets MS_POSIXACL regardless its mount option 'noacl.' - */ -static inline int au_test_nfs_noacl(struct inode *inode) -{ - return au_test_nfs(inode->i_sb) - /* && IS_POSIXACL(inode) */ - && !nfs_server_capable(inode, NFS_CAP_ACLS); -} - -#endif /* __KERNEL__ */ -#endif /* __AUFS_FSTYPE_H__ */ diff --git a/fs/aufs/hfsnotify.c b/fs/aufs/hfsnotify.c deleted file mode 100644 index c0a1a63a9..000000000 --- a/fs/aufs/hfsnotify.c +++ /dev/null @@ -1,275 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * fsnotify for the lower directories - */ - -#include "aufs.h" - -/* FS_IN_IGNORED is unnecessary */ -static const __u32 AuHfsnMask = (FS_MOVED_TO | FS_MOVED_FROM | FS_DELETE - | FS_CREATE | FS_EVENT_ON_CHILD); -static DECLARE_WAIT_QUEUE_HEAD(au_hfsn_wq); -static __cacheline_aligned_in_smp atomic64_t au_hfsn_ifree = ATOMIC64_INIT(0); - -static void au_hfsn_free_mark(struct fsnotify_mark *mark) -{ - struct au_hnotify *hn = container_of(mark, struct au_hnotify, - hn_mark); - AuDbg("here\n"); - au_cache_free_hnotify(hn); - smp_mb__before_atomic(); - if (atomic64_dec_and_test(&au_hfsn_ifree)) - wake_up(&au_hfsn_wq); -} - -static int au_hfsn_alloc(struct au_hinode *hinode) -{ - int err; - struct au_hnotify *hn; - struct super_block *sb; - struct au_branch *br; - struct fsnotify_mark *mark; - aufs_bindex_t bindex; - - hn = hinode->hi_notify; - sb = hn->hn_aufs_inode->i_sb; - bindex = au_br_index(sb, hinode->hi_id); - br = au_sbr(sb, bindex); - AuDebugOn(!br->br_hfsn); - - mark = &hn->hn_mark; - fsnotify_init_mark(mark, au_hfsn_free_mark); - mark->mask = AuHfsnMask; - /* - * by udba rename or rmdir, aufs assign a new inode to the known - * h_inode, so specify 1 to allow dups. - */ - lockdep_off(); - err = fsnotify_add_mark(mark, br->br_hfsn->hfsn_group, hinode->hi_inode, - /*mnt*/NULL, /*allow_dups*/1); - /* even if err */ - fsnotify_put_mark(mark); - lockdep_on(); - - return err; -} - -static int au_hfsn_free(struct au_hinode *hinode, struct au_hnotify *hn) -{ - struct fsnotify_mark *mark; - unsigned long long ull; - struct fsnotify_group *group; - - ull = atomic64_inc_return(&au_hfsn_ifree); - BUG_ON(!ull); - - mark = &hn->hn_mark; - spin_lock(&mark->lock); - group = mark->group; - fsnotify_get_group(group); - spin_unlock(&mark->lock); - lockdep_off(); - fsnotify_destroy_mark(mark, group); - fsnotify_put_group(group); - lockdep_on(); - - /* free hn by myself */ - return 0; -} - -/* ---------------------------------------------------------------------- */ - -static void au_hfsn_ctl(struct au_hinode *hinode, int do_set) -{ - struct fsnotify_mark *mark; - - mark = &hinode->hi_notify->hn_mark; - spin_lock(&mark->lock); - if (do_set) { - AuDebugOn(mark->mask & AuHfsnMask); - mark->mask |= AuHfsnMask; - } else { - AuDebugOn(!(mark->mask & AuHfsnMask)); - mark->mask &= ~AuHfsnMask; - } - spin_unlock(&mark->lock); - /* fsnotify_recalc_inode_mask(hinode->hi_inode); */ -} - -/* ---------------------------------------------------------------------- */ - -/* #define AuDbgHnotify */ -#ifdef AuDbgHnotify -static char *au_hfsn_name(u32 mask) -{ -#ifdef CONFIG_AUFS_DEBUG -#define test_ret(flag) \ - do { \ - if (mask & flag) \ - return #flag; \ - } while (0) - test_ret(FS_ACCESS); - test_ret(FS_MODIFY); - test_ret(FS_ATTRIB); - test_ret(FS_CLOSE_WRITE); - test_ret(FS_CLOSE_NOWRITE); - test_ret(FS_OPEN); - test_ret(FS_MOVED_FROM); - test_ret(FS_MOVED_TO); - test_ret(FS_CREATE); - test_ret(FS_DELETE); - test_ret(FS_DELETE_SELF); - test_ret(FS_MOVE_SELF); - test_ret(FS_UNMOUNT); - test_ret(FS_Q_OVERFLOW); - test_ret(FS_IN_IGNORED); - test_ret(FS_ISDIR); - test_ret(FS_IN_ONESHOT); - test_ret(FS_EVENT_ON_CHILD); - return ""; -#undef test_ret -#else - return "??"; -#endif -} -#endif - -/* ---------------------------------------------------------------------- */ - -static void au_hfsn_free_group(struct fsnotify_group *group) -{ - struct au_br_hfsnotify *hfsn = group->private; - - AuDbg("here\n"); - kfree(hfsn); -} - -static int au_hfsn_handle_event(struct fsnotify_group *group, - struct inode *inode, - struct fsnotify_mark *inode_mark, - struct fsnotify_mark *vfsmount_mark, - u32 mask, void *data, int data_type, - const unsigned char *file_name, u32 cookie) -{ - int err; - struct au_hnotify *hnotify; - struct inode *h_dir, *h_inode; - struct qstr h_child_qstr = QSTR_INIT(file_name, strlen(file_name)); - - AuDebugOn(data_type != FSNOTIFY_EVENT_INODE); - - err = 0; - /* if FS_UNMOUNT happens, there must be another bug */ - AuDebugOn(mask & FS_UNMOUNT); - if (mask & (FS_IN_IGNORED | FS_UNMOUNT)) - goto out; - - h_dir = inode; - h_inode = NULL; -#ifdef AuDbgHnotify - au_debug_on(); - if (1 || h_child_qstr.len != sizeof(AUFS_XINO_FNAME) - 1 - || strncmp(h_child_qstr.name, AUFS_XINO_FNAME, h_child_qstr.len)) { - AuDbg("i%lu, mask 0x%x %s, hcname %.*s, hi%lu\n", - h_dir->i_ino, mask, au_hfsn_name(mask), - AuLNPair(&h_child_qstr), h_inode ? h_inode->i_ino : 0); - /* WARN_ON(1); */ - } - au_debug_off(); -#endif - - AuDebugOn(!inode_mark); - hnotify = container_of(inode_mark, struct au_hnotify, hn_mark); - err = au_hnotify(h_dir, hnotify, mask, &h_child_qstr, h_inode); - -out: - return err; -} - -static struct fsnotify_ops au_hfsn_ops = { - .handle_event = au_hfsn_handle_event, - .free_group_priv = au_hfsn_free_group -}; - -/* ---------------------------------------------------------------------- */ - -static void au_hfsn_fin_br(struct au_branch *br) -{ - struct au_br_hfsnotify *hfsn; - - hfsn = br->br_hfsn; - if (hfsn) { - lockdep_off(); - fsnotify_put_group(hfsn->hfsn_group); - lockdep_on(); - } -} - -static int au_hfsn_init_br(struct au_branch *br, int perm) -{ - int err; - struct fsnotify_group *group; - struct au_br_hfsnotify *hfsn; - - err = 0; - br->br_hfsn = NULL; - if (!au_br_hnotifyable(perm)) - goto out; - - err = -ENOMEM; - hfsn = kmalloc(sizeof(*hfsn), GFP_NOFS); - if (unlikely(!hfsn)) - goto out; - - err = 0; - group = fsnotify_alloc_group(&au_hfsn_ops); - if (IS_ERR(group)) { - err = PTR_ERR(group); - pr_err("fsnotify_alloc_group() failed, %d\n", err); - goto out_hfsn; - } - - group->private = hfsn; - hfsn->hfsn_group = group; - br->br_hfsn = hfsn; - goto out; /* success */ - -out_hfsn: - kfree(hfsn); -out: - return err; -} - -static int au_hfsn_reset_br(unsigned int udba, struct au_branch *br, int perm) -{ - int err; - - err = 0; - if (!br->br_hfsn) - err = au_hfsn_init_br(br, perm); - - return err; -} - -/* ---------------------------------------------------------------------- */ - -static void au_hfsn_fin(void) -{ - AuDbg("au_hfsn_ifree %lld\n", (long long)atomic64_read(&au_hfsn_ifree)); - wait_event(au_hfsn_wq, !atomic64_read(&au_hfsn_ifree)); -} - -const struct au_hnotify_op au_hnotify_op = { - .ctl = au_hfsn_ctl, - .alloc = au_hfsn_alloc, - .free = au_hfsn_free, - - .fin = au_hfsn_fin, - - .reset_br = au_hfsn_reset_br, - .fin_br = au_hfsn_fin_br, - .init_br = au_hfsn_init_br -}; diff --git a/fs/aufs/hfsplus.c b/fs/aufs/hfsplus.c deleted file mode 100644 index 145c6ac2f..000000000 --- a/fs/aufs/hfsplus.c +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Copyright (C) 2010-2016 Junjiro R. Okajima - */ - -/* - * special support for filesystems which aqucires an inode mutex - * at final closing a file, eg, hfsplus. - * - * This trick is very simple and stupid, just to open the file before really - * neceeary open to tell hfsplus that this is not the final closing. - * The caller should call au_h_open_pre() after acquiring the inode mutex, - * and au_h_open_post() after releasing it. - */ - -#include "aufs.h" - -struct file *au_h_open_pre(struct dentry *dentry, aufs_bindex_t bindex, - int force_wr) -{ - struct file *h_file; - struct dentry *h_dentry; - - h_dentry = au_h_dptr(dentry, bindex); - AuDebugOn(!h_dentry); - AuDebugOn(d_is_negative(h_dentry)); - - h_file = NULL; - if (au_test_hfsplus(h_dentry->d_sb) - && d_is_reg(h_dentry)) - h_file = au_h_open(dentry, bindex, - O_RDONLY | O_NOATIME | O_LARGEFILE, - /*file*/NULL, force_wr); - return h_file; -} - -void au_h_open_post(struct dentry *dentry, aufs_bindex_t bindex, - struct file *h_file) -{ - if (h_file) { - fput(h_file); - au_sbr_put(dentry->d_sb, bindex); - } -} diff --git a/fs/aufs/hnotify.c b/fs/aufs/hnotify.c deleted file mode 100644 index 3e0a4f67d..000000000 --- a/fs/aufs/hnotify.c +++ /dev/null @@ -1,697 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * abstraction to notify the direct changes on lower directories - */ - -#include "aufs.h" - -int au_hn_alloc(struct au_hinode *hinode, struct inode *inode) -{ - int err; - struct au_hnotify *hn; - - err = -ENOMEM; - hn = au_cache_alloc_hnotify(); - if (hn) { - hn->hn_aufs_inode = inode; - hinode->hi_notify = hn; - err = au_hnotify_op.alloc(hinode); - AuTraceErr(err); - if (unlikely(err)) { - hinode->hi_notify = NULL; - au_cache_free_hnotify(hn); - /* - * The upper dir was removed by udba, but the same named - * dir left. In this case, aufs assignes a new inode - * number and set the monitor again. - * For the lower dir, the old monitnor is still left. - */ - if (err == -EEXIST) - err = 0; - } - } - - AuTraceErr(err); - return err; -} - -void au_hn_free(struct au_hinode *hinode) -{ - struct au_hnotify *hn; - - hn = hinode->hi_notify; - if (hn) { - hinode->hi_notify = NULL; - if (au_hnotify_op.free(hinode, hn)) - au_cache_free_hnotify(hn); - } -} - -/* ---------------------------------------------------------------------- */ - -void au_hn_ctl(struct au_hinode *hinode, int do_set) -{ - if (hinode->hi_notify) - au_hnotify_op.ctl(hinode, do_set); -} - -void au_hn_reset(struct inode *inode, unsigned int flags) -{ - aufs_bindex_t bindex, bend; - struct inode *hi; - struct dentry *iwhdentry; - - bend = au_ibend(inode); - for (bindex = au_ibstart(inode); bindex <= bend; bindex++) { - hi = au_h_iptr(inode, bindex); - if (!hi) - continue; - - /* mutex_lock_nested(&hi->i_mutex, AuLsc_I_CHILD); */ - iwhdentry = au_hi_wh(inode, bindex); - if (iwhdentry) - dget(iwhdentry); - au_igrab(hi); - au_set_h_iptr(inode, bindex, NULL, 0); - au_set_h_iptr(inode, bindex, au_igrab(hi), - flags & ~AuHi_XINO); - iput(hi); - dput(iwhdentry); - /* mutex_unlock(&hi->i_mutex); */ - } -} - -/* ---------------------------------------------------------------------- */ - -static int hn_xino(struct inode *inode, struct inode *h_inode) -{ - int err; - aufs_bindex_t bindex, bend, bfound, bstart; - struct inode *h_i; - - err = 0; - if (unlikely(inode->i_ino == AUFS_ROOT_INO)) { - pr_warn("branch root dir was changed\n"); - goto out; - } - - bfound = -1; - bend = au_ibend(inode); - bstart = au_ibstart(inode); -#if 0 /* reserved for future use */ - if (bindex == bend) { - /* keep this ino in rename case */ - goto out; - } -#endif - for (bindex = bstart; bindex <= bend; bindex++) - if (au_h_iptr(inode, bindex) == h_inode) { - bfound = bindex; - break; - } - if (bfound < 0) - goto out; - - for (bindex = bstart; bindex <= bend; bindex++) { - h_i = au_h_iptr(inode, bindex); - if (!h_i) - continue; - - err = au_xino_write(inode->i_sb, bindex, h_i->i_ino, /*ino*/0); - /* ignore this error */ - /* bad action? */ - } - - /* children inode number will be broken */ - -out: - AuTraceErr(err); - return err; -} - -static int hn_gen_tree(struct dentry *dentry) -{ - int err, i, j, ndentry; - struct au_dcsub_pages dpages; - struct au_dpage *dpage; - struct dentry **dentries; - - err = au_dpages_init(&dpages, GFP_NOFS); - if (unlikely(err)) - goto out; - err = au_dcsub_pages(&dpages, dentry, NULL, NULL); - if (unlikely(err)) - goto out_dpages; - - for (i = 0; i < dpages.ndpage; i++) { - dpage = dpages.dpages + i; - dentries = dpage->dentries; - ndentry = dpage->ndentry; - for (j = 0; j < ndentry; j++) { - struct dentry *d; - - d = dentries[j]; - if (IS_ROOT(d)) - continue; - - au_digen_dec(d); - if (d_really_is_positive(d)) - /* todo: reset children xino? - cached children only? */ - au_iigen_dec(d_inode(d)); - } - } - -out_dpages: - au_dpages_free(&dpages); - -#if 0 - /* discard children */ - dentry_unhash(dentry); - dput(dentry); -#endif -out: - return err; -} - -/* - * return 0 if processed. - */ -static int hn_gen_by_inode(char *name, unsigned int nlen, struct inode *inode, - const unsigned int isdir) -{ - int err; - struct dentry *d; - struct qstr *dname; - - err = 1; - if (unlikely(inode->i_ino == AUFS_ROOT_INO)) { - pr_warn("branch root dir was changed\n"); - err = 0; - goto out; - } - - if (!isdir) { - AuDebugOn(!name); - au_iigen_dec(inode); - spin_lock(&inode->i_lock); - hlist_for_each_entry(d, &inode->i_dentry, d_u.d_alias) { - spin_lock(&d->d_lock); - dname = &d->d_name; - if (dname->len != nlen - && memcmp(dname->name, name, nlen)) { - spin_unlock(&d->d_lock); - continue; - } - err = 0; - au_digen_dec(d); - spin_unlock(&d->d_lock); - break; - } - spin_unlock(&inode->i_lock); - } else { - au_fset_si(au_sbi(inode->i_sb), FAILED_REFRESH_DIR); - d = d_find_any_alias(inode); - if (!d) { - au_iigen_dec(inode); - goto out; - } - - spin_lock(&d->d_lock); - dname = &d->d_name; - if (dname->len == nlen && !memcmp(dname->name, name, nlen)) { - spin_unlock(&d->d_lock); - err = hn_gen_tree(d); - spin_lock(&d->d_lock); - } - spin_unlock(&d->d_lock); - dput(d); - } - -out: - AuTraceErr(err); - return err; -} - -static int hn_gen_by_name(struct dentry *dentry, const unsigned int isdir) -{ - int err; - - if (IS_ROOT(dentry)) { - pr_warn("branch root dir was changed\n"); - return 0; - } - - err = 0; - if (!isdir) { - au_digen_dec(dentry); - if (d_really_is_positive(dentry)) - au_iigen_dec(d_inode(dentry)); - } else { - au_fset_si(au_sbi(dentry->d_sb), FAILED_REFRESH_DIR); - if (d_really_is_positive(dentry)) - err = hn_gen_tree(dentry); - } - - AuTraceErr(err); - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* hnotify job flags */ -#define AuHnJob_XINO0 1 -#define AuHnJob_GEN (1 << 1) -#define AuHnJob_DIRENT (1 << 2) -#define AuHnJob_ISDIR (1 << 3) -#define AuHnJob_TRYXINO0 (1 << 4) -#define AuHnJob_MNTPNT (1 << 5) -#define au_ftest_hnjob(flags, name) ((flags) & AuHnJob_##name) -#define au_fset_hnjob(flags, name) \ - do { (flags) |= AuHnJob_##name; } while (0) -#define au_fclr_hnjob(flags, name) \ - do { (flags) &= ~AuHnJob_##name; } while (0) - -enum { - AuHn_CHILD, - AuHn_PARENT, - AuHnLast -}; - -struct au_hnotify_args { - struct inode *h_dir, *dir, *h_child_inode; - u32 mask; - unsigned int flags[AuHnLast]; - unsigned int h_child_nlen; - char h_child_name[]; -}; - -struct hn_job_args { - unsigned int flags; - struct inode *inode, *h_inode, *dir, *h_dir; - struct dentry *dentry; - char *h_name; - int h_nlen; -}; - -static int hn_job(struct hn_job_args *a) -{ - const unsigned int isdir = au_ftest_hnjob(a->flags, ISDIR); - int e; - - /* reset xino */ - if (au_ftest_hnjob(a->flags, XINO0) && a->inode) - hn_xino(a->inode, a->h_inode); /* ignore this error */ - - if (au_ftest_hnjob(a->flags, TRYXINO0) - && a->inode - && a->h_inode) { - mutex_lock_nested(&a->h_inode->i_mutex, AuLsc_I_CHILD); - if (!a->h_inode->i_nlink - && !(a->h_inode->i_state & I_LINKABLE)) - hn_xino(a->inode, a->h_inode); /* ignore this error */ - mutex_unlock(&a->h_inode->i_mutex); - } - - /* make the generation obsolete */ - if (au_ftest_hnjob(a->flags, GEN)) { - e = -1; - if (a->inode) - e = hn_gen_by_inode(a->h_name, a->h_nlen, a->inode, - isdir); - if (e && a->dentry) - hn_gen_by_name(a->dentry, isdir); - /* ignore this error */ - } - - /* make dir entries obsolete */ - if (au_ftest_hnjob(a->flags, DIRENT) && a->inode) { - struct au_vdir *vdir; - - vdir = au_ivdir(a->inode); - if (vdir) - vdir->vd_jiffy = 0; - /* IMustLock(a->inode); */ - /* a->inode->i_version++; */ - } - - /* can do nothing but warn */ - if (au_ftest_hnjob(a->flags, MNTPNT) - && a->dentry - && d_mountpoint(a->dentry)) - pr_warn("mount-point %pd is removed or renamed\n", a->dentry); - - return 0; -} - -/* ---------------------------------------------------------------------- */ - -static struct dentry *lookup_wlock_by_name(char *name, unsigned int nlen, - struct inode *dir) -{ - struct dentry *dentry, *d, *parent; - struct qstr *dname; - - parent = d_find_any_alias(dir); - if (!parent) - return NULL; - - dentry = NULL; - spin_lock(&parent->d_lock); - list_for_each_entry(d, &parent->d_subdirs, d_child) { - /* AuDbg("%pd\n", d); */ - spin_lock_nested(&d->d_lock, DENTRY_D_LOCK_NESTED); - dname = &d->d_name; - if (dname->len != nlen || memcmp(dname->name, name, nlen)) - goto cont_unlock; - if (au_di(d)) - au_digen_dec(d); - else - goto cont_unlock; - if (au_dcount(d) > 0) { - dentry = dget_dlock(d); - spin_unlock(&d->d_lock); - break; - } - -cont_unlock: - spin_unlock(&d->d_lock); - } - spin_unlock(&parent->d_lock); - dput(parent); - - if (dentry) - di_write_lock_child(dentry); - - return dentry; -} - -static struct inode *lookup_wlock_by_ino(struct super_block *sb, - aufs_bindex_t bindex, ino_t h_ino) -{ - struct inode *inode; - ino_t ino; - int err; - - inode = NULL; - err = au_xino_read(sb, bindex, h_ino, &ino); - if (!err && ino) - inode = ilookup(sb, ino); - if (!inode) - goto out; - - if (unlikely(inode->i_ino == AUFS_ROOT_INO)) { - pr_warn("wrong root branch\n"); - iput(inode); - inode = NULL; - goto out; - } - - ii_write_lock_child(inode); - -out: - return inode; -} - -static void au_hn_bh(void *_args) -{ - struct au_hnotify_args *a = _args; - struct super_block *sb; - aufs_bindex_t bindex, bend, bfound; - unsigned char xino, try_iput; - int err; - struct inode *inode; - ino_t h_ino; - struct hn_job_args args; - struct dentry *dentry; - struct au_sbinfo *sbinfo; - - AuDebugOn(!_args); - AuDebugOn(!a->h_dir); - AuDebugOn(!a->dir); - AuDebugOn(!a->mask); - AuDbg("mask 0x%x, i%lu, hi%lu, hci%lu\n", - a->mask, a->dir->i_ino, a->h_dir->i_ino, - a->h_child_inode ? a->h_child_inode->i_ino : 0); - - inode = NULL; - dentry = NULL; - /* - * do not lock a->dir->i_mutex here - * because of d_revalidate() may cause a deadlock. - */ - sb = a->dir->i_sb; - AuDebugOn(!sb); - sbinfo = au_sbi(sb); - AuDebugOn(!sbinfo); - si_write_lock(sb, AuLock_NOPLMW); - - ii_read_lock_parent(a->dir); - bfound = -1; - bend = au_ibend(a->dir); - for (bindex = au_ibstart(a->dir); bindex <= bend; bindex++) - if (au_h_iptr(a->dir, bindex) == a->h_dir) { - bfound = bindex; - break; - } - ii_read_unlock(a->dir); - if (unlikely(bfound < 0)) - goto out; - - xino = !!au_opt_test(au_mntflags(sb), XINO); - h_ino = 0; - if (a->h_child_inode) - h_ino = a->h_child_inode->i_ino; - - if (a->h_child_nlen - && (au_ftest_hnjob(a->flags[AuHn_CHILD], GEN) - || au_ftest_hnjob(a->flags[AuHn_CHILD], MNTPNT))) - dentry = lookup_wlock_by_name(a->h_child_name, a->h_child_nlen, - a->dir); - try_iput = 0; - if (dentry && d_really_is_positive(dentry)) - inode = d_inode(dentry); - if (xino && !inode && h_ino - && (au_ftest_hnjob(a->flags[AuHn_CHILD], XINO0) - || au_ftest_hnjob(a->flags[AuHn_CHILD], TRYXINO0) - || au_ftest_hnjob(a->flags[AuHn_CHILD], GEN))) { - inode = lookup_wlock_by_ino(sb, bfound, h_ino); - try_iput = 1; - } - - args.flags = a->flags[AuHn_CHILD]; - args.dentry = dentry; - args.inode = inode; - args.h_inode = a->h_child_inode; - args.dir = a->dir; - args.h_dir = a->h_dir; - args.h_name = a->h_child_name; - args.h_nlen = a->h_child_nlen; - err = hn_job(&args); - if (dentry) { - if (au_di(dentry)) - di_write_unlock(dentry); - dput(dentry); - } - if (inode && try_iput) { - ii_write_unlock(inode); - iput(inode); - } - - ii_write_lock_parent(a->dir); - args.flags = a->flags[AuHn_PARENT]; - args.dentry = NULL; - args.inode = a->dir; - args.h_inode = a->h_dir; - args.dir = NULL; - args.h_dir = NULL; - args.h_name = NULL; - args.h_nlen = 0; - err = hn_job(&args); - ii_write_unlock(a->dir); - -out: - iput(a->h_child_inode); - iput(a->h_dir); - iput(a->dir); - si_write_unlock(sb); - au_nwt_done(&sbinfo->si_nowait); - kfree(a); -} - -/* ---------------------------------------------------------------------- */ - -int au_hnotify(struct inode *h_dir, struct au_hnotify *hnotify, u32 mask, - struct qstr *h_child_qstr, struct inode *h_child_inode) -{ - int err, len; - unsigned int flags[AuHnLast], f; - unsigned char isdir, isroot, wh; - struct inode *dir; - struct au_hnotify_args *args; - char *p, *h_child_name; - - err = 0; - AuDebugOn(!hnotify || !hnotify->hn_aufs_inode); - dir = igrab(hnotify->hn_aufs_inode); - if (!dir) - goto out; - - isroot = (dir->i_ino == AUFS_ROOT_INO); - wh = 0; - h_child_name = (void *)h_child_qstr->name; - len = h_child_qstr->len; - if (h_child_name) { - if (len > AUFS_WH_PFX_LEN - && !memcmp(h_child_name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) { - h_child_name += AUFS_WH_PFX_LEN; - len -= AUFS_WH_PFX_LEN; - wh = 1; - } - } - - isdir = 0; - if (h_child_inode) - isdir = !!S_ISDIR(h_child_inode->i_mode); - flags[AuHn_PARENT] = AuHnJob_ISDIR; - flags[AuHn_CHILD] = 0; - if (isdir) - flags[AuHn_CHILD] = AuHnJob_ISDIR; - au_fset_hnjob(flags[AuHn_PARENT], DIRENT); - au_fset_hnjob(flags[AuHn_CHILD], GEN); - switch (mask & FS_EVENTS_POSS_ON_CHILD) { - case FS_MOVED_FROM: - case FS_MOVED_TO: - au_fset_hnjob(flags[AuHn_CHILD], XINO0); - au_fset_hnjob(flags[AuHn_CHILD], MNTPNT); - /*FALLTHROUGH*/ - case FS_CREATE: - AuDebugOn(!h_child_name); - break; - - case FS_DELETE: - /* - * aufs never be able to get this child inode. - * revalidation should be in d_revalidate() - * by checking i_nlink, i_generation or d_unhashed(). - */ - AuDebugOn(!h_child_name); - au_fset_hnjob(flags[AuHn_CHILD], TRYXINO0); - au_fset_hnjob(flags[AuHn_CHILD], MNTPNT); - break; - - default: - AuDebugOn(1); - } - - if (wh) - h_child_inode = NULL; - - err = -ENOMEM; - /* iput() and kfree() will be called in au_hnotify() */ - args = kmalloc(sizeof(*args) + len + 1, GFP_NOFS); - if (unlikely(!args)) { - AuErr1("no memory\n"); - iput(dir); - goto out; - } - args->flags[AuHn_PARENT] = flags[AuHn_PARENT]; - args->flags[AuHn_CHILD] = flags[AuHn_CHILD]; - args->mask = mask; - args->dir = dir; - args->h_dir = igrab(h_dir); - if (h_child_inode) - h_child_inode = igrab(h_child_inode); /* can be NULL */ - args->h_child_inode = h_child_inode; - args->h_child_nlen = len; - if (len) { - p = (void *)args; - p += sizeof(*args); - memcpy(p, h_child_name, len); - p[len] = 0; - } - - /* NFS fires the event for silly-renamed one from kworker */ - f = 0; - if (!dir->i_nlink - || (au_test_nfs(h_dir->i_sb) && (mask & FS_DELETE))) - f = AuWkq_NEST; - err = au_wkq_nowait(au_hn_bh, args, dir->i_sb, f); - if (unlikely(err)) { - pr_err("wkq %d\n", err); - iput(args->h_child_inode); - iput(args->h_dir); - iput(args->dir); - kfree(args); - } - -out: - return err; -} - -/* ---------------------------------------------------------------------- */ - -int au_hnotify_reset_br(unsigned int udba, struct au_branch *br, int perm) -{ - int err; - - AuDebugOn(!(udba & AuOptMask_UDBA)); - - err = 0; - if (au_hnotify_op.reset_br) - err = au_hnotify_op.reset_br(udba, br, perm); - - return err; -} - -int au_hnotify_init_br(struct au_branch *br, int perm) -{ - int err; - - err = 0; - if (au_hnotify_op.init_br) - err = au_hnotify_op.init_br(br, perm); - - return err; -} - -void au_hnotify_fin_br(struct au_branch *br) -{ - if (au_hnotify_op.fin_br) - au_hnotify_op.fin_br(br); -} - -static void au_hn_destroy_cache(void) -{ - kmem_cache_destroy(au_cachep[AuCache_HNOTIFY]); - au_cachep[AuCache_HNOTIFY] = NULL; -} - -int __init au_hnotify_init(void) -{ - int err; - - err = -ENOMEM; - au_cachep[AuCache_HNOTIFY] = AuCache(au_hnotify); - if (au_cachep[AuCache_HNOTIFY]) { - err = 0; - if (au_hnotify_op.init) - err = au_hnotify_op.init(); - if (unlikely(err)) - au_hn_destroy_cache(); - } - AuTraceErr(err); - return err; -} - -void au_hnotify_fin(void) -{ - if (au_hnotify_op.fin) - au_hnotify_op.fin(); - /* cf. au_cache_fin() */ - if (au_cachep[AuCache_HNOTIFY]) - au_hn_destroy_cache(); -} diff --git a/fs/aufs/i_op.c b/fs/aufs/i_op.c deleted file mode 100644 index 6e50526d8..000000000 --- a/fs/aufs/i_op.c +++ /dev/null @@ -1,1477 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * inode operations (except add/del/rename) - */ - -#include <linux/device_cgroup.h> -#include <linux/fs_stack.h> -#include <linux/namei.h> -#include <linux/security.h> -#include "aufs.h" - -static int h_permission(struct inode *h_inode, int mask, - struct vfsmount *h_mnt, int brperm) -{ - int err; - const unsigned char write_mask = !!(mask & (MAY_WRITE | MAY_APPEND)); - - err = -EACCES; - if ((write_mask && IS_IMMUTABLE(h_inode)) - || ((mask & MAY_EXEC) - && S_ISREG(h_inode->i_mode) - && ((h_mnt->mnt_flags & MNT_NOEXEC) - || !(h_inode->i_mode & S_IXUGO)))) - goto out; - - /* - * - skip the lower fs test in the case of write to ro branch. - * - nfs dir permission write check is optimized, but a policy for - * link/rename requires a real check. - * - nfs always sets MS_POSIXACL regardless its mount option 'noacl.' - * in this case, generic_permission() returns -EOPNOTSUPP. - */ - if ((write_mask && !au_br_writable(brperm)) - || (au_test_nfs(h_inode->i_sb) && S_ISDIR(h_inode->i_mode) - && write_mask && !(mask & MAY_READ)) - || !h_inode->i_op->permission) { - /* AuLabel(generic_permission); */ - /* AuDbg("get_acl %pf\n", h_inode->i_op->get_acl); */ - err = generic_permission(h_inode, mask); - if (err == -EOPNOTSUPP && au_test_nfs_noacl(h_inode)) - err = h_inode->i_op->permission(h_inode, mask); - AuTraceErr(err); - } else { - /* AuLabel(h_inode->permission); */ - err = h_inode->i_op->permission(h_inode, mask); - AuTraceErr(err); - } - - if (!err) - err = devcgroup_inode_permission(h_inode, mask); - if (!err) - err = security_inode_permission(h_inode, mask); - -#if 0 - if (!err) { - /* todo: do we need to call ima_path_check()? */ - struct path h_path = { - .dentry = - .mnt = h_mnt - }; - err = ima_path_check(&h_path, - mask & (MAY_READ | MAY_WRITE | MAY_EXEC), - IMA_COUNT_LEAVE); - } -#endif - -out: - return err; -} - -static int aufs_permission(struct inode *inode, int mask) -{ - int err; - aufs_bindex_t bindex, bend; - const unsigned char isdir = !!S_ISDIR(inode->i_mode), - write_mask = !!(mask & (MAY_WRITE | MAY_APPEND)); - struct inode *h_inode; - struct super_block *sb; - struct au_branch *br; - - /* todo: support rcu-walk? */ - if (mask & MAY_NOT_BLOCK) - return -ECHILD; - - sb = inode->i_sb; - si_read_lock(sb, AuLock_FLUSH); - ii_read_lock_child(inode); -#if 0 - err = au_iigen_test(inode, au_sigen(sb)); - if (unlikely(err)) - goto out; -#endif - - if (!isdir - || write_mask - || au_opt_test(au_mntflags(sb), DIRPERM1)) { - err = au_busy_or_stale(); - h_inode = au_h_iptr(inode, au_ibstart(inode)); - if (unlikely(!h_inode - || (h_inode->i_mode & S_IFMT) - != (inode->i_mode & S_IFMT))) - goto out; - - err = 0; - bindex = au_ibstart(inode); - br = au_sbr(sb, bindex); - err = h_permission(h_inode, mask, au_br_mnt(br), br->br_perm); - if (write_mask - && !err - && !special_file(h_inode->i_mode)) { - /* test whether the upper writable branch exists */ - err = -EROFS; - for (; bindex >= 0; bindex--) - if (!au_br_rdonly(au_sbr(sb, bindex))) { - err = 0; - break; - } - } - goto out; - } - - /* non-write to dir */ - err = 0; - bend = au_ibend(inode); - for (bindex = au_ibstart(inode); !err && bindex <= bend; bindex++) { - h_inode = au_h_iptr(inode, bindex); - if (h_inode) { - err = au_busy_or_stale(); - if (unlikely(!S_ISDIR(h_inode->i_mode))) - break; - - br = au_sbr(sb, bindex); - err = h_permission(h_inode, mask, au_br_mnt(br), - br->br_perm); - } - } - -out: - ii_read_unlock(inode); - si_read_unlock(sb); - return err; -} - -/* ---------------------------------------------------------------------- */ - -static struct dentry *aufs_lookup(struct inode *dir, struct dentry *dentry, - unsigned int flags) -{ - struct dentry *ret, *parent; - struct inode *inode; - struct super_block *sb; - int err, npositive; - - IMustLock(dir); - - /* todo: support rcu-walk? */ - ret = ERR_PTR(-ECHILD); - if (flags & LOOKUP_RCU) - goto out; - - ret = ERR_PTR(-ENAMETOOLONG); - if (unlikely(dentry->d_name.len > AUFS_MAX_NAMELEN)) - goto out; - - sb = dir->i_sb; - err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM); - ret = ERR_PTR(err); - if (unlikely(err)) - goto out; - - err = au_di_init(dentry); - ret = ERR_PTR(err); - if (unlikely(err)) - goto out_si; - - inode = NULL; - npositive = 0; /* suppress a warning */ - parent = dentry->d_parent; /* dir inode is locked */ - di_read_lock_parent(parent, AuLock_IR); - err = au_alive_dir(parent); - if (!err) - err = au_digen_test(parent, au_sigen(sb)); - if (!err) { - npositive = au_lkup_dentry(dentry, au_dbstart(parent), - /*type*/0); - err = npositive; - } - di_read_unlock(parent, AuLock_IR); - ret = ERR_PTR(err); - if (unlikely(err < 0)) - goto out_unlock; - - if (npositive) { - inode = au_new_inode(dentry, /*must_new*/0); - if (IS_ERR(inode)) { - ret = (void *)inode; - inode = NULL; - goto out_unlock; - } - } - - if (inode) - atomic_inc(&inode->i_count); - ret = d_splice_alias(inode, dentry); -#if 0 - if (unlikely(d_need_lookup(dentry))) { - spin_lock(&dentry->d_lock); - dentry->d_flags &= ~DCACHE_NEED_LOOKUP; - spin_unlock(&dentry->d_lock); - } else -#endif - if (inode) { - if (!IS_ERR(ret)) { - iput(inode); - if (ret && ret != dentry) - ii_write_unlock(inode); - } else { - ii_write_unlock(inode); - iput(inode); - inode = NULL; - } - } - -out_unlock: - di_write_unlock(dentry); - if (inode) { - /* verbose coding for lock class name */ - if (unlikely(S_ISLNK(inode->i_mode))) - au_rw_class(&au_di(dentry)->di_rwsem, - au_lc_key + AuLcSymlink_DIINFO); - else if (unlikely(S_ISDIR(inode->i_mode))) - au_rw_class(&au_di(dentry)->di_rwsem, - au_lc_key + AuLcDir_DIINFO); - else /* likely */ - au_rw_class(&au_di(dentry)->di_rwsem, - au_lc_key + AuLcNonDir_DIINFO); - } -out_si: - si_read_unlock(sb); -out: - return ret; -} - -/* ---------------------------------------------------------------------- */ - -struct aopen_node { - struct hlist_node hlist; - struct file *file, *h_file; -}; - -static int au_do_aopen(struct inode *inode, struct file *file) -{ - struct au_sphlhead *aopen; - struct aopen_node *node; - struct au_do_open_args args = { - .no_lock = 1, - .open = au_do_open_nondir - }; - - aopen = &au_sbi(inode->i_sb)->si_aopen; - spin_lock(&aopen->spin); - hlist_for_each_entry(node, &aopen->head, hlist) - if (node->file == file) { - args.h_file = node->h_file; - break; - } - spin_unlock(&aopen->spin); - /* AuDebugOn(!args.h_file); */ - - return au_do_open(file, &args); -} - -static int aufs_atomic_open(struct inode *dir, struct dentry *dentry, - struct file *file, unsigned int open_flag, - umode_t create_mode, int *opened) -{ - int err, h_opened = *opened; - struct dentry *parent; - struct dentry *d; - struct au_sphlhead *aopen; - struct vfsub_aopen_args args = { - .open_flag = open_flag, - .create_mode = create_mode, - .opened = &h_opened - }; - struct aopen_node aopen_node = { - .file = file - }; - - IMustLock(dir); - AuDbg("open_flag 0x%x\n", open_flag); - AuDbgDentry(dentry); - - err = 0; - if (!au_di(dentry)) { - d = aufs_lookup(dir, dentry, /*flags*/0); - if (IS_ERR(d)) { - err = PTR_ERR(d); - goto out; - } else if (d) { - /* - * obsoleted dentry found. - * another error will be returned later. - */ - d_drop(d); - dput(d); - AuDbgDentry(d); - } - AuDbgDentry(dentry); - } - - if (d_is_positive(dentry) - || d_unhashed(dentry) - || d_unlinked(dentry) - || !(open_flag & O_CREAT)) - goto out_no_open; - - err = aufs_read_lock(dentry, AuLock_DW | AuLock_FLUSH | AuLock_GEN); - if (unlikely(err)) - goto out; - - parent = dentry->d_parent; /* dir is locked */ - di_write_lock_parent(parent); - err = au_lkup_dentry(dentry, /*bstart*/0, /*type*/0); - if (unlikely(err)) - goto out_unlock; - - AuDbgDentry(dentry); - if (d_is_positive(dentry)) - goto out_unlock; - - args.file = get_empty_filp(); - err = PTR_ERR(args.file); - if (IS_ERR(args.file)) - goto out_unlock; - - args.file->f_flags = file->f_flags; - err = au_aopen_or_create(dir, dentry, &args); - AuTraceErr(err); - AuDbgFile(args.file); - if (unlikely(err < 0)) { - if (h_opened & FILE_OPENED) - fput(args.file); - else - put_filp(args.file); - goto out_unlock; - } - - /* some filesystems don't set FILE_CREATED while succeeded? */ - *opened |= FILE_CREATED; - if (h_opened & FILE_OPENED) - aopen_node.h_file = args.file; - else { - put_filp(args.file); - args.file = NULL; - } - aopen = &au_sbi(dir->i_sb)->si_aopen; - au_sphl_add(&aopen_node.hlist, aopen); - err = finish_open(file, dentry, au_do_aopen, opened); - au_sphl_del(&aopen_node.hlist, aopen); - AuTraceErr(err); - AuDbgFile(file); - if (aopen_node.h_file) - fput(aopen_node.h_file); - -out_unlock: - di_write_unlock(parent); - aufs_read_unlock(dentry, AuLock_DW); - AuDbgDentry(dentry); - if (unlikely(err)) - goto out; -out_no_open: - if (!err && !(*opened & FILE_CREATED)) { - AuLabel(out_no_open); - dget(dentry); - err = finish_no_open(file, dentry); - } -out: - AuDbg("%pd%s%s\n", dentry, - (*opened & FILE_CREATED) ? " created" : "", - (*opened & FILE_OPENED) ? " opened" : ""); - AuTraceErr(err); - return err; -} - - -/* ---------------------------------------------------------------------- */ - -static int au_wr_dir_cpup(struct dentry *dentry, struct dentry *parent, - const unsigned char add_entry, aufs_bindex_t bcpup, - aufs_bindex_t bstart) -{ - int err; - struct dentry *h_parent; - struct inode *h_dir; - - if (add_entry) - IMustLock(d_inode(parent)); - else - di_write_lock_parent(parent); - - err = 0; - if (!au_h_dptr(parent, bcpup)) { - if (bstart > bcpup) - err = au_cpup_dirs(dentry, bcpup); - else if (bstart < bcpup) - err = au_cpdown_dirs(dentry, bcpup); - else - BUG(); - } - if (!err && add_entry && !au_ftest_wrdir(add_entry, TMPFILE)) { - h_parent = au_h_dptr(parent, bcpup); - h_dir = d_inode(h_parent); - mutex_lock_nested(&h_dir->i_mutex, AuLsc_I_PARENT); - err = au_lkup_neg(dentry, bcpup, /*wh*/0); - /* todo: no unlock here */ - mutex_unlock(&h_dir->i_mutex); - - AuDbg("bcpup %d\n", bcpup); - if (!err) { - if (d_really_is_negative(dentry)) - au_set_h_dptr(dentry, bstart, NULL); - au_update_dbrange(dentry, /*do_put_zero*/0); - } - } - - if (!add_entry) - di_write_unlock(parent); - if (!err) - err = bcpup; /* success */ - - AuTraceErr(err); - return err; -} - -/* - * decide the branch and the parent dir where we will create a new entry. - * returns new bindex or an error. - * copyup the parent dir if needed. - */ -int au_wr_dir(struct dentry *dentry, struct dentry *src_dentry, - struct au_wr_dir_args *args) -{ - int err; - unsigned int flags; - aufs_bindex_t bcpup, bstart, src_bstart; - const unsigned char add_entry - = au_ftest_wrdir(args->flags, ADD_ENTRY) - | au_ftest_wrdir(args->flags, TMPFILE); - struct super_block *sb; - struct dentry *parent; - struct au_sbinfo *sbinfo; - - sb = dentry->d_sb; - sbinfo = au_sbi(sb); - parent = dget_parent(dentry); - bstart = au_dbstart(dentry); - bcpup = bstart; - if (args->force_btgt < 0) { - if (src_dentry) { - src_bstart = au_dbstart(src_dentry); - if (src_bstart < bstart) - bcpup = src_bstart; - } else if (add_entry) { - flags = 0; - if (au_ftest_wrdir(args->flags, ISDIR)) - au_fset_wbr(flags, DIR); - err = AuWbrCreate(sbinfo, dentry, flags); - bcpup = err; - } - - if (bcpup < 0 || au_test_ro(sb, bcpup, d_inode(dentry))) { - if (add_entry) - err = AuWbrCopyup(sbinfo, dentry); - else { - if (!IS_ROOT(dentry)) { - di_read_lock_parent(parent, !AuLock_IR); - err = AuWbrCopyup(sbinfo, dentry); - di_read_unlock(parent, !AuLock_IR); - } else - err = AuWbrCopyup(sbinfo, dentry); - } - bcpup = err; - if (unlikely(err < 0)) - goto out; - } - } else { - bcpup = args->force_btgt; - AuDebugOn(au_test_ro(sb, bcpup, d_inode(dentry))); - } - - AuDbg("bstart %d, bcpup %d\n", bstart, bcpup); - err = bcpup; - if (bcpup == bstart) - goto out; /* success */ - - /* copyup the new parent into the branch we process */ - err = au_wr_dir_cpup(dentry, parent, add_entry, bcpup, bstart); - if (err >= 0) { - if (d_really_is_negative(dentry)) { - au_set_h_dptr(dentry, bstart, NULL); - au_set_dbstart(dentry, bcpup); - au_set_dbend(dentry, bcpup); - } - AuDebugOn(add_entry - && !au_ftest_wrdir(args->flags, TMPFILE) - && !au_h_dptr(dentry, bcpup)); - } - -out: - dput(parent); - return err; -} - -/* ---------------------------------------------------------------------- */ - -void au_pin_hdir_unlock(struct au_pin *p) -{ - if (p->hdir) - au_hn_imtx_unlock(p->hdir); -} - -int au_pin_hdir_lock(struct au_pin *p) -{ - int err; - - err = 0; - if (!p->hdir) - goto out; - - /* even if an error happens later, keep this lock */ - au_hn_imtx_lock_nested(p->hdir, p->lsc_hi); - - err = -EBUSY; - if (unlikely(p->hdir->hi_inode != d_inode(p->h_parent))) - goto out; - - err = 0; - if (p->h_dentry) - err = au_h_verify(p->h_dentry, p->udba, p->hdir->hi_inode, - p->h_parent, p->br); - -out: - return err; -} - -int au_pin_hdir_relock(struct au_pin *p) -{ - int err, i; - struct inode *h_i; - struct dentry *h_d[] = { - p->h_dentry, - p->h_parent - }; - - err = au_pin_hdir_lock(p); - if (unlikely(err)) - goto out; - - for (i = 0; !err && i < sizeof(h_d)/sizeof(*h_d); i++) { - if (!h_d[i]) - continue; - if (d_is_positive(h_d[i])) { - h_i = d_inode(h_d[i]); - err = !h_i->i_nlink; - } - } - -out: - return err; -} - -void au_pin_hdir_set_owner(struct au_pin *p, struct task_struct *task) -{ -#if defined(CONFIG_DEBUG_MUTEXES) || defined(CONFIG_SMP) - p->hdir->hi_inode->i_mutex.owner = task; -#endif -} - -void au_pin_hdir_acquire_nest(struct au_pin *p) -{ - if (p->hdir) { - mutex_acquire_nest(&p->hdir->hi_inode->i_mutex.dep_map, - p->lsc_hi, 0, NULL, _RET_IP_); - au_pin_hdir_set_owner(p, current); - } -} - -void au_pin_hdir_release(struct au_pin *p) -{ - if (p->hdir) { - au_pin_hdir_set_owner(p, p->task); - mutex_release(&p->hdir->hi_inode->i_mutex.dep_map, 1, _RET_IP_); - } -} - -struct dentry *au_pinned_h_parent(struct au_pin *pin) -{ - if (pin && pin->parent) - return au_h_dptr(pin->parent, pin->bindex); - return NULL; -} - -void au_unpin(struct au_pin *p) -{ - if (p->hdir) - au_pin_hdir_unlock(p); - if (p->h_mnt && au_ftest_pin(p->flags, MNT_WRITE)) - vfsub_mnt_drop_write(p->h_mnt); - if (!p->hdir) - return; - - if (!au_ftest_pin(p->flags, DI_LOCKED)) - di_read_unlock(p->parent, AuLock_IR); - iput(p->hdir->hi_inode); - dput(p->parent); - p->parent = NULL; - p->hdir = NULL; - p->h_mnt = NULL; - /* do not clear p->task */ -} - -int au_do_pin(struct au_pin *p) -{ - int err; - struct super_block *sb; - struct inode *h_dir; - - err = 0; - sb = p->dentry->d_sb; - p->br = au_sbr(sb, p->bindex); - if (IS_ROOT(p->dentry)) { - if (au_ftest_pin(p->flags, MNT_WRITE)) { - p->h_mnt = au_br_mnt(p->br); - err = vfsub_mnt_want_write(p->h_mnt); - if (unlikely(err)) { - au_fclr_pin(p->flags, MNT_WRITE); - goto out_err; - } - } - goto out; - } - - p->h_dentry = NULL; - if (p->bindex <= au_dbend(p->dentry)) - p->h_dentry = au_h_dptr(p->dentry, p->bindex); - - p->parent = dget_parent(p->dentry); - if (!au_ftest_pin(p->flags, DI_LOCKED)) - di_read_lock(p->parent, AuLock_IR, p->lsc_di); - - h_dir = NULL; - p->h_parent = au_h_dptr(p->parent, p->bindex); - p->hdir = au_hi(d_inode(p->parent), p->bindex); - if (p->hdir) - h_dir = p->hdir->hi_inode; - - /* - * udba case, or - * if DI_LOCKED is not set, then p->parent may be different - * and h_parent can be NULL. - */ - if (unlikely(!p->hdir || !h_dir || !p->h_parent)) { - err = -EBUSY; - if (!au_ftest_pin(p->flags, DI_LOCKED)) - di_read_unlock(p->parent, AuLock_IR); - dput(p->parent); - p->parent = NULL; - goto out_err; - } - - if (au_ftest_pin(p->flags, MNT_WRITE)) { - p->h_mnt = au_br_mnt(p->br); - err = vfsub_mnt_want_write(p->h_mnt); - if (unlikely(err)) { - au_fclr_pin(p->flags, MNT_WRITE); - if (!au_ftest_pin(p->flags, DI_LOCKED)) - di_read_unlock(p->parent, AuLock_IR); - dput(p->parent); - p->parent = NULL; - goto out_err; - } - } - - au_igrab(h_dir); - err = au_pin_hdir_lock(p); - if (!err) - goto out; /* success */ - - au_unpin(p); - -out_err: - pr_err("err %d\n", err); - err = au_busy_or_stale(); -out: - return err; -} - -void au_pin_init(struct au_pin *p, struct dentry *dentry, - aufs_bindex_t bindex, int lsc_di, int lsc_hi, - unsigned int udba, unsigned char flags) -{ - p->dentry = dentry; - p->udba = udba; - p->lsc_di = lsc_di; - p->lsc_hi = lsc_hi; - p->flags = flags; - p->bindex = bindex; - - p->parent = NULL; - p->hdir = NULL; - p->h_mnt = NULL; - - p->h_dentry = NULL; - p->h_parent = NULL; - p->br = NULL; - p->task = current; -} - -int au_pin(struct au_pin *pin, struct dentry *dentry, aufs_bindex_t bindex, - unsigned int udba, unsigned char flags) -{ - au_pin_init(pin, dentry, bindex, AuLsc_DI_PARENT, AuLsc_I_PARENT2, - udba, flags); - return au_do_pin(pin); -} - -/* ---------------------------------------------------------------------- */ - -/* - * ->setattr() and ->getattr() are called in various cases. - * chmod, stat: dentry is revalidated. - * fchmod, fstat: file and dentry are not revalidated, additionally they may be - * unhashed. - * for ->setattr(), ia->ia_file is passed from ftruncate only. - */ -/* todo: consolidate with do_refresh() and simple_reval_dpath() */ -int au_reval_for_attr(struct dentry *dentry, unsigned int sigen) -{ - int err; - struct dentry *parent; - - err = 0; - if (au_digen_test(dentry, sigen)) { - parent = dget_parent(dentry); - di_read_lock_parent(parent, AuLock_IR); - err = au_refresh_dentry(dentry, parent); - di_read_unlock(parent, AuLock_IR); - dput(parent); - } - - AuTraceErr(err); - return err; -} - -int au_pin_and_icpup(struct dentry *dentry, struct iattr *ia, - struct au_icpup_args *a) -{ - int err; - loff_t sz; - aufs_bindex_t bstart, ibstart; - struct dentry *hi_wh, *parent; - struct inode *inode; - struct au_wr_dir_args wr_dir_args = { - .force_btgt = -1, - .flags = 0 - }; - - if (d_is_dir(dentry)) - au_fset_wrdir(wr_dir_args.flags, ISDIR); - /* plink or hi_wh() case */ - bstart = au_dbstart(dentry); - inode = d_inode(dentry); - ibstart = au_ibstart(inode); - if (bstart != ibstart && !au_test_ro(inode->i_sb, ibstart, inode)) - wr_dir_args.force_btgt = ibstart; - err = au_wr_dir(dentry, /*src_dentry*/NULL, &wr_dir_args); - if (unlikely(err < 0)) - goto out; - a->btgt = err; - if (err != bstart) - au_fset_icpup(a->flags, DID_CPUP); - - err = 0; - a->pin_flags = AuPin_MNT_WRITE; - parent = NULL; - if (!IS_ROOT(dentry)) { - au_fset_pin(a->pin_flags, DI_LOCKED); - parent = dget_parent(dentry); - di_write_lock_parent(parent); - } - - err = au_pin(&a->pin, dentry, a->btgt, a->udba, a->pin_flags); - if (unlikely(err)) - goto out_parent; - - a->h_path.dentry = au_h_dptr(dentry, bstart); - sz = -1; - a->h_inode = d_inode(a->h_path.dentry); - if (ia && (ia->ia_valid & ATTR_SIZE)) { - mutex_lock_nested(&a->h_inode->i_mutex, AuLsc_I_CHILD); - if (ia->ia_size < i_size_read(a->h_inode)) - sz = ia->ia_size; - mutex_unlock(&a->h_inode->i_mutex); - } - - hi_wh = NULL; - if (au_ftest_icpup(a->flags, DID_CPUP) && d_unlinked(dentry)) { - hi_wh = au_hi_wh(inode, a->btgt); - if (!hi_wh) { - struct au_cp_generic cpg = { - .dentry = dentry, - .bdst = a->btgt, - .bsrc = -1, - .len = sz, - .pin = &a->pin - }; - err = au_sio_cpup_wh(&cpg, /*file*/NULL); - if (unlikely(err)) - goto out_unlock; - hi_wh = au_hi_wh(inode, a->btgt); - /* todo: revalidate hi_wh? */ - } - } - - if (parent) { - au_pin_set_parent_lflag(&a->pin, /*lflag*/0); - di_downgrade_lock(parent, AuLock_IR); - dput(parent); - parent = NULL; - } - if (!au_ftest_icpup(a->flags, DID_CPUP)) - goto out; /* success */ - - if (!d_unhashed(dentry)) { - struct au_cp_generic cpg = { - .dentry = dentry, - .bdst = a->btgt, - .bsrc = bstart, - .len = sz, - .pin = &a->pin, - .flags = AuCpup_DTIME | AuCpup_HOPEN - }; - err = au_sio_cpup_simple(&cpg); - if (!err) - a->h_path.dentry = au_h_dptr(dentry, a->btgt); - } else if (!hi_wh) - a->h_path.dentry = au_h_dptr(dentry, a->btgt); - else - a->h_path.dentry = hi_wh; /* do not dget here */ - -out_unlock: - a->h_inode = d_inode(a->h_path.dentry); - if (!err) - goto out; /* success */ - au_unpin(&a->pin); -out_parent: - if (parent) { - di_write_unlock(parent); - dput(parent); - } -out: - if (!err) - mutex_lock_nested(&a->h_inode->i_mutex, AuLsc_I_CHILD); - return err; -} - -static int aufs_setattr(struct dentry *dentry, struct iattr *ia) -{ - int err; - struct inode *inode, *delegated; - struct super_block *sb; - struct file *file; - struct au_icpup_args *a; - - inode = d_inode(dentry); - IMustLock(inode); - - err = -ENOMEM; - a = kzalloc(sizeof(*a), GFP_NOFS); - if (unlikely(!a)) - goto out; - - if (ia->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) - ia->ia_valid &= ~ATTR_MODE; - - file = NULL; - sb = dentry->d_sb; - err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM); - if (unlikely(err)) - goto out_kfree; - - if (ia->ia_valid & ATTR_FILE) { - /* currently ftruncate(2) only */ - AuDebugOn(!d_is_reg(dentry)); - file = ia->ia_file; - err = au_reval_and_lock_fdi(file, au_reopen_nondir, /*wlock*/1); - if (unlikely(err)) - goto out_si; - ia->ia_file = au_hf_top(file); - a->udba = AuOpt_UDBA_NONE; - } else { - /* fchmod() doesn't pass ia_file */ - a->udba = au_opt_udba(sb); - di_write_lock_child(dentry); - /* no d_unlinked(), to set UDBA_NONE for root */ - if (d_unhashed(dentry)) - a->udba = AuOpt_UDBA_NONE; - if (a->udba != AuOpt_UDBA_NONE) { - AuDebugOn(IS_ROOT(dentry)); - err = au_reval_for_attr(dentry, au_sigen(sb)); - if (unlikely(err)) - goto out_dentry; - } - } - - err = au_pin_and_icpup(dentry, ia, a); - if (unlikely(err < 0)) - goto out_dentry; - if (au_ftest_icpup(a->flags, DID_CPUP)) { - ia->ia_file = NULL; - ia->ia_valid &= ~ATTR_FILE; - } - - a->h_path.mnt = au_sbr_mnt(sb, a->btgt); - if ((ia->ia_valid & (ATTR_MODE | ATTR_CTIME)) - == (ATTR_MODE | ATTR_CTIME)) { - err = security_path_chmod(&a->h_path, ia->ia_mode); - if (unlikely(err)) - goto out_unlock; - } else if ((ia->ia_valid & (ATTR_UID | ATTR_GID)) - && (ia->ia_valid & ATTR_CTIME)) { - err = security_path_chown(&a->h_path, ia->ia_uid, ia->ia_gid); - if (unlikely(err)) - goto out_unlock; - } - - if (ia->ia_valid & ATTR_SIZE) { - struct file *f; - - if (ia->ia_size < i_size_read(inode)) - /* unmap only */ - truncate_setsize(inode, ia->ia_size); - - f = NULL; - if (ia->ia_valid & ATTR_FILE) - f = ia->ia_file; - mutex_unlock(&a->h_inode->i_mutex); - err = vfsub_trunc(&a->h_path, ia->ia_size, ia->ia_valid, f); - mutex_lock_nested(&a->h_inode->i_mutex, AuLsc_I_CHILD); - } else { - delegated = NULL; - while (1) { - err = vfsub_notify_change(&a->h_path, ia, &delegated); - if (delegated) { - err = break_deleg_wait(&delegated); - if (!err) - continue; - } - break; - } - } - /* - * regardless aufs 'acl' option setting. - * why don't all acl-aware fs call this func from their ->setattr()? - */ - if (!err && (ia->ia_valid & ATTR_MODE)) - err = vfsub_acl_chmod(a->h_inode, ia->ia_mode); - if (!err) - au_cpup_attr_changeable(inode); - -out_unlock: - mutex_unlock(&a->h_inode->i_mutex); - au_unpin(&a->pin); - if (unlikely(err)) - au_update_dbstart(dentry); -out_dentry: - di_write_unlock(dentry); - if (file) { - fi_write_unlock(file); - ia->ia_file = file; - ia->ia_valid |= ATTR_FILE; - } -out_si: - si_read_unlock(sb); -out_kfree: - kfree(a); -out: - AuTraceErr(err); - return err; -} - -#if IS_ENABLED(CONFIG_AUFS_XATTR) || IS_ENABLED(CONFIG_FS_POSIX_ACL) -static int au_h_path_to_set_attr(struct dentry *dentry, - struct au_icpup_args *a, struct path *h_path) -{ - int err; - struct super_block *sb; - - sb = dentry->d_sb; - a->udba = au_opt_udba(sb); - /* no d_unlinked(), to set UDBA_NONE for root */ - if (d_unhashed(dentry)) - a->udba = AuOpt_UDBA_NONE; - if (a->udba != AuOpt_UDBA_NONE) { - AuDebugOn(IS_ROOT(dentry)); - err = au_reval_for_attr(dentry, au_sigen(sb)); - if (unlikely(err)) - goto out; - } - err = au_pin_and_icpup(dentry, /*ia*/NULL, a); - if (unlikely(err < 0)) - goto out; - - h_path->dentry = a->h_path.dentry; - h_path->mnt = au_sbr_mnt(sb, a->btgt); - -out: - return err; -} - -ssize_t au_srxattr(struct dentry *dentry, struct au_srxattr *arg) -{ - int err; - struct path h_path; - struct super_block *sb; - struct au_icpup_args *a; - struct inode *inode, *h_inode; - - inode = d_inode(dentry); - IMustLock(inode); - - err = -ENOMEM; - a = kzalloc(sizeof(*a), GFP_NOFS); - if (unlikely(!a)) - goto out; - - sb = dentry->d_sb; - err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM); - if (unlikely(err)) - goto out_kfree; - - h_path.dentry = NULL; /* silence gcc */ - di_write_lock_child(dentry); - err = au_h_path_to_set_attr(dentry, a, &h_path); - if (unlikely(err)) - goto out_di; - - mutex_unlock(&a->h_inode->i_mutex); - switch (arg->type) { - case AU_XATTR_SET: - err = vfsub_setxattr(h_path.dentry, - arg->u.set.name, arg->u.set.value, - arg->u.set.size, arg->u.set.flags); - break; - case AU_XATTR_REMOVE: - err = vfsub_removexattr(h_path.dentry, arg->u.remove.name); - break; - case AU_ACL_SET: - err = -EOPNOTSUPP; - h_inode = d_inode(h_path.dentry); - if (h_inode->i_op->set_acl) - err = h_inode->i_op->set_acl(h_inode, - arg->u.acl_set.acl, - arg->u.acl_set.type); - break; - } - if (!err) - au_cpup_attr_timesizes(inode); - - au_unpin(&a->pin); - if (unlikely(err)) - au_update_dbstart(dentry); - -out_di: - di_write_unlock(dentry); - si_read_unlock(sb); -out_kfree: - kfree(a); -out: - AuTraceErr(err); - return err; -} -#endif - -static void au_refresh_iattr(struct inode *inode, struct kstat *st, - unsigned int nlink) -{ - unsigned int n; - - inode->i_mode = st->mode; - /* don't i_[ug]id_write() here */ - inode->i_uid = st->uid; - inode->i_gid = st->gid; - inode->i_atime = st->atime; - inode->i_mtime = st->mtime; - inode->i_ctime = st->ctime; - - au_cpup_attr_nlink(inode, /*force*/0); - if (S_ISDIR(inode->i_mode)) { - n = inode->i_nlink; - n -= nlink; - n += st->nlink; - smp_mb(); /* for i_nlink */ - /* 0 can happen */ - set_nlink(inode, n); - } - - spin_lock(&inode->i_lock); - inode->i_blocks = st->blocks; - i_size_write(inode, st->size); - spin_unlock(&inode->i_lock); -} - -/* - * common routine for aufs_getattr() and aufs_getxattr(). - * returns zero or negative (an error). - * @dentry will be read-locked in success. - */ -int au_h_path_getattr(struct dentry *dentry, int force, struct path *h_path) -{ - int err; - unsigned int mnt_flags, sigen; - unsigned char udba_none; - aufs_bindex_t bindex; - struct super_block *sb, *h_sb; - struct inode *inode; - - h_path->mnt = NULL; - h_path->dentry = NULL; - - err = 0; - sb = dentry->d_sb; - mnt_flags = au_mntflags(sb); - udba_none = !!au_opt_test(mnt_flags, UDBA_NONE); - - /* support fstat(2) */ - if (!d_unlinked(dentry) && !udba_none) { - sigen = au_sigen(sb); - err = au_digen_test(dentry, sigen); - if (!err) { - di_read_lock_child(dentry, AuLock_IR); - err = au_dbrange_test(dentry); - if (unlikely(err)) { - di_read_unlock(dentry, AuLock_IR); - goto out; - } - } else { - AuDebugOn(IS_ROOT(dentry)); - di_write_lock_child(dentry); - err = au_dbrange_test(dentry); - if (!err) - err = au_reval_for_attr(dentry, sigen); - if (!err) - di_downgrade_lock(dentry, AuLock_IR); - else { - di_write_unlock(dentry); - goto out; - } - } - } else - di_read_lock_child(dentry, AuLock_IR); - - inode = d_inode(dentry); - bindex = au_ibstart(inode); - h_path->mnt = au_sbr_mnt(sb, bindex); - h_sb = h_path->mnt->mnt_sb; - if (!force - && !au_test_fs_bad_iattr(h_sb) - && udba_none) - goto out; /* success */ - - if (au_dbstart(dentry) == bindex) - h_path->dentry = au_h_dptr(dentry, bindex); - else if (au_opt_test(mnt_flags, PLINK) && au_plink_test(inode)) { - h_path->dentry = au_plink_lkup(inode, bindex); - if (IS_ERR(h_path->dentry)) - /* pretending success */ - h_path->dentry = NULL; - else - dput(h_path->dentry); - } - -out: - return err; -} - -static int aufs_getattr(struct vfsmount *mnt __maybe_unused, - struct dentry *dentry, struct kstat *st) -{ - int err; - unsigned char positive; - struct path h_path; - struct inode *inode; - struct super_block *sb; - - inode = d_inode(dentry); - sb = dentry->d_sb; - err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM); - if (unlikely(err)) - goto out; - err = au_h_path_getattr(dentry, /*force*/0, &h_path); - if (unlikely(err)) - goto out_si; - if (unlikely(!h_path.dentry)) - /* illegally overlapped or something */ - goto out_fill; /* pretending success */ - - positive = d_is_positive(h_path.dentry); - if (positive) - err = vfs_getattr(&h_path, st); - if (!err) { - if (positive) - au_refresh_iattr(inode, st, - d_inode(h_path.dentry)->i_nlink); - goto out_fill; /* success */ - } - AuTraceErr(err); - goto out_di; - -out_fill: - generic_fillattr(inode, st); -out_di: - di_read_unlock(dentry, AuLock_IR); -out_si: - si_read_unlock(sb); -out: - AuTraceErr(err); - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* - * Assumption: - * - the number of symlinks is not so many. - * - * Structure: - * - sbinfo (instead of iinfo) contains an hlist of struct au_symlink. - * If iinfo contained the hlist, then it would be rather large waste of memory - * I am afraid. - * - struct au_symlink contains the necessary info for h_inode follow_link() and - * put_link(). - */ - -struct au_symlink { - union { - struct hlist_node hlist; - struct rcu_head rcu; - }; - - struct inode *h_inode; - void *h_cookie; -}; - -static void au_symlink_add(struct super_block *sb, struct au_symlink *slink, - struct inode *h_inode, void *cookie) -{ - struct au_sbinfo *sbinfo; - - ihold(h_inode); - slink->h_inode = h_inode; - slink->h_cookie = cookie; - sbinfo = au_sbi(sb); - au_sphl_add(&slink->hlist, &sbinfo->si_symlink); -} - -static void au_symlink_del(struct super_block *sb, struct au_symlink *slink) -{ - struct au_sbinfo *sbinfo; - - /* do not iput() within rcu */ - iput(slink->h_inode); - slink->h_inode = NULL; - sbinfo = au_sbi(sb); - au_sphl_del_rcu(&slink->hlist, &sbinfo->si_symlink); - kfree_rcu(slink, rcu); -} - -static const char *aufs_follow_link(struct dentry *dentry, void **cookie) -{ - const char *ret; - struct inode *inode, *h_inode; - struct dentry *h_dentry; - struct au_symlink *slink; - int err; - aufs_bindex_t bindex; - - ret = NULL; /* suppress a warning */ - err = aufs_read_lock(dentry, AuLock_IR | AuLock_GEN); - if (unlikely(err)) - goto out; - - err = au_d_hashed_positive(dentry); - if (unlikely(err)) - goto out_unlock; - - err = -EINVAL; - inode = d_inode(dentry); - bindex = au_ibstart(inode); - h_inode = au_h_iptr(inode, bindex); - if (unlikely(!h_inode->i_op->follow_link)) - goto out_unlock; - - err = -ENOMEM; - slink = kmalloc(sizeof(*slink), GFP_NOFS); - if (unlikely(!slink)) - goto out_unlock; - - err = -EBUSY; - h_dentry = NULL; - if (au_dbstart(dentry) <= bindex) { - h_dentry = au_h_dptr(dentry, bindex); - if (h_dentry) - dget(h_dentry); - } - if (!h_dentry) { - h_dentry = d_find_any_alias(h_inode); - if (IS_ERR(h_dentry)) { - err = PTR_ERR(h_dentry); - goto out_free; - } - } - if (unlikely(!h_dentry)) - goto out_free; - - err = 0; - AuDbg("%pf\n", h_inode->i_op->follow_link); - AuDbgDentry(h_dentry); - ret = h_inode->i_op->follow_link(h_dentry, cookie); - dput(h_dentry); - - if (!IS_ERR_OR_NULL(ret)) { - au_symlink_add(inode->i_sb, slink, h_inode, *cookie); - *cookie = slink; - AuDbg("slink %p\n", slink); - goto out_unlock; /* success */ - } - -out_free: - slink->h_inode = NULL; - kfree_rcu(slink, rcu); -out_unlock: - aufs_read_unlock(dentry, AuLock_IR); -out: - if (unlikely(err)) - ret = ERR_PTR(err); - AuTraceErrPtr(ret); - return ret; -} - -static void aufs_put_link(struct inode *inode, void *cookie) -{ - struct au_symlink *slink; - struct inode *h_inode; - - slink = cookie; - AuDbg("slink %p\n", slink); - h_inode = slink->h_inode; - AuDbg("%pf\n", h_inode->i_op->put_link); - AuDbgInode(h_inode); - if (h_inode->i_op->put_link) - h_inode->i_op->put_link(h_inode, slink->h_cookie); - au_symlink_del(inode->i_sb, slink); -} - -/* ---------------------------------------------------------------------- */ - -static int aufs_update_time(struct inode *inode, struct timespec *ts, int flags) -{ - int err; - struct super_block *sb; - struct inode *h_inode; - - sb = inode->i_sb; - /* mmap_sem might be acquired already, cf. aufs_mmap() */ - lockdep_off(); - si_read_lock(sb, AuLock_FLUSH); - ii_write_lock_child(inode); - lockdep_on(); - h_inode = au_h_iptr(inode, au_ibstart(inode)); - err = vfsub_update_time(h_inode, ts, flags); - lockdep_off(); - if (!err) - au_cpup_attr_timesizes(inode); - ii_write_unlock(inode); - si_read_unlock(sb); - lockdep_on(); - - if (!err && (flags & S_VERSION)) - inode_inc_iversion(inode); - - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* no getattr version will be set by module.c:aufs_init() */ -struct inode_operations aufs_iop_nogetattr[AuIop_Last], - aufs_iop[] = { - [AuIop_SYMLINK] = { - .permission = aufs_permission, -#ifdef CONFIG_FS_POSIX_ACL - .get_acl = aufs_get_acl, - .set_acl = aufs_set_acl, /* unsupport for symlink? */ -#endif - - .setattr = aufs_setattr, - .getattr = aufs_getattr, - -#ifdef CONFIG_AUFS_XATTR - .setxattr = aufs_setxattr, - .getxattr = aufs_getxattr, - .listxattr = aufs_listxattr, - .removexattr = aufs_removexattr, -#endif - - .readlink = generic_readlink, - .follow_link = aufs_follow_link, - .put_link = aufs_put_link, - - /* .update_time = aufs_update_time */ - }, - [AuIop_DIR] = { - .create = aufs_create, - .lookup = aufs_lookup, - .link = aufs_link, - .unlink = aufs_unlink, - .symlink = aufs_symlink, - .mkdir = aufs_mkdir, - .rmdir = aufs_rmdir, - .mknod = aufs_mknod, - .rename = aufs_rename, - - .permission = aufs_permission, -#ifdef CONFIG_FS_POSIX_ACL - .get_acl = aufs_get_acl, - .set_acl = aufs_set_acl, -#endif - - .setattr = aufs_setattr, - .getattr = aufs_getattr, - -#ifdef CONFIG_AUFS_XATTR - .setxattr = aufs_setxattr, - .getxattr = aufs_getxattr, - .listxattr = aufs_listxattr, - .removexattr = aufs_removexattr, -#endif - - .update_time = aufs_update_time, - .atomic_open = aufs_atomic_open, - .tmpfile = aufs_tmpfile - }, - [AuIop_OTHER] = { - .permission = aufs_permission, -#ifdef CONFIG_FS_POSIX_ACL - .get_acl = aufs_get_acl, - .set_acl = aufs_set_acl, -#endif - - .setattr = aufs_setattr, - .getattr = aufs_getattr, - -#ifdef CONFIG_AUFS_XATTR - .setxattr = aufs_setxattr, - .getxattr = aufs_getxattr, - .listxattr = aufs_listxattr, - .removexattr = aufs_removexattr, -#endif - - .update_time = aufs_update_time - } -}; diff --git a/fs/aufs/i_op_add.c b/fs/aufs/i_op_add.c deleted file mode 100644 index 3fc355859..000000000 --- a/fs/aufs/i_op_add.c +++ /dev/null @@ -1,919 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * inode operations (add entry) - */ - -#include "aufs.h" - -/* - * final procedure of adding a new entry, except link(2). - * remove whiteout, instantiate, copyup the parent dir's times and size - * and update version. - * if it failed, re-create the removed whiteout. - */ -static int epilog(struct inode *dir, aufs_bindex_t bindex, - struct dentry *wh_dentry, struct dentry *dentry) -{ - int err, rerr; - aufs_bindex_t bwh; - struct path h_path; - struct super_block *sb; - struct inode *inode, *h_dir; - struct dentry *wh; - - bwh = -1; - sb = dir->i_sb; - if (wh_dentry) { - h_dir = d_inode(wh_dentry->d_parent); /* dir inode is locked */ - IMustLock(h_dir); - AuDebugOn(au_h_iptr(dir, bindex) != h_dir); - bwh = au_dbwh(dentry); - h_path.dentry = wh_dentry; - h_path.mnt = au_sbr_mnt(sb, bindex); - err = au_wh_unlink_dentry(au_h_iptr(dir, bindex), &h_path, - dentry); - if (unlikely(err)) - goto out; - } - - inode = au_new_inode(dentry, /*must_new*/1); - if (!IS_ERR(inode)) { - d_instantiate(dentry, inode); - dir = d_inode(dentry->d_parent); /* dir inode is locked */ - IMustLock(dir); - au_dir_ts(dir, bindex); - dir->i_version++; - au_fhsm_wrote(sb, bindex, /*force*/0); - return 0; /* success */ - } - - err = PTR_ERR(inode); - if (!wh_dentry) - goto out; - - /* revert */ - /* dir inode is locked */ - wh = au_wh_create(dentry, bwh, wh_dentry->d_parent); - rerr = PTR_ERR(wh); - if (IS_ERR(wh)) { - AuIOErr("%pd reverting whiteout failed(%d, %d)\n", - dentry, err, rerr); - err = -EIO; - } else - dput(wh); - -out: - return err; -} - -static int au_d_may_add(struct dentry *dentry) -{ - int err; - - err = 0; - if (unlikely(d_unhashed(dentry))) - err = -ENOENT; - if (unlikely(d_really_is_positive(dentry))) - err = -EEXIST; - return err; -} - -/* - * simple tests for the adding inode operations. - * following the checks in vfs, plus the parent-child relationship. - */ -int au_may_add(struct dentry *dentry, aufs_bindex_t bindex, - struct dentry *h_parent, int isdir) -{ - int err; - umode_t h_mode; - struct dentry *h_dentry; - struct inode *h_inode; - - err = -ENAMETOOLONG; - if (unlikely(dentry->d_name.len > AUFS_MAX_NAMELEN)) - goto out; - - h_dentry = au_h_dptr(dentry, bindex); - if (d_really_is_negative(dentry)) { - err = -EEXIST; - if (unlikely(d_is_positive(h_dentry))) - goto out; - } else { - /* rename(2) case */ - err = -EIO; - if (unlikely(d_is_negative(h_dentry))) - goto out; - h_inode = d_inode(h_dentry); - if (unlikely(!h_inode->i_nlink)) - goto out; - - h_mode = h_inode->i_mode; - if (!isdir) { - err = -EISDIR; - if (unlikely(S_ISDIR(h_mode))) - goto out; - } else if (unlikely(!S_ISDIR(h_mode))) { - err = -ENOTDIR; - goto out; - } - } - - err = 0; - /* expected parent dir is locked */ - if (unlikely(h_parent != h_dentry->d_parent)) - err = -EIO; - -out: - AuTraceErr(err); - return err; -} - -/* - * initial procedure of adding a new entry. - * prepare writable branch and the parent dir, lock it, - * and lookup whiteout for the new entry. - */ -static struct dentry* -lock_hdir_lkup_wh(struct dentry *dentry, struct au_dtime *dt, - struct dentry *src_dentry, struct au_pin *pin, - struct au_wr_dir_args *wr_dir_args) -{ - struct dentry *wh_dentry, *h_parent; - struct super_block *sb; - struct au_branch *br; - int err; - unsigned int udba; - aufs_bindex_t bcpup; - - AuDbg("%pd\n", dentry); - - err = au_wr_dir(dentry, src_dentry, wr_dir_args); - bcpup = err; - wh_dentry = ERR_PTR(err); - if (unlikely(err < 0)) - goto out; - - sb = dentry->d_sb; - udba = au_opt_udba(sb); - err = au_pin(pin, dentry, bcpup, udba, - AuPin_DI_LOCKED | AuPin_MNT_WRITE); - wh_dentry = ERR_PTR(err); - if (unlikely(err)) - goto out; - - h_parent = au_pinned_h_parent(pin); - if (udba != AuOpt_UDBA_NONE - && au_dbstart(dentry) == bcpup) - err = au_may_add(dentry, bcpup, h_parent, - au_ftest_wrdir(wr_dir_args->flags, ISDIR)); - else if (unlikely(dentry->d_name.len > AUFS_MAX_NAMELEN)) - err = -ENAMETOOLONG; - wh_dentry = ERR_PTR(err); - if (unlikely(err)) - goto out_unpin; - - br = au_sbr(sb, bcpup); - if (dt) { - struct path tmp = { - .dentry = h_parent, - .mnt = au_br_mnt(br) - }; - au_dtime_store(dt, au_pinned_parent(pin), &tmp); - } - - wh_dentry = NULL; - if (bcpup != au_dbwh(dentry)) - goto out; /* success */ - - /* - * ENAMETOOLONG here means that if we allowed create such name, then it - * would not be able to removed in the future. So we don't allow such - * name here and we don't handle ENAMETOOLONG differently here. - */ - wh_dentry = au_wh_lkup(h_parent, &dentry->d_name, br); - -out_unpin: - if (IS_ERR(wh_dentry)) - au_unpin(pin); -out: - return wh_dentry; -} - -/* ---------------------------------------------------------------------- */ - -enum { Mknod, Symlink, Creat }; -struct simple_arg { - int type; - union { - struct { - umode_t mode; - bool want_excl; - bool try_aopen; - struct vfsub_aopen_args *aopen; - } c; - struct { - const char *symname; - } s; - struct { - umode_t mode; - dev_t dev; - } m; - } u; -}; - -static int add_simple(struct inode *dir, struct dentry *dentry, - struct simple_arg *arg) -{ - int err, rerr; - aufs_bindex_t bstart; - unsigned char created; - const unsigned char try_aopen - = (arg->type == Creat && arg->u.c.try_aopen); - struct dentry *wh_dentry, *parent; - struct inode *h_dir; - struct super_block *sb; - struct au_branch *br; - /* to reuduce stack size */ - struct { - struct au_dtime dt; - struct au_pin pin; - struct path h_path; - struct au_wr_dir_args wr_dir_args; - } *a; - - AuDbg("%pd\n", dentry); - IMustLock(dir); - - err = -ENOMEM; - a = kmalloc(sizeof(*a), GFP_NOFS); - if (unlikely(!a)) - goto out; - a->wr_dir_args.force_btgt = -1; - a->wr_dir_args.flags = AuWrDir_ADD_ENTRY; - - parent = dentry->d_parent; /* dir inode is locked */ - if (!try_aopen) { - err = aufs_read_lock(dentry, AuLock_DW | AuLock_GEN); - if (unlikely(err)) - goto out_free; - } - err = au_d_may_add(dentry); - if (unlikely(err)) - goto out_unlock; - if (!try_aopen) - di_write_lock_parent(parent); - wh_dentry = lock_hdir_lkup_wh(dentry, &a->dt, /*src_dentry*/NULL, - &a->pin, &a->wr_dir_args); - err = PTR_ERR(wh_dentry); - if (IS_ERR(wh_dentry)) - goto out_parent; - - bstart = au_dbstart(dentry); - sb = dentry->d_sb; - br = au_sbr(sb, bstart); - a->h_path.dentry = au_h_dptr(dentry, bstart); - a->h_path.mnt = au_br_mnt(br); - h_dir = au_pinned_h_dir(&a->pin); - switch (arg->type) { - case Creat: - err = 0; - if (!try_aopen || !h_dir->i_op->atomic_open) - err = vfsub_create(h_dir, &a->h_path, arg->u.c.mode, - arg->u.c.want_excl); - else - err = vfsub_atomic_open(h_dir, a->h_path.dentry, - arg->u.c.aopen, br); - break; - case Symlink: - err = vfsub_symlink(h_dir, &a->h_path, arg->u.s.symname); - break; - case Mknod: - err = vfsub_mknod(h_dir, &a->h_path, arg->u.m.mode, - arg->u.m.dev); - break; - default: - BUG(); - } - created = !err; - if (!err) - err = epilog(dir, bstart, wh_dentry, dentry); - - /* revert */ - if (unlikely(created && err && d_is_positive(a->h_path.dentry))) { - /* no delegation since it is just created */ - rerr = vfsub_unlink(h_dir, &a->h_path, /*delegated*/NULL, - /*force*/0); - if (rerr) { - AuIOErr("%pd revert failure(%d, %d)\n", - dentry, err, rerr); - err = -EIO; - } - au_dtime_revert(&a->dt); - } - - if (!err && try_aopen && !h_dir->i_op->atomic_open) - *arg->u.c.aopen->opened |= FILE_CREATED; - - au_unpin(&a->pin); - dput(wh_dentry); - -out_parent: - if (!try_aopen) - di_write_unlock(parent); -out_unlock: - if (unlikely(err)) { - au_update_dbstart(dentry); - d_drop(dentry); - } - if (!try_aopen) - aufs_read_unlock(dentry, AuLock_DW); -out_free: - kfree(a); -out: - return err; -} - -int aufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, - dev_t dev) -{ - struct simple_arg arg = { - .type = Mknod, - .u.m = { - .mode = mode, - .dev = dev - } - }; - return add_simple(dir, dentry, &arg); -} - -int aufs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) -{ - struct simple_arg arg = { - .type = Symlink, - .u.s.symname = symname - }; - return add_simple(dir, dentry, &arg); -} - -int aufs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool want_excl) -{ - struct simple_arg arg = { - .type = Creat, - .u.c = { - .mode = mode, - .want_excl = want_excl - } - }; - return add_simple(dir, dentry, &arg); -} - -int au_aopen_or_create(struct inode *dir, struct dentry *dentry, - struct vfsub_aopen_args *aopen_args) -{ - struct simple_arg arg = { - .type = Creat, - .u.c = { - .mode = aopen_args->create_mode, - .want_excl = aopen_args->open_flag & O_EXCL, - .try_aopen = true, - .aopen = aopen_args - } - }; - return add_simple(dir, dentry, &arg); -} - -int aufs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) -{ - int err; - aufs_bindex_t bindex; - struct super_block *sb; - struct dentry *parent, *h_parent, *h_dentry; - struct inode *h_dir, *inode; - struct vfsmount *h_mnt; - struct au_wr_dir_args wr_dir_args = { - .force_btgt = -1, - .flags = AuWrDir_TMPFILE - }; - - /* copy-up may happen */ - mutex_lock(&dir->i_mutex); - - sb = dir->i_sb; - err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM); - if (unlikely(err)) - goto out; - - err = au_di_init(dentry); - if (unlikely(err)) - goto out_si; - - err = -EBUSY; - parent = d_find_any_alias(dir); - AuDebugOn(!parent); - di_write_lock_parent(parent); - if (unlikely(d_inode(parent) != dir)) - goto out_parent; - - err = au_digen_test(parent, au_sigen(sb)); - if (unlikely(err)) - goto out_parent; - - bindex = au_dbstart(parent); - au_set_dbstart(dentry, bindex); - au_set_dbend(dentry, bindex); - err = au_wr_dir(dentry, /*src_dentry*/NULL, &wr_dir_args); - bindex = err; - if (unlikely(err < 0)) - goto out_parent; - - err = -EOPNOTSUPP; - h_dir = au_h_iptr(dir, bindex); - if (unlikely(!h_dir->i_op->tmpfile)) - goto out_parent; - - h_mnt = au_sbr_mnt(sb, bindex); - err = vfsub_mnt_want_write(h_mnt); - if (unlikely(err)) - goto out_parent; - - h_parent = au_h_dptr(parent, bindex); - err = inode_permission(d_inode(h_parent), MAY_WRITE | MAY_EXEC); - if (unlikely(err)) - goto out_mnt; - - err = -ENOMEM; - h_dentry = d_alloc(h_parent, &dentry->d_name); - if (unlikely(!h_dentry)) - goto out_mnt; - - err = h_dir->i_op->tmpfile(h_dir, h_dentry, mode); - if (unlikely(err)) - goto out_dentry; - - au_set_dbstart(dentry, bindex); - au_set_dbend(dentry, bindex); - au_set_h_dptr(dentry, bindex, dget(h_dentry)); - inode = au_new_inode(dentry, /*must_new*/1); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - au_set_h_dptr(dentry, bindex, NULL); - au_set_dbstart(dentry, -1); - au_set_dbend(dentry, -1); - } else { - if (!inode->i_nlink) - set_nlink(inode, 1); - d_tmpfile(dentry, inode); - au_di(dentry)->di_tmpfile = 1; - - /* update without i_mutex */ - if (au_ibstart(dir) == au_dbstart(dentry)) - au_cpup_attr_timesizes(dir); - } - -out_dentry: - dput(h_dentry); -out_mnt: - vfsub_mnt_drop_write(h_mnt); -out_parent: - di_write_unlock(parent); - dput(parent); - di_write_unlock(dentry); - if (!err) -#if 0 - /* verbose coding for lock class name */ - au_rw_class(&au_di(dentry)->di_rwsem, - au_lc_key + AuLcNonDir_DIINFO); -#else - ; -#endif - else { - au_di_fin(dentry); - dentry->d_fsdata = NULL; - } -out_si: - si_read_unlock(sb); -out: - mutex_unlock(&dir->i_mutex); - return err; -} - -/* ---------------------------------------------------------------------- */ - -struct au_link_args { - aufs_bindex_t bdst, bsrc; - struct au_pin pin; - struct path h_path; - struct dentry *src_parent, *parent; -}; - -static int au_cpup_before_link(struct dentry *src_dentry, - struct au_link_args *a) -{ - int err; - struct dentry *h_src_dentry; - struct au_cp_generic cpg = { - .dentry = src_dentry, - .bdst = a->bdst, - .bsrc = a->bsrc, - .len = -1, - .pin = &a->pin, - .flags = AuCpup_DTIME | AuCpup_HOPEN /* | AuCpup_KEEPLINO */ - }; - - di_read_lock_parent(a->src_parent, AuLock_IR); - err = au_test_and_cpup_dirs(src_dentry, a->bdst); - if (unlikely(err)) - goto out; - - h_src_dentry = au_h_dptr(src_dentry, a->bsrc); - err = au_pin(&a->pin, src_dentry, a->bdst, - au_opt_udba(src_dentry->d_sb), - AuPin_DI_LOCKED | AuPin_MNT_WRITE); - if (unlikely(err)) - goto out; - - err = au_sio_cpup_simple(&cpg); - au_unpin(&a->pin); - -out: - di_read_unlock(a->src_parent, AuLock_IR); - return err; -} - -static int au_cpup_or_link(struct dentry *src_dentry, struct dentry *dentry, - struct au_link_args *a) -{ - int err; - unsigned char plink; - aufs_bindex_t bend; - struct dentry *h_src_dentry; - struct inode *h_inode, *inode, *delegated; - struct super_block *sb; - struct file *h_file; - - plink = 0; - h_inode = NULL; - sb = src_dentry->d_sb; - inode = d_inode(src_dentry); - if (au_ibstart(inode) <= a->bdst) - h_inode = au_h_iptr(inode, a->bdst); - if (!h_inode || !h_inode->i_nlink) { - /* copyup src_dentry as the name of dentry. */ - bend = au_dbend(dentry); - if (bend < a->bsrc) - au_set_dbend(dentry, a->bsrc); - au_set_h_dptr(dentry, a->bsrc, - dget(au_h_dptr(src_dentry, a->bsrc))); - dget(a->h_path.dentry); - au_set_h_dptr(dentry, a->bdst, NULL); - AuDbg("temporary d_inode...\n"); - spin_lock(&dentry->d_lock); - dentry->d_inode = d_inode(src_dentry); /* tmp */ - spin_unlock(&dentry->d_lock); - h_file = au_h_open_pre(dentry, a->bsrc, /*force_wr*/0); - if (IS_ERR(h_file)) - err = PTR_ERR(h_file); - else { - struct au_cp_generic cpg = { - .dentry = dentry, - .bdst = a->bdst, - .bsrc = -1, - .len = -1, - .pin = &a->pin, - .flags = AuCpup_KEEPLINO - }; - err = au_sio_cpup_simple(&cpg); - au_h_open_post(dentry, a->bsrc, h_file); - if (!err) { - dput(a->h_path.dentry); - a->h_path.dentry = au_h_dptr(dentry, a->bdst); - } else - au_set_h_dptr(dentry, a->bdst, - a->h_path.dentry); - } - spin_lock(&dentry->d_lock); - dentry->d_inode = NULL; /* restore */ - spin_unlock(&dentry->d_lock); - AuDbg("temporary d_inode...done\n"); - au_set_h_dptr(dentry, a->bsrc, NULL); - au_set_dbend(dentry, bend); - } else { - /* the inode of src_dentry already exists on a.bdst branch */ - h_src_dentry = d_find_alias(h_inode); - if (!h_src_dentry && au_plink_test(inode)) { - plink = 1; - h_src_dentry = au_plink_lkup(inode, a->bdst); - err = PTR_ERR(h_src_dentry); - if (IS_ERR(h_src_dentry)) - goto out; - - if (unlikely(d_is_negative(h_src_dentry))) { - dput(h_src_dentry); - h_src_dentry = NULL; - } - - } - if (h_src_dentry) { - delegated = NULL; - err = vfsub_link(h_src_dentry, au_pinned_h_dir(&a->pin), - &a->h_path, &delegated); - if (unlikely(err == -EWOULDBLOCK)) { - pr_warn("cannot retry for NFSv4 delegation" - " for an internal link\n"); - iput(delegated); - } - dput(h_src_dentry); - } else { - AuIOErr("no dentry found for hi%lu on b%d\n", - h_inode->i_ino, a->bdst); - err = -EIO; - } - } - - if (!err && !plink) - au_plink_append(inode, a->bdst, a->h_path.dentry); - -out: - AuTraceErr(err); - return err; -} - -int aufs_link(struct dentry *src_dentry, struct inode *dir, - struct dentry *dentry) -{ - int err, rerr; - struct au_dtime dt; - struct au_link_args *a; - struct dentry *wh_dentry, *h_src_dentry; - struct inode *inode, *delegated; - struct super_block *sb; - struct au_wr_dir_args wr_dir_args = { - /* .force_btgt = -1, */ - .flags = AuWrDir_ADD_ENTRY - }; - - IMustLock(dir); - inode = d_inode(src_dentry); - IMustLock(inode); - - err = -ENOMEM; - a = kzalloc(sizeof(*a), GFP_NOFS); - if (unlikely(!a)) - goto out; - - a->parent = dentry->d_parent; /* dir inode is locked */ - err = aufs_read_and_write_lock2(dentry, src_dentry, - AuLock_NOPLM | AuLock_GEN); - if (unlikely(err)) - goto out_kfree; - err = au_d_linkable(src_dentry); - if (unlikely(err)) - goto out_unlock; - err = au_d_may_add(dentry); - if (unlikely(err)) - goto out_unlock; - - a->src_parent = dget_parent(src_dentry); - wr_dir_args.force_btgt = au_ibstart(inode); - - di_write_lock_parent(a->parent); - wr_dir_args.force_btgt = au_wbr(dentry, wr_dir_args.force_btgt); - wh_dentry = lock_hdir_lkup_wh(dentry, &dt, src_dentry, &a->pin, - &wr_dir_args); - err = PTR_ERR(wh_dentry); - if (IS_ERR(wh_dentry)) - goto out_parent; - - err = 0; - sb = dentry->d_sb; - a->bdst = au_dbstart(dentry); - a->h_path.dentry = au_h_dptr(dentry, a->bdst); - a->h_path.mnt = au_sbr_mnt(sb, a->bdst); - a->bsrc = au_ibstart(inode); - h_src_dentry = au_h_d_alias(src_dentry, a->bsrc); - if (!h_src_dentry && au_di(src_dentry)->di_tmpfile) - h_src_dentry = dget(au_hi_wh(inode, a->bsrc)); - if (!h_src_dentry) { - a->bsrc = au_dbstart(src_dentry); - h_src_dentry = au_h_d_alias(src_dentry, a->bsrc); - AuDebugOn(!h_src_dentry); - } else if (IS_ERR(h_src_dentry)) { - err = PTR_ERR(h_src_dentry); - goto out_parent; - } - - if (au_opt_test(au_mntflags(sb), PLINK)) { - if (a->bdst < a->bsrc - /* && h_src_dentry->d_sb != a->h_path.dentry->d_sb */) - err = au_cpup_or_link(src_dentry, dentry, a); - else { - delegated = NULL; - err = vfsub_link(h_src_dentry, au_pinned_h_dir(&a->pin), - &a->h_path, &delegated); - if (unlikely(err == -EWOULDBLOCK)) { - pr_warn("cannot retry for NFSv4 delegation" - " for an internal link\n"); - iput(delegated); - } - } - dput(h_src_dentry); - } else { - /* - * copyup src_dentry to the branch we process, - * and then link(2) to it. - */ - dput(h_src_dentry); - if (a->bdst < a->bsrc - /* && h_src_dentry->d_sb != a->h_path.dentry->d_sb */) { - au_unpin(&a->pin); - di_write_unlock(a->parent); - err = au_cpup_before_link(src_dentry, a); - di_write_lock_parent(a->parent); - if (!err) - err = au_pin(&a->pin, dentry, a->bdst, - au_opt_udba(sb), - AuPin_DI_LOCKED | AuPin_MNT_WRITE); - if (unlikely(err)) - goto out_wh; - } - if (!err) { - h_src_dentry = au_h_dptr(src_dentry, a->bdst); - err = -ENOENT; - if (h_src_dentry && d_is_positive(h_src_dentry)) { - delegated = NULL; - err = vfsub_link(h_src_dentry, - au_pinned_h_dir(&a->pin), - &a->h_path, &delegated); - if (unlikely(err == -EWOULDBLOCK)) { - pr_warn("cannot retry" - " for NFSv4 delegation" - " for an internal link\n"); - iput(delegated); - } - } - } - } - if (unlikely(err)) - goto out_unpin; - - if (wh_dentry) { - a->h_path.dentry = wh_dentry; - err = au_wh_unlink_dentry(au_pinned_h_dir(&a->pin), &a->h_path, - dentry); - if (unlikely(err)) - goto out_revert; - } - - au_dir_ts(dir, a->bdst); - dir->i_version++; - inc_nlink(inode); - inode->i_ctime = dir->i_ctime; - d_instantiate(dentry, au_igrab(inode)); - if (d_unhashed(a->h_path.dentry)) - /* some filesystem calls d_drop() */ - d_drop(dentry); - /* some filesystems consume an inode even hardlink */ - au_fhsm_wrote(sb, a->bdst, /*force*/0); - goto out_unpin; /* success */ - -out_revert: - /* no delegation since it is just created */ - rerr = vfsub_unlink(au_pinned_h_dir(&a->pin), &a->h_path, - /*delegated*/NULL, /*force*/0); - if (unlikely(rerr)) { - AuIOErr("%pd reverting failed(%d, %d)\n", dentry, err, rerr); - err = -EIO; - } - au_dtime_revert(&dt); -out_unpin: - au_unpin(&a->pin); -out_wh: - dput(wh_dentry); -out_parent: - di_write_unlock(a->parent); - dput(a->src_parent); -out_unlock: - if (unlikely(err)) { - au_update_dbstart(dentry); - d_drop(dentry); - } - aufs_read_and_write_unlock2(dentry, src_dentry); -out_kfree: - kfree(a); -out: - AuTraceErr(err); - return err; -} - -int aufs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) -{ - int err, rerr; - aufs_bindex_t bindex; - unsigned char diropq; - struct path h_path; - struct dentry *wh_dentry, *parent, *opq_dentry; - struct mutex *h_mtx; - struct super_block *sb; - struct { - struct au_pin pin; - struct au_dtime dt; - } *a; /* reduce the stack usage */ - struct au_wr_dir_args wr_dir_args = { - .force_btgt = -1, - .flags = AuWrDir_ADD_ENTRY | AuWrDir_ISDIR - }; - - IMustLock(dir); - - err = -ENOMEM; - a = kmalloc(sizeof(*a), GFP_NOFS); - if (unlikely(!a)) - goto out; - - err = aufs_read_lock(dentry, AuLock_DW | AuLock_GEN); - if (unlikely(err)) - goto out_free; - err = au_d_may_add(dentry); - if (unlikely(err)) - goto out_unlock; - - parent = dentry->d_parent; /* dir inode is locked */ - di_write_lock_parent(parent); - wh_dentry = lock_hdir_lkup_wh(dentry, &a->dt, /*src_dentry*/NULL, - &a->pin, &wr_dir_args); - err = PTR_ERR(wh_dentry); - if (IS_ERR(wh_dentry)) - goto out_parent; - - sb = dentry->d_sb; - bindex = au_dbstart(dentry); - h_path.dentry = au_h_dptr(dentry, bindex); - h_path.mnt = au_sbr_mnt(sb, bindex); - err = vfsub_mkdir(au_pinned_h_dir(&a->pin), &h_path, mode); - if (unlikely(err)) - goto out_unpin; - - /* make the dir opaque */ - diropq = 0; - h_mtx = &d_inode(h_path.dentry)->i_mutex; - if (wh_dentry - || au_opt_test(au_mntflags(sb), ALWAYS_DIROPQ)) { - mutex_lock_nested(h_mtx, AuLsc_I_CHILD); - opq_dentry = au_diropq_create(dentry, bindex); - mutex_unlock(h_mtx); - err = PTR_ERR(opq_dentry); - if (IS_ERR(opq_dentry)) - goto out_dir; - dput(opq_dentry); - diropq = 1; - } - - err = epilog(dir, bindex, wh_dentry, dentry); - if (!err) { - inc_nlink(dir); - goto out_unpin; /* success */ - } - - /* revert */ - if (diropq) { - AuLabel(revert opq); - mutex_lock_nested(h_mtx, AuLsc_I_CHILD); - rerr = au_diropq_remove(dentry, bindex); - mutex_unlock(h_mtx); - if (rerr) { - AuIOErr("%pd reverting diropq failed(%d, %d)\n", - dentry, err, rerr); - err = -EIO; - } - } - -out_dir: - AuLabel(revert dir); - rerr = vfsub_rmdir(au_pinned_h_dir(&a->pin), &h_path); - if (rerr) { - AuIOErr("%pd reverting dir failed(%d, %d)\n", - dentry, err, rerr); - err = -EIO; - } - au_dtime_revert(&a->dt); -out_unpin: - au_unpin(&a->pin); - dput(wh_dentry); -out_parent: - di_write_unlock(parent); -out_unlock: - if (unlikely(err)) { - au_update_dbstart(dentry); - d_drop(dentry); - } - aufs_read_unlock(dentry, AuLock_DW); -out_free: - kfree(a); -out: - return err; -} diff --git a/fs/aufs/i_op_del.c b/fs/aufs/i_op_del.c deleted file mode 100644 index 68741aadb..000000000 --- a/fs/aufs/i_op_del.c +++ /dev/null @@ -1,497 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * inode operations (del entry) - */ - -#include "aufs.h" - -/* - * decide if a new whiteout for @dentry is necessary or not. - * when it is necessary, prepare the parent dir for the upper branch whose - * branch index is @bcpup for creation. the actual creation of the whiteout will - * be done by caller. - * return value: - * 0: wh is unnecessary - * plus: wh is necessary - * minus: error - */ -int au_wr_dir_need_wh(struct dentry *dentry, int isdir, aufs_bindex_t *bcpup) -{ - int need_wh, err; - aufs_bindex_t bstart; - struct super_block *sb; - - sb = dentry->d_sb; - bstart = au_dbstart(dentry); - if (*bcpup < 0) { - *bcpup = bstart; - if (au_test_ro(sb, bstart, d_inode(dentry))) { - err = AuWbrCopyup(au_sbi(sb), dentry); - *bcpup = err; - if (unlikely(err < 0)) - goto out; - } - } else - AuDebugOn(bstart < *bcpup - || au_test_ro(sb, *bcpup, d_inode(dentry))); - AuDbg("bcpup %d, bstart %d\n", *bcpup, bstart); - - if (*bcpup != bstart) { - err = au_cpup_dirs(dentry, *bcpup); - if (unlikely(err)) - goto out; - need_wh = 1; - } else { - struct au_dinfo *dinfo, *tmp; - - need_wh = -ENOMEM; - dinfo = au_di(dentry); - tmp = au_di_alloc(sb, AuLsc_DI_TMP); - if (tmp) { - au_di_cp(tmp, dinfo); - au_di_swap(tmp, dinfo); - /* returns the number of positive dentries */ - need_wh = au_lkup_dentry(dentry, bstart + 1, /*type*/0); - au_di_swap(tmp, dinfo); - au_rw_write_unlock(&tmp->di_rwsem); - au_di_free(tmp); - } - } - AuDbg("need_wh %d\n", need_wh); - err = need_wh; - -out: - return err; -} - -/* - * simple tests for the del-entry operations. - * following the checks in vfs, plus the parent-child relationship. - */ -int au_may_del(struct dentry *dentry, aufs_bindex_t bindex, - struct dentry *h_parent, int isdir) -{ - int err; - umode_t h_mode; - struct dentry *h_dentry, *h_latest; - struct inode *h_inode; - - h_dentry = au_h_dptr(dentry, bindex); - if (d_really_is_positive(dentry)) { - err = -ENOENT; - if (unlikely(d_is_negative(h_dentry))) - goto out; - h_inode = d_inode(h_dentry); - if (unlikely(!h_inode->i_nlink)) - goto out; - - h_mode = h_inode->i_mode; - if (!isdir) { - err = -EISDIR; - if (unlikely(S_ISDIR(h_mode))) - goto out; - } else if (unlikely(!S_ISDIR(h_mode))) { - err = -ENOTDIR; - goto out; - } - } else { - /* rename(2) case */ - err = -EIO; - if (unlikely(d_is_positive(h_dentry))) - goto out; - } - - err = -ENOENT; - /* expected parent dir is locked */ - if (unlikely(h_parent != h_dentry->d_parent)) - goto out; - err = 0; - - /* - * rmdir a dir may break the consistency on some filesystem. - * let's try heavy test. - */ - err = -EACCES; - if (unlikely(!au_opt_test(au_mntflags(dentry->d_sb), DIRPERM1) - && au_test_h_perm(d_inode(h_parent), - MAY_EXEC | MAY_WRITE))) - goto out; - - h_latest = au_sio_lkup_one(&dentry->d_name, h_parent); - err = -EIO; - if (IS_ERR(h_latest)) - goto out; - if (h_latest == h_dentry) - err = 0; - dput(h_latest); - -out: - return err; -} - -/* - * decide the branch where we operate for @dentry. the branch index will be set - * @rbcpup. after diciding it, 'pin' it and store the timestamps of the parent - * dir for reverting. - * when a new whiteout is necessary, create it. - */ -static struct dentry* -lock_hdir_create_wh(struct dentry *dentry, int isdir, aufs_bindex_t *rbcpup, - struct au_dtime *dt, struct au_pin *pin) -{ - struct dentry *wh_dentry; - struct super_block *sb; - struct path h_path; - int err, need_wh; - unsigned int udba; - aufs_bindex_t bcpup; - - need_wh = au_wr_dir_need_wh(dentry, isdir, rbcpup); - wh_dentry = ERR_PTR(need_wh); - if (unlikely(need_wh < 0)) - goto out; - - sb = dentry->d_sb; - udba = au_opt_udba(sb); - bcpup = *rbcpup; - err = au_pin(pin, dentry, bcpup, udba, - AuPin_DI_LOCKED | AuPin_MNT_WRITE); - wh_dentry = ERR_PTR(err); - if (unlikely(err)) - goto out; - - h_path.dentry = au_pinned_h_parent(pin); - if (udba != AuOpt_UDBA_NONE - && au_dbstart(dentry) == bcpup) { - err = au_may_del(dentry, bcpup, h_path.dentry, isdir); - wh_dentry = ERR_PTR(err); - if (unlikely(err)) - goto out_unpin; - } - - h_path.mnt = au_sbr_mnt(sb, bcpup); - au_dtime_store(dt, au_pinned_parent(pin), &h_path); - wh_dentry = NULL; - if (!need_wh) - goto out; /* success, no need to create whiteout */ - - wh_dentry = au_wh_create(dentry, bcpup, h_path.dentry); - if (IS_ERR(wh_dentry)) - goto out_unpin; - - /* returns with the parent is locked and wh_dentry is dget-ed */ - goto out; /* success */ - -out_unpin: - au_unpin(pin); -out: - return wh_dentry; -} - -/* - * when removing a dir, rename it to a unique temporary whiteout-ed name first - * in order to be revertible and save time for removing many child whiteouts - * under the dir. - * returns 1 when there are too many child whiteout and caller should remove - * them asynchronously. returns 0 when the number of children is enough small to - * remove now or the branch fs is a remote fs. - * otherwise return an error. - */ -static int renwh_and_rmdir(struct dentry *dentry, aufs_bindex_t bindex, - struct au_nhash *whlist, struct inode *dir) -{ - int rmdir_later, err, dirwh; - struct dentry *h_dentry; - struct super_block *sb; - struct inode *inode; - - sb = dentry->d_sb; - SiMustAnyLock(sb); - h_dentry = au_h_dptr(dentry, bindex); - err = au_whtmp_ren(h_dentry, au_sbr(sb, bindex)); - if (unlikely(err)) - goto out; - - /* stop monitoring */ - inode = d_inode(dentry); - au_hn_free(au_hi(inode, bindex)); - - if (!au_test_fs_remote(h_dentry->d_sb)) { - dirwh = au_sbi(sb)->si_dirwh; - rmdir_later = (dirwh <= 1); - if (!rmdir_later) - rmdir_later = au_nhash_test_longer_wh(whlist, bindex, - dirwh); - if (rmdir_later) - return rmdir_later; - } - - err = au_whtmp_rmdir(dir, bindex, h_dentry, whlist); - if (unlikely(err)) { - AuIOErr("rmdir %pd, b%d failed, %d. ignored\n", - h_dentry, bindex, err); - err = 0; - } - -out: - AuTraceErr(err); - return err; -} - -/* - * final procedure for deleting a entry. - * maintain dentry and iattr. - */ -static void epilog(struct inode *dir, struct dentry *dentry, - aufs_bindex_t bindex) -{ - struct inode *inode; - - inode = d_inode(dentry); - d_drop(dentry); - inode->i_ctime = dir->i_ctime; - - au_dir_ts(dir, bindex); - dir->i_version++; -} - -/* - * when an error happened, remove the created whiteout and revert everything. - */ -static int do_revert(int err, struct inode *dir, aufs_bindex_t bindex, - aufs_bindex_t bwh, struct dentry *wh_dentry, - struct dentry *dentry, struct au_dtime *dt) -{ - int rerr; - struct path h_path = { - .dentry = wh_dentry, - .mnt = au_sbr_mnt(dir->i_sb, bindex) - }; - - rerr = au_wh_unlink_dentry(au_h_iptr(dir, bindex), &h_path, dentry); - if (!rerr) { - au_set_dbwh(dentry, bwh); - au_dtime_revert(dt); - return 0; - } - - AuIOErr("%pd reverting whiteout failed(%d, %d)\n", dentry, err, rerr); - return -EIO; -} - -/* ---------------------------------------------------------------------- */ - -int aufs_unlink(struct inode *dir, struct dentry *dentry) -{ - int err; - aufs_bindex_t bwh, bindex, bstart; - struct inode *inode, *h_dir, *delegated; - struct dentry *parent, *wh_dentry; - /* to reuduce stack size */ - struct { - struct au_dtime dt; - struct au_pin pin; - struct path h_path; - } *a; - - IMustLock(dir); - - err = -ENOMEM; - a = kmalloc(sizeof(*a), GFP_NOFS); - if (unlikely(!a)) - goto out; - - err = aufs_read_lock(dentry, AuLock_DW | AuLock_GEN); - if (unlikely(err)) - goto out_free; - err = au_d_hashed_positive(dentry); - if (unlikely(err)) - goto out_unlock; - inode = d_inode(dentry); - IMustLock(inode); - err = -EISDIR; - if (unlikely(d_is_dir(dentry))) - goto out_unlock; /* possible? */ - - bstart = au_dbstart(dentry); - bwh = au_dbwh(dentry); - bindex = -1; - parent = dentry->d_parent; /* dir inode is locked */ - di_write_lock_parent(parent); - wh_dentry = lock_hdir_create_wh(dentry, /*isdir*/0, &bindex, &a->dt, - &a->pin); - err = PTR_ERR(wh_dentry); - if (IS_ERR(wh_dentry)) - goto out_parent; - - a->h_path.mnt = au_sbr_mnt(dentry->d_sb, bstart); - a->h_path.dentry = au_h_dptr(dentry, bstart); - dget(a->h_path.dentry); - if (bindex == bstart) { - h_dir = au_pinned_h_dir(&a->pin); - delegated = NULL; - err = vfsub_unlink(h_dir, &a->h_path, &delegated, /*force*/0); - if (unlikely(err == -EWOULDBLOCK)) { - pr_warn("cannot retry for NFSv4 delegation" - " for an internal unlink\n"); - iput(delegated); - } - } else { - /* dir inode is locked */ - h_dir = d_inode(wh_dentry->d_parent); - IMustLock(h_dir); - err = 0; - } - - if (!err) { - vfsub_drop_nlink(inode); - epilog(dir, dentry, bindex); - - /* update target timestamps */ - if (bindex == bstart) { - vfsub_update_h_iattr(&a->h_path, /*did*/NULL); - /*ignore*/ - inode->i_ctime = d_inode(a->h_path.dentry)->i_ctime; - } else - /* todo: this timestamp may be reverted later */ - inode->i_ctime = h_dir->i_ctime; - goto out_unpin; /* success */ - } - - /* revert */ - if (wh_dentry) { - int rerr; - - rerr = do_revert(err, dir, bindex, bwh, wh_dentry, dentry, - &a->dt); - if (rerr) - err = rerr; - } - -out_unpin: - au_unpin(&a->pin); - dput(wh_dentry); - dput(a->h_path.dentry); -out_parent: - di_write_unlock(parent); -out_unlock: - aufs_read_unlock(dentry, AuLock_DW); -out_free: - kfree(a); -out: - return err; -} - -int aufs_rmdir(struct inode *dir, struct dentry *dentry) -{ - int err, rmdir_later; - aufs_bindex_t bwh, bindex, bstart; - struct inode *inode; - struct dentry *parent, *wh_dentry, *h_dentry; - struct au_whtmp_rmdir *args; - /* to reuduce stack size */ - struct { - struct au_dtime dt; - struct au_pin pin; - } *a; - - IMustLock(dir); - - err = -ENOMEM; - a = kmalloc(sizeof(*a), GFP_NOFS); - if (unlikely(!a)) - goto out; - - err = aufs_read_lock(dentry, AuLock_DW | AuLock_FLUSH | AuLock_GEN); - if (unlikely(err)) - goto out_free; - err = au_alive_dir(dentry); - if (unlikely(err)) - goto out_unlock; - inode = d_inode(dentry); - IMustLock(inode); - err = -ENOTDIR; - if (unlikely(!d_is_dir(dentry))) - goto out_unlock; /* possible? */ - - err = -ENOMEM; - args = au_whtmp_rmdir_alloc(dir->i_sb, GFP_NOFS); - if (unlikely(!args)) - goto out_unlock; - - parent = dentry->d_parent; /* dir inode is locked */ - di_write_lock_parent(parent); - err = au_test_empty(dentry, &args->whlist); - if (unlikely(err)) - goto out_parent; - - bstart = au_dbstart(dentry); - bwh = au_dbwh(dentry); - bindex = -1; - wh_dentry = lock_hdir_create_wh(dentry, /*isdir*/1, &bindex, &a->dt, - &a->pin); - err = PTR_ERR(wh_dentry); - if (IS_ERR(wh_dentry)) - goto out_parent; - - h_dentry = au_h_dptr(dentry, bstart); - dget(h_dentry); - rmdir_later = 0; - if (bindex == bstart) { - err = renwh_and_rmdir(dentry, bstart, &args->whlist, dir); - if (err > 0) { - rmdir_later = err; - err = 0; - } - } else { - /* stop monitoring */ - au_hn_free(au_hi(inode, bstart)); - - /* dir inode is locked */ - IMustLock(d_inode(wh_dentry->d_parent)); - err = 0; - } - - if (!err) { - vfsub_dead_dir(inode); - au_set_dbdiropq(dentry, -1); - epilog(dir, dentry, bindex); - - if (rmdir_later) { - au_whtmp_kick_rmdir(dir, bstart, h_dentry, args); - args = NULL; - } - - goto out_unpin; /* success */ - } - - /* revert */ - AuLabel(revert); - if (wh_dentry) { - int rerr; - - rerr = do_revert(err, dir, bindex, bwh, wh_dentry, dentry, - &a->dt); - if (rerr) - err = rerr; - } - -out_unpin: - au_unpin(&a->pin); - dput(wh_dentry); - dput(h_dentry); -out_parent: - di_write_unlock(parent); - if (args) - au_whtmp_rmdir_free(args); -out_unlock: - aufs_read_unlock(dentry, AuLock_DW); -out_free: - kfree(a); -out: - AuTraceErr(err); - return err; -} diff --git a/fs/aufs/i_op_ren.c b/fs/aufs/i_op_ren.c deleted file mode 100644 index c880144b5..000000000 --- a/fs/aufs/i_op_ren.c +++ /dev/null @@ -1,1002 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * inode operation (rename entry) - * todo: this is crazy monster - */ - -#include "aufs.h" - -enum { AuSRC, AuDST, AuSrcDst }; -enum { AuPARENT, AuCHILD, AuParentChild }; - -#define AuRen_ISDIR 1 -#define AuRen_ISSAMEDIR (1 << 1) -#define AuRen_WHSRC (1 << 2) -#define AuRen_WHDST (1 << 3) -#define AuRen_MNT_WRITE (1 << 4) -#define AuRen_DT_DSTDIR (1 << 5) -#define AuRen_DIROPQ (1 << 6) -#define au_ftest_ren(flags, name) ((flags) & AuRen_##name) -#define au_fset_ren(flags, name) \ - do { (flags) |= AuRen_##name; } while (0) -#define au_fclr_ren(flags, name) \ - do { (flags) &= ~AuRen_##name; } while (0) - -struct au_ren_args { - struct { - struct dentry *dentry, *h_dentry, *parent, *h_parent, - *wh_dentry; - struct inode *dir, *inode; - struct au_hinode *hdir; - struct au_dtime dt[AuParentChild]; - aufs_bindex_t bstart; - } sd[AuSrcDst]; - -#define src_dentry sd[AuSRC].dentry -#define src_dir sd[AuSRC].dir -#define src_inode sd[AuSRC].inode -#define src_h_dentry sd[AuSRC].h_dentry -#define src_parent sd[AuSRC].parent -#define src_h_parent sd[AuSRC].h_parent -#define src_wh_dentry sd[AuSRC].wh_dentry -#define src_hdir sd[AuSRC].hdir -#define src_h_dir sd[AuSRC].hdir->hi_inode -#define src_dt sd[AuSRC].dt -#define src_bstart sd[AuSRC].bstart - -#define dst_dentry sd[AuDST].dentry -#define dst_dir sd[AuDST].dir -#define dst_inode sd[AuDST].inode -#define dst_h_dentry sd[AuDST].h_dentry -#define dst_parent sd[AuDST].parent -#define dst_h_parent sd[AuDST].h_parent -#define dst_wh_dentry sd[AuDST].wh_dentry -#define dst_hdir sd[AuDST].hdir -#define dst_h_dir sd[AuDST].hdir->hi_inode -#define dst_dt sd[AuDST].dt -#define dst_bstart sd[AuDST].bstart - - struct dentry *h_trap; - struct au_branch *br; - struct au_hinode *src_hinode; - struct path h_path; - struct au_nhash whlist; - aufs_bindex_t btgt, src_bwh, src_bdiropq; - - unsigned int flags; - - struct au_whtmp_rmdir *thargs; - struct dentry *h_dst; -}; - -/* ---------------------------------------------------------------------- */ - -/* - * functions for reverting. - * when an error happened in a single rename systemcall, we should revert - * everything as if nothing happened. - * we don't need to revert the copied-up/down the parent dir since they are - * harmless. - */ - -#define RevertFailure(fmt, ...) do { \ - AuIOErr("revert failure: " fmt " (%d, %d)\n", \ - ##__VA_ARGS__, err, rerr); \ - err = -EIO; \ -} while (0) - -static void au_ren_rev_diropq(int err, struct au_ren_args *a) -{ - int rerr; - - au_hn_imtx_lock_nested(a->src_hinode, AuLsc_I_CHILD); - rerr = au_diropq_remove(a->src_dentry, a->btgt); - au_hn_imtx_unlock(a->src_hinode); - au_set_dbdiropq(a->src_dentry, a->src_bdiropq); - if (rerr) - RevertFailure("remove diropq %pd", a->src_dentry); -} - -static void au_ren_rev_rename(int err, struct au_ren_args *a) -{ - int rerr; - struct inode *delegated; - - a->h_path.dentry = vfsub_lkup_one(&a->src_dentry->d_name, - a->src_h_parent); - rerr = PTR_ERR(a->h_path.dentry); - if (IS_ERR(a->h_path.dentry)) { - RevertFailure("lkup one %pd", a->src_dentry); - return; - } - - delegated = NULL; - rerr = vfsub_rename(a->dst_h_dir, - au_h_dptr(a->src_dentry, a->btgt), - a->src_h_dir, &a->h_path, &delegated); - if (unlikely(rerr == -EWOULDBLOCK)) { - pr_warn("cannot retry for NFSv4 delegation" - " for an internal rename\n"); - iput(delegated); - } - d_drop(a->h_path.dentry); - dput(a->h_path.dentry); - /* au_set_h_dptr(a->src_dentry, a->btgt, NULL); */ - if (rerr) - RevertFailure("rename %pd", a->src_dentry); -} - -static void au_ren_rev_whtmp(int err, struct au_ren_args *a) -{ - int rerr; - struct inode *delegated; - - a->h_path.dentry = vfsub_lkup_one(&a->dst_dentry->d_name, - a->dst_h_parent); - rerr = PTR_ERR(a->h_path.dentry); - if (IS_ERR(a->h_path.dentry)) { - RevertFailure("lkup one %pd", a->dst_dentry); - return; - } - if (d_is_positive(a->h_path.dentry)) { - d_drop(a->h_path.dentry); - dput(a->h_path.dentry); - return; - } - - delegated = NULL; - rerr = vfsub_rename(a->dst_h_dir, a->h_dst, a->dst_h_dir, &a->h_path, - &delegated); - if (unlikely(rerr == -EWOULDBLOCK)) { - pr_warn("cannot retry for NFSv4 delegation" - " for an internal rename\n"); - iput(delegated); - } - d_drop(a->h_path.dentry); - dput(a->h_path.dentry); - if (!rerr) - au_set_h_dptr(a->dst_dentry, a->btgt, dget(a->h_dst)); - else - RevertFailure("rename %pd", a->h_dst); -} - -static void au_ren_rev_whsrc(int err, struct au_ren_args *a) -{ - int rerr; - - a->h_path.dentry = a->src_wh_dentry; - rerr = au_wh_unlink_dentry(a->src_h_dir, &a->h_path, a->src_dentry); - au_set_dbwh(a->src_dentry, a->src_bwh); - if (rerr) - RevertFailure("unlink %pd", a->src_wh_dentry); -} -#undef RevertFailure - -/* ---------------------------------------------------------------------- */ - -/* - * when we have to copyup the renaming entry, do it with the rename-target name - * in order to minimize the cost (the later actual rename is unnecessary). - * otherwise rename it on the target branch. - */ -static int au_ren_or_cpup(struct au_ren_args *a) -{ - int err; - struct dentry *d; - struct inode *delegated; - - d = a->src_dentry; - if (au_dbstart(d) == a->btgt) { - a->h_path.dentry = a->dst_h_dentry; - if (au_ftest_ren(a->flags, DIROPQ) - && au_dbdiropq(d) == a->btgt) - au_fclr_ren(a->flags, DIROPQ); - AuDebugOn(au_dbstart(d) != a->btgt); - delegated = NULL; - err = vfsub_rename(a->src_h_dir, au_h_dptr(d, a->btgt), - a->dst_h_dir, &a->h_path, &delegated); - if (unlikely(err == -EWOULDBLOCK)) { - pr_warn("cannot retry for NFSv4 delegation" - " for an internal rename\n"); - iput(delegated); - } - } else - BUG(); - - if (!err && a->h_dst) - /* it will be set to dinfo later */ - dget(a->h_dst); - - return err; -} - -/* cf. aufs_rmdir() */ -static int au_ren_del_whtmp(struct au_ren_args *a) -{ - int err; - struct inode *dir; - - dir = a->dst_dir; - SiMustAnyLock(dir->i_sb); - if (!au_nhash_test_longer_wh(&a->whlist, a->btgt, - au_sbi(dir->i_sb)->si_dirwh) - || au_test_fs_remote(a->h_dst->d_sb)) { - err = au_whtmp_rmdir(dir, a->btgt, a->h_dst, &a->whlist); - if (unlikely(err)) - pr_warn("failed removing whtmp dir %pd (%d), " - "ignored.\n", a->h_dst, err); - } else { - au_nhash_wh_free(&a->thargs->whlist); - a->thargs->whlist = a->whlist; - a->whlist.nh_num = 0; - au_whtmp_kick_rmdir(dir, a->btgt, a->h_dst, a->thargs); - dput(a->h_dst); - a->thargs = NULL; - } - - return 0; -} - -/* make it 'opaque' dir. */ -static int au_ren_diropq(struct au_ren_args *a) -{ - int err; - struct dentry *diropq; - - err = 0; - a->src_bdiropq = au_dbdiropq(a->src_dentry); - a->src_hinode = au_hi(a->src_inode, a->btgt); - au_hn_imtx_lock_nested(a->src_hinode, AuLsc_I_CHILD); - diropq = au_diropq_create(a->src_dentry, a->btgt); - au_hn_imtx_unlock(a->src_hinode); - if (IS_ERR(diropq)) - err = PTR_ERR(diropq); - else - dput(diropq); - - return err; -} - -static int do_rename(struct au_ren_args *a) -{ - int err; - struct dentry *d, *h_d; - - /* prepare workqueue args for asynchronous rmdir */ - h_d = a->dst_h_dentry; - if (au_ftest_ren(a->flags, ISDIR) && d_is_positive(h_d)) { - err = -ENOMEM; - a->thargs = au_whtmp_rmdir_alloc(a->src_dentry->d_sb, GFP_NOFS); - if (unlikely(!a->thargs)) - goto out; - a->h_dst = dget(h_d); - } - - /* create whiteout for src_dentry */ - if (au_ftest_ren(a->flags, WHSRC)) { - a->src_bwh = au_dbwh(a->src_dentry); - AuDebugOn(a->src_bwh >= 0); - a->src_wh_dentry - = au_wh_create(a->src_dentry, a->btgt, a->src_h_parent); - err = PTR_ERR(a->src_wh_dentry); - if (IS_ERR(a->src_wh_dentry)) - goto out_thargs; - } - - /* lookup whiteout for dentry */ - if (au_ftest_ren(a->flags, WHDST)) { - h_d = au_wh_lkup(a->dst_h_parent, &a->dst_dentry->d_name, - a->br); - err = PTR_ERR(h_d); - if (IS_ERR(h_d)) - goto out_whsrc; - if (d_is_negative(h_d)) - dput(h_d); - else - a->dst_wh_dentry = h_d; - } - - /* rename dentry to tmpwh */ - if (a->thargs) { - err = au_whtmp_ren(a->dst_h_dentry, a->br); - if (unlikely(err)) - goto out_whdst; - - d = a->dst_dentry; - au_set_h_dptr(d, a->btgt, NULL); - err = au_lkup_neg(d, a->btgt, /*wh*/0); - if (unlikely(err)) - goto out_whtmp; - a->dst_h_dentry = au_h_dptr(d, a->btgt); - } - - BUG_ON(d_is_positive(a->dst_h_dentry) && a->src_bstart != a->btgt); - - /* rename by vfs_rename or cpup */ - d = a->dst_dentry; - if (au_ftest_ren(a->flags, ISDIR) - && (a->dst_wh_dentry - || au_dbdiropq(d) == a->btgt - /* hide the lower to keep xino */ - || a->btgt < au_dbend(d) - || au_opt_test(au_mntflags(d->d_sb), ALWAYS_DIROPQ))) - au_fset_ren(a->flags, DIROPQ); - err = au_ren_or_cpup(a); - if (unlikely(err)) - /* leave the copied-up one */ - goto out_whtmp; - - /* make dir opaque */ - if (au_ftest_ren(a->flags, DIROPQ)) { - err = au_ren_diropq(a); - if (unlikely(err)) - goto out_rename; - } - - /* update target timestamps */ - AuDebugOn(au_dbstart(a->src_dentry) != a->btgt); - a->h_path.dentry = au_h_dptr(a->src_dentry, a->btgt); - vfsub_update_h_iattr(&a->h_path, /*did*/NULL); /*ignore*/ - a->src_inode->i_ctime = d_inode(a->h_path.dentry)->i_ctime; - - /* remove whiteout for dentry */ - if (a->dst_wh_dentry) { - a->h_path.dentry = a->dst_wh_dentry; - err = au_wh_unlink_dentry(a->dst_h_dir, &a->h_path, - a->dst_dentry); - if (unlikely(err)) - goto out_diropq; - } - - /* remove whtmp */ - if (a->thargs) - au_ren_del_whtmp(a); /* ignore this error */ - - au_fhsm_wrote(a->src_dentry->d_sb, a->btgt, /*force*/0); - err = 0; - goto out_success; - -out_diropq: - if (au_ftest_ren(a->flags, DIROPQ)) - au_ren_rev_diropq(err, a); -out_rename: - au_ren_rev_rename(err, a); - dput(a->h_dst); -out_whtmp: - if (a->thargs) - au_ren_rev_whtmp(err, a); -out_whdst: - dput(a->dst_wh_dentry); - a->dst_wh_dentry = NULL; -out_whsrc: - if (a->src_wh_dentry) - au_ren_rev_whsrc(err, a); -out_success: - dput(a->src_wh_dentry); - dput(a->dst_wh_dentry); -out_thargs: - if (a->thargs) { - dput(a->h_dst); - au_whtmp_rmdir_free(a->thargs); - a->thargs = NULL; - } -out: - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* - * test if @dentry dir can be rename destination or not. - * success means, it is a logically empty dir. - */ -static int may_rename_dstdir(struct dentry *dentry, struct au_nhash *whlist) -{ - return au_test_empty(dentry, whlist); -} - -/* - * test if @dentry dir can be rename source or not. - * if it can, return 0 and @children is filled. - * success means, - * - it is a logically empty dir. - * - or, it exists on writable branch and has no children including whiteouts - * on the lower branch. - */ -static int may_rename_srcdir(struct dentry *dentry, aufs_bindex_t btgt) -{ - int err; - unsigned int rdhash; - aufs_bindex_t bstart; - - bstart = au_dbstart(dentry); - if (bstart != btgt) { - struct au_nhash whlist; - - SiMustAnyLock(dentry->d_sb); - rdhash = au_sbi(dentry->d_sb)->si_rdhash; - if (!rdhash) - rdhash = au_rdhash_est(au_dir_size(/*file*/NULL, - dentry)); - err = au_nhash_alloc(&whlist, rdhash, GFP_NOFS); - if (unlikely(err)) - goto out; - err = au_test_empty(dentry, &whlist); - au_nhash_wh_free(&whlist); - goto out; - } - - if (bstart == au_dbtaildir(dentry)) - return 0; /* success */ - - err = au_test_empty_lower(dentry); - -out: - if (err == -ENOTEMPTY) { - AuWarn1("renaming dir who has child(ren) on multiple branches," - " is not supported\n"); - err = -EXDEV; - } - return err; -} - -/* side effect: sets whlist and h_dentry */ -static int au_ren_may_dir(struct au_ren_args *a) -{ - int err; - unsigned int rdhash; - struct dentry *d; - - d = a->dst_dentry; - SiMustAnyLock(d->d_sb); - - err = 0; - if (au_ftest_ren(a->flags, ISDIR) && a->dst_inode) { - rdhash = au_sbi(d->d_sb)->si_rdhash; - if (!rdhash) - rdhash = au_rdhash_est(au_dir_size(/*file*/NULL, d)); - err = au_nhash_alloc(&a->whlist, rdhash, GFP_NOFS); - if (unlikely(err)) - goto out; - - au_set_dbstart(d, a->dst_bstart); - err = may_rename_dstdir(d, &a->whlist); - au_set_dbstart(d, a->btgt); - } - a->dst_h_dentry = au_h_dptr(d, au_dbstart(d)); - if (unlikely(err)) - goto out; - - d = a->src_dentry; - a->src_h_dentry = au_h_dptr(d, au_dbstart(d)); - if (au_ftest_ren(a->flags, ISDIR)) { - err = may_rename_srcdir(d, a->btgt); - if (unlikely(err)) { - au_nhash_wh_free(&a->whlist); - a->whlist.nh_num = 0; - } - } -out: - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* - * simple tests for rename. - * following the checks in vfs, plus the parent-child relationship. - */ -static int au_may_ren(struct au_ren_args *a) -{ - int err, isdir; - struct inode *h_inode; - - if (a->src_bstart == a->btgt) { - err = au_may_del(a->src_dentry, a->btgt, a->src_h_parent, - au_ftest_ren(a->flags, ISDIR)); - if (unlikely(err)) - goto out; - err = -EINVAL; - if (unlikely(a->src_h_dentry == a->h_trap)) - goto out; - } - - err = 0; - if (a->dst_bstart != a->btgt) - goto out; - - err = -ENOTEMPTY; - if (unlikely(a->dst_h_dentry == a->h_trap)) - goto out; - - err = -EIO; - isdir = !!au_ftest_ren(a->flags, ISDIR); - if (d_really_is_negative(a->dst_dentry)) { - if (d_is_negative(a->dst_h_dentry)) - err = au_may_add(a->dst_dentry, a->btgt, - a->dst_h_parent, isdir); - } else { - if (unlikely(d_is_negative(a->dst_h_dentry))) - goto out; - h_inode = d_inode(a->dst_h_dentry); - if (h_inode->i_nlink) - err = au_may_del(a->dst_dentry, a->btgt, - a->dst_h_parent, isdir); - } - -out: - if (unlikely(err == -ENOENT || err == -EEXIST)) - err = -EIO; - AuTraceErr(err); - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* - * locking order - * (VFS) - * - src_dir and dir by lock_rename() - * - inode if exitsts - * (aufs) - * - lock all - * + src_dentry and dentry by aufs_read_and_write_lock2() which calls, - * + si_read_lock - * + di_write_lock2_child() - * + di_write_lock_child() - * + ii_write_lock_child() - * + di_write_lock_child2() - * + ii_write_lock_child2() - * + src_parent and parent - * + di_write_lock_parent() - * + ii_write_lock_parent() - * + di_write_lock_parent2() - * + ii_write_lock_parent2() - * + lower src_dir and dir by vfsub_lock_rename() - * + verify the every relationships between child and parent. if any - * of them failed, unlock all and return -EBUSY. - */ -static void au_ren_unlock(struct au_ren_args *a) -{ - vfsub_unlock_rename(a->src_h_parent, a->src_hdir, - a->dst_h_parent, a->dst_hdir); - if (au_ftest_ren(a->flags, MNT_WRITE)) - vfsub_mnt_drop_write(au_br_mnt(a->br)); -} - -static int au_ren_lock(struct au_ren_args *a) -{ - int err; - unsigned int udba; - - err = 0; - a->src_h_parent = au_h_dptr(a->src_parent, a->btgt); - a->src_hdir = au_hi(a->src_dir, a->btgt); - a->dst_h_parent = au_h_dptr(a->dst_parent, a->btgt); - a->dst_hdir = au_hi(a->dst_dir, a->btgt); - - err = vfsub_mnt_want_write(au_br_mnt(a->br)); - if (unlikely(err)) - goto out; - au_fset_ren(a->flags, MNT_WRITE); - a->h_trap = vfsub_lock_rename(a->src_h_parent, a->src_hdir, - a->dst_h_parent, a->dst_hdir); - udba = au_opt_udba(a->src_dentry->d_sb); - if (unlikely(a->src_hdir->hi_inode != d_inode(a->src_h_parent) - || a->dst_hdir->hi_inode != d_inode(a->dst_h_parent))) - err = au_busy_or_stale(); - if (!err && au_dbstart(a->src_dentry) == a->btgt) - err = au_h_verify(a->src_h_dentry, udba, - d_inode(a->src_h_parent), a->src_h_parent, - a->br); - if (!err && au_dbstart(a->dst_dentry) == a->btgt) - err = au_h_verify(a->dst_h_dentry, udba, - d_inode(a->dst_h_parent), a->dst_h_parent, - a->br); - if (!err) - goto out; /* success */ - - err = au_busy_or_stale(); - au_ren_unlock(a); - -out: - return err; -} - -/* ---------------------------------------------------------------------- */ - -static void au_ren_refresh_dir(struct au_ren_args *a) -{ - struct inode *dir; - - dir = a->dst_dir; - dir->i_version++; - if (au_ftest_ren(a->flags, ISDIR)) { - /* is this updating defined in POSIX? */ - au_cpup_attr_timesizes(a->src_inode); - au_cpup_attr_nlink(dir, /*force*/1); - } - - au_dir_ts(dir, a->btgt); - - if (au_ftest_ren(a->flags, ISSAMEDIR)) - return; - - dir = a->src_dir; - dir->i_version++; - if (au_ftest_ren(a->flags, ISDIR)) - au_cpup_attr_nlink(dir, /*force*/1); - au_dir_ts(dir, a->btgt); -} - -static void au_ren_refresh(struct au_ren_args *a) -{ - aufs_bindex_t bend, bindex; - struct dentry *d, *h_d; - struct inode *i, *h_i; - struct super_block *sb; - - d = a->dst_dentry; - d_drop(d); - if (a->h_dst) - /* already dget-ed by au_ren_or_cpup() */ - au_set_h_dptr(d, a->btgt, a->h_dst); - - i = a->dst_inode; - if (i) { - if (!au_ftest_ren(a->flags, ISDIR)) - vfsub_drop_nlink(i); - else { - vfsub_dead_dir(i); - au_cpup_attr_timesizes(i); - } - au_update_dbrange(d, /*do_put_zero*/1); - } else { - bend = a->btgt; - for (bindex = au_dbstart(d); bindex < bend; bindex++) - au_set_h_dptr(d, bindex, NULL); - bend = au_dbend(d); - for (bindex = a->btgt + 1; bindex <= bend; bindex++) - au_set_h_dptr(d, bindex, NULL); - au_update_dbrange(d, /*do_put_zero*/0); - } - - d = a->src_dentry; - au_set_dbwh(d, -1); - bend = au_dbend(d); - for (bindex = a->btgt + 1; bindex <= bend; bindex++) { - h_d = au_h_dptr(d, bindex); - if (h_d) - au_set_h_dptr(d, bindex, NULL); - } - au_set_dbend(d, a->btgt); - - sb = d->d_sb; - i = a->src_inode; - if (au_opt_test(au_mntflags(sb), PLINK) && au_plink_test(i)) - return; /* success */ - - bend = au_ibend(i); - for (bindex = a->btgt + 1; bindex <= bend; bindex++) { - h_i = au_h_iptr(i, bindex); - if (h_i) { - au_xino_write(sb, bindex, h_i->i_ino, /*ino*/0); - /* ignore this error */ - au_set_h_iptr(i, bindex, NULL, 0); - } - } - au_set_ibend(i, a->btgt); -} - -/* ---------------------------------------------------------------------- */ - -/* mainly for link(2) and rename(2) */ -int au_wbr(struct dentry *dentry, aufs_bindex_t btgt) -{ - aufs_bindex_t bdiropq, bwh; - struct dentry *parent; - struct au_branch *br; - - parent = dentry->d_parent; - IMustLock(d_inode(parent)); /* dir is locked */ - - bdiropq = au_dbdiropq(parent); - bwh = au_dbwh(dentry); - br = au_sbr(dentry->d_sb, btgt); - if (au_br_rdonly(br) - || (0 <= bdiropq && bdiropq < btgt) - || (0 <= bwh && bwh < btgt)) - btgt = -1; - - AuDbg("btgt %d\n", btgt); - return btgt; -} - -/* sets src_bstart, dst_bstart and btgt */ -static int au_ren_wbr(struct au_ren_args *a) -{ - int err; - struct au_wr_dir_args wr_dir_args = { - /* .force_btgt = -1, */ - .flags = AuWrDir_ADD_ENTRY - }; - - a->src_bstart = au_dbstart(a->src_dentry); - a->dst_bstart = au_dbstart(a->dst_dentry); - if (au_ftest_ren(a->flags, ISDIR)) - au_fset_wrdir(wr_dir_args.flags, ISDIR); - wr_dir_args.force_btgt = a->src_bstart; - if (a->dst_inode && a->dst_bstart < a->src_bstart) - wr_dir_args.force_btgt = a->dst_bstart; - wr_dir_args.force_btgt = au_wbr(a->dst_dentry, wr_dir_args.force_btgt); - err = au_wr_dir(a->dst_dentry, a->src_dentry, &wr_dir_args); - a->btgt = err; - - return err; -} - -static void au_ren_dt(struct au_ren_args *a) -{ - a->h_path.dentry = a->src_h_parent; - au_dtime_store(a->src_dt + AuPARENT, a->src_parent, &a->h_path); - if (!au_ftest_ren(a->flags, ISSAMEDIR)) { - a->h_path.dentry = a->dst_h_parent; - au_dtime_store(a->dst_dt + AuPARENT, a->dst_parent, &a->h_path); - } - - au_fclr_ren(a->flags, DT_DSTDIR); - if (!au_ftest_ren(a->flags, ISDIR)) - return; - - a->h_path.dentry = a->src_h_dentry; - au_dtime_store(a->src_dt + AuCHILD, a->src_dentry, &a->h_path); - if (d_is_positive(a->dst_h_dentry)) { - au_fset_ren(a->flags, DT_DSTDIR); - a->h_path.dentry = a->dst_h_dentry; - au_dtime_store(a->dst_dt + AuCHILD, a->dst_dentry, &a->h_path); - } -} - -static void au_ren_rev_dt(int err, struct au_ren_args *a) -{ - struct dentry *h_d; - struct mutex *h_mtx; - - au_dtime_revert(a->src_dt + AuPARENT); - if (!au_ftest_ren(a->flags, ISSAMEDIR)) - au_dtime_revert(a->dst_dt + AuPARENT); - - if (au_ftest_ren(a->flags, ISDIR) && err != -EIO) { - h_d = a->src_dt[AuCHILD].dt_h_path.dentry; - h_mtx = &d_inode(h_d)->i_mutex; - mutex_lock_nested(h_mtx, AuLsc_I_CHILD); - au_dtime_revert(a->src_dt + AuCHILD); - mutex_unlock(h_mtx); - - if (au_ftest_ren(a->flags, DT_DSTDIR)) { - h_d = a->dst_dt[AuCHILD].dt_h_path.dentry; - h_mtx = &d_inode(h_d)->i_mutex; - mutex_lock_nested(h_mtx, AuLsc_I_CHILD); - au_dtime_revert(a->dst_dt + AuCHILD); - mutex_unlock(h_mtx); - } - } -} - -/* ---------------------------------------------------------------------- */ - -int aufs_rename(struct inode *_src_dir, struct dentry *_src_dentry, - struct inode *_dst_dir, struct dentry *_dst_dentry) -{ - int err, flags; - /* reduce stack space */ - struct au_ren_args *a; - - AuDbg("%pd, %pd\n", _src_dentry, _dst_dentry); - IMustLock(_src_dir); - IMustLock(_dst_dir); - - err = -ENOMEM; - BUILD_BUG_ON(sizeof(*a) > PAGE_SIZE); - a = kzalloc(sizeof(*a), GFP_NOFS); - if (unlikely(!a)) - goto out; - - a->src_dir = _src_dir; - a->src_dentry = _src_dentry; - a->src_inode = NULL; - if (d_really_is_positive(a->src_dentry)) - a->src_inode = d_inode(a->src_dentry); - a->src_parent = a->src_dentry->d_parent; /* dir inode is locked */ - a->dst_dir = _dst_dir; - a->dst_dentry = _dst_dentry; - a->dst_inode = NULL; - if (d_really_is_positive(a->dst_dentry)) - a->dst_inode = d_inode(a->dst_dentry); - a->dst_parent = a->dst_dentry->d_parent; /* dir inode is locked */ - if (a->dst_inode) { - IMustLock(a->dst_inode); - au_igrab(a->dst_inode); - } - - err = -ENOTDIR; - flags = AuLock_FLUSH | AuLock_NOPLM | AuLock_GEN; - if (d_is_dir(a->src_dentry)) { - au_fset_ren(a->flags, ISDIR); - if (unlikely(d_really_is_positive(a->dst_dentry) - && !d_is_dir(a->dst_dentry))) - goto out_free; - flags |= AuLock_DIRS; - } - err = aufs_read_and_write_lock2(a->dst_dentry, a->src_dentry, flags); - if (unlikely(err)) - goto out_free; - - err = au_d_hashed_positive(a->src_dentry); - if (unlikely(err)) - goto out_unlock; - err = -ENOENT; - if (a->dst_inode) { - /* - * If it is a dir, VFS unhash dst_dentry before this - * function. It means we cannot rely upon d_unhashed(). - */ - if (unlikely(!a->dst_inode->i_nlink)) - goto out_unlock; - if (!S_ISDIR(a->dst_inode->i_mode)) { - err = au_d_hashed_positive(a->dst_dentry); - if (unlikely(err)) - goto out_unlock; - } else if (unlikely(IS_DEADDIR(a->dst_inode))) - goto out_unlock; - } else if (unlikely(d_unhashed(a->dst_dentry))) - goto out_unlock; - - /* - * is it possible? - * yes, it happened (in linux-3.3-rcN) but I don't know why. - * there may exist a problem somewhere else. - */ - err = -EINVAL; - if (unlikely(d_inode(a->dst_parent) == d_inode(a->src_dentry))) - goto out_unlock; - - au_fset_ren(a->flags, ISSAMEDIR); /* temporary */ - di_write_lock_parent(a->dst_parent); - - /* which branch we process */ - err = au_ren_wbr(a); - if (unlikely(err < 0)) - goto out_parent; - a->br = au_sbr(a->dst_dentry->d_sb, a->btgt); - a->h_path.mnt = au_br_mnt(a->br); - - /* are they available to be renamed */ - err = au_ren_may_dir(a); - if (unlikely(err)) - goto out_children; - - /* prepare the writable parent dir on the same branch */ - if (a->dst_bstart == a->btgt) { - au_fset_ren(a->flags, WHDST); - } else { - err = au_cpup_dirs(a->dst_dentry, a->btgt); - if (unlikely(err)) - goto out_children; - } - - if (a->src_dir != a->dst_dir) { - /* - * this temporary unlock is safe, - * because both dir->i_mutex are locked. - */ - di_write_unlock(a->dst_parent); - di_write_lock_parent(a->src_parent); - err = au_wr_dir_need_wh(a->src_dentry, - au_ftest_ren(a->flags, ISDIR), - &a->btgt); - di_write_unlock(a->src_parent); - di_write_lock2_parent(a->src_parent, a->dst_parent, /*isdir*/1); - au_fclr_ren(a->flags, ISSAMEDIR); - } else - err = au_wr_dir_need_wh(a->src_dentry, - au_ftest_ren(a->flags, ISDIR), - &a->btgt); - if (unlikely(err < 0)) - goto out_children; - if (err) - au_fset_ren(a->flags, WHSRC); - - /* cpup src */ - if (a->src_bstart != a->btgt) { - struct au_pin pin; - - err = au_pin(&pin, a->src_dentry, a->btgt, - au_opt_udba(a->src_dentry->d_sb), - AuPin_DI_LOCKED | AuPin_MNT_WRITE); - if (!err) { - struct au_cp_generic cpg = { - .dentry = a->src_dentry, - .bdst = a->btgt, - .bsrc = a->src_bstart, - .len = -1, - .pin = &pin, - .flags = AuCpup_DTIME | AuCpup_HOPEN - }; - AuDebugOn(au_dbstart(a->src_dentry) != a->src_bstart); - err = au_sio_cpup_simple(&cpg); - au_unpin(&pin); - } - if (unlikely(err)) - goto out_children; - a->src_bstart = a->btgt; - a->src_h_dentry = au_h_dptr(a->src_dentry, a->btgt); - au_fset_ren(a->flags, WHSRC); - } - - /* lock them all */ - err = au_ren_lock(a); - if (unlikely(err)) - /* leave the copied-up one */ - goto out_children; - - if (!au_opt_test(au_mntflags(a->dst_dir->i_sb), UDBA_NONE)) - err = au_may_ren(a); - else if (unlikely(a->dst_dentry->d_name.len > AUFS_MAX_NAMELEN)) - err = -ENAMETOOLONG; - if (unlikely(err)) - goto out_hdir; - - /* store timestamps to be revertible */ - au_ren_dt(a); - - /* here we go */ - err = do_rename(a); - if (unlikely(err)) - goto out_dt; - - /* update dir attributes */ - au_ren_refresh_dir(a); - - /* dput/iput all lower dentries */ - au_ren_refresh(a); - - goto out_hdir; /* success */ - -out_dt: - au_ren_rev_dt(err, a); -out_hdir: - au_ren_unlock(a); -out_children: - au_nhash_wh_free(&a->whlist); - if (err && a->dst_inode && a->dst_bstart != a->btgt) { - AuDbg("bstart %d, btgt %d\n", a->dst_bstart, a->btgt); - au_set_h_dptr(a->dst_dentry, a->btgt, NULL); - au_set_dbstart(a->dst_dentry, a->dst_bstart); - } -out_parent: - if (!err) - d_move(a->src_dentry, a->dst_dentry); - else { - au_update_dbstart(a->dst_dentry); - if (!a->dst_inode) - d_drop(a->dst_dentry); - } - if (au_ftest_ren(a->flags, ISSAMEDIR)) - di_write_unlock(a->dst_parent); - else - di_write_unlock2(a->src_parent, a->dst_parent); -out_unlock: - aufs_read_and_write_unlock2(a->dst_dentry, a->src_dentry); -out_free: - iput(a->dst_inode); - if (a->thargs) - au_whtmp_rmdir_free(a->thargs); - kfree(a); -out: - AuTraceErr(err); - return err; -} diff --git a/fs/aufs/iinfo.c b/fs/aufs/iinfo.c deleted file mode 100644 index 67ef672a0..000000000 --- a/fs/aufs/iinfo.c +++ /dev/null @@ -1,264 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * inode private data - */ - -#include "aufs.h" - -struct inode *au_h_iptr(struct inode *inode, aufs_bindex_t bindex) -{ - struct inode *h_inode; - - IiMustAnyLock(inode); - - h_inode = au_ii(inode)->ii_hinode[0 + bindex].hi_inode; - AuDebugOn(h_inode && atomic_read(&h_inode->i_count) <= 0); - return h_inode; -} - -/* todo: hard/soft set? */ -void au_hiput(struct au_hinode *hinode) -{ - au_hn_free(hinode); - dput(hinode->hi_whdentry); - iput(hinode->hi_inode); -} - -unsigned int au_hi_flags(struct inode *inode, int isdir) -{ - unsigned int flags; - const unsigned int mnt_flags = au_mntflags(inode->i_sb); - - flags = 0; - if (au_opt_test(mnt_flags, XINO)) - au_fset_hi(flags, XINO); - if (isdir && au_opt_test(mnt_flags, UDBA_HNOTIFY)) - au_fset_hi(flags, HNOTIFY); - return flags; -} - -void au_set_h_iptr(struct inode *inode, aufs_bindex_t bindex, - struct inode *h_inode, unsigned int flags) -{ - struct au_hinode *hinode; - struct inode *hi; - struct au_iinfo *iinfo = au_ii(inode); - - IiMustWriteLock(inode); - - hinode = iinfo->ii_hinode + bindex; - hi = hinode->hi_inode; - AuDebugOn(h_inode && atomic_read(&h_inode->i_count) <= 0); - - if (hi) - au_hiput(hinode); - hinode->hi_inode = h_inode; - if (h_inode) { - int err; - struct super_block *sb = inode->i_sb; - struct au_branch *br; - - AuDebugOn(inode->i_mode - && (h_inode->i_mode & S_IFMT) - != (inode->i_mode & S_IFMT)); - if (bindex == iinfo->ii_bstart) - au_cpup_igen(inode, h_inode); - br = au_sbr(sb, bindex); - hinode->hi_id = br->br_id; - if (au_ftest_hi(flags, XINO)) { - err = au_xino_write(sb, bindex, h_inode->i_ino, - inode->i_ino); - if (unlikely(err)) - AuIOErr1("failed au_xino_write() %d\n", err); - } - - if (au_ftest_hi(flags, HNOTIFY) - && au_br_hnotifyable(br->br_perm)) { - err = au_hn_alloc(hinode, inode); - if (unlikely(err)) - AuIOErr1("au_hn_alloc() %d\n", err); - } - } -} - -void au_set_hi_wh(struct inode *inode, aufs_bindex_t bindex, - struct dentry *h_wh) -{ - struct au_hinode *hinode; - - IiMustWriteLock(inode); - - hinode = au_ii(inode)->ii_hinode + bindex; - AuDebugOn(hinode->hi_whdentry); - hinode->hi_whdentry = h_wh; -} - -void au_update_iigen(struct inode *inode, int half) -{ - struct au_iinfo *iinfo; - struct au_iigen *iigen; - unsigned int sigen; - - sigen = au_sigen(inode->i_sb); - iinfo = au_ii(inode); - iigen = &iinfo->ii_generation; - spin_lock(&iigen->ig_spin); - iigen->ig_generation = sigen; - if (half) - au_ig_fset(iigen->ig_flags, HALF_REFRESHED); - else - au_ig_fclr(iigen->ig_flags, HALF_REFRESHED); - spin_unlock(&iigen->ig_spin); -} - -/* it may be called at remount time, too */ -void au_update_ibrange(struct inode *inode, int do_put_zero) -{ - struct au_iinfo *iinfo; - aufs_bindex_t bindex, bend; - - iinfo = au_ii(inode); - if (!iinfo) - return; - - IiMustWriteLock(inode); - - if (do_put_zero && iinfo->ii_bstart >= 0) { - for (bindex = iinfo->ii_bstart; bindex <= iinfo->ii_bend; - bindex++) { - struct inode *h_i; - - h_i = iinfo->ii_hinode[0 + bindex].hi_inode; - if (h_i - && !h_i->i_nlink - && !(h_i->i_state & I_LINKABLE)) - au_set_h_iptr(inode, bindex, NULL, 0); - } - } - - iinfo->ii_bstart = -1; - iinfo->ii_bend = -1; - bend = au_sbend(inode->i_sb); - for (bindex = 0; bindex <= bend; bindex++) - if (iinfo->ii_hinode[0 + bindex].hi_inode) { - iinfo->ii_bstart = bindex; - break; - } - if (iinfo->ii_bstart >= 0) - for (bindex = bend; bindex >= iinfo->ii_bstart; bindex--) - if (iinfo->ii_hinode[0 + bindex].hi_inode) { - iinfo->ii_bend = bindex; - break; - } - AuDebugOn(iinfo->ii_bstart > iinfo->ii_bend); -} - -/* ---------------------------------------------------------------------- */ - -void au_icntnr_init_once(void *_c) -{ - struct au_icntnr *c = _c; - struct au_iinfo *iinfo = &c->iinfo; - static struct lock_class_key aufs_ii; - - spin_lock_init(&iinfo->ii_generation.ig_spin); - au_rw_init(&iinfo->ii_rwsem); - au_rw_class(&iinfo->ii_rwsem, &aufs_ii); - inode_init_once(&c->vfs_inode); -} - -int au_iinfo_init(struct inode *inode) -{ - struct au_iinfo *iinfo; - struct super_block *sb; - int nbr, i; - - sb = inode->i_sb; - iinfo = &(container_of(inode, struct au_icntnr, vfs_inode)->iinfo); - nbr = au_sbend(sb) + 1; - if (unlikely(nbr <= 0)) - nbr = 1; - iinfo->ii_hinode = kcalloc(nbr, sizeof(*iinfo->ii_hinode), GFP_NOFS); - if (iinfo->ii_hinode) { - au_ninodes_inc(sb); - for (i = 0; i < nbr; i++) - iinfo->ii_hinode[i].hi_id = -1; - - iinfo->ii_generation.ig_generation = au_sigen(sb); - iinfo->ii_bstart = -1; - iinfo->ii_bend = -1; - iinfo->ii_vdir = NULL; - return 0; - } - return -ENOMEM; -} - -int au_ii_realloc(struct au_iinfo *iinfo, int nbr) -{ - int err, sz; - struct au_hinode *hip; - - AuRwMustWriteLock(&iinfo->ii_rwsem); - - err = -ENOMEM; - sz = sizeof(*hip) * (iinfo->ii_bend + 1); - if (!sz) - sz = sizeof(*hip); - hip = au_kzrealloc(iinfo->ii_hinode, sz, sizeof(*hip) * nbr, GFP_NOFS); - if (hip) { - iinfo->ii_hinode = hip; - err = 0; - } - - return err; -} - -void au_iinfo_fin(struct inode *inode) -{ - struct au_iinfo *iinfo; - struct au_hinode *hi; - struct super_block *sb; - aufs_bindex_t bindex, bend; - const unsigned char unlinked = !inode->i_nlink; - - iinfo = au_ii(inode); - /* bad_inode case */ - if (!iinfo) - return; - - sb = inode->i_sb; - au_ninodes_dec(sb); - if (si_pid_test(sb)) - au_xino_delete_inode(inode, unlinked); - else { - /* - * it is safe to hide the dependency between sbinfo and - * sb->s_umount. - */ - lockdep_off(); - si_noflush_read_lock(sb); - au_xino_delete_inode(inode, unlinked); - si_read_unlock(sb); - lockdep_on(); - } - - if (iinfo->ii_vdir) - au_vdir_free(iinfo->ii_vdir); - - bindex = iinfo->ii_bstart; - if (bindex >= 0) { - hi = iinfo->ii_hinode + bindex; - bend = iinfo->ii_bend; - while (bindex++ <= bend) { - if (hi->hi_inode) - au_hiput(hi); - hi++; - } - } - kfree(iinfo->ii_hinode); - iinfo->ii_hinode = NULL; - AuRwDestroy(&iinfo->ii_rwsem); -} diff --git a/fs/aufs/inode.c b/fs/aufs/inode.c deleted file mode 100644 index 5a87727ba..000000000 --- a/fs/aufs/inode.c +++ /dev/null @@ -1,514 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * inode functions - */ - -#include "aufs.h" - -struct inode *au_igrab(struct inode *inode) -{ - if (inode) { - AuDebugOn(!atomic_read(&inode->i_count)); - ihold(inode); - } - return inode; -} - -static void au_refresh_hinode_attr(struct inode *inode, int do_version) -{ - au_cpup_attr_all(inode, /*force*/0); - au_update_iigen(inode, /*half*/1); - if (do_version) - inode->i_version++; -} - -static int au_ii_refresh(struct inode *inode, int *update) -{ - int err, e; - umode_t type; - aufs_bindex_t bindex, new_bindex; - struct super_block *sb; - struct au_iinfo *iinfo; - struct au_hinode *p, *q, tmp; - - IiMustWriteLock(inode); - - *update = 0; - sb = inode->i_sb; - type = inode->i_mode & S_IFMT; - iinfo = au_ii(inode); - err = au_ii_realloc(iinfo, au_sbend(sb) + 1); - if (unlikely(err)) - goto out; - - AuDebugOn(iinfo->ii_bstart < 0); - p = iinfo->ii_hinode + iinfo->ii_bstart; - for (bindex = iinfo->ii_bstart; bindex <= iinfo->ii_bend; - bindex++, p++) { - if (!p->hi_inode) - continue; - - AuDebugOn(type != (p->hi_inode->i_mode & S_IFMT)); - new_bindex = au_br_index(sb, p->hi_id); - if (new_bindex == bindex) - continue; - - if (new_bindex < 0) { - *update = 1; - au_hiput(p); - p->hi_inode = NULL; - continue; - } - - if (new_bindex < iinfo->ii_bstart) - iinfo->ii_bstart = new_bindex; - if (iinfo->ii_bend < new_bindex) - iinfo->ii_bend = new_bindex; - /* swap two lower inode, and loop again */ - q = iinfo->ii_hinode + new_bindex; - tmp = *q; - *q = *p; - *p = tmp; - if (tmp.hi_inode) { - bindex--; - p--; - } - } - au_update_ibrange(inode, /*do_put_zero*/0); - e = au_dy_irefresh(inode); - if (unlikely(e && !err)) - err = e; - -out: - AuTraceErr(err); - return err; -} - -void au_refresh_iop(struct inode *inode, int force_getattr) -{ - int type; - struct au_sbinfo *sbi = au_sbi(inode->i_sb); - const struct inode_operations *iop - = force_getattr ? aufs_iop : sbi->si_iop_array; - - if (inode->i_op == iop) - return; - - switch (inode->i_mode & S_IFMT) { - case S_IFDIR: - type = AuIop_DIR; - break; - case S_IFLNK: - type = AuIop_SYMLINK; - break; - default: - type = AuIop_OTHER; - break; - } - - inode->i_op = iop + type; - /* unnecessary smp_wmb() */ -} - -int au_refresh_hinode_self(struct inode *inode) -{ - int err, update; - - err = au_ii_refresh(inode, &update); - if (!err) - au_refresh_hinode_attr(inode, update && S_ISDIR(inode->i_mode)); - - AuTraceErr(err); - return err; -} - -int au_refresh_hinode(struct inode *inode, struct dentry *dentry) -{ - int err, e, update; - unsigned int flags; - umode_t mode; - aufs_bindex_t bindex, bend; - unsigned char isdir; - struct au_hinode *p; - struct au_iinfo *iinfo; - - err = au_ii_refresh(inode, &update); - if (unlikely(err)) - goto out; - - update = 0; - iinfo = au_ii(inode); - p = iinfo->ii_hinode + iinfo->ii_bstart; - mode = (inode->i_mode & S_IFMT); - isdir = S_ISDIR(mode); - flags = au_hi_flags(inode, isdir); - bend = au_dbend(dentry); - for (bindex = au_dbstart(dentry); bindex <= bend; bindex++) { - struct inode *h_i, *h_inode; - struct dentry *h_d; - - h_d = au_h_dptr(dentry, bindex); - if (!h_d || d_is_negative(h_d)) - continue; - - h_inode = d_inode(h_d); - AuDebugOn(mode != (h_inode->i_mode & S_IFMT)); - if (iinfo->ii_bstart <= bindex && bindex <= iinfo->ii_bend) { - h_i = au_h_iptr(inode, bindex); - if (h_i) { - if (h_i == h_inode) - continue; - err = -EIO; - break; - } - } - if (bindex < iinfo->ii_bstart) - iinfo->ii_bstart = bindex; - if (iinfo->ii_bend < bindex) - iinfo->ii_bend = bindex; - au_set_h_iptr(inode, bindex, au_igrab(h_inode), flags); - update = 1; - } - au_update_ibrange(inode, /*do_put_zero*/0); - e = au_dy_irefresh(inode); - if (unlikely(e && !err)) - err = e; - if (!err) - au_refresh_hinode_attr(inode, update && isdir); - -out: - AuTraceErr(err); - return err; -} - -static int set_inode(struct inode *inode, struct dentry *dentry) -{ - int err; - unsigned int flags; - umode_t mode; - aufs_bindex_t bindex, bstart, btail; - unsigned char isdir; - struct dentry *h_dentry; - struct inode *h_inode; - struct au_iinfo *iinfo; - struct inode_operations *iop; - - IiMustWriteLock(inode); - - err = 0; - isdir = 0; - iop = au_sbi(inode->i_sb)->si_iop_array; - bstart = au_dbstart(dentry); - h_dentry = au_h_dptr(dentry, bstart); - h_inode = d_inode(h_dentry); - mode = h_inode->i_mode; - switch (mode & S_IFMT) { - case S_IFREG: - btail = au_dbtail(dentry); - inode->i_op = iop + AuIop_OTHER; - inode->i_fop = &aufs_file_fop; - err = au_dy_iaop(inode, bstart, h_inode); - if (unlikely(err)) - goto out; - break; - case S_IFDIR: - isdir = 1; - btail = au_dbtaildir(dentry); - inode->i_op = iop + AuIop_DIR; - inode->i_fop = &aufs_dir_fop; - break; - case S_IFLNK: - btail = au_dbtail(dentry); - inode->i_op = iop + AuIop_SYMLINK; - break; - case S_IFBLK: - case S_IFCHR: - case S_IFIFO: - case S_IFSOCK: - btail = au_dbtail(dentry); - inode->i_op = iop + AuIop_OTHER; - init_special_inode(inode, mode, h_inode->i_rdev); - break; - default: - AuIOErr("Unknown file type 0%o\n", mode); - err = -EIO; - goto out; - } - - /* do not set hnotify for whiteouted dirs (SHWH mode) */ - flags = au_hi_flags(inode, isdir); - if (au_opt_test(au_mntflags(dentry->d_sb), SHWH) - && au_ftest_hi(flags, HNOTIFY) - && dentry->d_name.len > AUFS_WH_PFX_LEN - && !memcmp(dentry->d_name.name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) - au_fclr_hi(flags, HNOTIFY); - iinfo = au_ii(inode); - iinfo->ii_bstart = bstart; - iinfo->ii_bend = btail; - for (bindex = bstart; bindex <= btail; bindex++) { - h_dentry = au_h_dptr(dentry, bindex); - if (h_dentry) - au_set_h_iptr(inode, bindex, - au_igrab(d_inode(h_dentry)), flags); - } - au_cpup_attr_all(inode, /*force*/1); - /* - * to force calling aufs_get_acl() every time, - * do not call cache_no_acl() for aufs inode. - */ - -out: - return err; -} - -/* - * successful returns with iinfo write_locked - * minus: errno - * zero: success, matched - * plus: no error, but unmatched - */ -static int reval_inode(struct inode *inode, struct dentry *dentry) -{ - int err; - unsigned int gen, igflags; - aufs_bindex_t bindex, bend; - struct inode *h_inode, *h_dinode; - struct dentry *h_dentry; - - /* - * before this function, if aufs got any iinfo lock, it must be only - * one, the parent dir. - * it can happen by UDBA and the obsoleted inode number. - */ - err = -EIO; - if (unlikely(inode->i_ino == parent_ino(dentry))) - goto out; - - err = 1; - ii_write_lock_new_child(inode); - h_dentry = au_h_dptr(dentry, au_dbstart(dentry)); - h_dinode = d_inode(h_dentry); - bend = au_ibend(inode); - for (bindex = au_ibstart(inode); bindex <= bend; bindex++) { - h_inode = au_h_iptr(inode, bindex); - if (!h_inode || h_inode != h_dinode) - continue; - - err = 0; - gen = au_iigen(inode, &igflags); - if (gen == au_digen(dentry) - && !au_ig_ftest(igflags, HALF_REFRESHED)) - break; - - /* fully refresh inode using dentry */ - err = au_refresh_hinode(inode, dentry); - if (!err) - au_update_iigen(inode, /*half*/0); - break; - } - - if (unlikely(err)) - ii_write_unlock(inode); -out: - return err; -} - -int au_ino(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, - unsigned int d_type, ino_t *ino) -{ - int err; - struct mutex *mtx; - - /* prevent hardlinked inode number from race condition */ - mtx = NULL; - if (d_type != DT_DIR) { - mtx = &au_sbr(sb, bindex)->br_xino.xi_nondir_mtx; - mutex_lock(mtx); - } - err = au_xino_read(sb, bindex, h_ino, ino); - if (unlikely(err)) - goto out; - - if (!*ino) { - err = -EIO; - *ino = au_xino_new_ino(sb); - if (unlikely(!*ino)) - goto out; - err = au_xino_write(sb, bindex, h_ino, *ino); - if (unlikely(err)) - goto out; - } - -out: - if (mtx) - mutex_unlock(mtx); - return err; -} - -/* successful returns with iinfo write_locked */ -/* todo: return with unlocked? */ -struct inode *au_new_inode(struct dentry *dentry, int must_new) -{ - struct inode *inode, *h_inode; - struct dentry *h_dentry; - struct super_block *sb; - struct mutex *mtx; - ino_t h_ino, ino; - int err; - aufs_bindex_t bstart; - - sb = dentry->d_sb; - bstart = au_dbstart(dentry); - h_dentry = au_h_dptr(dentry, bstart); - h_inode = d_inode(h_dentry); - h_ino = h_inode->i_ino; - - /* - * stop 'race'-ing between hardlinks under different - * parents. - */ - mtx = NULL; - if (!d_is_dir(h_dentry)) - mtx = &au_sbr(sb, bstart)->br_xino.xi_nondir_mtx; - -new_ino: - if (mtx) - mutex_lock(mtx); - err = au_xino_read(sb, bstart, h_ino, &ino); - inode = ERR_PTR(err); - if (unlikely(err)) - goto out; - - if (!ino) { - ino = au_xino_new_ino(sb); - if (unlikely(!ino)) { - inode = ERR_PTR(-EIO); - goto out; - } - } - - AuDbg("i%lu\n", (unsigned long)ino); - inode = au_iget_locked(sb, ino); - err = PTR_ERR(inode); - if (IS_ERR(inode)) - goto out; - - AuDbg("%lx, new %d\n", inode->i_state, !!(inode->i_state & I_NEW)); - if (inode->i_state & I_NEW) { - /* verbose coding for lock class name */ - if (unlikely(d_is_symlink(h_dentry))) - au_rw_class(&au_ii(inode)->ii_rwsem, - au_lc_key + AuLcSymlink_IIINFO); - else if (unlikely(d_is_dir(h_dentry))) - au_rw_class(&au_ii(inode)->ii_rwsem, - au_lc_key + AuLcDir_IIINFO); - else /* likely */ - au_rw_class(&au_ii(inode)->ii_rwsem, - au_lc_key + AuLcNonDir_IIINFO); - - ii_write_lock_new_child(inode); - err = set_inode(inode, dentry); - if (!err) { - unlock_new_inode(inode); - goto out; /* success */ - } - - /* - * iget_failed() calls iput(), but we need to call - * ii_write_unlock() after iget_failed(). so dirty hack for - * i_count. - */ - atomic_inc(&inode->i_count); - iget_failed(inode); - ii_write_unlock(inode); - au_xino_write(sb, bstart, h_ino, /*ino*/0); - /* ignore this error */ - goto out_iput; - } else if (!must_new && !IS_DEADDIR(inode) && inode->i_nlink) { - /* - * horrible race condition between lookup, readdir and copyup - * (or something). - */ - if (mtx) - mutex_unlock(mtx); - err = reval_inode(inode, dentry); - if (unlikely(err < 0)) { - mtx = NULL; - goto out_iput; - } - - if (!err) { - mtx = NULL; - goto out; /* success */ - } else if (mtx) - mutex_lock(mtx); - } - - if (unlikely(au_test_fs_unique_ino(h_inode))) - AuWarn1("Warning: Un-notified UDBA or repeatedly renamed dir," - " b%d, %s, %pd, hi%lu, i%lu.\n", - bstart, au_sbtype(h_dentry->d_sb), dentry, - (unsigned long)h_ino, (unsigned long)ino); - ino = 0; - err = au_xino_write(sb, bstart, h_ino, /*ino*/0); - if (!err) { - iput(inode); - if (mtx) - mutex_unlock(mtx); - goto new_ino; - } - -out_iput: - iput(inode); - inode = ERR_PTR(err); -out: - if (mtx) - mutex_unlock(mtx); - return inode; -} - -/* ---------------------------------------------------------------------- */ - -int au_test_ro(struct super_block *sb, aufs_bindex_t bindex, - struct inode *inode) -{ - int err; - struct inode *hi; - - err = au_br_rdonly(au_sbr(sb, bindex)); - - /* pseudo-link after flushed may happen out of bounds */ - if (!err - && inode - && au_ibstart(inode) <= bindex - && bindex <= au_ibend(inode)) { - /* - * permission check is unnecessary since vfsub routine - * will be called later - */ - hi = au_h_iptr(inode, bindex); - if (hi) - err = IS_IMMUTABLE(hi) ? -EROFS : 0; - } - - return err; -} - -int au_test_h_perm(struct inode *h_inode, int mask) -{ - if (uid_eq(current_fsuid(), GLOBAL_ROOT_UID)) - return 0; - return inode_permission(h_inode, mask); -} - -int au_test_h_perm_sio(struct inode *h_inode, int mask) -{ - if (au_test_nfs(h_inode->i_sb) - && (mask & MAY_WRITE) - && S_ISDIR(h_inode->i_mode)) - mask |= MAY_READ; /* force permission check */ - return au_test_h_perm(h_inode, mask); -} diff --git a/fs/aufs/inode.h b/fs/aufs/inode.h deleted file mode 100644 index 534b9e814..000000000 --- a/fs/aufs/inode.h +++ /dev/null @@ -1,672 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * inode operations - */ - -#ifndef __AUFS_INODE_H__ -#define __AUFS_INODE_H__ - -#ifdef __KERNEL__ - -#include <linux/fsnotify.h> -#include "rwsem.h" - -struct vfsmount; - -struct au_hnotify { -#ifdef CONFIG_AUFS_HNOTIFY -#ifdef CONFIG_AUFS_HFSNOTIFY - /* never use fsnotify_add_vfsmount_mark() */ - struct fsnotify_mark hn_mark; -#endif - struct inode *hn_aufs_inode; /* no get/put */ -#endif -} ____cacheline_aligned_in_smp; - -struct au_hinode { - struct inode *hi_inode; - aufs_bindex_t hi_id; -#ifdef CONFIG_AUFS_HNOTIFY - struct au_hnotify *hi_notify; -#endif - - /* reference to the copied-up whiteout with get/put */ - struct dentry *hi_whdentry; -}; - -/* ig_flags */ -#define AuIG_HALF_REFRESHED 1 -#define au_ig_ftest(flags, name) ((flags) & AuIG_##name) -#define au_ig_fset(flags, name) \ - do { (flags) |= AuIG_##name; } while (0) -#define au_ig_fclr(flags, name) \ - do { (flags) &= ~AuIG_##name; } while (0) - -struct au_iigen { - spinlock_t ig_spin; - __u32 ig_generation, ig_flags; -}; - -struct au_vdir; -struct au_iinfo { - struct au_iigen ii_generation; - struct super_block *ii_hsb1; /* no get/put */ - - struct au_rwsem ii_rwsem; - aufs_bindex_t ii_bstart, ii_bend; - __u32 ii_higen; - struct au_hinode *ii_hinode; - struct au_vdir *ii_vdir; -}; - -struct au_icntnr { - struct au_iinfo iinfo; - struct inode vfs_inode; -} ____cacheline_aligned_in_smp; - -/* au_pin flags */ -#define AuPin_DI_LOCKED 1 -#define AuPin_MNT_WRITE (1 << 1) -#define au_ftest_pin(flags, name) ((flags) & AuPin_##name) -#define au_fset_pin(flags, name) \ - do { (flags) |= AuPin_##name; } while (0) -#define au_fclr_pin(flags, name) \ - do { (flags) &= ~AuPin_##name; } while (0) - -struct au_pin { - /* input */ - struct dentry *dentry; - unsigned int udba; - unsigned char lsc_di, lsc_hi, flags; - aufs_bindex_t bindex; - - /* output */ - struct dentry *parent; - struct au_hinode *hdir; - struct vfsmount *h_mnt; - - /* temporary unlock/relock for copyup */ - struct dentry *h_dentry, *h_parent; - struct au_branch *br; - struct task_struct *task; -}; - -void au_pin_hdir_unlock(struct au_pin *p); -int au_pin_hdir_lock(struct au_pin *p); -int au_pin_hdir_relock(struct au_pin *p); -void au_pin_hdir_set_owner(struct au_pin *p, struct task_struct *task); -void au_pin_hdir_acquire_nest(struct au_pin *p); -void au_pin_hdir_release(struct au_pin *p); - -/* ---------------------------------------------------------------------- */ - -static inline struct au_iinfo *au_ii(struct inode *inode) -{ - struct au_iinfo *iinfo; - - iinfo = &(container_of(inode, struct au_icntnr, vfs_inode)->iinfo); - if (iinfo->ii_hinode) - return iinfo; - return NULL; /* debugging bad_inode case */ -} - -/* ---------------------------------------------------------------------- */ - -/* inode.c */ -struct inode *au_igrab(struct inode *inode); -void au_refresh_iop(struct inode *inode, int force_getattr); -int au_refresh_hinode_self(struct inode *inode); -int au_refresh_hinode(struct inode *inode, struct dentry *dentry); -int au_ino(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, - unsigned int d_type, ino_t *ino); -struct inode *au_new_inode(struct dentry *dentry, int must_new); -int au_test_ro(struct super_block *sb, aufs_bindex_t bindex, - struct inode *inode); -int au_test_h_perm(struct inode *h_inode, int mask); -int au_test_h_perm_sio(struct inode *h_inode, int mask); - -static inline int au_wh_ino(struct super_block *sb, aufs_bindex_t bindex, - ino_t h_ino, unsigned int d_type, ino_t *ino) -{ -#ifdef CONFIG_AUFS_SHWH - return au_ino(sb, bindex, h_ino, d_type, ino); -#else - return 0; -#endif -} - -/* i_op.c */ -enum { - AuIop_SYMLINK, - AuIop_DIR, - AuIop_OTHER, - AuIop_Last -}; -extern struct inode_operations aufs_iop[AuIop_Last], - aufs_iop_nogetattr[AuIop_Last]; - -/* au_wr_dir flags */ -#define AuWrDir_ADD_ENTRY 1 -#define AuWrDir_ISDIR (1 << 1) -#define AuWrDir_TMPFILE (1 << 2) -#define au_ftest_wrdir(flags, name) ((flags) & AuWrDir_##name) -#define au_fset_wrdir(flags, name) \ - do { (flags) |= AuWrDir_##name; } while (0) -#define au_fclr_wrdir(flags, name) \ - do { (flags) &= ~AuWrDir_##name; } while (0) - -struct au_wr_dir_args { - aufs_bindex_t force_btgt; - unsigned char flags; -}; -int au_wr_dir(struct dentry *dentry, struct dentry *src_dentry, - struct au_wr_dir_args *args); - -struct dentry *au_pinned_h_parent(struct au_pin *pin); -void au_pin_init(struct au_pin *pin, struct dentry *dentry, - aufs_bindex_t bindex, int lsc_di, int lsc_hi, - unsigned int udba, unsigned char flags); -int au_pin(struct au_pin *pin, struct dentry *dentry, aufs_bindex_t bindex, - unsigned int udba, unsigned char flags) __must_check; -int au_do_pin(struct au_pin *pin) __must_check; -void au_unpin(struct au_pin *pin); -int au_reval_for_attr(struct dentry *dentry, unsigned int sigen); - -#define AuIcpup_DID_CPUP 1 -#define au_ftest_icpup(flags, name) ((flags) & AuIcpup_##name) -#define au_fset_icpup(flags, name) \ - do { (flags) |= AuIcpup_##name; } while (0) -#define au_fclr_icpup(flags, name) \ - do { (flags) &= ~AuIcpup_##name; } while (0) - -struct au_icpup_args { - unsigned char flags; - unsigned char pin_flags; - aufs_bindex_t btgt; - unsigned int udba; - struct au_pin pin; - struct path h_path; - struct inode *h_inode; -}; - -int au_pin_and_icpup(struct dentry *dentry, struct iattr *ia, - struct au_icpup_args *a); - -int au_h_path_getattr(struct dentry *dentry, int force, struct path *h_path); - -/* i_op_add.c */ -int au_may_add(struct dentry *dentry, aufs_bindex_t bindex, - struct dentry *h_parent, int isdir); -int aufs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, - dev_t dev); -int aufs_symlink(struct inode *dir, struct dentry *dentry, const char *symname); -int aufs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool want_excl); -struct vfsub_aopen_args; -int au_aopen_or_create(struct inode *dir, struct dentry *dentry, - struct vfsub_aopen_args *args); -int aufs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode); -int aufs_link(struct dentry *src_dentry, struct inode *dir, - struct dentry *dentry); -int aufs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); - -/* i_op_del.c */ -int au_wr_dir_need_wh(struct dentry *dentry, int isdir, aufs_bindex_t *bcpup); -int au_may_del(struct dentry *dentry, aufs_bindex_t bindex, - struct dentry *h_parent, int isdir); -int aufs_unlink(struct inode *dir, struct dentry *dentry); -int aufs_rmdir(struct inode *dir, struct dentry *dentry); - -/* i_op_ren.c */ -int au_wbr(struct dentry *dentry, aufs_bindex_t btgt); -int aufs_rename(struct inode *src_dir, struct dentry *src_dentry, - struct inode *dir, struct dentry *dentry); - -/* iinfo.c */ -struct inode *au_h_iptr(struct inode *inode, aufs_bindex_t bindex); -void au_hiput(struct au_hinode *hinode); -void au_set_hi_wh(struct inode *inode, aufs_bindex_t bindex, - struct dentry *h_wh); -unsigned int au_hi_flags(struct inode *inode, int isdir); - -/* hinode flags */ -#define AuHi_XINO 1 -#define AuHi_HNOTIFY (1 << 1) -#define au_ftest_hi(flags, name) ((flags) & AuHi_##name) -#define au_fset_hi(flags, name) \ - do { (flags) |= AuHi_##name; } while (0) -#define au_fclr_hi(flags, name) \ - do { (flags) &= ~AuHi_##name; } while (0) - -#ifndef CONFIG_AUFS_HNOTIFY -#undef AuHi_HNOTIFY -#define AuHi_HNOTIFY 0 -#endif - -void au_set_h_iptr(struct inode *inode, aufs_bindex_t bindex, - struct inode *h_inode, unsigned int flags); - -void au_update_iigen(struct inode *inode, int half); -void au_update_ibrange(struct inode *inode, int do_put_zero); - -void au_icntnr_init_once(void *_c); -int au_iinfo_init(struct inode *inode); -void au_iinfo_fin(struct inode *inode); -int au_ii_realloc(struct au_iinfo *iinfo, int nbr); - -#ifdef CONFIG_PROC_FS -/* plink.c */ -int au_plink_maint(struct super_block *sb, int flags); -struct au_sbinfo; -void au_plink_maint_leave(struct au_sbinfo *sbinfo); -int au_plink_maint_enter(struct super_block *sb); -#ifdef CONFIG_AUFS_DEBUG -void au_plink_list(struct super_block *sb); -#else -AuStubVoid(au_plink_list, struct super_block *sb) -#endif -int au_plink_test(struct inode *inode); -struct dentry *au_plink_lkup(struct inode *inode, aufs_bindex_t bindex); -void au_plink_append(struct inode *inode, aufs_bindex_t bindex, - struct dentry *h_dentry); -void au_plink_put(struct super_block *sb, int verbose); -void au_plink_clean(struct super_block *sb, int verbose); -void au_plink_half_refresh(struct super_block *sb, aufs_bindex_t br_id); -#else -AuStubInt0(au_plink_maint, struct super_block *sb, int flags); -AuStubVoid(au_plink_maint_leave, struct au_sbinfo *sbinfo); -AuStubInt0(au_plink_maint_enter, struct super_block *sb); -AuStubVoid(au_plink_list, struct super_block *sb); -AuStubInt0(au_plink_test, struct inode *inode); -AuStub(struct dentry *, au_plink_lkup, return NULL, - struct inode *inode, aufs_bindex_t bindex); -AuStubVoid(au_plink_append, struct inode *inode, aufs_bindex_t bindex, - struct dentry *h_dentry); -AuStubVoid(au_plink_put, struct super_block *sb, int verbose); -AuStubVoid(au_plink_clean, struct super_block *sb, int verbose); -AuStubVoid(au_plink_half_refresh, struct super_block *sb, aufs_bindex_t br_id); -#endif /* CONFIG_PROC_FS */ - -#ifdef CONFIG_AUFS_XATTR -/* xattr.c */ -int au_cpup_xattr(struct dentry *h_dst, struct dentry *h_src, int ignore_flags, - unsigned int verbose); -ssize_t aufs_listxattr(struct dentry *dentry, char *list, size_t size); -ssize_t aufs_getxattr(struct dentry *dentry, const char *name, void *value, - size_t size); -int aufs_setxattr(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags); -int aufs_removexattr(struct dentry *dentry, const char *name); - -/* void au_xattr_init(struct super_block *sb); */ -#else -AuStubInt0(au_cpup_xattr, struct dentry *h_dst, struct dentry *h_src, - int ignore_flags, unsigned int verbose); -/* AuStubVoid(au_xattr_init, struct super_block *sb); */ -#endif - -#ifdef CONFIG_FS_POSIX_ACL -struct posix_acl *aufs_get_acl(struct inode *inode, int type); -int aufs_set_acl(struct inode *inode, struct posix_acl *acl, int type); -#endif - -#if IS_ENABLED(CONFIG_AUFS_XATTR) || IS_ENABLED(CONFIG_FS_POSIX_ACL) -enum { - AU_XATTR_SET, - AU_XATTR_REMOVE, - AU_ACL_SET -}; - -struct au_srxattr { - int type; - union { - struct { - const char *name; - const void *value; - size_t size; - int flags; - } set; - struct { - const char *name; - } remove; - struct { - struct posix_acl *acl; - int type; - } acl_set; - } u; -}; -ssize_t au_srxattr(struct dentry *dentry, struct au_srxattr *arg); -#endif - -/* ---------------------------------------------------------------------- */ - -/* lock subclass for iinfo */ -enum { - AuLsc_II_CHILD, /* child first */ - AuLsc_II_CHILD2, /* rename(2), link(2), and cpup at hnotify */ - AuLsc_II_CHILD3, /* copyup dirs */ - AuLsc_II_PARENT, /* see AuLsc_I_PARENT in vfsub.h */ - AuLsc_II_PARENT2, - AuLsc_II_PARENT3, /* copyup dirs */ - AuLsc_II_NEW_CHILD -}; - -/* - * ii_read_lock_child, ii_write_lock_child, - * ii_read_lock_child2, ii_write_lock_child2, - * ii_read_lock_child3, ii_write_lock_child3, - * ii_read_lock_parent, ii_write_lock_parent, - * ii_read_lock_parent2, ii_write_lock_parent2, - * ii_read_lock_parent3, ii_write_lock_parent3, - * ii_read_lock_new_child, ii_write_lock_new_child, - */ -#define AuReadLockFunc(name, lsc) \ -static inline void ii_read_lock_##name(struct inode *i) \ -{ \ - au_rw_read_lock_nested(&au_ii(i)->ii_rwsem, AuLsc_II_##lsc); \ -} - -#define AuWriteLockFunc(name, lsc) \ -static inline void ii_write_lock_##name(struct inode *i) \ -{ \ - au_rw_write_lock_nested(&au_ii(i)->ii_rwsem, AuLsc_II_##lsc); \ -} - -#define AuRWLockFuncs(name, lsc) \ - AuReadLockFunc(name, lsc) \ - AuWriteLockFunc(name, lsc) - -AuRWLockFuncs(child, CHILD); -AuRWLockFuncs(child2, CHILD2); -AuRWLockFuncs(child3, CHILD3); -AuRWLockFuncs(parent, PARENT); -AuRWLockFuncs(parent2, PARENT2); -AuRWLockFuncs(parent3, PARENT3); -AuRWLockFuncs(new_child, NEW_CHILD); - -#undef AuReadLockFunc -#undef AuWriteLockFunc -#undef AuRWLockFuncs - -/* - * ii_read_unlock, ii_write_unlock, ii_downgrade_lock - */ -AuSimpleUnlockRwsemFuncs(ii, struct inode *i, &au_ii(i)->ii_rwsem); - -#define IiMustNoWaiters(i) AuRwMustNoWaiters(&au_ii(i)->ii_rwsem) -#define IiMustAnyLock(i) AuRwMustAnyLock(&au_ii(i)->ii_rwsem) -#define IiMustWriteLock(i) AuRwMustWriteLock(&au_ii(i)->ii_rwsem) - -/* ---------------------------------------------------------------------- */ - -static inline void au_icntnr_init(struct au_icntnr *c) -{ -#ifdef CONFIG_AUFS_DEBUG - c->vfs_inode.i_mode = 0; -#endif -} - -static inline unsigned int au_iigen(struct inode *inode, unsigned int *igflags) -{ - unsigned int gen; - struct au_iinfo *iinfo; - struct au_iigen *iigen; - - iinfo = au_ii(inode); - iigen = &iinfo->ii_generation; - spin_lock(&iigen->ig_spin); - if (igflags) - *igflags = iigen->ig_flags; - gen = iigen->ig_generation; - spin_unlock(&iigen->ig_spin); - - return gen; -} - -/* tiny test for inode number */ -/* tmpfs generation is too rough */ -static inline int au_test_higen(struct inode *inode, struct inode *h_inode) -{ - struct au_iinfo *iinfo; - - iinfo = au_ii(inode); - AuRwMustAnyLock(&iinfo->ii_rwsem); - return !(iinfo->ii_hsb1 == h_inode->i_sb - && iinfo->ii_higen == h_inode->i_generation); -} - -static inline void au_iigen_dec(struct inode *inode) -{ - struct au_iinfo *iinfo; - struct au_iigen *iigen; - - iinfo = au_ii(inode); - iigen = &iinfo->ii_generation; - spin_lock(&iigen->ig_spin); - iigen->ig_generation--; - spin_unlock(&iigen->ig_spin); -} - -static inline int au_iigen_test(struct inode *inode, unsigned int sigen) -{ - int err; - - err = 0; - if (unlikely(inode && au_iigen(inode, NULL) != sigen)) - err = -EIO; - - return err; -} - -/* ---------------------------------------------------------------------- */ - -static inline aufs_bindex_t au_ii_br_id(struct inode *inode, - aufs_bindex_t bindex) -{ - IiMustAnyLock(inode); - return au_ii(inode)->ii_hinode[0 + bindex].hi_id; -} - -static inline aufs_bindex_t au_ibstart(struct inode *inode) -{ - IiMustAnyLock(inode); - return au_ii(inode)->ii_bstart; -} - -static inline aufs_bindex_t au_ibend(struct inode *inode) -{ - IiMustAnyLock(inode); - return au_ii(inode)->ii_bend; -} - -static inline struct au_vdir *au_ivdir(struct inode *inode) -{ - IiMustAnyLock(inode); - return au_ii(inode)->ii_vdir; -} - -static inline struct dentry *au_hi_wh(struct inode *inode, aufs_bindex_t bindex) -{ - IiMustAnyLock(inode); - return au_ii(inode)->ii_hinode[0 + bindex].hi_whdentry; -} - -static inline void au_set_ibstart(struct inode *inode, aufs_bindex_t bindex) -{ - IiMustWriteLock(inode); - au_ii(inode)->ii_bstart = bindex; -} - -static inline void au_set_ibend(struct inode *inode, aufs_bindex_t bindex) -{ - IiMustWriteLock(inode); - au_ii(inode)->ii_bend = bindex; -} - -static inline void au_set_ivdir(struct inode *inode, struct au_vdir *vdir) -{ - IiMustWriteLock(inode); - au_ii(inode)->ii_vdir = vdir; -} - -static inline struct au_hinode *au_hi(struct inode *inode, aufs_bindex_t bindex) -{ - IiMustAnyLock(inode); - return au_ii(inode)->ii_hinode + bindex; -} - -/* ---------------------------------------------------------------------- */ - -static inline struct dentry *au_pinned_parent(struct au_pin *pin) -{ - if (pin) - return pin->parent; - return NULL; -} - -static inline struct inode *au_pinned_h_dir(struct au_pin *pin) -{ - if (pin && pin->hdir) - return pin->hdir->hi_inode; - return NULL; -} - -static inline struct au_hinode *au_pinned_hdir(struct au_pin *pin) -{ - if (pin) - return pin->hdir; - return NULL; -} - -static inline void au_pin_set_dentry(struct au_pin *pin, struct dentry *dentry) -{ - if (pin) - pin->dentry = dentry; -} - -static inline void au_pin_set_parent_lflag(struct au_pin *pin, - unsigned char lflag) -{ - if (pin) { - if (lflag) - au_fset_pin(pin->flags, DI_LOCKED); - else - au_fclr_pin(pin->flags, DI_LOCKED); - } -} - -#if 0 /* reserved */ -static inline void au_pin_set_parent(struct au_pin *pin, struct dentry *parent) -{ - if (pin) { - dput(pin->parent); - pin->parent = dget(parent); - } -} -#endif - -/* ---------------------------------------------------------------------- */ - -struct au_branch; -#ifdef CONFIG_AUFS_HNOTIFY -struct au_hnotify_op { - void (*ctl)(struct au_hinode *hinode, int do_set); - int (*alloc)(struct au_hinode *hinode); - - /* - * if it returns true, the the caller should free hinode->hi_notify, - * otherwise ->free() frees it. - */ - int (*free)(struct au_hinode *hinode, - struct au_hnotify *hn) __must_check; - - void (*fin)(void); - int (*init)(void); - - int (*reset_br)(unsigned int udba, struct au_branch *br, int perm); - void (*fin_br)(struct au_branch *br); - int (*init_br)(struct au_branch *br, int perm); -}; - -/* hnotify.c */ -int au_hn_alloc(struct au_hinode *hinode, struct inode *inode); -void au_hn_free(struct au_hinode *hinode); -void au_hn_ctl(struct au_hinode *hinode, int do_set); -void au_hn_reset(struct inode *inode, unsigned int flags); -int au_hnotify(struct inode *h_dir, struct au_hnotify *hnotify, u32 mask, - struct qstr *h_child_qstr, struct inode *h_child_inode); -int au_hnotify_reset_br(unsigned int udba, struct au_branch *br, int perm); -int au_hnotify_init_br(struct au_branch *br, int perm); -void au_hnotify_fin_br(struct au_branch *br); -int __init au_hnotify_init(void); -void au_hnotify_fin(void); - -/* hfsnotify.c */ -extern const struct au_hnotify_op au_hnotify_op; - -static inline -void au_hn_init(struct au_hinode *hinode) -{ - hinode->hi_notify = NULL; -} - -static inline struct au_hnotify *au_hn(struct au_hinode *hinode) -{ - return hinode->hi_notify; -} - -#else -AuStub(int, au_hn_alloc, return -EOPNOTSUPP, - struct au_hinode *hinode __maybe_unused, - struct inode *inode __maybe_unused) -AuStub(struct au_hnotify *, au_hn, return NULL, struct au_hinode *hinode) -AuStubVoid(au_hn_free, struct au_hinode *hinode __maybe_unused) -AuStubVoid(au_hn_ctl, struct au_hinode *hinode __maybe_unused, - int do_set __maybe_unused) -AuStubVoid(au_hn_reset, struct inode *inode __maybe_unused, - unsigned int flags __maybe_unused) -AuStubInt0(au_hnotify_reset_br, unsigned int udba __maybe_unused, - struct au_branch *br __maybe_unused, - int perm __maybe_unused) -AuStubInt0(au_hnotify_init_br, struct au_branch *br __maybe_unused, - int perm __maybe_unused) -AuStubVoid(au_hnotify_fin_br, struct au_branch *br __maybe_unused) -AuStubInt0(__init au_hnotify_init, void) -AuStubVoid(au_hnotify_fin, void) -AuStubVoid(au_hn_init, struct au_hinode *hinode __maybe_unused) -#endif /* CONFIG_AUFS_HNOTIFY */ - -static inline void au_hn_suspend(struct au_hinode *hdir) -{ - au_hn_ctl(hdir, /*do_set*/0); -} - -static inline void au_hn_resume(struct au_hinode *hdir) -{ - au_hn_ctl(hdir, /*do_set*/1); -} - -static inline void au_hn_imtx_lock(struct au_hinode *hdir) -{ - mutex_lock(&hdir->hi_inode->i_mutex); - au_hn_suspend(hdir); -} - -static inline void au_hn_imtx_lock_nested(struct au_hinode *hdir, - unsigned int sc __maybe_unused) -{ - mutex_lock_nested(&hdir->hi_inode->i_mutex, sc); - au_hn_suspend(hdir); -} - -static inline void au_hn_imtx_unlock(struct au_hinode *hdir) -{ - au_hn_resume(hdir); - mutex_unlock(&hdir->hi_inode->i_mutex); -} - -#endif /* __KERNEL__ */ -#endif /* __AUFS_INODE_H__ */ diff --git a/fs/aufs/ioctl.c b/fs/aufs/ioctl.c deleted file mode 100644 index 6528fb911..000000000 --- a/fs/aufs/ioctl.c +++ /dev/null @@ -1,206 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * ioctl - * plink-management and readdir in userspace. - * assist the pathconf(3) wrapper library. - * move-down - * File-based Hierarchical Storage Management. - */ - -#include <linux/compat.h> -#include <linux/file.h> -#include "aufs.h" - -static int au_wbr_fd(struct path *path, struct aufs_wbr_fd __user *arg) -{ - int err, fd; - aufs_bindex_t wbi, bindex, bend; - struct file *h_file; - struct super_block *sb; - struct dentry *root; - struct au_branch *br; - struct aufs_wbr_fd wbrfd = { - .oflags = au_dir_roflags, - .brid = -1 - }; - const int valid = O_RDONLY | O_NONBLOCK | O_LARGEFILE | O_DIRECTORY - | O_NOATIME | O_CLOEXEC; - - AuDebugOn(wbrfd.oflags & ~valid); - - if (arg) { - err = copy_from_user(&wbrfd, arg, sizeof(wbrfd)); - if (unlikely(err)) { - err = -EFAULT; - goto out; - } - - err = -EINVAL; - AuDbg("wbrfd{0%o, %d}\n", wbrfd.oflags, wbrfd.brid); - wbrfd.oflags |= au_dir_roflags; - AuDbg("0%o\n", wbrfd.oflags); - if (unlikely(wbrfd.oflags & ~valid)) - goto out; - } - - fd = get_unused_fd_flags(0); - err = fd; - if (unlikely(fd < 0)) - goto out; - - h_file = ERR_PTR(-EINVAL); - wbi = 0; - br = NULL; - sb = path->dentry->d_sb; - root = sb->s_root; - aufs_read_lock(root, AuLock_IR); - bend = au_sbend(sb); - if (wbrfd.brid >= 0) { - wbi = au_br_index(sb, wbrfd.brid); - if (unlikely(wbi < 0 || wbi > bend)) - goto out_unlock; - } - - h_file = ERR_PTR(-ENOENT); - br = au_sbr(sb, wbi); - if (!au_br_writable(br->br_perm)) { - if (arg) - goto out_unlock; - - bindex = wbi + 1; - wbi = -1; - for (; bindex <= bend; bindex++) { - br = au_sbr(sb, bindex); - if (au_br_writable(br->br_perm)) { - wbi = bindex; - br = au_sbr(sb, wbi); - break; - } - } - } - AuDbg("wbi %d\n", wbi); - if (wbi >= 0) - h_file = au_h_open(root, wbi, wbrfd.oflags, NULL, - /*force_wr*/0); - -out_unlock: - aufs_read_unlock(root, AuLock_IR); - err = PTR_ERR(h_file); - if (IS_ERR(h_file)) - goto out_fd; - - atomic_dec(&br->br_count); /* cf. au_h_open() */ - fd_install(fd, h_file); - err = fd; - goto out; /* success */ - -out_fd: - put_unused_fd(fd); -out: - AuTraceErr(err); - return err; -} - -/* ---------------------------------------------------------------------- */ - -long aufs_ioctl_dir(struct file *file, unsigned int cmd, unsigned long arg) -{ - long err; - struct dentry *dentry; - - switch (cmd) { - case AUFS_CTL_RDU: - case AUFS_CTL_RDU_INO: - err = au_rdu_ioctl(file, cmd, arg); - break; - - case AUFS_CTL_WBR_FD: - err = au_wbr_fd(&file->f_path, (void __user *)arg); - break; - - case AUFS_CTL_IBUSY: - err = au_ibusy_ioctl(file, arg); - break; - - case AUFS_CTL_BRINFO: - err = au_brinfo_ioctl(file, arg); - break; - - case AUFS_CTL_FHSM_FD: - dentry = file->f_path.dentry; - if (IS_ROOT(dentry)) - err = au_fhsm_fd(dentry->d_sb, arg); - else - err = -ENOTTY; - break; - - default: - /* do not call the lower */ - AuDbg("0x%x\n", cmd); - err = -ENOTTY; - } - - AuTraceErr(err); - return err; -} - -long aufs_ioctl_nondir(struct file *file, unsigned int cmd, unsigned long arg) -{ - long err; - - switch (cmd) { - case AUFS_CTL_MVDOWN: - err = au_mvdown(file->f_path.dentry, (void __user *)arg); - break; - - case AUFS_CTL_WBR_FD: - err = au_wbr_fd(&file->f_path, (void __user *)arg); - break; - - default: - /* do not call the lower */ - AuDbg("0x%x\n", cmd); - err = -ENOTTY; - } - - AuTraceErr(err); - return err; -} - -#ifdef CONFIG_COMPAT -long aufs_compat_ioctl_dir(struct file *file, unsigned int cmd, - unsigned long arg) -{ - long err; - - switch (cmd) { - case AUFS_CTL_RDU: - case AUFS_CTL_RDU_INO: - err = au_rdu_compat_ioctl(file, cmd, arg); - break; - - case AUFS_CTL_IBUSY: - err = au_ibusy_compat_ioctl(file, arg); - break; - - case AUFS_CTL_BRINFO: - err = au_brinfo_compat_ioctl(file, arg); - break; - - default: - err = aufs_ioctl_dir(file, cmd, arg); - } - - AuTraceErr(err); - return err; -} - -long aufs_compat_ioctl_nondir(struct file *file, unsigned int cmd, - unsigned long arg) -{ - return aufs_ioctl_nondir(file, cmd, (unsigned long)compat_ptr(arg)); -} -#endif diff --git a/fs/aufs/loop.c b/fs/aufs/loop.c deleted file mode 100644 index 5711e7a2f..000000000 --- a/fs/aufs/loop.c +++ /dev/null @@ -1,133 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * support for loopback block device as a branch - */ - -#include "aufs.h" - -/* added into drivers/block/loop.c */ -static struct file *(*backing_file_func)(struct super_block *sb); - -/* - * test if two lower dentries have overlapping branches. - */ -int au_test_loopback_overlap(struct super_block *sb, struct dentry *h_adding) -{ - struct super_block *h_sb; - struct file *backing_file; - - if (unlikely(!backing_file_func)) { - /* don't load "loop" module here */ - backing_file_func = symbol_get(loop_backing_file); - if (unlikely(!backing_file_func)) - /* "loop" module is not loaded */ - return 0; - } - - h_sb = h_adding->d_sb; - backing_file = backing_file_func(h_sb); - if (!backing_file) - return 0; - - h_adding = backing_file->f_path.dentry; - /* - * h_adding can be local NFS. - * in this case aufs cannot detect the loop. - */ - if (unlikely(h_adding->d_sb == sb)) - return 1; - return !!au_test_subdir(h_adding, sb->s_root); -} - -/* true if a kernel thread named 'loop[0-9].*' accesses a file */ -int au_test_loopback_kthread(void) -{ - int ret; - struct task_struct *tsk = current; - char c, comm[sizeof(tsk->comm)]; - - ret = 0; - if (tsk->flags & PF_KTHREAD) { - get_task_comm(comm, tsk); - c = comm[4]; - ret = ('0' <= c && c <= '9' - && !strncmp(comm, "loop", 4)); - } - - return ret; -} - -/* ---------------------------------------------------------------------- */ - -#define au_warn_loopback_step 16 -static int au_warn_loopback_nelem = au_warn_loopback_step; -static unsigned long *au_warn_loopback_array; - -void au_warn_loopback(struct super_block *h_sb) -{ - int i, new_nelem; - unsigned long *a, magic; - static DEFINE_SPINLOCK(spin); - - magic = h_sb->s_magic; - spin_lock(&spin); - a = au_warn_loopback_array; - for (i = 0; i < au_warn_loopback_nelem && *a; i++) - if (a[i] == magic) { - spin_unlock(&spin); - return; - } - - /* h_sb is new to us, print it */ - if (i < au_warn_loopback_nelem) { - a[i] = magic; - goto pr; - } - - /* expand the array */ - new_nelem = au_warn_loopback_nelem + au_warn_loopback_step; - a = au_kzrealloc(au_warn_loopback_array, - au_warn_loopback_nelem * sizeof(unsigned long), - new_nelem * sizeof(unsigned long), GFP_ATOMIC); - if (a) { - au_warn_loopback_nelem = new_nelem; - au_warn_loopback_array = a; - a[i] = magic; - goto pr; - } - - spin_unlock(&spin); - AuWarn1("realloc failed, ignored\n"); - return; - -pr: - spin_unlock(&spin); - pr_warn("you may want to try another patch for loopback file " - "on %s(0x%lx) branch\n", au_sbtype(h_sb), magic); -} - -int au_loopback_init(void) -{ - int err; - struct super_block *sb __maybe_unused; - - BUILD_BUG_ON(sizeof(sb->s_magic) != sizeof(unsigned long)); - - err = 0; - au_warn_loopback_array = kcalloc(au_warn_loopback_step, - sizeof(unsigned long), GFP_NOFS); - if (unlikely(!au_warn_loopback_array)) - err = -ENOMEM; - - return err; -} - -void au_loopback_fin(void) -{ - if (backing_file_func) - symbol_put(loop_backing_file); - kfree(au_warn_loopback_array); -} diff --git a/fs/aufs/loop.h b/fs/aufs/loop.h deleted file mode 100644 index 48bf070e8..000000000 --- a/fs/aufs/loop.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * support for loopback mount as a branch - */ - -#ifndef __AUFS_LOOP_H__ -#define __AUFS_LOOP_H__ - -#ifdef __KERNEL__ - -struct dentry; -struct super_block; - -#ifdef CONFIG_AUFS_BDEV_LOOP -/* drivers/block/loop.c */ -struct file *loop_backing_file(struct super_block *sb); - -/* loop.c */ -int au_test_loopback_overlap(struct super_block *sb, struct dentry *h_adding); -int au_test_loopback_kthread(void); -void au_warn_loopback(struct super_block *h_sb); - -int au_loopback_init(void); -void au_loopback_fin(void); -#else -AuStubInt0(au_test_loopback_overlap, struct super_block *sb, - struct dentry *h_adding) -AuStubInt0(au_test_loopback_kthread, void) -AuStubVoid(au_warn_loopback, struct super_block *h_sb) - -AuStubInt0(au_loopback_init, void) -AuStubVoid(au_loopback_fin, void) -#endif /* BLK_DEV_LOOP */ - -#endif /* __KERNEL__ */ -#endif /* __AUFS_LOOP_H__ */ diff --git a/fs/aufs/magic.mk b/fs/aufs/magic.mk deleted file mode 100644 index 4f83bdf1d..000000000 --- a/fs/aufs/magic.mk +++ /dev/null @@ -1,30 +0,0 @@ - -# defined in ${srctree}/fs/fuse/inode.c -# tristate -ifdef CONFIG_FUSE_FS -ccflags-y += -DFUSE_SUPER_MAGIC=0x65735546 -endif - -# defined in ${srctree}/fs/xfs/xfs_sb.h -# tristate -ifdef CONFIG_XFS_FS -ccflags-y += -DXFS_SB_MAGIC=0x58465342 -endif - -# defined in ${srctree}/fs/configfs/mount.c -# tristate -ifdef CONFIG_CONFIGFS_FS -ccflags-y += -DCONFIGFS_MAGIC=0x62656570 -endif - -# defined in ${srctree}/fs/ubifs/ubifs.h -# tristate -ifdef CONFIG_UBIFS_FS -ccflags-y += -DUBIFS_SUPER_MAGIC=0x24051905 -endif - -# defined in ${srctree}/fs/hfsplus/hfsplus_raw.h -# tristate -ifdef CONFIG_HFSPLUS_FS -ccflags-y += -DHFSPLUS_SUPER_MAGIC=0x482b -endif diff --git a/fs/aufs/module.c b/fs/aufs/module.c deleted file mode 100644 index 8a28377c5..000000000 --- a/fs/aufs/module.c +++ /dev/null @@ -1,207 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * module global variables and operations - */ - -#include <linux/module.h> -#include <linux/seq_file.h> -#include "aufs.h" - -void *au_kzrealloc(void *p, unsigned int nused, unsigned int new_sz, gfp_t gfp) -{ - if (new_sz <= nused) - return p; - - p = krealloc(p, new_sz, gfp); - if (p) - memset(p + nused, 0, new_sz - nused); - return p; -} - -/* ---------------------------------------------------------------------- */ - -/* - * aufs caches - */ -struct kmem_cache *au_cachep[AuCache_Last]; -static int __init au_cache_init(void) -{ - au_cachep[AuCache_DINFO] = AuCacheCtor(au_dinfo, au_di_init_once); - if (au_cachep[AuCache_DINFO]) - /* SLAB_DESTROY_BY_RCU */ - au_cachep[AuCache_ICNTNR] = AuCacheCtor(au_icntnr, - au_icntnr_init_once); - if (au_cachep[AuCache_ICNTNR]) - au_cachep[AuCache_FINFO] = AuCacheCtor(au_finfo, - au_fi_init_once); - if (au_cachep[AuCache_FINFO]) - au_cachep[AuCache_VDIR] = AuCache(au_vdir); - if (au_cachep[AuCache_VDIR]) - au_cachep[AuCache_DEHSTR] = AuCache(au_vdir_dehstr); - if (au_cachep[AuCache_DEHSTR]) - return 0; - - return -ENOMEM; -} - -static void au_cache_fin(void) -{ - int i; - - /* - * Make sure all delayed rcu free inodes are flushed before we - * destroy cache. - */ - rcu_barrier(); - - /* excluding AuCache_HNOTIFY */ - BUILD_BUG_ON(AuCache_HNOTIFY + 1 != AuCache_Last); - for (i = 0; i < AuCache_HNOTIFY; i++) { - kmem_cache_destroy(au_cachep[i]); - au_cachep[i] = NULL; - } -} - -/* ---------------------------------------------------------------------- */ - -int au_dir_roflags; - -#ifdef CONFIG_AUFS_SBILIST -/* - * iterate_supers_type() doesn't protect us from - * remounting (branch management) - */ -struct au_splhead au_sbilist; -#endif - -struct lock_class_key au_lc_key[AuLcKey_Last]; - -/* - * functions for module interface. - */ -MODULE_LICENSE("GPL"); -/* MODULE_LICENSE("GPL v2"); */ -MODULE_AUTHOR("Junjiro R. Okajima <aufs-users@lists.sourceforge.net>"); -MODULE_DESCRIPTION(AUFS_NAME - " -- Advanced multi layered unification filesystem"); -MODULE_VERSION(AUFS_VERSION); - -/* this module parameter has no meaning when SYSFS is disabled */ -int sysaufs_brs = 1; -MODULE_PARM_DESC(brs, "use <sysfs>/fs/aufs/si_*/brN"); -module_param_named(brs, sysaufs_brs, int, S_IRUGO); - -/* this module parameter has no meaning when USER_NS is disabled */ -bool au_userns; -MODULE_PARM_DESC(allow_userns, "allow unprivileged to mount under userns"); -module_param_named(allow_userns, au_userns, bool, S_IRUGO); - -/* ---------------------------------------------------------------------- */ - -static char au_esc_chars[0x20 + 3]; /* 0x01-0x20, backslash, del, and NULL */ - -int au_seq_path(struct seq_file *seq, struct path *path) -{ - int err; - - err = seq_path(seq, path, au_esc_chars); - if (err > 0) - err = 0; - else if (err < 0) - err = -ENOMEM; - - return err; -} - -/* ---------------------------------------------------------------------- */ - -static int __init aufs_init(void) -{ - int err, i; - char *p; - - p = au_esc_chars; - for (i = 1; i <= ' '; i++) - *p++ = i; - *p++ = '\\'; - *p++ = '\x7f'; - *p = 0; - - au_dir_roflags = au_file_roflags(O_DIRECTORY | O_LARGEFILE); - - memcpy(aufs_iop_nogetattr, aufs_iop, sizeof(aufs_iop)); - for (i = 0; i < AuIop_Last; i++) - aufs_iop_nogetattr[i].getattr = NULL; - - au_sbilist_init(); - sysaufs_brs_init(); - au_debug_init(); - au_dy_init(); - err = sysaufs_init(); - if (unlikely(err)) - goto out; - err = au_procfs_init(); - if (unlikely(err)) - goto out_sysaufs; - err = au_wkq_init(); - if (unlikely(err)) - goto out_procfs; - err = au_loopback_init(); - if (unlikely(err)) - goto out_wkq; - err = au_hnotify_init(); - if (unlikely(err)) - goto out_loopback; - err = au_sysrq_init(); - if (unlikely(err)) - goto out_hin; - err = au_cache_init(); - if (unlikely(err)) - goto out_sysrq; - - aufs_fs_type.fs_flags |= au_userns ? FS_USERNS_MOUNT : 0; - err = register_filesystem(&aufs_fs_type); - if (unlikely(err)) - goto out_cache; - - /* since we define pr_fmt, call printk directly */ - printk(KERN_INFO AUFS_NAME " " AUFS_VERSION "\n"); - goto out; /* success */ - -out_cache: - au_cache_fin(); -out_sysrq: - au_sysrq_fin(); -out_hin: - au_hnotify_fin(); -out_loopback: - au_loopback_fin(); -out_wkq: - au_wkq_fin(); -out_procfs: - au_procfs_fin(); -out_sysaufs: - sysaufs_fin(); - au_dy_fin(); -out: - return err; -} - -static void __exit aufs_exit(void) -{ - unregister_filesystem(&aufs_fs_type); - au_cache_fin(); - au_sysrq_fin(); - au_hnotify_fin(); - au_loopback_fin(); - au_wkq_fin(); - au_procfs_fin(); - sysaufs_fin(); - au_dy_fin(); -} - -module_init(aufs_init); -module_exit(aufs_exit); diff --git a/fs/aufs/module.h b/fs/aufs/module.h deleted file mode 100644 index bb8644730..000000000 --- a/fs/aufs/module.h +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * module initialization and module-global - */ - -#ifndef __AUFS_MODULE_H__ -#define __AUFS_MODULE_H__ - -#ifdef __KERNEL__ - -#include <linux/slab.h> - -struct path; -struct seq_file; - -/* module parameters */ -extern int sysaufs_brs; -extern bool au_userns; - -/* ---------------------------------------------------------------------- */ - -extern int au_dir_roflags; - -enum { - AuLcNonDir_FIINFO, - AuLcNonDir_DIINFO, - AuLcNonDir_IIINFO, - - AuLcDir_FIINFO, - AuLcDir_DIINFO, - AuLcDir_IIINFO, - - AuLcSymlink_DIINFO, - AuLcSymlink_IIINFO, - - AuLcKey_Last -}; -extern struct lock_class_key au_lc_key[AuLcKey_Last]; - -void *au_kzrealloc(void *p, unsigned int nused, unsigned int new_sz, gfp_t gfp); -int au_seq_path(struct seq_file *seq, struct path *path); - -#ifdef CONFIG_PROC_FS -/* procfs.c */ -int __init au_procfs_init(void); -void au_procfs_fin(void); -#else -AuStubInt0(au_procfs_init, void); -AuStubVoid(au_procfs_fin, void); -#endif - -/* ---------------------------------------------------------------------- */ - -/* kmem cache */ -enum { - AuCache_DINFO, - AuCache_ICNTNR, - AuCache_FINFO, - AuCache_VDIR, - AuCache_DEHSTR, - AuCache_HNOTIFY, /* must be last */ - AuCache_Last -}; - -#define AuCacheFlags (SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD) -#define AuCache(type) KMEM_CACHE(type, AuCacheFlags) -#define AuCacheCtor(type, ctor) \ - kmem_cache_create(#type, sizeof(struct type), \ - __alignof__(struct type), AuCacheFlags, ctor) - -extern struct kmem_cache *au_cachep[]; - -#define AuCacheFuncs(name, index) \ -static inline struct au_##name *au_cache_alloc_##name(void) \ -{ return kmem_cache_alloc(au_cachep[AuCache_##index], GFP_NOFS); } \ -static inline void au_cache_free_##name(struct au_##name *p) \ -{ kmem_cache_free(au_cachep[AuCache_##index], p); } - -AuCacheFuncs(dinfo, DINFO); -AuCacheFuncs(icntnr, ICNTNR); -AuCacheFuncs(finfo, FINFO); -AuCacheFuncs(vdir, VDIR); -AuCacheFuncs(vdir_dehstr, DEHSTR); -#ifdef CONFIG_AUFS_HNOTIFY -AuCacheFuncs(hnotify, HNOTIFY); -#endif - -#endif /* __KERNEL__ */ -#endif /* __AUFS_MODULE_H__ */ diff --git a/fs/aufs/mvdown.c b/fs/aufs/mvdown.c deleted file mode 100644 index 1f2224f6d..000000000 --- a/fs/aufs/mvdown.c +++ /dev/null @@ -1,690 +0,0 @@ -/* - * Copyright (C) 2011-2016 Junjiro R. Okajima - */ - -/* - * move-down, opposite of copy-up - */ - -#include "aufs.h" - -struct au_mvd_args { - struct { - struct super_block *h_sb; - struct dentry *h_parent; - struct au_hinode *hdir; - struct inode *h_dir, *h_inode; - struct au_pin pin; - } info[AUFS_MVDOWN_NARRAY]; - - struct aufs_mvdown mvdown; - struct dentry *dentry, *parent; - struct inode *inode, *dir; - struct super_block *sb; - aufs_bindex_t bopq, bwh, bfound; - unsigned char rename_lock; -}; - -#define mvd_errno mvdown.au_errno -#define mvd_bsrc mvdown.stbr[AUFS_MVDOWN_UPPER].bindex -#define mvd_src_brid mvdown.stbr[AUFS_MVDOWN_UPPER].brid -#define mvd_bdst mvdown.stbr[AUFS_MVDOWN_LOWER].bindex -#define mvd_dst_brid mvdown.stbr[AUFS_MVDOWN_LOWER].brid - -#define mvd_h_src_sb info[AUFS_MVDOWN_UPPER].h_sb -#define mvd_h_src_parent info[AUFS_MVDOWN_UPPER].h_parent -#define mvd_hdir_src info[AUFS_MVDOWN_UPPER].hdir -#define mvd_h_src_dir info[AUFS_MVDOWN_UPPER].h_dir -#define mvd_h_src_inode info[AUFS_MVDOWN_UPPER].h_inode -#define mvd_pin_src info[AUFS_MVDOWN_UPPER].pin - -#define mvd_h_dst_sb info[AUFS_MVDOWN_LOWER].h_sb -#define mvd_h_dst_parent info[AUFS_MVDOWN_LOWER].h_parent -#define mvd_hdir_dst info[AUFS_MVDOWN_LOWER].hdir -#define mvd_h_dst_dir info[AUFS_MVDOWN_LOWER].h_dir -#define mvd_h_dst_inode info[AUFS_MVDOWN_LOWER].h_inode -#define mvd_pin_dst info[AUFS_MVDOWN_LOWER].pin - -#define AU_MVD_PR(flag, ...) do { \ - if (flag) \ - pr_err(__VA_ARGS__); \ - } while (0) - -static int find_lower_writable(struct au_mvd_args *a) -{ - struct super_block *sb; - aufs_bindex_t bindex, bend; - struct au_branch *br; - - sb = a->sb; - bindex = a->mvd_bsrc; - bend = au_sbend(sb); - if (a->mvdown.flags & AUFS_MVDOWN_FHSM_LOWER) - for (bindex++; bindex <= bend; bindex++) { - br = au_sbr(sb, bindex); - if (au_br_fhsm(br->br_perm) - && (!(au_br_sb(br)->s_flags & MS_RDONLY))) - return bindex; - } - else if (!(a->mvdown.flags & AUFS_MVDOWN_ROLOWER)) - for (bindex++; bindex <= bend; bindex++) { - br = au_sbr(sb, bindex); - if (!au_br_rdonly(br)) - return bindex; - } - else - for (bindex++; bindex <= bend; bindex++) { - br = au_sbr(sb, bindex); - if (!(au_br_sb(br)->s_flags & MS_RDONLY)) { - if (au_br_rdonly(br)) - a->mvdown.flags - |= AUFS_MVDOWN_ROLOWER_R; - return bindex; - } - } - - return -1; -} - -/* make the parent dir on bdst */ -static int au_do_mkdir(const unsigned char dmsg, struct au_mvd_args *a) -{ - int err; - - err = 0; - a->mvd_hdir_src = au_hi(a->dir, a->mvd_bsrc); - a->mvd_hdir_dst = au_hi(a->dir, a->mvd_bdst); - a->mvd_h_src_parent = au_h_dptr(a->parent, a->mvd_bsrc); - a->mvd_h_dst_parent = NULL; - if (au_dbend(a->parent) >= a->mvd_bdst) - a->mvd_h_dst_parent = au_h_dptr(a->parent, a->mvd_bdst); - if (!a->mvd_h_dst_parent) { - err = au_cpdown_dirs(a->dentry, a->mvd_bdst); - if (unlikely(err)) { - AU_MVD_PR(dmsg, "cpdown_dirs failed\n"); - goto out; - } - a->mvd_h_dst_parent = au_h_dptr(a->parent, a->mvd_bdst); - } - -out: - AuTraceErr(err); - return err; -} - -/* lock them all */ -static int au_do_lock(const unsigned char dmsg, struct au_mvd_args *a) -{ - int err; - struct dentry *h_trap; - - a->mvd_h_src_sb = au_sbr_sb(a->sb, a->mvd_bsrc); - a->mvd_h_dst_sb = au_sbr_sb(a->sb, a->mvd_bdst); - err = au_pin(&a->mvd_pin_dst, a->dentry, a->mvd_bdst, - au_opt_udba(a->sb), - AuPin_MNT_WRITE | AuPin_DI_LOCKED); - AuTraceErr(err); - if (unlikely(err)) { - AU_MVD_PR(dmsg, "pin_dst failed\n"); - goto out; - } - - if (a->mvd_h_src_sb != a->mvd_h_dst_sb) { - a->rename_lock = 0; - au_pin_init(&a->mvd_pin_src, a->dentry, a->mvd_bsrc, - AuLsc_DI_PARENT, AuLsc_I_PARENT3, - au_opt_udba(a->sb), - AuPin_MNT_WRITE | AuPin_DI_LOCKED); - err = au_do_pin(&a->mvd_pin_src); - AuTraceErr(err); - a->mvd_h_src_dir = d_inode(a->mvd_h_src_parent); - if (unlikely(err)) { - AU_MVD_PR(dmsg, "pin_src failed\n"); - goto out_dst; - } - goto out; /* success */ - } - - a->rename_lock = 1; - au_pin_hdir_unlock(&a->mvd_pin_dst); - err = au_pin(&a->mvd_pin_src, a->dentry, a->mvd_bsrc, - au_opt_udba(a->sb), - AuPin_MNT_WRITE | AuPin_DI_LOCKED); - AuTraceErr(err); - a->mvd_h_src_dir = d_inode(a->mvd_h_src_parent); - if (unlikely(err)) { - AU_MVD_PR(dmsg, "pin_src failed\n"); - au_pin_hdir_lock(&a->mvd_pin_dst); - goto out_dst; - } - au_pin_hdir_unlock(&a->mvd_pin_src); - h_trap = vfsub_lock_rename(a->mvd_h_src_parent, a->mvd_hdir_src, - a->mvd_h_dst_parent, a->mvd_hdir_dst); - if (h_trap) { - err = (h_trap != a->mvd_h_src_parent); - if (err) - err = (h_trap != a->mvd_h_dst_parent); - } - BUG_ON(err); /* it should never happen */ - if (unlikely(a->mvd_h_src_dir != au_pinned_h_dir(&a->mvd_pin_src))) { - err = -EBUSY; - AuTraceErr(err); - vfsub_unlock_rename(a->mvd_h_src_parent, a->mvd_hdir_src, - a->mvd_h_dst_parent, a->mvd_hdir_dst); - au_pin_hdir_lock(&a->mvd_pin_src); - au_unpin(&a->mvd_pin_src); - au_pin_hdir_lock(&a->mvd_pin_dst); - goto out_dst; - } - goto out; /* success */ - -out_dst: - au_unpin(&a->mvd_pin_dst); -out: - AuTraceErr(err); - return err; -} - -static void au_do_unlock(const unsigned char dmsg, struct au_mvd_args *a) -{ - if (!a->rename_lock) - au_unpin(&a->mvd_pin_src); - else { - vfsub_unlock_rename(a->mvd_h_src_parent, a->mvd_hdir_src, - a->mvd_h_dst_parent, a->mvd_hdir_dst); - au_pin_hdir_lock(&a->mvd_pin_src); - au_unpin(&a->mvd_pin_src); - au_pin_hdir_lock(&a->mvd_pin_dst); - } - au_unpin(&a->mvd_pin_dst); -} - -/* copy-down the file */ -static int au_do_cpdown(const unsigned char dmsg, struct au_mvd_args *a) -{ - int err; - struct au_cp_generic cpg = { - .dentry = a->dentry, - .bdst = a->mvd_bdst, - .bsrc = a->mvd_bsrc, - .len = -1, - .pin = &a->mvd_pin_dst, - .flags = AuCpup_DTIME | AuCpup_HOPEN - }; - - AuDbg("b%d, b%d\n", cpg.bsrc, cpg.bdst); - if (a->mvdown.flags & AUFS_MVDOWN_OWLOWER) - au_fset_cpup(cpg.flags, OVERWRITE); - if (a->mvdown.flags & AUFS_MVDOWN_ROLOWER) - au_fset_cpup(cpg.flags, RWDST); - err = au_sio_cpdown_simple(&cpg); - if (unlikely(err)) - AU_MVD_PR(dmsg, "cpdown failed\n"); - - AuTraceErr(err); - return err; -} - -/* - * unlink the whiteout on bdst if exist which may be created by UDBA while we - * were sleeping - */ -static int au_do_unlink_wh(const unsigned char dmsg, struct au_mvd_args *a) -{ - int err; - struct path h_path; - struct au_branch *br; - struct inode *delegated; - - br = au_sbr(a->sb, a->mvd_bdst); - h_path.dentry = au_wh_lkup(a->mvd_h_dst_parent, &a->dentry->d_name, br); - err = PTR_ERR(h_path.dentry); - if (IS_ERR(h_path.dentry)) { - AU_MVD_PR(dmsg, "wh_lkup failed\n"); - goto out; - } - - err = 0; - if (d_is_positive(h_path.dentry)) { - h_path.mnt = au_br_mnt(br); - delegated = NULL; - err = vfsub_unlink(d_inode(a->mvd_h_dst_parent), &h_path, - &delegated, /*force*/0); - if (unlikely(err == -EWOULDBLOCK)) { - pr_warn("cannot retry for NFSv4 delegation" - " for an internal unlink\n"); - iput(delegated); - } - if (unlikely(err)) - AU_MVD_PR(dmsg, "wh_unlink failed\n"); - } - dput(h_path.dentry); - -out: - AuTraceErr(err); - return err; -} - -/* - * unlink the topmost h_dentry - */ -static int au_do_unlink(const unsigned char dmsg, struct au_mvd_args *a) -{ - int err; - struct path h_path; - struct inode *delegated; - - h_path.mnt = au_sbr_mnt(a->sb, a->mvd_bsrc); - h_path.dentry = au_h_dptr(a->dentry, a->mvd_bsrc); - delegated = NULL; - err = vfsub_unlink(a->mvd_h_src_dir, &h_path, &delegated, /*force*/0); - if (unlikely(err == -EWOULDBLOCK)) { - pr_warn("cannot retry for NFSv4 delegation" - " for an internal unlink\n"); - iput(delegated); - } - if (unlikely(err)) - AU_MVD_PR(dmsg, "unlink failed\n"); - - AuTraceErr(err); - return err; -} - -/* Since mvdown succeeded, we ignore an error of this function */ -static void au_do_stfs(const unsigned char dmsg, struct au_mvd_args *a) -{ - int err; - struct au_branch *br; - - a->mvdown.flags |= AUFS_MVDOWN_STFS_FAILED; - br = au_sbr(a->sb, a->mvd_bsrc); - err = au_br_stfs(br, &a->mvdown.stbr[AUFS_MVDOWN_UPPER].stfs); - if (!err) { - br = au_sbr(a->sb, a->mvd_bdst); - a->mvdown.stbr[AUFS_MVDOWN_LOWER].brid = br->br_id; - err = au_br_stfs(br, &a->mvdown.stbr[AUFS_MVDOWN_LOWER].stfs); - } - if (!err) - a->mvdown.flags &= ~AUFS_MVDOWN_STFS_FAILED; - else - AU_MVD_PR(dmsg, "statfs failed (%d), ignored\n", err); -} - -/* - * copy-down the file and unlink the bsrc file. - * - unlink the bdst whout if exist - * - copy-down the file (with whtmp name and rename) - * - unlink the bsrc file - */ -static int au_do_mvdown(const unsigned char dmsg, struct au_mvd_args *a) -{ - int err; - - err = au_do_mkdir(dmsg, a); - if (!err) - err = au_do_lock(dmsg, a); - if (unlikely(err)) - goto out; - - /* - * do not revert the activities we made on bdst since they should be - * harmless in aufs. - */ - - err = au_do_cpdown(dmsg, a); - if (!err) - err = au_do_unlink_wh(dmsg, a); - if (!err && !(a->mvdown.flags & AUFS_MVDOWN_KUPPER)) - err = au_do_unlink(dmsg, a); - if (unlikely(err)) - goto out_unlock; - - AuDbg("%pd2, 0x%x, %d --> %d\n", - a->dentry, a->mvdown.flags, a->mvd_bsrc, a->mvd_bdst); - if (find_lower_writable(a) < 0) - a->mvdown.flags |= AUFS_MVDOWN_BOTTOM; - - if (a->mvdown.flags & AUFS_MVDOWN_STFS) - au_do_stfs(dmsg, a); - - /* maintain internal array */ - if (!(a->mvdown.flags & AUFS_MVDOWN_KUPPER)) { - au_set_h_dptr(a->dentry, a->mvd_bsrc, NULL); - au_set_dbstart(a->dentry, a->mvd_bdst); - au_set_h_iptr(a->inode, a->mvd_bsrc, NULL, /*flags*/0); - au_set_ibstart(a->inode, a->mvd_bdst); - } else { - /* hide the lower */ - au_set_h_dptr(a->dentry, a->mvd_bdst, NULL); - au_set_dbend(a->dentry, a->mvd_bsrc); - au_set_h_iptr(a->inode, a->mvd_bdst, NULL, /*flags*/0); - au_set_ibend(a->inode, a->mvd_bsrc); - } - if (au_dbend(a->dentry) < a->mvd_bdst) - au_set_dbend(a->dentry, a->mvd_bdst); - if (au_ibend(a->inode) < a->mvd_bdst) - au_set_ibend(a->inode, a->mvd_bdst); - -out_unlock: - au_do_unlock(dmsg, a); -out: - AuTraceErr(err); - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* make sure the file is idle */ -static int au_mvd_args_busy(const unsigned char dmsg, struct au_mvd_args *a) -{ - int err, plinked; - - err = 0; - plinked = !!au_opt_test(au_mntflags(a->sb), PLINK); - if (au_dbstart(a->dentry) == a->mvd_bsrc - && au_dcount(a->dentry) == 1 - && atomic_read(&a->inode->i_count) == 1 - /* && a->mvd_h_src_inode->i_nlink == 1 */ - && (!plinked || !au_plink_test(a->inode)) - && a->inode->i_nlink == 1) - goto out; - - err = -EBUSY; - AU_MVD_PR(dmsg, - "b%d, d{b%d, c%d?}, i{c%d?, l%u}, hi{l%u}, p{%d, %d}\n", - a->mvd_bsrc, au_dbstart(a->dentry), au_dcount(a->dentry), - atomic_read(&a->inode->i_count), a->inode->i_nlink, - a->mvd_h_src_inode->i_nlink, - plinked, plinked ? au_plink_test(a->inode) : 0); - -out: - AuTraceErr(err); - return err; -} - -/* make sure the parent dir is fine */ -static int au_mvd_args_parent(const unsigned char dmsg, - struct au_mvd_args *a) -{ - int err; - aufs_bindex_t bindex; - - err = 0; - if (unlikely(au_alive_dir(a->parent))) { - err = -ENOENT; - AU_MVD_PR(dmsg, "parent dir is dead\n"); - goto out; - } - - a->bopq = au_dbdiropq(a->parent); - bindex = au_wbr_nonopq(a->dentry, a->mvd_bdst); - AuDbg("b%d\n", bindex); - if (unlikely((bindex >= 0 && bindex < a->mvd_bdst) - || (a->bopq != -1 && a->bopq < a->mvd_bdst))) { - err = -EINVAL; - a->mvd_errno = EAU_MVDOWN_OPAQUE; - AU_MVD_PR(dmsg, "ancestor is opaque b%d, b%d\n", - a->bopq, a->mvd_bdst); - } - -out: - AuTraceErr(err); - return err; -} - -static int au_mvd_args_intermediate(const unsigned char dmsg, - struct au_mvd_args *a) -{ - int err; - struct au_dinfo *dinfo, *tmp; - - /* lookup the next lower positive entry */ - err = -ENOMEM; - tmp = au_di_alloc(a->sb, AuLsc_DI_TMP); - if (unlikely(!tmp)) - goto out; - - a->bfound = -1; - a->bwh = -1; - dinfo = au_di(a->dentry); - au_di_cp(tmp, dinfo); - au_di_swap(tmp, dinfo); - - /* returns the number of positive dentries */ - err = au_lkup_dentry(a->dentry, a->mvd_bsrc + 1, /*type*/0); - if (!err) - a->bwh = au_dbwh(a->dentry); - else if (err > 0) - a->bfound = au_dbstart(a->dentry); - - au_di_swap(tmp, dinfo); - au_rw_write_unlock(&tmp->di_rwsem); - au_di_free(tmp); - if (unlikely(err < 0)) - AU_MVD_PR(dmsg, "failed look-up lower\n"); - - /* - * here, we have these cases. - * bfound == -1 - * no positive dentry under bsrc. there are more sub-cases. - * bwh < 0 - * there no whiteout, we can safely move-down. - * bwh <= bsrc - * impossible - * bsrc < bwh && bwh < bdst - * there is a whiteout on RO branch. cannot proceed. - * bwh == bdst - * there is a whiteout on the RW target branch. it should - * be removed. - * bdst < bwh - * there is a whiteout somewhere unrelated branch. - * -1 < bfound && bfound <= bsrc - * impossible. - * bfound < bdst - * found, but it is on RO branch between bsrc and bdst. cannot - * proceed. - * bfound == bdst - * found, replace it if AUFS_MVDOWN_FORCE is set. otherwise return - * error. - * bdst < bfound - * found, after we create the file on bdst, it will be hidden. - */ - - AuDebugOn(a->bfound == -1 - && a->bwh != -1 - && a->bwh <= a->mvd_bsrc); - AuDebugOn(-1 < a->bfound - && a->bfound <= a->mvd_bsrc); - - err = -EINVAL; - if (a->bfound == -1 - && a->mvd_bsrc < a->bwh - && a->bwh != -1 - && a->bwh < a->mvd_bdst) { - a->mvd_errno = EAU_MVDOWN_WHITEOUT; - AU_MVD_PR(dmsg, "bsrc %d, bdst %d, bfound %d, bwh %d\n", - a->mvd_bsrc, a->mvd_bdst, a->bfound, a->bwh); - goto out; - } else if (a->bfound != -1 && a->bfound < a->mvd_bdst) { - a->mvd_errno = EAU_MVDOWN_UPPER; - AU_MVD_PR(dmsg, "bdst %d, bfound %d\n", - a->mvd_bdst, a->bfound); - goto out; - } - - err = 0; /* success */ - -out: - AuTraceErr(err); - return err; -} - -static int au_mvd_args_exist(const unsigned char dmsg, struct au_mvd_args *a) -{ - int err; - - err = 0; - if (!(a->mvdown.flags & AUFS_MVDOWN_OWLOWER) - && a->bfound == a->mvd_bdst) - err = -EEXIST; - AuTraceErr(err); - return err; -} - -static int au_mvd_args(const unsigned char dmsg, struct au_mvd_args *a) -{ - int err; - struct au_branch *br; - - err = -EISDIR; - if (unlikely(S_ISDIR(a->inode->i_mode))) - goto out; - - err = -EINVAL; - if (!(a->mvdown.flags & AUFS_MVDOWN_BRID_UPPER)) - a->mvd_bsrc = au_ibstart(a->inode); - else { - a->mvd_bsrc = au_br_index(a->sb, a->mvd_src_brid); - if (unlikely(a->mvd_bsrc < 0 - || (a->mvd_bsrc < au_dbstart(a->dentry) - || au_dbend(a->dentry) < a->mvd_bsrc - || !au_h_dptr(a->dentry, a->mvd_bsrc)) - || (a->mvd_bsrc < au_ibstart(a->inode) - || au_ibend(a->inode) < a->mvd_bsrc - || !au_h_iptr(a->inode, a->mvd_bsrc)))) { - a->mvd_errno = EAU_MVDOWN_NOUPPER; - AU_MVD_PR(dmsg, "no upper\n"); - goto out; - } - } - if (unlikely(a->mvd_bsrc == au_sbend(a->sb))) { - a->mvd_errno = EAU_MVDOWN_BOTTOM; - AU_MVD_PR(dmsg, "on the bottom\n"); - goto out; - } - a->mvd_h_src_inode = au_h_iptr(a->inode, a->mvd_bsrc); - br = au_sbr(a->sb, a->mvd_bsrc); - err = au_br_rdonly(br); - if (!(a->mvdown.flags & AUFS_MVDOWN_ROUPPER)) { - if (unlikely(err)) - goto out; - } else if (!(vfsub_native_ro(a->mvd_h_src_inode) - || IS_APPEND(a->mvd_h_src_inode))) { - if (err) - a->mvdown.flags |= AUFS_MVDOWN_ROUPPER_R; - /* go on */ - } else - goto out; - - err = -EINVAL; - if (!(a->mvdown.flags & AUFS_MVDOWN_BRID_LOWER)) { - a->mvd_bdst = find_lower_writable(a); - if (unlikely(a->mvd_bdst < 0)) { - a->mvd_errno = EAU_MVDOWN_BOTTOM; - AU_MVD_PR(dmsg, "no writable lower branch\n"); - goto out; - } - } else { - a->mvd_bdst = au_br_index(a->sb, a->mvd_dst_brid); - if (unlikely(a->mvd_bdst < 0 - || au_sbend(a->sb) < a->mvd_bdst)) { - a->mvd_errno = EAU_MVDOWN_NOLOWERBR; - AU_MVD_PR(dmsg, "no lower brid\n"); - goto out; - } - } - - err = au_mvd_args_busy(dmsg, a); - if (!err) - err = au_mvd_args_parent(dmsg, a); - if (!err) - err = au_mvd_args_intermediate(dmsg, a); - if (!err) - err = au_mvd_args_exist(dmsg, a); - if (!err) - AuDbg("b%d, b%d\n", a->mvd_bsrc, a->mvd_bdst); - -out: - AuTraceErr(err); - return err; -} - -int au_mvdown(struct dentry *dentry, struct aufs_mvdown __user *uarg) -{ - int err, e; - unsigned char dmsg; - struct au_mvd_args *args; - struct inode *inode; - - inode = d_inode(dentry); - err = -EPERM; - if (unlikely(!capable(CAP_SYS_ADMIN))) - goto out; - - err = -ENOMEM; - args = kmalloc(sizeof(*args), GFP_NOFS); - if (unlikely(!args)) - goto out; - - err = copy_from_user(&args->mvdown, uarg, sizeof(args->mvdown)); - if (!err) - err = !access_ok(VERIFY_WRITE, uarg, sizeof(*uarg)); - if (unlikely(err)) { - err = -EFAULT; - AuTraceErr(err); - goto out_free; - } - AuDbg("flags 0x%x\n", args->mvdown.flags); - args->mvdown.flags &= ~(AUFS_MVDOWN_ROLOWER_R | AUFS_MVDOWN_ROUPPER_R); - args->mvdown.au_errno = 0; - args->dentry = dentry; - args->inode = inode; - args->sb = dentry->d_sb; - - err = -ENOENT; - dmsg = !!(args->mvdown.flags & AUFS_MVDOWN_DMSG); - args->parent = dget_parent(dentry); - args->dir = d_inode(args->parent); - mutex_lock_nested(&args->dir->i_mutex, I_MUTEX_PARENT); - dput(args->parent); - if (unlikely(args->parent != dentry->d_parent)) { - AU_MVD_PR(dmsg, "parent dir is moved\n"); - goto out_dir; - } - - mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); - err = aufs_read_lock(dentry, AuLock_DW | AuLock_FLUSH | AuLock_NOPLMW); - if (unlikely(err)) - goto out_inode; - - di_write_lock_parent(args->parent); - err = au_mvd_args(dmsg, args); - if (unlikely(err)) - goto out_parent; - - err = au_do_mvdown(dmsg, args); - if (unlikely(err)) - goto out_parent; - - au_cpup_attr_timesizes(args->dir); - au_cpup_attr_timesizes(inode); - if (!(args->mvdown.flags & AUFS_MVDOWN_KUPPER)) - au_cpup_igen(inode, au_h_iptr(inode, args->mvd_bdst)); - /* au_digen_dec(dentry); */ - -out_parent: - di_write_unlock(args->parent); - aufs_read_unlock(dentry, AuLock_DW); -out_inode: - mutex_unlock(&inode->i_mutex); -out_dir: - mutex_unlock(&args->dir->i_mutex); -out_free: - e = copy_to_user(uarg, &args->mvdown, sizeof(args->mvdown)); - if (unlikely(e)) - err = -EFAULT; - kfree(args); -out: - AuTraceErr(err); - return err; -} diff --git a/fs/aufs/opts.c b/fs/aufs/opts.c deleted file mode 100644 index 5c39817f3..000000000 --- a/fs/aufs/opts.c +++ /dev/null @@ -1,1846 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * mount options/flags - */ - -#include <linux/namei.h> -#include <linux/types.h> /* a distribution requires */ -#include <linux/parser.h> -#include "aufs.h" - -/* ---------------------------------------------------------------------- */ - -enum { - Opt_br, - Opt_add, Opt_del, Opt_mod, Opt_append, Opt_prepend, - Opt_idel, Opt_imod, - Opt_dirwh, Opt_rdcache, Opt_rdblk, Opt_rdhash, - Opt_rdblk_def, Opt_rdhash_def, - Opt_xino, Opt_noxino, - Opt_trunc_xino, Opt_trunc_xino_v, Opt_notrunc_xino, - Opt_trunc_xino_path, Opt_itrunc_xino, - Opt_trunc_xib, Opt_notrunc_xib, - Opt_shwh, Opt_noshwh, - Opt_plink, Opt_noplink, Opt_list_plink, - Opt_udba, - Opt_dio, Opt_nodio, - Opt_diropq_a, Opt_diropq_w, - Opt_warn_perm, Opt_nowarn_perm, - Opt_wbr_copyup, Opt_wbr_create, - Opt_fhsm_sec, - Opt_verbose, Opt_noverbose, - Opt_sum, Opt_nosum, Opt_wsum, - Opt_dirperm1, Opt_nodirperm1, - Opt_acl, Opt_noacl, - Opt_tail, Opt_ignore, Opt_ignore_silent, Opt_err -}; - -static match_table_t options = { - {Opt_br, "br=%s"}, - {Opt_br, "br:%s"}, - - {Opt_add, "add=%d:%s"}, - {Opt_add, "add:%d:%s"}, - {Opt_add, "ins=%d:%s"}, - {Opt_add, "ins:%d:%s"}, - {Opt_append, "append=%s"}, - {Opt_append, "append:%s"}, - {Opt_prepend, "prepend=%s"}, - {Opt_prepend, "prepend:%s"}, - - {Opt_del, "del=%s"}, - {Opt_del, "del:%s"}, - /* {Opt_idel, "idel:%d"}, */ - {Opt_mod, "mod=%s"}, - {Opt_mod, "mod:%s"}, - /* {Opt_imod, "imod:%d:%s"}, */ - - {Opt_dirwh, "dirwh=%d"}, - - {Opt_xino, "xino=%s"}, - {Opt_noxino, "noxino"}, - {Opt_trunc_xino, "trunc_xino"}, - {Opt_trunc_xino_v, "trunc_xino_v=%d:%d"}, - {Opt_notrunc_xino, "notrunc_xino"}, - {Opt_trunc_xino_path, "trunc_xino=%s"}, - {Opt_itrunc_xino, "itrunc_xino=%d"}, - /* {Opt_zxino, "zxino=%s"}, */ - {Opt_trunc_xib, "trunc_xib"}, - {Opt_notrunc_xib, "notrunc_xib"}, - -#ifdef CONFIG_PROC_FS - {Opt_plink, "plink"}, -#else - {Opt_ignore_silent, "plink"}, -#endif - - {Opt_noplink, "noplink"}, - -#ifdef CONFIG_AUFS_DEBUG - {Opt_list_plink, "list_plink"}, -#endif - - {Opt_udba, "udba=%s"}, - - {Opt_dio, "dio"}, - {Opt_nodio, "nodio"}, - -#ifdef CONFIG_AUFS_FHSM - {Opt_fhsm_sec, "fhsm_sec=%d"}, -#else - {Opt_ignore_silent, "fhsm_sec=%d"}, -#endif - - {Opt_diropq_a, "diropq=always"}, - {Opt_diropq_a, "diropq=a"}, - {Opt_diropq_w, "diropq=whiteouted"}, - {Opt_diropq_w, "diropq=w"}, - - {Opt_warn_perm, "warn_perm"}, - {Opt_nowarn_perm, "nowarn_perm"}, - - /* keep them temporary */ - {Opt_ignore_silent, "nodlgt"}, - {Opt_ignore_silent, "clean_plink"}, - -#ifdef CONFIG_AUFS_SHWH - {Opt_shwh, "shwh"}, -#endif - {Opt_noshwh, "noshwh"}, - - {Opt_dirperm1, "dirperm1"}, - {Opt_nodirperm1, "nodirperm1"}, - - {Opt_verbose, "verbose"}, - {Opt_verbose, "v"}, - {Opt_noverbose, "noverbose"}, - {Opt_noverbose, "quiet"}, - {Opt_noverbose, "q"}, - {Opt_noverbose, "silent"}, - - {Opt_sum, "sum"}, - {Opt_nosum, "nosum"}, - {Opt_wsum, "wsum"}, - - {Opt_rdcache, "rdcache=%d"}, - {Opt_rdblk, "rdblk=%d"}, - {Opt_rdblk_def, "rdblk=def"}, - {Opt_rdhash, "rdhash=%d"}, - {Opt_rdhash_def, "rdhash=def"}, - - {Opt_wbr_create, "create=%s"}, - {Opt_wbr_create, "create_policy=%s"}, - {Opt_wbr_copyup, "cpup=%s"}, - {Opt_wbr_copyup, "copyup=%s"}, - {Opt_wbr_copyup, "copyup_policy=%s"}, - - /* generic VFS flag */ -#ifdef CONFIG_FS_POSIX_ACL - {Opt_acl, "acl"}, - {Opt_noacl, "noacl"}, -#else - {Opt_ignore_silent, "acl"}, - {Opt_ignore_silent, "noacl"}, -#endif - - /* internal use for the scripts */ - {Opt_ignore_silent, "si=%s"}, - - {Opt_br, "dirs=%s"}, - {Opt_ignore, "debug=%d"}, - {Opt_ignore, "delete=whiteout"}, - {Opt_ignore, "delete=all"}, - {Opt_ignore, "imap=%s"}, - - /* temporary workaround, due to old mount(8)? */ - {Opt_ignore_silent, "relatime"}, - - {Opt_err, NULL} -}; - -/* ---------------------------------------------------------------------- */ - -static const char *au_parser_pattern(int val, match_table_t tbl) -{ - struct match_token *p; - - p = tbl; - while (p->pattern) { - if (p->token == val) - return p->pattern; - p++; - } - BUG(); - return "??"; -} - -static const char *au_optstr(int *val, match_table_t tbl) -{ - struct match_token *p; - int v; - - v = *val; - if (!v) - goto out; - p = tbl; - while (p->pattern) { - if (p->token - && (v & p->token) == p->token) { - *val &= ~p->token; - return p->pattern; - } - p++; - } - -out: - return NULL; -} - -/* ---------------------------------------------------------------------- */ - -static match_table_t brperm = { - {AuBrPerm_RO, AUFS_BRPERM_RO}, - {AuBrPerm_RR, AUFS_BRPERM_RR}, - {AuBrPerm_RW, AUFS_BRPERM_RW}, - {0, NULL} -}; - -static match_table_t brattr = { - /* general */ - {AuBrAttr_COO_REG, AUFS_BRATTR_COO_REG}, - {AuBrAttr_COO_ALL, AUFS_BRATTR_COO_ALL}, - /* 'unpin' attrib is meaningless since linux-3.18-rc1 */ - {AuBrAttr_UNPIN, AUFS_BRATTR_UNPIN}, -#ifdef CONFIG_AUFS_FHSM - {AuBrAttr_FHSM, AUFS_BRATTR_FHSM}, -#endif -#ifdef CONFIG_AUFS_XATTR - {AuBrAttr_ICEX, AUFS_BRATTR_ICEX}, - {AuBrAttr_ICEX_SEC, AUFS_BRATTR_ICEX_SEC}, - {AuBrAttr_ICEX_SYS, AUFS_BRATTR_ICEX_SYS}, - {AuBrAttr_ICEX_TR, AUFS_BRATTR_ICEX_TR}, - {AuBrAttr_ICEX_USR, AUFS_BRATTR_ICEX_USR}, - {AuBrAttr_ICEX_OTH, AUFS_BRATTR_ICEX_OTH}, -#endif - - /* ro/rr branch */ - {AuBrRAttr_WH, AUFS_BRRATTR_WH}, - - /* rw branch */ - {AuBrWAttr_MOO, AUFS_BRWATTR_MOO}, - {AuBrWAttr_NoLinkWH, AUFS_BRWATTR_NLWH}, - - {0, NULL} -}; - -static int br_attr_val(char *str, match_table_t table, substring_t args[]) -{ - int attr, v; - char *p; - - attr = 0; - do { - p = strchr(str, '+'); - if (p) - *p = 0; - v = match_token(str, table, args); - if (v) { - if (v & AuBrAttr_CMOO_Mask) - attr &= ~AuBrAttr_CMOO_Mask; - attr |= v; - } else { - if (p) - *p = '+'; - pr_warn("ignored branch attribute %s\n", str); - break; - } - if (p) - str = p + 1; - } while (p); - - return attr; -} - -static int au_do_optstr_br_attr(au_br_perm_str_t *str, int perm) -{ - int sz; - const char *p; - char *q; - - q = str->a; - *q = 0; - p = au_optstr(&perm, brattr); - if (p) { - sz = strlen(p); - memcpy(q, p, sz + 1); - q += sz; - } else - goto out; - - do { - p = au_optstr(&perm, brattr); - if (p) { - *q++ = '+'; - sz = strlen(p); - memcpy(q, p, sz + 1); - q += sz; - } - } while (p); - -out: - return q - str->a; -} - -static int noinline_for_stack br_perm_val(char *perm) -{ - int val, bad, sz; - char *p; - substring_t args[MAX_OPT_ARGS]; - au_br_perm_str_t attr; - - p = strchr(perm, '+'); - if (p) - *p = 0; - val = match_token(perm, brperm, args); - if (!val) { - if (p) - *p = '+'; - pr_warn("ignored branch permission %s\n", perm); - val = AuBrPerm_RO; - goto out; - } - if (!p) - goto out; - - val |= br_attr_val(p + 1, brattr, args); - - bad = 0; - switch (val & AuBrPerm_Mask) { - case AuBrPerm_RO: - case AuBrPerm_RR: - bad = val & AuBrWAttr_Mask; - val &= ~AuBrWAttr_Mask; - break; - case AuBrPerm_RW: - bad = val & AuBrRAttr_Mask; - val &= ~AuBrRAttr_Mask; - break; - } - - /* - * 'unpin' attrib becomes meaningless since linux-3.18-rc1, but aufs - * does not treat it as an error, just warning. - * this is a tiny guard for the user operation. - */ - if (val & AuBrAttr_UNPIN) { - bad |= AuBrAttr_UNPIN; - val &= ~AuBrAttr_UNPIN; - } - - if (unlikely(bad)) { - sz = au_do_optstr_br_attr(&attr, bad); - AuDebugOn(!sz); - pr_warn("ignored branch attribute %s\n", attr.a); - } - -out: - return val; -} - -void au_optstr_br_perm(au_br_perm_str_t *str, int perm) -{ - au_br_perm_str_t attr; - const char *p; - char *q; - int sz; - - q = str->a; - p = au_optstr(&perm, brperm); - AuDebugOn(!p || !*p); - sz = strlen(p); - memcpy(q, p, sz + 1); - q += sz; - - sz = au_do_optstr_br_attr(&attr, perm); - if (sz) { - *q++ = '+'; - memcpy(q, attr.a, sz + 1); - } - - AuDebugOn(strlen(str->a) >= sizeof(str->a)); -} - -/* ---------------------------------------------------------------------- */ - -static match_table_t udbalevel = { - {AuOpt_UDBA_REVAL, "reval"}, - {AuOpt_UDBA_NONE, "none"}, -#ifdef CONFIG_AUFS_HNOTIFY - {AuOpt_UDBA_HNOTIFY, "notify"}, /* abstraction */ -#ifdef CONFIG_AUFS_HFSNOTIFY - {AuOpt_UDBA_HNOTIFY, "fsnotify"}, -#endif -#endif - {-1, NULL} -}; - -static int noinline_for_stack udba_val(char *str) -{ - substring_t args[MAX_OPT_ARGS]; - - return match_token(str, udbalevel, args); -} - -const char *au_optstr_udba(int udba) -{ - return au_parser_pattern(udba, udbalevel); -} - -/* ---------------------------------------------------------------------- */ - -static match_table_t au_wbr_create_policy = { - {AuWbrCreate_TDP, "tdp"}, - {AuWbrCreate_TDP, "top-down-parent"}, - {AuWbrCreate_RR, "rr"}, - {AuWbrCreate_RR, "round-robin"}, - {AuWbrCreate_MFS, "mfs"}, - {AuWbrCreate_MFS, "most-free-space"}, - {AuWbrCreate_MFSV, "mfs:%d"}, - {AuWbrCreate_MFSV, "most-free-space:%d"}, - - {AuWbrCreate_MFSRR, "mfsrr:%d"}, - {AuWbrCreate_MFSRRV, "mfsrr:%d:%d"}, - {AuWbrCreate_PMFS, "pmfs"}, - {AuWbrCreate_PMFSV, "pmfs:%d"}, - {AuWbrCreate_PMFSRR, "pmfsrr:%d"}, - {AuWbrCreate_PMFSRRV, "pmfsrr:%d:%d"}, - - {-1, NULL} -}; - -/* - * cf. linux/lib/parser.c and cmdline.c - * gave up calling memparse() since it uses simple_strtoull() instead of - * kstrto...(). - */ -static int noinline_for_stack -au_match_ull(substring_t *s, unsigned long long *result) -{ - int err; - unsigned int len; - char a[32]; - - err = -ERANGE; - len = s->to - s->from; - if (len + 1 <= sizeof(a)) { - memcpy(a, s->from, len); - a[len] = '\0'; - err = kstrtoull(a, 0, result); - } - return err; -} - -static int au_wbr_mfs_wmark(substring_t *arg, char *str, - struct au_opt_wbr_create *create) -{ - int err; - unsigned long long ull; - - err = 0; - if (!au_match_ull(arg, &ull)) - create->mfsrr_watermark = ull; - else { - pr_err("bad integer in %s\n", str); - err = -EINVAL; - } - - return err; -} - -static int au_wbr_mfs_sec(substring_t *arg, char *str, - struct au_opt_wbr_create *create) -{ - int n, err; - - err = 0; - if (!match_int(arg, &n) && 0 <= n && n <= AUFS_MFS_MAX_SEC) - create->mfs_second = n; - else { - pr_err("bad integer in %s\n", str); - err = -EINVAL; - } - - return err; -} - -static int noinline_for_stack -au_wbr_create_val(char *str, struct au_opt_wbr_create *create) -{ - int err, e; - substring_t args[MAX_OPT_ARGS]; - - err = match_token(str, au_wbr_create_policy, args); - create->wbr_create = err; - switch (err) { - case AuWbrCreate_MFSRRV: - case AuWbrCreate_PMFSRRV: - e = au_wbr_mfs_wmark(&args[0], str, create); - if (!e) - e = au_wbr_mfs_sec(&args[1], str, create); - if (unlikely(e)) - err = e; - break; - case AuWbrCreate_MFSRR: - case AuWbrCreate_PMFSRR: - e = au_wbr_mfs_wmark(&args[0], str, create); - if (unlikely(e)) { - err = e; - break; - } - /*FALLTHROUGH*/ - case AuWbrCreate_MFS: - case AuWbrCreate_PMFS: - create->mfs_second = AUFS_MFS_DEF_SEC; - break; - case AuWbrCreate_MFSV: - case AuWbrCreate_PMFSV: - e = au_wbr_mfs_sec(&args[0], str, create); - if (unlikely(e)) - err = e; - break; - } - - return err; -} - -const char *au_optstr_wbr_create(int wbr_create) -{ - return au_parser_pattern(wbr_create, au_wbr_create_policy); -} - -static match_table_t au_wbr_copyup_policy = { - {AuWbrCopyup_TDP, "tdp"}, - {AuWbrCopyup_TDP, "top-down-parent"}, - {AuWbrCopyup_BUP, "bup"}, - {AuWbrCopyup_BUP, "bottom-up-parent"}, - {AuWbrCopyup_BU, "bu"}, - {AuWbrCopyup_BU, "bottom-up"}, - {-1, NULL} -}; - -static int noinline_for_stack au_wbr_copyup_val(char *str) -{ - substring_t args[MAX_OPT_ARGS]; - - return match_token(str, au_wbr_copyup_policy, args); -} - -const char *au_optstr_wbr_copyup(int wbr_copyup) -{ - return au_parser_pattern(wbr_copyup, au_wbr_copyup_policy); -} - -/* ---------------------------------------------------------------------- */ - -static const int lkup_dirflags = LOOKUP_FOLLOW | LOOKUP_DIRECTORY; - -static void dump_opts(struct au_opts *opts) -{ -#ifdef CONFIG_AUFS_DEBUG - /* reduce stack space */ - union { - struct au_opt_add *add; - struct au_opt_del *del; - struct au_opt_mod *mod; - struct au_opt_xino *xino; - struct au_opt_xino_itrunc *xino_itrunc; - struct au_opt_wbr_create *create; - } u; - struct au_opt *opt; - - opt = opts->opt; - while (opt->type != Opt_tail) { - switch (opt->type) { - case Opt_add: - u.add = &opt->add; - AuDbg("add {b%d, %s, 0x%x, %p}\n", - u.add->bindex, u.add->pathname, u.add->perm, - u.add->path.dentry); - break; - case Opt_del: - case Opt_idel: - u.del = &opt->del; - AuDbg("del {%s, %p}\n", - u.del->pathname, u.del->h_path.dentry); - break; - case Opt_mod: - case Opt_imod: - u.mod = &opt->mod; - AuDbg("mod {%s, 0x%x, %p}\n", - u.mod->path, u.mod->perm, u.mod->h_root); - break; - case Opt_append: - u.add = &opt->add; - AuDbg("append {b%d, %s, 0x%x, %p}\n", - u.add->bindex, u.add->pathname, u.add->perm, - u.add->path.dentry); - break; - case Opt_prepend: - u.add = &opt->add; - AuDbg("prepend {b%d, %s, 0x%x, %p}\n", - u.add->bindex, u.add->pathname, u.add->perm, - u.add->path.dentry); - break; - case Opt_dirwh: - AuDbg("dirwh %d\n", opt->dirwh); - break; - case Opt_rdcache: - AuDbg("rdcache %d\n", opt->rdcache); - break; - case Opt_rdblk: - AuDbg("rdblk %u\n", opt->rdblk); - break; - case Opt_rdblk_def: - AuDbg("rdblk_def\n"); - break; - case Opt_rdhash: - AuDbg("rdhash %u\n", opt->rdhash); - break; - case Opt_rdhash_def: - AuDbg("rdhash_def\n"); - break; - case Opt_xino: - u.xino = &opt->xino; - AuDbg("xino {%s %pD}\n", u.xino->path, u.xino->file); - break; - case Opt_trunc_xino: - AuLabel(trunc_xino); - break; - case Opt_notrunc_xino: - AuLabel(notrunc_xino); - break; - case Opt_trunc_xino_path: - case Opt_itrunc_xino: - u.xino_itrunc = &opt->xino_itrunc; - AuDbg("trunc_xino %d\n", u.xino_itrunc->bindex); - break; - case Opt_noxino: - AuLabel(noxino); - break; - case Opt_trunc_xib: - AuLabel(trunc_xib); - break; - case Opt_notrunc_xib: - AuLabel(notrunc_xib); - break; - case Opt_shwh: - AuLabel(shwh); - break; - case Opt_noshwh: - AuLabel(noshwh); - break; - case Opt_dirperm1: - AuLabel(dirperm1); - break; - case Opt_nodirperm1: - AuLabel(nodirperm1); - break; - case Opt_plink: - AuLabel(plink); - break; - case Opt_noplink: - AuLabel(noplink); - break; - case Opt_list_plink: - AuLabel(list_plink); - break; - case Opt_udba: - AuDbg("udba %d, %s\n", - opt->udba, au_optstr_udba(opt->udba)); - break; - case Opt_dio: - AuLabel(dio); - break; - case Opt_nodio: - AuLabel(nodio); - break; - case Opt_diropq_a: - AuLabel(diropq_a); - break; - case Opt_diropq_w: - AuLabel(diropq_w); - break; - case Opt_warn_perm: - AuLabel(warn_perm); - break; - case Opt_nowarn_perm: - AuLabel(nowarn_perm); - break; - case Opt_verbose: - AuLabel(verbose); - break; - case Opt_noverbose: - AuLabel(noverbose); - break; - case Opt_sum: - AuLabel(sum); - break; - case Opt_nosum: - AuLabel(nosum); - break; - case Opt_wsum: - AuLabel(wsum); - break; - case Opt_wbr_create: - u.create = &opt->wbr_create; - AuDbg("create %d, %s\n", u.create->wbr_create, - au_optstr_wbr_create(u.create->wbr_create)); - switch (u.create->wbr_create) { - case AuWbrCreate_MFSV: - case AuWbrCreate_PMFSV: - AuDbg("%d sec\n", u.create->mfs_second); - break; - case AuWbrCreate_MFSRR: - AuDbg("%llu watermark\n", - u.create->mfsrr_watermark); - break; - case AuWbrCreate_MFSRRV: - case AuWbrCreate_PMFSRRV: - AuDbg("%llu watermark, %d sec\n", - u.create->mfsrr_watermark, - u.create->mfs_second); - break; - } - break; - case Opt_wbr_copyup: - AuDbg("copyup %d, %s\n", opt->wbr_copyup, - au_optstr_wbr_copyup(opt->wbr_copyup)); - break; - case Opt_fhsm_sec: - AuDbg("fhsm_sec %u\n", opt->fhsm_second); - break; - case Opt_acl: - AuLabel(acl); - break; - case Opt_noacl: - AuLabel(noacl); - break; - default: - BUG(); - } - opt++; - } -#endif -} - -void au_opts_free(struct au_opts *opts) -{ - struct au_opt *opt; - - opt = opts->opt; - while (opt->type != Opt_tail) { - switch (opt->type) { - case Opt_add: - case Opt_append: - case Opt_prepend: - path_put(&opt->add.path); - break; - case Opt_del: - case Opt_idel: - path_put(&opt->del.h_path); - break; - case Opt_mod: - case Opt_imod: - dput(opt->mod.h_root); - break; - case Opt_xino: - fput(opt->xino.file); - break; - } - opt++; - } -} - -static int opt_add(struct au_opt *opt, char *opt_str, unsigned long sb_flags, - aufs_bindex_t bindex) -{ - int err; - struct au_opt_add *add = &opt->add; - char *p; - - add->bindex = bindex; - add->perm = AuBrPerm_RO; - add->pathname = opt_str; - p = strchr(opt_str, '='); - if (p) { - *p++ = 0; - if (*p) - add->perm = br_perm_val(p); - } - - err = vfsub_kern_path(add->pathname, lkup_dirflags, &add->path); - if (!err) { - if (!p) { - add->perm = AuBrPerm_RO; - if (au_test_fs_rr(add->path.dentry->d_sb)) - add->perm = AuBrPerm_RR; - else if (!bindex && !(sb_flags & MS_RDONLY)) - add->perm = AuBrPerm_RW; - } - opt->type = Opt_add; - goto out; - } - pr_err("lookup failed %s (%d)\n", add->pathname, err); - err = -EINVAL; - -out: - return err; -} - -static int au_opts_parse_del(struct au_opt_del *del, substring_t args[]) -{ - int err; - - del->pathname = args[0].from; - AuDbg("del path %s\n", del->pathname); - - err = vfsub_kern_path(del->pathname, lkup_dirflags, &del->h_path); - if (unlikely(err)) - pr_err("lookup failed %s (%d)\n", del->pathname, err); - - return err; -} - -#if 0 /* reserved for future use */ -static int au_opts_parse_idel(struct super_block *sb, aufs_bindex_t bindex, - struct au_opt_del *del, substring_t args[]) -{ - int err; - struct dentry *root; - - err = -EINVAL; - root = sb->s_root; - aufs_read_lock(root, AuLock_FLUSH); - if (bindex < 0 || au_sbend(sb) < bindex) { - pr_err("out of bounds, %d\n", bindex); - goto out; - } - - err = 0; - del->h_path.dentry = dget(au_h_dptr(root, bindex)); - del->h_path.mnt = mntget(au_sbr_mnt(sb, bindex)); - -out: - aufs_read_unlock(root, !AuLock_IR); - return err; -} -#endif - -static int noinline_for_stack -au_opts_parse_mod(struct au_opt_mod *mod, substring_t args[]) -{ - int err; - struct path path; - char *p; - - err = -EINVAL; - mod->path = args[0].from; - p = strchr(mod->path, '='); - if (unlikely(!p)) { - pr_err("no permssion %s\n", args[0].from); - goto out; - } - - *p++ = 0; - err = vfsub_kern_path(mod->path, lkup_dirflags, &path); - if (unlikely(err)) { - pr_err("lookup failed %s (%d)\n", mod->path, err); - goto out; - } - - mod->perm = br_perm_val(p); - AuDbg("mod path %s, perm 0x%x, %s\n", mod->path, mod->perm, p); - mod->h_root = dget(path.dentry); - path_put(&path); - -out: - return err; -} - -#if 0 /* reserved for future use */ -static int au_opts_parse_imod(struct super_block *sb, aufs_bindex_t bindex, - struct au_opt_mod *mod, substring_t args[]) -{ - int err; - struct dentry *root; - - err = -EINVAL; - root = sb->s_root; - aufs_read_lock(root, AuLock_FLUSH); - if (bindex < 0 || au_sbend(sb) < bindex) { - pr_err("out of bounds, %d\n", bindex); - goto out; - } - - err = 0; - mod->perm = br_perm_val(args[1].from); - AuDbg("mod path %s, perm 0x%x, %s\n", - mod->path, mod->perm, args[1].from); - mod->h_root = dget(au_h_dptr(root, bindex)); - -out: - aufs_read_unlock(root, !AuLock_IR); - return err; -} -#endif - -static int au_opts_parse_xino(struct super_block *sb, struct au_opt_xino *xino, - substring_t args[]) -{ - int err; - struct file *file; - - file = au_xino_create(sb, args[0].from, /*silent*/0); - err = PTR_ERR(file); - if (IS_ERR(file)) - goto out; - - err = -EINVAL; - if (unlikely(file->f_path.dentry->d_sb == sb)) { - fput(file); - pr_err("%s must be outside\n", args[0].from); - goto out; - } - - err = 0; - xino->file = file; - xino->path = args[0].from; - -out: - return err; -} - -static int noinline_for_stack -au_opts_parse_xino_itrunc_path(struct super_block *sb, - struct au_opt_xino_itrunc *xino_itrunc, - substring_t args[]) -{ - int err; - aufs_bindex_t bend, bindex; - struct path path; - struct dentry *root; - - err = vfsub_kern_path(args[0].from, lkup_dirflags, &path); - if (unlikely(err)) { - pr_err("lookup failed %s (%d)\n", args[0].from, err); - goto out; - } - - xino_itrunc->bindex = -1; - root = sb->s_root; - aufs_read_lock(root, AuLock_FLUSH); - bend = au_sbend(sb); - for (bindex = 0; bindex <= bend; bindex++) { - if (au_h_dptr(root, bindex) == path.dentry) { - xino_itrunc->bindex = bindex; - break; - } - } - aufs_read_unlock(root, !AuLock_IR); - path_put(&path); - - if (unlikely(xino_itrunc->bindex < 0)) { - pr_err("no such branch %s\n", args[0].from); - err = -EINVAL; - } - -out: - return err; -} - -/* called without aufs lock */ -int au_opts_parse(struct super_block *sb, char *str, struct au_opts *opts) -{ - int err, n, token; - aufs_bindex_t bindex; - unsigned char skipped; - struct dentry *root; - struct au_opt *opt, *opt_tail; - char *opt_str; - /* reduce the stack space */ - union { - struct au_opt_xino_itrunc *xino_itrunc; - struct au_opt_wbr_create *create; - } u; - struct { - substring_t args[MAX_OPT_ARGS]; - } *a; - - err = -ENOMEM; - a = kmalloc(sizeof(*a), GFP_NOFS); - if (unlikely(!a)) - goto out; - - root = sb->s_root; - err = 0; - bindex = 0; - opt = opts->opt; - opt_tail = opt + opts->max_opt - 1; - opt->type = Opt_tail; - while (!err && (opt_str = strsep(&str, ",")) && *opt_str) { - err = -EINVAL; - skipped = 0; - token = match_token(opt_str, options, a->args); - switch (token) { - case Opt_br: - err = 0; - while (!err && (opt_str = strsep(&a->args[0].from, ":")) - && *opt_str) { - err = opt_add(opt, opt_str, opts->sb_flags, - bindex++); - if (unlikely(!err && ++opt > opt_tail)) { - err = -E2BIG; - break; - } - opt->type = Opt_tail; - skipped = 1; - } - break; - case Opt_add: - if (unlikely(match_int(&a->args[0], &n))) { - pr_err("bad integer in %s\n", opt_str); - break; - } - bindex = n; - err = opt_add(opt, a->args[1].from, opts->sb_flags, - bindex); - if (!err) - opt->type = token; - break; - case Opt_append: - err = opt_add(opt, a->args[0].from, opts->sb_flags, - /*dummy bindex*/1); - if (!err) - opt->type = token; - break; - case Opt_prepend: - err = opt_add(opt, a->args[0].from, opts->sb_flags, - /*bindex*/0); - if (!err) - opt->type = token; - break; - case Opt_del: - err = au_opts_parse_del(&opt->del, a->args); - if (!err) - opt->type = token; - break; -#if 0 /* reserved for future use */ - case Opt_idel: - del->pathname = "(indexed)"; - if (unlikely(match_int(&args[0], &n))) { - pr_err("bad integer in %s\n", opt_str); - break; - } - err = au_opts_parse_idel(sb, n, &opt->del, a->args); - if (!err) - opt->type = token; - break; -#endif - case Opt_mod: - err = au_opts_parse_mod(&opt->mod, a->args); - if (!err) - opt->type = token; - break; -#ifdef IMOD /* reserved for future use */ - case Opt_imod: - u.mod->path = "(indexed)"; - if (unlikely(match_int(&a->args[0], &n))) { - pr_err("bad integer in %s\n", opt_str); - break; - } - err = au_opts_parse_imod(sb, n, &opt->mod, a->args); - if (!err) - opt->type = token; - break; -#endif - case Opt_xino: - err = au_opts_parse_xino(sb, &opt->xino, a->args); - if (!err) - opt->type = token; - break; - - case Opt_trunc_xino_path: - err = au_opts_parse_xino_itrunc_path - (sb, &opt->xino_itrunc, a->args); - if (!err) - opt->type = token; - break; - - case Opt_itrunc_xino: - u.xino_itrunc = &opt->xino_itrunc; - if (unlikely(match_int(&a->args[0], &n))) { - pr_err("bad integer in %s\n", opt_str); - break; - } - u.xino_itrunc->bindex = n; - aufs_read_lock(root, AuLock_FLUSH); - if (n < 0 || au_sbend(sb) < n) { - pr_err("out of bounds, %d\n", n); - aufs_read_unlock(root, !AuLock_IR); - break; - } - aufs_read_unlock(root, !AuLock_IR); - err = 0; - opt->type = token; - break; - - case Opt_dirwh: - if (unlikely(match_int(&a->args[0], &opt->dirwh))) - break; - err = 0; - opt->type = token; - break; - - case Opt_rdcache: - if (unlikely(match_int(&a->args[0], &n))) { - pr_err("bad integer in %s\n", opt_str); - break; - } - if (unlikely(n > AUFS_RDCACHE_MAX)) { - pr_err("rdcache must be smaller than %d\n", - AUFS_RDCACHE_MAX); - break; - } - opt->rdcache = n; - err = 0; - opt->type = token; - break; - case Opt_rdblk: - if (unlikely(match_int(&a->args[0], &n) - || n < 0 - || n > KMALLOC_MAX_SIZE)) { - pr_err("bad integer in %s\n", opt_str); - break; - } - if (unlikely(n && n < NAME_MAX)) { - pr_err("rdblk must be larger than %d\n", - NAME_MAX); - break; - } - opt->rdblk = n; - err = 0; - opt->type = token; - break; - case Opt_rdhash: - if (unlikely(match_int(&a->args[0], &n) - || n < 0 - || n * sizeof(struct hlist_head) - > KMALLOC_MAX_SIZE)) { - pr_err("bad integer in %s\n", opt_str); - break; - } - opt->rdhash = n; - err = 0; - opt->type = token; - break; - - case Opt_trunc_xino: - case Opt_notrunc_xino: - case Opt_noxino: - case Opt_trunc_xib: - case Opt_notrunc_xib: - case Opt_shwh: - case Opt_noshwh: - case Opt_dirperm1: - case Opt_nodirperm1: - case Opt_plink: - case Opt_noplink: - case Opt_list_plink: - case Opt_dio: - case Opt_nodio: - case Opt_diropq_a: - case Opt_diropq_w: - case Opt_warn_perm: - case Opt_nowarn_perm: - case Opt_verbose: - case Opt_noverbose: - case Opt_sum: - case Opt_nosum: - case Opt_wsum: - case Opt_rdblk_def: - case Opt_rdhash_def: - case Opt_acl: - case Opt_noacl: - err = 0; - opt->type = token; - break; - - case Opt_udba: - opt->udba = udba_val(a->args[0].from); - if (opt->udba >= 0) { - err = 0; - opt->type = token; - } else - pr_err("wrong value, %s\n", opt_str); - break; - - case Opt_wbr_create: - u.create = &opt->wbr_create; - u.create->wbr_create - = au_wbr_create_val(a->args[0].from, u.create); - if (u.create->wbr_create >= 0) { - err = 0; - opt->type = token; - } else - pr_err("wrong value, %s\n", opt_str); - break; - case Opt_wbr_copyup: - opt->wbr_copyup = au_wbr_copyup_val(a->args[0].from); - if (opt->wbr_copyup >= 0) { - err = 0; - opt->type = token; - } else - pr_err("wrong value, %s\n", opt_str); - break; - - case Opt_fhsm_sec: - if (unlikely(match_int(&a->args[0], &n) - || n < 0)) { - pr_err("bad integer in %s\n", opt_str); - break; - } - if (sysaufs_brs) { - opt->fhsm_second = n; - opt->type = token; - } else - pr_warn("ignored %s\n", opt_str); - err = 0; - break; - - case Opt_ignore: - pr_warn("ignored %s\n", opt_str); - /*FALLTHROUGH*/ - case Opt_ignore_silent: - skipped = 1; - err = 0; - break; - case Opt_err: - pr_err("unknown option %s\n", opt_str); - break; - } - - if (!err && !skipped) { - if (unlikely(++opt > opt_tail)) { - err = -E2BIG; - opt--; - opt->type = Opt_tail; - break; - } - opt->type = Opt_tail; - } - } - - kfree(a); - dump_opts(opts); - if (unlikely(err)) - au_opts_free(opts); - -out: - return err; -} - -static int au_opt_wbr_create(struct super_block *sb, - struct au_opt_wbr_create *create) -{ - int err; - struct au_sbinfo *sbinfo; - - SiMustWriteLock(sb); - - err = 1; /* handled */ - sbinfo = au_sbi(sb); - if (sbinfo->si_wbr_create_ops->fin) { - err = sbinfo->si_wbr_create_ops->fin(sb); - if (!err) - err = 1; - } - - sbinfo->si_wbr_create = create->wbr_create; - sbinfo->si_wbr_create_ops = au_wbr_create_ops + create->wbr_create; - switch (create->wbr_create) { - case AuWbrCreate_MFSRRV: - case AuWbrCreate_MFSRR: - case AuWbrCreate_PMFSRR: - case AuWbrCreate_PMFSRRV: - sbinfo->si_wbr_mfs.mfsrr_watermark = create->mfsrr_watermark; - /*FALLTHROUGH*/ - case AuWbrCreate_MFS: - case AuWbrCreate_MFSV: - case AuWbrCreate_PMFS: - case AuWbrCreate_PMFSV: - sbinfo->si_wbr_mfs.mfs_expire - = msecs_to_jiffies(create->mfs_second * MSEC_PER_SEC); - break; - } - - if (sbinfo->si_wbr_create_ops->init) - sbinfo->si_wbr_create_ops->init(sb); /* ignore */ - - return err; -} - -/* - * returns, - * plus: processed without an error - * zero: unprocessed - */ -static int au_opt_simple(struct super_block *sb, struct au_opt *opt, - struct au_opts *opts) -{ - int err; - struct au_sbinfo *sbinfo; - - SiMustWriteLock(sb); - - err = 1; /* handled */ - sbinfo = au_sbi(sb); - switch (opt->type) { - case Opt_udba: - sbinfo->si_mntflags &= ~AuOptMask_UDBA; - sbinfo->si_mntflags |= opt->udba; - opts->given_udba |= opt->udba; - break; - - case Opt_plink: - au_opt_set(sbinfo->si_mntflags, PLINK); - break; - case Opt_noplink: - if (au_opt_test(sbinfo->si_mntflags, PLINK)) - au_plink_put(sb, /*verbose*/1); - au_opt_clr(sbinfo->si_mntflags, PLINK); - break; - case Opt_list_plink: - if (au_opt_test(sbinfo->si_mntflags, PLINK)) - au_plink_list(sb); - break; - - case Opt_dio: - au_opt_set(sbinfo->si_mntflags, DIO); - au_fset_opts(opts->flags, REFRESH_DYAOP); - break; - case Opt_nodio: - au_opt_clr(sbinfo->si_mntflags, DIO); - au_fset_opts(opts->flags, REFRESH_DYAOP); - break; - - case Opt_fhsm_sec: - au_fhsm_set(sbinfo, opt->fhsm_second); - break; - - case Opt_diropq_a: - au_opt_set(sbinfo->si_mntflags, ALWAYS_DIROPQ); - break; - case Opt_diropq_w: - au_opt_clr(sbinfo->si_mntflags, ALWAYS_DIROPQ); - break; - - case Opt_warn_perm: - au_opt_set(sbinfo->si_mntflags, WARN_PERM); - break; - case Opt_nowarn_perm: - au_opt_clr(sbinfo->si_mntflags, WARN_PERM); - break; - - case Opt_verbose: - au_opt_set(sbinfo->si_mntflags, VERBOSE); - break; - case Opt_noverbose: - au_opt_clr(sbinfo->si_mntflags, VERBOSE); - break; - - case Opt_sum: - au_opt_set(sbinfo->si_mntflags, SUM); - break; - case Opt_wsum: - au_opt_clr(sbinfo->si_mntflags, SUM); - au_opt_set(sbinfo->si_mntflags, SUM_W); - case Opt_nosum: - au_opt_clr(sbinfo->si_mntflags, SUM); - au_opt_clr(sbinfo->si_mntflags, SUM_W); - break; - - case Opt_wbr_create: - err = au_opt_wbr_create(sb, &opt->wbr_create); - break; - case Opt_wbr_copyup: - sbinfo->si_wbr_copyup = opt->wbr_copyup; - sbinfo->si_wbr_copyup_ops = au_wbr_copyup_ops + opt->wbr_copyup; - break; - - case Opt_dirwh: - sbinfo->si_dirwh = opt->dirwh; - break; - - case Opt_rdcache: - sbinfo->si_rdcache - = msecs_to_jiffies(opt->rdcache * MSEC_PER_SEC); - break; - case Opt_rdblk: - sbinfo->si_rdblk = opt->rdblk; - break; - case Opt_rdblk_def: - sbinfo->si_rdblk = AUFS_RDBLK_DEF; - break; - case Opt_rdhash: - sbinfo->si_rdhash = opt->rdhash; - break; - case Opt_rdhash_def: - sbinfo->si_rdhash = AUFS_RDHASH_DEF; - break; - - case Opt_shwh: - au_opt_set(sbinfo->si_mntflags, SHWH); - break; - case Opt_noshwh: - au_opt_clr(sbinfo->si_mntflags, SHWH); - break; - - case Opt_dirperm1: - au_opt_set(sbinfo->si_mntflags, DIRPERM1); - break; - case Opt_nodirperm1: - au_opt_clr(sbinfo->si_mntflags, DIRPERM1); - break; - - case Opt_trunc_xino: - au_opt_set(sbinfo->si_mntflags, TRUNC_XINO); - break; - case Opt_notrunc_xino: - au_opt_clr(sbinfo->si_mntflags, TRUNC_XINO); - break; - - case Opt_trunc_xino_path: - case Opt_itrunc_xino: - err = au_xino_trunc(sb, opt->xino_itrunc.bindex); - if (!err) - err = 1; - break; - - case Opt_trunc_xib: - au_fset_opts(opts->flags, TRUNC_XIB); - break; - case Opt_notrunc_xib: - au_fclr_opts(opts->flags, TRUNC_XIB); - break; - - case Opt_acl: - sb->s_flags |= MS_POSIXACL; - break; - case Opt_noacl: - sb->s_flags &= ~MS_POSIXACL; - break; - - default: - err = 0; - break; - } - - return err; -} - -/* - * returns tri-state. - * plus: processed without an error - * zero: unprocessed - * minus: error - */ -static int au_opt_br(struct super_block *sb, struct au_opt *opt, - struct au_opts *opts) -{ - int err, do_refresh; - - err = 0; - switch (opt->type) { - case Opt_append: - opt->add.bindex = au_sbend(sb) + 1; - if (opt->add.bindex < 0) - opt->add.bindex = 0; - goto add; - case Opt_prepend: - opt->add.bindex = 0; - add: /* indented label */ - case Opt_add: - err = au_br_add(sb, &opt->add, - au_ftest_opts(opts->flags, REMOUNT)); - if (!err) { - err = 1; - au_fset_opts(opts->flags, REFRESH); - } - break; - - case Opt_del: - case Opt_idel: - err = au_br_del(sb, &opt->del, - au_ftest_opts(opts->flags, REMOUNT)); - if (!err) { - err = 1; - au_fset_opts(opts->flags, TRUNC_XIB); - au_fset_opts(opts->flags, REFRESH); - } - break; - - case Opt_mod: - case Opt_imod: - err = au_br_mod(sb, &opt->mod, - au_ftest_opts(opts->flags, REMOUNT), - &do_refresh); - if (!err) { - err = 1; - if (do_refresh) - au_fset_opts(opts->flags, REFRESH); - } - break; - } - - return err; -} - -static int au_opt_xino(struct super_block *sb, struct au_opt *opt, - struct au_opt_xino **opt_xino, - struct au_opts *opts) -{ - int err; - aufs_bindex_t bend, bindex; - struct dentry *root, *parent, *h_root; - - err = 0; - switch (opt->type) { - case Opt_xino: - err = au_xino_set(sb, &opt->xino, - !!au_ftest_opts(opts->flags, REMOUNT)); - if (unlikely(err)) - break; - - *opt_xino = &opt->xino; - au_xino_brid_set(sb, -1); - - /* safe d_parent access */ - parent = opt->xino.file->f_path.dentry->d_parent; - root = sb->s_root; - bend = au_sbend(sb); - for (bindex = 0; bindex <= bend; bindex++) { - h_root = au_h_dptr(root, bindex); - if (h_root == parent) { - au_xino_brid_set(sb, au_sbr_id(sb, bindex)); - break; - } - } - break; - - case Opt_noxino: - au_xino_clr(sb); - au_xino_brid_set(sb, -1); - *opt_xino = (void *)-1; - break; - } - - return err; -} - -int au_opts_verify(struct super_block *sb, unsigned long sb_flags, - unsigned int pending) -{ - int err, fhsm; - aufs_bindex_t bindex, bend; - unsigned char do_plink, skip, do_free, can_no_dreval; - struct au_branch *br; - struct au_wbr *wbr; - struct dentry *root, *dentry; - struct inode *dir, *h_dir; - struct au_sbinfo *sbinfo; - struct au_hinode *hdir; - - SiMustAnyLock(sb); - - sbinfo = au_sbi(sb); - AuDebugOn(!(sbinfo->si_mntflags & AuOptMask_UDBA)); - - if (!(sb_flags & MS_RDONLY)) { - if (unlikely(!au_br_writable(au_sbr_perm(sb, 0)))) - pr_warn("first branch should be rw\n"); - if (unlikely(au_opt_test(sbinfo->si_mntflags, SHWH))) - pr_warn("shwh should be used with ro\n"); - } - - if (au_opt_test((sbinfo->si_mntflags | pending), UDBA_HNOTIFY) - && !au_opt_test(sbinfo->si_mntflags, XINO)) - pr_warn("udba=*notify requires xino\n"); - - if (au_opt_test(sbinfo->si_mntflags, DIRPERM1)) - pr_warn("dirperm1 breaks the protection" - " by the permission bits on the lower branch\n"); - - err = 0; - fhsm = 0; - root = sb->s_root; - dir = d_inode(root); - do_plink = !!au_opt_test(sbinfo->si_mntflags, PLINK); - can_no_dreval = !!au_opt_test((sbinfo->si_mntflags | pending), - UDBA_NONE); - bend = au_sbend(sb); - for (bindex = 0; !err && bindex <= bend; bindex++) { - skip = 0; - h_dir = au_h_iptr(dir, bindex); - br = au_sbr(sb, bindex); - - if ((br->br_perm & AuBrAttr_ICEX) - && !h_dir->i_op->listxattr) - br->br_perm &= ~AuBrAttr_ICEX; -#if 0 - if ((br->br_perm & AuBrAttr_ICEX_SEC) - && (au_br_sb(br)->s_flags & MS_NOSEC)) - br->br_perm &= ~AuBrAttr_ICEX_SEC; -#endif - - do_free = 0; - wbr = br->br_wbr; - if (wbr) - wbr_wh_read_lock(wbr); - - if (!au_br_writable(br->br_perm)) { - do_free = !!wbr; - skip = (!wbr - || (!wbr->wbr_whbase - && !wbr->wbr_plink - && !wbr->wbr_orph)); - } else if (!au_br_wh_linkable(br->br_perm)) { - /* skip = (!br->br_whbase && !br->br_orph); */ - skip = (!wbr || !wbr->wbr_whbase); - if (skip && wbr) { - if (do_plink) - skip = !!wbr->wbr_plink; - else - skip = !wbr->wbr_plink; - } - } else { - /* skip = (br->br_whbase && br->br_ohph); */ - skip = (wbr && wbr->wbr_whbase); - if (skip) { - if (do_plink) - skip = !!wbr->wbr_plink; - else - skip = !wbr->wbr_plink; - } - } - if (wbr) - wbr_wh_read_unlock(wbr); - - if (can_no_dreval) { - dentry = br->br_path.dentry; - spin_lock(&dentry->d_lock); - if (dentry->d_flags & - (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE)) - can_no_dreval = 0; - spin_unlock(&dentry->d_lock); - } - - if (au_br_fhsm(br->br_perm)) { - fhsm++; - AuDebugOn(!br->br_fhsm); - } - - if (skip) - continue; - - hdir = au_hi(dir, bindex); - au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT); - if (wbr) - wbr_wh_write_lock(wbr); - err = au_wh_init(br, sb); - if (wbr) - wbr_wh_write_unlock(wbr); - au_hn_imtx_unlock(hdir); - - if (!err && do_free) { - kfree(wbr); - br->br_wbr = NULL; - } - } - - if (can_no_dreval) - au_fset_si(sbinfo, NO_DREVAL); - else - au_fclr_si(sbinfo, NO_DREVAL); - - if (fhsm >= 2) { - au_fset_si(sbinfo, FHSM); - for (bindex = bend; bindex >= 0; bindex--) { - br = au_sbr(sb, bindex); - if (au_br_fhsm(br->br_perm)) { - au_fhsm_set_bottom(sb, bindex); - break; - } - } - } else { - au_fclr_si(sbinfo, FHSM); - au_fhsm_set_bottom(sb, -1); - } - - return err; -} - -int au_opts_mount(struct super_block *sb, struct au_opts *opts) -{ - int err; - unsigned int tmp; - aufs_bindex_t bindex, bend; - struct au_opt *opt; - struct au_opt_xino *opt_xino, xino; - struct au_sbinfo *sbinfo; - struct au_branch *br; - struct inode *dir; - - SiMustWriteLock(sb); - - err = 0; - opt_xino = NULL; - opt = opts->opt; - while (err >= 0 && opt->type != Opt_tail) - err = au_opt_simple(sb, opt++, opts); - if (err > 0) - err = 0; - else if (unlikely(err < 0)) - goto out; - - /* disable xino and udba temporary */ - sbinfo = au_sbi(sb); - tmp = sbinfo->si_mntflags; - au_opt_clr(sbinfo->si_mntflags, XINO); - au_opt_set_udba(sbinfo->si_mntflags, UDBA_REVAL); - - opt = opts->opt; - while (err >= 0 && opt->type != Opt_tail) - err = au_opt_br(sb, opt++, opts); - if (err > 0) - err = 0; - else if (unlikely(err < 0)) - goto out; - - bend = au_sbend(sb); - if (unlikely(bend < 0)) { - err = -EINVAL; - pr_err("no branches\n"); - goto out; - } - - if (au_opt_test(tmp, XINO)) - au_opt_set(sbinfo->si_mntflags, XINO); - opt = opts->opt; - while (!err && opt->type != Opt_tail) - err = au_opt_xino(sb, opt++, &opt_xino, opts); - if (unlikely(err)) - goto out; - - err = au_opts_verify(sb, sb->s_flags, tmp); - if (unlikely(err)) - goto out; - - /* restore xino */ - if (au_opt_test(tmp, XINO) && !opt_xino) { - xino.file = au_xino_def(sb); - err = PTR_ERR(xino.file); - if (IS_ERR(xino.file)) - goto out; - - err = au_xino_set(sb, &xino, /*remount*/0); - fput(xino.file); - if (unlikely(err)) - goto out; - } - - /* restore udba */ - tmp &= AuOptMask_UDBA; - sbinfo->si_mntflags &= ~AuOptMask_UDBA; - sbinfo->si_mntflags |= tmp; - bend = au_sbend(sb); - for (bindex = 0; bindex <= bend; bindex++) { - br = au_sbr(sb, bindex); - err = au_hnotify_reset_br(tmp, br, br->br_perm); - if (unlikely(err)) - AuIOErr("hnotify failed on br %d, %d, ignored\n", - bindex, err); - /* go on even if err */ - } - if (au_opt_test(tmp, UDBA_HNOTIFY)) { - dir = d_inode(sb->s_root); - au_hn_reset(dir, au_hi_flags(dir, /*isdir*/1) & ~AuHi_XINO); - } - -out: - return err; -} - -int au_opts_remount(struct super_block *sb, struct au_opts *opts) -{ - int err, rerr; - unsigned char no_dreval; - struct inode *dir; - struct au_opt_xino *opt_xino; - struct au_opt *opt; - struct au_sbinfo *sbinfo; - - SiMustWriteLock(sb); - - err = 0; - dir = d_inode(sb->s_root); - sbinfo = au_sbi(sb); - opt_xino = NULL; - opt = opts->opt; - while (err >= 0 && opt->type != Opt_tail) { - err = au_opt_simple(sb, opt, opts); - if (!err) - err = au_opt_br(sb, opt, opts); - if (!err) - err = au_opt_xino(sb, opt, &opt_xino, opts); - opt++; - } - if (err > 0) - err = 0; - AuTraceErr(err); - /* go on even err */ - - no_dreval = !!au_ftest_si(sbinfo, NO_DREVAL); - rerr = au_opts_verify(sb, opts->sb_flags, /*pending*/0); - if (unlikely(rerr && !err)) - err = rerr; - - if (no_dreval != !!au_ftest_si(sbinfo, NO_DREVAL)) - au_fset_opts(opts->flags, REFRESH_IDOP); - - if (au_ftest_opts(opts->flags, TRUNC_XIB)) { - rerr = au_xib_trunc(sb); - if (unlikely(rerr && !err)) - err = rerr; - } - - /* will be handled by the caller */ - if (!au_ftest_opts(opts->flags, REFRESH) - && (opts->given_udba - || au_opt_test(sbinfo->si_mntflags, XINO) - || au_ftest_opts(opts->flags, REFRESH_IDOP) - )) - au_fset_opts(opts->flags, REFRESH); - - AuDbg("status 0x%x\n", opts->flags); - return err; -} - -/* ---------------------------------------------------------------------- */ - -unsigned int au_opt_udba(struct super_block *sb) -{ - return au_mntflags(sb) & AuOptMask_UDBA; -} diff --git a/fs/aufs/opts.h b/fs/aufs/opts.h deleted file mode 100644 index 0d6c2e1c7..000000000 --- a/fs/aufs/opts.h +++ /dev/null @@ -1,198 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * mount options/flags - */ - -#ifndef __AUFS_OPTS_H__ -#define __AUFS_OPTS_H__ - -#ifdef __KERNEL__ - -#include <linux/path.h> - -struct file; -struct super_block; - -/* ---------------------------------------------------------------------- */ - -/* mount flags */ -#define AuOpt_XINO 1 /* external inode number bitmap - and translation table */ -#define AuOpt_TRUNC_XINO (1 << 1) /* truncate xino files */ -#define AuOpt_UDBA_NONE (1 << 2) /* users direct branch access */ -#define AuOpt_UDBA_REVAL (1 << 3) -#define AuOpt_UDBA_HNOTIFY (1 << 4) -#define AuOpt_SHWH (1 << 5) /* show whiteout */ -#define AuOpt_PLINK (1 << 6) /* pseudo-link */ -#define AuOpt_DIRPERM1 (1 << 7) /* ignore the lower dir's perm - bits */ -#define AuOpt_ALWAYS_DIROPQ (1 << 9) /* policy to creating diropq */ -#define AuOpt_SUM (1 << 10) /* summation for statfs(2) */ -#define AuOpt_SUM_W (1 << 11) /* unimplemented */ -#define AuOpt_WARN_PERM (1 << 12) /* warn when add-branch */ -#define AuOpt_VERBOSE (1 << 13) /* busy inode when del-branch */ -#define AuOpt_DIO (1 << 14) /* direct io */ - -#ifndef CONFIG_AUFS_HNOTIFY -#undef AuOpt_UDBA_HNOTIFY -#define AuOpt_UDBA_HNOTIFY 0 -#endif -#ifndef CONFIG_AUFS_SHWH -#undef AuOpt_SHWH -#define AuOpt_SHWH 0 -#endif - -#define AuOpt_Def (AuOpt_XINO \ - | AuOpt_UDBA_REVAL \ - | AuOpt_PLINK \ - /* | AuOpt_DIRPERM1 */ \ - | AuOpt_WARN_PERM) -#define AuOptMask_UDBA (AuOpt_UDBA_NONE \ - | AuOpt_UDBA_REVAL \ - | AuOpt_UDBA_HNOTIFY) - -#define au_opt_test(flags, name) (flags & AuOpt_##name) -#define au_opt_set(flags, name) do { \ - BUILD_BUG_ON(AuOpt_##name & AuOptMask_UDBA); \ - ((flags) |= AuOpt_##name); \ -} while (0) -#define au_opt_set_udba(flags, name) do { \ - (flags) &= ~AuOptMask_UDBA; \ - ((flags) |= AuOpt_##name); \ -} while (0) -#define au_opt_clr(flags, name) do { \ - ((flags) &= ~AuOpt_##name); \ -} while (0) - -static inline unsigned int au_opts_plink(unsigned int mntflags) -{ -#ifdef CONFIG_PROC_FS - return mntflags; -#else - return mntflags & ~AuOpt_PLINK; -#endif -} - -/* ---------------------------------------------------------------------- */ - -/* policies to select one among multiple writable branches */ -enum { - AuWbrCreate_TDP, /* top down parent */ - AuWbrCreate_RR, /* round robin */ - AuWbrCreate_MFS, /* most free space */ - AuWbrCreate_MFSV, /* mfs with seconds */ - AuWbrCreate_MFSRR, /* mfs then rr */ - AuWbrCreate_MFSRRV, /* mfs then rr with seconds */ - AuWbrCreate_PMFS, /* parent and mfs */ - AuWbrCreate_PMFSV, /* parent and mfs with seconds */ - AuWbrCreate_PMFSRR, /* parent, mfs and round-robin */ - AuWbrCreate_PMFSRRV, /* plus seconds */ - - AuWbrCreate_Def = AuWbrCreate_TDP -}; - -enum { - AuWbrCopyup_TDP, /* top down parent */ - AuWbrCopyup_BUP, /* bottom up parent */ - AuWbrCopyup_BU, /* bottom up */ - - AuWbrCopyup_Def = AuWbrCopyup_TDP -}; - -/* ---------------------------------------------------------------------- */ - -struct au_opt_add { - aufs_bindex_t bindex; - char *pathname; - int perm; - struct path path; -}; - -struct au_opt_del { - char *pathname; - struct path h_path; -}; - -struct au_opt_mod { - char *path; - int perm; - struct dentry *h_root; -}; - -struct au_opt_xino { - char *path; - struct file *file; -}; - -struct au_opt_xino_itrunc { - aufs_bindex_t bindex; -}; - -struct au_opt_wbr_create { - int wbr_create; - int mfs_second; - unsigned long long mfsrr_watermark; -}; - -struct au_opt { - int type; - union { - struct au_opt_xino xino; - struct au_opt_xino_itrunc xino_itrunc; - struct au_opt_add add; - struct au_opt_del del; - struct au_opt_mod mod; - int dirwh; - int rdcache; - unsigned int rdblk; - unsigned int rdhash; - int udba; - struct au_opt_wbr_create wbr_create; - int wbr_copyup; - unsigned int fhsm_second; - }; -}; - -/* opts flags */ -#define AuOpts_REMOUNT 1 -#define AuOpts_REFRESH (1 << 1) -#define AuOpts_TRUNC_XIB (1 << 2) -#define AuOpts_REFRESH_DYAOP (1 << 3) -#define AuOpts_REFRESH_IDOP (1 << 4) -#define au_ftest_opts(flags, name) ((flags) & AuOpts_##name) -#define au_fset_opts(flags, name) \ - do { (flags) |= AuOpts_##name; } while (0) -#define au_fclr_opts(flags, name) \ - do { (flags) &= ~AuOpts_##name; } while (0) - -struct au_opts { - struct au_opt *opt; - int max_opt; - - unsigned int given_udba; - unsigned int flags; - unsigned long sb_flags; -}; - -/* ---------------------------------------------------------------------- */ - -/* opts.c */ -void au_optstr_br_perm(au_br_perm_str_t *str, int perm); -const char *au_optstr_udba(int udba); -const char *au_optstr_wbr_copyup(int wbr_copyup); -const char *au_optstr_wbr_create(int wbr_create); - -void au_opts_free(struct au_opts *opts); -int au_opts_parse(struct super_block *sb, char *str, struct au_opts *opts); -int au_opts_verify(struct super_block *sb, unsigned long sb_flags, - unsigned int pending); -int au_opts_mount(struct super_block *sb, struct au_opts *opts); -int au_opts_remount(struct super_block *sb, struct au_opts *opts); - -unsigned int au_opt_udba(struct super_block *sb); - -#endif /* __KERNEL__ */ -#endif /* __AUFS_OPTS_H__ */ diff --git a/fs/aufs/plink.c b/fs/aufs/plink.c deleted file mode 100644 index 6fdab1e0e..000000000 --- a/fs/aufs/plink.c +++ /dev/null @@ -1,515 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * pseudo-link - */ - -#include "aufs.h" - -/* - * the pseudo-link maintenance mode. - * during a user process maintains the pseudo-links, - * prohibit adding a new plink and branch manipulation. - * - * Flags - * NOPLM: - * For entry functions which will handle plink, and i_mutex is already held - * in VFS. - * They cannot wait and should return an error at once. - * Callers has to check the error. - * NOPLMW: - * For entry functions which will handle plink, but i_mutex is not held - * in VFS. - * They can wait the plink maintenance mode to finish. - * - * They behave like F_SETLK and F_SETLKW. - * If the caller never handle plink, then both flags are unnecessary. - */ - -int au_plink_maint(struct super_block *sb, int flags) -{ - int err; - pid_t pid, ppid; - struct au_sbinfo *sbi; - - SiMustAnyLock(sb); - - err = 0; - if (!au_opt_test(au_mntflags(sb), PLINK)) - goto out; - - sbi = au_sbi(sb); - pid = sbi->si_plink_maint_pid; - if (!pid || pid == current->pid) - goto out; - - /* todo: it highly depends upon /sbin/mount.aufs */ - rcu_read_lock(); - ppid = task_pid_vnr(rcu_dereference(current->real_parent)); - rcu_read_unlock(); - if (pid == ppid) - goto out; - - if (au_ftest_lock(flags, NOPLMW)) { - /* if there is no i_mutex lock in VFS, we don't need to wait */ - /* AuDebugOn(!lockdep_depth(current)); */ - while (sbi->si_plink_maint_pid) { - si_read_unlock(sb); - /* gave up wake_up_bit() */ - wait_event(sbi->si_plink_wq, !sbi->si_plink_maint_pid); - - if (au_ftest_lock(flags, FLUSH)) - au_nwt_flush(&sbi->si_nowait); - si_noflush_read_lock(sb); - } - } else if (au_ftest_lock(flags, NOPLM)) { - AuDbg("ppid %d, pid %d\n", ppid, pid); - err = -EAGAIN; - } - -out: - return err; -} - -void au_plink_maint_leave(struct au_sbinfo *sbinfo) -{ - spin_lock(&sbinfo->si_plink_maint_lock); - sbinfo->si_plink_maint_pid = 0; - spin_unlock(&sbinfo->si_plink_maint_lock); - wake_up_all(&sbinfo->si_plink_wq); -} - -int au_plink_maint_enter(struct super_block *sb) -{ - int err; - struct au_sbinfo *sbinfo; - - err = 0; - sbinfo = au_sbi(sb); - /* make sure i am the only one in this fs */ - si_write_lock(sb, AuLock_FLUSH); - if (au_opt_test(au_mntflags(sb), PLINK)) { - spin_lock(&sbinfo->si_plink_maint_lock); - if (!sbinfo->si_plink_maint_pid) - sbinfo->si_plink_maint_pid = current->pid; - else - err = -EBUSY; - spin_unlock(&sbinfo->si_plink_maint_lock); - } - si_write_unlock(sb); - - return err; -} - -/* ---------------------------------------------------------------------- */ - -#ifdef CONFIG_AUFS_DEBUG -void au_plink_list(struct super_block *sb) -{ - int i; - struct au_sbinfo *sbinfo; - struct hlist_head *plink_hlist; - struct pseudo_link *plink; - - SiMustAnyLock(sb); - - sbinfo = au_sbi(sb); - AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK)); - AuDebugOn(au_plink_maint(sb, AuLock_NOPLM)); - - for (i = 0; i < AuPlink_NHASH; i++) { - plink_hlist = &sbinfo->si_plink[i].head; - rcu_read_lock(); - hlist_for_each_entry_rcu(plink, plink_hlist, hlist) - AuDbg("%lu\n", plink->inode->i_ino); - rcu_read_unlock(); - } -} -#endif - -/* is the inode pseudo-linked? */ -int au_plink_test(struct inode *inode) -{ - int found, i; - struct au_sbinfo *sbinfo; - struct hlist_head *plink_hlist; - struct pseudo_link *plink; - - sbinfo = au_sbi(inode->i_sb); - AuRwMustAnyLock(&sbinfo->si_rwsem); - AuDebugOn(!au_opt_test(au_mntflags(inode->i_sb), PLINK)); - AuDebugOn(au_plink_maint(inode->i_sb, AuLock_NOPLM)); - - found = 0; - i = au_plink_hash(inode->i_ino); - plink_hlist = &sbinfo->si_plink[i].head; - rcu_read_lock(); - hlist_for_each_entry_rcu(plink, plink_hlist, hlist) - if (plink->inode == inode) { - found = 1; - break; - } - rcu_read_unlock(); - return found; -} - -/* ---------------------------------------------------------------------- */ - -/* - * generate a name for plink. - * the file will be stored under AUFS_WH_PLINKDIR. - */ -/* 20 is max digits length of ulong 64 */ -#define PLINK_NAME_LEN ((20 + 1) * 2) - -static int plink_name(char *name, int len, struct inode *inode, - aufs_bindex_t bindex) -{ - int rlen; - struct inode *h_inode; - - h_inode = au_h_iptr(inode, bindex); - rlen = snprintf(name, len, "%lu.%lu", inode->i_ino, h_inode->i_ino); - return rlen; -} - -struct au_do_plink_lkup_args { - struct dentry **errp; - struct qstr *tgtname; - struct dentry *h_parent; - struct au_branch *br; -}; - -static struct dentry *au_do_plink_lkup(struct qstr *tgtname, - struct dentry *h_parent, - struct au_branch *br) -{ - struct dentry *h_dentry; - struct mutex *h_mtx; - - h_mtx = &d_inode(h_parent)->i_mutex; - mutex_lock_nested(h_mtx, AuLsc_I_CHILD2); - h_dentry = vfsub_lkup_one(tgtname, h_parent); - mutex_unlock(h_mtx); - return h_dentry; -} - -static void au_call_do_plink_lkup(void *args) -{ - struct au_do_plink_lkup_args *a = args; - *a->errp = au_do_plink_lkup(a->tgtname, a->h_parent, a->br); -} - -/* lookup the plink-ed @inode under the branch at @bindex */ -struct dentry *au_plink_lkup(struct inode *inode, aufs_bindex_t bindex) -{ - struct dentry *h_dentry, *h_parent; - struct au_branch *br; - int wkq_err; - char a[PLINK_NAME_LEN]; - struct qstr tgtname = QSTR_INIT(a, 0); - - AuDebugOn(au_plink_maint(inode->i_sb, AuLock_NOPLM)); - - br = au_sbr(inode->i_sb, bindex); - h_parent = br->br_wbr->wbr_plink; - tgtname.len = plink_name(a, sizeof(a), inode, bindex); - - if (!uid_eq(current_fsuid(), GLOBAL_ROOT_UID)) { - struct au_do_plink_lkup_args args = { - .errp = &h_dentry, - .tgtname = &tgtname, - .h_parent = h_parent, - .br = br - }; - - wkq_err = au_wkq_wait(au_call_do_plink_lkup, &args); - if (unlikely(wkq_err)) - h_dentry = ERR_PTR(wkq_err); - } else - h_dentry = au_do_plink_lkup(&tgtname, h_parent, br); - - return h_dentry; -} - -/* create a pseudo-link */ -static int do_whplink(struct qstr *tgt, struct dentry *h_parent, - struct dentry *h_dentry, struct au_branch *br) -{ - int err; - struct path h_path = { - .mnt = au_br_mnt(br) - }; - struct inode *h_dir, *delegated; - - h_dir = d_inode(h_parent); - mutex_lock_nested(&h_dir->i_mutex, AuLsc_I_CHILD2); -again: - h_path.dentry = vfsub_lkup_one(tgt, h_parent); - err = PTR_ERR(h_path.dentry); - if (IS_ERR(h_path.dentry)) - goto out; - - err = 0; - /* wh.plink dir is not monitored */ - /* todo: is it really safe? */ - if (d_is_positive(h_path.dentry) - && d_inode(h_path.dentry) != d_inode(h_dentry)) { - delegated = NULL; - err = vfsub_unlink(h_dir, &h_path, &delegated, /*force*/0); - if (unlikely(err == -EWOULDBLOCK)) { - pr_warn("cannot retry for NFSv4 delegation" - " for an internal unlink\n"); - iput(delegated); - } - dput(h_path.dentry); - h_path.dentry = NULL; - if (!err) - goto again; - } - if (!err && d_is_negative(h_path.dentry)) { - delegated = NULL; - err = vfsub_link(h_dentry, h_dir, &h_path, &delegated); - if (unlikely(err == -EWOULDBLOCK)) { - pr_warn("cannot retry for NFSv4 delegation" - " for an internal link\n"); - iput(delegated); - } - } - dput(h_path.dentry); - -out: - mutex_unlock(&h_dir->i_mutex); - return err; -} - -struct do_whplink_args { - int *errp; - struct qstr *tgt; - struct dentry *h_parent; - struct dentry *h_dentry; - struct au_branch *br; -}; - -static void call_do_whplink(void *args) -{ - struct do_whplink_args *a = args; - *a->errp = do_whplink(a->tgt, a->h_parent, a->h_dentry, a->br); -} - -static int whplink(struct dentry *h_dentry, struct inode *inode, - aufs_bindex_t bindex, struct au_branch *br) -{ - int err, wkq_err; - struct au_wbr *wbr; - struct dentry *h_parent; - char a[PLINK_NAME_LEN]; - struct qstr tgtname = QSTR_INIT(a, 0); - - wbr = au_sbr(inode->i_sb, bindex)->br_wbr; - h_parent = wbr->wbr_plink; - tgtname.len = plink_name(a, sizeof(a), inode, bindex); - - /* always superio. */ - if (!uid_eq(current_fsuid(), GLOBAL_ROOT_UID)) { - struct do_whplink_args args = { - .errp = &err, - .tgt = &tgtname, - .h_parent = h_parent, - .h_dentry = h_dentry, - .br = br - }; - wkq_err = au_wkq_wait(call_do_whplink, &args); - if (unlikely(wkq_err)) - err = wkq_err; - } else - err = do_whplink(&tgtname, h_parent, h_dentry, br); - - return err; -} - -/* free a single plink */ -static void do_put_plink(struct pseudo_link *plink, int do_del) -{ - if (do_del) - hlist_del(&plink->hlist); - iput(plink->inode); - kfree(plink); -} - -static void do_put_plink_rcu(struct rcu_head *rcu) -{ - struct pseudo_link *plink; - - plink = container_of(rcu, struct pseudo_link, rcu); - iput(plink->inode); - kfree(plink); -} - -/* - * create a new pseudo-link for @h_dentry on @bindex. - * the linked inode is held in aufs @inode. - */ -void au_plink_append(struct inode *inode, aufs_bindex_t bindex, - struct dentry *h_dentry) -{ - struct super_block *sb; - struct au_sbinfo *sbinfo; - struct hlist_head *plink_hlist; - struct pseudo_link *plink, *tmp; - struct au_sphlhead *sphl; - int found, err, cnt, i; - - sb = inode->i_sb; - sbinfo = au_sbi(sb); - AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK)); - AuDebugOn(au_plink_maint(sb, AuLock_NOPLM)); - - found = au_plink_test(inode); - if (found) - return; - - i = au_plink_hash(inode->i_ino); - sphl = sbinfo->si_plink + i; - plink_hlist = &sphl->head; - tmp = kmalloc(sizeof(*plink), GFP_NOFS); - if (tmp) - tmp->inode = au_igrab(inode); - else { - err = -ENOMEM; - goto out; - } - - spin_lock(&sphl->spin); - hlist_for_each_entry(plink, plink_hlist, hlist) { - if (plink->inode == inode) { - found = 1; - break; - } - } - if (!found) - hlist_add_head_rcu(&tmp->hlist, plink_hlist); - spin_unlock(&sphl->spin); - if (!found) { - cnt = au_sphl_count(sphl); -#define msg "unexpectedly unblanced or too many pseudo-links" - if (cnt > AUFS_PLINK_WARN) - AuWarn1(msg ", %d\n", cnt); -#undef msg - err = whplink(h_dentry, inode, bindex, au_sbr(sb, bindex)); - } else { - do_put_plink(tmp, 0); - return; - } - -out: - if (unlikely(err)) { - pr_warn("err %d, damaged pseudo link.\n", err); - if (tmp) { - au_sphl_del_rcu(&tmp->hlist, sphl); - call_rcu(&tmp->rcu, do_put_plink_rcu); - } - } -} - -/* free all plinks */ -void au_plink_put(struct super_block *sb, int verbose) -{ - int i, warned; - struct au_sbinfo *sbinfo; - struct hlist_head *plink_hlist; - struct hlist_node *tmp; - struct pseudo_link *plink; - - SiMustWriteLock(sb); - - sbinfo = au_sbi(sb); - AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK)); - AuDebugOn(au_plink_maint(sb, AuLock_NOPLM)); - - /* no spin_lock since sbinfo is write-locked */ - warned = 0; - for (i = 0; i < AuPlink_NHASH; i++) { - plink_hlist = &sbinfo->si_plink[i].head; - if (!warned && verbose && !hlist_empty(plink_hlist)) { - pr_warn("pseudo-link is not flushed"); - warned = 1; - } - hlist_for_each_entry_safe(plink, tmp, plink_hlist, hlist) - do_put_plink(plink, 0); - INIT_HLIST_HEAD(plink_hlist); - } -} - -void au_plink_clean(struct super_block *sb, int verbose) -{ - struct dentry *root; - - root = sb->s_root; - aufs_write_lock(root); - if (au_opt_test(au_mntflags(sb), PLINK)) - au_plink_put(sb, verbose); - aufs_write_unlock(root); -} - -static int au_plink_do_half_refresh(struct inode *inode, aufs_bindex_t br_id) -{ - int do_put; - aufs_bindex_t bstart, bend, bindex; - - do_put = 0; - bstart = au_ibstart(inode); - bend = au_ibend(inode); - if (bstart >= 0) { - for (bindex = bstart; bindex <= bend; bindex++) { - if (!au_h_iptr(inode, bindex) - || au_ii_br_id(inode, bindex) != br_id) - continue; - au_set_h_iptr(inode, bindex, NULL, 0); - do_put = 1; - break; - } - if (do_put) - for (bindex = bstart; bindex <= bend; bindex++) - if (au_h_iptr(inode, bindex)) { - do_put = 0; - break; - } - } else - do_put = 1; - - return do_put; -} - -/* free the plinks on a branch specified by @br_id */ -void au_plink_half_refresh(struct super_block *sb, aufs_bindex_t br_id) -{ - struct au_sbinfo *sbinfo; - struct hlist_head *plink_hlist; - struct hlist_node *tmp; - struct pseudo_link *plink; - struct inode *inode; - int i, do_put; - - SiMustWriteLock(sb); - - sbinfo = au_sbi(sb); - AuDebugOn(!au_opt_test(au_mntflags(sb), PLINK)); - AuDebugOn(au_plink_maint(sb, AuLock_NOPLM)); - - /* no spin_lock since sbinfo is write-locked */ - for (i = 0; i < AuPlink_NHASH; i++) { - plink_hlist = &sbinfo->si_plink[i].head; - hlist_for_each_entry_safe(plink, tmp, plink_hlist, hlist) { - inode = au_igrab(plink->inode); - ii_write_lock_child(inode); - do_put = au_plink_do_half_refresh(inode, br_id); - if (do_put) - do_put_plink(plink, 1); - ii_write_unlock(inode); - iput(inode); - } - } -} diff --git a/fs/aufs/poll.c b/fs/aufs/poll.c deleted file mode 100644 index dd2baf5dc..000000000 --- a/fs/aufs/poll.c +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * poll operation - * There is only one filesystem which implements ->poll operation, currently. - */ - -#include "aufs.h" - -unsigned int aufs_poll(struct file *file, poll_table *wait) -{ - unsigned int mask; - int err; - struct file *h_file; - struct super_block *sb; - - /* We should pretend an error happened. */ - mask = POLLERR /* | POLLIN | POLLOUT */; - sb = file->f_path.dentry->d_sb; - si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLMW); - - h_file = au_read_pre(file, /*keep_fi*/0); - err = PTR_ERR(h_file); - if (IS_ERR(h_file)) - goto out; - - /* it is not an error if h_file has no operation */ - mask = DEFAULT_POLLMASK; - if (h_file->f_op->poll) - mask = h_file->f_op->poll(h_file, wait); - fput(h_file); /* instead of au_read_post() */ - -out: - si_read_unlock(sb); - AuTraceErr((int)mask); - return mask; -} diff --git a/fs/aufs/posix_acl.c b/fs/aufs/posix_acl.c deleted file mode 100644 index a3c442c08..000000000 --- a/fs/aufs/posix_acl.c +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright (C) 2014-2016 Junjiro R. Okajima - */ - -/* - * posix acl operations - */ - -#include <linux/fs.h> -#include "aufs.h" - -struct posix_acl *aufs_get_acl(struct inode *inode, int type) -{ - struct posix_acl *acl; - int err; - aufs_bindex_t bindex; - struct inode *h_inode; - struct super_block *sb; - - acl = NULL; - sb = inode->i_sb; - si_read_lock(sb, AuLock_FLUSH); - ii_read_lock_child(inode); - if (!(sb->s_flags & MS_POSIXACL)) - goto out; - - bindex = au_ibstart(inode); - h_inode = au_h_iptr(inode, bindex); - if (unlikely(!h_inode - || ((h_inode->i_mode & S_IFMT) - != (inode->i_mode & S_IFMT)))) { - err = au_busy_or_stale(); - acl = ERR_PTR(err); - goto out; - } - - /* always topmost only */ - acl = get_acl(h_inode, type); - -out: - ii_read_unlock(inode); - si_read_unlock(sb); - - AuTraceErrPtr(acl); - return acl; -} - -int aufs_set_acl(struct inode *inode, struct posix_acl *acl, int type) -{ - int err; - ssize_t ssz; - struct dentry *dentry; - struct au_srxattr arg = { - .type = AU_ACL_SET, - .u.acl_set = { - .acl = acl, - .type = type - }, - }; - - mutex_lock(&inode->i_mutex); - if (inode->i_ino == AUFS_ROOT_INO) - dentry = dget(inode->i_sb->s_root); - else { - dentry = d_find_alias(inode); - if (!dentry) - dentry = d_find_any_alias(inode); - if (!dentry) { - pr_warn("cannot handle this inode, " - "please report to aufs-users ML\n"); - err = -ENOENT; - goto out; - } - } - - ssz = au_srxattr(dentry, &arg); - dput(dentry); - err = ssz; - if (ssz >= 0) - err = 0; - -out: - mutex_unlock(&inode->i_mutex); - return err; -} diff --git a/fs/aufs/procfs.c b/fs/aufs/procfs.c deleted file mode 100644 index 2c8893edf..000000000 --- a/fs/aufs/procfs.c +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright (C) 2010-2016 Junjiro R. Okajima - */ - -/* - * procfs interfaces - */ - -#include <linux/proc_fs.h> -#include "aufs.h" - -static int au_procfs_plm_release(struct inode *inode, struct file *file) -{ - struct au_sbinfo *sbinfo; - - sbinfo = file->private_data; - if (sbinfo) { - au_plink_maint_leave(sbinfo); - kobject_put(&sbinfo->si_kobj); - } - - return 0; -} - -static void au_procfs_plm_write_clean(struct file *file) -{ - struct au_sbinfo *sbinfo; - - sbinfo = file->private_data; - if (sbinfo) - au_plink_clean(sbinfo->si_sb, /*verbose*/0); -} - -static int au_procfs_plm_write_si(struct file *file, unsigned long id) -{ - int err; - struct super_block *sb; - struct au_sbinfo *sbinfo; - - err = -EBUSY; - if (unlikely(file->private_data)) - goto out; - - sb = NULL; - /* don't use au_sbilist_lock() here */ - spin_lock(&au_sbilist.spin); - list_for_each_entry(sbinfo, &au_sbilist.head, si_list) - if (id == sysaufs_si_id(sbinfo)) { - kobject_get(&sbinfo->si_kobj); - sb = sbinfo->si_sb; - break; - } - spin_unlock(&au_sbilist.spin); - - err = -EINVAL; - if (unlikely(!sb)) - goto out; - - err = au_plink_maint_enter(sb); - if (!err) - /* keep kobject_get() */ - file->private_data = sbinfo; - else - kobject_put(&sbinfo->si_kobj); -out: - return err; -} - -/* - * Accept a valid "si=xxxx" only. - * Once it is accepted successfully, accept "clean" too. - */ -static ssize_t au_procfs_plm_write(struct file *file, const char __user *ubuf, - size_t count, loff_t *ppos) -{ - ssize_t err; - unsigned long id; - /* last newline is allowed */ - char buf[3 + sizeof(unsigned long) * 2 + 1]; - - err = -EACCES; - if (unlikely(!capable(CAP_SYS_ADMIN))) - goto out; - - err = -EINVAL; - if (unlikely(count > sizeof(buf))) - goto out; - - err = copy_from_user(buf, ubuf, count); - if (unlikely(err)) { - err = -EFAULT; - goto out; - } - buf[count] = 0; - - err = -EINVAL; - if (!strcmp("clean", buf)) { - au_procfs_plm_write_clean(file); - goto out_success; - } else if (unlikely(strncmp("si=", buf, 3))) - goto out; - - err = kstrtoul(buf + 3, 16, &id); - if (unlikely(err)) - goto out; - - err = au_procfs_plm_write_si(file, id); - if (unlikely(err)) - goto out; - -out_success: - err = count; /* success */ -out: - return err; -} - -static const struct file_operations au_procfs_plm_fop = { - .write = au_procfs_plm_write, - .release = au_procfs_plm_release, - .owner = THIS_MODULE -}; - -/* ---------------------------------------------------------------------- */ - -static struct proc_dir_entry *au_procfs_dir; - -void au_procfs_fin(void) -{ - remove_proc_entry(AUFS_PLINK_MAINT_NAME, au_procfs_dir); - remove_proc_entry(AUFS_PLINK_MAINT_DIR, NULL); -} - -int __init au_procfs_init(void) -{ - int err; - struct proc_dir_entry *entry; - - err = -ENOMEM; - au_procfs_dir = proc_mkdir(AUFS_PLINK_MAINT_DIR, NULL); - if (unlikely(!au_procfs_dir)) - goto out; - - entry = proc_create(AUFS_PLINK_MAINT_NAME, S_IFREG | S_IWUSR, - au_procfs_dir, &au_procfs_plm_fop); - if (unlikely(!entry)) - goto out_dir; - - err = 0; - goto out; /* success */ - - -out_dir: - remove_proc_entry(AUFS_PLINK_MAINT_DIR, NULL); -out: - return err; -} diff --git a/fs/aufs/rdu.c b/fs/aufs/rdu.c deleted file mode 100644 index a9e9e9893..000000000 --- a/fs/aufs/rdu.c +++ /dev/null @@ -1,375 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * readdir in userspace. - */ - -#include <linux/compat.h> -#include <linux/fs_stack.h> -#include <linux/security.h> -#include "aufs.h" - -/* bits for struct aufs_rdu.flags */ -#define AuRdu_CALLED 1 -#define AuRdu_CONT (1 << 1) -#define AuRdu_FULL (1 << 2) -#define au_ftest_rdu(flags, name) ((flags) & AuRdu_##name) -#define au_fset_rdu(flags, name) \ - do { (flags) |= AuRdu_##name; } while (0) -#define au_fclr_rdu(flags, name) \ - do { (flags) &= ~AuRdu_##name; } while (0) - -struct au_rdu_arg { - struct dir_context ctx; - struct aufs_rdu *rdu; - union au_rdu_ent_ul ent; - unsigned long end; - - struct super_block *sb; - int err; -}; - -static int au_rdu_fill(struct dir_context *ctx, const char *name, int nlen, - loff_t offset, u64 h_ino, unsigned int d_type) -{ - int err, len; - struct au_rdu_arg *arg = container_of(ctx, struct au_rdu_arg, ctx); - struct aufs_rdu *rdu = arg->rdu; - struct au_rdu_ent ent; - - err = 0; - arg->err = 0; - au_fset_rdu(rdu->cookie.flags, CALLED); - len = au_rdu_len(nlen); - if (arg->ent.ul + len < arg->end) { - ent.ino = h_ino; - ent.bindex = rdu->cookie.bindex; - ent.type = d_type; - ent.nlen = nlen; - if (unlikely(nlen > AUFS_MAX_NAMELEN)) - ent.type = DT_UNKNOWN; - - /* unnecessary to support mmap_sem since this is a dir */ - err = -EFAULT; - if (copy_to_user(arg->ent.e, &ent, sizeof(ent))) - goto out; - if (copy_to_user(arg->ent.e->name, name, nlen)) - goto out; - /* the terminating NULL */ - if (__put_user(0, arg->ent.e->name + nlen)) - goto out; - err = 0; - /* AuDbg("%p, %.*s\n", arg->ent.p, nlen, name); */ - arg->ent.ul += len; - rdu->rent++; - } else { - err = -EFAULT; - au_fset_rdu(rdu->cookie.flags, FULL); - rdu->full = 1; - rdu->tail = arg->ent; - } - -out: - /* AuTraceErr(err); */ - return err; -} - -static int au_rdu_do(struct file *h_file, struct au_rdu_arg *arg) -{ - int err; - loff_t offset; - struct au_rdu_cookie *cookie = &arg->rdu->cookie; - - /* we don't have to care (FMODE_32BITHASH | FMODE_64BITHASH) for ext4 */ - offset = vfsub_llseek(h_file, cookie->h_pos, SEEK_SET); - err = offset; - if (unlikely(offset != cookie->h_pos)) - goto out; - - err = 0; - do { - arg->err = 0; - au_fclr_rdu(cookie->flags, CALLED); - /* smp_mb(); */ - err = vfsub_iterate_dir(h_file, &arg->ctx); - if (err >= 0) - err = arg->err; - } while (!err - && au_ftest_rdu(cookie->flags, CALLED) - && !au_ftest_rdu(cookie->flags, FULL)); - cookie->h_pos = h_file->f_pos; - -out: - AuTraceErr(err); - return err; -} - -static int au_rdu(struct file *file, struct aufs_rdu *rdu) -{ - int err; - aufs_bindex_t bend; - struct au_rdu_arg arg = { - .ctx = { - .actor = au_rdu_fill - } - }; - struct dentry *dentry; - struct inode *inode; - struct file *h_file; - struct au_rdu_cookie *cookie = &rdu->cookie; - - err = !access_ok(VERIFY_WRITE, rdu->ent.e, rdu->sz); - if (unlikely(err)) { - err = -EFAULT; - AuTraceErr(err); - goto out; - } - rdu->rent = 0; - rdu->tail = rdu->ent; - rdu->full = 0; - arg.rdu = rdu; - arg.ent = rdu->ent; - arg.end = arg.ent.ul; - arg.end += rdu->sz; - - err = -ENOTDIR; - if (unlikely(!file->f_op->iterate)) - goto out; - - err = security_file_permission(file, MAY_READ); - AuTraceErr(err); - if (unlikely(err)) - goto out; - - dentry = file->f_path.dentry; - inode = d_inode(dentry); -#if 1 - mutex_lock(&inode->i_mutex); -#else - err = mutex_lock_killable(&inode->i_mutex); - AuTraceErr(err); - if (unlikely(err)) - goto out; -#endif - - arg.sb = inode->i_sb; - err = si_read_lock(arg.sb, AuLock_FLUSH | AuLock_NOPLM); - if (unlikely(err)) - goto out_mtx; - err = au_alive_dir(dentry); - if (unlikely(err)) - goto out_si; - /* todo: reval? */ - fi_read_lock(file); - - err = -EAGAIN; - if (unlikely(au_ftest_rdu(cookie->flags, CONT) - && cookie->generation != au_figen(file))) - goto out_unlock; - - err = 0; - if (!rdu->blk) { - rdu->blk = au_sbi(arg.sb)->si_rdblk; - if (!rdu->blk) - rdu->blk = au_dir_size(file, /*dentry*/NULL); - } - bend = au_fbstart(file); - if (cookie->bindex < bend) - cookie->bindex = bend; - bend = au_fbend_dir(file); - /* AuDbg("b%d, b%d\n", cookie->bindex, bend); */ - for (; !err && cookie->bindex <= bend; - cookie->bindex++, cookie->h_pos = 0) { - h_file = au_hf_dir(file, cookie->bindex); - if (!h_file) - continue; - - au_fclr_rdu(cookie->flags, FULL); - err = au_rdu_do(h_file, &arg); - AuTraceErr(err); - if (unlikely(au_ftest_rdu(cookie->flags, FULL) || err)) - break; - } - AuDbg("rent %llu\n", rdu->rent); - - if (!err && !au_ftest_rdu(cookie->flags, CONT)) { - rdu->shwh = !!au_opt_test(au_sbi(arg.sb)->si_mntflags, SHWH); - au_fset_rdu(cookie->flags, CONT); - cookie->generation = au_figen(file); - } - - ii_read_lock_child(inode); - fsstack_copy_attr_atime(inode, au_h_iptr(inode, au_ibstart(inode))); - ii_read_unlock(inode); - -out_unlock: - fi_read_unlock(file); -out_si: - si_read_unlock(arg.sb); -out_mtx: - mutex_unlock(&inode->i_mutex); -out: - AuTraceErr(err); - return err; -} - -static int au_rdu_ino(struct file *file, struct aufs_rdu *rdu) -{ - int err; - ino_t ino; - unsigned long long nent; - union au_rdu_ent_ul *u; - struct au_rdu_ent ent; - struct super_block *sb; - - err = 0; - nent = rdu->nent; - u = &rdu->ent; - sb = file->f_path.dentry->d_sb; - si_read_lock(sb, AuLock_FLUSH); - while (nent-- > 0) { - /* unnecessary to support mmap_sem since this is a dir */ - err = copy_from_user(&ent, u->e, sizeof(ent)); - if (!err) - err = !access_ok(VERIFY_WRITE, &u->e->ino, sizeof(ino)); - if (unlikely(err)) { - err = -EFAULT; - AuTraceErr(err); - break; - } - - /* AuDbg("b%d, i%llu\n", ent.bindex, ent.ino); */ - if (!ent.wh) - err = au_ino(sb, ent.bindex, ent.ino, ent.type, &ino); - else - err = au_wh_ino(sb, ent.bindex, ent.ino, ent.type, - &ino); - if (unlikely(err)) { - AuTraceErr(err); - break; - } - - err = __put_user(ino, &u->e->ino); - if (unlikely(err)) { - err = -EFAULT; - AuTraceErr(err); - break; - } - u->ul += au_rdu_len(ent.nlen); - } - si_read_unlock(sb); - - return err; -} - -/* ---------------------------------------------------------------------- */ - -static int au_rdu_verify(struct aufs_rdu *rdu) -{ - AuDbg("rdu{%llu, %p, %u | %u | %llu, %u, %u | " - "%llu, b%d, 0x%x, g%u}\n", - rdu->sz, rdu->ent.e, rdu->verify[AufsCtlRduV_SZ], - rdu->blk, - rdu->rent, rdu->shwh, rdu->full, - rdu->cookie.h_pos, rdu->cookie.bindex, rdu->cookie.flags, - rdu->cookie.generation); - - if (rdu->verify[AufsCtlRduV_SZ] == sizeof(*rdu)) - return 0; - - AuDbg("%u:%u\n", - rdu->verify[AufsCtlRduV_SZ], (unsigned int)sizeof(*rdu)); - return -EINVAL; -} - -long au_rdu_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - long err, e; - struct aufs_rdu rdu; - void __user *p = (void __user *)arg; - - err = copy_from_user(&rdu, p, sizeof(rdu)); - if (unlikely(err)) { - err = -EFAULT; - AuTraceErr(err); - goto out; - } - err = au_rdu_verify(&rdu); - if (unlikely(err)) - goto out; - - switch (cmd) { - case AUFS_CTL_RDU: - err = au_rdu(file, &rdu); - if (unlikely(err)) - break; - - e = copy_to_user(p, &rdu, sizeof(rdu)); - if (unlikely(e)) { - err = -EFAULT; - AuTraceErr(err); - } - break; - case AUFS_CTL_RDU_INO: - err = au_rdu_ino(file, &rdu); - break; - - default: - /* err = -ENOTTY; */ - err = -EINVAL; - } - -out: - AuTraceErr(err); - return err; -} - -#ifdef CONFIG_COMPAT -long au_rdu_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - long err, e; - struct aufs_rdu rdu; - void __user *p = compat_ptr(arg); - - /* todo: get_user()? */ - err = copy_from_user(&rdu, p, sizeof(rdu)); - if (unlikely(err)) { - err = -EFAULT; - AuTraceErr(err); - goto out; - } - rdu.ent.e = compat_ptr(rdu.ent.ul); - err = au_rdu_verify(&rdu); - if (unlikely(err)) - goto out; - - switch (cmd) { - case AUFS_CTL_RDU: - err = au_rdu(file, &rdu); - if (unlikely(err)) - break; - - rdu.ent.ul = ptr_to_compat(rdu.ent.e); - rdu.tail.ul = ptr_to_compat(rdu.tail.e); - e = copy_to_user(p, &rdu, sizeof(rdu)); - if (unlikely(e)) { - err = -EFAULT; - AuTraceErr(err); - } - break; - case AUFS_CTL_RDU_INO: - err = au_rdu_ino(file, &rdu); - break; - - default: - /* err = -ENOTTY; */ - err = -EINVAL; - } - -out: - AuTraceErr(err); - return err; -} -#endif diff --git a/fs/aufs/rwsem.h b/fs/aufs/rwsem.h deleted file mode 100644 index ef50c2ccb..000000000 --- a/fs/aufs/rwsem.h +++ /dev/null @@ -1,178 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * simple read-write semaphore wrappers - */ - -#ifndef __AUFS_RWSEM_H__ -#define __AUFS_RWSEM_H__ - -#ifdef __KERNEL__ - -#include "debug.h" - -struct au_rwsem { - struct rw_semaphore rwsem; -#ifdef CONFIG_AUFS_DEBUG - /* just for debugging, not almighty counter */ - atomic_t rcnt, wcnt; -#endif -}; - -#ifdef CONFIG_AUFS_DEBUG -#define AuDbgCntInit(rw) do { \ - atomic_set(&(rw)->rcnt, 0); \ - atomic_set(&(rw)->wcnt, 0); \ - smp_mb(); /* atomic set */ \ -} while (0) - -#define AuDbgRcntInc(rw) atomic_inc(&(rw)->rcnt) -#define AuDbgRcntDec(rw) WARN_ON(atomic_dec_return(&(rw)->rcnt) < 0) -#define AuDbgWcntInc(rw) atomic_inc(&(rw)->wcnt) -#define AuDbgWcntDec(rw) WARN_ON(atomic_dec_return(&(rw)->wcnt) < 0) -#else -#define AuDbgCntInit(rw) do {} while (0) -#define AuDbgRcntInc(rw) do {} while (0) -#define AuDbgRcntDec(rw) do {} while (0) -#define AuDbgWcntInc(rw) do {} while (0) -#define AuDbgWcntDec(rw) do {} while (0) -#endif /* CONFIG_AUFS_DEBUG */ - -/* to debug easier, do not make them inlined functions */ -#define AuRwMustNoWaiters(rw) AuDebugOn(!list_empty(&(rw)->rwsem.wait_list)) -/* rwsem_is_locked() is unusable */ -#define AuRwMustReadLock(rw) AuDebugOn(atomic_read(&(rw)->rcnt) <= 0) -#define AuRwMustWriteLock(rw) AuDebugOn(atomic_read(&(rw)->wcnt) <= 0) -#define AuRwMustAnyLock(rw) AuDebugOn(atomic_read(&(rw)->rcnt) <= 0 \ - && atomic_read(&(rw)->wcnt) <= 0) -#define AuRwDestroy(rw) AuDebugOn(atomic_read(&(rw)->rcnt) \ - || atomic_read(&(rw)->wcnt)) - -#define au_rw_class(rw, key) lockdep_set_class(&(rw)->rwsem, key) - -static inline void au_rw_init(struct au_rwsem *rw) -{ - AuDbgCntInit(rw); - init_rwsem(&rw->rwsem); -} - -static inline void au_rw_init_wlock(struct au_rwsem *rw) -{ - au_rw_init(rw); - down_write(&rw->rwsem); - AuDbgWcntInc(rw); -} - -static inline void au_rw_init_wlock_nested(struct au_rwsem *rw, - unsigned int lsc) -{ - au_rw_init(rw); - down_write_nested(&rw->rwsem, lsc); - AuDbgWcntInc(rw); -} - -static inline void au_rw_read_lock(struct au_rwsem *rw) -{ - down_read(&rw->rwsem); - AuDbgRcntInc(rw); -} - -static inline void au_rw_read_lock_nested(struct au_rwsem *rw, unsigned int lsc) -{ - down_read_nested(&rw->rwsem, lsc); - AuDbgRcntInc(rw); -} - -static inline void au_rw_read_unlock(struct au_rwsem *rw) -{ - AuRwMustReadLock(rw); - AuDbgRcntDec(rw); - up_read(&rw->rwsem); -} - -static inline void au_rw_dgrade_lock(struct au_rwsem *rw) -{ - AuRwMustWriteLock(rw); - AuDbgRcntInc(rw); - AuDbgWcntDec(rw); - downgrade_write(&rw->rwsem); -} - -static inline void au_rw_write_lock(struct au_rwsem *rw) -{ - down_write(&rw->rwsem); - AuDbgWcntInc(rw); -} - -static inline void au_rw_write_lock_nested(struct au_rwsem *rw, - unsigned int lsc) -{ - down_write_nested(&rw->rwsem, lsc); - AuDbgWcntInc(rw); -} - -static inline void au_rw_write_unlock(struct au_rwsem *rw) -{ - AuRwMustWriteLock(rw); - AuDbgWcntDec(rw); - up_write(&rw->rwsem); -} - -/* why is not _nested version defined */ -static inline int au_rw_read_trylock(struct au_rwsem *rw) -{ - int ret; - - ret = down_read_trylock(&rw->rwsem); - if (ret) - AuDbgRcntInc(rw); - return ret; -} - -static inline int au_rw_write_trylock(struct au_rwsem *rw) -{ - int ret; - - ret = down_write_trylock(&rw->rwsem); - if (ret) - AuDbgWcntInc(rw); - return ret; -} - -#undef AuDbgCntInit -#undef AuDbgRcntInc -#undef AuDbgRcntDec -#undef AuDbgWcntInc -#undef AuDbgWcntDec - -#define AuSimpleLockRwsemFuncs(prefix, param, rwsem) \ -static inline void prefix##_read_lock(param) \ -{ au_rw_read_lock(rwsem); } \ -static inline void prefix##_write_lock(param) \ -{ au_rw_write_lock(rwsem); } \ -static inline int prefix##_read_trylock(param) \ -{ return au_rw_read_trylock(rwsem); } \ -static inline int prefix##_write_trylock(param) \ -{ return au_rw_write_trylock(rwsem); } -/* why is not _nested version defined */ -/* static inline void prefix##_read_trylock_nested(param, lsc) -{ au_rw_read_trylock_nested(rwsem, lsc)); } -static inline void prefix##_write_trylock_nestd(param, lsc) -{ au_rw_write_trylock_nested(rwsem, lsc); } */ - -#define AuSimpleUnlockRwsemFuncs(prefix, param, rwsem) \ -static inline void prefix##_read_unlock(param) \ -{ au_rw_read_unlock(rwsem); } \ -static inline void prefix##_write_unlock(param) \ -{ au_rw_write_unlock(rwsem); } \ -static inline void prefix##_downgrade_lock(param) \ -{ au_rw_dgrade_lock(rwsem); } - -#define AuSimpleRwsemFuncs(prefix, param, rwsem) \ - AuSimpleLockRwsemFuncs(prefix, param, rwsem) \ - AuSimpleUnlockRwsemFuncs(prefix, param, rwsem) - -#endif /* __KERNEL__ */ -#endif /* __AUFS_RWSEM_H__ */ diff --git a/fs/aufs/sbinfo.c b/fs/aufs/sbinfo.c deleted file mode 100644 index e3c58f643..000000000 --- a/fs/aufs/sbinfo.c +++ /dev/null @@ -1,353 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * superblock private data - */ - -#include "aufs.h" - -/* - * they are necessary regardless sysfs is disabled. - */ -void au_si_free(struct kobject *kobj) -{ - int i; - struct au_sbinfo *sbinfo; - char *locked __maybe_unused; /* debug only */ - - sbinfo = container_of(kobj, struct au_sbinfo, si_kobj); - for (i = 0; i < AuPlink_NHASH; i++) - AuDebugOn(!hlist_empty(&sbinfo->si_plink[i].head)); - AuDebugOn(atomic_read(&sbinfo->si_nowait.nw_len)); - - AuDebugOn(!hlist_empty(&sbinfo->si_symlink.head)); - - au_rw_write_lock(&sbinfo->si_rwsem); - au_br_free(sbinfo); - au_rw_write_unlock(&sbinfo->si_rwsem); - - AuDebugOn(radix_tree_gang_lookup - (&sbinfo->au_si_pid.tree, (void **)&locked, - /*first_index*/PID_MAX_DEFAULT - 1, - /*max_items*/sizeof(locked)/sizeof(*locked))); - - kfree(sbinfo->si_branch); - kfree(sbinfo->au_si_pid.bitmap); - mutex_destroy(&sbinfo->si_xib_mtx); - AuRwDestroy(&sbinfo->si_rwsem); - - kfree(sbinfo); -} - -int au_si_alloc(struct super_block *sb) -{ - int err, i; - struct au_sbinfo *sbinfo; - static struct lock_class_key aufs_si; - - err = -ENOMEM; - sbinfo = kzalloc(sizeof(*sbinfo), GFP_NOFS); - if (unlikely(!sbinfo)) - goto out; - - BUILD_BUG_ON(sizeof(unsigned long) != - sizeof(*sbinfo->au_si_pid.bitmap)); - sbinfo->au_si_pid.bitmap = kcalloc(BITS_TO_LONGS(PID_MAX_DEFAULT), - sizeof(*sbinfo->au_si_pid.bitmap), - GFP_NOFS); - if (unlikely(!sbinfo->au_si_pid.bitmap)) - goto out_sbinfo; - - /* will be reallocated separately */ - sbinfo->si_branch = kzalloc(sizeof(*sbinfo->si_branch), GFP_NOFS); - if (unlikely(!sbinfo->si_branch)) - goto out_pidmap; - - err = sysaufs_si_init(sbinfo); - if (unlikely(err)) - goto out_br; - - au_nwt_init(&sbinfo->si_nowait); - au_rw_init_wlock(&sbinfo->si_rwsem); - au_rw_class(&sbinfo->si_rwsem, &aufs_si); - spin_lock_init(&sbinfo->au_si_pid.tree_lock); - INIT_RADIX_TREE(&sbinfo->au_si_pid.tree, GFP_ATOMIC | __GFP_NOFAIL); - - atomic_long_set(&sbinfo->si_ninodes, 0); - atomic_long_set(&sbinfo->si_nfiles, 0); - - sbinfo->si_bend = -1; - sbinfo->si_last_br_id = AUFS_BRANCH_MAX / 2; - - sbinfo->si_wbr_copyup = AuWbrCopyup_Def; - sbinfo->si_wbr_create = AuWbrCreate_Def; - sbinfo->si_wbr_copyup_ops = au_wbr_copyup_ops + sbinfo->si_wbr_copyup; - sbinfo->si_wbr_create_ops = au_wbr_create_ops + sbinfo->si_wbr_create; - - au_fhsm_init(sbinfo); - - sbinfo->si_mntflags = au_opts_plink(AuOpt_Def); - - au_sphl_init(&sbinfo->si_symlink); - - sbinfo->si_xino_jiffy = jiffies; - sbinfo->si_xino_expire - = msecs_to_jiffies(AUFS_XINO_DEF_SEC * MSEC_PER_SEC); - mutex_init(&sbinfo->si_xib_mtx); - sbinfo->si_xino_brid = -1; - /* leave si_xib_last_pindex and si_xib_next_bit */ - - au_sphl_init(&sbinfo->si_aopen); - - sbinfo->si_rdcache = msecs_to_jiffies(AUFS_RDCACHE_DEF * MSEC_PER_SEC); - sbinfo->si_rdblk = AUFS_RDBLK_DEF; - sbinfo->si_rdhash = AUFS_RDHASH_DEF; - sbinfo->si_dirwh = AUFS_DIRWH_DEF; - - for (i = 0; i < AuPlink_NHASH; i++) - au_sphl_init(sbinfo->si_plink + i); - init_waitqueue_head(&sbinfo->si_plink_wq); - spin_lock_init(&sbinfo->si_plink_maint_lock); - - au_sphl_init(&sbinfo->si_files); - - /* with getattr by default */ - sbinfo->si_iop_array = aufs_iop; - - /* leave other members for sysaufs and si_mnt. */ - sbinfo->si_sb = sb; - sb->s_fs_info = sbinfo; - si_pid_set(sb); - return 0; /* success */ - -out_br: - kfree(sbinfo->si_branch); -out_pidmap: - kfree(sbinfo->au_si_pid.bitmap); -out_sbinfo: - kfree(sbinfo); -out: - return err; -} - -int au_sbr_realloc(struct au_sbinfo *sbinfo, int nbr) -{ - int err, sz; - struct au_branch **brp; - - AuRwMustWriteLock(&sbinfo->si_rwsem); - - err = -ENOMEM; - sz = sizeof(*brp) * (sbinfo->si_bend + 1); - if (unlikely(!sz)) - sz = sizeof(*brp); - brp = au_kzrealloc(sbinfo->si_branch, sz, sizeof(*brp) * nbr, GFP_NOFS); - if (brp) { - sbinfo->si_branch = brp; - err = 0; - } - - return err; -} - -/* ---------------------------------------------------------------------- */ - -unsigned int au_sigen_inc(struct super_block *sb) -{ - unsigned int gen; - struct inode *inode; - - SiMustWriteLock(sb); - - gen = ++au_sbi(sb)->si_generation; - au_update_digen(sb->s_root); - inode = d_inode(sb->s_root); - au_update_iigen(inode, /*half*/0); - inode->i_version++; - return gen; -} - -aufs_bindex_t au_new_br_id(struct super_block *sb) -{ - aufs_bindex_t br_id; - int i; - struct au_sbinfo *sbinfo; - - SiMustWriteLock(sb); - - sbinfo = au_sbi(sb); - for (i = 0; i <= AUFS_BRANCH_MAX; i++) { - br_id = ++sbinfo->si_last_br_id; - AuDebugOn(br_id < 0); - if (br_id && au_br_index(sb, br_id) < 0) - return br_id; - } - - return -1; -} - -/* ---------------------------------------------------------------------- */ - -/* it is ok that new 'nwt' tasks are appended while we are sleeping */ -int si_read_lock(struct super_block *sb, int flags) -{ - int err; - - err = 0; - if (au_ftest_lock(flags, FLUSH)) - au_nwt_flush(&au_sbi(sb)->si_nowait); - - si_noflush_read_lock(sb); - err = au_plink_maint(sb, flags); - if (unlikely(err)) - si_read_unlock(sb); - - return err; -} - -int si_write_lock(struct super_block *sb, int flags) -{ - int err; - - if (au_ftest_lock(flags, FLUSH)) - au_nwt_flush(&au_sbi(sb)->si_nowait); - - si_noflush_write_lock(sb); - err = au_plink_maint(sb, flags); - if (unlikely(err)) - si_write_unlock(sb); - - return err; -} - -/* dentry and super_block lock. call at entry point */ -int aufs_read_lock(struct dentry *dentry, int flags) -{ - int err; - struct super_block *sb; - - sb = dentry->d_sb; - err = si_read_lock(sb, flags); - if (unlikely(err)) - goto out; - - if (au_ftest_lock(flags, DW)) - di_write_lock_child(dentry); - else - di_read_lock_child(dentry, flags); - - if (au_ftest_lock(flags, GEN)) { - err = au_digen_test(dentry, au_sigen(sb)); - if (!au_opt_test(au_mntflags(sb), UDBA_NONE)) - AuDebugOn(!err && au_dbrange_test(dentry)); - else if (!err) - err = au_dbrange_test(dentry); - if (unlikely(err)) - aufs_read_unlock(dentry, flags); - } - -out: - return err; -} - -void aufs_read_unlock(struct dentry *dentry, int flags) -{ - if (au_ftest_lock(flags, DW)) - di_write_unlock(dentry); - else - di_read_unlock(dentry, flags); - si_read_unlock(dentry->d_sb); -} - -void aufs_write_lock(struct dentry *dentry) -{ - si_write_lock(dentry->d_sb, AuLock_FLUSH | AuLock_NOPLMW); - di_write_lock_child(dentry); -} - -void aufs_write_unlock(struct dentry *dentry) -{ - di_write_unlock(dentry); - si_write_unlock(dentry->d_sb); -} - -int aufs_read_and_write_lock2(struct dentry *d1, struct dentry *d2, int flags) -{ - int err; - unsigned int sigen; - struct super_block *sb; - - sb = d1->d_sb; - err = si_read_lock(sb, flags); - if (unlikely(err)) - goto out; - - di_write_lock2_child(d1, d2, au_ftest_lock(flags, DIRS)); - - if (au_ftest_lock(flags, GEN)) { - sigen = au_sigen(sb); - err = au_digen_test(d1, sigen); - AuDebugOn(!err && au_dbrange_test(d1)); - if (!err) { - err = au_digen_test(d2, sigen); - AuDebugOn(!err && au_dbrange_test(d2)); - } - if (unlikely(err)) - aufs_read_and_write_unlock2(d1, d2); - } - -out: - return err; -} - -void aufs_read_and_write_unlock2(struct dentry *d1, struct dentry *d2) -{ - di_write_unlock2(d1, d2); - si_read_unlock(d1->d_sb); -} - -/* ---------------------------------------------------------------------- */ - -int si_pid_test_slow(struct super_block *sb) -{ - void *p; - - rcu_read_lock(); - p = radix_tree_lookup(&au_sbi(sb)->au_si_pid.tree, current->pid); - rcu_read_unlock(); - - return (long)!!p; -} - -void si_pid_set_slow(struct super_block *sb) -{ - int err; - struct au_sbinfo *sbinfo; - - AuDebugOn(si_pid_test_slow(sb)); - - sbinfo = au_sbi(sb); - err = radix_tree_preload(GFP_NOFS | __GFP_NOFAIL); - AuDebugOn(err); - spin_lock(&sbinfo->au_si_pid.tree_lock); - err = radix_tree_insert(&sbinfo->au_si_pid.tree, current->pid, - /*any valid ptr*/sb); - spin_unlock(&sbinfo->au_si_pid.tree_lock); - AuDebugOn(err); - radix_tree_preload_end(); -} - -void si_pid_clr_slow(struct super_block *sb) -{ - void *p; - struct au_sbinfo *sbinfo; - - AuDebugOn(!si_pid_test_slow(sb)); - - sbinfo = au_sbi(sb); - spin_lock(&sbinfo->au_si_pid.tree_lock); - p = radix_tree_delete(&sbinfo->au_si_pid.tree, current->pid); - spin_unlock(&sbinfo->au_si_pid.tree_lock); -} diff --git a/fs/aufs/spl.h b/fs/aufs/spl.h deleted file mode 100644 index f9b528826..000000000 --- a/fs/aufs/spl.h +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * simple list protected by a spinlock - */ - -#ifndef __AUFS_SPL_H__ -#define __AUFS_SPL_H__ - -#ifdef __KERNEL__ - -struct au_splhead { - spinlock_t spin; - struct list_head head; -}; - -static inline void au_spl_init(struct au_splhead *spl) -{ - spin_lock_init(&spl->spin); - INIT_LIST_HEAD(&spl->head); -} - -static inline void au_spl_add(struct list_head *list, struct au_splhead *spl) -{ - spin_lock(&spl->spin); - list_add(list, &spl->head); - spin_unlock(&spl->spin); -} - -static inline void au_spl_del(struct list_head *list, struct au_splhead *spl) -{ - spin_lock(&spl->spin); - list_del(list); - spin_unlock(&spl->spin); -} - -static inline void au_spl_del_rcu(struct list_head *list, - struct au_splhead *spl) -{ - spin_lock(&spl->spin); - list_del_rcu(list); - spin_unlock(&spl->spin); -} - -/* ---------------------------------------------------------------------- */ - -struct au_sphlhead { - spinlock_t spin; - struct hlist_head head; -}; - -static inline void au_sphl_init(struct au_sphlhead *sphl) -{ - spin_lock_init(&sphl->spin); - INIT_HLIST_HEAD(&sphl->head); -} - -static inline void au_sphl_add(struct hlist_node *hlist, - struct au_sphlhead *sphl) -{ - spin_lock(&sphl->spin); - hlist_add_head(hlist, &sphl->head); - spin_unlock(&sphl->spin); -} - -static inline void au_sphl_del(struct hlist_node *hlist, - struct au_sphlhead *sphl) -{ - spin_lock(&sphl->spin); - hlist_del(hlist); - spin_unlock(&sphl->spin); -} - -static inline void au_sphl_del_rcu(struct hlist_node *hlist, - struct au_sphlhead *sphl) -{ - spin_lock(&sphl->spin); - hlist_del_rcu(hlist); - spin_unlock(&sphl->spin); -} - -static inline unsigned long au_sphl_count(struct au_sphlhead *sphl) -{ - unsigned long cnt; - struct hlist_node *pos; - - cnt = 0; - spin_lock(&sphl->spin); - hlist_for_each(pos, &sphl->head) - cnt++; - spin_unlock(&sphl->spin); - return cnt; -} - -#endif /* __KERNEL__ */ -#endif /* __AUFS_SPL_H__ */ diff --git a/fs/aufs/super.c b/fs/aufs/super.c deleted file mode 100644 index b41d78913..000000000 --- a/fs/aufs/super.c +++ /dev/null @@ -1,1026 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * mount and super_block operations - */ - -#include <linux/mm.h> -#include <linux/seq_file.h> -#include <linux/statfs.h> -#include <linux/vmalloc.h> -#include "aufs.h" - -/* - * super_operations - */ -static struct inode *aufs_alloc_inode(struct super_block *sb __maybe_unused) -{ - struct au_icntnr *c; - - c = au_cache_alloc_icntnr(); - if (c) { - au_icntnr_init(c); - c->vfs_inode.i_version = 1; /* sigen(sb); */ - c->iinfo.ii_hinode = NULL; - return &c->vfs_inode; - } - return NULL; -} - -static void aufs_destroy_inode_cb(struct rcu_head *head) -{ - struct inode *inode = container_of(head, struct inode, i_rcu); - - INIT_HLIST_HEAD(&inode->i_dentry); - au_cache_free_icntnr(container_of(inode, struct au_icntnr, vfs_inode)); -} - -static void aufs_destroy_inode(struct inode *inode) -{ - au_iinfo_fin(inode); - call_rcu(&inode->i_rcu, aufs_destroy_inode_cb); -} - -struct inode *au_iget_locked(struct super_block *sb, ino_t ino) -{ - struct inode *inode; - int err; - - inode = iget_locked(sb, ino); - if (unlikely(!inode)) { - inode = ERR_PTR(-ENOMEM); - goto out; - } - if (!(inode->i_state & I_NEW)) - goto out; - - err = au_xigen_new(inode); - if (!err) - err = au_iinfo_init(inode); - if (!err) - inode->i_version++; - else { - iget_failed(inode); - inode = ERR_PTR(err); - } - -out: - /* never return NULL */ - AuDebugOn(!inode); - AuTraceErrPtr(inode); - return inode; -} - -/* lock free root dinfo */ -static int au_show_brs(struct seq_file *seq, struct super_block *sb) -{ - int err; - aufs_bindex_t bindex, bend; - struct path path; - struct au_hdentry *hdp; - struct au_branch *br; - au_br_perm_str_t perm; - - err = 0; - bend = au_sbend(sb); - hdp = au_di(sb->s_root)->di_hdentry; - for (bindex = 0; !err && bindex <= bend; bindex++) { - br = au_sbr(sb, bindex); - path.mnt = au_br_mnt(br); - path.dentry = hdp[bindex].hd_dentry; - err = au_seq_path(seq, &path); - if (!err) { - au_optstr_br_perm(&perm, br->br_perm); - seq_printf(seq, "=%s", perm.a); - if (bindex != bend) - seq_putc(seq, ':'); - } - } - if (unlikely(err || seq_has_overflowed(seq))) - err = -E2BIG; - - return err; -} - -static void au_show_wbr_create(struct seq_file *m, int v, - struct au_sbinfo *sbinfo) -{ - const char *pat; - - AuRwMustAnyLock(&sbinfo->si_rwsem); - - seq_puts(m, ",create="); - pat = au_optstr_wbr_create(v); - switch (v) { - case AuWbrCreate_TDP: - case AuWbrCreate_RR: - case AuWbrCreate_MFS: - case AuWbrCreate_PMFS: - seq_puts(m, pat); - break; - case AuWbrCreate_MFSV: - seq_printf(m, /*pat*/"mfs:%lu", - jiffies_to_msecs(sbinfo->si_wbr_mfs.mfs_expire) - / MSEC_PER_SEC); - break; - case AuWbrCreate_PMFSV: - seq_printf(m, /*pat*/"pmfs:%lu", - jiffies_to_msecs(sbinfo->si_wbr_mfs.mfs_expire) - / MSEC_PER_SEC); - break; - case AuWbrCreate_MFSRR: - seq_printf(m, /*pat*/"mfsrr:%llu", - sbinfo->si_wbr_mfs.mfsrr_watermark); - break; - case AuWbrCreate_MFSRRV: - seq_printf(m, /*pat*/"mfsrr:%llu:%lu", - sbinfo->si_wbr_mfs.mfsrr_watermark, - jiffies_to_msecs(sbinfo->si_wbr_mfs.mfs_expire) - / MSEC_PER_SEC); - break; - case AuWbrCreate_PMFSRR: - seq_printf(m, /*pat*/"pmfsrr:%llu", - sbinfo->si_wbr_mfs.mfsrr_watermark); - break; - case AuWbrCreate_PMFSRRV: - seq_printf(m, /*pat*/"pmfsrr:%llu:%lu", - sbinfo->si_wbr_mfs.mfsrr_watermark, - jiffies_to_msecs(sbinfo->si_wbr_mfs.mfs_expire) - / MSEC_PER_SEC); - break; - } -} - -static int au_show_xino(struct seq_file *seq, struct super_block *sb) -{ -#ifdef CONFIG_SYSFS - return 0; -#else - int err; - const int len = sizeof(AUFS_XINO_FNAME) - 1; - aufs_bindex_t bindex, brid; - struct qstr *name; - struct file *f; - struct dentry *d, *h_root; - struct au_hdentry *hdp; - - AuRwMustAnyLock(&sbinfo->si_rwsem); - - err = 0; - f = au_sbi(sb)->si_xib; - if (!f) - goto out; - - /* stop printing the default xino path on the first writable branch */ - h_root = NULL; - brid = au_xino_brid(sb); - if (brid >= 0) { - bindex = au_br_index(sb, brid); - hdp = au_di(sb->s_root)->di_hdentry; - h_root = hdp[0 + bindex].hd_dentry; - } - d = f->f_path.dentry; - name = &d->d_name; - /* safe ->d_parent because the file is unlinked */ - if (d->d_parent == h_root - && name->len == len - && !memcmp(name->name, AUFS_XINO_FNAME, len)) - goto out; - - seq_puts(seq, ",xino="); - err = au_xino_path(seq, f); - -out: - return err; -#endif -} - -/* seq_file will re-call me in case of too long string */ -static int aufs_show_options(struct seq_file *m, struct dentry *dentry) -{ - int err; - unsigned int mnt_flags, v; - struct super_block *sb; - struct au_sbinfo *sbinfo; - -#define AuBool(name, str) do { \ - v = au_opt_test(mnt_flags, name); \ - if (v != au_opt_test(AuOpt_Def, name)) \ - seq_printf(m, ",%s" #str, v ? "" : "no"); \ -} while (0) - -#define AuStr(name, str) do { \ - v = mnt_flags & AuOptMask_##name; \ - if (v != (AuOpt_Def & AuOptMask_##name)) \ - seq_printf(m, "," #str "=%s", au_optstr_##str(v)); \ -} while (0) - -#define AuUInt(name, str, val) do { \ - if (val != AUFS_##name##_DEF) \ - seq_printf(m, "," #str "=%u", val); \ -} while (0) - - sb = dentry->d_sb; - if (sb->s_flags & MS_POSIXACL) - seq_puts(m, ",acl"); - - /* lock free root dinfo */ - si_noflush_read_lock(sb); - sbinfo = au_sbi(sb); - seq_printf(m, ",si=%lx", sysaufs_si_id(sbinfo)); - - mnt_flags = au_mntflags(sb); - if (au_opt_test(mnt_flags, XINO)) { - err = au_show_xino(m, sb); - if (unlikely(err)) - goto out; - } else - seq_puts(m, ",noxino"); - - AuBool(TRUNC_XINO, trunc_xino); - AuStr(UDBA, udba); - AuBool(SHWH, shwh); - AuBool(PLINK, plink); - AuBool(DIO, dio); - AuBool(DIRPERM1, dirperm1); - - v = sbinfo->si_wbr_create; - if (v != AuWbrCreate_Def) - au_show_wbr_create(m, v, sbinfo); - - v = sbinfo->si_wbr_copyup; - if (v != AuWbrCopyup_Def) - seq_printf(m, ",cpup=%s", au_optstr_wbr_copyup(v)); - - v = au_opt_test(mnt_flags, ALWAYS_DIROPQ); - if (v != au_opt_test(AuOpt_Def, ALWAYS_DIROPQ)) - seq_printf(m, ",diropq=%c", v ? 'a' : 'w'); - - AuUInt(DIRWH, dirwh, sbinfo->si_dirwh); - - v = jiffies_to_msecs(sbinfo->si_rdcache) / MSEC_PER_SEC; - AuUInt(RDCACHE, rdcache, v); - - AuUInt(RDBLK, rdblk, sbinfo->si_rdblk); - AuUInt(RDHASH, rdhash, sbinfo->si_rdhash); - - au_fhsm_show(m, sbinfo); - - AuBool(SUM, sum); - /* AuBool(SUM_W, wsum); */ - AuBool(WARN_PERM, warn_perm); - AuBool(VERBOSE, verbose); - -out: - /* be sure to print "br:" last */ - if (!sysaufs_brs) { - seq_puts(m, ",br:"); - au_show_brs(m, sb); - } - si_read_unlock(sb); - return 0; - -#undef AuBool -#undef AuStr -#undef AuUInt -} - -/* ---------------------------------------------------------------------- */ - -/* sum mode which returns the summation for statfs(2) */ - -static u64 au_add_till_max(u64 a, u64 b) -{ - u64 old; - - old = a; - a += b; - if (old <= a) - return a; - return ULLONG_MAX; -} - -static u64 au_mul_till_max(u64 a, long mul) -{ - u64 old; - - old = a; - a *= mul; - if (old <= a) - return a; - return ULLONG_MAX; -} - -static int au_statfs_sum(struct super_block *sb, struct kstatfs *buf) -{ - int err; - long bsize, factor; - u64 blocks, bfree, bavail, files, ffree; - aufs_bindex_t bend, bindex, i; - unsigned char shared; - struct path h_path; - struct super_block *h_sb; - - err = 0; - bsize = LONG_MAX; - files = 0; - ffree = 0; - blocks = 0; - bfree = 0; - bavail = 0; - bend = au_sbend(sb); - for (bindex = 0; bindex <= bend; bindex++) { - h_path.mnt = au_sbr_mnt(sb, bindex); - h_sb = h_path.mnt->mnt_sb; - shared = 0; - for (i = 0; !shared && i < bindex; i++) - shared = (au_sbr_sb(sb, i) == h_sb); - if (shared) - continue; - - /* sb->s_root for NFS is unreliable */ - h_path.dentry = h_path.mnt->mnt_root; - err = vfs_statfs(&h_path, buf); - if (unlikely(err)) - goto out; - - if (bsize > buf->f_bsize) { - /* - * we will reduce bsize, so we have to expand blocks - * etc. to match them again - */ - factor = (bsize / buf->f_bsize); - blocks = au_mul_till_max(blocks, factor); - bfree = au_mul_till_max(bfree, factor); - bavail = au_mul_till_max(bavail, factor); - bsize = buf->f_bsize; - } - - factor = (buf->f_bsize / bsize); - blocks = au_add_till_max(blocks, - au_mul_till_max(buf->f_blocks, factor)); - bfree = au_add_till_max(bfree, - au_mul_till_max(buf->f_bfree, factor)); - bavail = au_add_till_max(bavail, - au_mul_till_max(buf->f_bavail, factor)); - files = au_add_till_max(files, buf->f_files); - ffree = au_add_till_max(ffree, buf->f_ffree); - } - - buf->f_bsize = bsize; - buf->f_blocks = blocks; - buf->f_bfree = bfree; - buf->f_bavail = bavail; - buf->f_files = files; - buf->f_ffree = ffree; - buf->f_frsize = 0; - -out: - return err; -} - -static int aufs_statfs(struct dentry *dentry, struct kstatfs *buf) -{ - int err; - struct path h_path; - struct super_block *sb; - - /* lock free root dinfo */ - sb = dentry->d_sb; - si_noflush_read_lock(sb); - if (!au_opt_test(au_mntflags(sb), SUM)) { - /* sb->s_root for NFS is unreliable */ - h_path.mnt = au_sbr_mnt(sb, 0); - h_path.dentry = h_path.mnt->mnt_root; - err = vfs_statfs(&h_path, buf); - } else - err = au_statfs_sum(sb, buf); - si_read_unlock(sb); - - if (!err) { - buf->f_type = AUFS_SUPER_MAGIC; - buf->f_namelen = AUFS_MAX_NAMELEN; - memset(&buf->f_fsid, 0, sizeof(buf->f_fsid)); - } - /* buf->f_bsize = buf->f_blocks = buf->f_bfree = buf->f_bavail = -1; */ - - return err; -} - -/* ---------------------------------------------------------------------- */ - -static int aufs_sync_fs(struct super_block *sb, int wait) -{ - int err, e; - aufs_bindex_t bend, bindex; - struct au_branch *br; - struct super_block *h_sb; - - err = 0; - si_noflush_read_lock(sb); - bend = au_sbend(sb); - for (bindex = 0; bindex <= bend; bindex++) { - br = au_sbr(sb, bindex); - if (!au_br_writable(br->br_perm)) - continue; - - h_sb = au_sbr_sb(sb, bindex); - if (h_sb->s_op->sync_fs) { - e = h_sb->s_op->sync_fs(h_sb, wait); - if (unlikely(e && !err)) - err = e; - /* go on even if an error happens */ - } - } - si_read_unlock(sb); - - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* final actions when unmounting a file system */ -static void aufs_put_super(struct super_block *sb) -{ - struct au_sbinfo *sbinfo; - - sbinfo = au_sbi(sb); - if (!sbinfo) - return; - - dbgaufs_si_fin(sbinfo); - kobject_put(&sbinfo->si_kobj); -} - -/* ---------------------------------------------------------------------- */ - -void *au_array_alloc(unsigned long long *hint, au_arraycb_t cb, - struct super_block *sb, void *arg) -{ - void *array; - unsigned long long n, sz; - - array = NULL; - n = 0; - if (!*hint) - goto out; - - if (*hint > ULLONG_MAX / sizeof(array)) { - array = ERR_PTR(-EMFILE); - pr_err("hint %llu\n", *hint); - goto out; - } - - sz = sizeof(array) * *hint; - array = kzalloc(sz, GFP_NOFS); - if (unlikely(!array)) - array = vzalloc(sz); - if (unlikely(!array)) { - array = ERR_PTR(-ENOMEM); - goto out; - } - - n = cb(sb, array, *hint, arg); - AuDebugOn(n > *hint); - -out: - *hint = n; - return array; -} - -static unsigned long long au_iarray_cb(struct super_block *sb, void *a, - unsigned long long max __maybe_unused, - void *arg) -{ - unsigned long long n; - struct inode **p, *inode; - struct list_head *head; - - n = 0; - p = a; - head = arg; - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry(inode, head, i_sb_list) { - if (!is_bad_inode(inode) - && au_ii(inode)->ii_bstart >= 0) { - spin_lock(&inode->i_lock); - if (atomic_read(&inode->i_count)) { - au_igrab(inode); - *p++ = inode; - n++; - AuDebugOn(n > max); - } - spin_unlock(&inode->i_lock); - } - } - spin_unlock(&sb->s_inode_list_lock); - - return n; -} - -struct inode **au_iarray_alloc(struct super_block *sb, unsigned long long *max) -{ - *max = atomic_long_read(&au_sbi(sb)->si_ninodes); - return au_array_alloc(max, au_iarray_cb, sb, &sb->s_inodes); -} - -void au_iarray_free(struct inode **a, unsigned long long max) -{ - unsigned long long ull; - - for (ull = 0; ull < max; ull++) - iput(a[ull]); - kvfree(a); -} - -/* ---------------------------------------------------------------------- */ - -/* - * refresh dentry and inode at remount time. - */ -/* todo: consolidate with simple_reval_dpath() and au_reval_for_attr() */ -static int au_do_refresh(struct dentry *dentry, unsigned int dir_flags, - struct dentry *parent) -{ - int err; - - di_write_lock_child(dentry); - di_read_lock_parent(parent, AuLock_IR); - err = au_refresh_dentry(dentry, parent); - if (!err && dir_flags) - au_hn_reset(d_inode(dentry), dir_flags); - di_read_unlock(parent, AuLock_IR); - di_write_unlock(dentry); - - return err; -} - -static int au_do_refresh_d(struct dentry *dentry, unsigned int sigen, - struct au_sbinfo *sbinfo, - const unsigned int dir_flags, unsigned int do_idop) -{ - int err; - struct dentry *parent; - - err = 0; - parent = dget_parent(dentry); - if (!au_digen_test(parent, sigen) && au_digen_test(dentry, sigen)) { - if (d_really_is_positive(dentry)) { - if (!d_is_dir(dentry)) - err = au_do_refresh(dentry, /*dir_flags*/0, - parent); - else { - err = au_do_refresh(dentry, dir_flags, parent); - if (unlikely(err)) - au_fset_si(sbinfo, FAILED_REFRESH_DIR); - } - } else - err = au_do_refresh(dentry, /*dir_flags*/0, parent); - AuDbgDentry(dentry); - } - dput(parent); - - if (!err) { - if (do_idop) - au_refresh_dop(dentry, /*force_reval*/0); - } else - au_refresh_dop(dentry, /*force_reval*/1); - - AuTraceErr(err); - return err; -} - -static int au_refresh_d(struct super_block *sb, unsigned int do_idop) -{ - int err, i, j, ndentry, e; - unsigned int sigen; - struct au_dcsub_pages dpages; - struct au_dpage *dpage; - struct dentry **dentries, *d; - struct au_sbinfo *sbinfo; - struct dentry *root = sb->s_root; - const unsigned int dir_flags = au_hi_flags(d_inode(root), /*isdir*/1); - - if (do_idop) - au_refresh_dop(root, /*force_reval*/0); - - err = au_dpages_init(&dpages, GFP_NOFS); - if (unlikely(err)) - goto out; - err = au_dcsub_pages(&dpages, root, NULL, NULL); - if (unlikely(err)) - goto out_dpages; - - sigen = au_sigen(sb); - sbinfo = au_sbi(sb); - for (i = 0; i < dpages.ndpage; i++) { - dpage = dpages.dpages + i; - dentries = dpage->dentries; - ndentry = dpage->ndentry; - for (j = 0; j < ndentry; j++) { - d = dentries[j]; - e = au_do_refresh_d(d, sigen, sbinfo, dir_flags, - do_idop); - if (unlikely(e && !err)) - err = e; - /* go on even err */ - } - } - -out_dpages: - au_dpages_free(&dpages); -out: - return err; -} - -static int au_refresh_i(struct super_block *sb, unsigned int do_idop) -{ - int err, e; - unsigned int sigen; - unsigned long long max, ull; - struct inode *inode, **array; - - array = au_iarray_alloc(sb, &max); - err = PTR_ERR(array); - if (IS_ERR(array)) - goto out; - - err = 0; - sigen = au_sigen(sb); - for (ull = 0; ull < max; ull++) { - inode = array[ull]; - if (unlikely(!inode)) - break; - - e = 0; - ii_write_lock_child(inode); - if (au_iigen(inode, NULL) != sigen) { - e = au_refresh_hinode_self(inode); - if (unlikely(e)) { - au_refresh_iop(inode, /*force_getattr*/1); - pr_err("error %d, i%lu\n", e, inode->i_ino); - if (!err) - err = e; - /* go on even if err */ - } - } - if (!e && do_idop) - au_refresh_iop(inode, /*force_getattr*/0); - ii_write_unlock(inode); - } - - au_iarray_free(array, max); - -out: - return err; -} - -static void au_remount_refresh(struct super_block *sb, unsigned int do_idop) -{ - int err, e; - unsigned int udba; - aufs_bindex_t bindex, bend; - struct dentry *root; - struct inode *inode; - struct au_branch *br; - struct au_sbinfo *sbi; - - au_sigen_inc(sb); - sbi = au_sbi(sb); - au_fclr_si(sbi, FAILED_REFRESH_DIR); - - root = sb->s_root; - DiMustNoWaiters(root); - inode = d_inode(root); - IiMustNoWaiters(inode); - - udba = au_opt_udba(sb); - bend = au_sbend(sb); - for (bindex = 0; bindex <= bend; bindex++) { - br = au_sbr(sb, bindex); - err = au_hnotify_reset_br(udba, br, br->br_perm); - if (unlikely(err)) - AuIOErr("hnotify failed on br %d, %d, ignored\n", - bindex, err); - /* go on even if err */ - } - au_hn_reset(inode, au_hi_flags(inode, /*isdir*/1)); - - if (do_idop) { - if (au_ftest_si(sbi, NO_DREVAL)) { - AuDebugOn(sb->s_d_op == &aufs_dop_noreval); - sb->s_d_op = &aufs_dop_noreval; - AuDebugOn(sbi->si_iop_array == aufs_iop_nogetattr); - sbi->si_iop_array = aufs_iop_nogetattr; - } else { - AuDebugOn(sb->s_d_op == &aufs_dop); - sb->s_d_op = &aufs_dop; - AuDebugOn(sbi->si_iop_array == aufs_iop); - sbi->si_iop_array = aufs_iop; - } - pr_info("reset to %pf and %pf\n", - sb->s_d_op, sbi->si_iop_array); - } - - di_write_unlock(root); - err = au_refresh_d(sb, do_idop); - e = au_refresh_i(sb, do_idop); - if (unlikely(e && !err)) - err = e; - /* aufs_write_lock() calls ..._child() */ - di_write_lock_child(root); - - au_cpup_attr_all(inode, /*force*/1); - - if (unlikely(err)) - AuIOErr("refresh failed, ignored, %d\n", err); -} - -/* stop extra interpretation of errno in mount(8), and strange error messages */ -static int cvt_err(int err) -{ - AuTraceErr(err); - - switch (err) { - case -ENOENT: - case -ENOTDIR: - case -EEXIST: - case -EIO: - err = -EINVAL; - } - return err; -} - -static int aufs_remount_fs(struct super_block *sb, int *flags, char *data) -{ - int err, do_dx; - unsigned int mntflags; - struct au_opts opts = { - .opt = NULL - }; - struct dentry *root; - struct inode *inode; - struct au_sbinfo *sbinfo; - - err = 0; - root = sb->s_root; - if (!data || !*data) { - err = si_write_lock(sb, AuLock_FLUSH | AuLock_NOPLM); - if (!err) { - di_write_lock_child(root); - err = au_opts_verify(sb, *flags, /*pending*/0); - aufs_write_unlock(root); - } - goto out; - } - - err = -ENOMEM; - opts.opt = (void *)__get_free_page(GFP_NOFS); - if (unlikely(!opts.opt)) - goto out; - opts.max_opt = PAGE_SIZE / sizeof(*opts.opt); - opts.flags = AuOpts_REMOUNT; - opts.sb_flags = *flags; - - /* parse it before aufs lock */ - err = au_opts_parse(sb, data, &opts); - if (unlikely(err)) - goto out_opts; - - sbinfo = au_sbi(sb); - inode = d_inode(root); - mutex_lock(&inode->i_mutex); - err = si_write_lock(sb, AuLock_FLUSH | AuLock_NOPLM); - if (unlikely(err)) - goto out_mtx; - di_write_lock_child(root); - - /* au_opts_remount() may return an error */ - err = au_opts_remount(sb, &opts); - au_opts_free(&opts); - - if (au_ftest_opts(opts.flags, REFRESH)) - au_remount_refresh(sb, au_ftest_opts(opts.flags, REFRESH_IDOP)); - - if (au_ftest_opts(opts.flags, REFRESH_DYAOP)) { - mntflags = au_mntflags(sb); - do_dx = !!au_opt_test(mntflags, DIO); - au_dy_arefresh(do_dx); - } - - au_fhsm_wrote_all(sb, /*force*/1); /* ?? */ - aufs_write_unlock(root); - -out_mtx: - mutex_unlock(&inode->i_mutex); -out_opts: - free_page((unsigned long)opts.opt); -out: - err = cvt_err(err); - AuTraceErr(err); - return err; -} - -static const struct super_operations aufs_sop = { - .alloc_inode = aufs_alloc_inode, - .destroy_inode = aufs_destroy_inode, - /* always deleting, no clearing */ - .drop_inode = generic_delete_inode, - .show_options = aufs_show_options, - .statfs = aufs_statfs, - .put_super = aufs_put_super, - .sync_fs = aufs_sync_fs, - .remount_fs = aufs_remount_fs -}; - -/* ---------------------------------------------------------------------- */ - -static int alloc_root(struct super_block *sb) -{ - int err; - struct inode *inode; - struct dentry *root; - - err = -ENOMEM; - inode = au_iget_locked(sb, AUFS_ROOT_INO); - err = PTR_ERR(inode); - if (IS_ERR(inode)) - goto out; - - inode->i_op = aufs_iop + AuIop_DIR; /* with getattr by default */ - inode->i_fop = &aufs_dir_fop; - inode->i_mode = S_IFDIR; - set_nlink(inode, 2); - unlock_new_inode(inode); - - root = d_make_root(inode); - if (unlikely(!root)) - goto out; - err = PTR_ERR(root); - if (IS_ERR(root)) - goto out; - - err = au_di_init(root); - if (!err) { - sb->s_root = root; - return 0; /* success */ - } - dput(root); - -out: - return err; -} - -static int aufs_fill_super(struct super_block *sb, void *raw_data, - int silent __maybe_unused) -{ - int err; - struct au_opts opts = { - .opt = NULL - }; - struct au_sbinfo *sbinfo; - struct dentry *root; - struct inode *inode; - char *arg = raw_data; - - if (unlikely(!arg || !*arg)) { - err = -EINVAL; - pr_err("no arg\n"); - goto out; - } - - err = -ENOMEM; - opts.opt = (void *)__get_free_page(GFP_NOFS); - if (unlikely(!opts.opt)) - goto out; - opts.max_opt = PAGE_SIZE / sizeof(*opts.opt); - opts.sb_flags = sb->s_flags; - - err = au_si_alloc(sb); - if (unlikely(err)) - goto out_opts; - sbinfo = au_sbi(sb); - - /* all timestamps always follow the ones on the branch */ - sb->s_flags |= MS_NOATIME | MS_NODIRATIME; - sb->s_op = &aufs_sop; - sb->s_d_op = &aufs_dop; - sb->s_magic = AUFS_SUPER_MAGIC; - sb->s_maxbytes = 0; - sb->s_stack_depth = 1; - au_export_init(sb); - /* au_xattr_init(sb); */ - - err = alloc_root(sb); - if (unlikely(err)) { - si_write_unlock(sb); - goto out_info; - } - root = sb->s_root; - inode = d_inode(root); - - /* - * actually we can parse options regardless aufs lock here. - * but at remount time, parsing must be done before aufs lock. - * so we follow the same rule. - */ - ii_write_lock_parent(inode); - aufs_write_unlock(root); - err = au_opts_parse(sb, arg, &opts); - if (unlikely(err)) - goto out_root; - - /* lock vfs_inode first, then aufs. */ - mutex_lock(&inode->i_mutex); - aufs_write_lock(root); - err = au_opts_mount(sb, &opts); - au_opts_free(&opts); - if (!err && au_ftest_si(sbinfo, NO_DREVAL)) { - sb->s_d_op = &aufs_dop_noreval; - pr_info("%pf\n", sb->s_d_op); - au_refresh_dop(root, /*force_reval*/0); - sbinfo->si_iop_array = aufs_iop_nogetattr; - au_refresh_iop(inode, /*force_getattr*/0); - } - aufs_write_unlock(root); - mutex_unlock(&inode->i_mutex); - if (!err) - goto out_opts; /* success */ - -out_root: - dput(root); - sb->s_root = NULL; -out_info: - dbgaufs_si_fin(sbinfo); - kobject_put(&sbinfo->si_kobj); - sb->s_fs_info = NULL; -out_opts: - free_page((unsigned long)opts.opt); -out: - AuTraceErr(err); - err = cvt_err(err); - AuTraceErr(err); - return err; -} - -/* ---------------------------------------------------------------------- */ - -static struct dentry *aufs_mount(struct file_system_type *fs_type, int flags, - const char *dev_name __maybe_unused, - void *raw_data) -{ - struct dentry *root; - struct super_block *sb; - - /* all timestamps always follow the ones on the branch */ - /* mnt->mnt_flags |= MNT_NOATIME | MNT_NODIRATIME; */ - root = mount_nodev(fs_type, flags, raw_data, aufs_fill_super); - if (IS_ERR(root)) - goto out; - - sb = root->d_sb; - si_write_lock(sb, !AuLock_FLUSH); - sysaufs_brs_add(sb, 0); - si_write_unlock(sb); - au_sbilist_add(sb); - -out: - return root; -} - -static void aufs_kill_sb(struct super_block *sb) -{ - struct au_sbinfo *sbinfo; - - sbinfo = au_sbi(sb); - if (sbinfo) { - au_sbilist_del(sb); - aufs_write_lock(sb->s_root); - au_fhsm_fin(sb); - if (sbinfo->si_wbr_create_ops->fin) - sbinfo->si_wbr_create_ops->fin(sb); - if (au_opt_test(sbinfo->si_mntflags, UDBA_HNOTIFY)) { - au_opt_set_udba(sbinfo->si_mntflags, UDBA_NONE); - au_remount_refresh(sb, /*do_idop*/0); - } - if (au_opt_test(sbinfo->si_mntflags, PLINK)) - au_plink_put(sb, /*verbose*/1); - au_xino_clr(sb); - sbinfo->si_sb = NULL; - aufs_write_unlock(sb->s_root); - au_nwt_flush(&sbinfo->si_nowait); - } - kill_anon_super(sb); -} - -struct file_system_type aufs_fs_type = { - .name = AUFS_FSTYPE, - /* a race between rename and others */ - .fs_flags = FS_RENAME_DOES_D_MOVE, - .mount = aufs_mount, - .kill_sb = aufs_kill_sb, - /* no need to __module_get() and module_put(). */ - .owner = THIS_MODULE, -}; diff --git a/fs/aufs/super.h b/fs/aufs/super.h deleted file mode 100644 index 2761df917..000000000 --- a/fs/aufs/super.h +++ /dev/null @@ -1,628 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * super_block operations - */ - -#ifndef __AUFS_SUPER_H__ -#define __AUFS_SUPER_H__ - -#ifdef __KERNEL__ - -#include <linux/fs.h> -#include <linux/kobject.h> -#include "rwsem.h" -#include "spl.h" -#include "wkq.h" - -/* policies to select one among multiple writable branches */ -struct au_wbr_copyup_operations { - int (*copyup)(struct dentry *dentry); -}; - -#define AuWbr_DIR 1 /* target is a dir */ -#define AuWbr_PARENT (1 << 1) /* always require a parent */ - -#define au_ftest_wbr(flags, name) ((flags) & AuWbr_##name) -#define au_fset_wbr(flags, name) { (flags) |= AuWbr_##name; } -#define au_fclr_wbr(flags, name) { (flags) &= ~AuWbr_##name; } - -struct au_wbr_create_operations { - int (*create)(struct dentry *dentry, unsigned int flags); - int (*init)(struct super_block *sb); - int (*fin)(struct super_block *sb); -}; - -struct au_wbr_mfs { - struct mutex mfs_lock; /* protect this structure */ - unsigned long mfs_jiffy; - unsigned long mfs_expire; - aufs_bindex_t mfs_bindex; - - unsigned long long mfsrr_bytes; - unsigned long long mfsrr_watermark; -}; - -struct pseudo_link { - union { - struct hlist_node hlist; - struct rcu_head rcu; - }; - struct inode *inode; -}; - -#define AuPlink_NHASH 100 -static inline int au_plink_hash(ino_t ino) -{ - return ino % AuPlink_NHASH; -} - -/* File-based Hierarchical Storage Management */ -struct au_fhsm { -#ifdef CONFIG_AUFS_FHSM - /* allow only one process who can receive the notification */ - spinlock_t fhsm_spin; - pid_t fhsm_pid; - wait_queue_head_t fhsm_wqh; - atomic_t fhsm_readable; - - /* these are protected by si_rwsem */ - unsigned long fhsm_expire; - aufs_bindex_t fhsm_bottom; -#endif -}; - -struct au_branch; -struct au_sbinfo { - /* nowait tasks in the system-wide workqueue */ - struct au_nowait_tasks si_nowait; - - /* - * tried sb->s_umount, but failed due to the dependecy between i_mutex. - * rwsem for au_sbinfo is necessary. - */ - struct au_rwsem si_rwsem; - - /* prevent recursive locking in deleting inode */ - struct { - unsigned long *bitmap; - spinlock_t tree_lock; - struct radix_tree_root tree; - } au_si_pid; - - /* - * dirty approach to protect sb->sb_inodes and ->s_files (gone) from - * remount. - */ - atomic_long_t si_ninodes, si_nfiles; - - /* branch management */ - unsigned int si_generation; - - /* see AuSi_ flags */ - unsigned char au_si_status; - - aufs_bindex_t si_bend; - - /* dirty trick to keep br_id plus */ - unsigned int si_last_br_id : - sizeof(aufs_bindex_t) * BITS_PER_BYTE - 1; - struct au_branch **si_branch; - - /* policy to select a writable branch */ - unsigned char si_wbr_copyup; - unsigned char si_wbr_create; - struct au_wbr_copyup_operations *si_wbr_copyup_ops; - struct au_wbr_create_operations *si_wbr_create_ops; - - /* round robin */ - atomic_t si_wbr_rr_next; - - /* most free space */ - struct au_wbr_mfs si_wbr_mfs; - - /* File-based Hierarchical Storage Management */ - struct au_fhsm si_fhsm; - - /* mount flags */ - /* include/asm-ia64/siginfo.h defines a macro named si_flags */ - unsigned int si_mntflags; - - /* symlink to follow_link() and put_link() */ - struct au_sphlhead si_symlink; - - /* external inode number (bitmap and translation table) */ - vfs_readf_t si_xread; - vfs_writef_t si_xwrite; - struct file *si_xib; - struct mutex si_xib_mtx; /* protect xib members */ - unsigned long *si_xib_buf; - unsigned long si_xib_last_pindex; - int si_xib_next_bit; - aufs_bindex_t si_xino_brid; - unsigned long si_xino_jiffy; - unsigned long si_xino_expire; - /* reserved for future use */ - /* unsigned long long si_xib_limit; */ /* Max xib file size */ - -#ifdef CONFIG_AUFS_EXPORT - /* i_generation */ - struct file *si_xigen; - atomic_t si_xigen_next; -#endif - - /* dirty trick to suppoer atomic_open */ - struct au_sphlhead si_aopen; - - /* vdir parameters */ - unsigned long si_rdcache; /* max cache time in jiffies */ - unsigned int si_rdblk; /* deblk size */ - unsigned int si_rdhash; /* hash size */ - - /* - * If the number of whiteouts are larger than si_dirwh, leave all of - * them after au_whtmp_ren to reduce the cost of rmdir(2). - * future fsck.aufs or kernel thread will remove them later. - * Otherwise, remove all whiteouts and the dir in rmdir(2). - */ - unsigned int si_dirwh; - - /* pseudo_link list */ - struct au_sphlhead si_plink[AuPlink_NHASH]; - wait_queue_head_t si_plink_wq; - spinlock_t si_plink_maint_lock; - pid_t si_plink_maint_pid; - - /* file list */ - struct au_sphlhead si_files; - - /* with/without getattr, brother of sb->s_d_op */ - struct inode_operations *si_iop_array; - - /* - * sysfs and lifetime management. - * this is not a small structure and it may be a waste of memory in case - * of sysfs is disabled, particulary when many aufs-es are mounted. - * but using sysfs is majority. - */ - struct kobject si_kobj; -#ifdef CONFIG_DEBUG_FS - struct dentry *si_dbgaufs; - struct dentry *si_dbgaufs_plink; - struct dentry *si_dbgaufs_xib; -#ifdef CONFIG_AUFS_EXPORT - struct dentry *si_dbgaufs_xigen; -#endif -#endif - -#ifdef CONFIG_AUFS_SBILIST - struct list_head si_list; -#endif - - /* dirty, necessary for unmounting, sysfs and sysrq */ - struct super_block *si_sb; -}; - -/* sbinfo status flags */ -/* - * set true when refresh_dirs() failed at remount time. - * then try refreshing dirs at access time again. - * if it is false, refreshing dirs at access time is unnecesary - */ -#define AuSi_FAILED_REFRESH_DIR 1 -#define AuSi_FHSM (1 << 1) /* fhsm is active now */ -#define AuSi_NO_DREVAL (1 << 2) /* disable all d_revalidate */ - -#ifndef CONFIG_AUFS_FHSM -#undef AuSi_FHSM -#define AuSi_FHSM 0 -#endif - -static inline unsigned char au_do_ftest_si(struct au_sbinfo *sbi, - unsigned int flag) -{ - AuRwMustAnyLock(&sbi->si_rwsem); - return sbi->au_si_status & flag; -} -#define au_ftest_si(sbinfo, name) au_do_ftest_si(sbinfo, AuSi_##name) -#define au_fset_si(sbinfo, name) do { \ - AuRwMustWriteLock(&(sbinfo)->si_rwsem); \ - (sbinfo)->au_si_status |= AuSi_##name; \ -} while (0) -#define au_fclr_si(sbinfo, name) do { \ - AuRwMustWriteLock(&(sbinfo)->si_rwsem); \ - (sbinfo)->au_si_status &= ~AuSi_##name; \ -} while (0) - -/* ---------------------------------------------------------------------- */ - -/* policy to select one among writable branches */ -#define AuWbrCopyup(sbinfo, ...) \ - ((sbinfo)->si_wbr_copyup_ops->copyup(__VA_ARGS__)) -#define AuWbrCreate(sbinfo, ...) \ - ((sbinfo)->si_wbr_create_ops->create(__VA_ARGS__)) - -/* flags for si_read_lock()/aufs_read_lock()/di_read_lock() */ -#define AuLock_DW 1 /* write-lock dentry */ -#define AuLock_IR (1 << 1) /* read-lock inode */ -#define AuLock_IW (1 << 2) /* write-lock inode */ -#define AuLock_FLUSH (1 << 3) /* wait for 'nowait' tasks */ -#define AuLock_DIRS (1 << 4) /* target is a pair of dirs */ -#define AuLock_NOPLM (1 << 5) /* return err in plm mode */ -#define AuLock_NOPLMW (1 << 6) /* wait for plm mode ends */ -#define AuLock_GEN (1 << 7) /* test digen/iigen */ -#define au_ftest_lock(flags, name) ((flags) & AuLock_##name) -#define au_fset_lock(flags, name) \ - do { (flags) |= AuLock_##name; } while (0) -#define au_fclr_lock(flags, name) \ - do { (flags) &= ~AuLock_##name; } while (0) - -/* ---------------------------------------------------------------------- */ - -/* super.c */ -extern struct file_system_type aufs_fs_type; -struct inode *au_iget_locked(struct super_block *sb, ino_t ino); -typedef unsigned long long (*au_arraycb_t)(struct super_block *sb, void *array, - unsigned long long max, void *arg); -void *au_array_alloc(unsigned long long *hint, au_arraycb_t cb, - struct super_block *sb, void *arg); -struct inode **au_iarray_alloc(struct super_block *sb, unsigned long long *max); -void au_iarray_free(struct inode **a, unsigned long long max); - -/* sbinfo.c */ -void au_si_free(struct kobject *kobj); -int au_si_alloc(struct super_block *sb); -int au_sbr_realloc(struct au_sbinfo *sbinfo, int nbr); - -unsigned int au_sigen_inc(struct super_block *sb); -aufs_bindex_t au_new_br_id(struct super_block *sb); - -int si_read_lock(struct super_block *sb, int flags); -int si_write_lock(struct super_block *sb, int flags); -int aufs_read_lock(struct dentry *dentry, int flags); -void aufs_read_unlock(struct dentry *dentry, int flags); -void aufs_write_lock(struct dentry *dentry); -void aufs_write_unlock(struct dentry *dentry); -int aufs_read_and_write_lock2(struct dentry *d1, struct dentry *d2, int flags); -void aufs_read_and_write_unlock2(struct dentry *d1, struct dentry *d2); - -int si_pid_test_slow(struct super_block *sb); -void si_pid_set_slow(struct super_block *sb); -void si_pid_clr_slow(struct super_block *sb); - -/* wbr_policy.c */ -extern struct au_wbr_copyup_operations au_wbr_copyup_ops[]; -extern struct au_wbr_create_operations au_wbr_create_ops[]; -int au_cpdown_dirs(struct dentry *dentry, aufs_bindex_t bdst); -int au_wbr_nonopq(struct dentry *dentry, aufs_bindex_t bindex); -int au_wbr_do_copyup_bu(struct dentry *dentry, aufs_bindex_t bstart); - -/* mvdown.c */ -int au_mvdown(struct dentry *dentry, struct aufs_mvdown __user *arg); - -#ifdef CONFIG_AUFS_FHSM -/* fhsm.c */ - -static inline pid_t au_fhsm_pid(struct au_fhsm *fhsm) -{ - pid_t pid; - - spin_lock(&fhsm->fhsm_spin); - pid = fhsm->fhsm_pid; - spin_unlock(&fhsm->fhsm_spin); - - return pid; -} - -void au_fhsm_wrote(struct super_block *sb, aufs_bindex_t bindex, int force); -void au_fhsm_wrote_all(struct super_block *sb, int force); -int au_fhsm_fd(struct super_block *sb, int oflags); -int au_fhsm_br_alloc(struct au_branch *br); -void au_fhsm_set_bottom(struct super_block *sb, aufs_bindex_t bindex); -void au_fhsm_fin(struct super_block *sb); -void au_fhsm_init(struct au_sbinfo *sbinfo); -void au_fhsm_set(struct au_sbinfo *sbinfo, unsigned int sec); -void au_fhsm_show(struct seq_file *seq, struct au_sbinfo *sbinfo); -#else -AuStubVoid(au_fhsm_wrote, struct super_block *sb, aufs_bindex_t bindex, - int force) -AuStubVoid(au_fhsm_wrote_all, struct super_block *sb, int force) -AuStub(int, au_fhsm_fd, return -EOPNOTSUPP, struct super_block *sb, int oflags) -AuStub(pid_t, au_fhsm_pid, return 0, struct au_fhsm *fhsm) -AuStubInt0(au_fhsm_br_alloc, struct au_branch *br) -AuStubVoid(au_fhsm_set_bottom, struct super_block *sb, aufs_bindex_t bindex) -AuStubVoid(au_fhsm_fin, struct super_block *sb) -AuStubVoid(au_fhsm_init, struct au_sbinfo *sbinfo) -AuStubVoid(au_fhsm_set, struct au_sbinfo *sbinfo, unsigned int sec) -AuStubVoid(au_fhsm_show, struct seq_file *seq, struct au_sbinfo *sbinfo) -#endif - -/* ---------------------------------------------------------------------- */ - -static inline struct au_sbinfo *au_sbi(struct super_block *sb) -{ - return sb->s_fs_info; -} - -/* ---------------------------------------------------------------------- */ - -#ifdef CONFIG_AUFS_EXPORT -int au_test_nfsd(void); -void au_export_init(struct super_block *sb); -void au_xigen_inc(struct inode *inode); -int au_xigen_new(struct inode *inode); -int au_xigen_set(struct super_block *sb, struct file *base); -void au_xigen_clr(struct super_block *sb); - -static inline int au_busy_or_stale(void) -{ - if (!au_test_nfsd()) - return -EBUSY; - return -ESTALE; -} -#else -AuStubInt0(au_test_nfsd, void) -AuStubVoid(au_export_init, struct super_block *sb) -AuStubVoid(au_xigen_inc, struct inode *inode) -AuStubInt0(au_xigen_new, struct inode *inode) -AuStubInt0(au_xigen_set, struct super_block *sb, struct file *base) -AuStubVoid(au_xigen_clr, struct super_block *sb) -AuStub(int, au_busy_or_stale, return -EBUSY, void) -#endif /* CONFIG_AUFS_EXPORT */ - -/* ---------------------------------------------------------------------- */ - -#ifdef CONFIG_AUFS_SBILIST -/* module.c */ -extern struct au_splhead au_sbilist; - -static inline void au_sbilist_init(void) -{ - au_spl_init(&au_sbilist); -} - -static inline void au_sbilist_add(struct super_block *sb) -{ - au_spl_add(&au_sbi(sb)->si_list, &au_sbilist); -} - -static inline void au_sbilist_del(struct super_block *sb) -{ - au_spl_del(&au_sbi(sb)->si_list, &au_sbilist); -} - -#ifdef CONFIG_AUFS_MAGIC_SYSRQ -static inline void au_sbilist_lock(void) -{ - spin_lock(&au_sbilist.spin); -} - -static inline void au_sbilist_unlock(void) -{ - spin_unlock(&au_sbilist.spin); -} -#define AuGFP_SBILIST GFP_ATOMIC -#else -AuStubVoid(au_sbilist_lock, void) -AuStubVoid(au_sbilist_unlock, void) -#define AuGFP_SBILIST GFP_NOFS -#endif /* CONFIG_AUFS_MAGIC_SYSRQ */ -#else -AuStubVoid(au_sbilist_init, void) -AuStubVoid(au_sbilist_add, struct super_block *sb) -AuStubVoid(au_sbilist_del, struct super_block *sb) -AuStubVoid(au_sbilist_lock, void) -AuStubVoid(au_sbilist_unlock, void) -#define AuGFP_SBILIST GFP_NOFS -#endif - -/* ---------------------------------------------------------------------- */ - -static inline void dbgaufs_si_null(struct au_sbinfo *sbinfo) -{ - /* - * This function is a dynamic '__init' function actually, - * so the tiny check for si_rwsem is unnecessary. - */ - /* AuRwMustWriteLock(&sbinfo->si_rwsem); */ -#ifdef CONFIG_DEBUG_FS - sbinfo->si_dbgaufs = NULL; - sbinfo->si_dbgaufs_plink = NULL; - sbinfo->si_dbgaufs_xib = NULL; -#ifdef CONFIG_AUFS_EXPORT - sbinfo->si_dbgaufs_xigen = NULL; -#endif -#endif -} - -/* ---------------------------------------------------------------------- */ - -static inline pid_t si_pid_bit(void) -{ - /* the origin of pid is 1, but the bitmap's is 0 */ - return current->pid - 1; -} - -static inline int si_pid_test(struct super_block *sb) -{ - pid_t bit; - - bit = si_pid_bit(); - if (bit < PID_MAX_DEFAULT) - return test_bit(bit, au_sbi(sb)->au_si_pid.bitmap); - return si_pid_test_slow(sb); -} - -static inline void si_pid_set(struct super_block *sb) -{ - pid_t bit; - - bit = si_pid_bit(); - if (bit < PID_MAX_DEFAULT) { - AuDebugOn(test_bit(bit, au_sbi(sb)->au_si_pid.bitmap)); - set_bit(bit, au_sbi(sb)->au_si_pid.bitmap); - /* smp_mb(); */ - } else - si_pid_set_slow(sb); -} - -static inline void si_pid_clr(struct super_block *sb) -{ - pid_t bit; - - bit = si_pid_bit(); - if (bit < PID_MAX_DEFAULT) { - AuDebugOn(!test_bit(bit, au_sbi(sb)->au_si_pid.bitmap)); - clear_bit(bit, au_sbi(sb)->au_si_pid.bitmap); - /* smp_mb(); */ - } else - si_pid_clr_slow(sb); -} - -/* ---------------------------------------------------------------------- */ - -/* lock superblock. mainly for entry point functions */ -/* - * __si_read_lock, __si_write_lock, - * __si_read_unlock, __si_write_unlock, __si_downgrade_lock - */ -AuSimpleRwsemFuncs(__si, struct super_block *sb, &au_sbi(sb)->si_rwsem); - -#define SiMustNoWaiters(sb) AuRwMustNoWaiters(&au_sbi(sb)->si_rwsem) -#define SiMustAnyLock(sb) AuRwMustAnyLock(&au_sbi(sb)->si_rwsem) -#define SiMustWriteLock(sb) AuRwMustWriteLock(&au_sbi(sb)->si_rwsem) - -static inline void si_noflush_read_lock(struct super_block *sb) -{ - __si_read_lock(sb); - si_pid_set(sb); -} - -static inline int si_noflush_read_trylock(struct super_block *sb) -{ - int locked; - - locked = __si_read_trylock(sb); - if (locked) - si_pid_set(sb); - return locked; -} - -static inline void si_noflush_write_lock(struct super_block *sb) -{ - __si_write_lock(sb); - si_pid_set(sb); -} - -static inline int si_noflush_write_trylock(struct super_block *sb) -{ - int locked; - - locked = __si_write_trylock(sb); - if (locked) - si_pid_set(sb); - return locked; -} - -#if 0 /* reserved */ -static inline int si_read_trylock(struct super_block *sb, int flags) -{ - if (au_ftest_lock(flags, FLUSH)) - au_nwt_flush(&au_sbi(sb)->si_nowait); - return si_noflush_read_trylock(sb); -} -#endif - -static inline void si_read_unlock(struct super_block *sb) -{ - si_pid_clr(sb); - __si_read_unlock(sb); -} - -#if 0 /* reserved */ -static inline int si_write_trylock(struct super_block *sb, int flags) -{ - if (au_ftest_lock(flags, FLUSH)) - au_nwt_flush(&au_sbi(sb)->si_nowait); - return si_noflush_write_trylock(sb); -} -#endif - -static inline void si_write_unlock(struct super_block *sb) -{ - si_pid_clr(sb); - __si_write_unlock(sb); -} - -#if 0 /* reserved */ -static inline void si_downgrade_lock(struct super_block *sb) -{ - __si_downgrade_lock(sb); -} -#endif - -/* ---------------------------------------------------------------------- */ - -static inline aufs_bindex_t au_sbend(struct super_block *sb) -{ - SiMustAnyLock(sb); - return au_sbi(sb)->si_bend; -} - -static inline unsigned int au_mntflags(struct super_block *sb) -{ - SiMustAnyLock(sb); - return au_sbi(sb)->si_mntflags; -} - -static inline unsigned int au_sigen(struct super_block *sb) -{ - SiMustAnyLock(sb); - return au_sbi(sb)->si_generation; -} - -static inline void au_ninodes_inc(struct super_block *sb) -{ - atomic_long_inc(&au_sbi(sb)->si_ninodes); -} - -static inline void au_ninodes_dec(struct super_block *sb) -{ - AuDebugOn(!atomic_long_read(&au_sbi(sb)->si_ninodes)); - atomic_long_dec(&au_sbi(sb)->si_ninodes); -} - -static inline void au_nfiles_inc(struct super_block *sb) -{ - atomic_long_inc(&au_sbi(sb)->si_nfiles); -} - -static inline void au_nfiles_dec(struct super_block *sb) -{ - AuDebugOn(!atomic_long_read(&au_sbi(sb)->si_nfiles)); - atomic_long_dec(&au_sbi(sb)->si_nfiles); -} - -static inline struct au_branch *au_sbr(struct super_block *sb, - aufs_bindex_t bindex) -{ - SiMustAnyLock(sb); - return au_sbi(sb)->si_branch[0 + bindex]; -} - -static inline void au_xino_brid_set(struct super_block *sb, aufs_bindex_t brid) -{ - SiMustWriteLock(sb); - au_sbi(sb)->si_xino_brid = brid; -} - -static inline aufs_bindex_t au_xino_brid(struct super_block *sb) -{ - SiMustAnyLock(sb); - return au_sbi(sb)->si_xino_brid; -} - -#endif /* __KERNEL__ */ -#endif /* __AUFS_SUPER_H__ */ diff --git a/fs/aufs/sysaufs.c b/fs/aufs/sysaufs.c deleted file mode 100644 index 8ec10fb31..000000000 --- a/fs/aufs/sysaufs.c +++ /dev/null @@ -1,91 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * sysfs interface and lifetime management - * they are necessary regardless sysfs is disabled. - */ - -#include <linux/random.h> -#include "aufs.h" - -unsigned long sysaufs_si_mask; -struct kset *sysaufs_kset; - -#define AuSiAttr(_name) { \ - .attr = { .name = __stringify(_name), .mode = 0444 }, \ - .show = sysaufs_si_##_name, \ -} - -static struct sysaufs_si_attr sysaufs_si_attr_xi_path = AuSiAttr(xi_path); -struct attribute *sysaufs_si_attrs[] = { - &sysaufs_si_attr_xi_path.attr, - NULL, -}; - -static const struct sysfs_ops au_sbi_ops = { - .show = sysaufs_si_show -}; - -static struct kobj_type au_sbi_ktype = { - .release = au_si_free, - .sysfs_ops = &au_sbi_ops, - .default_attrs = sysaufs_si_attrs -}; - -/* ---------------------------------------------------------------------- */ - -int sysaufs_si_init(struct au_sbinfo *sbinfo) -{ - int err; - - sbinfo->si_kobj.kset = sysaufs_kset; - /* cf. sysaufs_name() */ - err = kobject_init_and_add - (&sbinfo->si_kobj, &au_sbi_ktype, /*&sysaufs_kset->kobj*/NULL, - SysaufsSiNamePrefix "%lx", sysaufs_si_id(sbinfo)); - - dbgaufs_si_null(sbinfo); - if (!err) { - err = dbgaufs_si_init(sbinfo); - if (unlikely(err)) - kobject_put(&sbinfo->si_kobj); - } - return err; -} - -void sysaufs_fin(void) -{ - dbgaufs_fin(); - sysfs_remove_group(&sysaufs_kset->kobj, sysaufs_attr_group); - kset_unregister(sysaufs_kset); -} - -int __init sysaufs_init(void) -{ - int err; - - do { - get_random_bytes(&sysaufs_si_mask, sizeof(sysaufs_si_mask)); - } while (!sysaufs_si_mask); - - err = -EINVAL; - sysaufs_kset = kset_create_and_add(AUFS_NAME, NULL, fs_kobj); - if (unlikely(!sysaufs_kset)) - goto out; - err = PTR_ERR(sysaufs_kset); - if (IS_ERR(sysaufs_kset)) - goto out; - err = sysfs_create_group(&sysaufs_kset->kobj, sysaufs_attr_group); - if (unlikely(err)) { - kset_unregister(sysaufs_kset); - goto out; - } - - err = dbgaufs_init(); - if (unlikely(err)) - sysaufs_fin(); -out: - return err; -} diff --git a/fs/aufs/sysaufs.h b/fs/aufs/sysaufs.h deleted file mode 100644 index 1f799835e..000000000 --- a/fs/aufs/sysaufs.h +++ /dev/null @@ -1,88 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * sysfs interface and mount lifetime management - */ - -#ifndef __SYSAUFS_H__ -#define __SYSAUFS_H__ - -#ifdef __KERNEL__ - -#include <linux/sysfs.h> -#include "module.h" - -struct super_block; -struct au_sbinfo; - -struct sysaufs_si_attr { - struct attribute attr; - int (*show)(struct seq_file *seq, struct super_block *sb); -}; - -/* ---------------------------------------------------------------------- */ - -/* sysaufs.c */ -extern unsigned long sysaufs_si_mask; -extern struct kset *sysaufs_kset; -extern struct attribute *sysaufs_si_attrs[]; -int sysaufs_si_init(struct au_sbinfo *sbinfo); -int __init sysaufs_init(void); -void sysaufs_fin(void); - -/* ---------------------------------------------------------------------- */ - -/* some people doesn't like to show a pointer in kernel */ -static inline unsigned long sysaufs_si_id(struct au_sbinfo *sbinfo) -{ - return sysaufs_si_mask ^ (unsigned long)sbinfo; -} - -#define SysaufsSiNamePrefix "si_" -#define SysaufsSiNameLen (sizeof(SysaufsSiNamePrefix) + 16) -static inline void sysaufs_name(struct au_sbinfo *sbinfo, char *name) -{ - snprintf(name, SysaufsSiNameLen, SysaufsSiNamePrefix "%lx", - sysaufs_si_id(sbinfo)); -} - -struct au_branch; -#ifdef CONFIG_SYSFS -/* sysfs.c */ -extern struct attribute_group *sysaufs_attr_group; - -int sysaufs_si_xi_path(struct seq_file *seq, struct super_block *sb); -ssize_t sysaufs_si_show(struct kobject *kobj, struct attribute *attr, - char *buf); -long au_brinfo_ioctl(struct file *file, unsigned long arg); -#ifdef CONFIG_COMPAT -long au_brinfo_compat_ioctl(struct file *file, unsigned long arg); -#endif - -void sysaufs_br_init(struct au_branch *br); -void sysaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex); -void sysaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex); - -#define sysaufs_brs_init() do {} while (0) - -#else -#define sysaufs_attr_group NULL - -AuStubInt0(sysaufs_si_xi_path, struct seq_file *seq, struct super_block *sb) -AuStub(ssize_t, sysaufs_si_show, return 0, struct kobject *kobj, - struct attribute *attr, char *buf) -AuStubVoid(sysaufs_br_init, struct au_branch *br) -AuStubVoid(sysaufs_brs_add, struct super_block *sb, aufs_bindex_t bindex) -AuStubVoid(sysaufs_brs_del, struct super_block *sb, aufs_bindex_t bindex) - -static inline void sysaufs_brs_init(void) -{ - sysaufs_brs = 0; -} - -#endif /* CONFIG_SYSFS */ - -#endif /* __KERNEL__ */ -#endif /* __SYSAUFS_H__ */ diff --git a/fs/aufs/sysfs.c b/fs/aufs/sysfs.c deleted file mode 100644 index ed42f53d0..000000000 --- a/fs/aufs/sysfs.c +++ /dev/null @@ -1,340 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * sysfs interface - */ - -#include <linux/compat.h> -#include <linux/seq_file.h> -#include "aufs.h" - -static struct attribute *au_attr[] = { - NULL, /* need to NULL terminate the list of attributes */ -}; - -static struct attribute_group sysaufs_attr_group_body = { - .attrs = au_attr -}; - -struct attribute_group *sysaufs_attr_group = &sysaufs_attr_group_body; - -/* ---------------------------------------------------------------------- */ - -int sysaufs_si_xi_path(struct seq_file *seq, struct super_block *sb) -{ - int err; - - SiMustAnyLock(sb); - - err = 0; - if (au_opt_test(au_mntflags(sb), XINO)) { - err = au_xino_path(seq, au_sbi(sb)->si_xib); - seq_putc(seq, '\n'); - } - return err; -} - -/* - * the lifetime of branch is independent from the entry under sysfs. - * sysfs handles the lifetime of the entry, and never call ->show() after it is - * unlinked. - */ -static int sysaufs_si_br(struct seq_file *seq, struct super_block *sb, - aufs_bindex_t bindex, int idx) -{ - int err; - struct path path; - struct dentry *root; - struct au_branch *br; - au_br_perm_str_t perm; - - AuDbg("b%d\n", bindex); - - err = 0; - root = sb->s_root; - di_read_lock_parent(root, !AuLock_IR); - br = au_sbr(sb, bindex); - - switch (idx) { - case AuBrSysfs_BR: - path.mnt = au_br_mnt(br); - path.dentry = au_h_dptr(root, bindex); - err = au_seq_path(seq, &path); - if (!err) { - au_optstr_br_perm(&perm, br->br_perm); - seq_printf(seq, "=%s\n", perm.a); - } - break; - case AuBrSysfs_BRID: - seq_printf(seq, "%d\n", br->br_id); - break; - } - di_read_unlock(root, !AuLock_IR); - if (unlikely(err || seq_has_overflowed(seq))) - err = -E2BIG; - - return err; -} - -/* ---------------------------------------------------------------------- */ - -static struct seq_file *au_seq(char *p, ssize_t len) -{ - struct seq_file *seq; - - seq = kzalloc(sizeof(*seq), GFP_NOFS); - if (seq) { - /* mutex_init(&seq.lock); */ - seq->buf = p; - seq->size = len; - return seq; /* success */ - } - - seq = ERR_PTR(-ENOMEM); - return seq; -} - -#define SysaufsBr_PREFIX "br" -#define SysaufsBrid_PREFIX "brid" - -/* todo: file size may exceed PAGE_SIZE */ -ssize_t sysaufs_si_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - ssize_t err; - int idx; - long l; - aufs_bindex_t bend; - struct au_sbinfo *sbinfo; - struct super_block *sb; - struct seq_file *seq; - char *name; - struct attribute **cattr; - - sbinfo = container_of(kobj, struct au_sbinfo, si_kobj); - sb = sbinfo->si_sb; - - /* - * prevent a race condition between sysfs and aufs. - * for instance, sysfs_file_read() calls sysfs_get_active_two() which - * prohibits maintaining the sysfs entries. - * hew we acquire read lock after sysfs_get_active_two(). - * on the other hand, the remount process may maintain the sysfs/aufs - * entries after acquiring write lock. - * it can cause a deadlock. - * simply we gave up processing read here. - */ - err = -EBUSY; - if (unlikely(!si_noflush_read_trylock(sb))) - goto out; - - seq = au_seq(buf, PAGE_SIZE); - err = PTR_ERR(seq); - if (IS_ERR(seq)) - goto out_unlock; - - name = (void *)attr->name; - cattr = sysaufs_si_attrs; - while (*cattr) { - if (!strcmp(name, (*cattr)->name)) { - err = container_of(*cattr, struct sysaufs_si_attr, attr) - ->show(seq, sb); - goto out_seq; - } - cattr++; - } - - if (!strncmp(name, SysaufsBrid_PREFIX, - sizeof(SysaufsBrid_PREFIX) - 1)) { - idx = AuBrSysfs_BRID; - name += sizeof(SysaufsBrid_PREFIX) - 1; - } else if (!strncmp(name, SysaufsBr_PREFIX, - sizeof(SysaufsBr_PREFIX) - 1)) { - idx = AuBrSysfs_BR; - name += sizeof(SysaufsBr_PREFIX) - 1; - } else - BUG(); - - err = kstrtol(name, 10, &l); - if (!err) { - bend = au_sbend(sb); - if (l <= bend) - err = sysaufs_si_br(seq, sb, (aufs_bindex_t)l, idx); - else - err = -ENOENT; - } - -out_seq: - if (!err) { - err = seq->count; - /* sysfs limit */ - if (unlikely(err == PAGE_SIZE)) - err = -EFBIG; - } - kfree(seq); -out_unlock: - si_read_unlock(sb); -out: - return err; -} - -/* ---------------------------------------------------------------------- */ - -static int au_brinfo(struct super_block *sb, union aufs_brinfo __user *arg) -{ - int err; - int16_t brid; - aufs_bindex_t bindex, bend; - size_t sz; - char *buf; - struct seq_file *seq; - struct au_branch *br; - - si_read_lock(sb, AuLock_FLUSH); - bend = au_sbend(sb); - err = bend + 1; - if (!arg) - goto out; - - err = -ENOMEM; - buf = (void *)__get_free_page(GFP_NOFS); - if (unlikely(!buf)) - goto out; - - seq = au_seq(buf, PAGE_SIZE); - err = PTR_ERR(seq); - if (IS_ERR(seq)) - goto out_buf; - - sz = sizeof(*arg) - offsetof(union aufs_brinfo, path); - for (bindex = 0; bindex <= bend; bindex++, arg++) { - err = !access_ok(VERIFY_WRITE, arg, sizeof(*arg)); - if (unlikely(err)) - break; - - br = au_sbr(sb, bindex); - brid = br->br_id; - BUILD_BUG_ON(sizeof(brid) != sizeof(arg->id)); - err = __put_user(brid, &arg->id); - if (unlikely(err)) - break; - - BUILD_BUG_ON(sizeof(br->br_perm) != sizeof(arg->perm)); - err = __put_user(br->br_perm, &arg->perm); - if (unlikely(err)) - break; - - err = au_seq_path(seq, &br->br_path); - if (unlikely(err)) - break; - seq_putc(seq, '\0'); - if (!seq_has_overflowed(seq)) { - err = copy_to_user(arg->path, seq->buf, seq->count); - seq->count = 0; - if (unlikely(err)) - break; - } else { - err = -E2BIG; - goto out_seq; - } - } - if (unlikely(err)) - err = -EFAULT; - -out_seq: - kfree(seq); -out_buf: - free_page((unsigned long)buf); -out: - si_read_unlock(sb); - return err; -} - -long au_brinfo_ioctl(struct file *file, unsigned long arg) -{ - return au_brinfo(file->f_path.dentry->d_sb, (void __user *)arg); -} - -#ifdef CONFIG_COMPAT -long au_brinfo_compat_ioctl(struct file *file, unsigned long arg) -{ - return au_brinfo(file->f_path.dentry->d_sb, compat_ptr(arg)); -} -#endif - -/* ---------------------------------------------------------------------- */ - -void sysaufs_br_init(struct au_branch *br) -{ - int i; - struct au_brsysfs *br_sysfs; - struct attribute *attr; - - br_sysfs = br->br_sysfs; - for (i = 0; i < ARRAY_SIZE(br->br_sysfs); i++) { - attr = &br_sysfs->attr; - sysfs_attr_init(attr); - attr->name = br_sysfs->name; - attr->mode = S_IRUGO; - br_sysfs++; - } -} - -void sysaufs_brs_del(struct super_block *sb, aufs_bindex_t bindex) -{ - struct au_branch *br; - struct kobject *kobj; - struct au_brsysfs *br_sysfs; - int i; - aufs_bindex_t bend; - - dbgaufs_brs_del(sb, bindex); - - if (!sysaufs_brs) - return; - - kobj = &au_sbi(sb)->si_kobj; - bend = au_sbend(sb); - for (; bindex <= bend; bindex++) { - br = au_sbr(sb, bindex); - br_sysfs = br->br_sysfs; - for (i = 0; i < ARRAY_SIZE(br->br_sysfs); i++) { - sysfs_remove_file(kobj, &br_sysfs->attr); - br_sysfs++; - } - } -} - -void sysaufs_brs_add(struct super_block *sb, aufs_bindex_t bindex) -{ - int err, i; - aufs_bindex_t bend; - struct kobject *kobj; - struct au_branch *br; - struct au_brsysfs *br_sysfs; - - dbgaufs_brs_add(sb, bindex); - - if (!sysaufs_brs) - return; - - kobj = &au_sbi(sb)->si_kobj; - bend = au_sbend(sb); - for (; bindex <= bend; bindex++) { - br = au_sbr(sb, bindex); - br_sysfs = br->br_sysfs; - snprintf(br_sysfs[AuBrSysfs_BR].name, sizeof(br_sysfs->name), - SysaufsBr_PREFIX "%d", bindex); - snprintf(br_sysfs[AuBrSysfs_BRID].name, sizeof(br_sysfs->name), - SysaufsBrid_PREFIX "%d", bindex); - for (i = 0; i < ARRAY_SIZE(br->br_sysfs); i++) { - err = sysfs_create_file(kobj, &br_sysfs->attr); - if (unlikely(err)) - pr_warn("failed %s under sysfs(%d)\n", - br_sysfs->name, err); - br_sysfs++; - } - } -} diff --git a/fs/aufs/sysrq.c b/fs/aufs/sysrq.c deleted file mode 100644 index 7921ed716..000000000 --- a/fs/aufs/sysrq.c +++ /dev/null @@ -1,144 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * magic sysrq hanlder - */ - -/* #include <linux/sysrq.h> */ -#include <linux/writeback.h> -#include "aufs.h" - -/* ---------------------------------------------------------------------- */ - -static void sysrq_sb(struct super_block *sb) -{ - char *plevel; - struct au_sbinfo *sbinfo; - struct file *file; - struct au_sphlhead *files; - struct au_finfo *finfo; - - plevel = au_plevel; - au_plevel = KERN_WARNING; - - /* since we define pr_fmt, call printk directly */ -#define pr(str) printk(KERN_WARNING AUFS_NAME ": " str) - - sbinfo = au_sbi(sb); - printk(KERN_WARNING "si=%lx\n", sysaufs_si_id(sbinfo)); - pr("superblock\n"); - au_dpri_sb(sb); - -#if 0 - pr("root dentry\n"); - au_dpri_dentry(sb->s_root); - pr("root inode\n"); - au_dpri_inode(d_inode(sb->s_root)); -#endif - -#if 0 - do { - int err, i, j, ndentry; - struct au_dcsub_pages dpages; - struct au_dpage *dpage; - - err = au_dpages_init(&dpages, GFP_ATOMIC); - if (unlikely(err)) - break; - err = au_dcsub_pages(&dpages, sb->s_root, NULL, NULL); - if (!err) - for (i = 0; i < dpages.ndpage; i++) { - dpage = dpages.dpages + i; - ndentry = dpage->ndentry; - for (j = 0; j < ndentry; j++) - au_dpri_dentry(dpage->dentries[j]); - } - au_dpages_free(&dpages); - } while (0); -#endif - -#if 1 - { - struct inode *i; - - pr("isolated inode\n"); - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry(i, &sb->s_inodes, i_sb_list) { - spin_lock(&i->i_lock); - if (1 || hlist_empty(&i->i_dentry)) - au_dpri_inode(i); - spin_unlock(&i->i_lock); - } - spin_unlock(&sb->s_inode_list_lock); - } -#endif - pr("files\n"); - files = &au_sbi(sb)->si_files; - spin_lock(&files->spin); - hlist_for_each_entry(finfo, &files->head, fi_hlist) { - umode_t mode; - - file = finfo->fi_file; - mode = file_inode(file)->i_mode; - if (!special_file(mode)) - au_dpri_file(file); - } - spin_unlock(&files->spin); - pr("done\n"); - -#undef pr - au_plevel = plevel; -} - -/* ---------------------------------------------------------------------- */ - -/* module parameter */ -static char *aufs_sysrq_key = "a"; -module_param_named(sysrq, aufs_sysrq_key, charp, S_IRUGO); -MODULE_PARM_DESC(sysrq, "MagicSysRq key for " AUFS_NAME); - -static void au_sysrq(int key __maybe_unused) -{ - struct au_sbinfo *sbinfo; - - lockdep_off(); - au_sbilist_lock(); - list_for_each_entry(sbinfo, &au_sbilist.head, si_list) - sysrq_sb(sbinfo->si_sb); - au_sbilist_unlock(); - lockdep_on(); -} - -static struct sysrq_key_op au_sysrq_op = { - .handler = au_sysrq, - .help_msg = "Aufs", - .action_msg = "Aufs", - .enable_mask = SYSRQ_ENABLE_DUMP -}; - -/* ---------------------------------------------------------------------- */ - -int __init au_sysrq_init(void) -{ - int err; - char key; - - err = -1; - key = *aufs_sysrq_key; - if ('a' <= key && key <= 'z') - err = register_sysrq_key(key, &au_sysrq_op); - if (unlikely(err)) - pr_err("err %d, sysrq=%c\n", err, key); - return err; -} - -void au_sysrq_fin(void) -{ - int err; - - err = unregister_sysrq_key(*aufs_sysrq_key, &au_sysrq_op); - if (unlikely(err)) - pr_err("err %d (ignored)\n", err); -} diff --git a/fs/aufs/vdir.c b/fs/aufs/vdir.c deleted file mode 100644 index f64cc2b7a..000000000 --- a/fs/aufs/vdir.c +++ /dev/null @@ -1,875 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * virtual or vertical directory - */ - -#include "aufs.h" - -static unsigned int calc_size(int nlen) -{ - return ALIGN(sizeof(struct au_vdir_de) + nlen, sizeof(ino_t)); -} - -static int set_deblk_end(union au_vdir_deblk_p *p, - union au_vdir_deblk_p *deblk_end) -{ - if (calc_size(0) <= deblk_end->deblk - p->deblk) { - p->de->de_str.len = 0; - /* smp_mb(); */ - return 0; - } - return -1; /* error */ -} - -/* returns true or false */ -static int is_deblk_end(union au_vdir_deblk_p *p, - union au_vdir_deblk_p *deblk_end) -{ - if (calc_size(0) <= deblk_end->deblk - p->deblk) - return !p->de->de_str.len; - return 1; -} - -static unsigned char *last_deblk(struct au_vdir *vdir) -{ - return vdir->vd_deblk[vdir->vd_nblk - 1]; -} - -/* ---------------------------------------------------------------------- */ - -/* estimate the appropriate size for name hash table */ -unsigned int au_rdhash_est(loff_t sz) -{ - unsigned int n; - - n = UINT_MAX; - sz >>= 10; - if (sz < n) - n = sz; - if (sz < AUFS_RDHASH_DEF) - n = AUFS_RDHASH_DEF; - /* pr_info("n %u\n", n); */ - return n; -} - -/* - * the allocated memory has to be freed by - * au_nhash_wh_free() or au_nhash_de_free(). - */ -int au_nhash_alloc(struct au_nhash *nhash, unsigned int num_hash, gfp_t gfp) -{ - struct hlist_head *head; - unsigned int u; - size_t sz; - - sz = sizeof(*nhash->nh_head) * num_hash; - head = kmalloc(sz, gfp); - if (head) { - nhash->nh_num = num_hash; - nhash->nh_head = head; - for (u = 0; u < num_hash; u++) - INIT_HLIST_HEAD(head++); - return 0; /* success */ - } - - return -ENOMEM; -} - -static void nhash_count(struct hlist_head *head) -{ -#if 0 - unsigned long n; - struct hlist_node *pos; - - n = 0; - hlist_for_each(pos, head) - n++; - pr_info("%lu\n", n); -#endif -} - -static void au_nhash_wh_do_free(struct hlist_head *head) -{ - struct au_vdir_wh *pos; - struct hlist_node *node; - - hlist_for_each_entry_safe(pos, node, head, wh_hash) - kfree(pos); -} - -static void au_nhash_de_do_free(struct hlist_head *head) -{ - struct au_vdir_dehstr *pos; - struct hlist_node *node; - - hlist_for_each_entry_safe(pos, node, head, hash) - au_cache_free_vdir_dehstr(pos); -} - -static void au_nhash_do_free(struct au_nhash *nhash, - void (*free)(struct hlist_head *head)) -{ - unsigned int n; - struct hlist_head *head; - - n = nhash->nh_num; - if (!n) - return; - - head = nhash->nh_head; - while (n-- > 0) { - nhash_count(head); - free(head++); - } - kfree(nhash->nh_head); -} - -void au_nhash_wh_free(struct au_nhash *whlist) -{ - au_nhash_do_free(whlist, au_nhash_wh_do_free); -} - -static void au_nhash_de_free(struct au_nhash *delist) -{ - au_nhash_do_free(delist, au_nhash_de_do_free); -} - -/* ---------------------------------------------------------------------- */ - -int au_nhash_test_longer_wh(struct au_nhash *whlist, aufs_bindex_t btgt, - int limit) -{ - int num; - unsigned int u, n; - struct hlist_head *head; - struct au_vdir_wh *pos; - - num = 0; - n = whlist->nh_num; - head = whlist->nh_head; - for (u = 0; u < n; u++, head++) - hlist_for_each_entry(pos, head, wh_hash) - if (pos->wh_bindex == btgt && ++num > limit) - return 1; - return 0; -} - -static struct hlist_head *au_name_hash(struct au_nhash *nhash, - unsigned char *name, - unsigned int len) -{ - unsigned int v; - /* const unsigned int magic_bit = 12; */ - - AuDebugOn(!nhash->nh_num || !nhash->nh_head); - - v = 0; - while (len--) - v += *name++; - /* v = hash_long(v, magic_bit); */ - v %= nhash->nh_num; - return nhash->nh_head + v; -} - -static int au_nhash_test_name(struct au_vdir_destr *str, const char *name, - int nlen) -{ - return str->len == nlen && !memcmp(str->name, name, nlen); -} - -/* returns found or not */ -int au_nhash_test_known_wh(struct au_nhash *whlist, char *name, int nlen) -{ - struct hlist_head *head; - struct au_vdir_wh *pos; - struct au_vdir_destr *str; - - head = au_name_hash(whlist, name, nlen); - hlist_for_each_entry(pos, head, wh_hash) { - str = &pos->wh_str; - AuDbg("%.*s\n", str->len, str->name); - if (au_nhash_test_name(str, name, nlen)) - return 1; - } - return 0; -} - -/* returns found(true) or not */ -static int test_known(struct au_nhash *delist, char *name, int nlen) -{ - struct hlist_head *head; - struct au_vdir_dehstr *pos; - struct au_vdir_destr *str; - - head = au_name_hash(delist, name, nlen); - hlist_for_each_entry(pos, head, hash) { - str = pos->str; - AuDbg("%.*s\n", str->len, str->name); - if (au_nhash_test_name(str, name, nlen)) - return 1; - } - return 0; -} - -static void au_shwh_init_wh(struct au_vdir_wh *wh, ino_t ino, - unsigned char d_type) -{ -#ifdef CONFIG_AUFS_SHWH - wh->wh_ino = ino; - wh->wh_type = d_type; -#endif -} - -/* ---------------------------------------------------------------------- */ - -int au_nhash_append_wh(struct au_nhash *whlist, char *name, int nlen, ino_t ino, - unsigned int d_type, aufs_bindex_t bindex, - unsigned char shwh) -{ - int err; - struct au_vdir_destr *str; - struct au_vdir_wh *wh; - - AuDbg("%.*s\n", nlen, name); - AuDebugOn(!whlist->nh_num || !whlist->nh_head); - - err = -ENOMEM; - wh = kmalloc(sizeof(*wh) + nlen, GFP_NOFS); - if (unlikely(!wh)) - goto out; - - err = 0; - wh->wh_bindex = bindex; - if (shwh) - au_shwh_init_wh(wh, ino, d_type); - str = &wh->wh_str; - str->len = nlen; - memcpy(str->name, name, nlen); - hlist_add_head(&wh->wh_hash, au_name_hash(whlist, name, nlen)); - /* smp_mb(); */ - -out: - return err; -} - -static int append_deblk(struct au_vdir *vdir) -{ - int err; - unsigned long ul; - const unsigned int deblk_sz = vdir->vd_deblk_sz; - union au_vdir_deblk_p p, deblk_end; - unsigned char **o; - - err = -ENOMEM; - o = krealloc(vdir->vd_deblk, sizeof(*o) * (vdir->vd_nblk + 1), - GFP_NOFS); - if (unlikely(!o)) - goto out; - - vdir->vd_deblk = o; - p.deblk = kmalloc(deblk_sz, GFP_NOFS); - if (p.deblk) { - ul = vdir->vd_nblk++; - vdir->vd_deblk[ul] = p.deblk; - vdir->vd_last.ul = ul; - vdir->vd_last.p.deblk = p.deblk; - deblk_end.deblk = p.deblk + deblk_sz; - err = set_deblk_end(&p, &deblk_end); - } - -out: - return err; -} - -static int append_de(struct au_vdir *vdir, char *name, int nlen, ino_t ino, - unsigned int d_type, struct au_nhash *delist) -{ - int err; - unsigned int sz; - const unsigned int deblk_sz = vdir->vd_deblk_sz; - union au_vdir_deblk_p p, *room, deblk_end; - struct au_vdir_dehstr *dehstr; - - p.deblk = last_deblk(vdir); - deblk_end.deblk = p.deblk + deblk_sz; - room = &vdir->vd_last.p; - AuDebugOn(room->deblk < p.deblk || deblk_end.deblk <= room->deblk - || !is_deblk_end(room, &deblk_end)); - - sz = calc_size(nlen); - if (unlikely(sz > deblk_end.deblk - room->deblk)) { - err = append_deblk(vdir); - if (unlikely(err)) - goto out; - - p.deblk = last_deblk(vdir); - deblk_end.deblk = p.deblk + deblk_sz; - /* smp_mb(); */ - AuDebugOn(room->deblk != p.deblk); - } - - err = -ENOMEM; - dehstr = au_cache_alloc_vdir_dehstr(); - if (unlikely(!dehstr)) - goto out; - - dehstr->str = &room->de->de_str; - hlist_add_head(&dehstr->hash, au_name_hash(delist, name, nlen)); - room->de->de_ino = ino; - room->de->de_type = d_type; - room->de->de_str.len = nlen; - memcpy(room->de->de_str.name, name, nlen); - - err = 0; - room->deblk += sz; - if (unlikely(set_deblk_end(room, &deblk_end))) - err = append_deblk(vdir); - /* smp_mb(); */ - -out: - return err; -} - -/* ---------------------------------------------------------------------- */ - -void au_vdir_free(struct au_vdir *vdir) -{ - unsigned char **deblk; - - deblk = vdir->vd_deblk; - while (vdir->vd_nblk--) - kfree(*deblk++); - kfree(vdir->vd_deblk); - au_cache_free_vdir(vdir); -} - -static struct au_vdir *alloc_vdir(struct file *file) -{ - struct au_vdir *vdir; - struct super_block *sb; - int err; - - sb = file->f_path.dentry->d_sb; - SiMustAnyLock(sb); - - err = -ENOMEM; - vdir = au_cache_alloc_vdir(); - if (unlikely(!vdir)) - goto out; - - vdir->vd_deblk = kzalloc(sizeof(*vdir->vd_deblk), GFP_NOFS); - if (unlikely(!vdir->vd_deblk)) - goto out_free; - - vdir->vd_deblk_sz = au_sbi(sb)->si_rdblk; - if (!vdir->vd_deblk_sz) { - /* estimate the appropriate size for deblk */ - vdir->vd_deblk_sz = au_dir_size(file, /*dentry*/NULL); - /* pr_info("vd_deblk_sz %u\n", vdir->vd_deblk_sz); */ - } - vdir->vd_nblk = 0; - vdir->vd_version = 0; - vdir->vd_jiffy = 0; - err = append_deblk(vdir); - if (!err) - return vdir; /* success */ - - kfree(vdir->vd_deblk); - -out_free: - au_cache_free_vdir(vdir); -out: - vdir = ERR_PTR(err); - return vdir; -} - -static int reinit_vdir(struct au_vdir *vdir) -{ - int err; - union au_vdir_deblk_p p, deblk_end; - - while (vdir->vd_nblk > 1) { - kfree(vdir->vd_deblk[vdir->vd_nblk - 1]); - /* vdir->vd_deblk[vdir->vd_nblk - 1] = NULL; */ - vdir->vd_nblk--; - } - p.deblk = vdir->vd_deblk[0]; - deblk_end.deblk = p.deblk + vdir->vd_deblk_sz; - err = set_deblk_end(&p, &deblk_end); - /* keep vd_dblk_sz */ - vdir->vd_last.ul = 0; - vdir->vd_last.p.deblk = vdir->vd_deblk[0]; - vdir->vd_version = 0; - vdir->vd_jiffy = 0; - /* smp_mb(); */ - return err; -} - -/* ---------------------------------------------------------------------- */ - -#define AuFillVdir_CALLED 1 -#define AuFillVdir_WHABLE (1 << 1) -#define AuFillVdir_SHWH (1 << 2) -#define au_ftest_fillvdir(flags, name) ((flags) & AuFillVdir_##name) -#define au_fset_fillvdir(flags, name) \ - do { (flags) |= AuFillVdir_##name; } while (0) -#define au_fclr_fillvdir(flags, name) \ - do { (flags) &= ~AuFillVdir_##name; } while (0) - -#ifndef CONFIG_AUFS_SHWH -#undef AuFillVdir_SHWH -#define AuFillVdir_SHWH 0 -#endif - -struct fillvdir_arg { - struct dir_context ctx; - struct file *file; - struct au_vdir *vdir; - struct au_nhash delist; - struct au_nhash whlist; - aufs_bindex_t bindex; - unsigned int flags; - int err; -}; - -static int fillvdir(struct dir_context *ctx, const char *__name, int nlen, - loff_t offset __maybe_unused, u64 h_ino, - unsigned int d_type) -{ - struct fillvdir_arg *arg = container_of(ctx, struct fillvdir_arg, ctx); - char *name = (void *)__name; - struct super_block *sb; - ino_t ino; - const unsigned char shwh = !!au_ftest_fillvdir(arg->flags, SHWH); - - arg->err = 0; - sb = arg->file->f_path.dentry->d_sb; - au_fset_fillvdir(arg->flags, CALLED); - /* smp_mb(); */ - if (nlen <= AUFS_WH_PFX_LEN - || memcmp(name, AUFS_WH_PFX, AUFS_WH_PFX_LEN)) { - if (test_known(&arg->delist, name, nlen) - || au_nhash_test_known_wh(&arg->whlist, name, nlen)) - goto out; /* already exists or whiteouted */ - - arg->err = au_ino(sb, arg->bindex, h_ino, d_type, &ino); - if (!arg->err) { - if (unlikely(nlen > AUFS_MAX_NAMELEN)) - d_type = DT_UNKNOWN; - arg->err = append_de(arg->vdir, name, nlen, ino, - d_type, &arg->delist); - } - } else if (au_ftest_fillvdir(arg->flags, WHABLE)) { - name += AUFS_WH_PFX_LEN; - nlen -= AUFS_WH_PFX_LEN; - if (au_nhash_test_known_wh(&arg->whlist, name, nlen)) - goto out; /* already whiteouted */ - - if (shwh) - arg->err = au_wh_ino(sb, arg->bindex, h_ino, d_type, - &ino); - if (!arg->err) { - if (nlen <= AUFS_MAX_NAMELEN + AUFS_WH_PFX_LEN) - d_type = DT_UNKNOWN; - arg->err = au_nhash_append_wh - (&arg->whlist, name, nlen, ino, d_type, - arg->bindex, shwh); - } - } - -out: - if (!arg->err) - arg->vdir->vd_jiffy = jiffies; - /* smp_mb(); */ - AuTraceErr(arg->err); - return arg->err; -} - -static int au_handle_shwh(struct super_block *sb, struct au_vdir *vdir, - struct au_nhash *whlist, struct au_nhash *delist) -{ -#ifdef CONFIG_AUFS_SHWH - int err; - unsigned int nh, u; - struct hlist_head *head; - struct au_vdir_wh *pos; - struct hlist_node *n; - char *p, *o; - struct au_vdir_destr *destr; - - AuDebugOn(!au_opt_test(au_mntflags(sb), SHWH)); - - err = -ENOMEM; - o = p = (void *)__get_free_page(GFP_NOFS); - if (unlikely(!p)) - goto out; - - err = 0; - nh = whlist->nh_num; - memcpy(p, AUFS_WH_PFX, AUFS_WH_PFX_LEN); - p += AUFS_WH_PFX_LEN; - for (u = 0; u < nh; u++) { - head = whlist->nh_head + u; - hlist_for_each_entry_safe(pos, n, head, wh_hash) { - destr = &pos->wh_str; - memcpy(p, destr->name, destr->len); - err = append_de(vdir, o, destr->len + AUFS_WH_PFX_LEN, - pos->wh_ino, pos->wh_type, delist); - if (unlikely(err)) - break; - } - } - - free_page((unsigned long)o); - -out: - AuTraceErr(err); - return err; -#else - return 0; -#endif -} - -static int au_do_read_vdir(struct fillvdir_arg *arg) -{ - int err; - unsigned int rdhash; - loff_t offset; - aufs_bindex_t bend, bindex, bstart; - unsigned char shwh; - struct file *hf, *file; - struct super_block *sb; - - file = arg->file; - sb = file->f_path.dentry->d_sb; - SiMustAnyLock(sb); - - rdhash = au_sbi(sb)->si_rdhash; - if (!rdhash) - rdhash = au_rdhash_est(au_dir_size(file, /*dentry*/NULL)); - err = au_nhash_alloc(&arg->delist, rdhash, GFP_NOFS); - if (unlikely(err)) - goto out; - err = au_nhash_alloc(&arg->whlist, rdhash, GFP_NOFS); - if (unlikely(err)) - goto out_delist; - - err = 0; - arg->flags = 0; - shwh = 0; - if (au_opt_test(au_mntflags(sb), SHWH)) { - shwh = 1; - au_fset_fillvdir(arg->flags, SHWH); - } - bstart = au_fbstart(file); - bend = au_fbend_dir(file); - for (bindex = bstart; !err && bindex <= bend; bindex++) { - hf = au_hf_dir(file, bindex); - if (!hf) - continue; - - offset = vfsub_llseek(hf, 0, SEEK_SET); - err = offset; - if (unlikely(offset)) - break; - - arg->bindex = bindex; - au_fclr_fillvdir(arg->flags, WHABLE); - if (shwh - || (bindex != bend - && au_br_whable(au_sbr_perm(sb, bindex)))) - au_fset_fillvdir(arg->flags, WHABLE); - do { - arg->err = 0; - au_fclr_fillvdir(arg->flags, CALLED); - /* smp_mb(); */ - err = vfsub_iterate_dir(hf, &arg->ctx); - if (err >= 0) - err = arg->err; - } while (!err && au_ftest_fillvdir(arg->flags, CALLED)); - - /* - * dir_relax() may be good for concurrency, but aufs should not - * use it since it will cause a lockdep problem. - */ - } - - if (!err && shwh) - err = au_handle_shwh(sb, arg->vdir, &arg->whlist, &arg->delist); - - au_nhash_wh_free(&arg->whlist); - -out_delist: - au_nhash_de_free(&arg->delist); -out: - return err; -} - -static int read_vdir(struct file *file, int may_read) -{ - int err; - unsigned long expire; - unsigned char do_read; - struct fillvdir_arg arg = { - .ctx = { - .actor = fillvdir - } - }; - struct inode *inode; - struct au_vdir *vdir, *allocated; - - err = 0; - inode = file_inode(file); - IMustLock(inode); - SiMustAnyLock(inode->i_sb); - - allocated = NULL; - do_read = 0; - expire = au_sbi(inode->i_sb)->si_rdcache; - vdir = au_ivdir(inode); - if (!vdir) { - do_read = 1; - vdir = alloc_vdir(file); - err = PTR_ERR(vdir); - if (IS_ERR(vdir)) - goto out; - err = 0; - allocated = vdir; - } else if (may_read - && (inode->i_version != vdir->vd_version - || time_after(jiffies, vdir->vd_jiffy + expire))) { - do_read = 1; - err = reinit_vdir(vdir); - if (unlikely(err)) - goto out; - } - - if (!do_read) - return 0; /* success */ - - arg.file = file; - arg.vdir = vdir; - err = au_do_read_vdir(&arg); - if (!err) { - /* file->f_pos = 0; */ /* todo: ctx->pos? */ - vdir->vd_version = inode->i_version; - vdir->vd_last.ul = 0; - vdir->vd_last.p.deblk = vdir->vd_deblk[0]; - if (allocated) - au_set_ivdir(inode, allocated); - } else if (allocated) - au_vdir_free(allocated); - -out: - return err; -} - -static int copy_vdir(struct au_vdir *tgt, struct au_vdir *src) -{ - int err, rerr; - unsigned long ul, n; - const unsigned int deblk_sz = src->vd_deblk_sz; - - AuDebugOn(tgt->vd_nblk != 1); - - err = -ENOMEM; - if (tgt->vd_nblk < src->vd_nblk) { - unsigned char **p; - - p = krealloc(tgt->vd_deblk, sizeof(*p) * src->vd_nblk, - GFP_NOFS); - if (unlikely(!p)) - goto out; - tgt->vd_deblk = p; - } - - if (tgt->vd_deblk_sz != deblk_sz) { - unsigned char *p; - - tgt->vd_deblk_sz = deblk_sz; - p = krealloc(tgt->vd_deblk[0], deblk_sz, GFP_NOFS); - if (unlikely(!p)) - goto out; - tgt->vd_deblk[0] = p; - } - memcpy(tgt->vd_deblk[0], src->vd_deblk[0], deblk_sz); - tgt->vd_version = src->vd_version; - tgt->vd_jiffy = src->vd_jiffy; - - n = src->vd_nblk; - for (ul = 1; ul < n; ul++) { - tgt->vd_deblk[ul] = kmemdup(src->vd_deblk[ul], deblk_sz, - GFP_NOFS); - if (unlikely(!tgt->vd_deblk[ul])) - goto out; - tgt->vd_nblk++; - } - tgt->vd_nblk = n; - tgt->vd_last.ul = tgt->vd_last.ul; - tgt->vd_last.p.deblk = tgt->vd_deblk[tgt->vd_last.ul]; - tgt->vd_last.p.deblk += src->vd_last.p.deblk - - src->vd_deblk[src->vd_last.ul]; - /* smp_mb(); */ - return 0; /* success */ - -out: - rerr = reinit_vdir(tgt); - BUG_ON(rerr); - return err; -} - -int au_vdir_init(struct file *file) -{ - int err; - struct inode *inode; - struct au_vdir *vdir_cache, *allocated; - - /* test file->f_pos here instead of ctx->pos */ - err = read_vdir(file, !file->f_pos); - if (unlikely(err)) - goto out; - - allocated = NULL; - vdir_cache = au_fvdir_cache(file); - if (!vdir_cache) { - vdir_cache = alloc_vdir(file); - err = PTR_ERR(vdir_cache); - if (IS_ERR(vdir_cache)) - goto out; - allocated = vdir_cache; - } else if (!file->f_pos && vdir_cache->vd_version != file->f_version) { - /* test file->f_pos here instead of ctx->pos */ - err = reinit_vdir(vdir_cache); - if (unlikely(err)) - goto out; - } else - return 0; /* success */ - - inode = file_inode(file); - err = copy_vdir(vdir_cache, au_ivdir(inode)); - if (!err) { - file->f_version = inode->i_version; - if (allocated) - au_set_fvdir_cache(file, allocated); - } else if (allocated) - au_vdir_free(allocated); - -out: - return err; -} - -static loff_t calc_offset(struct au_vdir *vdir) -{ - loff_t offset; - union au_vdir_deblk_p p; - - p.deblk = vdir->vd_deblk[vdir->vd_last.ul]; - offset = vdir->vd_last.p.deblk - p.deblk; - offset += vdir->vd_deblk_sz * vdir->vd_last.ul; - return offset; -} - -/* returns true or false */ -static int seek_vdir(struct file *file, struct dir_context *ctx) -{ - int valid; - unsigned int deblk_sz; - unsigned long ul, n; - loff_t offset; - union au_vdir_deblk_p p, deblk_end; - struct au_vdir *vdir_cache; - - valid = 1; - vdir_cache = au_fvdir_cache(file); - offset = calc_offset(vdir_cache); - AuDbg("offset %lld\n", offset); - if (ctx->pos == offset) - goto out; - - vdir_cache->vd_last.ul = 0; - vdir_cache->vd_last.p.deblk = vdir_cache->vd_deblk[0]; - if (!ctx->pos) - goto out; - - valid = 0; - deblk_sz = vdir_cache->vd_deblk_sz; - ul = div64_u64(ctx->pos, deblk_sz); - AuDbg("ul %lu\n", ul); - if (ul >= vdir_cache->vd_nblk) - goto out; - - n = vdir_cache->vd_nblk; - for (; ul < n; ul++) { - p.deblk = vdir_cache->vd_deblk[ul]; - deblk_end.deblk = p.deblk + deblk_sz; - offset = ul; - offset *= deblk_sz; - while (!is_deblk_end(&p, &deblk_end) && offset < ctx->pos) { - unsigned int l; - - l = calc_size(p.de->de_str.len); - offset += l; - p.deblk += l; - } - if (!is_deblk_end(&p, &deblk_end)) { - valid = 1; - vdir_cache->vd_last.ul = ul; - vdir_cache->vd_last.p = p; - break; - } - } - -out: - /* smp_mb(); */ - AuTraceErr(!valid); - return valid; -} - -int au_vdir_fill_de(struct file *file, struct dir_context *ctx) -{ - unsigned int l, deblk_sz; - union au_vdir_deblk_p deblk_end; - struct au_vdir *vdir_cache; - struct au_vdir_de *de; - - vdir_cache = au_fvdir_cache(file); - if (!seek_vdir(file, ctx)) - return 0; - - deblk_sz = vdir_cache->vd_deblk_sz; - while (1) { - deblk_end.deblk = vdir_cache->vd_deblk[vdir_cache->vd_last.ul]; - deblk_end.deblk += deblk_sz; - while (!is_deblk_end(&vdir_cache->vd_last.p, &deblk_end)) { - de = vdir_cache->vd_last.p.de; - AuDbg("%.*s, off%lld, i%lu, dt%d\n", - de->de_str.len, de->de_str.name, ctx->pos, - (unsigned long)de->de_ino, de->de_type); - if (unlikely(!dir_emit(ctx, de->de_str.name, - de->de_str.len, de->de_ino, - de->de_type))) { - /* todo: ignore the error caused by udba? */ - /* return err; */ - return 0; - } - - l = calc_size(de->de_str.len); - vdir_cache->vd_last.p.deblk += l; - ctx->pos += l; - } - if (vdir_cache->vd_last.ul < vdir_cache->vd_nblk - 1) { - vdir_cache->vd_last.ul++; - vdir_cache->vd_last.p.deblk - = vdir_cache->vd_deblk[vdir_cache->vd_last.ul]; - ctx->pos = deblk_sz * vdir_cache->vd_last.ul; - continue; - } - break; - } - - /* smp_mb(); */ - return 0; -} diff --git a/fs/aufs/vfsub.c b/fs/aufs/vfsub.c deleted file mode 100644 index 89f999c97..000000000 --- a/fs/aufs/vfsub.c +++ /dev/null @@ -1,853 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * sub-routines for VFS - */ - -#include <linux/namei.h> -#include <linux/nsproxy.h> -#include <linux/security.h> -#include <linux/splice.h> -#include "../fs/mount.h" -#include "aufs.h" - -#ifdef CONFIG_AUFS_BR_FUSE -int vfsub_test_mntns(struct vfsmount *mnt, struct super_block *h_sb) -{ - struct nsproxy *ns; - - if (!au_test_fuse(h_sb) || !au_userns) - return 0; - - ns = current->nsproxy; - /* no {get,put}_nsproxy(ns) */ - return real_mount(mnt)->mnt_ns == ns->mnt_ns ? 0 : -EACCES; -} -#endif - -/* ---------------------------------------------------------------------- */ - -int vfsub_update_h_iattr(struct path *h_path, int *did) -{ - int err; - struct kstat st; - struct super_block *h_sb; - - /* for remote fs, leave work for its getattr or d_revalidate */ - /* for bad i_attr fs, handle them in aufs_getattr() */ - /* still some fs may acquire i_mutex. we need to skip them */ - err = 0; - if (!did) - did = &err; - h_sb = h_path->dentry->d_sb; - *did = (!au_test_fs_remote(h_sb) && au_test_fs_refresh_iattr(h_sb)); - if (*did) - err = vfs_getattr(h_path, &st); - - return err; -} - -/* ---------------------------------------------------------------------- */ - -struct file *vfsub_dentry_open(struct path *path, int flags) -{ - struct file *file; - - file = dentry_open(path, flags /* | __FMODE_NONOTIFY */, - current_cred()); - if (!IS_ERR_OR_NULL(file) - && (file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) - i_readcount_inc(d_inode(path->dentry)); - - return file; -} - -struct file *vfsub_filp_open(const char *path, int oflags, int mode) -{ - struct file *file; - - lockdep_off(); - file = filp_open(path, - oflags /* | __FMODE_NONOTIFY */, - mode); - lockdep_on(); - if (IS_ERR(file)) - goto out; - vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/ - -out: - return file; -} - -/* - * Ideally this function should call VFS:do_last() in order to keep all its - * checkings. But it is very hard for aufs to regenerate several VFS internal - * structure such as nameidata. This is a second (or third) best approach. - * cf. linux/fs/namei.c:do_last(), lookup_open() and atomic_open(). - */ -int vfsub_atomic_open(struct inode *dir, struct dentry *dentry, - struct vfsub_aopen_args *args, struct au_branch *br) -{ - int err; - struct file *file = args->file; - /* copied from linux/fs/namei.c:atomic_open() */ - struct dentry *const DENTRY_NOT_SET = (void *)-1UL; - - IMustLock(dir); - AuDebugOn(!dir->i_op->atomic_open); - - err = au_br_test_oflag(args->open_flag, br); - if (unlikely(err)) - goto out; - - args->file->f_path.dentry = DENTRY_NOT_SET; - args->file->f_path.mnt = au_br_mnt(br); - err = dir->i_op->atomic_open(dir, dentry, file, args->open_flag, - args->create_mode, args->opened); - if (err >= 0) { - /* some filesystems don't set FILE_CREATED while succeeded? */ - if (*args->opened & FILE_CREATED) - fsnotify_create(dir, dentry); - } else - goto out; - - - if (!err) { - /* todo: call VFS:may_open() here */ - err = open_check_o_direct(file); - /* todo: ima_file_check() too? */ - if (!err && (args->open_flag & __FMODE_EXEC)) - err = deny_write_access(file); - if (unlikely(err)) - /* note that the file is created and still opened */ - goto out; - } - - atomic_inc(&br->br_count); - fsnotify_open(file); - -out: - return err; -} - -int vfsub_kern_path(const char *name, unsigned int flags, struct path *path) -{ - int err; - - err = kern_path(name, flags, path); - if (!err && d_is_positive(path->dentry)) - vfsub_update_h_iattr(path, /*did*/NULL); /*ignore*/ - return err; -} - -struct dentry *vfsub_lookup_one_len(const char *name, struct dentry *parent, - int len) -{ - struct path path = { - .mnt = NULL - }; - - /* VFS checks it too, but by WARN_ON_ONCE() */ - IMustLock(d_inode(parent)); - - path.dentry = lookup_one_len(name, parent, len); - if (IS_ERR(path.dentry)) - goto out; - if (d_is_positive(path.dentry)) - vfsub_update_h_iattr(&path, /*did*/NULL); /*ignore*/ - -out: - AuTraceErrPtr(path.dentry); - return path.dentry; -} - -void vfsub_call_lkup_one(void *args) -{ - struct vfsub_lkup_one_args *a = args; - *a->errp = vfsub_lkup_one(a->name, a->parent); -} - -/* ---------------------------------------------------------------------- */ - -struct dentry *vfsub_lock_rename(struct dentry *d1, struct au_hinode *hdir1, - struct dentry *d2, struct au_hinode *hdir2) -{ - struct dentry *d; - - lockdep_off(); - d = lock_rename(d1, d2); - lockdep_on(); - au_hn_suspend(hdir1); - if (hdir1 != hdir2) - au_hn_suspend(hdir2); - - return d; -} - -void vfsub_unlock_rename(struct dentry *d1, struct au_hinode *hdir1, - struct dentry *d2, struct au_hinode *hdir2) -{ - au_hn_resume(hdir1); - if (hdir1 != hdir2) - au_hn_resume(hdir2); - lockdep_off(); - unlock_rename(d1, d2); - lockdep_on(); -} - -/* ---------------------------------------------------------------------- */ - -int vfsub_create(struct inode *dir, struct path *path, int mode, bool want_excl) -{ - int err; - struct dentry *d; - - IMustLock(dir); - - d = path->dentry; - path->dentry = d->d_parent; - err = security_path_mknod(path, d, mode, 0); - path->dentry = d; - if (unlikely(err)) - goto out; - - lockdep_off(); - err = vfs_create(dir, path->dentry, mode, want_excl); - lockdep_on(); - if (!err) { - struct path tmp = *path; - int did; - - vfsub_update_h_iattr(&tmp, &did); - if (did) { - tmp.dentry = path->dentry->d_parent; - vfsub_update_h_iattr(&tmp, /*did*/NULL); - } - /*ignore*/ - } - -out: - return err; -} - -int vfsub_symlink(struct inode *dir, struct path *path, const char *symname) -{ - int err; - struct dentry *d; - - IMustLock(dir); - - d = path->dentry; - path->dentry = d->d_parent; - err = security_path_symlink(path, d, symname); - path->dentry = d; - if (unlikely(err)) - goto out; - - lockdep_off(); - err = vfs_symlink(dir, path->dentry, symname); - lockdep_on(); - if (!err) { - struct path tmp = *path; - int did; - - vfsub_update_h_iattr(&tmp, &did); - if (did) { - tmp.dentry = path->dentry->d_parent; - vfsub_update_h_iattr(&tmp, /*did*/NULL); - } - /*ignore*/ - } - -out: - return err; -} - -int vfsub_mknod(struct inode *dir, struct path *path, int mode, dev_t dev) -{ - int err; - struct dentry *d; - - IMustLock(dir); - - d = path->dentry; - path->dentry = d->d_parent; - err = security_path_mknod(path, d, mode, new_encode_dev(dev)); - path->dentry = d; - if (unlikely(err)) - goto out; - - lockdep_off(); - err = vfs_mknod(dir, path->dentry, mode, dev); - lockdep_on(); - if (!err) { - struct path tmp = *path; - int did; - - vfsub_update_h_iattr(&tmp, &did); - if (did) { - tmp.dentry = path->dentry->d_parent; - vfsub_update_h_iattr(&tmp, /*did*/NULL); - } - /*ignore*/ - } - -out: - return err; -} - -static int au_test_nlink(struct inode *inode) -{ - const unsigned int link_max = UINT_MAX >> 1; /* rough margin */ - - if (!au_test_fs_no_limit_nlink(inode->i_sb) - || inode->i_nlink < link_max) - return 0; - return -EMLINK; -} - -int vfsub_link(struct dentry *src_dentry, struct inode *dir, struct path *path, - struct inode **delegated_inode) -{ - int err; - struct dentry *d; - - IMustLock(dir); - - err = au_test_nlink(d_inode(src_dentry)); - if (unlikely(err)) - return err; - - /* we don't call may_linkat() */ - d = path->dentry; - path->dentry = d->d_parent; - err = security_path_link(src_dentry, path, d); - path->dentry = d; - if (unlikely(err)) - goto out; - - lockdep_off(); - err = vfs_link(src_dentry, dir, path->dentry, delegated_inode); - lockdep_on(); - if (!err) { - struct path tmp = *path; - int did; - - /* fuse has different memory inode for the same inumber */ - vfsub_update_h_iattr(&tmp, &did); - if (did) { - tmp.dentry = path->dentry->d_parent; - vfsub_update_h_iattr(&tmp, /*did*/NULL); - tmp.dentry = src_dentry; - vfsub_update_h_iattr(&tmp, /*did*/NULL); - } - /*ignore*/ - } - -out: - return err; -} - -int vfsub_rename(struct inode *src_dir, struct dentry *src_dentry, - struct inode *dir, struct path *path, - struct inode **delegated_inode) -{ - int err; - struct path tmp = { - .mnt = path->mnt - }; - struct dentry *d; - - IMustLock(dir); - IMustLock(src_dir); - - d = path->dentry; - path->dentry = d->d_parent; - tmp.dentry = src_dentry->d_parent; - err = security_path_rename(&tmp, src_dentry, path, d, /*flags*/0); - path->dentry = d; - if (unlikely(err)) - goto out; - - lockdep_off(); - err = vfs_rename(src_dir, src_dentry, dir, path->dentry, - delegated_inode, /*flags*/0); - lockdep_on(); - if (!err) { - int did; - - tmp.dentry = d->d_parent; - vfsub_update_h_iattr(&tmp, &did); - if (did) { - tmp.dentry = src_dentry; - vfsub_update_h_iattr(&tmp, /*did*/NULL); - tmp.dentry = src_dentry->d_parent; - vfsub_update_h_iattr(&tmp, /*did*/NULL); - } - /*ignore*/ - } - -out: - return err; -} - -int vfsub_mkdir(struct inode *dir, struct path *path, int mode) -{ - int err; - struct dentry *d; - - IMustLock(dir); - - d = path->dentry; - path->dentry = d->d_parent; - err = security_path_mkdir(path, d, mode); - path->dentry = d; - if (unlikely(err)) - goto out; - - lockdep_off(); - err = vfs_mkdir(dir, path->dentry, mode); - lockdep_on(); - if (!err) { - struct path tmp = *path; - int did; - - vfsub_update_h_iattr(&tmp, &did); - if (did) { - tmp.dentry = path->dentry->d_parent; - vfsub_update_h_iattr(&tmp, /*did*/NULL); - } - /*ignore*/ - } - -out: - return err; -} - -int vfsub_rmdir(struct inode *dir, struct path *path) -{ - int err; - struct dentry *d; - - IMustLock(dir); - - d = path->dentry; - path->dentry = d->d_parent; - err = security_path_rmdir(path, d); - path->dentry = d; - if (unlikely(err)) - goto out; - - lockdep_off(); - err = vfs_rmdir(dir, path->dentry); - lockdep_on(); - if (!err) { - struct path tmp = { - .dentry = path->dentry->d_parent, - .mnt = path->mnt - }; - - vfsub_update_h_iattr(&tmp, /*did*/NULL); /*ignore*/ - } - -out: - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* todo: support mmap_sem? */ -ssize_t vfsub_read_u(struct file *file, char __user *ubuf, size_t count, - loff_t *ppos) -{ - ssize_t err; - - lockdep_off(); - err = vfs_read(file, ubuf, count, ppos); - lockdep_on(); - if (err >= 0) - vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/ - return err; -} - -/* todo: kernel_read()? */ -ssize_t vfsub_read_k(struct file *file, void *kbuf, size_t count, - loff_t *ppos) -{ - ssize_t err; - mm_segment_t oldfs; - union { - void *k; - char __user *u; - } buf; - - buf.k = kbuf; - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = vfsub_read_u(file, buf.u, count, ppos); - set_fs(oldfs); - return err; -} - -ssize_t vfsub_write_u(struct file *file, const char __user *ubuf, size_t count, - loff_t *ppos) -{ - ssize_t err; - - lockdep_off(); - err = vfs_write(file, ubuf, count, ppos); - lockdep_on(); - if (err >= 0) - vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/ - return err; -} - -ssize_t vfsub_write_k(struct file *file, void *kbuf, size_t count, loff_t *ppos) -{ - ssize_t err; - mm_segment_t oldfs; - union { - void *k; - const char __user *u; - } buf; - - buf.k = kbuf; - oldfs = get_fs(); - set_fs(KERNEL_DS); - err = vfsub_write_u(file, buf.u, count, ppos); - set_fs(oldfs); - return err; -} - -int vfsub_flush(struct file *file, fl_owner_t id) -{ - int err; - - err = 0; - if (file->f_op->flush) { - if (!au_test_nfs(file->f_path.dentry->d_sb)) - err = file->f_op->flush(file, id); - else { - lockdep_off(); - err = file->f_op->flush(file, id); - lockdep_on(); - } - if (!err) - vfsub_update_h_iattr(&file->f_path, /*did*/NULL); - /*ignore*/ - } - return err; -} - -int vfsub_iterate_dir(struct file *file, struct dir_context *ctx) -{ - int err; - - AuDbg("%pD, ctx{%pf, %llu}\n", file, ctx->actor, ctx->pos); - - lockdep_off(); - err = iterate_dir(file, ctx); - lockdep_on(); - if (err >= 0) - vfsub_update_h_iattr(&file->f_path, /*did*/NULL); /*ignore*/ - return err; -} - -long vfsub_splice_to(struct file *in, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) -{ - long err; - - lockdep_off(); - err = do_splice_to(in, ppos, pipe, len, flags); - lockdep_on(); - file_accessed(in); - if (err >= 0) - vfsub_update_h_iattr(&in->f_path, /*did*/NULL); /*ignore*/ - return err; -} - -long vfsub_splice_from(struct pipe_inode_info *pipe, struct file *out, - loff_t *ppos, size_t len, unsigned int flags) -{ - long err; - - lockdep_off(); - err = do_splice_from(pipe, out, ppos, len, flags); - lockdep_on(); - if (err >= 0) - vfsub_update_h_iattr(&out->f_path, /*did*/NULL); /*ignore*/ - return err; -} - -int vfsub_fsync(struct file *file, struct path *path, int datasync) -{ - int err; - - /* file can be NULL */ - lockdep_off(); - err = vfs_fsync(file, datasync); - lockdep_on(); - if (!err) { - if (!path) { - AuDebugOn(!file); - path = &file->f_path; - } - vfsub_update_h_iattr(path, /*did*/NULL); /*ignore*/ - } - return err; -} - -/* cf. open.c:do_sys_truncate() and do_sys_ftruncate() */ -int vfsub_trunc(struct path *h_path, loff_t length, unsigned int attr, - struct file *h_file) -{ - int err; - struct inode *h_inode; - struct super_block *h_sb; - - if (!h_file) { - err = vfsub_truncate(h_path, length); - goto out; - } - - h_inode = d_inode(h_path->dentry); - h_sb = h_inode->i_sb; - lockdep_off(); - sb_start_write(h_sb); - lockdep_on(); - err = locks_verify_truncate(h_inode, h_file, length); - if (!err) - err = security_path_truncate(h_path); - if (!err) { - lockdep_off(); - err = do_truncate(h_path->dentry, length, attr, h_file); - lockdep_on(); - } - lockdep_off(); - sb_end_write(h_sb); - lockdep_on(); - -out: - return err; -} - -/* ---------------------------------------------------------------------- */ - -struct au_vfsub_mkdir_args { - int *errp; - struct inode *dir; - struct path *path; - int mode; -}; - -static void au_call_vfsub_mkdir(void *args) -{ - struct au_vfsub_mkdir_args *a = args; - *a->errp = vfsub_mkdir(a->dir, a->path, a->mode); -} - -int vfsub_sio_mkdir(struct inode *dir, struct path *path, int mode) -{ - int err, do_sio, wkq_err; - - do_sio = au_test_h_perm_sio(dir, MAY_EXEC | MAY_WRITE); - if (!do_sio) { - lockdep_off(); - err = vfsub_mkdir(dir, path, mode); - lockdep_on(); - } else { - struct au_vfsub_mkdir_args args = { - .errp = &err, - .dir = dir, - .path = path, - .mode = mode - }; - wkq_err = au_wkq_wait(au_call_vfsub_mkdir, &args); - if (unlikely(wkq_err)) - err = wkq_err; - } - - return err; -} - -struct au_vfsub_rmdir_args { - int *errp; - struct inode *dir; - struct path *path; -}; - -static void au_call_vfsub_rmdir(void *args) -{ - struct au_vfsub_rmdir_args *a = args; - *a->errp = vfsub_rmdir(a->dir, a->path); -} - -int vfsub_sio_rmdir(struct inode *dir, struct path *path) -{ - int err, do_sio, wkq_err; - - do_sio = au_test_h_perm_sio(dir, MAY_EXEC | MAY_WRITE); - if (!do_sio) { - lockdep_off(); - err = vfsub_rmdir(dir, path); - lockdep_on(); - } else { - struct au_vfsub_rmdir_args args = { - .errp = &err, - .dir = dir, - .path = path - }; - wkq_err = au_wkq_wait(au_call_vfsub_rmdir, &args); - if (unlikely(wkq_err)) - err = wkq_err; - } - - return err; -} - -/* ---------------------------------------------------------------------- */ - -struct notify_change_args { - int *errp; - struct path *path; - struct iattr *ia; - struct inode **delegated_inode; -}; - -static void call_notify_change(void *args) -{ - struct notify_change_args *a = args; - struct inode *h_inode; - - h_inode = d_inode(a->path->dentry); - IMustLock(h_inode); - - *a->errp = -EPERM; - if (!IS_IMMUTABLE(h_inode) && !IS_APPEND(h_inode)) { - lockdep_off(); - *a->errp = notify_change(a->path->dentry, a->ia, - a->delegated_inode); - lockdep_on(); - if (!*a->errp) - vfsub_update_h_iattr(a->path, /*did*/NULL); /*ignore*/ - } - AuTraceErr(*a->errp); -} - -int vfsub_notify_change(struct path *path, struct iattr *ia, - struct inode **delegated_inode) -{ - int err; - struct notify_change_args args = { - .errp = &err, - .path = path, - .ia = ia, - .delegated_inode = delegated_inode - }; - - call_notify_change(&args); - - return err; -} - -int vfsub_sio_notify_change(struct path *path, struct iattr *ia, - struct inode **delegated_inode) -{ - int err, wkq_err; - struct notify_change_args args = { - .errp = &err, - .path = path, - .ia = ia, - .delegated_inode = delegated_inode - }; - - wkq_err = au_wkq_wait(call_notify_change, &args); - if (unlikely(wkq_err)) - err = wkq_err; - - return err; -} - -/* ---------------------------------------------------------------------- */ - -struct unlink_args { - int *errp; - struct inode *dir; - struct path *path; - struct inode **delegated_inode; -}; - -static void call_unlink(void *args) -{ - struct unlink_args *a = args; - struct dentry *d = a->path->dentry; - struct inode *h_inode; - const int stop_sillyrename = (au_test_nfs(d->d_sb) - && au_dcount(d) == 1); - - IMustLock(a->dir); - - a->path->dentry = d->d_parent; - *a->errp = security_path_unlink(a->path, d); - a->path->dentry = d; - if (unlikely(*a->errp)) - return; - - if (!stop_sillyrename) - dget(d); - h_inode = NULL; - if (d_is_positive(d)) { - h_inode = d_inode(d); - ihold(h_inode); - } - - lockdep_off(); - *a->errp = vfs_unlink(a->dir, d, a->delegated_inode); - lockdep_on(); - if (!*a->errp) { - struct path tmp = { - .dentry = d->d_parent, - .mnt = a->path->mnt - }; - vfsub_update_h_iattr(&tmp, /*did*/NULL); /*ignore*/ - } - - if (!stop_sillyrename) - dput(d); - if (h_inode) - iput(h_inode); - - AuTraceErr(*a->errp); -} - -/* - * @dir: must be locked. - * @dentry: target dentry. - */ -int vfsub_unlink(struct inode *dir, struct path *path, - struct inode **delegated_inode, int force) -{ - int err; - struct unlink_args args = { - .errp = &err, - .dir = dir, - .path = path, - .delegated_inode = delegated_inode - }; - - if (!force) - call_unlink(&args); - else { - int wkq_err; - - wkq_err = au_wkq_wait(call_unlink, &args); - if (unlikely(wkq_err)) - err = wkq_err; - } - - return err; -} diff --git a/fs/aufs/vfsub.h b/fs/aufs/vfsub.h deleted file mode 100644 index f2e1c49af..000000000 --- a/fs/aufs/vfsub.h +++ /dev/null @@ -1,295 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * sub-routines for VFS - */ - -#ifndef __AUFS_VFSUB_H__ -#define __AUFS_VFSUB_H__ - -#ifdef __KERNEL__ - -#include <linux/fs.h> -#include <linux/mount.h> -#include <linux/posix_acl.h> -#include <linux/xattr.h> -#include "debug.h" - -/* copied from linux/fs/internal.h */ -/* todo: BAD approach!! */ -extern void __mnt_drop_write(struct vfsmount *); -extern int open_check_o_direct(struct file *f); - -/* ---------------------------------------------------------------------- */ - -/* lock subclass for lower inode */ -/* default MAX_LOCKDEP_SUBCLASSES(8) is not enough */ -/* reduce? gave up. */ -enum { - AuLsc_I_Begin = I_MUTEX_PARENT2, /* 5 */ - AuLsc_I_PARENT, /* lower inode, parent first */ - AuLsc_I_PARENT2, /* copyup dirs */ - AuLsc_I_PARENT3, /* copyup wh */ - AuLsc_I_CHILD, - AuLsc_I_CHILD2, - AuLsc_I_End -}; - -/* to debug easier, do not make them inlined functions */ -#define MtxMustLock(mtx) AuDebugOn(!mutex_is_locked(mtx)) -#define IMustLock(i) MtxMustLock(&(i)->i_mutex) - -/* ---------------------------------------------------------------------- */ - -static inline void vfsub_drop_nlink(struct inode *inode) -{ - AuDebugOn(!inode->i_nlink); - drop_nlink(inode); -} - -static inline void vfsub_dead_dir(struct inode *inode) -{ - AuDebugOn(!S_ISDIR(inode->i_mode)); - inode->i_flags |= S_DEAD; - clear_nlink(inode); -} - -static inline int vfsub_native_ro(struct inode *inode) -{ - return (inode->i_sb->s_flags & MS_RDONLY) - || IS_RDONLY(inode) - /* || IS_APPEND(inode) */ - || IS_IMMUTABLE(inode); -} - -#ifdef CONFIG_AUFS_BR_FUSE -int vfsub_test_mntns(struct vfsmount *mnt, struct super_block *h_sb); -#else -AuStubInt0(vfsub_test_mntns, struct vfsmount *mnt, struct super_block *h_sb); -#endif - -/* ---------------------------------------------------------------------- */ - -int vfsub_update_h_iattr(struct path *h_path, int *did); -struct file *vfsub_dentry_open(struct path *path, int flags); -struct file *vfsub_filp_open(const char *path, int oflags, int mode); -struct vfsub_aopen_args { - struct file *file; - unsigned int open_flag; - umode_t create_mode; - int *opened; -}; -struct au_branch; -int vfsub_atomic_open(struct inode *dir, struct dentry *dentry, - struct vfsub_aopen_args *args, struct au_branch *br); -int vfsub_kern_path(const char *name, unsigned int flags, struct path *path); - -struct dentry *vfsub_lookup_one_len(const char *name, struct dentry *parent, - int len); - -struct vfsub_lkup_one_args { - struct dentry **errp; - struct qstr *name; - struct dentry *parent; -}; - -static inline struct dentry *vfsub_lkup_one(struct qstr *name, - struct dentry *parent) -{ - return vfsub_lookup_one_len(name->name, parent, name->len); -} - -void vfsub_call_lkup_one(void *args); - -/* ---------------------------------------------------------------------- */ - -static inline int vfsub_mnt_want_write(struct vfsmount *mnt) -{ - int err; - - lockdep_off(); - err = mnt_want_write(mnt); - lockdep_on(); - return err; -} - -static inline void vfsub_mnt_drop_write(struct vfsmount *mnt) -{ - lockdep_off(); - mnt_drop_write(mnt); - lockdep_on(); -} - -#if 0 /* reserved */ -static inline void vfsub_mnt_drop_write_file(struct file *file) -{ - lockdep_off(); - mnt_drop_write_file(file); - lockdep_on(); -} -#endif - -/* ---------------------------------------------------------------------- */ - -struct au_hinode; -struct dentry *vfsub_lock_rename(struct dentry *d1, struct au_hinode *hdir1, - struct dentry *d2, struct au_hinode *hdir2); -void vfsub_unlock_rename(struct dentry *d1, struct au_hinode *hdir1, - struct dentry *d2, struct au_hinode *hdir2); - -int vfsub_create(struct inode *dir, struct path *path, int mode, - bool want_excl); -int vfsub_symlink(struct inode *dir, struct path *path, - const char *symname); -int vfsub_mknod(struct inode *dir, struct path *path, int mode, dev_t dev); -int vfsub_link(struct dentry *src_dentry, struct inode *dir, - struct path *path, struct inode **delegated_inode); -int vfsub_rename(struct inode *src_hdir, struct dentry *src_dentry, - struct inode *hdir, struct path *path, - struct inode **delegated_inode); -int vfsub_mkdir(struct inode *dir, struct path *path, int mode); -int vfsub_rmdir(struct inode *dir, struct path *path); - -/* ---------------------------------------------------------------------- */ - -ssize_t vfsub_read_u(struct file *file, char __user *ubuf, size_t count, - loff_t *ppos); -ssize_t vfsub_read_k(struct file *file, void *kbuf, size_t count, - loff_t *ppos); -ssize_t vfsub_write_u(struct file *file, const char __user *ubuf, size_t count, - loff_t *ppos); -ssize_t vfsub_write_k(struct file *file, void *kbuf, size_t count, - loff_t *ppos); -int vfsub_flush(struct file *file, fl_owner_t id); -int vfsub_iterate_dir(struct file *file, struct dir_context *ctx); - -static inline loff_t vfsub_f_size_read(struct file *file) -{ - return i_size_read(file_inode(file)); -} - -static inline unsigned int vfsub_file_flags(struct file *file) -{ - unsigned int flags; - - spin_lock(&file->f_lock); - flags = file->f_flags; - spin_unlock(&file->f_lock); - - return flags; -} - -#if 0 /* reserved */ -static inline void vfsub_file_accessed(struct file *h_file) -{ - file_accessed(h_file); - vfsub_update_h_iattr(&h_file->f_path, /*did*/NULL); /*ignore*/ -} -#endif - -#if 0 /* reserved */ -static inline void vfsub_touch_atime(struct vfsmount *h_mnt, - struct dentry *h_dentry) -{ - struct path h_path = { - .dentry = h_dentry, - .mnt = h_mnt - }; - touch_atime(&h_path); - vfsub_update_h_iattr(&h_path, /*did*/NULL); /*ignore*/ -} -#endif - -static inline int vfsub_update_time(struct inode *h_inode, struct timespec *ts, - int flags) -{ - return generic_update_time(h_inode, ts, flags); - /* no vfsub_update_h_iattr() since we don't have struct path */ -} - -#ifdef CONFIG_FS_POSIX_ACL -static inline int vfsub_acl_chmod(struct inode *h_inode, umode_t h_mode) -{ - int err; - - err = posix_acl_chmod(h_inode, h_mode); - if (err == -EOPNOTSUPP) - err = 0; - return err; -} -#else -AuStubInt0(vfsub_acl_chmod, struct inode *h_inode, umode_t h_mode); -#endif - -long vfsub_splice_to(struct file *in, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags); -long vfsub_splice_from(struct pipe_inode_info *pipe, struct file *out, - loff_t *ppos, size_t len, unsigned int flags); - -static inline long vfsub_truncate(struct path *path, loff_t length) -{ - long err; - - lockdep_off(); - err = vfs_truncate(path, length); - lockdep_on(); - return err; -} - -int vfsub_trunc(struct path *h_path, loff_t length, unsigned int attr, - struct file *h_file); -int vfsub_fsync(struct file *file, struct path *path, int datasync); - -/* ---------------------------------------------------------------------- */ - -static inline loff_t vfsub_llseek(struct file *file, loff_t offset, int origin) -{ - loff_t err; - - lockdep_off(); - err = vfs_llseek(file, offset, origin); - lockdep_on(); - return err; -} - -/* ---------------------------------------------------------------------- */ - -int vfsub_sio_mkdir(struct inode *dir, struct path *path, int mode); -int vfsub_sio_rmdir(struct inode *dir, struct path *path); -int vfsub_sio_notify_change(struct path *path, struct iattr *ia, - struct inode **delegated_inode); -int vfsub_notify_change(struct path *path, struct iattr *ia, - struct inode **delegated_inode); -int vfsub_unlink(struct inode *dir, struct path *path, - struct inode **delegated_inode, int force); - -/* ---------------------------------------------------------------------- */ - -static inline int vfsub_setxattr(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags) -{ - int err; - - lockdep_off(); - err = vfs_setxattr(dentry, name, value, size, flags); - lockdep_on(); - - return err; -} - -static inline int vfsub_removexattr(struct dentry *dentry, const char *name) -{ - int err; - - lockdep_off(); - err = vfs_removexattr(dentry, name); - lockdep_on(); - - return err; -} - -#endif /* __KERNEL__ */ -#endif /* __AUFS_VFSUB_H__ */ diff --git a/fs/aufs/wbr_policy.c b/fs/aufs/wbr_policy.c deleted file mode 100644 index c822b428d..000000000 --- a/fs/aufs/wbr_policy.c +++ /dev/null @@ -1,752 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * policies for selecting one among multiple writable branches - */ - -#include <linux/statfs.h> -#include "aufs.h" - -/* subset of cpup_attr() */ -static noinline_for_stack -int au_cpdown_attr(struct path *h_path, struct dentry *h_src) -{ - int err, sbits; - struct iattr ia; - struct inode *h_isrc; - - h_isrc = d_inode(h_src); - ia.ia_valid = ATTR_FORCE | ATTR_MODE | ATTR_UID | ATTR_GID; - ia.ia_mode = h_isrc->i_mode; - ia.ia_uid = h_isrc->i_uid; - ia.ia_gid = h_isrc->i_gid; - sbits = !!(ia.ia_mode & (S_ISUID | S_ISGID)); - au_cpup_attr_flags(d_inode(h_path->dentry), h_isrc->i_flags); - /* no delegation since it is just created */ - err = vfsub_sio_notify_change(h_path, &ia, /*delegated*/NULL); - - /* is this nfs only? */ - if (!err && sbits && au_test_nfs(h_path->dentry->d_sb)) { - ia.ia_valid = ATTR_FORCE | ATTR_MODE; - ia.ia_mode = h_isrc->i_mode; - err = vfsub_sio_notify_change(h_path, &ia, /*delegated*/NULL); - } - - return err; -} - -#define AuCpdown_PARENT_OPQ 1 -#define AuCpdown_WHED (1 << 1) -#define AuCpdown_MADE_DIR (1 << 2) -#define AuCpdown_DIROPQ (1 << 3) -#define au_ftest_cpdown(flags, name) ((flags) & AuCpdown_##name) -#define au_fset_cpdown(flags, name) \ - do { (flags) |= AuCpdown_##name; } while (0) -#define au_fclr_cpdown(flags, name) \ - do { (flags) &= ~AuCpdown_##name; } while (0) - -static int au_cpdown_dir_opq(struct dentry *dentry, aufs_bindex_t bdst, - unsigned int *flags) -{ - int err; - struct dentry *opq_dentry; - - opq_dentry = au_diropq_create(dentry, bdst); - err = PTR_ERR(opq_dentry); - if (IS_ERR(opq_dentry)) - goto out; - dput(opq_dentry); - au_fset_cpdown(*flags, DIROPQ); - -out: - return err; -} - -static int au_cpdown_dir_wh(struct dentry *dentry, struct dentry *h_parent, - struct inode *dir, aufs_bindex_t bdst) -{ - int err; - struct path h_path; - struct au_branch *br; - - br = au_sbr(dentry->d_sb, bdst); - h_path.dentry = au_wh_lkup(h_parent, &dentry->d_name, br); - err = PTR_ERR(h_path.dentry); - if (IS_ERR(h_path.dentry)) - goto out; - - err = 0; - if (d_is_positive(h_path.dentry)) { - h_path.mnt = au_br_mnt(br); - err = au_wh_unlink_dentry(au_h_iptr(dir, bdst), &h_path, - dentry); - } - dput(h_path.dentry); - -out: - return err; -} - -static int au_cpdown_dir(struct dentry *dentry, aufs_bindex_t bdst, - struct au_pin *pin, - struct dentry *h_parent, void *arg) -{ - int err, rerr; - aufs_bindex_t bopq, bstart; - struct path h_path; - struct dentry *parent; - struct inode *h_dir, *h_inode, *inode, *dir; - unsigned int *flags = arg; - - bstart = au_dbstart(dentry); - /* dentry is di-locked */ - parent = dget_parent(dentry); - dir = d_inode(parent); - h_dir = d_inode(h_parent); - AuDebugOn(h_dir != au_h_iptr(dir, bdst)); - IMustLock(h_dir); - - err = au_lkup_neg(dentry, bdst, /*wh*/0); - if (unlikely(err < 0)) - goto out; - h_path.dentry = au_h_dptr(dentry, bdst); - h_path.mnt = au_sbr_mnt(dentry->d_sb, bdst); - err = vfsub_sio_mkdir(au_h_iptr(dir, bdst), &h_path, - S_IRWXU | S_IRUGO | S_IXUGO); - if (unlikely(err)) - goto out_put; - au_fset_cpdown(*flags, MADE_DIR); - - bopq = au_dbdiropq(dentry); - au_fclr_cpdown(*flags, WHED); - au_fclr_cpdown(*flags, DIROPQ); - if (au_dbwh(dentry) == bdst) - au_fset_cpdown(*flags, WHED); - if (!au_ftest_cpdown(*flags, PARENT_OPQ) && bopq <= bdst) - au_fset_cpdown(*flags, PARENT_OPQ); - h_inode = d_inode(h_path.dentry); - mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); - if (au_ftest_cpdown(*flags, WHED)) { - err = au_cpdown_dir_opq(dentry, bdst, flags); - if (unlikely(err)) { - mutex_unlock(&h_inode->i_mutex); - goto out_dir; - } - } - - err = au_cpdown_attr(&h_path, au_h_dptr(dentry, bstart)); - mutex_unlock(&h_inode->i_mutex); - if (unlikely(err)) - goto out_opq; - - if (au_ftest_cpdown(*flags, WHED)) { - err = au_cpdown_dir_wh(dentry, h_parent, dir, bdst); - if (unlikely(err)) - goto out_opq; - } - - inode = d_inode(dentry); - if (au_ibend(inode) < bdst) - au_set_ibend(inode, bdst); - au_set_h_iptr(inode, bdst, au_igrab(h_inode), - au_hi_flags(inode, /*isdir*/1)); - au_fhsm_wrote(dentry->d_sb, bdst, /*force*/0); - goto out; /* success */ - - /* revert */ -out_opq: - if (au_ftest_cpdown(*flags, DIROPQ)) { - mutex_lock_nested(&h_inode->i_mutex, AuLsc_I_CHILD); - rerr = au_diropq_remove(dentry, bdst); - mutex_unlock(&h_inode->i_mutex); - if (unlikely(rerr)) { - AuIOErr("failed removing diropq for %pd b%d (%d)\n", - dentry, bdst, rerr); - err = -EIO; - goto out; - } - } -out_dir: - if (au_ftest_cpdown(*flags, MADE_DIR)) { - rerr = vfsub_sio_rmdir(au_h_iptr(dir, bdst), &h_path); - if (unlikely(rerr)) { - AuIOErr("failed removing %pd b%d (%d)\n", - dentry, bdst, rerr); - err = -EIO; - } - } -out_put: - au_set_h_dptr(dentry, bdst, NULL); - if (au_dbend(dentry) == bdst) - au_update_dbend(dentry); -out: - dput(parent); - return err; -} - -int au_cpdown_dirs(struct dentry *dentry, aufs_bindex_t bdst) -{ - int err; - unsigned int flags; - - flags = 0; - err = au_cp_dirs(dentry, bdst, au_cpdown_dir, &flags); - - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* policies for create */ - -int au_wbr_nonopq(struct dentry *dentry, aufs_bindex_t bindex) -{ - int err, i, j, ndentry; - aufs_bindex_t bopq; - struct au_dcsub_pages dpages; - struct au_dpage *dpage; - struct dentry **dentries, *parent, *d; - - err = au_dpages_init(&dpages, GFP_NOFS); - if (unlikely(err)) - goto out; - parent = dget_parent(dentry); - err = au_dcsub_pages_rev_aufs(&dpages, parent, /*do_include*/0); - if (unlikely(err)) - goto out_free; - - err = bindex; - for (i = 0; i < dpages.ndpage; i++) { - dpage = dpages.dpages + i; - dentries = dpage->dentries; - ndentry = dpage->ndentry; - for (j = 0; j < ndentry; j++) { - d = dentries[j]; - di_read_lock_parent2(d, !AuLock_IR); - bopq = au_dbdiropq(d); - di_read_unlock(d, !AuLock_IR); - if (bopq >= 0 && bopq < err) - err = bopq; - } - } - -out_free: - dput(parent); - au_dpages_free(&dpages); -out: - return err; -} - -static int au_wbr_bu(struct super_block *sb, aufs_bindex_t bindex) -{ - for (; bindex >= 0; bindex--) - if (!au_br_rdonly(au_sbr(sb, bindex))) - return bindex; - return -EROFS; -} - -/* top down parent */ -static int au_wbr_create_tdp(struct dentry *dentry, - unsigned int flags __maybe_unused) -{ - int err; - aufs_bindex_t bstart, bindex; - struct super_block *sb; - struct dentry *parent, *h_parent; - - sb = dentry->d_sb; - bstart = au_dbstart(dentry); - err = bstart; - if (!au_br_rdonly(au_sbr(sb, bstart))) - goto out; - - err = -EROFS; - parent = dget_parent(dentry); - for (bindex = au_dbstart(parent); bindex < bstart; bindex++) { - h_parent = au_h_dptr(parent, bindex); - if (!h_parent || d_is_negative(h_parent)) - continue; - - if (!au_br_rdonly(au_sbr(sb, bindex))) { - err = bindex; - break; - } - } - dput(parent); - - /* bottom up here */ - if (unlikely(err < 0)) { - err = au_wbr_bu(sb, bstart - 1); - if (err >= 0) - err = au_wbr_nonopq(dentry, err); - } - -out: - AuDbg("b%d\n", err); - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* an exception for the policy other than tdp */ -static int au_wbr_create_exp(struct dentry *dentry) -{ - int err; - aufs_bindex_t bwh, bdiropq; - struct dentry *parent; - - err = -1; - bwh = au_dbwh(dentry); - parent = dget_parent(dentry); - bdiropq = au_dbdiropq(parent); - if (bwh >= 0) { - if (bdiropq >= 0) - err = min(bdiropq, bwh); - else - err = bwh; - AuDbg("%d\n", err); - } else if (bdiropq >= 0) { - err = bdiropq; - AuDbg("%d\n", err); - } - dput(parent); - - if (err >= 0) - err = au_wbr_nonopq(dentry, err); - - if (err >= 0 && au_br_rdonly(au_sbr(dentry->d_sb, err))) - err = -1; - - AuDbg("%d\n", err); - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* round robin */ -static int au_wbr_create_init_rr(struct super_block *sb) -{ - int err; - - err = au_wbr_bu(sb, au_sbend(sb)); - atomic_set(&au_sbi(sb)->si_wbr_rr_next, -err); /* less important */ - /* smp_mb(); */ - - AuDbg("b%d\n", err); - return err; -} - -static int au_wbr_create_rr(struct dentry *dentry, unsigned int flags) -{ - int err, nbr; - unsigned int u; - aufs_bindex_t bindex, bend; - struct super_block *sb; - atomic_t *next; - - err = au_wbr_create_exp(dentry); - if (err >= 0) - goto out; - - sb = dentry->d_sb; - next = &au_sbi(sb)->si_wbr_rr_next; - bend = au_sbend(sb); - nbr = bend + 1; - for (bindex = 0; bindex <= bend; bindex++) { - if (!au_ftest_wbr(flags, DIR)) { - err = atomic_dec_return(next) + 1; - /* modulo for 0 is meaningless */ - if (unlikely(!err)) - err = atomic_dec_return(next) + 1; - } else - err = atomic_read(next); - AuDbg("%d\n", err); - u = err; - err = u % nbr; - AuDbg("%d\n", err); - if (!au_br_rdonly(au_sbr(sb, err))) - break; - err = -EROFS; - } - - if (err >= 0) - err = au_wbr_nonopq(dentry, err); - -out: - AuDbg("%d\n", err); - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* most free space */ -static void au_mfs(struct dentry *dentry, struct dentry *parent) -{ - struct super_block *sb; - struct au_branch *br; - struct au_wbr_mfs *mfs; - struct dentry *h_parent; - aufs_bindex_t bindex, bend; - int err; - unsigned long long b, bavail; - struct path h_path; - /* reduce the stack usage */ - struct kstatfs *st; - - st = kmalloc(sizeof(*st), GFP_NOFS); - if (unlikely(!st)) { - AuWarn1("failed updating mfs(%d), ignored\n", -ENOMEM); - return; - } - - bavail = 0; - sb = dentry->d_sb; - mfs = &au_sbi(sb)->si_wbr_mfs; - MtxMustLock(&mfs->mfs_lock); - mfs->mfs_bindex = -EROFS; - mfs->mfsrr_bytes = 0; - if (!parent) { - bindex = 0; - bend = au_sbend(sb); - } else { - bindex = au_dbstart(parent); - bend = au_dbtaildir(parent); - } - - for (; bindex <= bend; bindex++) { - if (parent) { - h_parent = au_h_dptr(parent, bindex); - if (!h_parent || d_is_negative(h_parent)) - continue; - } - br = au_sbr(sb, bindex); - if (au_br_rdonly(br)) - continue; - - /* sb->s_root for NFS is unreliable */ - h_path.mnt = au_br_mnt(br); - h_path.dentry = h_path.mnt->mnt_root; - err = vfs_statfs(&h_path, st); - if (unlikely(err)) { - AuWarn1("failed statfs, b%d, %d\n", bindex, err); - continue; - } - - /* when the available size is equal, select the lower one */ - BUILD_BUG_ON(sizeof(b) < sizeof(st->f_bavail) - || sizeof(b) < sizeof(st->f_bsize)); - b = st->f_bavail * st->f_bsize; - br->br_wbr->wbr_bytes = b; - if (b >= bavail) { - bavail = b; - mfs->mfs_bindex = bindex; - mfs->mfs_jiffy = jiffies; - } - } - - mfs->mfsrr_bytes = bavail; - AuDbg("b%d\n", mfs->mfs_bindex); - kfree(st); -} - -static int au_wbr_create_mfs(struct dentry *dentry, unsigned int flags) -{ - int err; - struct dentry *parent; - struct super_block *sb; - struct au_wbr_mfs *mfs; - - err = au_wbr_create_exp(dentry); - if (err >= 0) - goto out; - - sb = dentry->d_sb; - parent = NULL; - if (au_ftest_wbr(flags, PARENT)) - parent = dget_parent(dentry); - mfs = &au_sbi(sb)->si_wbr_mfs; - mutex_lock(&mfs->mfs_lock); - if (time_after(jiffies, mfs->mfs_jiffy + mfs->mfs_expire) - || mfs->mfs_bindex < 0 - || au_br_rdonly(au_sbr(sb, mfs->mfs_bindex))) - au_mfs(dentry, parent); - mutex_unlock(&mfs->mfs_lock); - err = mfs->mfs_bindex; - dput(parent); - - if (err >= 0) - err = au_wbr_nonopq(dentry, err); - -out: - AuDbg("b%d\n", err); - return err; -} - -static int au_wbr_create_init_mfs(struct super_block *sb) -{ - struct au_wbr_mfs *mfs; - - mfs = &au_sbi(sb)->si_wbr_mfs; - mutex_init(&mfs->mfs_lock); - mfs->mfs_jiffy = 0; - mfs->mfs_bindex = -EROFS; - - return 0; -} - -static int au_wbr_create_fin_mfs(struct super_block *sb __maybe_unused) -{ - mutex_destroy(&au_sbi(sb)->si_wbr_mfs.mfs_lock); - return 0; -} - -/* ---------------------------------------------------------------------- */ - -/* most free space and then round robin */ -static int au_wbr_create_mfsrr(struct dentry *dentry, unsigned int flags) -{ - int err; - struct au_wbr_mfs *mfs; - - err = au_wbr_create_mfs(dentry, flags); - if (err >= 0) { - mfs = &au_sbi(dentry->d_sb)->si_wbr_mfs; - mutex_lock(&mfs->mfs_lock); - if (mfs->mfsrr_bytes < mfs->mfsrr_watermark) - err = au_wbr_create_rr(dentry, flags); - mutex_unlock(&mfs->mfs_lock); - } - - AuDbg("b%d\n", err); - return err; -} - -static int au_wbr_create_init_mfsrr(struct super_block *sb) -{ - int err; - - au_wbr_create_init_mfs(sb); /* ignore */ - err = au_wbr_create_init_rr(sb); - - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* top down parent and most free space */ -static int au_wbr_create_pmfs(struct dentry *dentry, unsigned int flags) -{ - int err, e2; - unsigned long long b; - aufs_bindex_t bindex, bstart, bend; - struct super_block *sb; - struct dentry *parent, *h_parent; - struct au_branch *br; - - err = au_wbr_create_tdp(dentry, flags); - if (unlikely(err < 0)) - goto out; - parent = dget_parent(dentry); - bstart = au_dbstart(parent); - bend = au_dbtaildir(parent); - if (bstart == bend) - goto out_parent; /* success */ - - e2 = au_wbr_create_mfs(dentry, flags); - if (e2 < 0) - goto out_parent; /* success */ - - /* when the available size is equal, select upper one */ - sb = dentry->d_sb; - br = au_sbr(sb, err); - b = br->br_wbr->wbr_bytes; - AuDbg("b%d, %llu\n", err, b); - - for (bindex = bstart; bindex <= bend; bindex++) { - h_parent = au_h_dptr(parent, bindex); - if (!h_parent || d_is_negative(h_parent)) - continue; - - br = au_sbr(sb, bindex); - if (!au_br_rdonly(br) && br->br_wbr->wbr_bytes > b) { - b = br->br_wbr->wbr_bytes; - err = bindex; - AuDbg("b%d, %llu\n", err, b); - } - } - - if (err >= 0) - err = au_wbr_nonopq(dentry, err); - -out_parent: - dput(parent); -out: - AuDbg("b%d\n", err); - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* - * - top down parent - * - most free space with parent - * - most free space round-robin regardless parent - */ -static int au_wbr_create_pmfsrr(struct dentry *dentry, unsigned int flags) -{ - int err; - unsigned long long watermark; - struct super_block *sb; - struct au_branch *br; - struct au_wbr_mfs *mfs; - - err = au_wbr_create_pmfs(dentry, flags | AuWbr_PARENT); - if (unlikely(err < 0)) - goto out; - - sb = dentry->d_sb; - br = au_sbr(sb, err); - mfs = &au_sbi(sb)->si_wbr_mfs; - mutex_lock(&mfs->mfs_lock); - watermark = mfs->mfsrr_watermark; - mutex_unlock(&mfs->mfs_lock); - if (br->br_wbr->wbr_bytes < watermark) - /* regardless the parent dir */ - err = au_wbr_create_mfsrr(dentry, flags); - -out: - AuDbg("b%d\n", err); - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* policies for copyup */ - -/* top down parent */ -static int au_wbr_copyup_tdp(struct dentry *dentry) -{ - return au_wbr_create_tdp(dentry, /*flags, anything is ok*/0); -} - -/* bottom up parent */ -static int au_wbr_copyup_bup(struct dentry *dentry) -{ - int err; - aufs_bindex_t bindex, bstart; - struct dentry *parent, *h_parent; - struct super_block *sb; - - err = -EROFS; - sb = dentry->d_sb; - parent = dget_parent(dentry); - bstart = au_dbstart(parent); - for (bindex = au_dbstart(dentry); bindex >= bstart; bindex--) { - h_parent = au_h_dptr(parent, bindex); - if (!h_parent || d_is_negative(h_parent)) - continue; - - if (!au_br_rdonly(au_sbr(sb, bindex))) { - err = bindex; - break; - } - } - dput(parent); - - /* bottom up here */ - if (unlikely(err < 0)) - err = au_wbr_bu(sb, bstart - 1); - - AuDbg("b%d\n", err); - return err; -} - -/* bottom up */ -int au_wbr_do_copyup_bu(struct dentry *dentry, aufs_bindex_t bstart) -{ - int err; - - err = au_wbr_bu(dentry->d_sb, bstart); - AuDbg("b%d\n", err); - if (err > bstart) - err = au_wbr_nonopq(dentry, err); - - AuDbg("b%d\n", err); - return err; -} - -static int au_wbr_copyup_bu(struct dentry *dentry) -{ - int err; - aufs_bindex_t bstart; - - bstart = au_dbstart(dentry); - err = au_wbr_do_copyup_bu(dentry, bstart); - return err; -} - -/* ---------------------------------------------------------------------- */ - -struct au_wbr_copyup_operations au_wbr_copyup_ops[] = { - [AuWbrCopyup_TDP] = { - .copyup = au_wbr_copyup_tdp - }, - [AuWbrCopyup_BUP] = { - .copyup = au_wbr_copyup_bup - }, - [AuWbrCopyup_BU] = { - .copyup = au_wbr_copyup_bu - } -}; - -struct au_wbr_create_operations au_wbr_create_ops[] = { - [AuWbrCreate_TDP] = { - .create = au_wbr_create_tdp - }, - [AuWbrCreate_RR] = { - .create = au_wbr_create_rr, - .init = au_wbr_create_init_rr - }, - [AuWbrCreate_MFS] = { - .create = au_wbr_create_mfs, - .init = au_wbr_create_init_mfs, - .fin = au_wbr_create_fin_mfs - }, - [AuWbrCreate_MFSV] = { - .create = au_wbr_create_mfs, - .init = au_wbr_create_init_mfs, - .fin = au_wbr_create_fin_mfs - }, - [AuWbrCreate_MFSRR] = { - .create = au_wbr_create_mfsrr, - .init = au_wbr_create_init_mfsrr, - .fin = au_wbr_create_fin_mfs - }, - [AuWbrCreate_MFSRRV] = { - .create = au_wbr_create_mfsrr, - .init = au_wbr_create_init_mfsrr, - .fin = au_wbr_create_fin_mfs - }, - [AuWbrCreate_PMFS] = { - .create = au_wbr_create_pmfs, - .init = au_wbr_create_init_mfs, - .fin = au_wbr_create_fin_mfs - }, - [AuWbrCreate_PMFSV] = { - .create = au_wbr_create_pmfs, - .init = au_wbr_create_init_mfs, - .fin = au_wbr_create_fin_mfs - }, - [AuWbrCreate_PMFSRR] = { - .create = au_wbr_create_pmfsrr, - .init = au_wbr_create_init_mfsrr, - .fin = au_wbr_create_fin_mfs - }, - [AuWbrCreate_PMFSRRV] = { - .create = au_wbr_create_pmfsrr, - .init = au_wbr_create_init_mfsrr, - .fin = au_wbr_create_fin_mfs - } -}; diff --git a/fs/aufs/whout.c b/fs/aufs/whout.c deleted file mode 100644 index 04eb9af2b..000000000 --- a/fs/aufs/whout.c +++ /dev/null @@ -1,1047 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * whiteout for logical deletion and opaque directory - */ - -#include "aufs.h" - -#define WH_MASK S_IRUGO - -/* - * If a directory contains this file, then it is opaque. We start with the - * .wh. flag so that it is blocked by lookup. - */ -static struct qstr diropq_name = QSTR_INIT(AUFS_WH_DIROPQ, - sizeof(AUFS_WH_DIROPQ) - 1); - -/* - * generate whiteout name, which is NOT terminated by NULL. - * @name: original d_name.name - * @len: original d_name.len - * @wh: whiteout qstr - * returns zero when succeeds, otherwise error. - * succeeded value as wh->name should be freed by kfree(). - */ -int au_wh_name_alloc(struct qstr *wh, const struct qstr *name) -{ - char *p; - - if (unlikely(name->len > PATH_MAX - AUFS_WH_PFX_LEN)) - return -ENAMETOOLONG; - - wh->len = name->len + AUFS_WH_PFX_LEN; - p = kmalloc(wh->len, GFP_NOFS); - wh->name = p; - if (p) { - memcpy(p, AUFS_WH_PFX, AUFS_WH_PFX_LEN); - memcpy(p + AUFS_WH_PFX_LEN, name->name, name->len); - /* smp_mb(); */ - return 0; - } - return -ENOMEM; -} - -/* ---------------------------------------------------------------------- */ - -/* - * test if the @wh_name exists under @h_parent. - * @try_sio specifies the necessary of super-io. - */ -int au_wh_test(struct dentry *h_parent, struct qstr *wh_name, int try_sio) -{ - int err; - struct dentry *wh_dentry; - - if (!try_sio) - wh_dentry = vfsub_lkup_one(wh_name, h_parent); - else - wh_dentry = au_sio_lkup_one(wh_name, h_parent); - err = PTR_ERR(wh_dentry); - if (IS_ERR(wh_dentry)) { - if (err == -ENAMETOOLONG) - err = 0; - goto out; - } - - err = 0; - if (d_is_negative(wh_dentry)) - goto out_wh; /* success */ - - err = 1; - if (d_is_reg(wh_dentry)) - goto out_wh; /* success */ - - err = -EIO; - AuIOErr("%pd Invalid whiteout entry type 0%o.\n", - wh_dentry, d_inode(wh_dentry)->i_mode); - -out_wh: - dput(wh_dentry); -out: - return err; -} - -/* - * test if the @h_dentry sets opaque or not. - */ -int au_diropq_test(struct dentry *h_dentry) -{ - int err; - struct inode *h_dir; - - h_dir = d_inode(h_dentry); - err = au_wh_test(h_dentry, &diropq_name, - au_test_h_perm_sio(h_dir, MAY_EXEC)); - return err; -} - -/* - * returns a negative dentry whose name is unique and temporary. - */ -struct dentry *au_whtmp_lkup(struct dentry *h_parent, struct au_branch *br, - struct qstr *prefix) -{ - struct dentry *dentry; - int i; - char defname[NAME_MAX - AUFS_MAX_NAMELEN + DNAME_INLINE_LEN + 1], - *name, *p; - /* strict atomic_t is unnecessary here */ - static unsigned short cnt; - struct qstr qs; - - BUILD_BUG_ON(sizeof(cnt) * 2 > AUFS_WH_TMP_LEN); - - name = defname; - qs.len = sizeof(defname) - DNAME_INLINE_LEN + prefix->len - 1; - if (unlikely(prefix->len > DNAME_INLINE_LEN)) { - dentry = ERR_PTR(-ENAMETOOLONG); - if (unlikely(qs.len > NAME_MAX)) - goto out; - dentry = ERR_PTR(-ENOMEM); - name = kmalloc(qs.len + 1, GFP_NOFS); - if (unlikely(!name)) - goto out; - } - - /* doubly whiteout-ed */ - memcpy(name, AUFS_WH_PFX AUFS_WH_PFX, AUFS_WH_PFX_LEN * 2); - p = name + AUFS_WH_PFX_LEN * 2; - memcpy(p, prefix->name, prefix->len); - p += prefix->len; - *p++ = '.'; - AuDebugOn(name + qs.len + 1 - p <= AUFS_WH_TMP_LEN); - - qs.name = name; - for (i = 0; i < 3; i++) { - sprintf(p, "%.*x", AUFS_WH_TMP_LEN, cnt++); - dentry = au_sio_lkup_one(&qs, h_parent); - if (IS_ERR(dentry) || d_is_negative(dentry)) - goto out_name; - dput(dentry); - } - /* pr_warn("could not get random name\n"); */ - dentry = ERR_PTR(-EEXIST); - AuDbg("%.*s\n", AuLNPair(&qs)); - BUG(); - -out_name: - if (name != defname) - kfree(name); -out: - AuTraceErrPtr(dentry); - return dentry; -} - -/* - * rename the @h_dentry on @br to the whiteouted temporary name. - */ -int au_whtmp_ren(struct dentry *h_dentry, struct au_branch *br) -{ - int err; - struct path h_path = { - .mnt = au_br_mnt(br) - }; - struct inode *h_dir, *delegated; - struct dentry *h_parent; - - h_parent = h_dentry->d_parent; /* dir inode is locked */ - h_dir = d_inode(h_parent); - IMustLock(h_dir); - - h_path.dentry = au_whtmp_lkup(h_parent, br, &h_dentry->d_name); - err = PTR_ERR(h_path.dentry); - if (IS_ERR(h_path.dentry)) - goto out; - - /* under the same dir, no need to lock_rename() */ - delegated = NULL; - err = vfsub_rename(h_dir, h_dentry, h_dir, &h_path, &delegated); - AuTraceErr(err); - if (unlikely(err == -EWOULDBLOCK)) { - pr_warn("cannot retry for NFSv4 delegation" - " for an internal rename\n"); - iput(delegated); - } - dput(h_path.dentry); - -out: - AuTraceErr(err); - return err; -} - -/* ---------------------------------------------------------------------- */ -/* - * functions for removing a whiteout - */ - -static int do_unlink_wh(struct inode *h_dir, struct path *h_path) -{ - int err, force; - struct inode *delegated; - - /* - * forces superio when the dir has a sticky bit. - * this may be a violation of unix fs semantics. - */ - force = (h_dir->i_mode & S_ISVTX) - && !uid_eq(current_fsuid(), d_inode(h_path->dentry)->i_uid); - delegated = NULL; - err = vfsub_unlink(h_dir, h_path, &delegated, force); - if (unlikely(err == -EWOULDBLOCK)) { - pr_warn("cannot retry for NFSv4 delegation" - " for an internal unlink\n"); - iput(delegated); - } - return err; -} - -int au_wh_unlink_dentry(struct inode *h_dir, struct path *h_path, - struct dentry *dentry) -{ - int err; - - err = do_unlink_wh(h_dir, h_path); - if (!err && dentry) - au_set_dbwh(dentry, -1); - - return err; -} - -static int unlink_wh_name(struct dentry *h_parent, struct qstr *wh, - struct au_branch *br) -{ - int err; - struct path h_path = { - .mnt = au_br_mnt(br) - }; - - err = 0; - h_path.dentry = vfsub_lkup_one(wh, h_parent); - if (IS_ERR(h_path.dentry)) - err = PTR_ERR(h_path.dentry); - else { - if (d_is_reg(h_path.dentry)) - err = do_unlink_wh(d_inode(h_parent), &h_path); - dput(h_path.dentry); - } - - return err; -} - -/* ---------------------------------------------------------------------- */ -/* - * initialize/clean whiteout for a branch - */ - -static void au_wh_clean(struct inode *h_dir, struct path *whpath, - const int isdir) -{ - int err; - struct inode *delegated; - - if (d_is_negative(whpath->dentry)) - return; - - if (isdir) - err = vfsub_rmdir(h_dir, whpath); - else { - delegated = NULL; - err = vfsub_unlink(h_dir, whpath, &delegated, /*force*/0); - if (unlikely(err == -EWOULDBLOCK)) { - pr_warn("cannot retry for NFSv4 delegation" - " for an internal unlink\n"); - iput(delegated); - } - } - if (unlikely(err)) - pr_warn("failed removing %pd (%d), ignored.\n", - whpath->dentry, err); -} - -static int test_linkable(struct dentry *h_root) -{ - struct inode *h_dir = d_inode(h_root); - - if (h_dir->i_op->link) - return 0; - - pr_err("%pd (%s) doesn't support link(2), use noplink and rw+nolwh\n", - h_root, au_sbtype(h_root->d_sb)); - return -ENOSYS; -} - -/* todo: should this mkdir be done in /sbin/mount.aufs helper? */ -static int au_whdir(struct inode *h_dir, struct path *path) -{ - int err; - - err = -EEXIST; - if (d_is_negative(path->dentry)) { - int mode = S_IRWXU; - - if (au_test_nfs(path->dentry->d_sb)) - mode |= S_IXUGO; - err = vfsub_mkdir(h_dir, path, mode); - } else if (d_is_dir(path->dentry)) - err = 0; - else - pr_err("unknown %pd exists\n", path->dentry); - - return err; -} - -struct au_wh_base { - const struct qstr *name; - struct dentry *dentry; -}; - -static void au_wh_init_ro(struct inode *h_dir, struct au_wh_base base[], - struct path *h_path) -{ - h_path->dentry = base[AuBrWh_BASE].dentry; - au_wh_clean(h_dir, h_path, /*isdir*/0); - h_path->dentry = base[AuBrWh_PLINK].dentry; - au_wh_clean(h_dir, h_path, /*isdir*/1); - h_path->dentry = base[AuBrWh_ORPH].dentry; - au_wh_clean(h_dir, h_path, /*isdir*/1); -} - -/* - * returns tri-state, - * minus: error, caller should print the message - * zero: succuess - * plus: error, caller should NOT print the message - */ -static int au_wh_init_rw_nolink(struct dentry *h_root, struct au_wbr *wbr, - int do_plink, struct au_wh_base base[], - struct path *h_path) -{ - int err; - struct inode *h_dir; - - h_dir = d_inode(h_root); - h_path->dentry = base[AuBrWh_BASE].dentry; - au_wh_clean(h_dir, h_path, /*isdir*/0); - h_path->dentry = base[AuBrWh_PLINK].dentry; - if (do_plink) { - err = test_linkable(h_root); - if (unlikely(err)) { - err = 1; - goto out; - } - - err = au_whdir(h_dir, h_path); - if (unlikely(err)) - goto out; - wbr->wbr_plink = dget(base[AuBrWh_PLINK].dentry); - } else - au_wh_clean(h_dir, h_path, /*isdir*/1); - h_path->dentry = base[AuBrWh_ORPH].dentry; - err = au_whdir(h_dir, h_path); - if (unlikely(err)) - goto out; - wbr->wbr_orph = dget(base[AuBrWh_ORPH].dentry); - -out: - return err; -} - -/* - * for the moment, aufs supports the branch filesystem which does not support - * link(2). testing on FAT which does not support i_op->setattr() fully either, - * copyup failed. finally, such filesystem will not be used as the writable - * branch. - * - * returns tri-state, see above. - */ -static int au_wh_init_rw(struct dentry *h_root, struct au_wbr *wbr, - int do_plink, struct au_wh_base base[], - struct path *h_path) -{ - int err; - struct inode *h_dir; - - WbrWhMustWriteLock(wbr); - - err = test_linkable(h_root); - if (unlikely(err)) { - err = 1; - goto out; - } - - /* - * todo: should this create be done in /sbin/mount.aufs helper? - */ - err = -EEXIST; - h_dir = d_inode(h_root); - if (d_is_negative(base[AuBrWh_BASE].dentry)) { - h_path->dentry = base[AuBrWh_BASE].dentry; - err = vfsub_create(h_dir, h_path, WH_MASK, /*want_excl*/true); - } else if (d_is_reg(base[AuBrWh_BASE].dentry)) - err = 0; - else - pr_err("unknown %pd2 exists\n", base[AuBrWh_BASE].dentry); - if (unlikely(err)) - goto out; - - h_path->dentry = base[AuBrWh_PLINK].dentry; - if (do_plink) { - err = au_whdir(h_dir, h_path); - if (unlikely(err)) - goto out; - wbr->wbr_plink = dget(base[AuBrWh_PLINK].dentry); - } else - au_wh_clean(h_dir, h_path, /*isdir*/1); - wbr->wbr_whbase = dget(base[AuBrWh_BASE].dentry); - - h_path->dentry = base[AuBrWh_ORPH].dentry; - err = au_whdir(h_dir, h_path); - if (unlikely(err)) - goto out; - wbr->wbr_orph = dget(base[AuBrWh_ORPH].dentry); - -out: - return err; -} - -/* - * initialize the whiteout base file/dir for @br. - */ -int au_wh_init(struct au_branch *br, struct super_block *sb) -{ - int err, i; - const unsigned char do_plink - = !!au_opt_test(au_mntflags(sb), PLINK); - struct inode *h_dir; - struct path path = br->br_path; - struct dentry *h_root = path.dentry; - struct au_wbr *wbr = br->br_wbr; - static const struct qstr base_name[] = { - [AuBrWh_BASE] = QSTR_INIT(AUFS_BASE_NAME, - sizeof(AUFS_BASE_NAME) - 1), - [AuBrWh_PLINK] = QSTR_INIT(AUFS_PLINKDIR_NAME, - sizeof(AUFS_PLINKDIR_NAME) - 1), - [AuBrWh_ORPH] = QSTR_INIT(AUFS_ORPHDIR_NAME, - sizeof(AUFS_ORPHDIR_NAME) - 1) - }; - struct au_wh_base base[] = { - [AuBrWh_BASE] = { - .name = base_name + AuBrWh_BASE, - .dentry = NULL - }, - [AuBrWh_PLINK] = { - .name = base_name + AuBrWh_PLINK, - .dentry = NULL - }, - [AuBrWh_ORPH] = { - .name = base_name + AuBrWh_ORPH, - .dentry = NULL - } - }; - - if (wbr) - WbrWhMustWriteLock(wbr); - - for (i = 0; i < AuBrWh_Last; i++) { - /* doubly whiteouted */ - struct dentry *d; - - d = au_wh_lkup(h_root, (void *)base[i].name, br); - err = PTR_ERR(d); - if (IS_ERR(d)) - goto out; - - base[i].dentry = d; - AuDebugOn(wbr - && wbr->wbr_wh[i] - && wbr->wbr_wh[i] != base[i].dentry); - } - - if (wbr) - for (i = 0; i < AuBrWh_Last; i++) { - dput(wbr->wbr_wh[i]); - wbr->wbr_wh[i] = NULL; - } - - err = 0; - if (!au_br_writable(br->br_perm)) { - h_dir = d_inode(h_root); - au_wh_init_ro(h_dir, base, &path); - } else if (!au_br_wh_linkable(br->br_perm)) { - err = au_wh_init_rw_nolink(h_root, wbr, do_plink, base, &path); - if (err > 0) - goto out; - else if (err) - goto out_err; - } else { - err = au_wh_init_rw(h_root, wbr, do_plink, base, &path); - if (err > 0) - goto out; - else if (err) - goto out_err; - } - goto out; /* success */ - -out_err: - pr_err("an error(%d) on the writable branch %pd(%s)\n", - err, h_root, au_sbtype(h_root->d_sb)); -out: - for (i = 0; i < AuBrWh_Last; i++) - dput(base[i].dentry); - return err; -} - -/* ---------------------------------------------------------------------- */ -/* - * whiteouts are all hard-linked usually. - * when its link count reaches a ceiling, we create a new whiteout base - * asynchronously. - */ - -struct reinit_br_wh { - struct super_block *sb; - struct au_branch *br; -}; - -static void reinit_br_wh(void *arg) -{ - int err; - aufs_bindex_t bindex; - struct path h_path; - struct reinit_br_wh *a = arg; - struct au_wbr *wbr; - struct inode *dir, *delegated; - struct dentry *h_root; - struct au_hinode *hdir; - - err = 0; - wbr = a->br->br_wbr; - /* big aufs lock */ - si_noflush_write_lock(a->sb); - if (!au_br_writable(a->br->br_perm)) - goto out; - bindex = au_br_index(a->sb, a->br->br_id); - if (unlikely(bindex < 0)) - goto out; - - di_read_lock_parent(a->sb->s_root, AuLock_IR); - dir = d_inode(a->sb->s_root); - hdir = au_hi(dir, bindex); - h_root = au_h_dptr(a->sb->s_root, bindex); - AuDebugOn(h_root != au_br_dentry(a->br)); - - au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT); - wbr_wh_write_lock(wbr); - err = au_h_verify(wbr->wbr_whbase, au_opt_udba(a->sb), hdir->hi_inode, - h_root, a->br); - if (!err) { - h_path.dentry = wbr->wbr_whbase; - h_path.mnt = au_br_mnt(a->br); - delegated = NULL; - err = vfsub_unlink(hdir->hi_inode, &h_path, &delegated, - /*force*/0); - if (unlikely(err == -EWOULDBLOCK)) { - pr_warn("cannot retry for NFSv4 delegation" - " for an internal unlink\n"); - iput(delegated); - } - } else { - pr_warn("%pd is moved, ignored\n", wbr->wbr_whbase); - err = 0; - } - dput(wbr->wbr_whbase); - wbr->wbr_whbase = NULL; - if (!err) - err = au_wh_init(a->br, a->sb); - wbr_wh_write_unlock(wbr); - au_hn_imtx_unlock(hdir); - di_read_unlock(a->sb->s_root, AuLock_IR); - if (!err) - au_fhsm_wrote(a->sb, bindex, /*force*/0); - -out: - if (wbr) - atomic_dec(&wbr->wbr_wh_running); - atomic_dec(&a->br->br_count); - si_write_unlock(a->sb); - au_nwt_done(&au_sbi(a->sb)->si_nowait); - kfree(arg); - if (unlikely(err)) - AuIOErr("err %d\n", err); -} - -static void kick_reinit_br_wh(struct super_block *sb, struct au_branch *br) -{ - int do_dec, wkq_err; - struct reinit_br_wh *arg; - - do_dec = 1; - if (atomic_inc_return(&br->br_wbr->wbr_wh_running) != 1) - goto out; - - /* ignore ENOMEM */ - arg = kmalloc(sizeof(*arg), GFP_NOFS); - if (arg) { - /* - * dec(wh_running), kfree(arg) and dec(br_count) - * in reinit function - */ - arg->sb = sb; - arg->br = br; - atomic_inc(&br->br_count); - wkq_err = au_wkq_nowait(reinit_br_wh, arg, sb, /*flags*/0); - if (unlikely(wkq_err)) { - atomic_dec(&br->br_wbr->wbr_wh_running); - atomic_dec(&br->br_count); - kfree(arg); - } - do_dec = 0; - } - -out: - if (do_dec) - atomic_dec(&br->br_wbr->wbr_wh_running); -} - -/* ---------------------------------------------------------------------- */ - -/* - * create the whiteout @wh. - */ -static int link_or_create_wh(struct super_block *sb, aufs_bindex_t bindex, - struct dentry *wh) -{ - int err; - struct path h_path = { - .dentry = wh - }; - struct au_branch *br; - struct au_wbr *wbr; - struct dentry *h_parent; - struct inode *h_dir, *delegated; - - h_parent = wh->d_parent; /* dir inode is locked */ - h_dir = d_inode(h_parent); - IMustLock(h_dir); - - br = au_sbr(sb, bindex); - h_path.mnt = au_br_mnt(br); - wbr = br->br_wbr; - wbr_wh_read_lock(wbr); - if (wbr->wbr_whbase) { - delegated = NULL; - err = vfsub_link(wbr->wbr_whbase, h_dir, &h_path, &delegated); - if (unlikely(err == -EWOULDBLOCK)) { - pr_warn("cannot retry for NFSv4 delegation" - " for an internal link\n"); - iput(delegated); - } - if (!err || err != -EMLINK) - goto out; - - /* link count full. re-initialize br_whbase. */ - kick_reinit_br_wh(sb, br); - } - - /* return this error in this context */ - err = vfsub_create(h_dir, &h_path, WH_MASK, /*want_excl*/true); - if (!err) - au_fhsm_wrote(sb, bindex, /*force*/0); - -out: - wbr_wh_read_unlock(wbr); - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* - * create or remove the diropq. - */ -static struct dentry *do_diropq(struct dentry *dentry, aufs_bindex_t bindex, - unsigned int flags) -{ - struct dentry *opq_dentry, *h_dentry; - struct super_block *sb; - struct au_branch *br; - int err; - - sb = dentry->d_sb; - br = au_sbr(sb, bindex); - h_dentry = au_h_dptr(dentry, bindex); - opq_dentry = vfsub_lkup_one(&diropq_name, h_dentry); - if (IS_ERR(opq_dentry)) - goto out; - - if (au_ftest_diropq(flags, CREATE)) { - err = link_or_create_wh(sb, bindex, opq_dentry); - if (!err) { - au_set_dbdiropq(dentry, bindex); - goto out; /* success */ - } - } else { - struct path tmp = { - .dentry = opq_dentry, - .mnt = au_br_mnt(br) - }; - err = do_unlink_wh(au_h_iptr(d_inode(dentry), bindex), &tmp); - if (!err) - au_set_dbdiropq(dentry, -1); - } - dput(opq_dentry); - opq_dentry = ERR_PTR(err); - -out: - return opq_dentry; -} - -struct do_diropq_args { - struct dentry **errp; - struct dentry *dentry; - aufs_bindex_t bindex; - unsigned int flags; -}; - -static void call_do_diropq(void *args) -{ - struct do_diropq_args *a = args; - *a->errp = do_diropq(a->dentry, a->bindex, a->flags); -} - -struct dentry *au_diropq_sio(struct dentry *dentry, aufs_bindex_t bindex, - unsigned int flags) -{ - struct dentry *diropq, *h_dentry; - - h_dentry = au_h_dptr(dentry, bindex); - if (!au_test_h_perm_sio(d_inode(h_dentry), MAY_EXEC | MAY_WRITE)) - diropq = do_diropq(dentry, bindex, flags); - else { - int wkq_err; - struct do_diropq_args args = { - .errp = &diropq, - .dentry = dentry, - .bindex = bindex, - .flags = flags - }; - - wkq_err = au_wkq_wait(call_do_diropq, &args); - if (unlikely(wkq_err)) - diropq = ERR_PTR(wkq_err); - } - - return diropq; -} - -/* ---------------------------------------------------------------------- */ - -/* - * lookup whiteout dentry. - * @h_parent: lower parent dentry which must exist and be locked - * @base_name: name of dentry which will be whiteouted - * returns dentry for whiteout. - */ -struct dentry *au_wh_lkup(struct dentry *h_parent, struct qstr *base_name, - struct au_branch *br) -{ - int err; - struct qstr wh_name; - struct dentry *wh_dentry; - - err = au_wh_name_alloc(&wh_name, base_name); - wh_dentry = ERR_PTR(err); - if (!err) { - wh_dentry = vfsub_lkup_one(&wh_name, h_parent); - kfree(wh_name.name); - } - return wh_dentry; -} - -/* - * link/create a whiteout for @dentry on @bindex. - */ -struct dentry *au_wh_create(struct dentry *dentry, aufs_bindex_t bindex, - struct dentry *h_parent) -{ - struct dentry *wh_dentry; - struct super_block *sb; - int err; - - sb = dentry->d_sb; - wh_dentry = au_wh_lkup(h_parent, &dentry->d_name, au_sbr(sb, bindex)); - if (!IS_ERR(wh_dentry) && d_is_negative(wh_dentry)) { - err = link_or_create_wh(sb, bindex, wh_dentry); - if (!err) { - au_set_dbwh(dentry, bindex); - au_fhsm_wrote(sb, bindex, /*force*/0); - } else { - dput(wh_dentry); - wh_dentry = ERR_PTR(err); - } - } - - return wh_dentry; -} - -/* ---------------------------------------------------------------------- */ - -/* Delete all whiteouts in this directory on branch bindex. */ -static int del_wh_children(struct dentry *h_dentry, struct au_nhash *whlist, - aufs_bindex_t bindex, struct au_branch *br) -{ - int err; - unsigned long ul, n; - struct qstr wh_name; - char *p; - struct hlist_head *head; - struct au_vdir_wh *pos; - struct au_vdir_destr *str; - - err = -ENOMEM; - p = (void *)__get_free_page(GFP_NOFS); - wh_name.name = p; - if (unlikely(!wh_name.name)) - goto out; - - err = 0; - memcpy(p, AUFS_WH_PFX, AUFS_WH_PFX_LEN); - p += AUFS_WH_PFX_LEN; - n = whlist->nh_num; - head = whlist->nh_head; - for (ul = 0; !err && ul < n; ul++, head++) { - hlist_for_each_entry(pos, head, wh_hash) { - if (pos->wh_bindex != bindex) - continue; - - str = &pos->wh_str; - if (str->len + AUFS_WH_PFX_LEN <= PATH_MAX) { - memcpy(p, str->name, str->len); - wh_name.len = AUFS_WH_PFX_LEN + str->len; - err = unlink_wh_name(h_dentry, &wh_name, br); - if (!err) - continue; - break; - } - AuIOErr("whiteout name too long %.*s\n", - str->len, str->name); - err = -EIO; - break; - } - } - free_page((unsigned long)wh_name.name); - -out: - return err; -} - -struct del_wh_children_args { - int *errp; - struct dentry *h_dentry; - struct au_nhash *whlist; - aufs_bindex_t bindex; - struct au_branch *br; -}; - -static void call_del_wh_children(void *args) -{ - struct del_wh_children_args *a = args; - *a->errp = del_wh_children(a->h_dentry, a->whlist, a->bindex, a->br); -} - -/* ---------------------------------------------------------------------- */ - -struct au_whtmp_rmdir *au_whtmp_rmdir_alloc(struct super_block *sb, gfp_t gfp) -{ - struct au_whtmp_rmdir *whtmp; - int err; - unsigned int rdhash; - - SiMustAnyLock(sb); - - whtmp = kzalloc(sizeof(*whtmp), gfp); - if (unlikely(!whtmp)) { - whtmp = ERR_PTR(-ENOMEM); - goto out; - } - - /* no estimation for dir size */ - rdhash = au_sbi(sb)->si_rdhash; - if (!rdhash) - rdhash = AUFS_RDHASH_DEF; - err = au_nhash_alloc(&whtmp->whlist, rdhash, gfp); - if (unlikely(err)) { - kfree(whtmp); - whtmp = ERR_PTR(err); - } - -out: - return whtmp; -} - -void au_whtmp_rmdir_free(struct au_whtmp_rmdir *whtmp) -{ - if (whtmp->br) - atomic_dec(&whtmp->br->br_count); - dput(whtmp->wh_dentry); - iput(whtmp->dir); - au_nhash_wh_free(&whtmp->whlist); - kfree(whtmp); -} - -/* - * rmdir the whiteouted temporary named dir @h_dentry. - * @whlist: whiteouted children. - */ -int au_whtmp_rmdir(struct inode *dir, aufs_bindex_t bindex, - struct dentry *wh_dentry, struct au_nhash *whlist) -{ - int err; - unsigned int h_nlink; - struct path h_tmp; - struct inode *wh_inode, *h_dir; - struct au_branch *br; - - h_dir = d_inode(wh_dentry->d_parent); /* dir inode is locked */ - IMustLock(h_dir); - - br = au_sbr(dir->i_sb, bindex); - wh_inode = d_inode(wh_dentry); - mutex_lock_nested(&wh_inode->i_mutex, AuLsc_I_CHILD); - - /* - * someone else might change some whiteouts while we were sleeping. - * it means this whlist may have an obsoleted entry. - */ - if (!au_test_h_perm_sio(wh_inode, MAY_EXEC | MAY_WRITE)) - err = del_wh_children(wh_dentry, whlist, bindex, br); - else { - int wkq_err; - struct del_wh_children_args args = { - .errp = &err, - .h_dentry = wh_dentry, - .whlist = whlist, - .bindex = bindex, - .br = br - }; - - wkq_err = au_wkq_wait(call_del_wh_children, &args); - if (unlikely(wkq_err)) - err = wkq_err; - } - mutex_unlock(&wh_inode->i_mutex); - - if (!err) { - h_tmp.dentry = wh_dentry; - h_tmp.mnt = au_br_mnt(br); - h_nlink = h_dir->i_nlink; - err = vfsub_rmdir(h_dir, &h_tmp); - /* some fs doesn't change the parent nlink in some cases */ - h_nlink -= h_dir->i_nlink; - } - - if (!err) { - if (au_ibstart(dir) == bindex) { - /* todo: dir->i_mutex is necessary */ - au_cpup_attr_timesizes(dir); - if (h_nlink) - vfsub_drop_nlink(dir); - } - return 0; /* success */ - } - - pr_warn("failed removing %pd(%d), ignored\n", wh_dentry, err); - return err; -} - -static void call_rmdir_whtmp(void *args) -{ - int err; - aufs_bindex_t bindex; - struct au_whtmp_rmdir *a = args; - struct super_block *sb; - struct dentry *h_parent; - struct inode *h_dir; - struct au_hinode *hdir; - - /* rmdir by nfsd may cause deadlock with this i_mutex */ - /* mutex_lock(&a->dir->i_mutex); */ - err = -EROFS; - sb = a->dir->i_sb; - si_read_lock(sb, !AuLock_FLUSH); - if (!au_br_writable(a->br->br_perm)) - goto out; - bindex = au_br_index(sb, a->br->br_id); - if (unlikely(bindex < 0)) - goto out; - - err = -EIO; - ii_write_lock_parent(a->dir); - h_parent = dget_parent(a->wh_dentry); - h_dir = d_inode(h_parent); - hdir = au_hi(a->dir, bindex); - err = vfsub_mnt_want_write(au_br_mnt(a->br)); - if (unlikely(err)) - goto out_mnt; - au_hn_imtx_lock_nested(hdir, AuLsc_I_PARENT); - err = au_h_verify(a->wh_dentry, au_opt_udba(sb), h_dir, h_parent, - a->br); - if (!err) - err = au_whtmp_rmdir(a->dir, bindex, a->wh_dentry, &a->whlist); - au_hn_imtx_unlock(hdir); - vfsub_mnt_drop_write(au_br_mnt(a->br)); - -out_mnt: - dput(h_parent); - ii_write_unlock(a->dir); -out: - /* mutex_unlock(&a->dir->i_mutex); */ - au_whtmp_rmdir_free(a); - si_read_unlock(sb); - au_nwt_done(&au_sbi(sb)->si_nowait); - if (unlikely(err)) - AuIOErr("err %d\n", err); -} - -void au_whtmp_kick_rmdir(struct inode *dir, aufs_bindex_t bindex, - struct dentry *wh_dentry, struct au_whtmp_rmdir *args) -{ - int wkq_err; - struct super_block *sb; - - IMustLock(dir); - - /* all post-process will be done in do_rmdir_whtmp(). */ - sb = dir->i_sb; - args->dir = au_igrab(dir); - args->br = au_sbr(sb, bindex); - atomic_inc(&args->br->br_count); - args->wh_dentry = dget(wh_dentry); - wkq_err = au_wkq_nowait(call_rmdir_whtmp, args, sb, /*flags*/0); - if (unlikely(wkq_err)) { - pr_warn("rmdir error %pd (%d), ignored\n", wh_dentry, wkq_err); - au_whtmp_rmdir_free(args); - } -} diff --git a/fs/aufs/whout.h b/fs/aufs/whout.h deleted file mode 100644 index 4077dd19e..000000000 --- a/fs/aufs/whout.h +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * whiteout for logical deletion and opaque directory - */ - -#ifndef __AUFS_WHOUT_H__ -#define __AUFS_WHOUT_H__ - -#ifdef __KERNEL__ - -#include "dir.h" - -/* whout.c */ -int au_wh_name_alloc(struct qstr *wh, const struct qstr *name); -int au_wh_test(struct dentry *h_parent, struct qstr *wh_name, int try_sio); -int au_diropq_test(struct dentry *h_dentry); -struct au_branch; -struct dentry *au_whtmp_lkup(struct dentry *h_parent, struct au_branch *br, - struct qstr *prefix); -int au_whtmp_ren(struct dentry *h_dentry, struct au_branch *br); -int au_wh_unlink_dentry(struct inode *h_dir, struct path *h_path, - struct dentry *dentry); -int au_wh_init(struct au_branch *br, struct super_block *sb); - -/* diropq flags */ -#define AuDiropq_CREATE 1 -#define au_ftest_diropq(flags, name) ((flags) & AuDiropq_##name) -#define au_fset_diropq(flags, name) \ - do { (flags) |= AuDiropq_##name; } while (0) -#define au_fclr_diropq(flags, name) \ - do { (flags) &= ~AuDiropq_##name; } while (0) - -struct dentry *au_diropq_sio(struct dentry *dentry, aufs_bindex_t bindex, - unsigned int flags); -struct dentry *au_wh_lkup(struct dentry *h_parent, struct qstr *base_name, - struct au_branch *br); -struct dentry *au_wh_create(struct dentry *dentry, aufs_bindex_t bindex, - struct dentry *h_parent); - -/* real rmdir for the whiteout-ed dir */ -struct au_whtmp_rmdir { - struct inode *dir; - struct au_branch *br; - struct dentry *wh_dentry; - struct au_nhash whlist; -}; - -struct au_whtmp_rmdir *au_whtmp_rmdir_alloc(struct super_block *sb, gfp_t gfp); -void au_whtmp_rmdir_free(struct au_whtmp_rmdir *whtmp); -int au_whtmp_rmdir(struct inode *dir, aufs_bindex_t bindex, - struct dentry *wh_dentry, struct au_nhash *whlist); -void au_whtmp_kick_rmdir(struct inode *dir, aufs_bindex_t bindex, - struct dentry *wh_dentry, struct au_whtmp_rmdir *args); - -/* ---------------------------------------------------------------------- */ - -static inline struct dentry *au_diropq_create(struct dentry *dentry, - aufs_bindex_t bindex) -{ - return au_diropq_sio(dentry, bindex, AuDiropq_CREATE); -} - -static inline int au_diropq_remove(struct dentry *dentry, aufs_bindex_t bindex) -{ - return PTR_ERR(au_diropq_sio(dentry, bindex, !AuDiropq_CREATE)); -} - -#endif /* __KERNEL__ */ -#endif /* __AUFS_WHOUT_H__ */ diff --git a/fs/aufs/wkq.c b/fs/aufs/wkq.c deleted file mode 100644 index 0f1500e93..000000000 --- a/fs/aufs/wkq.c +++ /dev/null @@ -1,200 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * workqueue for asynchronous/super-io operations - * todo: try new dredential scheme - */ - -#include <linux/module.h> -#include "aufs.h" - -/* internal workqueue named AUFS_WKQ_NAME */ - -static struct workqueue_struct *au_wkq; - -struct au_wkinfo { - struct work_struct wk; - struct kobject *kobj; - - unsigned int flags; /* see wkq.h */ - - au_wkq_func_t func; - void *args; - - struct completion *comp; -}; - -/* ---------------------------------------------------------------------- */ - -static void wkq_func(struct work_struct *wk) -{ - struct au_wkinfo *wkinfo = container_of(wk, struct au_wkinfo, wk); - - AuDebugOn(!uid_eq(current_fsuid(), GLOBAL_ROOT_UID)); - AuDebugOn(rlimit(RLIMIT_FSIZE) != RLIM_INFINITY); - - wkinfo->func(wkinfo->args); - if (au_ftest_wkq(wkinfo->flags, WAIT)) - complete(wkinfo->comp); - else { - kobject_put(wkinfo->kobj); - module_put(THIS_MODULE); /* todo: ?? */ - kfree(wkinfo); - } -} - -/* - * Since struct completion is large, try allocating it dynamically. - */ -#if 1 /* defined(CONFIG_4KSTACKS) || defined(AuTest4KSTACKS) */ -#define AuWkqCompDeclare(name) struct completion *comp = NULL - -static int au_wkq_comp_alloc(struct au_wkinfo *wkinfo, struct completion **comp) -{ - *comp = kmalloc(sizeof(**comp), GFP_NOFS); - if (*comp) { - init_completion(*comp); - wkinfo->comp = *comp; - return 0; - } - return -ENOMEM; -} - -static void au_wkq_comp_free(struct completion *comp) -{ - kfree(comp); -} - -#else - -/* no braces */ -#define AuWkqCompDeclare(name) \ - DECLARE_COMPLETION_ONSTACK(_ ## name); \ - struct completion *comp = &_ ## name - -static int au_wkq_comp_alloc(struct au_wkinfo *wkinfo, struct completion **comp) -{ - wkinfo->comp = *comp; - return 0; -} - -static void au_wkq_comp_free(struct completion *comp __maybe_unused) -{ - /* empty */ -} -#endif /* 4KSTACKS */ - -static void au_wkq_run(struct au_wkinfo *wkinfo) -{ - if (au_ftest_wkq(wkinfo->flags, NEST)) { - if (au_wkq_test()) { - AuWarn1("wkq from wkq, unless silly-rename on NFS," - " due to a dead dir by UDBA?\n"); - AuDebugOn(au_ftest_wkq(wkinfo->flags, WAIT)); - } - } else - au_dbg_verify_kthread(); - - if (au_ftest_wkq(wkinfo->flags, WAIT)) { - INIT_WORK_ONSTACK(&wkinfo->wk, wkq_func); - queue_work(au_wkq, &wkinfo->wk); - } else { - INIT_WORK(&wkinfo->wk, wkq_func); - schedule_work(&wkinfo->wk); - } -} - -/* - * Be careful. It is easy to make deadlock happen. - * processA: lock, wkq and wait - * processB: wkq and wait, lock in wkq - * --> deadlock - */ -int au_wkq_do_wait(unsigned int flags, au_wkq_func_t func, void *args) -{ - int err; - AuWkqCompDeclare(comp); - struct au_wkinfo wkinfo = { - .flags = flags, - .func = func, - .args = args - }; - - err = au_wkq_comp_alloc(&wkinfo, &comp); - if (!err) { - au_wkq_run(&wkinfo); - /* no timeout, no interrupt */ - wait_for_completion(wkinfo.comp); - au_wkq_comp_free(comp); - destroy_work_on_stack(&wkinfo.wk); - } - - return err; - -} - -/* - * Note: dget/dput() in func for aufs dentries are not supported. It will be a - * problem in a concurrent umounting. - */ -int au_wkq_nowait(au_wkq_func_t func, void *args, struct super_block *sb, - unsigned int flags) -{ - int err; - struct au_wkinfo *wkinfo; - - atomic_inc(&au_sbi(sb)->si_nowait.nw_len); - - /* - * wkq_func() must free this wkinfo. - * it highly depends upon the implementation of workqueue. - */ - err = 0; - wkinfo = kmalloc(sizeof(*wkinfo), GFP_NOFS); - if (wkinfo) { - wkinfo->kobj = &au_sbi(sb)->si_kobj; - wkinfo->flags = flags & ~AuWkq_WAIT; - wkinfo->func = func; - wkinfo->args = args; - wkinfo->comp = NULL; - kobject_get(wkinfo->kobj); - __module_get(THIS_MODULE); /* todo: ?? */ - - au_wkq_run(wkinfo); - } else { - err = -ENOMEM; - au_nwt_done(&au_sbi(sb)->si_nowait); - } - - return err; -} - -/* ---------------------------------------------------------------------- */ - -void au_nwt_init(struct au_nowait_tasks *nwt) -{ - atomic_set(&nwt->nw_len, 0); - /* smp_mb(); */ /* atomic_set */ - init_waitqueue_head(&nwt->nw_wq); -} - -void au_wkq_fin(void) -{ - destroy_workqueue(au_wkq); -} - -int __init au_wkq_init(void) -{ - int err; - - err = 0; - au_wkq = alloc_workqueue(AUFS_WKQ_NAME, 0, WQ_DFL_ACTIVE); - if (IS_ERR(au_wkq)) - err = PTR_ERR(au_wkq); - else if (!au_wkq) - err = -ENOMEM; - - return err; -} diff --git a/fs/aufs/wkq.h b/fs/aufs/wkq.h deleted file mode 100644 index f6c9b9902..000000000 --- a/fs/aufs/wkq.h +++ /dev/null @@ -1,78 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * workqueue for asynchronous/super-io operations - * todo: try new credentials management scheme - */ - -#ifndef __AUFS_WKQ_H__ -#define __AUFS_WKQ_H__ - -#ifdef __KERNEL__ - -struct super_block; - -/* ---------------------------------------------------------------------- */ - -/* - * in the next operation, wait for the 'nowait' tasks in system-wide workqueue - */ -struct au_nowait_tasks { - atomic_t nw_len; - wait_queue_head_t nw_wq; -}; - -/* ---------------------------------------------------------------------- */ - -typedef void (*au_wkq_func_t)(void *args); - -/* wkq flags */ -#define AuWkq_WAIT 1 -#define AuWkq_NEST (1 << 1) -#define au_ftest_wkq(flags, name) ((flags) & AuWkq_##name) -#define au_fset_wkq(flags, name) \ - do { (flags) |= AuWkq_##name; } while (0) -#define au_fclr_wkq(flags, name) \ - do { (flags) &= ~AuWkq_##name; } while (0) - -#ifndef CONFIG_AUFS_HNOTIFY -#undef AuWkq_NEST -#define AuWkq_NEST 0 -#endif - -/* wkq.c */ -int au_wkq_do_wait(unsigned int flags, au_wkq_func_t func, void *args); -int au_wkq_nowait(au_wkq_func_t func, void *args, struct super_block *sb, - unsigned int flags); -void au_nwt_init(struct au_nowait_tasks *nwt); -int __init au_wkq_init(void); -void au_wkq_fin(void); - -/* ---------------------------------------------------------------------- */ - -static inline int au_wkq_test(void) -{ - return current->flags & PF_WQ_WORKER; -} - -static inline int au_wkq_wait(au_wkq_func_t func, void *args) -{ - return au_wkq_do_wait(AuWkq_WAIT, func, args); -} - -static inline void au_nwt_done(struct au_nowait_tasks *nwt) -{ - if (atomic_dec_and_test(&nwt->nw_len)) - wake_up_all(&nwt->nw_wq); -} - -static inline int au_nwt_flush(struct au_nowait_tasks *nwt) -{ - wait_event(nwt->nw_wq, !atomic_read(&nwt->nw_len)); - return 0; -} - -#endif /* __KERNEL__ */ -#endif /* __AUFS_WKQ_H__ */ diff --git a/fs/aufs/xattr.c b/fs/aufs/xattr.c deleted file mode 100644 index f592e05ea..000000000 --- a/fs/aufs/xattr.c +++ /dev/null @@ -1,331 +0,0 @@ -/* - * Copyright (C) 2014-2016 Junjiro R. Okajima - */ - -/* - * handling xattr functions - */ - -#include <linux/xattr.h> -#include "aufs.h" - -static int au_xattr_ignore(int err, char *name, unsigned int ignore_flags) -{ - if (!ignore_flags) - goto out; - switch (err) { - case -ENOMEM: - case -EDQUOT: - goto out; - } - - if ((ignore_flags & AuBrAttr_ICEX) == AuBrAttr_ICEX) { - err = 0; - goto out; - } - -#define cmp(brattr, prefix) do { \ - if (!strncmp(name, XATTR_##prefix##_PREFIX, \ - XATTR_##prefix##_PREFIX_LEN)) { \ - if (ignore_flags & AuBrAttr_ICEX_##brattr) \ - err = 0; \ - goto out; \ - } \ - } while (0) - - cmp(SEC, SECURITY); - cmp(SYS, SYSTEM); - cmp(TR, TRUSTED); - cmp(USR, USER); -#undef cmp - - if (ignore_flags & AuBrAttr_ICEX_OTH) - err = 0; - -out: - return err; -} - -static const int au_xattr_out_of_list = AuBrAttr_ICEX_OTH << 1; - -static int au_do_cpup_xattr(struct dentry *h_dst, struct dentry *h_src, - char *name, char **buf, unsigned int ignore_flags, - unsigned int verbose) -{ - int err; - ssize_t ssz; - struct inode *h_idst; - - ssz = vfs_getxattr_alloc(h_src, name, buf, 0, GFP_NOFS); - err = ssz; - if (unlikely(err <= 0)) { - if (err == -ENODATA - || (err == -EOPNOTSUPP - && ((ignore_flags & au_xattr_out_of_list) - || (au_test_nfs_noacl(d_inode(h_src)) - && (!strcmp(name, XATTR_NAME_POSIX_ACL_ACCESS) - || !strcmp(name, - XATTR_NAME_POSIX_ACL_DEFAULT)))) - )) - err = 0; - if (err && (verbose || au_debug_test())) - pr_err("%s, err %d\n", name, err); - goto out; - } - - /* unlock it temporary */ - h_idst = d_inode(h_dst); - mutex_unlock(&h_idst->i_mutex); - err = vfsub_setxattr(h_dst, name, *buf, ssz, /*flags*/0); - mutex_lock_nested(&h_idst->i_mutex, AuLsc_I_CHILD2); - if (unlikely(err)) { - if (verbose || au_debug_test()) - pr_err("%s, err %d\n", name, err); - err = au_xattr_ignore(err, name, ignore_flags); - } - -out: - return err; -} - -int au_cpup_xattr(struct dentry *h_dst, struct dentry *h_src, int ignore_flags, - unsigned int verbose) -{ - int err, unlocked, acl_access, acl_default; - ssize_t ssz; - struct inode *h_isrc, *h_idst; - char *value, *p, *o, *e; - - /* try stopping to update the source inode while we are referencing */ - /* there should not be the parent-child relationship between them */ - h_isrc = d_inode(h_src); - h_idst = d_inode(h_dst); - mutex_unlock(&h_idst->i_mutex); - mutex_lock_nested(&h_isrc->i_mutex, AuLsc_I_CHILD); - mutex_lock_nested(&h_idst->i_mutex, AuLsc_I_CHILD2); - unlocked = 0; - - /* some filesystems don't list POSIX ACL, for example tmpfs */ - ssz = vfs_listxattr(h_src, NULL, 0); - err = ssz; - if (unlikely(err < 0)) { - AuTraceErr(err); - if (err == -ENODATA - || err == -EOPNOTSUPP) - err = 0; /* ignore */ - goto out; - } - - err = 0; - p = NULL; - o = NULL; - if (ssz) { - err = -ENOMEM; - p = kmalloc(ssz, GFP_NOFS); - o = p; - if (unlikely(!p)) - goto out; - err = vfs_listxattr(h_src, p, ssz); - } - mutex_unlock(&h_isrc->i_mutex); - unlocked = 1; - AuDbg("err %d, ssz %zd\n", err, ssz); - if (unlikely(err < 0)) - goto out_free; - - err = 0; - e = p + ssz; - value = NULL; - acl_access = 0; - acl_default = 0; - while (!err && p < e) { - acl_access |= !strncmp(p, XATTR_NAME_POSIX_ACL_ACCESS, - sizeof(XATTR_NAME_POSIX_ACL_ACCESS) - 1); - acl_default |= !strncmp(p, XATTR_NAME_POSIX_ACL_DEFAULT, - sizeof(XATTR_NAME_POSIX_ACL_DEFAULT) - - 1); - err = au_do_cpup_xattr(h_dst, h_src, p, &value, ignore_flags, - verbose); - p += strlen(p) + 1; - } - AuTraceErr(err); - ignore_flags |= au_xattr_out_of_list; - if (!err && !acl_access) { - err = au_do_cpup_xattr(h_dst, h_src, - XATTR_NAME_POSIX_ACL_ACCESS, &value, - ignore_flags, verbose); - AuTraceErr(err); - } - if (!err && !acl_default) { - err = au_do_cpup_xattr(h_dst, h_src, - XATTR_NAME_POSIX_ACL_DEFAULT, &value, - ignore_flags, verbose); - AuTraceErr(err); - } - - kfree(value); - -out_free: - kfree(o); -out: - if (!unlocked) - mutex_unlock(&h_isrc->i_mutex); - AuTraceErr(err); - return err; -} - -/* ---------------------------------------------------------------------- */ - -enum { - AU_XATTR_LIST, - AU_XATTR_GET -}; - -struct au_lgxattr { - int type; - union { - struct { - char *list; - size_t size; - } list; - struct { - const char *name; - void *value; - size_t size; - } get; - } u; -}; - -static ssize_t au_lgxattr(struct dentry *dentry, struct au_lgxattr *arg) -{ - ssize_t err; - struct path h_path; - struct super_block *sb; - - sb = dentry->d_sb; - err = si_read_lock(sb, AuLock_FLUSH | AuLock_NOPLM); - if (unlikely(err)) - goto out; - err = au_h_path_getattr(dentry, /*force*/1, &h_path); - if (unlikely(err)) - goto out_si; - if (unlikely(!h_path.dentry)) - /* illegally overlapped or something */ - goto out_di; /* pretending success */ - - /* always topmost entry only */ - switch (arg->type) { - case AU_XATTR_LIST: - err = vfs_listxattr(h_path.dentry, - arg->u.list.list, arg->u.list.size); - break; - case AU_XATTR_GET: - err = vfs_getxattr(h_path.dentry, - arg->u.get.name, arg->u.get.value, - arg->u.get.size); - break; - } - -out_di: - di_read_unlock(dentry, AuLock_IR); -out_si: - si_read_unlock(sb); -out: - AuTraceErr(err); - return err; -} - -ssize_t aufs_listxattr(struct dentry *dentry, char *list, size_t size) -{ - struct au_lgxattr arg = { - .type = AU_XATTR_LIST, - .u.list = { - .list = list, - .size = size - }, - }; - - return au_lgxattr(dentry, &arg); -} - -ssize_t aufs_getxattr(struct dentry *dentry, const char *name, void *value, - size_t size) -{ - struct au_lgxattr arg = { - .type = AU_XATTR_GET, - .u.get = { - .name = name, - .value = value, - .size = size - }, - }; - - return au_lgxattr(dentry, &arg); -} - -int aufs_setxattr(struct dentry *dentry, const char *name, const void *value, - size_t size, int flags) -{ - struct au_srxattr arg = { - .type = AU_XATTR_SET, - .u.set = { - .name = name, - .value = value, - .size = size, - .flags = flags - }, - }; - - return au_srxattr(dentry, &arg); -} - -int aufs_removexattr(struct dentry *dentry, const char *name) -{ - struct au_srxattr arg = { - .type = AU_XATTR_REMOVE, - .u.remove = { - .name = name - }, - }; - - return au_srxattr(dentry, &arg); -} - -/* ---------------------------------------------------------------------- */ - -#if 0 -static size_t au_xattr_list(struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len, int type) -{ - return aufs_listxattr(dentry, list, list_size); -} - -static int au_xattr_get(struct dentry *dentry, const char *name, void *buffer, - size_t size, int type) -{ - return aufs_getxattr(dentry, name, buffer, size); -} - -static int au_xattr_set(struct dentry *dentry, const char *name, - const void *value, size_t size, int flags, int type) -{ - return aufs_setxattr(dentry, name, value, size, flags); -} - -static const struct xattr_handler au_xattr_handler = { - /* no prefix, no flags */ - .list = au_xattr_list, - .get = au_xattr_get, - .set = au_xattr_set - /* why no remove? */ -}; - -static const struct xattr_handler *au_xattr_handlers[] = { - &au_xattr_handler -}; - -void au_xattr_init(struct super_block *sb) -{ - /* sb->s_xattr = au_xattr_handlers; */ -} -#endif diff --git a/fs/aufs/xino.c b/fs/aufs/xino.c deleted file mode 100644 index 994258e3f..000000000 --- a/fs/aufs/xino.c +++ /dev/null @@ -1,1305 +0,0 @@ -/* - * Copyright (C) 2005-2016 Junjiro R. Okajima - */ - -/* - * external inode number translation table and bitmap - */ - -#include <linux/seq_file.h> -#include <linux/statfs.h> -#include "aufs.h" - -/* todo: unnecessary to support mmap_sem since kernel-space? */ -ssize_t xino_fread(vfs_readf_t func, struct file *file, void *kbuf, size_t size, - loff_t *pos) -{ - ssize_t err; - mm_segment_t oldfs; - union { - void *k; - char __user *u; - } buf; - - buf.k = kbuf; - oldfs = get_fs(); - set_fs(KERNEL_DS); - do { - /* todo: signal_pending? */ - err = func(file, buf.u, size, pos); - } while (err == -EAGAIN || err == -EINTR); - set_fs(oldfs); - -#if 0 /* reserved for future use */ - if (err > 0) - fsnotify_access(file->f_path.dentry); -#endif - - return err; -} - -/* ---------------------------------------------------------------------- */ - -static ssize_t xino_fwrite_wkq(vfs_writef_t func, struct file *file, void *buf, - size_t size, loff_t *pos); - -static ssize_t do_xino_fwrite(vfs_writef_t func, struct file *file, void *kbuf, - size_t size, loff_t *pos) -{ - ssize_t err; - mm_segment_t oldfs; - union { - void *k; - const char __user *u; - } buf; - int i; - const int prevent_endless = 10; - - i = 0; - buf.k = kbuf; - oldfs = get_fs(); - set_fs(KERNEL_DS); - do { - err = func(file, buf.u, size, pos); - if (err == -EINTR - && !au_wkq_test() - && fatal_signal_pending(current)) { - set_fs(oldfs); - err = xino_fwrite_wkq(func, file, kbuf, size, pos); - BUG_ON(err == -EINTR); - oldfs = get_fs(); - set_fs(KERNEL_DS); - } - } while (i++ < prevent_endless - && (err == -EAGAIN || err == -EINTR)); - set_fs(oldfs); - -#if 0 /* reserved for future use */ - if (err > 0) - fsnotify_modify(file->f_path.dentry); -#endif - - return err; -} - -struct do_xino_fwrite_args { - ssize_t *errp; - vfs_writef_t func; - struct file *file; - void *buf; - size_t size; - loff_t *pos; -}; - -static void call_do_xino_fwrite(void *args) -{ - struct do_xino_fwrite_args *a = args; - *a->errp = do_xino_fwrite(a->func, a->file, a->buf, a->size, a->pos); -} - -static ssize_t xino_fwrite_wkq(vfs_writef_t func, struct file *file, void *buf, - size_t size, loff_t *pos) -{ - ssize_t err; - int wkq_err; - struct do_xino_fwrite_args args = { - .errp = &err, - .func = func, - .file = file, - .buf = buf, - .size = size, - .pos = pos - }; - - /* - * it breaks RLIMIT_FSIZE and normal user's limit, - * users should care about quota and real 'filesystem full.' - */ - wkq_err = au_wkq_wait(call_do_xino_fwrite, &args); - if (unlikely(wkq_err)) - err = wkq_err; - - return err; -} - -ssize_t xino_fwrite(vfs_writef_t func, struct file *file, void *buf, - size_t size, loff_t *pos) -{ - ssize_t err; - - if (rlimit(RLIMIT_FSIZE) == RLIM_INFINITY) { - lockdep_off(); - err = do_xino_fwrite(func, file, buf, size, pos); - lockdep_on(); - } else - err = xino_fwrite_wkq(func, file, buf, size, pos); - - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* - * create a new xinofile at the same place/path as @base_file. - */ -struct file *au_xino_create2(struct file *base_file, struct file *copy_src) -{ - struct file *file; - struct dentry *base, *parent; - struct inode *dir, *delegated; - struct qstr *name; - struct path path; - int err; - - base = base_file->f_path.dentry; - parent = base->d_parent; /* dir inode is locked */ - dir = d_inode(parent); - IMustLock(dir); - - file = ERR_PTR(-EINVAL); - name = &base->d_name; - path.dentry = vfsub_lookup_one_len(name->name, parent, name->len); - if (IS_ERR(path.dentry)) { - file = (void *)path.dentry; - pr_err("%pd lookup err %ld\n", - base, PTR_ERR(path.dentry)); - goto out; - } - - /* no need to mnt_want_write() since we call dentry_open() later */ - err = vfs_create(dir, path.dentry, S_IRUGO | S_IWUGO, NULL); - if (unlikely(err)) { - file = ERR_PTR(err); - pr_err("%pd create err %d\n", base, err); - goto out_dput; - } - - path.mnt = base_file->f_path.mnt; - file = vfsub_dentry_open(&path, - O_RDWR | O_CREAT | O_EXCL | O_LARGEFILE - /* | __FMODE_NONOTIFY */); - if (IS_ERR(file)) { - pr_err("%pd open err %ld\n", base, PTR_ERR(file)); - goto out_dput; - } - - delegated = NULL; - err = vfsub_unlink(dir, &file->f_path, &delegated, /*force*/0); - if (unlikely(err == -EWOULDBLOCK)) { - pr_warn("cannot retry for NFSv4 delegation" - " for an internal unlink\n"); - iput(delegated); - } - if (unlikely(err)) { - pr_err("%pd unlink err %d\n", base, err); - goto out_fput; - } - - if (copy_src) { - /* no one can touch copy_src xino */ - err = au_copy_file(file, copy_src, vfsub_f_size_read(copy_src)); - if (unlikely(err)) { - pr_err("%pd copy err %d\n", base, err); - goto out_fput; - } - } - goto out_dput; /* success */ - -out_fput: - fput(file); - file = ERR_PTR(err); -out_dput: - dput(path.dentry); -out: - return file; -} - -struct au_xino_lock_dir { - struct au_hinode *hdir; - struct dentry *parent; - struct mutex *mtx; -}; - -static void au_xino_lock_dir(struct super_block *sb, struct file *xino, - struct au_xino_lock_dir *ldir) -{ - aufs_bindex_t brid, bindex; - - ldir->hdir = NULL; - bindex = -1; - brid = au_xino_brid(sb); - if (brid >= 0) - bindex = au_br_index(sb, brid); - if (bindex >= 0) { - ldir->hdir = au_hi(d_inode(sb->s_root), bindex); - au_hn_imtx_lock_nested(ldir->hdir, AuLsc_I_PARENT); - } else { - ldir->parent = dget_parent(xino->f_path.dentry); - ldir->mtx = &d_inode(ldir->parent)->i_mutex; - mutex_lock_nested(ldir->mtx, AuLsc_I_PARENT); - } -} - -static void au_xino_unlock_dir(struct au_xino_lock_dir *ldir) -{ - if (ldir->hdir) - au_hn_imtx_unlock(ldir->hdir); - else { - mutex_unlock(ldir->mtx); - dput(ldir->parent); - } -} - -/* ---------------------------------------------------------------------- */ - -/* trucate xino files asynchronously */ - -int au_xino_trunc(struct super_block *sb, aufs_bindex_t bindex) -{ - int err; - unsigned long jiffy; - blkcnt_t blocks; - aufs_bindex_t bi, bend; - struct kstatfs *st; - struct au_branch *br; - struct file *new_xino, *file; - struct super_block *h_sb; - struct au_xino_lock_dir ldir; - - err = -ENOMEM; - st = kmalloc(sizeof(*st), GFP_NOFS); - if (unlikely(!st)) - goto out; - - err = -EINVAL; - bend = au_sbend(sb); - if (unlikely(bindex < 0 || bend < bindex)) - goto out_st; - br = au_sbr(sb, bindex); - file = br->br_xino.xi_file; - if (!file) - goto out_st; - - err = vfs_statfs(&file->f_path, st); - if (unlikely(err)) - AuErr1("statfs err %d, ignored\n", err); - jiffy = jiffies; - blocks = file_inode(file)->i_blocks; - pr_info("begin truncating xino(b%d), ib%llu, %llu/%llu free blks\n", - bindex, (u64)blocks, st->f_bfree, st->f_blocks); - - au_xino_lock_dir(sb, file, &ldir); - /* mnt_want_write() is unnecessary here */ - new_xino = au_xino_create2(file, file); - au_xino_unlock_dir(&ldir); - err = PTR_ERR(new_xino); - if (IS_ERR(new_xino)) { - pr_err("err %d, ignored\n", err); - goto out_st; - } - err = 0; - fput(file); - br->br_xino.xi_file = new_xino; - - h_sb = au_br_sb(br); - for (bi = 0; bi <= bend; bi++) { - if (unlikely(bi == bindex)) - continue; - br = au_sbr(sb, bi); - if (au_br_sb(br) != h_sb) - continue; - - fput(br->br_xino.xi_file); - br->br_xino.xi_file = new_xino; - get_file(new_xino); - } - - err = vfs_statfs(&new_xino->f_path, st); - if (!err) { - pr_info("end truncating xino(b%d), ib%llu, %llu/%llu free blks\n", - bindex, (u64)file_inode(new_xino)->i_blocks, - st->f_bfree, st->f_blocks); - if (file_inode(new_xino)->i_blocks < blocks) - au_sbi(sb)->si_xino_jiffy = jiffy; - } else - AuErr1("statfs err %d, ignored\n", err); - -out_st: - kfree(st); -out: - return err; -} - -struct xino_do_trunc_args { - struct super_block *sb; - struct au_branch *br; -}; - -static void xino_do_trunc(void *_args) -{ - struct xino_do_trunc_args *args = _args; - struct super_block *sb; - struct au_branch *br; - struct inode *dir; - int err; - aufs_bindex_t bindex; - - err = 0; - sb = args->sb; - dir = d_inode(sb->s_root); - br = args->br; - - si_noflush_write_lock(sb); - ii_read_lock_parent(dir); - bindex = au_br_index(sb, br->br_id); - err = au_xino_trunc(sb, bindex); - ii_read_unlock(dir); - if (unlikely(err)) - pr_warn("err b%d, (%d)\n", bindex, err); - atomic_dec(&br->br_xino_running); - atomic_dec(&br->br_count); - si_write_unlock(sb); - au_nwt_done(&au_sbi(sb)->si_nowait); - kfree(args); -} - -static int xino_trunc_test(struct super_block *sb, struct au_branch *br) -{ - int err; - struct kstatfs st; - struct au_sbinfo *sbinfo; - - /* todo: si_xino_expire and the ratio should be customizable */ - sbinfo = au_sbi(sb); - if (time_before(jiffies, - sbinfo->si_xino_jiffy + sbinfo->si_xino_expire)) - return 0; - - /* truncation border */ - err = vfs_statfs(&br->br_xino.xi_file->f_path, &st); - if (unlikely(err)) { - AuErr1("statfs err %d, ignored\n", err); - return 0; - } - if (div64_u64(st.f_bfree * 100, st.f_blocks) >= AUFS_XINO_DEF_TRUNC) - return 0; - - return 1; -} - -static void xino_try_trunc(struct super_block *sb, struct au_branch *br) -{ - struct xino_do_trunc_args *args; - int wkq_err; - - if (!xino_trunc_test(sb, br)) - return; - - if (atomic_inc_return(&br->br_xino_running) > 1) - goto out; - - /* lock and kfree() will be called in trunc_xino() */ - args = kmalloc(sizeof(*args), GFP_NOFS); - if (unlikely(!args)) { - AuErr1("no memory\n"); - goto out_args; - } - - atomic_inc(&br->br_count); - args->sb = sb; - args->br = br; - wkq_err = au_wkq_nowait(xino_do_trunc, args, sb, /*flags*/0); - if (!wkq_err) - return; /* success */ - - pr_err("wkq %d\n", wkq_err); - atomic_dec(&br->br_count); - -out_args: - kfree(args); -out: - atomic_dec(&br->br_xino_running); -} - -/* ---------------------------------------------------------------------- */ - -static int au_xino_do_write(vfs_writef_t write, struct file *file, - ino_t h_ino, ino_t ino) -{ - loff_t pos; - ssize_t sz; - - pos = h_ino; - if (unlikely(au_loff_max / sizeof(ino) - 1 < pos)) { - AuIOErr1("too large hi%lu\n", (unsigned long)h_ino); - return -EFBIG; - } - pos *= sizeof(ino); - sz = xino_fwrite(write, file, &ino, sizeof(ino), &pos); - if (sz == sizeof(ino)) - return 0; /* success */ - - AuIOErr("write failed (%zd)\n", sz); - return -EIO; -} - -/* - * write @ino to the xinofile for the specified branch{@sb, @bindex} - * at the position of @h_ino. - * even if @ino is zero, it is written to the xinofile and means no entry. - * if the size of the xino file on a specific filesystem exceeds the watermark, - * try truncating it. - */ -int au_xino_write(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, - ino_t ino) -{ - int err; - unsigned int mnt_flags; - struct au_branch *br; - - BUILD_BUG_ON(sizeof(long long) != sizeof(au_loff_max) - || ((loff_t)-1) > 0); - SiMustAnyLock(sb); - - mnt_flags = au_mntflags(sb); - if (!au_opt_test(mnt_flags, XINO)) - return 0; - - br = au_sbr(sb, bindex); - err = au_xino_do_write(au_sbi(sb)->si_xwrite, br->br_xino.xi_file, - h_ino, ino); - if (!err) { - if (au_opt_test(mnt_flags, TRUNC_XINO) - && au_test_fs_trunc_xino(au_br_sb(br))) - xino_try_trunc(sb, br); - return 0; /* success */ - } - - AuIOErr("write failed (%d)\n", err); - return -EIO; -} - -/* ---------------------------------------------------------------------- */ - -/* aufs inode number bitmap */ - -static const int page_bits = (int)PAGE_SIZE * BITS_PER_BYTE; -static ino_t xib_calc_ino(unsigned long pindex, int bit) -{ - ino_t ino; - - AuDebugOn(bit < 0 || page_bits <= bit); - ino = AUFS_FIRST_INO + pindex * page_bits + bit; - return ino; -} - -static void xib_calc_bit(ino_t ino, unsigned long *pindex, int *bit) -{ - AuDebugOn(ino < AUFS_FIRST_INO); - ino -= AUFS_FIRST_INO; - *pindex = ino / page_bits; - *bit = ino % page_bits; -} - -static int xib_pindex(struct super_block *sb, unsigned long pindex) -{ - int err; - loff_t pos; - ssize_t sz; - struct au_sbinfo *sbinfo; - struct file *xib; - unsigned long *p; - - sbinfo = au_sbi(sb); - MtxMustLock(&sbinfo->si_xib_mtx); - AuDebugOn(pindex > ULONG_MAX / PAGE_SIZE - || !au_opt_test(sbinfo->si_mntflags, XINO)); - - if (pindex == sbinfo->si_xib_last_pindex) - return 0; - - xib = sbinfo->si_xib; - p = sbinfo->si_xib_buf; - pos = sbinfo->si_xib_last_pindex; - pos *= PAGE_SIZE; - sz = xino_fwrite(sbinfo->si_xwrite, xib, p, PAGE_SIZE, &pos); - if (unlikely(sz != PAGE_SIZE)) - goto out; - - pos = pindex; - pos *= PAGE_SIZE; - if (vfsub_f_size_read(xib) >= pos + PAGE_SIZE) - sz = xino_fread(sbinfo->si_xread, xib, p, PAGE_SIZE, &pos); - else { - memset(p, 0, PAGE_SIZE); - sz = xino_fwrite(sbinfo->si_xwrite, xib, p, PAGE_SIZE, &pos); - } - if (sz == PAGE_SIZE) { - sbinfo->si_xib_last_pindex = pindex; - return 0; /* success */ - } - -out: - AuIOErr1("write failed (%zd)\n", sz); - err = sz; - if (sz >= 0) - err = -EIO; - return err; -} - -/* ---------------------------------------------------------------------- */ - -static void au_xib_clear_bit(struct inode *inode) -{ - int err, bit; - unsigned long pindex; - struct super_block *sb; - struct au_sbinfo *sbinfo; - - AuDebugOn(inode->i_nlink); - - sb = inode->i_sb; - xib_calc_bit(inode->i_ino, &pindex, &bit); - AuDebugOn(page_bits <= bit); - sbinfo = au_sbi(sb); - mutex_lock(&sbinfo->si_xib_mtx); - err = xib_pindex(sb, pindex); - if (!err) { - clear_bit(bit, sbinfo->si_xib_buf); - sbinfo->si_xib_next_bit = bit; - } - mutex_unlock(&sbinfo->si_xib_mtx); -} - -/* for s_op->delete_inode() */ -void au_xino_delete_inode(struct inode *inode, const int unlinked) -{ - int err; - unsigned int mnt_flags; - aufs_bindex_t bindex, bend, bi; - unsigned char try_trunc; - struct au_iinfo *iinfo; - struct super_block *sb; - struct au_hinode *hi; - struct inode *h_inode; - struct au_branch *br; - vfs_writef_t xwrite; - - sb = inode->i_sb; - mnt_flags = au_mntflags(sb); - if (!au_opt_test(mnt_flags, XINO) - || inode->i_ino == AUFS_ROOT_INO) - return; - - if (unlinked) { - au_xigen_inc(inode); - au_xib_clear_bit(inode); - } - - iinfo = au_ii(inode); - if (!iinfo) - return; - - bindex = iinfo->ii_bstart; - if (bindex < 0) - return; - - xwrite = au_sbi(sb)->si_xwrite; - try_trunc = !!au_opt_test(mnt_flags, TRUNC_XINO); - hi = iinfo->ii_hinode + bindex; - bend = iinfo->ii_bend; - for (; bindex <= bend; bindex++, hi++) { - h_inode = hi->hi_inode; - if (!h_inode - || (!unlinked && h_inode->i_nlink)) - continue; - - /* inode may not be revalidated */ - bi = au_br_index(sb, hi->hi_id); - if (bi < 0) - continue; - - br = au_sbr(sb, bi); - err = au_xino_do_write(xwrite, br->br_xino.xi_file, - h_inode->i_ino, /*ino*/0); - if (!err && try_trunc - && au_test_fs_trunc_xino(au_br_sb(br))) - xino_try_trunc(sb, br); - } -} - -/* get an unused inode number from bitmap */ -ino_t au_xino_new_ino(struct super_block *sb) -{ - ino_t ino; - unsigned long *p, pindex, ul, pend; - struct au_sbinfo *sbinfo; - struct file *file; - int free_bit, err; - - if (!au_opt_test(au_mntflags(sb), XINO)) - return iunique(sb, AUFS_FIRST_INO); - - sbinfo = au_sbi(sb); - mutex_lock(&sbinfo->si_xib_mtx); - p = sbinfo->si_xib_buf; - free_bit = sbinfo->si_xib_next_bit; - if (free_bit < page_bits && !test_bit(free_bit, p)) - goto out; /* success */ - free_bit = find_first_zero_bit(p, page_bits); - if (free_bit < page_bits) - goto out; /* success */ - - pindex = sbinfo->si_xib_last_pindex; - for (ul = pindex - 1; ul < ULONG_MAX; ul--) { - err = xib_pindex(sb, ul); - if (unlikely(err)) - goto out_err; - free_bit = find_first_zero_bit(p, page_bits); - if (free_bit < page_bits) - goto out; /* success */ - } - - file = sbinfo->si_xib; - pend = vfsub_f_size_read(file) / PAGE_SIZE; - for (ul = pindex + 1; ul <= pend; ul++) { - err = xib_pindex(sb, ul); - if (unlikely(err)) - goto out_err; - free_bit = find_first_zero_bit(p, page_bits); - if (free_bit < page_bits) - goto out; /* success */ - } - BUG(); - -out: - set_bit(free_bit, p); - sbinfo->si_xib_next_bit = free_bit + 1; - pindex = sbinfo->si_xib_last_pindex; - mutex_unlock(&sbinfo->si_xib_mtx); - ino = xib_calc_ino(pindex, free_bit); - AuDbg("i%lu\n", (unsigned long)ino); - return ino; -out_err: - mutex_unlock(&sbinfo->si_xib_mtx); - AuDbg("i0\n"); - return 0; -} - -/* - * read @ino from xinofile for the specified branch{@sb, @bindex} - * at the position of @h_ino. - * if @ino does not exist and @do_new is true, get new one. - */ -int au_xino_read(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino, - ino_t *ino) -{ - int err; - ssize_t sz; - loff_t pos; - struct file *file; - struct au_sbinfo *sbinfo; - - *ino = 0; - if (!au_opt_test(au_mntflags(sb), XINO)) - return 0; /* no xino */ - - err = 0; - sbinfo = au_sbi(sb); - pos = h_ino; - if (unlikely(au_loff_max / sizeof(*ino) - 1 < pos)) { - AuIOErr1("too large hi%lu\n", (unsigned long)h_ino); - return -EFBIG; - } - pos *= sizeof(*ino); - - file = au_sbr(sb, bindex)->br_xino.xi_file; - if (vfsub_f_size_read(file) < pos + sizeof(*ino)) - return 0; /* no ino */ - - sz = xino_fread(sbinfo->si_xread, file, ino, sizeof(*ino), &pos); - if (sz == sizeof(*ino)) - return 0; /* success */ - - err = sz; - if (unlikely(sz >= 0)) { - err = -EIO; - AuIOErr("xino read error (%zd)\n", sz); - } - - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* create and set a new xino file */ - -struct file *au_xino_create(struct super_block *sb, char *fname, int silent) -{ - struct file *file; - struct dentry *h_parent, *d; - struct inode *h_dir, *inode; - int err; - - /* - * at mount-time, and the xino file is the default path, - * hnotify is disabled so we have no notify events to ignore. - * when a user specified the xino, we cannot get au_hdir to be ignored. - */ - file = vfsub_filp_open(fname, O_RDWR | O_CREAT | O_EXCL | O_LARGEFILE - /* | __FMODE_NONOTIFY */, - S_IRUGO | S_IWUGO); - if (IS_ERR(file)) { - if (!silent) - pr_err("open %s(%ld)\n", fname, PTR_ERR(file)); - return file; - } - - /* keep file count */ - err = 0; - inode = file_inode(file); - h_parent = dget_parent(file->f_path.dentry); - h_dir = d_inode(h_parent); - mutex_lock_nested(&h_dir->i_mutex, AuLsc_I_PARENT); - /* mnt_want_write() is unnecessary here */ - /* no delegation since it is just created */ - if (inode->i_nlink) - err = vfsub_unlink(h_dir, &file->f_path, /*delegated*/NULL, - /*force*/0); - mutex_unlock(&h_dir->i_mutex); - dput(h_parent); - if (unlikely(err)) { - if (!silent) - pr_err("unlink %s(%d)\n", fname, err); - goto out; - } - - err = -EINVAL; - d = file->f_path.dentry; - if (unlikely(sb == d->d_sb)) { - if (!silent) - pr_err("%s must be outside\n", fname); - goto out; - } - if (unlikely(au_test_fs_bad_xino(d->d_sb))) { - if (!silent) - pr_err("xino doesn't support %s(%s)\n", - fname, au_sbtype(d->d_sb)); - goto out; - } - return file; /* success */ - -out: - fput(file); - file = ERR_PTR(err); - return file; -} - -/* - * find another branch who is on the same filesystem of the specified - * branch{@btgt}. search until @bend. - */ -static int is_sb_shared(struct super_block *sb, aufs_bindex_t btgt, - aufs_bindex_t bend) -{ - aufs_bindex_t bindex; - struct super_block *tgt_sb = au_sbr_sb(sb, btgt); - - for (bindex = 0; bindex < btgt; bindex++) - if (unlikely(tgt_sb == au_sbr_sb(sb, bindex))) - return bindex; - for (bindex++; bindex <= bend; bindex++) - if (unlikely(tgt_sb == au_sbr_sb(sb, bindex))) - return bindex; - return -1; -} - -/* ---------------------------------------------------------------------- */ - -/* - * initialize the xinofile for the specified branch @br - * at the place/path where @base_file indicates. - * test whether another branch is on the same filesystem or not, - * if @do_test is true. - */ -int au_xino_br(struct super_block *sb, struct au_branch *br, ino_t h_ino, - struct file *base_file, int do_test) -{ - int err; - ino_t ino; - aufs_bindex_t bend, bindex; - struct au_branch *shared_br, *b; - struct file *file; - struct super_block *tgt_sb; - - shared_br = NULL; - bend = au_sbend(sb); - if (do_test) { - tgt_sb = au_br_sb(br); - for (bindex = 0; bindex <= bend; bindex++) { - b = au_sbr(sb, bindex); - if (tgt_sb == au_br_sb(b)) { - shared_br = b; - break; - } - } - } - - if (!shared_br || !shared_br->br_xino.xi_file) { - struct au_xino_lock_dir ldir; - - au_xino_lock_dir(sb, base_file, &ldir); - /* mnt_want_write() is unnecessary here */ - file = au_xino_create2(base_file, NULL); - au_xino_unlock_dir(&ldir); - err = PTR_ERR(file); - if (IS_ERR(file)) - goto out; - br->br_xino.xi_file = file; - } else { - br->br_xino.xi_file = shared_br->br_xino.xi_file; - get_file(br->br_xino.xi_file); - } - - ino = AUFS_ROOT_INO; - err = au_xino_do_write(au_sbi(sb)->si_xwrite, br->br_xino.xi_file, - h_ino, ino); - if (unlikely(err)) { - fput(br->br_xino.xi_file); - br->br_xino.xi_file = NULL; - } - -out: - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* trucate a xino bitmap file */ - -/* todo: slow */ -static int do_xib_restore(struct super_block *sb, struct file *file, void *page) -{ - int err, bit; - ssize_t sz; - unsigned long pindex; - loff_t pos, pend; - struct au_sbinfo *sbinfo; - vfs_readf_t func; - ino_t *ino; - unsigned long *p; - - err = 0; - sbinfo = au_sbi(sb); - MtxMustLock(&sbinfo->si_xib_mtx); - p = sbinfo->si_xib_buf; - func = sbinfo->si_xread; - pend = vfsub_f_size_read(file); - pos = 0; - while (pos < pend) { - sz = xino_fread(func, file, page, PAGE_SIZE, &pos); - err = sz; - if (unlikely(sz <= 0)) - goto out; - - err = 0; - for (ino = page; sz > 0; ino++, sz -= sizeof(ino)) { - if (unlikely(*ino < AUFS_FIRST_INO)) - continue; - - xib_calc_bit(*ino, &pindex, &bit); - AuDebugOn(page_bits <= bit); - err = xib_pindex(sb, pindex); - if (!err) - set_bit(bit, p); - else - goto out; - } - } - -out: - return err; -} - -static int xib_restore(struct super_block *sb) -{ - int err; - aufs_bindex_t bindex, bend; - void *page; - - err = -ENOMEM; - page = (void *)__get_free_page(GFP_NOFS); - if (unlikely(!page)) - goto out; - - err = 0; - bend = au_sbend(sb); - for (bindex = 0; !err && bindex <= bend; bindex++) - if (!bindex || is_sb_shared(sb, bindex, bindex - 1) < 0) - err = do_xib_restore - (sb, au_sbr(sb, bindex)->br_xino.xi_file, page); - else - AuDbg("b%d\n", bindex); - free_page((unsigned long)page); - -out: - return err; -} - -int au_xib_trunc(struct super_block *sb) -{ - int err; - ssize_t sz; - loff_t pos; - struct au_xino_lock_dir ldir; - struct au_sbinfo *sbinfo; - unsigned long *p; - struct file *file; - - SiMustWriteLock(sb); - - err = 0; - sbinfo = au_sbi(sb); - if (!au_opt_test(sbinfo->si_mntflags, XINO)) - goto out; - - file = sbinfo->si_xib; - if (vfsub_f_size_read(file) <= PAGE_SIZE) - goto out; - - au_xino_lock_dir(sb, file, &ldir); - /* mnt_want_write() is unnecessary here */ - file = au_xino_create2(sbinfo->si_xib, NULL); - au_xino_unlock_dir(&ldir); - err = PTR_ERR(file); - if (IS_ERR(file)) - goto out; - fput(sbinfo->si_xib); - sbinfo->si_xib = file; - - p = sbinfo->si_xib_buf; - memset(p, 0, PAGE_SIZE); - pos = 0; - sz = xino_fwrite(sbinfo->si_xwrite, sbinfo->si_xib, p, PAGE_SIZE, &pos); - if (unlikely(sz != PAGE_SIZE)) { - err = sz; - AuIOErr("err %d\n", err); - if (sz >= 0) - err = -EIO; - goto out; - } - - mutex_lock(&sbinfo->si_xib_mtx); - /* mnt_want_write() is unnecessary here */ - err = xib_restore(sb); - mutex_unlock(&sbinfo->si_xib_mtx); - -out: - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* - * xino mount option handlers - */ - -/* xino bitmap */ -static void xino_clear_xib(struct super_block *sb) -{ - struct au_sbinfo *sbinfo; - - SiMustWriteLock(sb); - - sbinfo = au_sbi(sb); - sbinfo->si_xread = NULL; - sbinfo->si_xwrite = NULL; - if (sbinfo->si_xib) - fput(sbinfo->si_xib); - sbinfo->si_xib = NULL; - free_page((unsigned long)sbinfo->si_xib_buf); - sbinfo->si_xib_buf = NULL; -} - -static int au_xino_set_xib(struct super_block *sb, struct file *base) -{ - int err; - loff_t pos; - struct au_sbinfo *sbinfo; - struct file *file; - - SiMustWriteLock(sb); - - sbinfo = au_sbi(sb); - file = au_xino_create2(base, sbinfo->si_xib); - err = PTR_ERR(file); - if (IS_ERR(file)) - goto out; - if (sbinfo->si_xib) - fput(sbinfo->si_xib); - sbinfo->si_xib = file; - sbinfo->si_xread = vfs_readf(file); - sbinfo->si_xwrite = vfs_writef(file); - - err = -ENOMEM; - if (!sbinfo->si_xib_buf) - sbinfo->si_xib_buf = (void *)get_zeroed_page(GFP_NOFS); - if (unlikely(!sbinfo->si_xib_buf)) - goto out_unset; - - sbinfo->si_xib_last_pindex = 0; - sbinfo->si_xib_next_bit = 0; - if (vfsub_f_size_read(file) < PAGE_SIZE) { - pos = 0; - err = xino_fwrite(sbinfo->si_xwrite, file, sbinfo->si_xib_buf, - PAGE_SIZE, &pos); - if (unlikely(err != PAGE_SIZE)) - goto out_free; - } - err = 0; - goto out; /* success */ - -out_free: - free_page((unsigned long)sbinfo->si_xib_buf); - sbinfo->si_xib_buf = NULL; - if (err >= 0) - err = -EIO; -out_unset: - fput(sbinfo->si_xib); - sbinfo->si_xib = NULL; - sbinfo->si_xread = NULL; - sbinfo->si_xwrite = NULL; -out: - return err; -} - -/* xino for each branch */ -static void xino_clear_br(struct super_block *sb) -{ - aufs_bindex_t bindex, bend; - struct au_branch *br; - - bend = au_sbend(sb); - for (bindex = 0; bindex <= bend; bindex++) { - br = au_sbr(sb, bindex); - if (!br || !br->br_xino.xi_file) - continue; - - fput(br->br_xino.xi_file); - br->br_xino.xi_file = NULL; - } -} - -static int au_xino_set_br(struct super_block *sb, struct file *base) -{ - int err; - ino_t ino; - aufs_bindex_t bindex, bend, bshared; - struct { - struct file *old, *new; - } *fpair, *p; - struct au_branch *br; - struct inode *inode; - vfs_writef_t writef; - - SiMustWriteLock(sb); - - err = -ENOMEM; - bend = au_sbend(sb); - fpair = kcalloc(bend + 1, sizeof(*fpair), GFP_NOFS); - if (unlikely(!fpair)) - goto out; - - inode = d_inode(sb->s_root); - ino = AUFS_ROOT_INO; - writef = au_sbi(sb)->si_xwrite; - for (bindex = 0, p = fpair; bindex <= bend; bindex++, p++) { - br = au_sbr(sb, bindex); - bshared = is_sb_shared(sb, bindex, bindex - 1); - if (bshared >= 0) { - /* shared xino */ - *p = fpair[bshared]; - get_file(p->new); - } - - if (!p->new) { - /* new xino */ - p->old = br->br_xino.xi_file; - p->new = au_xino_create2(base, br->br_xino.xi_file); - err = PTR_ERR(p->new); - if (IS_ERR(p->new)) { - p->new = NULL; - goto out_pair; - } - } - - err = au_xino_do_write(writef, p->new, - au_h_iptr(inode, bindex)->i_ino, ino); - if (unlikely(err)) - goto out_pair; - } - - for (bindex = 0, p = fpair; bindex <= bend; bindex++, p++) { - br = au_sbr(sb, bindex); - if (br->br_xino.xi_file) - fput(br->br_xino.xi_file); - get_file(p->new); - br->br_xino.xi_file = p->new; - } - -out_pair: - for (bindex = 0, p = fpair; bindex <= bend; bindex++, p++) - if (p->new) - fput(p->new); - else - break; - kfree(fpair); -out: - return err; -} - -void au_xino_clr(struct super_block *sb) -{ - struct au_sbinfo *sbinfo; - - au_xigen_clr(sb); - xino_clear_xib(sb); - xino_clear_br(sb); - sbinfo = au_sbi(sb); - /* lvalue, do not call au_mntflags() */ - au_opt_clr(sbinfo->si_mntflags, XINO); -} - -int au_xino_set(struct super_block *sb, struct au_opt_xino *xino, int remount) -{ - int err, skip; - struct dentry *parent, *cur_parent; - struct qstr *dname, *cur_name; - struct file *cur_xino; - struct inode *dir; - struct au_sbinfo *sbinfo; - - SiMustWriteLock(sb); - - err = 0; - sbinfo = au_sbi(sb); - parent = dget_parent(xino->file->f_path.dentry); - if (remount) { - skip = 0; - dname = &xino->file->f_path.dentry->d_name; - cur_xino = sbinfo->si_xib; - if (cur_xino) { - cur_parent = dget_parent(cur_xino->f_path.dentry); - cur_name = &cur_xino->f_path.dentry->d_name; - skip = (cur_parent == parent - && au_qstreq(dname, cur_name)); - dput(cur_parent); - } - if (skip) - goto out; - } - - au_opt_set(sbinfo->si_mntflags, XINO); - dir = d_inode(parent); - mutex_lock_nested(&dir->i_mutex, AuLsc_I_PARENT); - /* mnt_want_write() is unnecessary here */ - err = au_xino_set_xib(sb, xino->file); - if (!err) - err = au_xigen_set(sb, xino->file); - if (!err) - err = au_xino_set_br(sb, xino->file); - mutex_unlock(&dir->i_mutex); - if (!err) - goto out; /* success */ - - /* reset all */ - AuIOErr("failed creating xino(%d).\n", err); - au_xigen_clr(sb); - xino_clear_xib(sb); - -out: - dput(parent); - return err; -} - -/* ---------------------------------------------------------------------- */ - -/* - * create a xinofile at the default place/path. - */ -struct file *au_xino_def(struct super_block *sb) -{ - struct file *file; - char *page, *p; - struct au_branch *br; - struct super_block *h_sb; - struct path path; - aufs_bindex_t bend, bindex, bwr; - - br = NULL; - bend = au_sbend(sb); - bwr = -1; - for (bindex = 0; bindex <= bend; bindex++) { - br = au_sbr(sb, bindex); - if (au_br_writable(br->br_perm) - && !au_test_fs_bad_xino(au_br_sb(br))) { - bwr = bindex; - break; - } - } - - if (bwr >= 0) { - file = ERR_PTR(-ENOMEM); - page = (void *)__get_free_page(GFP_NOFS); - if (unlikely(!page)) - goto out; - path.mnt = au_br_mnt(br); - path.dentry = au_h_dptr(sb->s_root, bwr); - p = d_path(&path, page, PATH_MAX - sizeof(AUFS_XINO_FNAME)); - file = (void *)p; - if (!IS_ERR(p)) { - strcat(p, "/" AUFS_XINO_FNAME); - AuDbg("%s\n", p); - file = au_xino_create(sb, p, /*silent*/0); - if (!IS_ERR(file)) - au_xino_brid_set(sb, br->br_id); - } - free_page((unsigned long)page); - } else { - file = au_xino_create(sb, AUFS_XINO_DEFPATH, /*silent*/0); - if (IS_ERR(file)) - goto out; - h_sb = file->f_path.dentry->d_sb; - if (unlikely(au_test_fs_bad_xino(h_sb))) { - pr_err("xino doesn't support %s(%s)\n", - AUFS_XINO_DEFPATH, au_sbtype(h_sb)); - fput(file); - file = ERR_PTR(-EINVAL); - } - if (!IS_ERR(file)) - au_xino_brid_set(sb, -1); - } - -out: - return file; -} - -/* ---------------------------------------------------------------------- */ - -int au_xino_path(struct seq_file *seq, struct file *file) -{ - int err; - - err = au_seq_path(seq, &file->f_path); - if (unlikely(err)) - goto out; - -#define Deleted "\\040(deleted)" - seq->count -= sizeof(Deleted) - 1; - AuDebugOn(memcmp(seq->buf + seq->count, Deleted, - sizeof(Deleted) - 1)); -#undef Deleted - -out: - return err; -} diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c index da0c33481..84e037d1d 100644 --- a/fs/autofs4/symlink.c +++ b/fs/autofs4/symlink.c @@ -12,10 +12,16 @@ #include "autofs_i.h" -static const char *autofs4_follow_link(struct dentry *dentry, void **cookie) +static const char *autofs4_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi; + struct autofs_info *ino; + if (!dentry) + return ERR_PTR(-ECHILD); + sbi = autofs4_sbi(dentry->d_sb); + ino = autofs4_dentry_ino(dentry); if (ino && !autofs4_oz_mode(sbi)) ino->last_used = jiffies; return d_inode(dentry)->i_private; @@ -23,5 +29,5 @@ static const char *autofs4_follow_link(struct dentry *dentry, void **cookie) const struct inode_operations autofs4_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = autofs4_follow_link + .get_link = autofs4_get_link }; diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 861b1e1c4..103f5d7c3 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -192,7 +192,7 @@ EXPORT_SYMBOL(make_bad_inode); * Returns true if the inode in question has been marked as bad. */ -int is_bad_inode(struct inode *inode) +bool is_bad_inode(struct inode *inode) { return (inode->i_op == &bad_inode_ops); } diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 46aedacfa..cc0e08252 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -42,7 +42,7 @@ static struct inode *befs_iget(struct super_block *, unsigned long); static struct inode *befs_alloc_inode(struct super_block *sb); static void befs_destroy_inode(struct inode *inode); static void befs_destroy_inodecache(void); -static const char *befs_follow_link(struct dentry *, void **); +static int befs_symlink_readpage(struct file *, struct page *); static int befs_utf2nls(struct super_block *sb, const char *in, int in_len, char **out, int *out_len); static int befs_nls2utf(struct super_block *sb, const char *in, int in_len, @@ -79,10 +79,8 @@ static const struct address_space_operations befs_aops = { .bmap = befs_bmap, }; -static const struct inode_operations befs_symlink_inode_operations = { - .readlink = generic_readlink, - .follow_link = befs_follow_link, - .put_link = kfree_put_link, +static const struct address_space_operations befs_symlink_aops = { + .readpage = befs_symlink_readpage, }; /* @@ -398,7 +396,9 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino) inode->i_fop = &befs_dir_operations; } else if (S_ISLNK(inode->i_mode)) { if (befs_ino->i_flags & BEFS_LONG_SYMLINK) { - inode->i_op = &befs_symlink_inode_operations; + inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); + inode->i_mapping->a_ops = &befs_symlink_aops; } else { inode->i_link = befs_ino->i_data.symlink; inode->i_op = &simple_symlink_inode_operations; @@ -434,7 +434,7 @@ befs_init_inodecache(void) befs_inode_cachep = kmem_cache_create("befs_inode_cache", sizeof (struct befs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (befs_inode_cachep == NULL) { pr_err("%s: Couldn't initialize inode slabcache\n", __func__); @@ -463,31 +463,33 @@ befs_destroy_inodecache(void) * The data stream become link name. Unless the LONG_SYMLINK * flag is set. */ -static const char * -befs_follow_link(struct dentry *dentry, void **cookie) +static int befs_symlink_readpage(struct file *unused, struct page *page) { - struct super_block *sb = dentry->d_sb; - struct befs_inode_info *befs_ino = BEFS_I(d_inode(dentry)); + struct inode *inode = page->mapping->host; + struct super_block *sb = inode->i_sb; + struct befs_inode_info *befs_ino = BEFS_I(inode); befs_data_stream *data = &befs_ino->i_data.ds; befs_off_t len = data->size; - char *link; + char *link = page_address(page); - if (len == 0) { + if (len == 0 || len > PAGE_SIZE) { befs_error(sb, "Long symlink with illegal length"); - return ERR_PTR(-EIO); + goto fail; } befs_debug(sb, "Follow long symlink"); - link = kmalloc(len, GFP_NOFS); - if (!link) - return ERR_PTR(-ENOMEM); if (befs_read_lsymlink(sb, data, link, len) != len) { - kfree(link); befs_error(sb, "Failed to read entire long symlink"); - return ERR_PTR(-EIO); + goto fail; } link[len - 1] = '\0'; - return *cookie = link; + SetPageUptodate(page); + unlock_page(page); + return 0; +fail: + SetPageError(page); + unlock_page(page); + return -EIO; } /* diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index fdcb4d69f..1e5c896f6 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -270,7 +270,7 @@ static int __init init_inodecache(void) bfs_inode_cachep = kmem_cache_create("bfs_inode_cache", sizeof(struct bfs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (bfs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 3a93755e8..7d914c67a 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -491,6 +491,7 @@ static inline int arch_elf_pt_proc(struct elfhdr *ehdr, * arch_check_elf() - check an ELF executable * @ehdr: The main ELF header * @has_interp: True if the ELF has an interpreter, else false. + * @interp_ehdr: The interpreter's ELF header * @state: Architecture-specific state preserved throughout the process * of loading the ELF. * @@ -502,6 +503,7 @@ static inline int arch_elf_pt_proc(struct elfhdr *ehdr, * with that return code. */ static inline int arch_check_elf(struct elfhdr *ehdr, bool has_interp, + struct elfhdr *interp_ehdr, struct arch_elf_state *state) { /* Dummy implementation, always proceed */ @@ -651,7 +653,7 @@ static unsigned long randomize_stack_top(unsigned long stack_top) if ((current->flags & PF_RANDOMIZE) && !(current->personality & ADDR_NO_RANDOMIZE)) { - random_variable = (unsigned long) get_random_int(); + random_variable = get_random_long(); random_variable &= STACK_RND_MASK; random_variable <<= PAGE_SHIFT; } @@ -829,7 +831,9 @@ static int load_elf_binary(struct linux_binprm *bprm) * still possible to return an error to the code that invoked * the exec syscall. */ - retval = arch_check_elf(&loc->elf_ex, !!interpreter, &arch_state); + retval = arch_check_elf(&loc->elf_ex, + !!interpreter, &loc->interp_elf_ex, + &arch_state); if (retval) goto out_free_dentry; diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 78f005f37..3a3ced779 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -638,11 +638,11 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer, case 3: /* Delete this handler. */ root = dget(file->f_path.dentry->d_sb->s_root); - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); kill_node(e); - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); dput(root); break; default: @@ -675,7 +675,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, return PTR_ERR(e); root = dget(sb->s_root); - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); dentry = lookup_one_len(e->name, root, strlen(e->name)); err = PTR_ERR(dentry); if (IS_ERR(dentry)) @@ -711,7 +711,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, out2: dput(dentry); out: - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); dput(root); if (err) { @@ -754,12 +754,12 @@ static ssize_t bm_status_write(struct file *file, const char __user *buffer, case 3: /* Delete all handlers. */ root = dget(file->f_path.dentry->d_sb->s_root); - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); while (!list_empty(&entries)) kill_node(list_entry(entries.next, Node, list)); - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); dput(root); break; default: diff --git a/fs/block_dev.c b/fs/block_dev.c index 44d4a1e92..826b164a4 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -75,7 +75,7 @@ void kill_bdev(struct block_device *bdev) { struct address_space *mapping = bdev->bd_inode->i_mapping; - if (mapping->nrpages == 0 && mapping->nrshadows == 0) + if (mapping->nrpages == 0 && mapping->nrexceptional == 0) return; invalidate_bh_lrus(); @@ -156,11 +156,16 @@ blkdev_get_block(struct inode *inode, sector_t iblock, return 0; } +static struct inode *bdev_file_inode(struct file *file) +{ + return file->f_mapping->host; +} + static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) { struct file *file = iocb->ki_filp; - struct inode *inode = file->f_mapping->host; + struct inode *inode = bdev_file_inode(file); if (IS_DAX(inode)) return dax_do_io(iocb, inode, iter, offset, blkdev_get_block, @@ -338,18 +343,18 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping, */ static loff_t block_llseek(struct file *file, loff_t offset, int whence) { - struct inode *bd_inode = file->f_mapping->host; + struct inode *bd_inode = bdev_file_inode(file); loff_t retval; - mutex_lock(&bd_inode->i_mutex); + inode_lock(bd_inode); retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode)); - mutex_unlock(&bd_inode->i_mutex); + inode_unlock(bd_inode); return retval; } int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { - struct inode *bd_inode = filp->f_mapping->host; + struct inode *bd_inode = bdev_file_inode(filp); struct block_device *bdev = I_BDEV(bd_inode); int error; @@ -395,7 +400,7 @@ int bdev_read_page(struct block_device *bdev, sector_t sector, if (!ops->rw_page || bdev_get_integrity(bdev)) return result; - result = blk_queue_enter(bdev->bd_queue, GFP_KERNEL); + result = blk_queue_enter(bdev->bd_queue, false); if (result) return result; result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ); @@ -432,7 +437,7 @@ int bdev_write_page(struct block_device *bdev, sector_t sector, if (!ops->rw_page || bdev_get_integrity(bdev)) return -EOPNOTSUPP; - result = blk_queue_enter(bdev->bd_queue, GFP_KERNEL); + result = blk_queue_enter(bdev->bd_queue, false); if (result) return result; @@ -450,10 +455,7 @@ EXPORT_SYMBOL_GPL(bdev_write_page); /** * bdev_direct_access() - Get the address for directly-accessibly memory * @bdev: The device containing the memory - * @sector: The offset within the device - * @addr: Where to put the address of the memory - * @pfn: The Page Frame Number for the memory - * @size: The number of bytes requested + * @dax: control and output parameters for ->direct_access * * If a block device is made up of directly addressable memory, this function * will tell the caller the PFN and the address of the memory. The address @@ -464,10 +466,10 @@ EXPORT_SYMBOL_GPL(bdev_write_page); * Return: negative errno if an error occurs, otherwise the number of bytes * accessible at this address. */ -long bdev_direct_access(struct block_device *bdev, sector_t sector, - void __pmem **addr, unsigned long *pfn, long size) +long bdev_direct_access(struct block_device *bdev, struct blk_dax_ctl *dax) { - long avail; + sector_t sector = dax->sector; + long avail, size = dax->size; const struct block_device_operations *ops = bdev->bd_disk->fops; /* @@ -486,9 +488,11 @@ long bdev_direct_access(struct block_device *bdev, sector_t sector, sector += get_start_sect(bdev); if (sector % (PAGE_SIZE / 512)) return -EINVAL; - avail = ops->direct_access(bdev, sector, addr, pfn); + avail = ops->direct_access(bdev, sector, &dax->addr, &dax->pfn); if (!avail) return -ERANGE; + if (avail > 0 && avail & ~PAGE_MASK) + return -ENXIO; return min(avail, size); } EXPORT_SYMBOL_GPL(bdev_direct_access); @@ -590,7 +594,7 @@ void __init bdev_cache_init(void) bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_PANIC), + SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC), init_once); err = register_filesystem(&bd_type); if (err) @@ -696,7 +700,7 @@ static struct block_device *bd_acquire(struct inode *inode) spin_lock(&bdev_lock); bdev = inode->i_bdev; if (bdev) { - ihold(bdev->bd_inode); + bdgrab(bdev); spin_unlock(&bdev_lock); return bdev; } @@ -712,7 +716,7 @@ static struct block_device *bd_acquire(struct inode *inode) * So, we can access it via ->i_mapping always * without igrab(). */ - ihold(bdev->bd_inode); + bdgrab(bdev); inode->i_bdev = bdev; inode->i_mapping = bdev->bd_inode->i_mapping; list_add(&inode->i_devices, &bdev->bd_inodes); @@ -735,7 +739,7 @@ void bd_forget(struct inode *inode) spin_unlock(&bdev_lock); if (bdev) - iput(bdev->bd_inode); + bdput(bdev); } /** @@ -1042,12 +1046,9 @@ EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); static void flush_disk(struct block_device *bdev, bool kill_dirty) { if (__invalidate_device(bdev, kill_dirty)) { - char name[BDEVNAME_SIZE] = ""; - - if (bdev->bd_disk) - disk_name(bdev->bd_disk, 0, name); printk(KERN_WARNING "VFS: busy inodes on changed media or " - "resized disk %s\n", name); + "resized disk %s\n", + bdev->bd_disk ? bdev->bd_disk->disk_name : ""); } if (!bdev->bd_disk) @@ -1071,12 +1072,9 @@ void check_disk_size_change(struct gendisk *disk, struct block_device *bdev) disk_size = (loff_t)get_capacity(disk) << 9; bdev_size = i_size_read(bdev->bd_inode); if (disk_size != bdev_size) { - char name[BDEVNAME_SIZE]; - - disk_name(disk, 0, name); printk(KERN_INFO "%s: detected capacity change from %lld to %lld\n", - name, bdev_size, disk_size); + disk->disk_name, bdev_size, disk_size); i_size_write(bdev->bd_inode, disk_size); flush_disk(bdev, false); } @@ -1144,9 +1142,9 @@ void bd_set_size(struct block_device *bdev, loff_t size) { unsigned bsize = bdev_logical_block_size(bdev); - mutex_lock(&bdev->bd_inode->i_mutex); + inode_lock(bdev->bd_inode); i_size_write(bdev->bd_inode, size); - mutex_unlock(&bdev->bd_inode->i_mutex); + inode_unlock(bdev->bd_inode); while (bsize < PAGE_CACHE_SIZE) { if (size & bsize) break; @@ -1203,7 +1201,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = disk; bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; - bdev->bd_inode->i_flags = disk->fops->direct_access ? S_DAX : 0; + if (IS_ENABLED(CONFIG_BLK_DEV_DAX) && disk->fops->direct_access) + bdev->bd_inode->i_flags = S_DAX; + else + bdev->bd_inode->i_flags = 0; + if (!partno) { ret = -ENXIO; bdev->bd_part = disk_get_part(disk, partno); @@ -1230,8 +1232,11 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) } } - if (!ret) + if (!ret) { bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); + if (!blkdev_dax_capable(bdev)) + bdev->bd_inode->i_flags &= ~S_DAX; + } /* * If the device is invalidated, rescan partition @@ -1245,6 +1250,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) else if (ret == -ENOMEDIUM) invalidate_partitions(disk, bdev); } + if (ret) goto out_clear; } else { @@ -1265,12 +1271,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) goto out_clear; } bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); - /* - * If the partition is not aligned on a page - * boundary, we can't do dax I/O to it. - */ - if ((bdev->bd_part->start_sect % (PAGE_SIZE / 512)) || - (bdev->bd_part->nr_sects % (PAGE_SIZE / 512))) + if (!blkdev_dax_capable(bdev)) bdev->bd_inode->i_flags &= ~S_DAX; } } else { @@ -1605,14 +1606,14 @@ EXPORT_SYMBOL(blkdev_put); static int blkdev_close(struct inode * inode, struct file * filp) { - struct block_device *bdev = I_BDEV(filp->f_mapping->host); + struct block_device *bdev = I_BDEV(bdev_file_inode(filp)); blkdev_put(bdev, filp->f_mode); return 0; } static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) { - struct block_device *bdev = I_BDEV(file->f_mapping->host); + struct block_device *bdev = I_BDEV(bdev_file_inode(file)); fmode_t mode = file->f_mode; /* @@ -1637,7 +1638,7 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) { struct file *file = iocb->ki_filp; - struct inode *bd_inode = file->f_mapping->host; + struct inode *bd_inode = bdev_file_inode(file); loff_t size = i_size_read(bd_inode); struct blk_plug plug; ssize_t ret; @@ -1669,7 +1670,7 @@ EXPORT_SYMBOL_GPL(blkdev_write_iter); ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) { struct file *file = iocb->ki_filp; - struct inode *bd_inode = file->f_mapping->host; + struct inode *bd_inode = bdev_file_inode(file); loff_t size = i_size_read(bd_inode); loff_t pos = iocb->ki_pos; @@ -1696,25 +1697,102 @@ static int blkdev_releasepage(struct page *page, gfp_t wait) return try_to_free_buffers(page); } +static int blkdev_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + if (dax_mapping(mapping)) { + struct block_device *bdev = I_BDEV(mapping->host); + + return dax_writeback_mapping_range(mapping, bdev, wbc); + } + return generic_writepages(mapping, wbc); +} + static const struct address_space_operations def_blk_aops = { .readpage = blkdev_readpage, .readpages = blkdev_readpages, .writepage = blkdev_writepage, .write_begin = blkdev_write_begin, .write_end = blkdev_write_end, - .writepages = generic_writepages, + .writepages = blkdev_writepages, .releasepage = blkdev_releasepage, .direct_IO = blkdev_direct_IO, .is_dirty_writeback = buffer_check_dirty_writeback, }; +#ifdef CONFIG_FS_DAX +/* + * In the raw block case we do not need to contend with truncation nor + * unwritten file extents. Without those concerns there is no need for + * additional locking beyond the mmap_sem context that these routines + * are already executing under. + * + * Note, there is no protection if the block device is dynamically + * resized (partition grow/shrink) during a fault. A stable block device + * size is already not enforced in the blkdev_direct_IO path. + * + * For DAX, it is the responsibility of the block device driver to + * ensure the whole-disk device size is stable while requests are in + * flight. + * + * Finally, unlike the filemap_page_mkwrite() case there is no + * filesystem superblock to sync against freezing. We still include a + * pfn_mkwrite callback for dax drivers to receive write fault + * notifications. + */ +static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return __dax_fault(vma, vmf, blkdev_get_block, NULL); +} + +static int blkdev_dax_pfn_mkwrite(struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + return dax_pfn_mkwrite(vma, vmf); +} + +static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmd, unsigned int flags) +{ + return __dax_pmd_fault(vma, addr, pmd, flags, blkdev_get_block, NULL); +} + +static const struct vm_operations_struct blkdev_dax_vm_ops = { + .fault = blkdev_dax_fault, + .pmd_fault = blkdev_dax_pmd_fault, + .pfn_mkwrite = blkdev_dax_pfn_mkwrite, +}; + +static const struct vm_operations_struct blkdev_default_vm_ops = { + .fault = filemap_fault, + .map_pages = filemap_map_pages, +}; + +static int blkdev_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct inode *bd_inode = bdev_file_inode(file); + + file_accessed(file); + if (IS_DAX(bd_inode)) { + vma->vm_ops = &blkdev_dax_vm_ops; + vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; + } else { + vma->vm_ops = &blkdev_default_vm_ops; + } + + return 0; +} +#else +#define blkdev_mmap generic_file_mmap +#endif + const struct file_operations def_blk_fops = { .open = blkdev_open, .release = blkdev_close, .llseek = block_llseek, .read_iter = blkdev_read_iter, .write_iter = blkdev_write_iter, - .mmap = generic_file_mmap, + .mmap = blkdev_mmap, .fsync = blkdev_fsync, .unlocked_ioctl = block_ioctl, #ifdef CONFIG_COMPAT diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index 6d1d0b93b..128ce17a8 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -9,11 +9,12 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ export.o tree-log.o free-space-cache.o zlib.o lzo.o \ compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ - uuid-tree.o props.o hash.o + uuid-tree.o props.o hash.o free-space-tree.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o \ tests/extent-buffer-tests.o tests/btrfs-tests.o \ - tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o + tests/extent-io-tests.o tests/inode-tests.o tests/qgroup-tests.o \ + tests/free-space-tree-tests.o diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c index 9a0124a95..6d263bb16 100644 --- a/fs/btrfs/acl.c +++ b/fs/btrfs/acl.c @@ -37,10 +37,10 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type) switch (type) { case ACL_TYPE_ACCESS: - name = POSIX_ACL_XATTR_ACCESS; + name = XATTR_NAME_POSIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: - name = POSIX_ACL_XATTR_DEFAULT; + name = XATTR_NAME_POSIX_ACL_DEFAULT; break; default: BUG(); @@ -48,7 +48,7 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type) size = __btrfs_getxattr(inode, name, "", 0); if (size > 0) { - value = kzalloc(size, GFP_NOFS); + value = kzalloc(size, GFP_KERNEL); if (!value) return ERR_PTR(-ENOMEM); size = __btrfs_getxattr(inode, name, value, size); @@ -81,7 +81,7 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans, switch (type) { case ACL_TYPE_ACCESS: - name = POSIX_ACL_XATTR_ACCESS; + name = XATTR_NAME_POSIX_ACL_ACCESS; if (acl) { ret = posix_acl_equiv_mode(acl, &inode->i_mode); if (ret < 0) @@ -94,7 +94,7 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans, case ACL_TYPE_DEFAULT: if (!S_ISDIR(inode->i_mode)) return acl ? -EINVAL : 0; - name = POSIX_ACL_XATTR_DEFAULT; + name = XATTR_NAME_POSIX_ACL_DEFAULT; break; default: return -EINVAL; @@ -102,7 +102,7 @@ static int __btrfs_set_acl(struct btrfs_trans_handle *trans, if (acl) { size = posix_acl_xattr_size(acl->a_count); - value = kmalloc(size, GFP_NOFS); + value = kmalloc(size, GFP_KERNEL); if (!value) { ret = -ENOMEM; goto out; diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index 9aba42b78..5fb60ea7e 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -97,7 +97,7 @@ static struct __btrfs_workqueue * __btrfs_alloc_workqueue(const char *name, unsigned int flags, int limit_active, int thresh) { - struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS); + struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL); if (!ret) return NULL; @@ -148,7 +148,7 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, int limit_active, int thresh) { - struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS); + struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL); if (!ret) return NULL; diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index e2f659dc5..f6dac40f8 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -520,13 +520,10 @@ static inline int ref_for_same_block(struct __prelim_ref *ref1, static int __add_missing_keys(struct btrfs_fs_info *fs_info, struct list_head *head) { - struct list_head *pos; + struct __prelim_ref *ref; struct extent_buffer *eb; - list_for_each(pos, head) { - struct __prelim_ref *ref; - ref = list_entry(pos, struct __prelim_ref, list); - + list_for_each_entry(ref, head, list) { if (ref->parent) continue; if (ref->key_for_search.type) @@ -563,23 +560,15 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info, */ static void __merge_refs(struct list_head *head, int mode) { - struct list_head *pos1; + struct __prelim_ref *pos1; - list_for_each(pos1, head) { - struct list_head *n2; - struct list_head *pos2; - struct __prelim_ref *ref1; + list_for_each_entry(pos1, head, list) { + struct __prelim_ref *pos2 = pos1, *tmp; - ref1 = list_entry(pos1, struct __prelim_ref, list); - - for (pos2 = pos1->next, n2 = pos2->next; pos2 != head; - pos2 = n2, n2 = pos2->next) { - struct __prelim_ref *ref2; - struct __prelim_ref *xchg; + list_for_each_entry_safe_continue(pos2, tmp, head, list) { + struct __prelim_ref *xchg, *ref1 = pos1, *ref2 = pos2; struct extent_inode_elem *eie; - ref2 = list_entry(pos2, struct __prelim_ref, list); - if (!ref_for_same_block(ref1, ref2)) continue; if (mode == 1) { diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 0ef5cc13f..61205e3bb 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -192,6 +192,10 @@ struct btrfs_inode { /* File creation time. */ struct timespec i_otime; + /* Hook into fs_info->delayed_iputs */ + struct list_head delayed_iput; + long delayed_iput_count; + struct inode vfs_inode; }; diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 0340c57bf..861d47256 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -531,13 +531,9 @@ static struct btrfsic_block *btrfsic_block_hashtable_lookup( (((unsigned int)(dev_bytenr >> 16)) ^ ((unsigned int)((uintptr_t)bdev))) & (BTRFSIC_BLOCK_HASHTABLE_SIZE - 1); - struct list_head *elem; - - list_for_each(elem, h->table + hashval) { - struct btrfsic_block *const b = - list_entry(elem, struct btrfsic_block, - collision_resolving_node); + struct btrfsic_block *b; + list_for_each_entry(b, h->table + hashval, collision_resolving_node) { if (b->dev_state->bdev == bdev && b->dev_bytenr == dev_bytenr) return b; } @@ -588,13 +584,9 @@ static struct btrfsic_block_link *btrfsic_block_link_hashtable_lookup( ((unsigned int)((uintptr_t)bdev_ref_to)) ^ ((unsigned int)((uintptr_t)bdev_ref_from))) & (BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE - 1); - struct list_head *elem; - - list_for_each(elem, h->table + hashval) { - struct btrfsic_block_link *const l = - list_entry(elem, struct btrfsic_block_link, - collision_resolving_node); + struct btrfsic_block_link *l; + list_for_each_entry(l, h->table + hashval, collision_resolving_node) { BUG_ON(NULL == l->block_ref_to); BUG_ON(NULL == l->block_ref_from); if (l->block_ref_to->dev_state->bdev == bdev_ref_to && @@ -639,13 +631,9 @@ static struct btrfsic_dev_state *btrfsic_dev_state_hashtable_lookup( const unsigned int hashval = (((unsigned int)((uintptr_t)bdev)) & (BTRFSIC_DEV2STATE_HASHTABLE_SIZE - 1)); - struct list_head *elem; - - list_for_each(elem, h->table + hashval) { - struct btrfsic_dev_state *const ds = - list_entry(elem, struct btrfsic_dev_state, - collision_resolving_node); + struct btrfsic_dev_state *ds; + list_for_each_entry(ds, h->table + hashval, collision_resolving_node) { if (ds->bdev == bdev) return ds; } @@ -1720,29 +1708,20 @@ static int btrfsic_read_block(struct btrfsic_state *state, static void btrfsic_dump_database(struct btrfsic_state *state) { - struct list_head *elem_all; + const struct btrfsic_block *b_all; BUG_ON(NULL == state); printk(KERN_INFO "all_blocks_list:\n"); - list_for_each(elem_all, &state->all_blocks_list) { - const struct btrfsic_block *const b_all = - list_entry(elem_all, struct btrfsic_block, - all_blocks_node); - struct list_head *elem_ref_to; - struct list_head *elem_ref_from; + list_for_each_entry(b_all, &state->all_blocks_list, all_blocks_node) { + const struct btrfsic_block_link *l; printk(KERN_INFO "%c-block @%llu (%s/%llu/%d)\n", btrfsic_get_block_type(state, b_all), b_all->logical_bytenr, b_all->dev_state->name, b_all->dev_bytenr, b_all->mirror_num); - list_for_each(elem_ref_to, &b_all->ref_to_list) { - const struct btrfsic_block_link *const l = - list_entry(elem_ref_to, - struct btrfsic_block_link, - node_ref_to); - + list_for_each_entry(l, &b_all->ref_to_list, node_ref_to) { printk(KERN_INFO " %c @%llu (%s/%llu/%d)" " refers %u* to" " %c @%llu (%s/%llu/%d)\n", @@ -1757,12 +1736,7 @@ static void btrfsic_dump_database(struct btrfsic_state *state) l->block_ref_to->mirror_num); } - list_for_each(elem_ref_from, &b_all->ref_from_list) { - const struct btrfsic_block_link *const l = - list_entry(elem_ref_from, - struct btrfsic_block_link, - node_ref_from); - + list_for_each_entry(l, &b_all->ref_from_list, node_ref_from) { printk(KERN_INFO " %c @%llu (%s/%llu/%d)" " is ref %u* from" " %c @%llu (%s/%llu/%d)\n", @@ -1845,8 +1819,7 @@ again: &state->block_hashtable); if (NULL != block) { u64 bytenr = 0; - struct list_head *elem_ref_to; - struct list_head *tmp_ref_to; + struct btrfsic_block_link *l, *tmp; if (block->is_superblock) { bytenr = btrfs_super_bytenr((struct btrfs_super_block *) @@ -1967,13 +1940,8 @@ again: * because it still carries valueable information * like whether it was ever written and IO completed. */ - list_for_each_safe(elem_ref_to, tmp_ref_to, - &block->ref_to_list) { - struct btrfsic_block_link *const l = - list_entry(elem_ref_to, - struct btrfsic_block_link, - node_ref_to); - + list_for_each_entry_safe(l, tmp, &block->ref_to_list, + node_ref_to) { if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) btrfsic_print_rem_link(state, l); l->ref_cnt--; @@ -2436,7 +2404,7 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, struct btrfsic_block *const block, int recursion_level) { - struct list_head *elem_ref_to; + const struct btrfsic_block_link *l; int ret = 0; if (recursion_level >= 3 + BTRFS_MAX_LEVEL) { @@ -2464,11 +2432,7 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state, * This algorithm is recursive because the amount of used stack * space is very small and the max recursion depth is limited. */ - list_for_each(elem_ref_to, &block->ref_to_list) { - const struct btrfsic_block_link *const l = - list_entry(elem_ref_to, struct btrfsic_block_link, - node_ref_to); - + list_for_each_entry(l, &block->ref_to_list, node_ref_to) { if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) printk(KERN_INFO "rl=%d, %c @%llu (%s/%llu/%d)" @@ -2561,7 +2525,7 @@ static int btrfsic_is_block_ref_by_superblock( const struct btrfsic_block *block, int recursion_level) { - struct list_head *elem_ref_from; + const struct btrfsic_block_link *l; if (recursion_level >= 3 + BTRFS_MAX_LEVEL) { /* refer to comment at "abort cyclic linkage (case 1)" */ @@ -2576,11 +2540,7 @@ static int btrfsic_is_block_ref_by_superblock( * This algorithm is recursive because the amount of used stack space * is very small and the max recursion depth is limited. */ - list_for_each(elem_ref_from, &block->ref_from_list) { - const struct btrfsic_block_link *const l = - list_entry(elem_ref_from, struct btrfsic_block_link, - node_ref_from); - + list_for_each_entry(l, &block->ref_from_list, node_ref_from) { if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) printk(KERN_INFO "rl=%d, %c @%llu (%s/%llu/%d)" @@ -2669,7 +2629,7 @@ static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, const struct btrfsic_block *block, int indent_level) { - struct list_head *elem_ref_to; + const struct btrfsic_block_link *l; int indent_add; static char buf[80]; int cursor_position; @@ -2704,11 +2664,7 @@ static void btrfsic_dump_tree_sub(const struct btrfsic_state *state, } cursor_position = indent_level; - list_for_each(elem_ref_to, &block->ref_to_list) { - const struct btrfsic_block_link *const l = - list_entry(elem_ref_to, struct btrfsic_block_link, - node_ref_to); - + list_for_each_entry(l, &block->ref_to_list, node_ref_to) { while (cursor_position < indent_level) { printk(" "); cursor_position++; @@ -3165,8 +3121,7 @@ int btrfsic_mount(struct btrfs_root *root, void btrfsic_unmount(struct btrfs_root *root, struct btrfs_fs_devices *fs_devices) { - struct list_head *elem_all; - struct list_head *tmp_all; + struct btrfsic_block *b_all, *tmp_all; struct btrfsic_state *state; struct list_head *dev_head = &fs_devices->devices; struct btrfs_device *device; @@ -3206,20 +3161,12 @@ void btrfsic_unmount(struct btrfs_root *root, * just free all memory that was allocated dynamically. * Free the blocks and the block_links. */ - list_for_each_safe(elem_all, tmp_all, &state->all_blocks_list) { - struct btrfsic_block *const b_all = - list_entry(elem_all, struct btrfsic_block, - all_blocks_node); - struct list_head *elem_ref_to; - struct list_head *tmp_ref_to; - - list_for_each_safe(elem_ref_to, tmp_ref_to, - &b_all->ref_to_list) { - struct btrfsic_block_link *const l = - list_entry(elem_ref_to, - struct btrfsic_block_link, - node_ref_to); + list_for_each_entry_safe(b_all, tmp_all, &state->all_blocks_list, + all_blocks_node) { + struct btrfsic_block_link *l, *tmp; + list_for_each_entry_safe(l, tmp, &b_all->ref_to_list, + node_ref_to) { if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE) btrfsic_print_rem_link(state, l); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index c473c42d7..3346cd8f9 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -637,11 +637,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, faili = nr_pages - 1; cb->nr_pages = nr_pages; - /* In the parent-locked case, we only locked the range we are - * interested in. In all other cases, we can opportunistically - * cache decompressed data that goes beyond the requested range. */ - if (!(bio_flags & EXTENT_BIO_PARENT_LOCKED)) - add_ra_bio_pages(inode, em_start + em_len, cb); + add_ra_bio_pages(inode, em_start + em_len, cb); /* include any pages we added in add_ra-bio_pages */ uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 5b8e235c4..769e0ff1b 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1555,7 +1555,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, return 0; } - search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1); + search_start = buf->start & ~((u64)SZ_1G - 1); if (parent) btrfs_set_lock_blocking(parent); @@ -2248,7 +2248,6 @@ static void reada_for_search(struct btrfs_root *root, u64 target; u64 nread = 0; u64 gen; - int direction = path->reada; struct extent_buffer *eb; u32 nr; u32 blocksize; @@ -2276,16 +2275,16 @@ static void reada_for_search(struct btrfs_root *root, nr = slot; while (1) { - if (direction < 0) { + if (path->reada == READA_BACK) { if (nr == 0) break; nr--; - } else if (direction > 0) { + } else if (path->reada == READA_FORWARD) { nr++; if (nr >= nritems) break; } - if (path->reada < 0 && objectid) { + if (path->reada == READA_BACK && objectid) { btrfs_node_key(node, &disk_key, nr); if (btrfs_disk_key_objectid(&disk_key) != objectid) break; @@ -2493,7 +2492,7 @@ read_block_for_search(struct btrfs_trans_handle *trans, btrfs_set_path_blocking(p); free_extent_buffer(tmp); - if (p->reada) + if (p->reada != READA_NONE) reada_for_search(root, p, level, slot, key->objectid); btrfs_release_path(p); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 385b449fd..bfe4a337f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -35,6 +35,7 @@ #include <linux/btrfs.h> #include <linux/workqueue.h> #include <linux/security.h> +#include <linux/sizes.h> #include "extent_io.h" #include "extent_map.h" #include "async-thread.h" @@ -96,6 +97,9 @@ struct btrfs_ordered_sum; /* for storing items that use the BTRFS_UUID_KEY* types */ #define BTRFS_UUID_TREE_OBJECTID 9ULL +/* tracks free space in block groups. */ +#define BTRFS_FREE_SPACE_TREE_OBJECTID 10ULL + /* for storing balance parameters in the root tree */ #define BTRFS_BALANCE_OBJECTID -4ULL @@ -174,7 +178,7 @@ struct btrfs_ordered_sum; /* csum types */ #define BTRFS_CSUM_TYPE_CRC32 0 -static int btrfs_csum_sizes[] = { 4 }; +static const int btrfs_csum_sizes[] = { 4 }; /* four bytes for CRC32 */ #define BTRFS_EMPTY_DIR_SIZE 0 @@ -196,9 +200,9 @@ static int btrfs_csum_sizes[] = { 4 }; /* ioprio of readahead is set to idle */ #define BTRFS_IOPRIO_READA (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) -#define BTRFS_DIRTY_METADATA_THRESH (32 * 1024 * 1024) +#define BTRFS_DIRTY_METADATA_THRESH SZ_32M -#define BTRFS_MAX_EXTENT_SIZE (128 * 1024 * 1024) +#define BTRFS_MAX_EXTENT_SIZE SZ_128M /* * The key defines the order in the tree, and so it also defines (optimal) @@ -500,6 +504,8 @@ struct btrfs_super_block { * Compat flags that we support. If any incompat flags are set other than the * ones specified below then we will fail to mount */ +#define BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE (1ULL << 0) + #define BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF (1ULL << 0) #define BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL (1ULL << 1) #define BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS (1ULL << 2) @@ -526,7 +532,10 @@ struct btrfs_super_block { #define BTRFS_FEATURE_COMPAT_SUPP 0ULL #define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL #define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL -#define BTRFS_FEATURE_COMPAT_RO_SUPP 0ULL + +#define BTRFS_FEATURE_COMPAT_RO_SUPP \ + (BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE) + #define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL #define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL @@ -590,14 +599,15 @@ struct btrfs_node { * The slots array records the index of the item or block pointer * used while walking the tree. */ +enum { READA_NONE = 0, READA_BACK, READA_FORWARD }; struct btrfs_path { struct extent_buffer *nodes[BTRFS_MAX_LEVEL]; int slots[BTRFS_MAX_LEVEL]; /* if there is real range locking, this locks field will change */ - int locks[BTRFS_MAX_LEVEL]; - int reada; + u8 locks[BTRFS_MAX_LEVEL]; + u8 reada; /* keep some upper locks as we walk down */ - int lowest_level; + u8 lowest_level; /* * set by btrfs_split_item, tells search_slot to keep all locks @@ -1088,6 +1098,13 @@ struct btrfs_block_group_item { __le64 flags; } __attribute__ ((__packed__)); +struct btrfs_free_space_info { + __le32 extent_count; + __le32 flags; +} __attribute__ ((__packed__)); + +#define BTRFS_FREE_SPACE_USING_BITMAPS (1ULL << 0) + #define BTRFS_QGROUP_LEVEL_SHIFT 48 static inline u64 btrfs_qgroup_level(u64 qgroupid) { @@ -1296,6 +1313,9 @@ struct btrfs_caching_control { atomic_t count; }; +/* Once caching_thread() finds this much free space, it will wake up waiters. */ +#define CACHING_CTL_WAKE_UP (1024 * 1024 * 2) + struct btrfs_io_ctl { void *cur, *orig; struct page *page; @@ -1321,8 +1341,20 @@ struct btrfs_block_group_cache { u64 delalloc_bytes; u64 bytes_super; u64 flags; - u64 sectorsize; u64 cache_generation; + u32 sectorsize; + + /* + * If the free space extent count exceeds this number, convert the block + * group to bitmaps. + */ + u32 bitmap_high_thresh; + + /* + * If the free space extent count drops below this number, convert the + * block group back to extents. + */ + u32 bitmap_low_thresh; /* * It is just used for the delayed data space allocation because @@ -1378,6 +1410,15 @@ struct btrfs_block_group_cache { struct list_head io_list; struct btrfs_io_ctl io_ctl; + + /* Lock for free space tree operations. */ + struct mutex free_space_lock; + + /* + * Does the block group need to be added to the free space tree? + * Protected by free_space_lock. + */ + int needs_free_space; }; /* delayed seq elem */ @@ -1429,6 +1470,7 @@ struct btrfs_fs_info { struct btrfs_root *csum_root; struct btrfs_root *quota_root; struct btrfs_root *uuid_root; + struct btrfs_root *free_space_root; /* the log root tree is a directory of all the other log roots */ struct btrfs_root *log_root_tree; @@ -1816,6 +1858,8 @@ struct btrfs_fs_info { * and will be latter freed. Protected by fs_info->chunk_mutex. */ struct list_head pinned_chunks; + + int creating_free_space_tree; }; struct btrfs_subvolume_writers { @@ -2092,6 +2136,27 @@ struct btrfs_ioctl_defrag_range_args { */ #define BTRFS_BLOCK_GROUP_ITEM_KEY 192 +/* + * Every block group is represented in the free space tree by a free space info + * item, which stores some accounting information. It is keyed on + * (block_group_start, FREE_SPACE_INFO, block_group_length). + */ +#define BTRFS_FREE_SPACE_INFO_KEY 198 + +/* + * A free space extent tracks an extent of space that is free in a block group. + * It is keyed on (start, FREE_SPACE_EXTENT, length). + */ +#define BTRFS_FREE_SPACE_EXTENT_KEY 199 + +/* + * When a block group becomes very fragmented, we convert it to use bitmaps + * instead of extents. A free space bitmap is keyed on + * (start, FREE_SPACE_BITMAP, length); the corresponding item is a bitmap with + * (length / sectorsize) bits. + */ +#define BTRFS_FREE_SPACE_BITMAP_KEY 200 + #define BTRFS_DEV_EXTENT_KEY 204 #define BTRFS_DEV_ITEM_KEY 216 #define BTRFS_CHUNK_ITEM_KEY 228 @@ -2184,6 +2249,7 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_RESCAN_UUID_TREE (1 << 23) #define BTRFS_MOUNT_FRAGMENT_DATA (1 << 24) #define BTRFS_MOUNT_FRAGMENT_METADATA (1 << 25) +#define BTRFS_MOUNT_FREE_SPACE_TREE (1 << 26) #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) #define BTRFS_DEFAULT_MAX_INLINE (8192) @@ -2506,6 +2572,11 @@ BTRFS_SETGET_FUNCS(disk_block_group_flags, BTRFS_SETGET_STACK_FUNCS(block_group_flags, struct btrfs_block_group_item, flags, 64); +/* struct btrfs_free_space_info */ +BTRFS_SETGET_FUNCS(free_space_extent_count, struct btrfs_free_space_info, + extent_count, 32); +BTRFS_SETGET_FUNCS(free_space_flags, struct btrfs_free_space_info, flags, 32); + /* struct btrfs_inode_ref */ BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64); @@ -3570,9 +3641,13 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, int __get_raid_index(u64 flags); int btrfs_start_write_no_snapshoting(struct btrfs_root *root); void btrfs_end_write_no_snapshoting(struct btrfs_root *root); +void btrfs_wait_for_snapshot_creation(struct btrfs_root *root); void check_system_chunk(struct btrfs_trans_handle *trans, struct btrfs_root *root, const u64 type); +u64 add_new_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_fs_info *info, u64 start, u64 end); + /* ctree.c */ int btrfs_bin_search(struct extent_buffer *eb, struct btrfs_key *key, int level, int *slot); @@ -3737,6 +3812,7 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info) kfree(fs_info->csum_root); kfree(fs_info->quota_root); kfree(fs_info->uuid_root); + kfree(fs_info->free_space_root); kfree(fs_info->super_copy); kfree(fs_info->super_for_commit); security_free_mnt_opts(&fs_info->security_opts); @@ -3906,7 +3982,6 @@ void btrfs_extent_item_to_extent_map(struct inode *inode, /* inode.c */ struct btrfs_delalloc_work { struct inode *inode; - int wait; int delay_iput; struct completion completion; struct list_head list; @@ -3914,7 +3989,7 @@ struct btrfs_delalloc_work { }; struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, - int wait, int delay_iput); + int delay_iput); void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work); struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page, @@ -4024,7 +4099,8 @@ void btrfs_get_block_group_info(struct list_head *groups_list, struct btrfs_ioctl_space_info *space); void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock, struct btrfs_ioctl_balance_args *bargs); - +ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen, + struct file *dst_file, u64 dst_loff); /* file.c */ int btrfs_auto_defrag_init(void); @@ -4055,6 +4131,11 @@ int btrfs_dirty_pages(struct btrfs_root *root, struct inode *inode, loff_t pos, size_t write_bytes, struct extent_state **cached); int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end); +ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + size_t len, unsigned int flags); +int btrfs_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, u64 len); /* tree-defrag.c */ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, @@ -4247,16 +4328,98 @@ static inline void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, } } +#define btrfs_clear_fs_incompat(__fs_info, opt) \ + __btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt) + +static inline void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, + u64 flag) +{ + struct btrfs_super_block *disk_super; + u64 features; + + disk_super = fs_info->super_copy; + features = btrfs_super_incompat_flags(disk_super); + if (features & flag) { + spin_lock(&fs_info->super_lock); + features = btrfs_super_incompat_flags(disk_super); + if (features & flag) { + features &= ~flag; + btrfs_set_super_incompat_flags(disk_super, features); + btrfs_info(fs_info, "clearing %llu feature flag", + flag); + } + spin_unlock(&fs_info->super_lock); + } +} + #define btrfs_fs_incompat(fs_info, opt) \ __btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt) -static inline int __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag) +static inline bool __btrfs_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag) { struct btrfs_super_block *disk_super; disk_super = fs_info->super_copy; return !!(btrfs_super_incompat_flags(disk_super) & flag); } +#define btrfs_set_fs_compat_ro(__fs_info, opt) \ + __btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) + +static inline void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, + u64 flag) +{ + struct btrfs_super_block *disk_super; + u64 features; + + disk_super = fs_info->super_copy; + features = btrfs_super_compat_ro_flags(disk_super); + if (!(features & flag)) { + spin_lock(&fs_info->super_lock); + features = btrfs_super_compat_ro_flags(disk_super); + if (!(features & flag)) { + features |= flag; + btrfs_set_super_compat_ro_flags(disk_super, features); + btrfs_info(fs_info, "setting %llu ro feature flag", + flag); + } + spin_unlock(&fs_info->super_lock); + } +} + +#define btrfs_clear_fs_compat_ro(__fs_info, opt) \ + __btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) + +static inline void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, + u64 flag) +{ + struct btrfs_super_block *disk_super; + u64 features; + + disk_super = fs_info->super_copy; + features = btrfs_super_compat_ro_flags(disk_super); + if (features & flag) { + spin_lock(&fs_info->super_lock); + features = btrfs_super_compat_ro_flags(disk_super); + if (features & flag) { + features &= ~flag; + btrfs_set_super_compat_ro_flags(disk_super, features); + btrfs_info(fs_info, "clearing %llu ro feature flag", + flag); + } + spin_unlock(&fs_info->super_lock); + } +} + +#define btrfs_fs_compat_ro(fs_info, opt) \ + __btrfs_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt) + +static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag) +{ + struct btrfs_super_block *disk_super; + disk_super = fs_info->super_copy; + return !!(btrfs_super_compat_ro_flags(disk_super) & flag); +} + /* * Call btrfs_abort_transaction as early as possible when an error condition is * detected, that way the exact line number is reported. diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 02b934d0e..b57daa895 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -54,16 +54,11 @@ static inline void btrfs_init_delayed_node( delayed_node->root = root; delayed_node->inode_id = inode_id; atomic_set(&delayed_node->refs, 0); - delayed_node->count = 0; - delayed_node->flags = 0; delayed_node->ins_root = RB_ROOT; delayed_node->del_root = RB_ROOT; mutex_init(&delayed_node->mutex); - delayed_node->index_cnt = 0; INIT_LIST_HEAD(&delayed_node->n_list); INIT_LIST_HEAD(&delayed_node->p_list); - delayed_node->bytes_reserved = 0; - memset(&delayed_node->inode_item, 0, sizeof(delayed_node->inode_item)); } static inline int btrfs_is_continuous_delayed_item( @@ -132,7 +127,7 @@ again: if (node) return node; - node = kmem_cache_alloc(delayed_node_cache, GFP_NOFS); + node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS); if (!node) return ERR_PTR(-ENOMEM); btrfs_init_delayed_node(node, root, ino); diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index e06dd75ad..914ac13bd 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -493,12 +493,12 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs, memcpy(&existing_ref->extent_op->key, &ref->extent_op->key, sizeof(ref->extent_op->key)); - existing_ref->extent_op->update_key = 1; + existing_ref->extent_op->update_key = true; } if (ref->extent_op->update_flags) { existing_ref->extent_op->flags_to_set |= ref->extent_op->flags_to_set; - existing_ref->extent_op->update_flags = 1; + existing_ref->extent_op->update_flags = true; } btrfs_free_delayed_extent_op(ref->extent_op); } diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index 00ed02cbf..c24b653c7 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -75,11 +75,11 @@ struct btrfs_delayed_ref_node { struct btrfs_delayed_extent_op { struct btrfs_disk_key key; + u8 level; + bool update_key; + bool update_flags; + bool is_data; u64 flags_to_set; - int level; - unsigned int update_key:1; - unsigned int update_flags:1; - unsigned int is_data:1; }; /* diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 1e668fb7d..cbb7dbfb3 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -614,7 +614,7 @@ static void btrfs_dev_replace_update_device_in_mapping_tree( em = lookup_extent_mapping(em_tree, start, (u64)-1); if (!em) break; - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; for (i = 0; i < map->num_stripes; i++) if (srcdev == map->stripes[i].dev) map->stripes[i].dev = tgtdev; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 41fb43183..4545e2e2a 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -42,6 +42,7 @@ #include "locking.h" #include "tree-log.h" #include "free-space-cache.h" +#include "free-space-tree.h" #include "inode-map.h" #include "check-integrity.h" #include "rcu-string.h" @@ -54,6 +55,12 @@ #include <asm/cpufeature.h> #endif +#define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ + BTRFS_HEADER_FLAG_RELOC |\ + BTRFS_SUPER_FLAG_ERROR |\ + BTRFS_SUPER_FLAG_SEEDING |\ + BTRFS_SUPER_FLAG_METADUMP) + static const struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); static void free_fs_root(struct btrfs_root *root); @@ -175,6 +182,7 @@ static struct btrfs_lockdep_keyset { { .id = BTRFS_TREE_RELOC_OBJECTID, .name_stem = "treloc" }, { .id = BTRFS_DATA_RELOC_TREE_OBJECTID, .name_stem = "dreloc" }, { .id = BTRFS_UUID_TREE_OBJECTID, .name_stem = "uuid" }, + { .id = BTRFS_FREE_SPACE_TREE_OBJECTID, .name_stem = "free-space" }, { .id = 0, .name_stem = "tree" }, }; @@ -362,7 +370,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, } lock_extent_bits(io_tree, eb->start, eb->start + eb->len - 1, - 0, &cached_state); + &cached_state); if (extent_buffer_uptodate(eb) && btrfs_header_generation(eb) == parent_transid) { ret = 0; @@ -923,7 +931,7 @@ static int check_async_write(struct inode *inode, unsigned long bio_flags) if (bio_flags & EXTENT_BIO_TREE_LOG) return 0; #ifdef CONFIG_X86 - if (cpu_has_xmm4_2) + if (static_cpu_has_safe(X86_FEATURE_XMM4_2)) return 0; #endif return 1; @@ -1665,6 +1673,9 @@ struct btrfs_root *btrfs_get_fs_root(struct btrfs_fs_info *fs_info, if (location->objectid == BTRFS_UUID_TREE_OBJECTID) return fs_info->uuid_root ? fs_info->uuid_root : ERR_PTR(-ENOENT); + if (location->objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) + return fs_info->free_space_root ? fs_info->free_space_root : + ERR_PTR(-ENOENT); again: root = btrfs_lookup_fs_root(fs_info, location->objectid); if (root) { @@ -2165,6 +2176,7 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root) free_root_extent_buffers(info->uuid_root); if (chunk_root) free_root_extent_buffers(info->chunk_root); + free_root_extent_buffers(info->free_space_root); } void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) @@ -2465,6 +2477,15 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info, fs_info->uuid_root = root; } + if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { + location.objectid = BTRFS_FREE_SPACE_TREE_OBJECTID; + root = btrfs_read_tree_root(tree_root, &location); + if (IS_ERR(root)) + return PTR_ERR(root); + set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); + fs_info->free_space_root = root; + } + return 0; } @@ -2745,26 +2766,6 @@ int open_ctree(struct super_block *sb, goto fail_alloc; } - /* - * Leafsize and nodesize were always equal, this is only a sanity check. - */ - if (le32_to_cpu(disk_super->__unused_leafsize) != - btrfs_super_nodesize(disk_super)) { - printk(KERN_ERR "BTRFS: couldn't mount because metadata " - "blocksizes don't match. node %d leaf %d\n", - btrfs_super_nodesize(disk_super), - le32_to_cpu(disk_super->__unused_leafsize)); - err = -EINVAL; - goto fail_alloc; - } - if (btrfs_super_nodesize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) { - printk(KERN_ERR "BTRFS: couldn't mount because metadata " - "blocksize (%d) was too large\n", - btrfs_super_nodesize(disk_super)); - err = -EINVAL; - goto fail_alloc; - } - features = btrfs_super_incompat_flags(disk_super); features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; if (tree_root->fs_info->compress_type == BTRFS_COMPRESS_LZO) @@ -2827,7 +2828,7 @@ int open_ctree(struct super_block *sb, fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, - 4 * 1024 * 1024 / PAGE_CACHE_SIZE); + SZ_4M / PAGE_CACHE_SIZE); tree_root->nodesize = nodesize; tree_root->sectorsize = sectorsize; @@ -2836,17 +2837,6 @@ int open_ctree(struct super_block *sb, sb->s_blocksize = sectorsize; sb->s_blocksize_bits = blksize_bits(sectorsize); - if (btrfs_super_magic(disk_super) != BTRFS_MAGIC) { - printk(KERN_ERR "BTRFS: valid FS not found on %s\n", sb->s_id); - goto fail_sb_buffer; - } - - if (sectorsize != PAGE_SIZE) { - printk(KERN_ERR "BTRFS: incompatible sector size (%lu) " - "found on %s\n", (unsigned long)sectorsize, sb->s_id); - goto fail_sb_buffer; - } - mutex_lock(&fs_info->chunk_mutex); ret = btrfs_read_sys_array(tree_root); mutex_unlock(&fs_info->chunk_mutex); @@ -3081,6 +3071,18 @@ retry_root_backup: if (sb->s_flags & MS_RDONLY) return 0; + if (btrfs_test_opt(tree_root, FREE_SPACE_TREE) && + !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { + pr_info("BTRFS: creating free space tree\n"); + ret = btrfs_create_free_space_tree(fs_info); + if (ret) { + pr_warn("BTRFS: failed to create free space tree %d\n", + ret); + close_ctree(tree_root); + return ret; + } + } + down_read(&fs_info->cleanup_work_sem); if ((ret = btrfs_orphan_cleanup(fs_info->fs_root)) || (ret = btrfs_orphan_cleanup(fs_info->tree_root))) { @@ -3106,6 +3108,18 @@ retry_root_backup: btrfs_qgroup_rescan_resume(fs_info); + if (btrfs_test_opt(tree_root, CLEAR_CACHE) && + btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { + pr_info("BTRFS: clearing free space tree\n"); + ret = btrfs_clear_free_space_tree(fs_info); + if (ret) { + pr_warn("BTRFS: failed to clear free space tree %d\n", + ret); + close_ctree(tree_root); + return ret; + } + } + if (!fs_info->uuid_root) { pr_info("BTRFS: creating UUID tree\n"); ret = btrfs_create_uuid_tree(fs_info); @@ -3932,11 +3946,6 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, return !ret; } -int btrfs_set_buffer_uptodate(struct extent_buffer *buf) -{ - return set_extent_buffer_uptodate(buf); -} - void btrfs_mark_buffer_dirty(struct extent_buffer *buf) { struct btrfs_root *root; @@ -3992,7 +4001,6 @@ static void __btrfs_btree_balance_dirty(struct btrfs_root *root, balance_dirty_pages_ratelimited( root->fs_info->btree_inode->i_mapping); } - return; } void btrfs_btree_balance_dirty(struct btrfs_root *root) @@ -4015,8 +4023,17 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, int read_only) { struct btrfs_super_block *sb = fs_info->super_copy; + u64 nodesize = btrfs_super_nodesize(sb); + u64 sectorsize = btrfs_super_sectorsize(sb); int ret = 0; + if (btrfs_super_magic(sb) != BTRFS_MAGIC) { + printk(KERN_ERR "BTRFS: no valid FS found\n"); + ret = -EINVAL; + } + if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) + printk(KERN_WARNING "BTRFS: unrecognized super flag: %llu\n", + btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP); if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) { printk(KERN_ERR "BTRFS: tree_root level too big: %d >= %d\n", btrfs_super_root_level(sb), BTRFS_MAX_LEVEL); @@ -4034,31 +4051,46 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, } /* - * The common minimum, we don't know if we can trust the nodesize/sectorsize - * items yet, they'll be verified later. Issue just a warning. + * Check sectorsize and nodesize first, other check will need it. + * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here. */ - if (!IS_ALIGNED(btrfs_super_root(sb), 4096)) + if (!is_power_of_2(sectorsize) || sectorsize < 4096 || + sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) { + printk(KERN_ERR "BTRFS: invalid sectorsize %llu\n", sectorsize); + ret = -EINVAL; + } + /* Only PAGE SIZE is supported yet */ + if (sectorsize != PAGE_CACHE_SIZE) { + printk(KERN_ERR "BTRFS: sectorsize %llu not supported yet, only support %lu\n", + sectorsize, PAGE_CACHE_SIZE); + ret = -EINVAL; + } + if (!is_power_of_2(nodesize) || nodesize < sectorsize || + nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) { + printk(KERN_ERR "BTRFS: invalid nodesize %llu\n", nodesize); + ret = -EINVAL; + } + if (nodesize != le32_to_cpu(sb->__unused_leafsize)) { + printk(KERN_ERR "BTRFS: invalid leafsize %u, should be %llu\n", + le32_to_cpu(sb->__unused_leafsize), + nodesize); + ret = -EINVAL; + } + + /* Root alignment check */ + if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) { printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", btrfs_super_root(sb)); - if (!IS_ALIGNED(btrfs_super_chunk_root(sb), 4096)) + ret = -EINVAL; + } + if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) { printk(KERN_WARNING "BTRFS: chunk_root block unaligned: %llu\n", btrfs_super_chunk_root(sb)); - if (!IS_ALIGNED(btrfs_super_log_root(sb), 4096)) - printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n", - btrfs_super_log_root(sb)); - - /* - * Check the lower bound, the alignment and other constraints are - * checked later. - */ - if (btrfs_super_nodesize(sb) < 4096) { - printk(KERN_ERR "BTRFS: nodesize too small: %u < 4096\n", - btrfs_super_nodesize(sb)); ret = -EINVAL; } - if (btrfs_super_sectorsize(sb) < 4096) { - printk(KERN_ERR "BTRFS: sectorsize too small: %u < 4096\n", - btrfs_super_sectorsize(sb)); + if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) { + printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n", + btrfs_super_log_root(sb)); ret = -EINVAL; } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index adeb31830..8e79d0070 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -19,7 +19,7 @@ #ifndef __DISKIO__ #define __DISKIO__ -#define BTRFS_SUPER_INFO_OFFSET (64 * 1024) +#define BTRFS_SUPER_INFO_OFFSET SZ_64K #define BTRFS_SUPER_INFO_SIZE 4096 #define BTRFS_SUPER_MIRROR_MAX 3 @@ -35,7 +35,7 @@ enum btrfs_wq_endio_type { static inline u64 btrfs_sb_offset(int mirror) { - u64 start = 16 * 1024; + u64 start = SZ_16K; if (mirror) return start << (BTRFS_SUPER_MIRROR_SHIFT * mirror); return BTRFS_SUPER_INFO_OFFSET; @@ -116,7 +116,6 @@ static inline void btrfs_put_fs_root(struct btrfs_root *root) void btrfs_mark_buffer_dirty(struct extent_buffer *buf); int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int atomic); -int btrfs_set_buffer_uptodate(struct extent_buffer *buf); int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid); u32 btrfs_csum_data(char *data, u32 seed, size_t len); void btrfs_csum_final(u32 crc, char *result); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 2368cac11..e2287c7c1 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -33,6 +33,7 @@ #include "raid56.h" #include "locking.h" #include "free-space-cache.h" +#include "free-space-tree.h" #include "math.h" #include "sysfs.h" #include "qgroup.h" @@ -357,8 +358,8 @@ static void fragment_free_space(struct btrfs_root *root, * we need to check the pinned_extents for any extents that can't be used yet * since their free space will be released as soon as the transaction commits. */ -static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, - struct btrfs_fs_info *info, u64 start, u64 end) +u64 add_new_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_fs_info *info, u64 start, u64 end) { u64 extent_start, extent_end, size, total_added = 0; int ret; @@ -395,11 +396,10 @@ static u64 add_new_free_space(struct btrfs_block_group_cache *block_group, return total_added; } -static noinline void caching_thread(struct btrfs_work *work) +static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) { struct btrfs_block_group_cache *block_group; struct btrfs_fs_info *fs_info; - struct btrfs_caching_control *caching_ctl; struct btrfs_root *extent_root; struct btrfs_path *path; struct extent_buffer *leaf; @@ -407,17 +407,16 @@ static noinline void caching_thread(struct btrfs_work *work) u64 total_found = 0; u64 last = 0; u32 nritems; - int ret = -ENOMEM; + int ret; bool wakeup = true; - caching_ctl = container_of(work, struct btrfs_caching_control, work); block_group = caching_ctl->block_group; fs_info = block_group->fs_info; extent_root = fs_info->extent_root; path = btrfs_alloc_path(); if (!path) - goto out; + return -ENOMEM; last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); @@ -438,20 +437,16 @@ static noinline void caching_thread(struct btrfs_work *work) */ path->skip_locking = 1; path->search_commit_root = 1; - path->reada = 1; + path->reada = READA_FORWARD; key.objectid = last; key.offset = 0; key.type = BTRFS_EXTENT_ITEM_KEY; -again: - mutex_lock(&caching_ctl->mutex); - /* need to make sure the commit_root doesn't disappear */ - down_read(&fs_info->commit_root_sem); next: ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) - goto err; + goto out; leaf = path->nodes[0]; nritems = btrfs_header_nritems(leaf); @@ -477,12 +472,14 @@ next: up_read(&fs_info->commit_root_sem); mutex_unlock(&caching_ctl->mutex); cond_resched(); - goto again; + mutex_lock(&caching_ctl->mutex); + down_read(&fs_info->commit_root_sem); + goto next; } ret = btrfs_next_leaf(extent_root, path); if (ret < 0) - goto err; + goto out; if (ret) break; leaf = path->nodes[0]; @@ -521,7 +518,7 @@ next: else last = key.objectid + key.offset; - if (total_found > (1024 * 1024 * 2)) { + if (total_found > CACHING_CTL_WAKE_UP) { total_found = 0; if (wakeup) wake_up(&caching_ctl->wait); @@ -534,9 +531,37 @@ next: total_found += add_new_free_space(block_group, fs_info, last, block_group->key.objectid + block_group->key.offset); + caching_ctl->progress = (u64)-1; + +out: + btrfs_free_path(path); + return ret; +} + +static noinline void caching_thread(struct btrfs_work *work) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_fs_info *fs_info; + struct btrfs_caching_control *caching_ctl; + struct btrfs_root *extent_root; + int ret; + + caching_ctl = container_of(work, struct btrfs_caching_control, work); + block_group = caching_ctl->block_group; + fs_info = block_group->fs_info; + extent_root = fs_info->extent_root; + + mutex_lock(&caching_ctl->mutex); + down_read(&fs_info->commit_root_sem); + + if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + ret = load_free_space_tree(caching_ctl); + else + ret = load_extent_tree_free(caching_ctl); + spin_lock(&block_group->lock); block_group->caching_ctl = NULL; - block_group->cached = BTRFS_CACHE_FINISHED; + block_group->cached = ret ? BTRFS_CACHE_ERROR : BTRFS_CACHE_FINISHED; spin_unlock(&block_group->lock); #ifdef CONFIG_BTRFS_DEBUG @@ -555,20 +580,11 @@ next: #endif caching_ctl->progress = (u64)-1; -err: - btrfs_free_path(path); - up_read(&fs_info->commit_root_sem); - - free_excluded_extents(extent_root, block_group); + up_read(&fs_info->commit_root_sem); + free_excluded_extents(fs_info->extent_root, block_group); mutex_unlock(&caching_ctl->mutex); -out: - if (ret) { - spin_lock(&block_group->lock); - block_group->caching_ctl = NULL; - block_group->cached = BTRFS_CACHE_ERROR; - spin_unlock(&block_group->lock); - } + wake_up(&caching_ctl->wait); put_caching_control(caching_ctl); @@ -680,8 +696,8 @@ static int cache_block_group(struct btrfs_block_group_cache *cache, } } else { /* - * We are not going to do the fast caching, set cached to the - * appropriate value and wakeup any waiters. + * We're either using the free space tree or no caching at all. + * Set cached to the appropriate value and wakeup any waiters. */ spin_lock(&cache->lock); if (load_cache_only) { @@ -2115,7 +2131,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->reada = 1; + path->reada = READA_FORWARD; path->leave_spinning = 1; /* this will setup the path even if it fails to insert the back ref */ ret = insert_inline_extent_backref(trans, fs_info->extent_root, path, @@ -2141,7 +2157,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); - path->reada = 1; + path->reada = READA_FORWARD; path->leave_spinning = 1; /* now insert the actual backref */ ret = insert_extent_backref(trans, root->fs_info->extent_root, @@ -2254,7 +2270,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, } again: - path->reada = 1; + path->reada = READA_FORWARD; path->leave_spinning = 1; ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, 0, 1); @@ -2910,6 +2926,9 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, if (trans->aborted) return 0; + if (root->fs_info->creating_free_space_tree) + return 0; + if (root == root->fs_info->extent_root) root = root->fs_info->tree_root; @@ -2988,9 +3007,9 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans, return -ENOMEM; extent_op->flags_to_set = flags; - extent_op->update_flags = 1; - extent_op->update_key = 0; - extent_op->is_data = is_data ? 1 : 0; + extent_op->update_flags = true; + extent_op->update_key = false; + extent_op->is_data = is_data ? true : false; extent_op->level = level; ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr, @@ -3328,7 +3347,7 @@ static int cache_save_setup(struct btrfs_block_group_cache *block_group, * If this block group is smaller than 100 megs don't bother caching the * block group. */ - if (block_group->key.offset < (100 * 1024 * 1024)) { + if (block_group->key.offset < (100 * SZ_1M)) { spin_lock(&block_group->lock); block_group->disk_cache_state = BTRFS_DC_WRITTEN; spin_unlock(&block_group->lock); @@ -3428,7 +3447,7 @@ again: * taking up quite a bit since it's not folded into the other space * cache. */ - num_pages = div_u64(block_group->key.offset, 256 * 1024 * 1024); + num_pages = div_u64(block_group->key.offset, SZ_256M); if (!num_pages) num_pages = 1; @@ -3684,11 +3703,21 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, return -ENOMEM; /* - * We don't need the lock here since we are protected by the transaction - * commit. We want to do the cache_save_setup first and then run the + * Even though we are in the critical section of the transaction commit, + * we can still have concurrent tasks adding elements to this + * transaction's list of dirty block groups. These tasks correspond to + * endio free space workers started when writeback finishes for a + * space cache, which run inode.c:btrfs_finish_ordered_io(), and can + * allocate new block groups as a result of COWing nodes of the root + * tree when updating the free space inode. The writeback for the space + * caches is triggered by an earlier call to + * btrfs_start_dirty_block_groups() and iterations of the following + * loop. + * Also we want to do the cache_save_setup first and then run the * delayed refs to make sure we have the best chance at doing this all * in one shot. */ + spin_lock(&cur_trans->dirty_bgs_lock); while (!list_empty(&cur_trans->dirty_bgs)) { cache = list_first_entry(&cur_trans->dirty_bgs, struct btrfs_block_group_cache, @@ -3700,11 +3729,13 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, * finish and then do it all again */ if (!list_empty(&cache->io_list)) { + spin_unlock(&cur_trans->dirty_bgs_lock); list_del_init(&cache->io_list); btrfs_wait_cache_io(root, trans, cache, &cache->io_ctl, path, cache->key.objectid); btrfs_put_block_group(cache); + spin_lock(&cur_trans->dirty_bgs_lock); } /* @@ -3712,6 +3743,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, * on any pending IO */ list_del_init(&cache->dirty_list); + spin_unlock(&cur_trans->dirty_bgs_lock); should_put = 1; cache_save_setup(cache, trans, path); @@ -3736,6 +3768,25 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, } if (!ret) { ret = write_one_cache_group(trans, root, path, cache); + /* + * One of the free space endio workers might have + * created a new block group while updating a free space + * cache's inode (at inode.c:btrfs_finish_ordered_io()) + * and hasn't released its transaction handle yet, in + * which case the new block group is still attached to + * its transaction handle and its creation has not + * finished yet (no block group item in the extent tree + * yet, etc). If this is the case, wait for all free + * space endio workers to finish and retry. This is a + * a very rare case so no need for a more efficient and + * complex approach. + */ + if (ret == -ENOENT) { + wait_event(cur_trans->writer_wait, + atomic_read(&cur_trans->num_writers) == 1); + ret = write_one_cache_group(trans, root, path, + cache); + } if (ret) btrfs_abort_transaction(trans, root, ret); } @@ -3743,7 +3794,9 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, /* if its not on the io list, we need to put the block group */ if (should_put) btrfs_put_block_group(cache); + spin_lock(&cur_trans->dirty_bgs_lock); } + spin_unlock(&cur_trans->dirty_bgs_lock); while (!list_empty(io)) { cache = list_first_entry(io, struct btrfs_block_group_cache, @@ -4242,14 +4295,13 @@ static int should_alloc_chunk(struct btrfs_root *root, */ if (force == CHUNK_ALLOC_LIMITED) { thresh = btrfs_super_total_bytes(root->fs_info->super_copy); - thresh = max_t(u64, 64 * 1024 * 1024, - div_factor_fine(thresh, 1)); + thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1)); if (num_bytes - num_allocated < thresh) return 1; } - if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8)) + if (num_allocated + SZ_2M < div_factor(num_bytes, 8)) return 0; return 1; } @@ -4449,7 +4501,7 @@ out: * transaction. */ if (trans->can_flush_pending_bgs && - trans->chunk_bytes_reserved >= (2 * 1024 * 1024ull)) { + trans->chunk_bytes_reserved >= (u64)SZ_2M) { btrfs_create_pending_block_groups(trans, trans->root); btrfs_trans_release_chunk_metadata(trans); } @@ -4547,7 +4599,7 @@ static inline int calc_reclaim_items_nr(struct btrfs_root *root, u64 to_reclaim) return nr; } -#define EXTENT_SIZE_PER_ITEM (256 * 1024) +#define EXTENT_SIZE_PER_ITEM SZ_256K /* * shrink metadata reservation for delalloc @@ -4752,8 +4804,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, u64 expected; u64 to_reclaim; - to_reclaim = min_t(u64, num_online_cpus() * 1024 * 1024, - 16 * 1024 * 1024); + to_reclaim = min_t(u64, num_online_cpus() * SZ_1M, SZ_16M); spin_lock(&space_info->lock); if (can_overcommit(root, space_info, to_reclaim, BTRFS_RESERVE_FLUSH_ALL)) { @@ -4764,8 +4815,7 @@ btrfs_calc_reclaim_metadata_size(struct btrfs_root *root, used = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_pinned + space_info->bytes_readonly + space_info->bytes_may_use; - if (can_overcommit(root, space_info, 1024 * 1024, - BTRFS_RESERVE_FLUSH_ALL)) + if (can_overcommit(root, space_info, SZ_1M, BTRFS_RESERVE_FLUSH_ALL)) expected = div_factor_fine(space_info->total_bytes, 95); else expected = div_factor_fine(space_info->total_bytes, 90); @@ -5321,7 +5371,7 @@ static void update_global_block_rsv(struct btrfs_fs_info *fs_info) spin_lock(&sinfo->lock); spin_lock(&block_rsv->lock); - block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024); + block_rsv->size = min_t(u64, num_bytes, SZ_512M); num_bytes = sinfo->bytes_used + sinfo->bytes_pinned + sinfo->bytes_reserved + sinfo->bytes_readonly + @@ -6225,11 +6275,11 @@ fetch_cluster_info(struct btrfs_root *root, struct btrfs_space_info *space_info, return ret; if (ssd) - *empty_cluster = 2 * 1024 * 1024; + *empty_cluster = SZ_2M; if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { ret = &root->fs_info->meta_alloc_cluster; if (!ssd) - *empty_cluster = 64 * 1024; + *empty_cluster = SZ_64K; } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && ssd) { ret = &root->fs_info->data_alloc_cluster; } @@ -6441,7 +6491,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (!path) return -ENOMEM; - path->reada = 1; + path->reada = READA_FORWARD; path->leave_spinning = 1; is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; @@ -6664,6 +6714,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } + ret = add_to_free_space_tree(trans, root->fs_info, bytenr, + num_bytes); + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } + ret = update_block_group(trans, root, bytenr, num_bytes, 0); if (ret) { btrfs_abort_transaction(trans, extent_root, ret); @@ -7675,6 +7732,11 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); + ret = remove_from_free_space_tree(trans, fs_info, ins->objectid, + ins->offset); + if (ret) + return ret; + ret = update_block_group(trans, root, ins->objectid, ins->offset, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", @@ -7755,6 +7817,11 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(leaf); btrfs_free_path(path); + ret = remove_from_free_space_tree(trans, fs_info, ins->objectid, + num_bytes); + if (ret) + return ret; + ret = update_block_group(trans, root, ins->objectid, root->nodesize, 1); if (ret) { /* -ENOENT, logic error */ @@ -7837,7 +7904,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); btrfs_set_lock_blocking(buf); - btrfs_set_buffer_uptodate(buf); + set_extent_buffer_uptodate(buf); if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { buf->log_index = root->log_transid % 2; @@ -7983,12 +8050,9 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, else memset(&extent_op->key, 0, sizeof(extent_op->key)); extent_op->flags_to_set = flags; - if (skinny_metadata) - extent_op->update_key = 0; - else - extent_op->update_key = 1; - extent_op->update_flags = 1; - extent_op->is_data = 0; + extent_op->update_key = skinny_metadata ? false : true; + extent_op->update_flags = true; + extent_op->is_data = false; extent_op->level = level; ret = btrfs_add_delayed_tree_ref(root->fs_info, trans, @@ -9127,7 +9191,7 @@ static int inc_block_group_ro(struct btrfs_block_group_cache *cache, int force) if ((sinfo->flags & (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) && !force) - min_allocable_bytes = 1 * 1024 * 1024; + min_allocable_bytes = SZ_1M; else min_allocable_bytes = 0; @@ -9659,6 +9723,8 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) cache->full_stripe_len = btrfs_full_stripe_len(root, &root->fs_info->mapping_tree, start); + set_free_space_tree_thresholds(cache); + atomic_set(&cache->count, 1); spin_lock_init(&cache->lock); init_rwsem(&cache->data_rwsem); @@ -9670,6 +9736,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) INIT_LIST_HEAD(&cache->io_list); btrfs_init_free_space_ctl(cache); atomic_set(&cache->trimming, 0); + mutex_init(&cache->free_space_lock); return cache; } @@ -9694,7 +9761,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = 1; + path->reada = READA_FORWARD; cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); if (btrfs_test_opt(root, SPACE_CACHE) && @@ -9880,6 +9947,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, key.objectid, key.offset); if (ret) btrfs_abort_transaction(trans, extent_root, ret); + add_block_group_free_space(trans, root->fs_info, block_group); + /* already aborted the transaction if it failed. */ next: list_del_init(&block_group->bg_list); } @@ -9910,6 +9979,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, cache->flags = type; cache->last_byte_to_unpin = (u64)-1; cache->cached = BTRFS_CACHE_FINISHED; + cache->needs_free_space = 1; ret = exclude_super_stripes(root, cache); if (ret) { /* @@ -10280,6 +10350,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, unlock_chunks(root); + ret = remove_block_group_free_space(trans, root->fs_info, block_group); + if (ret) + goto out; + btrfs_put_block_group(block_group); btrfs_put_block_group(block_group); @@ -10328,7 +10402,7 @@ btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info, * more device items and remove one chunk item), but this is done at * btrfs_remove_chunk() through a call to check_system_chunk(). */ - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; num_items = 3 + map->num_stripes; free_extent_map(em); @@ -10515,7 +10589,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info) disk_super = fs_info->super_copy; if (!btrfs_super_root(disk_super)) - return 1; + return -EINVAL; features = btrfs_super_incompat_flags(disk_super); if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) @@ -10745,3 +10819,23 @@ int btrfs_start_write_no_snapshoting(struct btrfs_root *root) } return 1; } + +static int wait_snapshoting_atomic_t(atomic_t *a) +{ + schedule(); + return 0; +} + +void btrfs_wait_for_snapshot_creation(struct btrfs_root *root) +{ + while (true) { + int ret; + + ret = btrfs_start_write_no_snapshoting(root); + if (ret) + break; + wait_on_atomic_t(&root->will_be_snapshoted, + wait_snapshoting_atomic_t, + TASK_UNINTERRUPTIBLE); + } +} diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h deleted file mode 100644 index e69de29bb..000000000 --- a/fs/btrfs/extent-tree.h +++ /dev/null diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 9abe18763..392592dc7 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -1285,20 +1285,6 @@ search_again: } /* wrappers around set/clear extent bit */ -int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, - NULL, mask); -} - -int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, gfp_t mask) -{ - return set_extent_bit(tree, start, end, bits, NULL, - NULL, mask); -} - int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, gfp_t mask, struct extent_changeset *changeset) @@ -1323,17 +1309,6 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, cached, mask, NULL); } -int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, gfp_t mask) -{ - int wake = 0; - - if (bits & EXTENT_LOCKED) - wake = 1; - - return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask); -} - int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, gfp_t mask, struct extent_changeset *changeset) @@ -1348,63 +1323,18 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, changeset); } -int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state, gfp_t mask) -{ - return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_UPTODATE, - NULL, cached_state, mask); -} - -int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state, gfp_t mask) -{ - return set_extent_bit(tree, start, end, - EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG, - NULL, cached_state, mask); -} - -int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return clear_extent_bit(tree, start, end, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask); -} - -int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask) -{ - return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, - NULL, mask); -} - -int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state, gfp_t mask) -{ - return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL, - cached_state, mask); -} - -int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state, gfp_t mask) -{ - return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, - cached_state, mask); -} - /* * either insert or lock state struct between start and end use mask to tell * us if waiting is desired. */ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, struct extent_state **cached_state) + struct extent_state **cached_state) { int err; u64 failed_start; while (1) { - err = __set_extent_bit(tree, start, end, EXTENT_LOCKED | bits, + err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, EXTENT_LOCKED, &failed_start, cached_state, GFP_NOFS, NULL); if (err == -EEXIST) { @@ -1417,11 +1347,6 @@ int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, return err; } -int lock_extent(struct extent_io_tree *tree, u64 start, u64 end) -{ - return lock_extent_bits(tree, start, end, 0, NULL); -} - int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) { int err; @@ -1438,20 +1363,7 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end) return 1; } -int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached, gfp_t mask) -{ - return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, - mask); -} - -int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end) -{ - return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, - GFP_NOFS); -} - -int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) +void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) { unsigned long index = start >> PAGE_CACHE_SHIFT; unsigned long end_index = end >> PAGE_CACHE_SHIFT; @@ -1464,10 +1376,9 @@ int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) page_cache_release(page); index++; } - return 0; } -int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) +void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) { unsigned long index = start >> PAGE_CACHE_SHIFT; unsigned long end_index = end >> PAGE_CACHE_SHIFT; @@ -1481,13 +1392,12 @@ int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end) page_cache_release(page); index++; } - return 0; } /* * helper function to set both pages and extents in the tree writeback */ -static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) +static void set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) { unsigned long index = start >> PAGE_CACHE_SHIFT; unsigned long end_index = end >> PAGE_CACHE_SHIFT; @@ -1500,7 +1410,6 @@ static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) page_cache_release(page); index++; } - return 0; } /* find the first state struct with 'bits' set after 'start', and @@ -1800,7 +1709,7 @@ again: BUG_ON(ret); /* Only valid values are 0 and -EAGAIN */ /* step three, lock the state bits for the whole range */ - lock_extent_bits(tree, delalloc_start, delalloc_end, 0, &cached_state); + lock_extent_bits(tree, delalloc_start, delalloc_end, &cached_state); /* then test to make sure it is all still delalloc */ ret = test_range_bit(tree, delalloc_start, delalloc_end, @@ -1820,7 +1729,7 @@ out_failed: return found; } -int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, +void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, struct page *locked_page, unsigned clear_bits, unsigned long page_ops) @@ -1835,7 +1744,7 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS); if (page_ops == 0) - return 0; + return; if ((page_ops & PAGE_SET_ERROR) && nr_pages > 0) mapping_set_error(inode->i_mapping, -EIO); @@ -1869,7 +1778,6 @@ int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, index += ret; cond_resched(); } - return 0; } /* @@ -2516,7 +2424,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, /* lots and lots of room for performance fixes in the end_bio funcs */ -int end_extent_writepage(struct page *page, int err, u64 start, u64 end) +void end_extent_writepage(struct page *page, int err, u64 start, u64 end) { int uptodate = (err == 0); struct extent_io_tree *tree; @@ -2537,7 +2445,6 @@ int end_extent_writepage(struct page *page, int err, u64 start, u64 end) ret = ret < 0 ? ret : -EIO; mapping_set_error(page->mapping, ret); } - return 0; } /* @@ -2579,9 +2486,7 @@ static void end_bio_extent_writepage(struct bio *bio) start = page_offset(page); end = start + bvec->bv_offset + bvec->bv_len - 1; - if (end_extent_writepage(page, bio->bi_error, start, end)) - continue; - + end_extent_writepage(page, bio->bi_error, start, end); end_page_writeback(page); } @@ -2992,12 +2897,11 @@ static int __do_readpage(struct extent_io_tree *tree, struct block_device *bdev; int ret; int nr = 0; - int parent_locked = *bio_flags & EXTENT_BIO_PARENT_LOCKED; size_t pg_offset = 0; size_t iosize; size_t disk_io_size; size_t blocksize = inode->i_sb->s_blocksize; - unsigned long this_bio_flag = *bio_flags & EXTENT_BIO_PARENT_LOCKED; + unsigned long this_bio_flag = 0; set_page_extent_mapped(page); @@ -3037,18 +2941,16 @@ static int __do_readpage(struct extent_io_tree *tree, kunmap_atomic(userpage); set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); - if (!parent_locked) - unlock_extent_cached(tree, cur, - cur + iosize - 1, - &cached, GFP_NOFS); + unlock_extent_cached(tree, cur, + cur + iosize - 1, + &cached, GFP_NOFS); break; } em = __get_extent_map(inode, page, pg_offset, cur, end - cur + 1, get_extent, em_cached); if (IS_ERR_OR_NULL(em)) { SetPageError(page); - if (!parent_locked) - unlock_extent(tree, cur, end); + unlock_extent(tree, cur, end); break; } extent_offset = cur - em->start; @@ -3133,12 +3035,9 @@ static int __do_readpage(struct extent_io_tree *tree, set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); - if (parent_locked) - free_extent_state(cached); - else - unlock_extent_cached(tree, cur, - cur + iosize - 1, - &cached, GFP_NOFS); + unlock_extent_cached(tree, cur, + cur + iosize - 1, + &cached, GFP_NOFS); cur = cur + iosize; pg_offset += iosize; continue; @@ -3147,8 +3046,7 @@ static int __do_readpage(struct extent_io_tree *tree, if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1, NULL)) { check_page_uptodate(tree, page); - if (!parent_locked) - unlock_extent(tree, cur, cur + iosize - 1); + unlock_extent(tree, cur, cur + iosize - 1); cur = cur + iosize; pg_offset += iosize; continue; @@ -3158,8 +3056,7 @@ static int __do_readpage(struct extent_io_tree *tree, */ if (block_start == EXTENT_MAP_INLINE) { SetPageError(page); - if (!parent_locked) - unlock_extent(tree, cur, cur + iosize - 1); + unlock_extent(tree, cur, cur + iosize - 1); cur = cur + iosize; pg_offset += iosize; continue; @@ -3178,8 +3075,7 @@ static int __do_readpage(struct extent_io_tree *tree, *bio_flags = this_bio_flag; } else { SetPageError(page); - if (!parent_locked) - unlock_extent(tree, cur, cur + iosize - 1); + unlock_extent(tree, cur, cur + iosize - 1); } cur = cur + iosize; pg_offset += iosize; @@ -3308,20 +3204,6 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page, return ret; } -int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page, - get_extent_t *get_extent, int mirror_num) -{ - struct bio *bio = NULL; - unsigned long bio_flags = EXTENT_BIO_PARENT_LOCKED; - int ret; - - ret = __do_readpage(tree, page, get_extent, NULL, &bio, mirror_num, - &bio_flags, READ, NULL); - if (bio) - ret = submit_one_bio(READ, bio, mirror_num, bio_flags); - return ret; -} - static noinline void update_nr_written(struct page *page, struct writeback_control *wbc, unsigned long nr_written) @@ -4326,7 +4208,7 @@ int extent_invalidatepage(struct extent_io_tree *tree, if (start > end) return 0; - lock_extent_bits(tree, start, end, 0, &cached_state); + lock_extent_bits(tree, start, end, &cached_state); wait_on_page_writeback(page); clear_extent_bit(tree, start, end, EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | @@ -4387,7 +4269,7 @@ int try_release_extent_mapping(struct extent_map_tree *map, u64 end = start + PAGE_CACHE_SIZE - 1; if (gfpflags_allow_blocking(mask) && - page->mapping->host->i_size > 16 * 1024 * 1024) { + page->mapping->host->i_size > SZ_16M) { u64 len; while (start <= end) { len = end - start + 1; @@ -4536,7 +4418,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, last_for_get_extent = isize; } - lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, 0, + lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, &cached_state); em = get_extent_skip_holes(inode, start, last_for_get_extent, @@ -4797,24 +4679,14 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src) return new; } -struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start) +struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start, unsigned long len) { struct extent_buffer *eb; - unsigned long len; unsigned long num_pages; unsigned long i; - if (!fs_info) { - /* - * Called only from tests that don't always have a fs_info - * available, but we know that nodesize is 4096 - */ - len = 4096; - } else { - len = fs_info->tree_root->nodesize; - } - num_pages = num_extent_pages(0, len); + num_pages = num_extent_pages(start, len); eb = __alloc_extent_buffer(fs_info, start, len); if (!eb) @@ -4837,6 +4709,24 @@ err: return NULL; } +struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start) +{ + unsigned long len; + + if (!fs_info) { + /* + * Called only from tests that don't always have a fs_info + * available, but we know that nodesize is 4096 + */ + len = 4096; + } else { + len = fs_info->tree_root->nodesize; + } + + return __alloc_dummy_extent_buffer(fs_info, start, len); +} + static void check_buffer_tree_ref(struct extent_buffer *eb) { int refs; @@ -5227,7 +5117,7 @@ int set_extent_buffer_dirty(struct extent_buffer *eb) return was_dirty; } -int clear_extent_buffer_uptodate(struct extent_buffer *eb) +void clear_extent_buffer_uptodate(struct extent_buffer *eb) { unsigned long i; struct page *page; @@ -5240,10 +5130,9 @@ int clear_extent_buffer_uptodate(struct extent_buffer *eb) if (page) ClearPageUptodate(page); } - return 0; } -int set_extent_buffer_uptodate(struct extent_buffer *eb) +void set_extent_buffer_uptodate(struct extent_buffer *eb) { unsigned long i; struct page *page; @@ -5255,7 +5144,6 @@ int set_extent_buffer_uptodate(struct extent_buffer *eb) page = eb->pages[i]; SetPageUptodate(page); } - return 0; } int extent_buffer_uptodate(struct extent_buffer *eb) @@ -5594,6 +5482,155 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, } } +/* + * The extent buffer bitmap operations are done with byte granularity because + * bitmap items are not guaranteed to be aligned to a word and therefore a + * single word in a bitmap may straddle two pages in the extent buffer. + */ +#define BIT_BYTE(nr) ((nr) / BITS_PER_BYTE) +#define BYTE_MASK ((1 << BITS_PER_BYTE) - 1) +#define BITMAP_FIRST_BYTE_MASK(start) \ + ((BYTE_MASK << ((start) & (BITS_PER_BYTE - 1))) & BYTE_MASK) +#define BITMAP_LAST_BYTE_MASK(nbits) \ + (BYTE_MASK >> (-(nbits) & (BITS_PER_BYTE - 1))) + +/* + * eb_bitmap_offset() - calculate the page and offset of the byte containing the + * given bit number + * @eb: the extent buffer + * @start: offset of the bitmap item in the extent buffer + * @nr: bit number + * @page_index: return index of the page in the extent buffer that contains the + * given bit number + * @page_offset: return offset into the page given by page_index + * + * This helper hides the ugliness of finding the byte in an extent buffer which + * contains a given bit. + */ +static inline void eb_bitmap_offset(struct extent_buffer *eb, + unsigned long start, unsigned long nr, + unsigned long *page_index, + size_t *page_offset) +{ + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + size_t byte_offset = BIT_BYTE(nr); + size_t offset; + + /* + * The byte we want is the offset of the extent buffer + the offset of + * the bitmap item in the extent buffer + the offset of the byte in the + * bitmap item. + */ + offset = start_offset + start + byte_offset; + + *page_index = offset >> PAGE_CACHE_SHIFT; + *page_offset = offset & (PAGE_CACHE_SIZE - 1); +} + +/** + * extent_buffer_test_bit - determine whether a bit in a bitmap item is set + * @eb: the extent buffer + * @start: offset of the bitmap item in the extent buffer + * @nr: bit number to test + */ +int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start, + unsigned long nr) +{ + char *kaddr; + struct page *page; + unsigned long i; + size_t offset; + + eb_bitmap_offset(eb, start, nr, &i, &offset); + page = eb->pages[i]; + WARN_ON(!PageUptodate(page)); + kaddr = page_address(page); + return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1))); +} + +/** + * extent_buffer_bitmap_set - set an area of a bitmap + * @eb: the extent buffer + * @start: offset of the bitmap item in the extent buffer + * @pos: bit number of the first bit + * @len: number of bits to set + */ +void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start, + unsigned long pos, unsigned long len) +{ + char *kaddr; + struct page *page; + unsigned long i; + size_t offset; + const unsigned int size = pos + len; + int bits_to_set = BITS_PER_BYTE - (pos % BITS_PER_BYTE); + unsigned int mask_to_set = BITMAP_FIRST_BYTE_MASK(pos); + + eb_bitmap_offset(eb, start, pos, &i, &offset); + page = eb->pages[i]; + WARN_ON(!PageUptodate(page)); + kaddr = page_address(page); + + while (len >= bits_to_set) { + kaddr[offset] |= mask_to_set; + len -= bits_to_set; + bits_to_set = BITS_PER_BYTE; + mask_to_set = ~0U; + if (++offset >= PAGE_CACHE_SIZE && len > 0) { + offset = 0; + page = eb->pages[++i]; + WARN_ON(!PageUptodate(page)); + kaddr = page_address(page); + } + } + if (len) { + mask_to_set &= BITMAP_LAST_BYTE_MASK(size); + kaddr[offset] |= mask_to_set; + } +} + + +/** + * extent_buffer_bitmap_clear - clear an area of a bitmap + * @eb: the extent buffer + * @start: offset of the bitmap item in the extent buffer + * @pos: bit number of the first bit + * @len: number of bits to clear + */ +void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start, + unsigned long pos, unsigned long len) +{ + char *kaddr; + struct page *page; + unsigned long i; + size_t offset; + const unsigned int size = pos + len; + int bits_to_clear = BITS_PER_BYTE - (pos % BITS_PER_BYTE); + unsigned int mask_to_clear = BITMAP_FIRST_BYTE_MASK(pos); + + eb_bitmap_offset(eb, start, pos, &i, &offset); + page = eb->pages[i]; + WARN_ON(!PageUptodate(page)); + kaddr = page_address(page); + + while (len >= bits_to_clear) { + kaddr[offset] &= ~mask_to_clear; + len -= bits_to_clear; + bits_to_clear = BITS_PER_BYTE; + mask_to_clear = ~0U; + if (++offset >= PAGE_CACHE_SIZE && len > 0) { + offset = 0; + page = eb->pages[++i]; + WARN_ON(!PageUptodate(page)); + kaddr = page_address(page); + } + } + if (len) { + mask_to_clear &= BITMAP_LAST_BYTE_MASK(size); + kaddr[offset] &= ~mask_to_clear; + } +} + static inline bool areas_overlap(unsigned long src, unsigned long dst, unsigned long len) { unsigned long distance = (src > dst) ? src - dst : dst - src; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index f4c1ae118..880d5292e 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -29,7 +29,6 @@ */ #define EXTENT_BIO_COMPRESSED 1 #define EXTENT_BIO_TREE_LOG 2 -#define EXTENT_BIO_PARENT_LOCKED 4 #define EXTENT_BIO_FLAG_SHIFT 16 /* these are bit numbers for test/set bit */ @@ -199,17 +198,17 @@ int try_release_extent_mapping(struct extent_map_tree *map, struct extent_io_tree *tree, struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); -int lock_extent(struct extent_io_tree *tree, u64 start, u64 end); int lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, struct extent_state **cached); -int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end); -int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached, gfp_t mask); + struct extent_state **cached); + +static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end) +{ + return lock_extent_bits(tree, start, end, NULL); +} + int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end); int extent_read_full_page(struct extent_io_tree *tree, struct page *page, get_extent_t *get_extent, int mirror_num); -int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page, - get_extent_t *get_extent, int mirror_num); int __init extent_io_init(void); void extent_io_exit(void); @@ -221,39 +220,105 @@ void free_extent_state(struct extent_state *state); int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, int filled, struct extent_state *cached_state); -int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, gfp_t mask); int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, gfp_t mask, struct extent_changeset *changeset); int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, int wake, int delete, struct extent_state **cached, gfp_t mask); -int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - unsigned bits, gfp_t mask); + +static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end) +{ + return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, NULL, + GFP_NOFS); +} + +static inline int unlock_extent_cached(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached, gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, cached, + mask); +} + +static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start, + u64 end, unsigned bits, gfp_t mask) +{ + int wake = 0; + + if (bits & EXTENT_LOCKED) + wake = 1; + + return clear_extent_bit(tree, start, end, bits, wake, 0, NULL, mask); +} + int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, gfp_t mask, struct extent_changeset *changeset); int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, u64 *failed_start, struct extent_state **cached_state, gfp_t mask); -int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state, gfp_t mask); -int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state, gfp_t mask); -int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask); -int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask); -int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, - gfp_t mask); + +static inline int set_extent_bits(struct extent_io_tree *tree, u64 start, + u64 end, unsigned bits, gfp_t mask) +{ + return set_extent_bit(tree, start, end, bits, NULL, NULL, mask); +} + +static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached_state, gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, + cached_state, mask); +} + +static inline int set_extent_dirty(struct extent_io_tree *tree, u64 start, + u64 end, gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_DIRTY, NULL, + NULL, mask); +} + +static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 start, + u64 end, gfp_t mask) +{ + return clear_extent_bit(tree, start, end, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, 0, 0, NULL, mask); +} + int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, unsigned bits, unsigned clear_bits, struct extent_state **cached_state, gfp_t mask); -int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state, gfp_t mask); -int set_extent_defrag(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached_state, gfp_t mask); + +static inline int set_extent_delalloc(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached_state, gfp_t mask) +{ + return set_extent_bit(tree, start, end, + EXTENT_DELALLOC | EXTENT_UPTODATE, + NULL, cached_state, mask); +} + +static inline int set_extent_defrag(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached_state, gfp_t mask) +{ + return set_extent_bit(tree, start, end, + EXTENT_DELALLOC | EXTENT_UPTODATE | EXTENT_DEFRAG, + NULL, cached_state, mask); +} + +static inline int set_extent_new(struct extent_io_tree *tree, u64 start, + u64 end, gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, NULL, mask); +} + +static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached_state, gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_UPTODATE, NULL, + cached_state, mask); +} + int find_first_extent_bit(struct extent_io_tree *tree, u64 start, u64 *start_ret, u64 *end_ret, unsigned bits, struct extent_state **cached_state); @@ -282,8 +347,10 @@ void set_page_extent_mapped(struct page *page); struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); +struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start, unsigned long len); struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start); + u64 start); struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src); struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); @@ -328,19 +395,25 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len); void memset_extent_buffer(struct extent_buffer *eb, char c, unsigned long start, unsigned long len); +int extent_buffer_test_bit(struct extent_buffer *eb, unsigned long start, + unsigned long pos); +void extent_buffer_bitmap_set(struct extent_buffer *eb, unsigned long start, + unsigned long pos, unsigned long len); +void extent_buffer_bitmap_clear(struct extent_buffer *eb, unsigned long start, + unsigned long pos, unsigned long len); void clear_extent_buffer_dirty(struct extent_buffer *eb); int set_extent_buffer_dirty(struct extent_buffer *eb); -int set_extent_buffer_uptodate(struct extent_buffer *eb); -int clear_extent_buffer_uptodate(struct extent_buffer *eb); +void set_extent_buffer_uptodate(struct extent_buffer *eb); +void clear_extent_buffer_uptodate(struct extent_buffer *eb); int extent_buffer_uptodate(struct extent_buffer *eb); int extent_buffer_under_io(struct extent_buffer *eb); int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, unsigned long min_len, char **map, unsigned long *map_start, unsigned long *map_len); -int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); -int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); -int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, +void extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end); +void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end); +void extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end, struct page *locked_page, unsigned bits_to_clear, unsigned long page_ops); @@ -357,7 +430,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, int mirror_num); int clean_io_failure(struct inode *inode, u64 start, struct page *page, unsigned int pg_offset); -int end_extent_writepage(struct page *page, int err, u64 start, u64 end); +void end_extent_writepage(struct page *page, int err, u64 start, u64 end); int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, int mirror_num); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 6a98bddd8..84fb56d5c 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -76,7 +76,7 @@ void free_extent_map(struct extent_map *em) WARN_ON(extent_map_in_tree(em)); WARN_ON(!list_empty(&em->list)); if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) - kfree(em->bdev); + kfree(em->map_lookup); kmem_cache_free(extent_map_cache, em); } } diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index b2991fd85..eb8b8fae0 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -32,7 +32,15 @@ struct extent_map { u64 block_len; u64 generation; unsigned long flags; - struct block_device *bdev; + union { + struct block_device *bdev; + + /* + * used for chunk mappings + * flags & EXTENT_FLAG_FS_MAPPING must be set + */ + struct map_lookup *map_lookup; + }; atomic_t refs; unsigned int compress_type; struct list_head list; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 58ece6558..a67e1c828 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -202,7 +202,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root, } if (bio->bi_iter.bi_size > PAGE_CACHE_SIZE * 8) - path->reada = 2; + path->reada = READA_FORWARD; WARN_ON(bio->bi_vcnt <= 0); @@ -328,7 +328,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, if (search_commit) { path->skip_locking = 1; - path->reada = 2; + path->reada = READA_FORWARD; path->search_commit_root = 1; } diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 0f09526aa..098bb8f69 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -406,8 +406,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) /* simple helper to fault in pages and copy. This should go away * and be replaced with calls into generic code. */ -static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, - size_t write_bytes, +static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, struct page **prepared_pages, struct iov_iter *i) { @@ -1394,7 +1393,7 @@ lock_and_cleanup_extent_if_need(struct inode *inode, struct page **pages, if (start_pos < inode->i_size) { struct btrfs_ordered_extent *ordered; lock_extent_bits(&BTRFS_I(inode)->io_tree, - start_pos, last_pos, 0, cached_state); + start_pos, last_pos, cached_state); ordered = btrfs_lookup_ordered_range(inode, start_pos, last_pos - start_pos + 1); if (ordered && @@ -1588,8 +1587,7 @@ again: ret = 0; } - copied = btrfs_copy_from_user(pos, num_pages, - write_bytes, pages, i); + copied = btrfs_copy_from_user(pos, write_bytes, pages, i); /* * if we have trouble faulting in the pages, fall @@ -1764,17 +1762,17 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, loff_t pos; size_t count; - mutex_lock(&inode->i_mutex); + inode_lock(inode); err = generic_write_checks(iocb, from); if (err <= 0) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } current->backing_dev_info = inode_to_bdi(inode); err = file_remove_privs(file); if (err) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out; } @@ -1785,7 +1783,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, * to stop this write operation to ensure FS consistency. */ if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); err = -EROFS; goto out; } @@ -1806,7 +1804,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, end_pos = round_up(pos + count, root->sectorsize); err = btrfs_cont_expand(inode, i_size_read(inode), end_pos); if (err) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out; } } @@ -1822,7 +1820,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, iocb->ki_pos = pos + num_written; } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* * We also have to set last_sub_trans to the current log transid, @@ -1911,7 +1909,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); atomic_inc(&root->log_batch); full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); @@ -1963,7 +1961,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = start_ordered_ops(inode, start, end); } if (ret) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out; } atomic_inc(&root->log_batch); @@ -2009,7 +2007,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out; } @@ -2033,7 +2031,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out; } trans->sync = true; @@ -2056,7 +2054,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * file again, but that will end up using the synchronization * inside btrfs_sync_log to keep things safe. */ - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* * If any of the ordered extents had an error, just return it to user @@ -2305,7 +2303,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE); ret = find_first_non_hole(inode, &offset, &len); if (ret < 0) @@ -2345,7 +2343,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) truncated_page = true; ret = btrfs_truncate_page(inode, offset, 0, 0); if (ret) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } } @@ -2398,7 +2396,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) truncate_pagecache_range(inode, lockstart, lockend); lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, - 0, &cached_state); + &cached_state); ordered = btrfs_lookup_first_ordered_extent(inode, lockend); /* @@ -2421,7 +2419,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) ret = btrfs_wait_ordered_range(inode, lockstart, lockend - lockstart + 1); if (ret) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } } @@ -2576,7 +2574,7 @@ out_only_mutex: ret = btrfs_end_transaction(trans, root); } } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (ret && !err) err = ret; return err; @@ -2660,7 +2658,7 @@ static long btrfs_fallocate(struct file *file, int mode, if (ret < 0) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = inode_newsize_ok(inode, alloc_end); if (ret) goto out; @@ -2705,7 +2703,7 @@ static long btrfs_fallocate(struct file *file, int mode, * transaction */ lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, - locked_end, 0, &cached_state); + locked_end, &cached_state); ordered = btrfs_lookup_first_ordered_extent(inode, alloc_end - 1); if (ordered && @@ -2818,7 +2816,7 @@ out: * So this is completely used as cleanup. */ btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* Let go of our reservation. */ btrfs_free_reserved_data_space(inode, alloc_start, alloc_end - alloc_start); @@ -2852,7 +2850,7 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence) lockend--; len = lockend - lockstart + 1; - lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0, + lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); while (start < inode->i_size) { @@ -2894,7 +2892,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) struct inode *inode = file->f_mapping->host; int ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); switch (whence) { case SEEK_END: case SEEK_CUR: @@ -2903,20 +2901,20 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) case SEEK_DATA: case SEEK_HOLE: if (offset >= i_size_read(inode)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return -ENXIO; } ret = find_desired_extent(inode, &offset, whence); if (ret) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } } offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return offset; } @@ -2934,6 +2932,9 @@ const struct file_operations btrfs_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = btrfs_ioctl, #endif + .copy_file_range = btrfs_copy_file_range, + .clone_file_range = btrfs_clone_file_range, + .dedupe_file_range = btrfs_dedupe_file_range, }; void btrfs_auto_defrag_exit(void) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index cfe99bec4..8f835bfa1 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -30,7 +30,7 @@ #include "volumes.h" #define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) -#define MAX_CACHE_BYTES_PER_GIG (32 * 1024) +#define MAX_CACHE_BYTES_PER_GIG SZ_32K struct btrfs_trim_range { u64 start; @@ -1086,14 +1086,11 @@ write_pinned_extent_entries(struct btrfs_root *root, static noinline_for_stack int write_bitmap_entries(struct btrfs_io_ctl *io_ctl, struct list_head *bitmap_list) { - struct list_head *pos, *n; + struct btrfs_free_space *entry, *next; int ret; /* Write out the bitmaps */ - list_for_each_safe(pos, n, bitmap_list) { - struct btrfs_free_space *entry = - list_entry(pos, struct btrfs_free_space, list); - + list_for_each_entry_safe(entry, next, bitmap_list, list) { ret = io_ctl_add_bitmap(io_ctl, entry->bitmap); if (ret) return -ENOSPC; @@ -1119,13 +1116,10 @@ static int flush_dirty_cache(struct inode *inode) static void noinline_for_stack cleanup_bitmap_list(struct list_head *bitmap_list) { - struct list_head *pos, *n; + struct btrfs_free_space *entry, *next; - list_for_each_safe(pos, n, bitmap_list) { - struct btrfs_free_space *entry = - list_entry(pos, struct btrfs_free_space, list); + list_for_each_entry_safe(entry, next, bitmap_list, list) list_del_init(&entry->list); - } } static void noinline_for_stack @@ -1261,7 +1255,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, goto out; lock_extent_bits(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, - 0, &cached_state); + &cached_state); io_ctl_set_generation(io_ctl, trans->transid); @@ -1656,11 +1650,10 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl) * at or below 32k, so we need to adjust how much memory we allow to be * used by extent based free space tracking */ - if (size < 1024 * 1024 * 1024) + if (size < SZ_1G) max_bytes = MAX_CACHE_BYTES_PER_GIG; else - max_bytes = MAX_CACHE_BYTES_PER_GIG * - div_u64(size, 1024 * 1024 * 1024); + max_bytes = MAX_CACHE_BYTES_PER_GIG * div_u64(size, SZ_1G); /* * we want to account for 1 more bitmap than what we have so we can make @@ -2016,7 +2009,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, return true; } -static struct btrfs_free_space_op free_space_op = { +static const struct btrfs_free_space_op free_space_op = { .recalc_thresholds = recalculate_thresholds, .use_bitmap = use_bitmap, }; @@ -2489,8 +2482,7 @@ void btrfs_init_free_space_ctl(struct btrfs_block_group_cache *block_group) * track of free space, and if we pass 1/2 of that we want to * start converting things over to using bitmaps */ - ctl->extents_thresh = ((1024 * 32) / 2) / - sizeof(struct btrfs_free_space); + ctl->extents_thresh = (SZ_32K / 2) / sizeof(struct btrfs_free_space); } /* diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h index f251865eb..33178c490 100644 --- a/fs/btrfs/free-space-cache.h +++ b/fs/btrfs/free-space-cache.h @@ -37,7 +37,7 @@ struct btrfs_free_space_ctl { int total_bitmaps; int unit; u64 start; - struct btrfs_free_space_op *op; + const struct btrfs_free_space_op *op; void *private; struct mutex cache_writeout_mutex; struct list_head trimming_ranges; diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c new file mode 100644 index 000000000..53dbeaf6c --- /dev/null +++ b/fs/btrfs/free-space-tree.c @@ -0,0 +1,1605 @@ +/* + * Copyright (C) 2015 Facebook. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/kernel.h> +#include <linux/vmalloc.h> +#include "ctree.h" +#include "disk-io.h" +#include "locking.h" +#include "free-space-tree.h" +#include "transaction.h" + +static int __add_block_group_free_space(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path); + +void set_free_space_tree_thresholds(struct btrfs_block_group_cache *cache) +{ + u32 bitmap_range; + size_t bitmap_size; + u64 num_bitmaps, total_bitmap_size; + + /* + * We convert to bitmaps when the disk space required for using extents + * exceeds that required for using bitmaps. + */ + bitmap_range = cache->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS; + num_bitmaps = div_u64(cache->key.offset + bitmap_range - 1, + bitmap_range); + bitmap_size = sizeof(struct btrfs_item) + BTRFS_FREE_SPACE_BITMAP_SIZE; + total_bitmap_size = num_bitmaps * bitmap_size; + cache->bitmap_high_thresh = div_u64(total_bitmap_size, + sizeof(struct btrfs_item)); + + /* + * We allow for a small buffer between the high threshold and low + * threshold to avoid thrashing back and forth between the two formats. + */ + if (cache->bitmap_high_thresh > 100) + cache->bitmap_low_thresh = cache->bitmap_high_thresh - 100; + else + cache->bitmap_low_thresh = 0; +} + +static int add_new_free_space_info(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path) +{ + struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_free_space_info *info; + struct btrfs_key key; + struct extent_buffer *leaf; + int ret; + + key.objectid = block_group->key.objectid; + key.type = BTRFS_FREE_SPACE_INFO_KEY; + key.offset = block_group->key.offset; + + ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*info)); + if (ret) + goto out; + + leaf = path->nodes[0]; + info = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_free_space_info); + btrfs_set_free_space_extent_count(leaf, info, 0); + btrfs_set_free_space_flags(leaf, info, 0); + btrfs_mark_buffer_dirty(leaf); + + ret = 0; +out: + btrfs_release_path(path); + return ret; +} + +struct btrfs_free_space_info * +search_free_space_info(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, int cow) +{ + struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_key key; + int ret; + + key.objectid = block_group->key.objectid; + key.type = BTRFS_FREE_SPACE_INFO_KEY; + key.offset = block_group->key.offset; + + ret = btrfs_search_slot(trans, root, &key, path, 0, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret != 0) { + btrfs_warn(fs_info, "missing free space info for %llu\n", + block_group->key.objectid); + ASSERT(0); + return ERR_PTR(-ENOENT); + } + + return btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_free_space_info); +} + +/* + * btrfs_search_slot() but we're looking for the greatest key less than the + * passed key. + */ +static int btrfs_search_prev_slot(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *key, struct btrfs_path *p, + int ins_len, int cow) +{ + int ret; + + ret = btrfs_search_slot(trans, root, key, p, ins_len, cow); + if (ret < 0) + return ret; + + if (ret == 0) { + ASSERT(0); + return -EIO; + } + + if (p->slots[0] == 0) { + ASSERT(0); + return -EIO; + } + p->slots[0]--; + + return 0; +} + +static inline u32 free_space_bitmap_size(u64 size, u32 sectorsize) +{ + return DIV_ROUND_UP((u32)div_u64(size, sectorsize), BITS_PER_BYTE); +} + +static unsigned long *alloc_bitmap(u32 bitmap_size) +{ + void *mem; + + /* + * The allocation size varies, observed numbers were < 4K up to 16K. + * Using vmalloc unconditionally would be too heavy, we'll try + * contiguous allocations first. + */ + if (bitmap_size <= PAGE_SIZE) + return kzalloc(bitmap_size, GFP_NOFS); + + mem = kzalloc(bitmap_size, GFP_NOFS | __GFP_NOWARN); + if (mem) + return mem; + + return __vmalloc(bitmap_size, GFP_NOFS | __GFP_HIGHMEM | __GFP_ZERO, + PAGE_KERNEL); +} + +int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path) +{ + struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_free_space_info *info; + struct btrfs_key key, found_key; + struct extent_buffer *leaf; + unsigned long *bitmap; + char *bitmap_cursor; + u64 start, end; + u64 bitmap_range, i; + u32 bitmap_size, flags, expected_extent_count; + u32 extent_count = 0; + int done = 0, nr; + int ret; + + bitmap_size = free_space_bitmap_size(block_group->key.offset, + block_group->sectorsize); + bitmap = alloc_bitmap(bitmap_size); + if (!bitmap) { + ret = -ENOMEM; + goto out; + } + + start = block_group->key.objectid; + end = block_group->key.objectid + block_group->key.offset; + + key.objectid = end - 1; + key.type = (u8)-1; + key.offset = (u64)-1; + + while (!done) { + ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); + if (ret) + goto out; + + leaf = path->nodes[0]; + nr = 0; + path->slots[0]++; + while (path->slots[0] > 0) { + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1); + + if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) { + ASSERT(found_key.objectid == block_group->key.objectid); + ASSERT(found_key.offset == block_group->key.offset); + done = 1; + break; + } else if (found_key.type == BTRFS_FREE_SPACE_EXTENT_KEY) { + u64 first, last; + + ASSERT(found_key.objectid >= start); + ASSERT(found_key.objectid < end); + ASSERT(found_key.objectid + found_key.offset <= end); + + first = div_u64(found_key.objectid - start, + block_group->sectorsize); + last = div_u64(found_key.objectid + found_key.offset - start, + block_group->sectorsize); + bitmap_set(bitmap, first, last - first); + + extent_count++; + nr++; + path->slots[0]--; + } else { + ASSERT(0); + } + } + + ret = btrfs_del_items(trans, root, path, path->slots[0], nr); + if (ret) + goto out; + btrfs_release_path(path); + } + + info = search_free_space_info(trans, fs_info, block_group, path, 1); + if (IS_ERR(info)) { + ret = PTR_ERR(info); + goto out; + } + leaf = path->nodes[0]; + flags = btrfs_free_space_flags(leaf, info); + flags |= BTRFS_FREE_SPACE_USING_BITMAPS; + btrfs_set_free_space_flags(leaf, info, flags); + expected_extent_count = btrfs_free_space_extent_count(leaf, info); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); + + if (extent_count != expected_extent_count) { + btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", + block_group->key.objectid, extent_count, + expected_extent_count); + ASSERT(0); + ret = -EIO; + goto out; + } + + bitmap_cursor = (char *)bitmap; + bitmap_range = block_group->sectorsize * BTRFS_FREE_SPACE_BITMAP_BITS; + i = start; + while (i < end) { + unsigned long ptr; + u64 extent_size; + u32 data_size; + + extent_size = min(end - i, bitmap_range); + data_size = free_space_bitmap_size(extent_size, + block_group->sectorsize); + + key.objectid = i; + key.type = BTRFS_FREE_SPACE_BITMAP_KEY; + key.offset = extent_size; + + ret = btrfs_insert_empty_item(trans, root, path, &key, + data_size); + if (ret) + goto out; + + leaf = path->nodes[0]; + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + write_extent_buffer(leaf, bitmap_cursor, ptr, + data_size); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); + + i += extent_size; + bitmap_cursor += data_size; + } + + ret = 0; +out: + kvfree(bitmap); + if (ret) + btrfs_abort_transaction(trans, root, ret); + return ret; +} + +int convert_free_space_to_extents(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path) +{ + struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_free_space_info *info; + struct btrfs_key key, found_key; + struct extent_buffer *leaf; + unsigned long *bitmap; + u64 start, end; + /* Initialize to silence GCC. */ + u64 extent_start = 0; + u64 offset; + u32 bitmap_size, flags, expected_extent_count; + int prev_bit = 0, bit, bitnr; + u32 extent_count = 0; + int done = 0, nr; + int ret; + + bitmap_size = free_space_bitmap_size(block_group->key.offset, + block_group->sectorsize); + bitmap = alloc_bitmap(bitmap_size); + if (!bitmap) { + ret = -ENOMEM; + goto out; + } + + start = block_group->key.objectid; + end = block_group->key.objectid + block_group->key.offset; + + key.objectid = end - 1; + key.type = (u8)-1; + key.offset = (u64)-1; + + while (!done) { + ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); + if (ret) + goto out; + + leaf = path->nodes[0]; + nr = 0; + path->slots[0]++; + while (path->slots[0] > 0) { + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1); + + if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) { + ASSERT(found_key.objectid == block_group->key.objectid); + ASSERT(found_key.offset == block_group->key.offset); + done = 1; + break; + } else if (found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) { + unsigned long ptr; + char *bitmap_cursor; + u32 bitmap_pos, data_size; + + ASSERT(found_key.objectid >= start); + ASSERT(found_key.objectid < end); + ASSERT(found_key.objectid + found_key.offset <= end); + + bitmap_pos = div_u64(found_key.objectid - start, + block_group->sectorsize * + BITS_PER_BYTE); + bitmap_cursor = ((char *)bitmap) + bitmap_pos; + data_size = free_space_bitmap_size(found_key.offset, + block_group->sectorsize); + + ptr = btrfs_item_ptr_offset(leaf, path->slots[0] - 1); + read_extent_buffer(leaf, bitmap_cursor, ptr, + data_size); + + nr++; + path->slots[0]--; + } else { + ASSERT(0); + } + } + + ret = btrfs_del_items(trans, root, path, path->slots[0], nr); + if (ret) + goto out; + btrfs_release_path(path); + } + + info = search_free_space_info(trans, fs_info, block_group, path, 1); + if (IS_ERR(info)) { + ret = PTR_ERR(info); + goto out; + } + leaf = path->nodes[0]; + flags = btrfs_free_space_flags(leaf, info); + flags &= ~BTRFS_FREE_SPACE_USING_BITMAPS; + btrfs_set_free_space_flags(leaf, info, flags); + expected_extent_count = btrfs_free_space_extent_count(leaf, info); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(path); + + offset = start; + bitnr = 0; + while (offset < end) { + bit = !!test_bit(bitnr, bitmap); + if (prev_bit == 0 && bit == 1) { + extent_start = offset; + } else if (prev_bit == 1 && bit == 0) { + key.objectid = extent_start; + key.type = BTRFS_FREE_SPACE_EXTENT_KEY; + key.offset = offset - extent_start; + + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + if (ret) + goto out; + btrfs_release_path(path); + + extent_count++; + } + prev_bit = bit; + offset += block_group->sectorsize; + bitnr++; + } + if (prev_bit == 1) { + key.objectid = extent_start; + key.type = BTRFS_FREE_SPACE_EXTENT_KEY; + key.offset = end - extent_start; + + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + if (ret) + goto out; + btrfs_release_path(path); + + extent_count++; + } + + if (extent_count != expected_extent_count) { + btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", + block_group->key.objectid, extent_count, + expected_extent_count); + ASSERT(0); + ret = -EIO; + goto out; + } + + ret = 0; +out: + kvfree(bitmap); + if (ret) + btrfs_abort_transaction(trans, root, ret); + return ret; +} + +static int update_free_space_extent_count(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, + int new_extents) +{ + struct btrfs_free_space_info *info; + u32 flags; + u32 extent_count; + int ret = 0; + + if (new_extents == 0) + return 0; + + info = search_free_space_info(trans, fs_info, block_group, path, 1); + if (IS_ERR(info)) { + ret = PTR_ERR(info); + goto out; + } + flags = btrfs_free_space_flags(path->nodes[0], info); + extent_count = btrfs_free_space_extent_count(path->nodes[0], info); + + extent_count += new_extents; + btrfs_set_free_space_extent_count(path->nodes[0], info, extent_count); + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_release_path(path); + + if (!(flags & BTRFS_FREE_SPACE_USING_BITMAPS) && + extent_count > block_group->bitmap_high_thresh) { + ret = convert_free_space_to_bitmaps(trans, fs_info, block_group, + path); + } else if ((flags & BTRFS_FREE_SPACE_USING_BITMAPS) && + extent_count < block_group->bitmap_low_thresh) { + ret = convert_free_space_to_extents(trans, fs_info, block_group, + path); + } + +out: + return ret; +} + +int free_space_test_bit(struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, u64 offset) +{ + struct extent_buffer *leaf; + struct btrfs_key key; + u64 found_start, found_end; + unsigned long ptr, i; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY); + + found_start = key.objectid; + found_end = key.objectid + key.offset; + ASSERT(offset >= found_start && offset < found_end); + + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + i = div_u64(offset - found_start, block_group->sectorsize); + return !!extent_buffer_test_bit(leaf, ptr, i); +} + +static void free_space_set_bits(struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, u64 *start, u64 *size, + int bit) +{ + struct extent_buffer *leaf; + struct btrfs_key key; + u64 end = *start + *size; + u64 found_start, found_end; + unsigned long ptr, first, last; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY); + + found_start = key.objectid; + found_end = key.objectid + key.offset; + ASSERT(*start >= found_start && *start < found_end); + ASSERT(end > found_start); + + if (end > found_end) + end = found_end; + + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + first = div_u64(*start - found_start, block_group->sectorsize); + last = div_u64(end - found_start, block_group->sectorsize); + if (bit) + extent_buffer_bitmap_set(leaf, ptr, first, last - first); + else + extent_buffer_bitmap_clear(leaf, ptr, first, last - first); + btrfs_mark_buffer_dirty(leaf); + + *size -= end - *start; + *start = end; +} + +/* + * We can't use btrfs_next_item() in modify_free_space_bitmap() because + * btrfs_next_leaf() doesn't get the path for writing. We can forgo the fancy + * tree walking in btrfs_next_leaf() anyways because we know exactly what we're + * looking for. + */ +static int free_space_next_bitmap(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *p) +{ + struct btrfs_key key; + + if (p->slots[0] + 1 < btrfs_header_nritems(p->nodes[0])) { + p->slots[0]++; + return 0; + } + + btrfs_item_key_to_cpu(p->nodes[0], &key, p->slots[0]); + btrfs_release_path(p); + + key.objectid += key.offset; + key.type = (u8)-1; + key.offset = (u64)-1; + + return btrfs_search_prev_slot(trans, root, &key, p, 0, 1); +} + +/* + * If remove is 1, then we are removing free space, thus clearing bits in the + * bitmap. If remove is 0, then we are adding free space, thus setting bits in + * the bitmap. + */ +static int modify_free_space_bitmap(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, + u64 start, u64 size, int remove) +{ + struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_key key; + u64 end = start + size; + u64 cur_start, cur_size; + int prev_bit, next_bit; + int new_extents; + int ret; + + /* + * Read the bit for the block immediately before the extent of space if + * that block is within the block group. + */ + if (start > block_group->key.objectid) { + u64 prev_block = start - block_group->sectorsize; + + key.objectid = prev_block; + key.type = (u8)-1; + key.offset = (u64)-1; + + ret = btrfs_search_prev_slot(trans, root, &key, path, 0, 1); + if (ret) + goto out; + + prev_bit = free_space_test_bit(block_group, path, prev_block); + + /* The previous block may have been in the previous bitmap. */ + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (start >= key.objectid + key.offset) { + ret = free_space_next_bitmap(trans, root, path); + if (ret) + goto out; + } + } else { + key.objectid = start; + key.type = (u8)-1; + key.offset = (u64)-1; + + ret = btrfs_search_prev_slot(trans, root, &key, path, 0, 1); + if (ret) + goto out; + + prev_bit = -1; + } + + /* + * Iterate over all of the bitmaps overlapped by the extent of space, + * clearing/setting bits as required. + */ + cur_start = start; + cur_size = size; + while (1) { + free_space_set_bits(block_group, path, &cur_start, &cur_size, + !remove); + if (cur_size == 0) + break; + ret = free_space_next_bitmap(trans, root, path); + if (ret) + goto out; + } + + /* + * Read the bit for the block immediately after the extent of space if + * that block is within the block group. + */ + if (end < block_group->key.objectid + block_group->key.offset) { + /* The next block may be in the next bitmap. */ + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (end >= key.objectid + key.offset) { + ret = free_space_next_bitmap(trans, root, path); + if (ret) + goto out; + } + + next_bit = free_space_test_bit(block_group, path, end); + } else { + next_bit = -1; + } + + if (remove) { + new_extents = -1; + if (prev_bit == 1) { + /* Leftover on the left. */ + new_extents++; + } + if (next_bit == 1) { + /* Leftover on the right. */ + new_extents++; + } + } else { + new_extents = 1; + if (prev_bit == 1) { + /* Merging with neighbor on the left. */ + new_extents--; + } + if (next_bit == 1) { + /* Merging with neighbor on the right. */ + new_extents--; + } + } + + btrfs_release_path(path); + ret = update_free_space_extent_count(trans, fs_info, block_group, path, + new_extents); + +out: + return ret; +} + +static int remove_free_space_extent(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, + u64 start, u64 size) +{ + struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_key key; + u64 found_start, found_end; + u64 end = start + size; + int new_extents = -1; + int ret; + + key.objectid = start; + key.type = (u8)-1; + key.offset = (u64)-1; + + ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); + if (ret) + goto out; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY); + + found_start = key.objectid; + found_end = key.objectid + key.offset; + ASSERT(start >= found_start && end <= found_end); + + /* + * Okay, now that we've found the free space extent which contains the + * free space that we are removing, there are four cases: + * + * 1. We're using the whole extent: delete the key we found and + * decrement the free space extent count. + * 2. We are using part of the extent starting at the beginning: delete + * the key we found and insert a new key representing the leftover at + * the end. There is no net change in the number of extents. + * 3. We are using part of the extent ending at the end: delete the key + * we found and insert a new key representing the leftover at the + * beginning. There is no net change in the number of extents. + * 4. We are using part of the extent in the middle: delete the key we + * found and insert two new keys representing the leftovers on each + * side. Where we used to have one extent, we now have two, so increment + * the extent count. We may need to convert the block group to bitmaps + * as a result. + */ + + /* Delete the existing key (cases 1-4). */ + ret = btrfs_del_item(trans, root, path); + if (ret) + goto out; + + /* Add a key for leftovers at the beginning (cases 3 and 4). */ + if (start > found_start) { + key.objectid = found_start; + key.type = BTRFS_FREE_SPACE_EXTENT_KEY; + key.offset = start - found_start; + + btrfs_release_path(path); + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + if (ret) + goto out; + new_extents++; + } + + /* Add a key for leftovers at the end (cases 2 and 4). */ + if (end < found_end) { + key.objectid = end; + key.type = BTRFS_FREE_SPACE_EXTENT_KEY; + key.offset = found_end - end; + + btrfs_release_path(path); + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + if (ret) + goto out; + new_extents++; + } + + btrfs_release_path(path); + ret = update_free_space_extent_count(trans, fs_info, block_group, path, + new_extents); + +out: + return ret; +} + +int __remove_from_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, u64 start, u64 size) +{ + struct btrfs_free_space_info *info; + u32 flags; + int ret; + + if (block_group->needs_free_space) { + ret = __add_block_group_free_space(trans, fs_info, block_group, + path); + if (ret) + return ret; + } + + info = search_free_space_info(NULL, fs_info, block_group, path, 0); + if (IS_ERR(info)) + return PTR_ERR(info); + flags = btrfs_free_space_flags(path->nodes[0], info); + btrfs_release_path(path); + + if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) { + return modify_free_space_bitmap(trans, fs_info, block_group, + path, start, size, 1); + } else { + return remove_free_space_extent(trans, fs_info, block_group, + path, start, size); + } +} + +int remove_from_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + u64 start, u64 size) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_path *path; + int ret; + + if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + return 0; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + block_group = btrfs_lookup_block_group(fs_info, start); + if (!block_group) { + ASSERT(0); + ret = -ENOENT; + goto out; + } + + mutex_lock(&block_group->free_space_lock); + ret = __remove_from_free_space_tree(trans, fs_info, block_group, path, + start, size); + mutex_unlock(&block_group->free_space_lock); + + btrfs_put_block_group(block_group); +out: + btrfs_free_path(path); + if (ret) + btrfs_abort_transaction(trans, fs_info->free_space_root, ret); + return ret; +} + +static int add_free_space_extent(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, + u64 start, u64 size) +{ + struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_key key, new_key; + u64 found_start, found_end; + u64 end = start + size; + int new_extents = 1; + int ret; + + /* + * We are adding a new extent of free space, but we need to merge + * extents. There are four cases here: + * + * 1. The new extent does not have any immediate neighbors to merge + * with: add the new key and increment the free space extent count. We + * may need to convert the block group to bitmaps as a result. + * 2. The new extent has an immediate neighbor before it: remove the + * previous key and insert a new key combining both of them. There is no + * net change in the number of extents. + * 3. The new extent has an immediate neighbor after it: remove the next + * key and insert a new key combining both of them. There is no net + * change in the number of extents. + * 4. The new extent has immediate neighbors on both sides: remove both + * of the keys and insert a new key combining all of them. Where we used + * to have two extents, we now have one, so decrement the extent count. + */ + + new_key.objectid = start; + new_key.type = BTRFS_FREE_SPACE_EXTENT_KEY; + new_key.offset = size; + + /* Search for a neighbor on the left. */ + if (start == block_group->key.objectid) + goto right; + key.objectid = start - 1; + key.type = (u8)-1; + key.offset = (u64)-1; + + ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); + if (ret) + goto out; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type != BTRFS_FREE_SPACE_EXTENT_KEY) { + ASSERT(key.type == BTRFS_FREE_SPACE_INFO_KEY); + btrfs_release_path(path); + goto right; + } + + found_start = key.objectid; + found_end = key.objectid + key.offset; + ASSERT(found_start >= block_group->key.objectid && + found_end > block_group->key.objectid); + ASSERT(found_start < start && found_end <= start); + + /* + * Delete the neighbor on the left and absorb it into the new key (cases + * 2 and 4). + */ + if (found_end == start) { + ret = btrfs_del_item(trans, root, path); + if (ret) + goto out; + new_key.objectid = found_start; + new_key.offset += key.offset; + new_extents--; + } + btrfs_release_path(path); + +right: + /* Search for a neighbor on the right. */ + if (end == block_group->key.objectid + block_group->key.offset) + goto insert; + key.objectid = end; + key.type = (u8)-1; + key.offset = (u64)-1; + + ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); + if (ret) + goto out; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type != BTRFS_FREE_SPACE_EXTENT_KEY) { + ASSERT(key.type == BTRFS_FREE_SPACE_INFO_KEY); + btrfs_release_path(path); + goto insert; + } + + found_start = key.objectid; + found_end = key.objectid + key.offset; + ASSERT(found_start >= block_group->key.objectid && + found_end > block_group->key.objectid); + ASSERT((found_start < start && found_end <= start) || + (found_start >= end && found_end > end)); + + /* + * Delete the neighbor on the right and absorb it into the new key + * (cases 3 and 4). + */ + if (found_start == end) { + ret = btrfs_del_item(trans, root, path); + if (ret) + goto out; + new_key.offset += key.offset; + new_extents--; + } + btrfs_release_path(path); + +insert: + /* Insert the new key (cases 1-4). */ + ret = btrfs_insert_empty_item(trans, root, path, &new_key, 0); + if (ret) + goto out; + + btrfs_release_path(path); + ret = update_free_space_extent_count(trans, fs_info, block_group, path, + new_extents); + +out: + return ret; +} + +int __add_to_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, u64 start, u64 size) +{ + struct btrfs_free_space_info *info; + u32 flags; + int ret; + + if (block_group->needs_free_space) { + ret = __add_block_group_free_space(trans, fs_info, block_group, + path); + if (ret) + return ret; + } + + info = search_free_space_info(NULL, fs_info, block_group, path, 0); + if (IS_ERR(info)) + return PTR_ERR(info); + flags = btrfs_free_space_flags(path->nodes[0], info); + btrfs_release_path(path); + + if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) { + return modify_free_space_bitmap(trans, fs_info, block_group, + path, start, size, 0); + } else { + return add_free_space_extent(trans, fs_info, block_group, path, + start, size); + } +} + +int add_to_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + u64 start, u64 size) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_path *path; + int ret; + + if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + return 0; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + block_group = btrfs_lookup_block_group(fs_info, start); + if (!block_group) { + ASSERT(0); + ret = -ENOENT; + goto out; + } + + mutex_lock(&block_group->free_space_lock); + ret = __add_to_free_space_tree(trans, fs_info, block_group, path, start, + size); + mutex_unlock(&block_group->free_space_lock); + + btrfs_put_block_group(block_group); +out: + btrfs_free_path(path); + if (ret) + btrfs_abort_transaction(trans, fs_info->free_space_root, ret); + return ret; +} + +/* + * Populate the free space tree by walking the extent tree. Operations on the + * extent tree that happen as a result of writes to the free space tree will go + * through the normal add/remove hooks. + */ +static int populate_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group) +{ + struct btrfs_root *extent_root = fs_info->extent_root; + struct btrfs_path *path, *path2; + struct btrfs_key key; + u64 start, end; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 1; + + path2 = btrfs_alloc_path(); + if (!path2) { + btrfs_free_path(path); + return -ENOMEM; + } + + ret = add_new_free_space_info(trans, fs_info, block_group, path2); + if (ret) + goto out; + + mutex_lock(&block_group->free_space_lock); + + /* + * Iterate through all of the extent and metadata items in this block + * group, adding the free space between them and the free space at the + * end. Note that EXTENT_ITEM and METADATA_ITEM are less than + * BLOCK_GROUP_ITEM, so an extent may precede the block group that it's + * contained in. + */ + key.objectid = block_group->key.objectid; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = 0; + + ret = btrfs_search_slot_for_read(extent_root, &key, path, 1, 0); + if (ret < 0) + goto out_locked; + ASSERT(ret == 0); + + start = block_group->key.objectid; + end = block_group->key.objectid + block_group->key.offset; + while (1) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type == BTRFS_EXTENT_ITEM_KEY || + key.type == BTRFS_METADATA_ITEM_KEY) { + if (key.objectid >= end) + break; + + if (start < key.objectid) { + ret = __add_to_free_space_tree(trans, fs_info, + block_group, + path2, start, + key.objectid - + start); + if (ret) + goto out_locked; + } + start = key.objectid; + if (key.type == BTRFS_METADATA_ITEM_KEY) + start += fs_info->tree_root->nodesize; + else + start += key.offset; + } else if (key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { + if (key.objectid != block_group->key.objectid) + break; + } + + ret = btrfs_next_item(extent_root, path); + if (ret < 0) + goto out_locked; + if (ret) + break; + } + if (start < end) { + ret = __add_to_free_space_tree(trans, fs_info, block_group, + path2, start, end - start); + if (ret) + goto out_locked; + } + + ret = 0; +out_locked: + mutex_unlock(&block_group->free_space_lock); +out: + btrfs_free_path(path2); + btrfs_free_path(path); + return ret; +} + +int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *free_space_root; + struct btrfs_block_group_cache *block_group; + struct rb_node *node; + int ret; + + trans = btrfs_start_transaction(tree_root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + fs_info->creating_free_space_tree = 1; + free_space_root = btrfs_create_tree(trans, fs_info, + BTRFS_FREE_SPACE_TREE_OBJECTID); + if (IS_ERR(free_space_root)) { + ret = PTR_ERR(free_space_root); + goto abort; + } + fs_info->free_space_root = free_space_root; + + node = rb_first(&fs_info->block_group_cache_tree); + while (node) { + block_group = rb_entry(node, struct btrfs_block_group_cache, + cache_node); + ret = populate_free_space_tree(trans, fs_info, block_group); + if (ret) + goto abort; + node = rb_next(node); + } + + btrfs_set_fs_compat_ro(fs_info, FREE_SPACE_TREE); + fs_info->creating_free_space_tree = 0; + + ret = btrfs_commit_transaction(trans, tree_root); + if (ret) + return ret; + + return 0; + +abort: + fs_info->creating_free_space_tree = 0; + btrfs_abort_transaction(trans, tree_root, ret); + btrfs_end_transaction(trans, tree_root); + return ret; +} + +static int clear_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_path *path; + struct btrfs_key key; + int nr; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->leave_spinning = 1; + + key.objectid = 0; + key.type = 0; + key.offset = 0; + + while (1) { + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + + nr = btrfs_header_nritems(path->nodes[0]); + if (!nr) + break; + + path->slots[0] = 0; + ret = btrfs_del_items(trans, root, path, 0, nr); + if (ret) + goto out; + + btrfs_release_path(path); + } + + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *free_space_root = fs_info->free_space_root; + int ret; + + trans = btrfs_start_transaction(tree_root, 0); + if (IS_ERR(trans)) + return PTR_ERR(trans); + + btrfs_clear_fs_compat_ro(fs_info, FREE_SPACE_TREE); + fs_info->free_space_root = NULL; + + ret = clear_free_space_tree(trans, free_space_root); + if (ret) + goto abort; + + ret = btrfs_del_root(trans, tree_root, &free_space_root->root_key); + if (ret) + goto abort; + + list_del(&free_space_root->dirty_list); + + btrfs_tree_lock(free_space_root->node); + clean_tree_block(trans, tree_root->fs_info, free_space_root->node); + btrfs_tree_unlock(free_space_root->node); + btrfs_free_tree_block(trans, free_space_root, free_space_root->node, + 0, 1); + + free_extent_buffer(free_space_root->node); + free_extent_buffer(free_space_root->commit_root); + kfree(free_space_root); + + ret = btrfs_commit_transaction(trans, tree_root); + if (ret) + return ret; + + return 0; + +abort: + btrfs_abort_transaction(trans, tree_root, ret); + btrfs_end_transaction(trans, tree_root); + return ret; +} + +static int __add_block_group_free_space(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path) +{ + u64 start, end; + int ret; + + start = block_group->key.objectid; + end = block_group->key.objectid + block_group->key.offset; + + block_group->needs_free_space = 0; + + ret = add_new_free_space_info(trans, fs_info, block_group, path); + if (ret) + return ret; + + return __add_to_free_space_tree(trans, fs_info, block_group, path, + block_group->key.objectid, + block_group->key.offset); +} + +int add_block_group_free_space(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group) +{ + struct btrfs_path *path = NULL; + int ret = 0; + + if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + return 0; + + mutex_lock(&block_group->free_space_lock); + if (!block_group->needs_free_space) + goto out; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + ret = __add_block_group_free_space(trans, fs_info, block_group, path); + +out: + btrfs_free_path(path); + mutex_unlock(&block_group->free_space_lock); + if (ret) + btrfs_abort_transaction(trans, fs_info->free_space_root, ret); + return ret; +} + +int remove_block_group_free_space(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group) +{ + struct btrfs_root *root = fs_info->free_space_root; + struct btrfs_path *path; + struct btrfs_key key, found_key; + struct extent_buffer *leaf; + u64 start, end; + int done = 0, nr; + int ret; + + if (!btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + return 0; + + if (block_group->needs_free_space) { + /* We never added this block group to the free space tree. */ + return 0; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + start = block_group->key.objectid; + end = block_group->key.objectid + block_group->key.offset; + + key.objectid = end - 1; + key.type = (u8)-1; + key.offset = (u64)-1; + + while (!done) { + ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); + if (ret) + goto out; + + leaf = path->nodes[0]; + nr = 0; + path->slots[0]++; + while (path->slots[0] > 0) { + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0] - 1); + + if (found_key.type == BTRFS_FREE_SPACE_INFO_KEY) { + ASSERT(found_key.objectid == block_group->key.objectid); + ASSERT(found_key.offset == block_group->key.offset); + done = 1; + nr++; + path->slots[0]--; + break; + } else if (found_key.type == BTRFS_FREE_SPACE_EXTENT_KEY || + found_key.type == BTRFS_FREE_SPACE_BITMAP_KEY) { + ASSERT(found_key.objectid >= start); + ASSERT(found_key.objectid < end); + ASSERT(found_key.objectid + found_key.offset <= end); + nr++; + path->slots[0]--; + } else { + ASSERT(0); + } + } + + ret = btrfs_del_items(trans, root, path, path->slots[0], nr); + if (ret) + goto out; + btrfs_release_path(path); + } + + ret = 0; +out: + btrfs_free_path(path); + if (ret) + btrfs_abort_transaction(trans, root, ret); + return ret; +} + +static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, + struct btrfs_path *path, + u32 expected_extent_count) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_fs_info *fs_info; + struct btrfs_root *root; + struct btrfs_key key; + int prev_bit = 0, bit; + /* Initialize to silence GCC. */ + u64 extent_start = 0; + u64 end, offset; + u64 total_found = 0; + u32 extent_count = 0; + int ret; + + block_group = caching_ctl->block_group; + fs_info = block_group->fs_info; + root = fs_info->free_space_root; + + end = block_group->key.objectid + block_group->key.offset; + + while (1) { + ret = btrfs_next_item(root, path); + if (ret < 0) + goto out; + if (ret) + break; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type == BTRFS_FREE_SPACE_INFO_KEY) + break; + + ASSERT(key.type == BTRFS_FREE_SPACE_BITMAP_KEY); + ASSERT(key.objectid < end && key.objectid + key.offset <= end); + + caching_ctl->progress = key.objectid; + + offset = key.objectid; + while (offset < key.objectid + key.offset) { + bit = free_space_test_bit(block_group, path, offset); + if (prev_bit == 0 && bit == 1) { + extent_start = offset; + } else if (prev_bit == 1 && bit == 0) { + total_found += add_new_free_space(block_group, + fs_info, + extent_start, + offset); + if (total_found > CACHING_CTL_WAKE_UP) { + total_found = 0; + wake_up(&caching_ctl->wait); + } + extent_count++; + } + prev_bit = bit; + offset += block_group->sectorsize; + } + } + if (prev_bit == 1) { + total_found += add_new_free_space(block_group, fs_info, + extent_start, end); + extent_count++; + } + + if (extent_count != expected_extent_count) { + btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", + block_group->key.objectid, extent_count, + expected_extent_count); + ASSERT(0); + ret = -EIO; + goto out; + } + + caching_ctl->progress = (u64)-1; + + ret = 0; +out: + return ret; +} + +static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, + struct btrfs_path *path, + u32 expected_extent_count) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_fs_info *fs_info; + struct btrfs_root *root; + struct btrfs_key key; + u64 end; + u64 total_found = 0; + u32 extent_count = 0; + int ret; + + block_group = caching_ctl->block_group; + fs_info = block_group->fs_info; + root = fs_info->free_space_root; + + end = block_group->key.objectid + block_group->key.offset; + + while (1) { + ret = btrfs_next_item(root, path); + if (ret < 0) + goto out; + if (ret) + break; + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type == BTRFS_FREE_SPACE_INFO_KEY) + break; + + ASSERT(key.type == BTRFS_FREE_SPACE_EXTENT_KEY); + ASSERT(key.objectid < end && key.objectid + key.offset <= end); + + caching_ctl->progress = key.objectid; + + total_found += add_new_free_space(block_group, fs_info, + key.objectid, + key.objectid + key.offset); + if (total_found > CACHING_CTL_WAKE_UP) { + total_found = 0; + wake_up(&caching_ctl->wait); + } + extent_count++; + } + + if (extent_count != expected_extent_count) { + btrfs_err(fs_info, "incorrect extent count for %llu; counted %u, expected %u", + block_group->key.objectid, extent_count, + expected_extent_count); + ASSERT(0); + ret = -EIO; + goto out; + } + + caching_ctl->progress = (u64)-1; + + ret = 0; +out: + return ret; +} + +int load_free_space_tree(struct btrfs_caching_control *caching_ctl) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_fs_info *fs_info; + struct btrfs_free_space_info *info; + struct btrfs_path *path; + u32 extent_count, flags; + int ret; + + block_group = caching_ctl->block_group; + fs_info = block_group->fs_info; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* + * Just like caching_thread() doesn't want to deadlock on the extent + * tree, we don't want to deadlock on the free space tree. + */ + path->skip_locking = 1; + path->search_commit_root = 1; + path->reada = 1; + + info = search_free_space_info(NULL, fs_info, block_group, path, 0); + if (IS_ERR(info)) { + ret = PTR_ERR(info); + goto out; + } + extent_count = btrfs_free_space_extent_count(path->nodes[0], info); + flags = btrfs_free_space_flags(path->nodes[0], info); + + /* + * We left path pointing to the free space info item, so now + * load_free_space_foo can just iterate through the free space tree from + * there. + */ + if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) + ret = load_free_space_bitmaps(caching_ctl, path, extent_count); + else + ret = load_free_space_extents(caching_ctl, path, extent_count); + +out: + btrfs_free_path(path); + return ret; +} diff --git a/fs/btrfs/free-space-tree.h b/fs/btrfs/free-space-tree.h new file mode 100644 index 000000000..54ffced3b --- /dev/null +++ b/fs/btrfs/free-space-tree.h @@ -0,0 +1,72 @@ +/* + * Copyright (C) 2015 Facebook. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_FREE_SPACE_TREE +#define __BTRFS_FREE_SPACE_TREE + +/* + * The default size for new free space bitmap items. The last bitmap in a block + * group may be truncated, and none of the free space tree code assumes that + * existing bitmaps are this size. + */ +#define BTRFS_FREE_SPACE_BITMAP_SIZE 256 +#define BTRFS_FREE_SPACE_BITMAP_BITS (BTRFS_FREE_SPACE_BITMAP_SIZE * BITS_PER_BYTE) + +void set_free_space_tree_thresholds(struct btrfs_block_group_cache *block_group); +int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info); +int btrfs_clear_free_space_tree(struct btrfs_fs_info *fs_info); +int load_free_space_tree(struct btrfs_caching_control *caching_ctl); +int add_block_group_free_space(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group); +int remove_block_group_free_space(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group); +int add_to_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + u64 start, u64 size); +int remove_from_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + u64 start, u64 size); + +/* Exposed for testing. */ +struct btrfs_free_space_info * +search_free_space_info(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, int cow); +int __add_to_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, u64 start, u64 size); +int __remove_from_free_space_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, u64 start, u64 size); +int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path); +int convert_free_space_to_extents(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *block_group, + struct btrfs_path *path); +int free_space_test_bit(struct btrfs_block_group_cache *block_group, + struct btrfs_path *path, u64 offset); + +#endif diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 07573dc16..e50316c4a 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -48,7 +48,7 @@ static int caching_kthread(void *data) /* Since the commit root is read-only, we can safely skip locking. */ path->skip_locking = 1; path->search_commit_root = 1; - path->reada = 2; + path->reada = READA_FORWARD; key.objectid = BTRFS_FIRST_FREE_OBJECTID; key.offset = 0; @@ -282,7 +282,7 @@ void btrfs_unpin_free_ino(struct btrfs_root *root) } } -#define INIT_THRESHOLD (((1024 * 32) / 2) / sizeof(struct btrfs_free_space)) +#define INIT_THRESHOLD ((SZ_32K / 2) / sizeof(struct btrfs_free_space)) #define INODES_PER_BITMAP (PAGE_CACHE_SIZE * 8) /* @@ -334,7 +334,7 @@ static bool use_bitmap(struct btrfs_free_space_ctl *ctl, return true; } -static struct btrfs_free_space_op free_ino_op = { +static const struct btrfs_free_space_op free_ino_op = { .recalc_thresholds = recalculate_thresholds, .use_bitmap = use_bitmap, }; @@ -356,7 +356,7 @@ static bool pinned_use_bitmap(struct btrfs_free_space_ctl *ctl, return false; } -static struct btrfs_free_space_op pinned_free_ino_op = { +static const struct btrfs_free_space_op pinned_free_ino_op = { .recalc_thresholds = pinned_recalc_thresholds, .use_bitmap = pinned_use_bitmap, }; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4bc9dbf29..d96f5cf38 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -66,6 +66,13 @@ struct btrfs_iget_args { struct btrfs_root *root; }; +struct btrfs_dio_data { + u64 outstanding_extents; + u64 reserve; + u64 unsubmitted_oe_range_start; + u64 unsubmitted_oe_range_end; +}; + static const struct inode_operations btrfs_dir_inode_operations; static const struct inode_operations btrfs_symlink_inode_operations; static const struct inode_operations btrfs_dir_ro_inode_operations; @@ -74,17 +81,16 @@ static const struct inode_operations btrfs_file_inode_operations; static const struct address_space_operations btrfs_aops; static const struct address_space_operations btrfs_symlink_aops; static const struct file_operations btrfs_dir_file_operations; -static struct extent_io_ops btrfs_extent_io_ops; +static const struct extent_io_ops btrfs_extent_io_ops; static struct kmem_cache *btrfs_inode_cachep; -static struct kmem_cache *btrfs_delalloc_work_cachep; struct kmem_cache *btrfs_trans_handle_cachep; struct kmem_cache *btrfs_transaction_cachep; struct kmem_cache *btrfs_path_cachep; struct kmem_cache *btrfs_free_space_cachep; #define S_SHIFT 12 -static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { +static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE, [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR, [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV, @@ -414,15 +420,15 @@ static noinline void compress_file_range(struct inode *inode, unsigned long nr_pages_ret = 0; unsigned long total_compressed = 0; unsigned long total_in = 0; - unsigned long max_compressed = 128 * 1024; - unsigned long max_uncompressed = 128 * 1024; + unsigned long max_compressed = SZ_128K; + unsigned long max_uncompressed = SZ_128K; int i; int will_compress; int compress_type = root->fs_info->compress_type; int redirty = 0; /* if this is a small write inside eof, kick off a defrag */ - if ((end - start + 1) < 16 * 1024 && + if ((end - start + 1) < SZ_16K && (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) btrfs_add_inode_defrag(NULL, inode); @@ -430,7 +436,7 @@ static noinline void compress_file_range(struct inode *inode, again: will_compress = 0; nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1; - nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE); + nr_pages = min_t(unsigned long, nr_pages, SZ_128K / PAGE_CACHE_SIZE); /* * we don't want to send crud past the end of i_size through @@ -944,7 +950,7 @@ static noinline int cow_file_range(struct inode *inode, disk_num_bytes = num_bytes; /* if this is a small write inside eof, kick off defrag */ - if (num_bytes < 64 * 1024 && + if (num_bytes < SZ_64K && (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size)) btrfs_add_inode_defrag(NULL, inode); @@ -1107,7 +1113,7 @@ static noinline void async_cow_submit(struct btrfs_work *work) * atomic_sub_return implies a barrier for waitqueue_active */ if (atomic_sub_return(nr_pages, &root->fs_info->async_delalloc_pages) < - 5 * 1024 * 1024 && + 5 * SZ_1M && waitqueue_active(&root->fs_info->async_submit_wait)) wake_up(&root->fs_info->async_submit_wait); @@ -1132,7 +1138,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, struct btrfs_root *root = BTRFS_I(inode)->root; unsigned long nr_pages; u64 cur_end; - int limit = 10 * 1024 * 1024; + int limit = 10 * SZ_1M; clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED, 1, 0, NULL, GFP_NOFS); @@ -1148,7 +1154,7 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, !btrfs_test_opt(root, FORCE_COMPRESS)) cur_end = end; else - cur_end = min(end, start + 512 * 1024 - 1); + cur_end = min(end, start + SZ_512K - 1); async_cow->end = cur_end; INIT_LIST_HEAD(&async_cow->extents); @@ -1989,7 +1995,7 @@ again: page_start = page_offset(page); page_end = page_offset(page) + PAGE_CACHE_SIZE - 1; - lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, 0, + lock_extent_bits(&BTRFS_I(inode)->io_tree, page_start, page_end, &cached_state); /* already ordered? We're done */ @@ -2482,7 +2488,7 @@ static noinline int relink_extent_backref(struct btrfs_path *path, lock_start = backref->file_pos; lock_end = backref->file_pos + backref->num_bytes - 1; lock_extent_bits(&BTRFS_I(inode)->io_tree, lock_start, lock_end, - 0, &cached); + &cached); ordered = btrfs_lookup_first_ordered_extent(inode, lock_end); if (ordered) { @@ -2874,7 +2880,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) lock_extent_bits(io_tree, ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len - 1, - 0, &cached_state); + &cached_state); ret = test_range_bit(io_tree, ordered_extent->file_offset, ordered_extent->file_offset + ordered_extent->len - 1, @@ -3106,52 +3112,46 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio, start, (size_t)(end - start + 1)); } -struct delayed_iput { - struct list_head list; - struct inode *inode; -}; - -/* JDM: If this is fs-wide, why can't we add a pointer to - * btrfs_inode instead and avoid the allocation? */ void btrfs_add_delayed_iput(struct inode *inode) { struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; - struct delayed_iput *delayed; + struct btrfs_inode *binode = BTRFS_I(inode); if (atomic_add_unless(&inode->i_count, -1, 1)) return; - delayed = kmalloc(sizeof(*delayed), GFP_NOFS | __GFP_NOFAIL); - delayed->inode = inode; - spin_lock(&fs_info->delayed_iput_lock); - list_add_tail(&delayed->list, &fs_info->delayed_iputs); + if (binode->delayed_iput_count == 0) { + ASSERT(list_empty(&binode->delayed_iput)); + list_add_tail(&binode->delayed_iput, &fs_info->delayed_iputs); + } else { + binode->delayed_iput_count++; + } spin_unlock(&fs_info->delayed_iput_lock); } void btrfs_run_delayed_iputs(struct btrfs_root *root) { - LIST_HEAD(list); struct btrfs_fs_info *fs_info = root->fs_info; - struct delayed_iput *delayed; - int empty; spin_lock(&fs_info->delayed_iput_lock); - empty = list_empty(&fs_info->delayed_iputs); - spin_unlock(&fs_info->delayed_iput_lock); - if (empty) - return; - - spin_lock(&fs_info->delayed_iput_lock); - list_splice_init(&fs_info->delayed_iputs, &list); - spin_unlock(&fs_info->delayed_iput_lock); - - while (!list_empty(&list)) { - delayed = list_entry(list.next, struct delayed_iput, list); - list_del(&delayed->list); - iput(delayed->inode); - kfree(delayed); + while (!list_empty(&fs_info->delayed_iputs)) { + struct btrfs_inode *inode; + + inode = list_first_entry(&fs_info->delayed_iputs, + struct btrfs_inode, delayed_iput); + if (inode->delayed_iput_count) { + inode->delayed_iput_count--; + list_move_tail(&inode->delayed_iput, + &fs_info->delayed_iputs); + } else { + list_del_init(&inode->delayed_iput); + } + spin_unlock(&fs_info->delayed_iput_lock); + iput(&inode->vfs_inode); + spin_lock(&fs_info->delayed_iput_lock); } + spin_unlock(&fs_info->delayed_iput_lock); } /* @@ -3347,7 +3347,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) ret = -ENOMEM; goto out; } - path->reada = -1; + path->reada = READA_BACK; key.objectid = BTRFS_ORPHAN_OBJECTID; key.type = BTRFS_ORPHAN_ITEM_KEY; @@ -3546,10 +3546,10 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, int scanned = 0; if (!xattr_access) { - xattr_access = btrfs_name_hash(POSIX_ACL_XATTR_ACCESS, - strlen(POSIX_ACL_XATTR_ACCESS)); - xattr_default = btrfs_name_hash(POSIX_ACL_XATTR_DEFAULT, - strlen(POSIX_ACL_XATTR_DEFAULT)); + xattr_access = btrfs_name_hash(XATTR_NAME_POSIX_ACL_ACCESS, + strlen(XATTR_NAME_POSIX_ACL_ACCESS)); + xattr_default = btrfs_name_hash(XATTR_NAME_POSIX_ACL_DEFAULT, + strlen(XATTR_NAME_POSIX_ACL_DEFAULT)); } slot++; @@ -3770,6 +3770,7 @@ cache_acl: break; case S_IFLNK: inode->i_op = &btrfs_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &btrfs_symlink_aops; break; default: @@ -4313,7 +4314,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = -1; + path->reada = READA_BACK; /* * We want to drop from the next block forward in case this new size is @@ -4344,7 +4345,7 @@ search_again: * up a huge file in a single leaf. Most of the time that * bytes_deleted is > 0, it will be huge by the time we get here */ - if (be_nice && bytes_deleted > 32 * 1024 * 1024) { + if (be_nice && bytes_deleted > SZ_32M) { if (btrfs_should_end_transaction(trans, root)) { err = -EAGAIN; goto error; @@ -4587,7 +4588,7 @@ error: btrfs_free_path(path); - if (be_nice && bytes_deleted > 32 * 1024 * 1024) { + if (be_nice && bytes_deleted > SZ_32M) { unsigned long updates = trans->delayed_ref_updates; if (updates) { trans->delayed_ref_updates = 0; @@ -4664,7 +4665,7 @@ again: } wait_on_page_writeback(page); - lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state); + lock_extent_bits(io_tree, page_start, page_end, &cached_state); set_page_extent_mapped(page); ordered = btrfs_lookup_ordered_extent(inode, page_start); @@ -4795,7 +4796,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t oldsize, loff_t size) while (1) { struct btrfs_ordered_extent *ordered; - lock_extent_bits(io_tree, hole_start, block_end - 1, 0, + lock_extent_bits(io_tree, hole_start, block_end - 1, &cached_state); ordered = btrfs_lookup_ordered_range(inode, hole_start, block_end - hole_start); @@ -4871,26 +4872,6 @@ next: return err; } -static int wait_snapshoting_atomic_t(atomic_t *a) -{ - schedule(); - return 0; -} - -static void wait_for_snapshot_creation(struct btrfs_root *root) -{ - while (true) { - int ret; - - ret = btrfs_start_write_no_snapshoting(root); - if (ret) - break; - wait_on_atomic_t(&root->will_be_snapshoted, - wait_snapshoting_atomic_t, - TASK_UNINTERRUPTIBLE); - } -} - static int btrfs_setsize(struct inode *inode, struct iattr *attr) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -4922,7 +4903,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) * truncation, it must capture all writes that happened before * this truncation. */ - wait_for_snapshot_creation(root); + btrfs_wait_for_snapshot_creation(root); ret = btrfs_cont_expand(inode, oldsize, newsize); if (ret) { btrfs_end_write_no_snapshoting(root); @@ -5107,7 +5088,7 @@ static void evict_inode_truncate_pages(struct inode *inode) end = state->end; spin_unlock(&io_tree->lock); - lock_extent_bits(io_tree, start, end, 0, &cached_state); + lock_extent_bits(io_tree, start, end, &cached_state); /* * If still has DELALLOC flag, the extent didn't reach disk, @@ -5300,7 +5281,6 @@ void btrfs_evict_inode(struct inode *inode) no_delete: btrfs_remove_delayed_node(inode); clear_inode(inode); - return; } /* @@ -5750,7 +5730,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) if (!path) return -ENOMEM; - path->reada = 1; + path->reada = READA_FORWARD; if (key_type == BTRFS_DIR_INDEX_KEY) { INIT_LIST_HEAD(&ins_list); @@ -6697,7 +6677,7 @@ static int merge_extent_mapping(struct extent_map_tree *em_tree, } static noinline int uncompress_inline(struct btrfs_path *path, - struct inode *inode, struct page *page, + struct page *page, size_t pg_offset, u64 extent_offset, struct btrfs_file_extent_item *item) { @@ -6794,7 +6774,7 @@ again: * Chances are we'll be called again, so go ahead and do * readahead */ - path->reada = 1; + path->reada = READA_FORWARD; } ret = btrfs_lookup_file_extent(trans, root, path, @@ -6893,8 +6873,7 @@ next: if (create == 0 && !PageUptodate(page)) { if (btrfs_file_extent_compression(leaf, item) != BTRFS_COMPRESS_NONE) { - ret = uncompress_inline(path, inode, page, - pg_offset, + ret = uncompress_inline(path, page, pg_offset, extent_offset, item); if (ret) { err = ret; @@ -7149,21 +7128,41 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode, if (ret) return ERR_PTR(ret); - em = create_pinned_em(inode, start, ins.offset, start, ins.objectid, - ins.offset, ins.offset, ins.offset, 0); - if (IS_ERR(em)) { - btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); - return em; - } - + /* + * Create the ordered extent before the extent map. This is to avoid + * races with the fast fsync path that would lead to it logging file + * extent items that point to disk extents that were not yet written to. + * The fast fsync path collects ordered extents into a local list and + * then collects all the new extent maps, so we must create the ordered + * extent first and make sure the fast fsync path collects any new + * ordered extents after collecting new extent maps as well. + * The fsync path simply can not rely on inode_dio_wait() because it + * causes deadlock with AIO. + */ ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid, ins.offset, ins.offset, 0); if (ret) { btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); - free_extent_map(em); return ERR_PTR(ret); } + em = create_pinned_em(inode, start, ins.offset, start, ins.objectid, + ins.offset, ins.offset, ins.offset, 0); + if (IS_ERR(em)) { + struct btrfs_ordered_extent *oe; + + btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); + oe = btrfs_lookup_ordered_extent(inode, start); + ASSERT(oe); + if (WARN_ON(!oe)) + return em; + set_bit(BTRFS_ORDERED_IOERR, &oe->flags); + set_bit(BTRFS_ORDERED_IO_DONE, &oe->flags); + btrfs_remove_ordered_extent(inode, oe); + /* Once for our lookup and once for the ordered extents tree. */ + btrfs_put_ordered_extent(oe); + btrfs_put_ordered_extent(oe); + } return em; } @@ -7390,7 +7389,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, while (1) { lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, - 0, cached_state); + cached_state); /* * We're concerned with the entire range that we're going to be * doing DIO to, so we need to make sure theres no ordered @@ -7418,25 +7417,21 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, btrfs_start_ordered_extent(inode, ordered, 1); btrfs_put_ordered_extent(ordered); } else { - /* Screw you mmap */ - ret = btrfs_fdatawrite_range(inode, lockstart, lockend); - if (ret) - break; - ret = filemap_fdatawait_range(inode->i_mapping, - lockstart, - lockend); - if (ret) - break; - /* - * If we found a page that couldn't be invalidated just - * fall back to buffered. + * We could trigger writeback for this range (and wait + * for it to complete) and then invalidate the pages for + * this range (through invalidate_inode_pages2_range()), + * but that can lead us to a deadlock with a concurrent + * call to readpages() (a buffered read or a defrag call + * triggered a readahead) on a page lock due to an + * ordered dio extent we created before but did not have + * yet a corresponding bio submitted (whence it can not + * complete), which makes readpages() wait for that + * ordered extent to complete while holding a lock on + * that page. */ - ret = invalidate_inode_pages2_range(inode->i_mapping, - lockstart >> PAGE_CACHE_SHIFT, - lockend >> PAGE_CACHE_SHIFT); - if (ret) - break; + ret = -ENOTBLK; + break; } cond_resched(); @@ -7492,11 +7487,6 @@ static struct extent_map *create_pinned_em(struct inode *inode, u64 start, return em; } -struct btrfs_dio_data { - u64 outstanding_extents; - u64 reserve; -}; - static void adjust_dio_outstanding_extents(struct inode *inode, struct btrfs_dio_data *dio_data, const u64 len) @@ -7680,6 +7670,7 @@ unlock: btrfs_free_reserved_data_space(inode, start, len); WARN_ON(dio_data->reserve < len); dio_data->reserve -= len; + dio_data->unsubmitted_oe_range_end = start + len; current->journal_info = dio_data; } @@ -8003,22 +7994,22 @@ static void btrfs_endio_direct_read(struct bio *bio) bio_put(bio); } -static void btrfs_endio_direct_write(struct bio *bio) +static void btrfs_endio_direct_write_update_ordered(struct inode *inode, + const u64 offset, + const u64 bytes, + const int uptodate) { - struct btrfs_dio_private *dip = bio->bi_private; - struct inode *inode = dip->inode; struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ordered_extent *ordered = NULL; - u64 ordered_offset = dip->logical_offset; - u64 ordered_bytes = dip->bytes; - struct bio *dio_bio; + u64 ordered_offset = offset; + u64 ordered_bytes = bytes; int ret; again: ret = btrfs_dec_test_first_ordered_pending(inode, &ordered, &ordered_offset, ordered_bytes, - !bio->bi_error); + uptodate); if (!ret) goto out_test; @@ -8031,13 +8022,22 @@ out_test: * our bio might span multiple ordered extents. If we haven't * completed the accounting for the whole dio, go back and try again */ - if (ordered_offset < dip->logical_offset + dip->bytes) { - ordered_bytes = dip->logical_offset + dip->bytes - - ordered_offset; + if (ordered_offset < offset + bytes) { + ordered_bytes = offset + bytes - ordered_offset; ordered = NULL; goto again; } - dio_bio = dip->dio_bio; +} + +static void btrfs_endio_direct_write(struct bio *bio) +{ + struct btrfs_dio_private *dip = bio->bi_private; + struct bio *dio_bio = dip->dio_bio; + + btrfs_endio_direct_write_update_ordered(dip->inode, + dip->logical_offset, + dip->bytes, + !bio->bi_error); kfree(dip); @@ -8346,6 +8346,21 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, dip->subio_endio = btrfs_subio_endio_read; } + /* + * Reset the range for unsubmitted ordered extents (to a 0 length range) + * even if we fail to submit a bio, because in such case we do the + * corresponding error handling below and it must not be done a second + * time by btrfs_direct_IO(). + */ + if (write) { + struct btrfs_dio_data *dio_data = current->journal_info; + + dio_data->unsubmitted_oe_range_end = dip->logical_offset + + dip->bytes; + dio_data->unsubmitted_oe_range_start = + dio_data->unsubmitted_oe_range_end; + } + ret = btrfs_submit_direct_hook(rw, dip, skip_sum); if (!ret) return; @@ -8374,24 +8389,15 @@ free_ordered: dip = NULL; io_bio = NULL; } else { - if (write) { - struct btrfs_ordered_extent *ordered; - - ordered = btrfs_lookup_ordered_extent(inode, - file_offset); - set_bit(BTRFS_ORDERED_IOERR, &ordered->flags); - /* - * Decrements our ref on the ordered extent and removes - * the ordered extent from the inode's ordered tree, - * doing all the proper resource cleanup such as for the - * reserved space and waking up any waiters for this - * ordered extent (through btrfs_remove_ordered_extent). - */ - btrfs_finish_ordered_io(ordered); - } else { + if (write) + btrfs_endio_direct_write_update_ordered(inode, + file_offset, + dio_bio->bi_iter.bi_size, + 0); + else unlock_extent(&BTRFS_I(inode)->io_tree, file_offset, file_offset + dio_bio->bi_iter.bi_size - 1); - } + dio_bio->bi_error = -EIO; /* * Releases and cleans up our dio_bio, no need to bio_put() @@ -8475,7 +8481,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, * not unlock the i_mutex at this case. */ if (offset + count <= inode->i_size) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); relock = true; } ret = btrfs_delalloc_reserve_space(inode, offset, count); @@ -8491,6 +8497,8 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, * originally calculated. Abuse current->journal_info for this. */ dio_data.reserve = round_up(count, root->sectorsize); + dio_data.unsubmitted_oe_range_start = (u64)offset; + dio_data.unsubmitted_oe_range_end = (u64)offset; current->journal_info = &dio_data; } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, &BTRFS_I(inode)->runtime_flags)) { @@ -8509,6 +8517,19 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, if (dio_data.reserve) btrfs_delalloc_release_space(inode, offset, dio_data.reserve); + /* + * On error we might have left some ordered extents + * without submitting corresponding bios for them, so + * cleanup them up to avoid other tasks getting them + * and waiting for them to complete forever. + */ + if (dio_data.unsubmitted_oe_range_start < + dio_data.unsubmitted_oe_range_end) + btrfs_endio_direct_write_update_ordered(inode, + dio_data.unsubmitted_oe_range_start, + dio_data.unsubmitted_oe_range_end - + dio_data.unsubmitted_oe_range_start, + 0); } else if (ret >= 0 && (size_t)ret < count) btrfs_delalloc_release_space(inode, offset, count - (size_t)ret); @@ -8517,7 +8538,7 @@ out: if (wakeup) inode_dio_end(inode); if (relock) - mutex_lock(&inode->i_mutex); + inode_lock(inode); return ret; } @@ -8639,7 +8660,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, } if (!inode_evicting) - lock_extent_bits(tree, page_start, page_end, 0, &cached_state); + lock_extent_bits(tree, page_start, page_end, &cached_state); ordered = btrfs_lookup_ordered_extent(inode, page_start); if (ordered) { /* @@ -8677,7 +8698,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, btrfs_put_ordered_extent(ordered); if (!inode_evicting) { cached_state = NULL; - lock_extent_bits(tree, page_start, page_end, 0, + lock_extent_bits(tree, page_start, page_end, &cached_state); } } @@ -8775,7 +8796,7 @@ again: } wait_on_page_writeback(page); - lock_extent_bits(io_tree, page_start, page_end, 0, &cached_state); + lock_extent_bits(io_tree, page_start, page_end, &cached_state); set_page_extent_mapped(page); /* @@ -9049,6 +9070,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->dir_index = 0; ei->last_unlink_trans = 0; ei->last_log_commit = 0; + ei->delayed_iput_count = 0; spin_lock_init(&ei->lock); ei->outstanding_extents = 0; @@ -9073,6 +9095,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) mutex_init(&ei->delalloc_mutex); btrfs_ordered_inode_tree_init(&ei->ordered_tree); INIT_LIST_HEAD(&ei->delalloc_inodes); + INIT_LIST_HEAD(&ei->delayed_iput); RB_CLEAR_NODE(&ei->rb_node); return inode; @@ -9177,15 +9200,14 @@ void btrfs_destroy_cachep(void) kmem_cache_destroy(btrfs_path_cachep); if (btrfs_free_space_cachep) kmem_cache_destroy(btrfs_free_space_cachep); - if (btrfs_delalloc_work_cachep) - kmem_cache_destroy(btrfs_delalloc_work_cachep); } int btrfs_init_cachep(void) { btrfs_inode_cachep = kmem_cache_create("btrfs_inode", sizeof(struct btrfs_inode), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once); + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT, + init_once); if (!btrfs_inode_cachep) goto fail; @@ -9213,13 +9235,6 @@ int btrfs_init_cachep(void) if (!btrfs_free_space_cachep) goto fail; - btrfs_delalloc_work_cachep = kmem_cache_create("btrfs_delalloc_work", - sizeof(struct btrfs_delalloc_work), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, - NULL); - if (!btrfs_delalloc_work_cachep) - goto fail; - return 0; fail: btrfs_destroy_cachep(); @@ -9443,14 +9458,10 @@ static void btrfs_run_delalloc_work(struct btrfs_work *work) delalloc_work = container_of(work, struct btrfs_delalloc_work, work); inode = delalloc_work->inode; - if (delalloc_work->wait) { - btrfs_wait_ordered_range(inode, 0, (u64)-1); - } else { + filemap_flush(inode->i_mapping); + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) filemap_flush(inode->i_mapping); - if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) - filemap_flush(inode->i_mapping); - } if (delalloc_work->delay_iput) btrfs_add_delayed_iput(inode); @@ -9460,18 +9471,17 @@ static void btrfs_run_delalloc_work(struct btrfs_work *work) } struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, - int wait, int delay_iput) + int delay_iput) { struct btrfs_delalloc_work *work; - work = kmem_cache_zalloc(btrfs_delalloc_work_cachep, GFP_NOFS); + work = kmalloc(sizeof(*work), GFP_NOFS); if (!work) return NULL; init_completion(&work->completion); INIT_LIST_HEAD(&work->list); work->inode = inode; - work->wait = wait; work->delay_iput = delay_iput; WARN_ON_ONCE(!inode); btrfs_init_work(&work->work, btrfs_flush_delalloc_helper, @@ -9483,7 +9493,7 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode, void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work) { wait_for_completion(&work->completion); - kmem_cache_free(btrfs_delalloc_work_cachep, work); + kfree(work); } /* @@ -9519,7 +9529,7 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput, } spin_unlock(&root->delalloc_lock); - work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); + work = btrfs_alloc_delalloc_work(inode, delay_iput); if (!work) { if (delay_iput) btrfs_add_delayed_iput(inode); @@ -9696,10 +9706,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, if (err) goto out_unlock_inode; - err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); - if (err) - goto out_unlock_inode; - path = btrfs_alloc_path(); if (!path) { err = -ENOMEM; @@ -9732,10 +9738,18 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, btrfs_free_path(path); inode->i_op = &btrfs_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &btrfs_symlink_aops; inode_set_bytes(inode, name_len); btrfs_i_size_write(inode, name_len); err = btrfs_update_inode(trans, root, inode); + /* + * Last step, add directory indexes for our symlink inode. This is the + * last step to avoid extra cleanup of these indexes if an error happens + * elsewhere above. + */ + if (!err) + err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) { drop_inode = 1; goto out_unlock_inode; @@ -9786,7 +9800,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, } } - cur_bytes = min(num_bytes, 256ULL * 1024 * 1024); + cur_bytes = min_t(u64, num_bytes, SZ_256M); cur_bytes = max(cur_bytes, min_size); /* * If we are severely fragmented we could end up with really @@ -10021,7 +10035,7 @@ static const struct inode_operations btrfs_dir_inode_operations = { .setattr = btrfs_setattr, .mknod = btrfs_mknod, .setxattr = btrfs_setxattr, - .getxattr = btrfs_getxattr, + .getxattr = generic_getxattr, .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, .permission = btrfs_permission, @@ -10050,7 +10064,7 @@ static const struct file_operations btrfs_dir_file_operations = { .fsync = btrfs_sync_file, }; -static struct extent_io_ops btrfs_extent_io_ops = { +static const struct extent_io_ops btrfs_extent_io_ops = { .fill_delalloc = run_delalloc_range, .submit_bio_hook = btrfs_submit_bio_hook, .merge_bio_hook = btrfs_merge_bio_hook, @@ -10098,7 +10112,7 @@ static const struct inode_operations btrfs_file_inode_operations = { .getattr = btrfs_getattr, .setattr = btrfs_setattr, .setxattr = btrfs_setxattr, - .getxattr = btrfs_getxattr, + .getxattr = generic_getxattr, .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, .permission = btrfs_permission, @@ -10112,7 +10126,7 @@ static const struct inode_operations btrfs_special_inode_operations = { .setattr = btrfs_setattr, .permission = btrfs_permission, .setxattr = btrfs_setxattr, - .getxattr = btrfs_getxattr, + .getxattr = generic_getxattr, .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, .get_acl = btrfs_get_acl, @@ -10121,13 +10135,12 @@ static const struct inode_operations btrfs_special_inode_operations = { }; static const struct inode_operations btrfs_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, + .get_link = page_get_link, .getattr = btrfs_getattr, .setattr = btrfs_setattr, .permission = btrfs_permission, .setxattr = btrfs_setxattr, - .getxattr = btrfs_getxattr, + .getxattr = generic_getxattr, .listxattr = btrfs_listxattr, .removexattr = btrfs_removexattr, .update_time = btrfs_update_time, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f07d01bc4..48aee9846 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -240,7 +240,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ip_oldflags = ip->flags; i_oldflags = inode->i_flags; @@ -358,7 +358,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) } out_unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); mnt_drop_write_file(file); return ret; } @@ -659,22 +659,28 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state)) return -EINVAL; + pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); + if (!pending_snapshot) + return -ENOMEM; + + pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item), + GFP_NOFS); + pending_snapshot->path = btrfs_alloc_path(); + if (!pending_snapshot->root_item || !pending_snapshot->path) { + ret = -ENOMEM; + goto free_pending; + } + atomic_inc(&root->will_be_snapshoted); smp_mb__after_atomic(); btrfs_wait_for_no_snapshoting_writes(root); ret = btrfs_start_delalloc_inodes(root, 0); if (ret) - goto out; + goto dec_and_free; btrfs_wait_ordered_extents(root, -1); - pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); - if (!pending_snapshot) { - ret = -ENOMEM; - goto out; - } - btrfs_init_block_rsv(&pending_snapshot->block_rsv, BTRFS_BLOCK_RSV_TEMP); /* @@ -690,7 +696,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, &pending_snapshot->qgroup_reserved, false); if (ret) - goto free; + goto dec_and_free; pending_snapshot->dentry = dentry; pending_snapshot->root = root; @@ -741,11 +747,14 @@ fail: btrfs_subvolume_release_metadata(BTRFS_I(dir)->root, &pending_snapshot->block_rsv, pending_snapshot->qgroup_reserved); -free: - kfree(pending_snapshot); -out: +dec_and_free: if (atomic_dec_and_test(&root->will_be_snapshoted)) wake_up_atomic_t(&root->will_be_snapshoted); +free_pending: + kfree(pending_snapshot->root_item); + btrfs_free_path(pending_snapshot->path); + kfree(pending_snapshot); + return ret; } @@ -872,7 +881,7 @@ out_up_read: out_dput: dput(dentry); out_unlock: - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); return error; } @@ -996,7 +1005,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start) u64 end = start + len - 1; /* get the big lock and read metadata off disk */ - lock_extent_bits(io_tree, start, end, 0, &cached); + lock_extent_bits(io_tree, start, end, &cached); em = btrfs_get_extent(inode, NULL, 0, start, len, 0); unlock_extent_cached(io_tree, start, end, &cached, GFP_NOFS); @@ -1020,7 +1029,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em) if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) ret = false; else if ((em->block_start + em->block_len == next->block_start) && - (em->block_len > 128 * 1024 && next->block_len > 128 * 1024)) + (em->block_len > SZ_128K && next->block_len > SZ_128K)) ret = false; free_extent_map(next); @@ -1144,7 +1153,7 @@ again: page_end = page_start + PAGE_CACHE_SIZE - 1; while (1) { lock_extent_bits(tree, page_start, page_end, - 0, &cached_state); + &cached_state); ordered = btrfs_lookup_ordered_extent(inode, page_start); unlock_extent_cached(tree, page_start, page_end, @@ -1204,7 +1213,7 @@ again: page_end = page_offset(pages[i_done - 1]) + PAGE_CACHE_SIZE; lock_extent_bits(&BTRFS_I(inode)->io_tree, - page_start, page_end - 1, 0, &cached_state); + page_start, page_end - 1, &cached_state); clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, page_end - 1, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, @@ -1266,9 +1275,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, int defrag_count = 0; int compress_type = BTRFS_COMPRESS_ZLIB; u32 extent_thresh = range->extent_thresh; - unsigned long max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; + unsigned long max_cluster = SZ_256K >> PAGE_CACHE_SHIFT; unsigned long cluster = max_cluster; - u64 new_align = ~((u64)128 * 1024 - 1); + u64 new_align = ~((u64)SZ_128K - 1); struct page **pages = NULL; if (isize == 0) @@ -1285,7 +1294,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, } if (extent_thresh == 0) - extent_thresh = 256 * 1024; + extent_thresh = SZ_256K; /* * if we were not given a file, allocate a readahead @@ -1317,7 +1326,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, if (newer_than) { ret = find_new_extents(root, inode, newer_than, - &newer_off, 64 * 1024); + &newer_off, SZ_64K); if (!ret) { range->start = newer_off; /* @@ -1384,18 +1393,18 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, ra_index += cluster; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) BTRFS_I(inode)->force_compress = compress_type; ret = cluster_pages_for_defrag(inode, pages, i, cluster); if (ret < 0) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out_ra; } defrag_count += ret; balance_dirty_pages_ratelimited(inode->i_mapping); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (newer_than) { if (newer_off == (u64)-1) @@ -1407,9 +1416,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, newer_off = max(newer_off + 1, (u64)i << PAGE_CACHE_SHIFT); - ret = find_new_extents(root, inode, - newer_than, &newer_off, - 64 * 1024); + ret = find_new_extents(root, inode, newer_than, + &newer_off, SZ_64K); if (!ret) { range->start = newer_off; i = (newer_off & new_align) >> PAGE_CACHE_SHIFT; @@ -1457,9 +1465,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, out_ra: if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } if (!file) kfree(ra); @@ -1575,7 +1583,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, new_size = old_size + new_size; } - if (new_size < 256 * 1024 * 1024) { + if (new_size < SZ_256M) { ret = -EINVAL; goto out_free; } @@ -2164,7 +2172,7 @@ static noinline int btrfs_ioctl_tree_search_v2(struct file *file, struct inode *inode; int ret; size_t buf_size; - const size_t buf_limit = 16 * 1024 * 1024; + const size_t buf_limit = SZ_16M; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -2422,7 +2430,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto out_dput; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * Don't allow to delete a subvolume with send in progress. This is @@ -2535,7 +2543,7 @@ out_up_write: spin_unlock(&dest->root_item_lock); } out_unlock_inode: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!err) { d_invalidate(dentry); btrfs_invalidate_inodes(dest); @@ -2551,7 +2559,7 @@ out_unlock_inode: out_dput: dput(dentry); out_unlock_dir: - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); out_drop_write: mnt_drop_write_file(file); out: @@ -2871,8 +2879,8 @@ static int lock_extent_range(struct inode *inode, u64 off, u64 len, static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2) { - mutex_unlock(&inode1->i_mutex); - mutex_unlock(&inode2->i_mutex); + inode_unlock(inode1); + inode_unlock(inode2); } static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2) @@ -2880,8 +2888,8 @@ static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2) if (inode1 < inode2) swap(inode1, inode2); - mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(inode1, I_MUTEX_PARENT); + inode_lock_nested(inode2, I_MUTEX_CHILD); } static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, @@ -3003,7 +3011,7 @@ static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst, flush_dcache_page(dst_page); if (memcmp(addr, dst_addr, cmp_len)) - ret = BTRFS_SAME_DATA_DIFFERS; + ret = -EBADE; kunmap_atomic(addr); kunmap_atomic(dst_addr); @@ -3055,7 +3063,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, return 0; if (same_inode) { - mutex_lock(&src->i_mutex); + inode_lock(src); ret = extent_same_check_offsets(src, loff, &len, olen); if (ret) @@ -3162,62 +3170,25 @@ again: btrfs_cmp_data_free(&cmp); out_unlock: if (same_inode) - mutex_unlock(&src->i_mutex); + inode_unlock(src); else btrfs_double_inode_unlock(src, dst); return ret; } -#define BTRFS_MAX_DEDUPE_LEN (16 * 1024 * 1024) +#define BTRFS_MAX_DEDUPE_LEN SZ_16M -static long btrfs_ioctl_file_extent_same(struct file *file, - struct btrfs_ioctl_same_args __user *argp) +ssize_t btrfs_dedupe_file_range(struct file *src_file, u64 loff, u64 olen, + struct file *dst_file, u64 dst_loff) { - struct btrfs_ioctl_same_args *same = NULL; - struct btrfs_ioctl_same_extent_info *info; - struct inode *src = file_inode(file); - u64 off; - u64 len; - int i; - int ret; - unsigned long size; + struct inode *src = file_inode(src_file); + struct inode *dst = file_inode(dst_file); u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize; - bool is_admin = capable(CAP_SYS_ADMIN); - u16 count; - - if (!(file->f_mode & FMODE_READ)) - return -EINVAL; + ssize_t res; - ret = mnt_want_write_file(file); - if (ret) - return ret; - - if (get_user(count, &argp->dest_count)) { - ret = -EFAULT; - goto out; - } - - size = offsetof(struct btrfs_ioctl_same_args __user, info[count]); - - same = memdup_user(argp, size); - - if (IS_ERR(same)) { - ret = PTR_ERR(same); - same = NULL; - goto out; - } - - off = same->logical_offset; - len = same->length; - - /* - * Limit the total length we will dedupe for each operation. - * This is intended to bound the total time spent in this - * ioctl to something sane. - */ - if (len > BTRFS_MAX_DEDUPE_LEN) - len = BTRFS_MAX_DEDUPE_LEN; + if (olen > BTRFS_MAX_DEDUPE_LEN) + olen = BTRFS_MAX_DEDUPE_LEN; if (WARN_ON_ONCE(bs < PAGE_CACHE_SIZE)) { /* @@ -3225,58 +3196,13 @@ static long btrfs_ioctl_file_extent_same(struct file *file, * result, btrfs_cmp_data() won't correctly handle * this situation without an update. */ - ret = -EINVAL; - goto out; - } - - ret = -EISDIR; - if (S_ISDIR(src->i_mode)) - goto out; - - ret = -EACCES; - if (!S_ISREG(src->i_mode)) - goto out; - - /* pre-format output fields to sane values */ - for (i = 0; i < count; i++) { - same->info[i].bytes_deduped = 0ULL; - same->info[i].status = 0; - } - - for (i = 0, info = same->info; i < count; i++, info++) { - struct inode *dst; - struct fd dst_file = fdget(info->fd); - if (!dst_file.file) { - info->status = -EBADF; - continue; - } - dst = file_inode(dst_file.file); - - if (!(is_admin || (dst_file.file->f_mode & FMODE_WRITE))) { - info->status = -EINVAL; - } else if (file->f_path.mnt != dst_file.file->f_path.mnt) { - info->status = -EXDEV; - } else if (S_ISDIR(dst->i_mode)) { - info->status = -EISDIR; - } else if (!S_ISREG(dst->i_mode)) { - info->status = -EACCES; - } else { - info->status = btrfs_extent_same(src, off, len, dst, - info->logical_offset); - if (info->status == 0) - info->bytes_deduped += len; - } - fdput(dst_file); + return -EINVAL; } - ret = copy_to_user(argp, same, size); - if (ret) - ret = -EFAULT; - -out: - mnt_drop_write_file(file); - kfree(same); - return ret; + res = btrfs_extent_same(src, loff, olen, dst, dst_loff); + if (res) + return res; + return olen; } static int clone_finish_inode_update(struct btrfs_trans_handle *trans, @@ -3551,7 +3477,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, return ret; } - path->reada = 2; + path->reada = READA_FORWARD; /* clone data */ key.objectid = btrfs_ino(src); key.type = BTRFS_EXTENT_DATA_KEY; @@ -3852,17 +3778,16 @@ out: return ret; } -static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, - u64 off, u64 olen, u64 destoff) +static noinline int btrfs_clone_files(struct file *file, struct file *file_src, + u64 off, u64 olen, u64 destoff) { struct inode *inode = file_inode(file); + struct inode *src = file_inode(file_src); struct btrfs_root *root = BTRFS_I(inode)->root; - struct fd src_file; - struct inode *src; int ret; u64 len = olen; u64 bs = root->fs_info->sb->s_blocksize; - int same_inode = 0; + int same_inode = src == inode; /* * TODO: @@ -3875,54 +3800,25 @@ static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, * be either compressed or non-compressed. */ - /* the destination must be opened for writing */ - if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND)) - return -EINVAL; - if (btrfs_root_readonly(root)) return -EROFS; - ret = mnt_want_write_file(file); - if (ret) - return ret; - - src_file = fdget(srcfd); - if (!src_file.file) { - ret = -EBADF; - goto out_drop_write; - } - - ret = -EXDEV; - if (src_file.file->f_path.mnt != file->f_path.mnt) - goto out_fput; - - src = file_inode(src_file.file); - - ret = -EINVAL; - if (src == inode) - same_inode = 1; - - /* the src must be open for reading */ - if (!(src_file.file->f_mode & FMODE_READ)) - goto out_fput; + if (file_src->f_path.mnt != file->f_path.mnt || + src->i_sb != inode->i_sb) + return -EXDEV; /* don't make the dst file partly checksummed */ if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) != (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) - goto out_fput; + return -EINVAL; - ret = -EISDIR; if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) - goto out_fput; - - ret = -EXDEV; - if (src->i_sb != inode->i_sb) - goto out_fput; + return -EISDIR; if (!same_inode) { btrfs_double_inode_lock(src, inode); } else { - mutex_lock(&src->i_mutex); + inode_lock(src); } /* determine range to clone */ @@ -3999,22 +3895,26 @@ out_unlock: if (!same_inode) btrfs_double_inode_unlock(src, inode); else - mutex_unlock(&src->i_mutex); -out_fput: - fdput(src_file); -out_drop_write: - mnt_drop_write_file(file); + inode_unlock(src); return ret; } -static long btrfs_ioctl_clone_range(struct file *file, void __user *argp) +ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + size_t len, unsigned int flags) { - struct btrfs_ioctl_clone_range_args args; + ssize_t ret; - if (copy_from_user(&args, argp, sizeof(args))) - return -EFAULT; - return btrfs_ioctl_clone(file, args.src_fd, args.src_offset, - args.src_length, args.dest_offset); + ret = btrfs_clone_files(file_out, file_in, pos_in, len, pos_out); + if (ret == 0) + ret = len; + return ret; +} + +int btrfs_clone_file_range(struct file *src_file, loff_t off, + struct file *dst_file, loff_t destoff, u64 len) +{ + return btrfs_clone_files(dst_file, src_file, off, len, destoff); } /* @@ -4226,7 +4126,7 @@ static long btrfs_ioctl_space_info(struct btrfs_root *root, void __user *arg) return -ENOMEM; space_args.total_spaces = 0; - dest = kmalloc(alloc_size, GFP_NOFS); + dest = kmalloc(alloc_size, GFP_KERNEL); if (!dest) return -ENOMEM; dest_orig = dest; @@ -4603,7 +4503,7 @@ static long btrfs_ioctl_logical_to_ino(struct btrfs_root *root, goto out; } - size = min_t(u32, loi->size, 64 * 1024); + size = min_t(u32, loi->size, SZ_64K); inodes = init_data_container(size); if (IS_ERR(inodes)) { ret = PTR_ERR(inodes); @@ -4752,7 +4652,7 @@ locked: goto out_bargs; } - bctl = kzalloc(sizeof(*bctl), GFP_NOFS); + bctl = kzalloc(sizeof(*bctl), GFP_KERNEL); if (!bctl) { ret = -ENOMEM; goto out_bargs; @@ -4838,7 +4738,7 @@ static long btrfs_ioctl_balance_progress(struct btrfs_root *root, goto out; } - bargs = kzalloc(sizeof(*bargs), GFP_NOFS); + bargs = kzalloc(sizeof(*bargs), GFP_KERNEL); if (!bargs) { ret = -ENOMEM; goto out; @@ -5098,7 +4998,7 @@ static long btrfs_ioctl_quota_rescan_status(struct file *file, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - qsa = kzalloc(sizeof(*qsa), GFP_NOFS); + qsa = kzalloc(sizeof(*qsa), GFP_KERNEL); if (!qsa) return -ENOMEM; @@ -5228,7 +5128,7 @@ static long btrfs_ioctl_set_received_subvol_32(struct file *file, goto out; } - args64 = kmalloc(sizeof(*args64), GFP_NOFS); + args64 = kmalloc(sizeof(*args64), GFP_KERNEL); if (!args64) { ret = -ENOMEM; goto out; @@ -5365,7 +5265,7 @@ out_unlock: static int btrfs_ioctl_get_supported_features(struct file *file, void __user *arg) { - static struct btrfs_ioctl_feature_flags features[3] = { + static const struct btrfs_ioctl_feature_flags features[3] = { INIT_FEATURE_FLAGS(SUPP), INIT_FEATURE_FLAGS(SAFE_SET), INIT_FEATURE_FLAGS(SAFE_CLEAR) @@ -5564,10 +5464,6 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_dev_info(root, argp); case BTRFS_IOC_BALANCE: return btrfs_ioctl_balance(file, NULL); - case BTRFS_IOC_CLONE: - return btrfs_ioctl_clone(file, arg, 0, 0, 0); - case BTRFS_IOC_CLONE_RANGE: - return btrfs_ioctl_clone_range(file, argp); case BTRFS_IOC_TRANS_START: return btrfs_ioctl_trans_start(file); case BTRFS_IOC_TRANS_END: @@ -5645,8 +5541,6 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_get_fslabel(file, argp); case BTRFS_IOC_SET_FSLABEL: return btrfs_ioctl_set_fslabel(file, argp); - case BTRFS_IOC_FILE_EXTENT_SAME: - return btrfs_ioctl_file_extent_same(file, argp); case BTRFS_IOC_GET_SUPPORTED_FEATURES: return btrfs_ioctl_get_supported_features(file, argp); case BTRFS_IOC_GET_FEATURES: diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 8077461fc..d13128c70 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -56,7 +56,6 @@ void btrfs_set_lock_blocking_rw(struct extent_buffer *eb, int rw) atomic_dec(&eb->spinning_readers); read_unlock(&eb->lock); } - return; } /* @@ -96,7 +95,6 @@ void btrfs_clear_lock_blocking_rw(struct extent_buffer *eb, int rw) waitqueue_active(&eb->read_lock_wq)) wake_up(&eb->read_lock_wq); } - return; } /* diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 1a33d3eb3..55161369f 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -503,7 +503,6 @@ static void cache_rbio(struct btrfs_raid_bio *rbio) } spin_unlock_irqrestore(&table->cache_lock, flags); - return; } /* @@ -610,13 +609,28 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, return 1; } +static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe, + int index) +{ + return stripe * rbio->stripe_npages + index; +} + +/* + * these are just the pages from the rbio array, not from anything + * the FS sent down to us + */ +static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, + int index) +{ + return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)]; +} + /* * helper to index into the pstripe */ static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index) { - index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; - return rbio->stripe_pages[index]; + return rbio_stripe_page(rbio, rbio->nr_data, index); } /* @@ -627,10 +641,7 @@ static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) { if (rbio->nr_data + 1 == rbio->real_stripes) return NULL; - - index += ((rbio->nr_data + 1) * rbio->stripe_len) >> - PAGE_CACHE_SHIFT; - return rbio->stripe_pages[index]; + return rbio_stripe_page(rbio, rbio->nr_data + 1, index); } /* @@ -890,6 +901,7 @@ static void raid_write_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; int err = bio->bi_error; + int max_errors; if (err) fail_bio_stripe(rbio, bio); @@ -902,11 +914,12 @@ static void raid_write_end_io(struct bio *bio) err = 0; /* OK, we have read all the stripes we need to. */ - if (atomic_read(&rbio->error) > rbio->bbio->max_errors) + max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ? + 0 : rbio->bbio->max_errors; + if (atomic_read(&rbio->error) > max_errors) err = -EIO; rbio_orig_end_io(rbio, err); - return; } /* @@ -949,8 +962,7 @@ static struct page *page_in_rbio(struct btrfs_raid_bio *rbio, */ static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) { - unsigned long nr = stripe_len * nr_stripes; - return DIV_ROUND_UP(nr, PAGE_CACHE_SIZE); + return DIV_ROUND_UP(stripe_len, PAGE_CACHE_SIZE) * nr_stripes; } /* @@ -968,8 +980,8 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, void *p; rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2 + - DIV_ROUND_UP(stripe_npages, BITS_PER_LONG / 8), - GFP_NOFS); + DIV_ROUND_UP(stripe_npages, BITS_PER_LONG) * + sizeof(long), GFP_NOFS); if (!rbio) return ERR_PTR(-ENOMEM); @@ -1023,18 +1035,17 @@ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) if (!page) return -ENOMEM; rbio->stripe_pages[i] = page; - ClearPageUptodate(page); } return 0; } -/* allocate pages for just the p/q stripes */ +/* only allocate pages for p/q stripes */ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) { int i; struct page *page; - i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; + i = rbio_stripe_page_index(rbio, rbio->nr_data, 0); for (; i < rbio->nr_pages; i++) { if (rbio->stripe_pages[i]) @@ -1123,18 +1134,6 @@ static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) } /* - * these are just the pages from the rbio array, not from anything - * the FS sent down to us - */ -static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page) -{ - int index; - index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT); - index += page; - return rbio->stripe_pages[index]; -} - -/* * helper function to walk our bio list and populate the bio_pages array with * the result. This seems expensive, but it is faster than constantly * searching through the bio list as we setup the IO in finish_rmw or stripe @@ -1177,7 +1176,6 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) { struct btrfs_bio *bbio = rbio->bbio; void *pointers[rbio->real_stripes]; - int stripe_len = rbio->stripe_len; int nr_data = rbio->nr_data; int stripe; int pagenr; @@ -1185,7 +1183,6 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) int q_stripe = -1; struct bio_list bio_list; struct bio *bio; - int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT; int ret; bio_list_init(&bio_list); @@ -1228,7 +1225,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) else clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); - for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { struct page *p; /* first collect one page from each data stripe */ for (stripe = 0; stripe < nr_data; stripe++) { @@ -1270,7 +1267,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) * everything else. */ for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { struct page *page; if (stripe < rbio->nr_data) { page = page_in_rbio(rbio, stripe, pagenr, 1); @@ -1294,7 +1291,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) if (!bbio->tgtdev_map[stripe]) continue; - for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { struct page *page; if (stripe < rbio->nr_data) { page = page_in_rbio(rbio, stripe, pagenr, 1); @@ -1508,7 +1505,6 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) int bios_to_read = 0; struct bio_list bio_list; int ret; - int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); int pagenr; int stripe; struct bio *bio; @@ -1527,7 +1523,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) * stripe */ for (stripe = 0; stripe < rbio->nr_data; stripe++) { - for (pagenr = 0; pagenr < nr_pages; pagenr++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { struct page *page; /* * we want to find all the pages missing from @@ -1803,7 +1799,6 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) int pagenr, stripe; void **pointers; int faila = -1, failb = -1; - int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); struct page *page; int err; int i; @@ -1826,7 +1821,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) index_rbio_pages(rbio); - for (pagenr = 0; pagenr < nr_pages; pagenr++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { /* * Now we just use bitmap to mark the horizontal stripes in * which we have data when doing parity scrub. @@ -1937,7 +1932,7 @@ pstripe: * other endio functions will fiddle the uptodate bits */ if (rbio->operation == BTRFS_RBIO_WRITE) { - for (i = 0; i < nr_pages; i++) { + for (i = 0; i < rbio->stripe_npages; i++) { if (faila != -1) { page = rbio_stripe_page(rbio, faila, i); SetPageUptodate(page); @@ -2033,7 +2028,6 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) int bios_to_read = 0; struct bio_list bio_list; int ret; - int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); int pagenr; int stripe; struct bio *bio; @@ -2057,7 +2051,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) continue; } - for (pagenr = 0; pagenr < nr_pages; pagenr++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { struct page *p; /* @@ -2281,37 +2275,11 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) if (!page) return -ENOMEM; rbio->stripe_pages[index] = page; - ClearPageUptodate(page); } } return 0; } -/* - * end io function used by finish_rmw. When we finally - * get here, we've written a full stripe - */ -static void raid_write_parity_end_io(struct bio *bio) -{ - struct btrfs_raid_bio *rbio = bio->bi_private; - int err = bio->bi_error; - - if (bio->bi_error) - fail_bio_stripe(rbio, bio); - - bio_put(bio); - - if (!atomic_dec_and_test(&rbio->stripes_pending)) - return; - - err = 0; - - if (atomic_read(&rbio->error)) - err = -EIO; - - rbio_orig_end_io(rbio, err); -} - static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) { @@ -2464,7 +2432,7 @@ submit_write: break; bio->bi_private = rbio; - bio->bi_end_io = raid_write_parity_end_io; + bio->bi_end_io = raid_write_end_io; submit_bio(WRITE, bio); } return; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index b4ca5454e..2bd001145 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -575,7 +575,8 @@ static int is_cowonly_root(u64 root_objectid) root_objectid == BTRFS_TREE_LOG_OBJECTID || root_objectid == BTRFS_CSUM_TREE_OBJECTID || root_objectid == BTRFS_UUID_TREE_OBJECTID || - root_objectid == BTRFS_QUOTA_TREE_OBJECTID) + root_objectid == BTRFS_QUOTA_TREE_OBJECTID || + root_objectid == BTRFS_FREE_SPACE_TREE_OBJECTID) return 1; return 0; } @@ -708,8 +709,8 @@ struct backref_node *build_backref_tree(struct reloc_control *rc, err = -ENOMEM; goto out; } - path1->reada = 1; - path2->reada = 2; + path1->reada = READA_FORWARD; + path2->reada = READA_FORWARD; node = alloc_backref_node(cache); if (!node) { @@ -2130,7 +2131,7 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = 1; + path->reada = READA_FORWARD; reloc_root = root->reloc_root; root_item = &reloc_root->root_item; @@ -3030,7 +3031,7 @@ int prealloc_file_extent_cluster(struct inode *inode, int ret = 0; BUG_ON(cluster->start != cluster->boundary[0]); - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = btrfs_check_data_free_space(inode, cluster->start, cluster->end + 1 - cluster->start); @@ -3057,7 +3058,7 @@ int prealloc_file_extent_cluster(struct inode *inode, btrfs_free_reserved_data_space(inode, cluster->start, cluster->end + 1 - cluster->start); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -3527,7 +3528,7 @@ static int find_data_references(struct reloc_control *rc, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = 1; + path->reada = READA_FORWARD; root = read_fs_root(rc->extent_root->fs_info, ref_root); if (IS_ERR(root)) { @@ -3917,7 +3918,7 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = 1; + path->reada = READA_FORWARD; ret = prepare_to_relocate(rc); if (ret) { @@ -4343,7 +4344,7 @@ int btrfs_recover_relocation(struct btrfs_root *root) path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = -1; + path->reada = READA_BACK; key.objectid = BTRFS_TREE_RELOC_OBJECTID; key.type = BTRFS_ROOT_ITEM_KEY; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index b091d94ce..92bf5ee73 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -1514,8 +1514,6 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, if (sblock->no_io_error_seen) scrub_recheck_block_checksum(sblock); - - return; } static inline int scrub_check_fsid(u8 fsid[], @@ -2815,7 +2813,7 @@ out: static inline int scrub_calc_parity_bitmap_len(int nsectors) { - return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * (BITS_PER_LONG / 8); + return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long); } static void scrub_parity_get(struct scrub_parity *sparity) @@ -3460,7 +3458,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, return ret; } - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; if (em->start != chunk_offset) goto out; @@ -3507,7 +3505,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, if (!path) return -ENOMEM; - path->reada = 2; + path->reada = READA_FORWARD; path->search_commit_root = 1; path->skip_locking = 1; @@ -3735,27 +3733,27 @@ static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info, if (fs_info->scrub_workers_refcnt == 0) { if (is_dev_replace) fs_info->scrub_workers = - btrfs_alloc_workqueue("btrfs-scrub", flags, + btrfs_alloc_workqueue("scrub", flags, 1, 4); else fs_info->scrub_workers = - btrfs_alloc_workqueue("btrfs-scrub", flags, + btrfs_alloc_workqueue("scrub", flags, max_active, 4); if (!fs_info->scrub_workers) goto fail_scrub_workers; fs_info->scrub_wr_completion_workers = - btrfs_alloc_workqueue("btrfs-scrubwrc", flags, + btrfs_alloc_workqueue("scrubwrc", flags, max_active, 2); if (!fs_info->scrub_wr_completion_workers) goto fail_scrub_wr_completion_workers; fs_info->scrub_nocow_workers = - btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0); + btrfs_alloc_workqueue("scrubnc", flags, 1, 0); if (!fs_info->scrub_nocow_workers) goto fail_scrub_nocow_workers; fs_info->scrub_parity_workers = - btrfs_alloc_workqueue("btrfs-scrubparity", flags, + btrfs_alloc_workqueue("scrubparity", flags, max_active, 2); if (!fs_info->scrub_parity_workers) goto fail_scrub_parity_workers; @@ -4211,7 +4209,7 @@ static int check_extent_to_block(struct inode *inode, u64 start, u64 len, io_tree = &BTRFS_I(inode)->io_tree; - lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state); + lock_extent_bits(io_tree, lockstart, lockend, &cached_state); ordered = btrfs_lookup_ordered_range(inode, lockstart, len); if (ordered) { btrfs_put_ordered_extent(ordered); @@ -4281,7 +4279,7 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, return PTR_ERR(inode); /* Avoid truncate/dio/punch hole.. */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); inode_dio_wait(inode); physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; @@ -4360,7 +4358,7 @@ next_page: } ret = COPY_COMPLETE; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); iput(inode); return ret; } diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 48d425aef..02e00166c 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -22,8 +22,8 @@ #define BTRFS_SEND_STREAM_MAGIC "btrfs-stream" #define BTRFS_SEND_STREAM_VERSION 1 -#define BTRFS_SEND_BUF_SIZE (1024 * 64) -#define BTRFS_SEND_READ_SIZE (1024 * 48) +#define BTRFS_SEND_BUF_SIZE SZ_64K +#define BTRFS_SEND_READ_SIZE (48 * SZ_1K) enum btrfs_tlv_type { BTRFS_TLV_U8, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index fe609b81d..d41e09fe8 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -295,10 +295,11 @@ enum { Opt_nossd, Opt_ssd_spread, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_compress_type, Opt_compress_force, Opt_compress_force_type, Opt_notreelog, Opt_ratio, Opt_flushoncommit, Opt_discard, - Opt_space_cache, Opt_clear_cache, Opt_user_subvol_rm_allowed, - Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache, - Opt_no_space_cache, Opt_recovery, Opt_skip_balance, - Opt_check_integrity, Opt_check_integrity_including_extent_data, + Opt_space_cache, Opt_space_cache_version, Opt_clear_cache, + Opt_user_subvol_rm_allowed, Opt_enospc_debug, Opt_subvolrootid, + Opt_defrag, Opt_inode_cache, Opt_no_space_cache, Opt_recovery, + Opt_skip_balance, Opt_check_integrity, + Opt_check_integrity_including_extent_data, Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree, Opt_commit_interval, Opt_barrier, Opt_nodefrag, Opt_nodiscard, Opt_noenospc_debug, Opt_noflushoncommit, Opt_acl, Opt_datacow, @@ -309,7 +310,7 @@ enum { Opt_err, }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_degraded, "degraded"}, {Opt_subvol, "subvol=%s"}, {Opt_subvolid, "subvolid=%s"}, @@ -340,6 +341,7 @@ static match_table_t tokens = { {Opt_discard, "discard"}, {Opt_nodiscard, "nodiscard"}, {Opt_space_cache, "space_cache"}, + {Opt_space_cache_version, "space_cache=%s"}, {Opt_clear_cache, "clear_cache"}, {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, {Opt_enospc_debug, "enospc_debug"}, @@ -381,9 +383,14 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) int ret = 0; char *compress_type; bool compress_force = false; + enum btrfs_compression_type saved_compress_type; + bool saved_compress_force; + int no_compress = 0; cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); - if (cache_gen) + if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE)) + btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE); + else if (cache_gen) btrfs_set_opt(info->mount_opt, SPACE_CACHE); if (!options) @@ -458,6 +465,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) /* Fallthrough */ case Opt_compress: case Opt_compress_type: + saved_compress_type = btrfs_test_opt(root, COMPRESS) ? + info->compress_type : BTRFS_COMPRESS_NONE; + saved_compress_force = + btrfs_test_opt(root, FORCE_COMPRESS); if (token == Opt_compress || token == Opt_compress_force || strcmp(args[0].from, "zlib") == 0) { @@ -466,6 +477,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_set_opt(info->mount_opt, COMPRESS); btrfs_clear_opt(info->mount_opt, NODATACOW); btrfs_clear_opt(info->mount_opt, NODATASUM); + no_compress = 0; } else if (strcmp(args[0].from, "lzo") == 0) { compress_type = "lzo"; info->compress_type = BTRFS_COMPRESS_LZO; @@ -473,25 +485,21 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_clear_opt(info->mount_opt, NODATACOW); btrfs_clear_opt(info->mount_opt, NODATASUM); btrfs_set_fs_incompat(info, COMPRESS_LZO); + no_compress = 0; } else if (strncmp(args[0].from, "no", 2) == 0) { compress_type = "no"; btrfs_clear_opt(info->mount_opt, COMPRESS); btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); compress_force = false; + no_compress++; } else { ret = -EINVAL; goto out; } if (compress_force) { - btrfs_set_and_info(root, FORCE_COMPRESS, - "force %s compression", - compress_type); + btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); } else { - if (!btrfs_test_opt(root, COMPRESS)) - btrfs_info(root->fs_info, - "btrfs: use %s compression", - compress_type); /* * If we remount from compress-force=xxx to * compress=xxx, we need clear FORCE_COMPRESS @@ -500,6 +508,17 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) */ btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); } + if ((btrfs_test_opt(root, COMPRESS) && + (info->compress_type != saved_compress_type || + compress_force != saved_compress_force)) || + (!btrfs_test_opt(root, COMPRESS) && + no_compress == 1)) { + btrfs_info(root->fs_info, + "%s %s compression", + (compress_force) ? "force" : "use", + compress_type); + } + compress_force = false; break; case Opt_ssd: btrfs_set_and_info(root, SSD, @@ -617,15 +636,35 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) "turning off discard"); break; case Opt_space_cache: - btrfs_set_and_info(root, SPACE_CACHE, - "enabling disk space caching"); + case Opt_space_cache_version: + if (token == Opt_space_cache || + strcmp(args[0].from, "v1") == 0) { + btrfs_clear_opt(root->fs_info->mount_opt, + FREE_SPACE_TREE); + btrfs_set_and_info(root, SPACE_CACHE, + "enabling disk space caching"); + } else if (strcmp(args[0].from, "v2") == 0) { + btrfs_clear_opt(root->fs_info->mount_opt, + SPACE_CACHE); + btrfs_set_and_info(root, FREE_SPACE_TREE, + "enabling free space tree"); + } else { + ret = -EINVAL; + goto out; + } break; case Opt_rescan_uuid_tree: btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE); break; case Opt_no_space_cache: - btrfs_clear_and_info(root, SPACE_CACHE, - "disabling disk space caching"); + if (btrfs_test_opt(root, SPACE_CACHE)) { + btrfs_clear_and_info(root, SPACE_CACHE, + "disabling disk space caching"); + } + if (btrfs_test_opt(root, FREE_SPACE_TREE)) { + btrfs_clear_and_info(root, FREE_SPACE_TREE, + "disabling free space tree"); + } break; case Opt_inode_cache: btrfs_set_pending_and_info(info, INODE_MAP_CACHE, @@ -754,8 +793,17 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) } } out: + if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE) && + !btrfs_test_opt(root, FREE_SPACE_TREE) && + !btrfs_test_opt(root, CLEAR_CACHE)) { + btrfs_err(root->fs_info, "cannot disable free space tree"); + ret = -EINVAL; + + } if (!ret && btrfs_test_opt(root, SPACE_CACHE)) btrfs_info(root->fs_info, "disk space caching is enabled"); + if (!ret && btrfs_test_opt(root, FREE_SPACE_TREE)) + btrfs_info(root->fs_info, "using free space tree"); kfree(orig); return ret; } @@ -1162,6 +1210,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",noacl"); if (btrfs_test_opt(root, SPACE_CACHE)) seq_puts(seq, ",space_cache"); + else if (btrfs_test_opt(root, FREE_SPACE_TREE)) + seq_puts(seq, ",space_cache=v2"); else seq_puts(seq, ",nospace_cache"); if (btrfs_test_opt(root, RESCAN_UUID_TREE)) @@ -1514,9 +1564,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, if ((flags ^ s->s_flags) & MS_RDONLY) error = -EBUSY; } else { - char b[BDEVNAME_SIZE]; - - strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); + snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); btrfs_sb(s)->bdev_holder = fs_type; error = btrfs_fill_super(s, fs_devices, data, flags & MS_SILENT ? 1 : 0); @@ -1865,7 +1913,7 @@ static int btrfs_calc_avail_data_space(struct btrfs_root *root, u64 *free_bytes) * btrfs starts at an offset of at least 1MB when doing chunk * allocation. */ - skip_space = 1024 * 1024; + skip_space = SZ_1M; /* user can set the offset in fs_info->alloc_start. */ if (fs_info->alloc_start && @@ -2249,6 +2297,9 @@ static int btrfs_run_sanity_tests(void) if (ret) goto out; ret = btrfs_test_qgroups(); + if (ret) + goto out; + ret = btrfs_test_free_space_tree(); out: btrfs_destroy_test_fs(); return ret; diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index e0ac85949..539e7b5e3 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -202,6 +202,7 @@ BTRFS_FEAT_ATTR_INCOMPAT(extended_iref, EXTENDED_IREF); BTRFS_FEAT_ATTR_INCOMPAT(raid56, RAID56); BTRFS_FEAT_ATTR_INCOMPAT(skinny_metadata, SKINNY_METADATA); BTRFS_FEAT_ATTR_INCOMPAT(no_holes, NO_HOLES); +BTRFS_FEAT_ATTR_COMPAT_RO(free_space_tree, FREE_SPACE_TREE); static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(mixed_backref), @@ -213,6 +214,7 @@ static struct attribute *btrfs_supported_feature_attrs[] = { BTRFS_FEAT_ATTR_PTR(raid56), BTRFS_FEAT_ATTR_PTR(skinny_metadata), BTRFS_FEAT_ATTR_PTR(no_holes), + BTRFS_FEAT_ATTR_PTR(free_space_tree), NULL }; @@ -780,6 +782,39 @@ failure: return error; } + +/* + * Change per-fs features in /sys/fs/btrfs/UUID/features to match current + * values in superblock. Call after any changes to incompat/compat_ro flags + */ +void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, + u64 bit, enum btrfs_feature_set set) +{ + struct btrfs_fs_devices *fs_devs; + struct kobject *fsid_kobj; + u64 features; + int ret; + + if (!fs_info) + return; + + features = get_features(fs_info, set); + ASSERT(bit & supported_feature_masks[set]); + + fs_devs = fs_info->fs_devices; + fsid_kobj = &fs_devs->fsid_kobj; + + if (!fsid_kobj->state_initialized) + return; + + /* + * FIXME: this is too heavy to update just one value, ideally we'd like + * to use sysfs_update_group but some refactoring is needed first. + */ + sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group); + ret = sysfs_create_group(fsid_kobj, &btrfs_feature_attr_group); +} + static int btrfs_init_debugfs(void) { #ifdef CONFIG_DEBUG_FS diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 9c0952212..d7da1a4c2 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -56,7 +56,7 @@ static struct btrfs_feature_attr btrfs_attr_##_name = { \ #define BTRFS_FEAT_ATTR_COMPAT(name, feature) \ BTRFS_FEAT_ATTR(name, FEAT_COMPAT, BTRFS_FEATURE_COMPAT, feature) #define BTRFS_FEAT_ATTR_COMPAT_RO(name, feature) \ - BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT, feature) + BTRFS_FEAT_ATTR(name, FEAT_COMPAT_RO, BTRFS_FEATURE_COMPAT_RO, feature) #define BTRFS_FEAT_ATTR_INCOMPAT(name, feature) \ BTRFS_FEAT_ATTR(name, FEAT_INCOMPAT, BTRFS_FEATURE_INCOMPAT, feature) @@ -90,4 +90,7 @@ int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs, struct kobject *parent); int btrfs_sysfs_add_device(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); +void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, + u64 bit, enum btrfs_feature_set set); + #endif /* _BTRFS_SYSFS_H_ */ diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 9626252ee..0e1e61a7e 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -21,6 +21,9 @@ #include <linux/magic.h> #include "btrfs-tests.h" #include "../ctree.h" +#include "../free-space-cache.h" +#include "../free-space-tree.h" +#include "../transaction.h" #include "../volumes.h" #include "../disk-io.h" #include "../qgroup.h" @@ -79,18 +82,18 @@ void btrfs_destroy_test_fs(void) struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void) { struct btrfs_fs_info *fs_info = kzalloc(sizeof(struct btrfs_fs_info), - GFP_NOFS); + GFP_KERNEL); if (!fs_info) return fs_info; fs_info->fs_devices = kzalloc(sizeof(struct btrfs_fs_devices), - GFP_NOFS); + GFP_KERNEL); if (!fs_info->fs_devices) { kfree(fs_info); return NULL; } fs_info->super_copy = kzalloc(sizeof(struct btrfs_super_block), - GFP_NOFS); + GFP_KERNEL); if (!fs_info->super_copy) { kfree(fs_info->fs_devices); kfree(fs_info); @@ -122,6 +125,9 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void) INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); + extent_io_tree_init(&fs_info->freed_extents[0], NULL); + extent_io_tree_init(&fs_info->freed_extents[1], NULL); + fs_info->pinned_extents = &fs_info->freed_extents[0]; return fs_info; } @@ -169,3 +175,55 @@ void btrfs_free_dummy_root(struct btrfs_root *root) kfree(root); } +struct btrfs_block_group_cache * +btrfs_alloc_dummy_block_group(unsigned long length) +{ + struct btrfs_block_group_cache *cache; + + cache = kzalloc(sizeof(*cache), GFP_KERNEL); + if (!cache) + return NULL; + cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), + GFP_KERNEL); + if (!cache->free_space_ctl) { + kfree(cache); + return NULL; + } + cache->fs_info = btrfs_alloc_dummy_fs_info(); + if (!cache->fs_info) { + kfree(cache->free_space_ctl); + kfree(cache); + return NULL; + } + + cache->key.objectid = 0; + cache->key.offset = length; + cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + cache->sectorsize = 4096; + cache->full_stripe_len = 4096; + + INIT_LIST_HEAD(&cache->list); + INIT_LIST_HEAD(&cache->cluster_list); + INIT_LIST_HEAD(&cache->bg_list); + btrfs_init_free_space_ctl(cache); + mutex_init(&cache->free_space_lock); + + return cache; +} + +void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache) +{ + if (!cache) + return; + __btrfs_remove_free_space_cache(cache->free_space_ctl); + kfree(cache->free_space_ctl); + kfree(cache); +} + +void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans) +{ + memset(trans, 0, sizeof(*trans)); + trans->transid = 1; + INIT_LIST_HEAD(&trans->qgroup_ref_list); + trans->type = __TRANS_DUMMY; +} diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index fd3954224..054b8c73c 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -24,17 +24,23 @@ #define test_msg(fmt, ...) pr_info("BTRFS: selftest: " fmt, ##__VA_ARGS__) struct btrfs_root; +struct btrfs_trans_handle; int btrfs_test_free_space_cache(void); int btrfs_test_extent_buffer_operations(void); int btrfs_test_extent_io(void); int btrfs_test_inodes(void); int btrfs_test_qgroups(void); +int btrfs_test_free_space_tree(void); int btrfs_init_test_fs(void); void btrfs_destroy_test_fs(void); struct inode *btrfs_new_test_inode(void); struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(void); void btrfs_free_dummy_root(struct btrfs_root *root); +struct btrfs_block_group_cache * +btrfs_alloc_dummy_block_group(unsigned long length); +void btrfs_free_dummy_block_group(struct btrfs_block_group_cache *cache); +void btrfs_init_dummy_trans(struct btrfs_trans_handle *trans); #else static inline int btrfs_test_free_space_cache(void) { @@ -63,6 +69,10 @@ static inline int btrfs_test_qgroups(void) { return 0; } +static inline int btrfs_test_free_space_tree(void) +{ + return 0; +} #endif #endif diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 9e9f23681..669b58201 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -18,6 +18,8 @@ #include <linux/pagemap.h> #include <linux/sched.h> +#include <linux/slab.h> +#include <linux/sizes.h> #include "btrfs-tests.h" #include "../extent_io.h" @@ -70,12 +72,14 @@ static int test_find_delalloc(void) struct page *page; struct page *locked_page = NULL; unsigned long index = 0; - u64 total_dirty = 256 * 1024 * 1024; - u64 max_bytes = 128 * 1024 * 1024; + u64 total_dirty = SZ_256M; + u64 max_bytes = SZ_128M; u64 start, end, test_start; u64 found; int ret = -EINVAL; + test_msg("Running find delalloc tests\n"); + inode = btrfs_new_test_inode(); if (!inode) { test_msg("Failed to allocate test inode\n"); @@ -90,7 +94,7 @@ static int test_find_delalloc(void) * test. */ for (index = 0; index < (total_dirty >> PAGE_CACHE_SHIFT); index++) { - page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); + page = find_or_create_page(inode->i_mapping, index, GFP_KERNEL); if (!page) { test_msg("Failed to allocate test page\n"); ret = -ENOMEM; @@ -109,7 +113,7 @@ static int test_find_delalloc(void) * |--- delalloc ---| * |--- search ---| */ - set_extent_delalloc(&tmp, 0, 4095, NULL, GFP_NOFS); + set_extent_delalloc(&tmp, 0, 4095, NULL, GFP_KERNEL); start = 0; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, @@ -133,14 +137,14 @@ static int test_find_delalloc(void) * |--- delalloc ---| * |--- search ---| */ - test_start = 64 * 1024 * 1024; + test_start = SZ_64M; locked_page = find_lock_page(inode->i_mapping, test_start >> PAGE_CACHE_SHIFT); if (!locked_page) { test_msg("Couldn't find the locked page\n"); goto out_bits; } - set_extent_delalloc(&tmp, 4096, max_bytes - 1, NULL, GFP_NOFS); + set_extent_delalloc(&tmp, 4096, max_bytes - 1, NULL, GFP_KERNEL); start = test_start; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, @@ -195,7 +199,7 @@ static int test_find_delalloc(void) * * We are re-using our test_start from above since it works out well. */ - set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL, GFP_NOFS); + set_extent_delalloc(&tmp, max_bytes, total_dirty - 1, NULL, GFP_KERNEL); start = test_start; end = 0; found = find_lock_delalloc_range(inode, &tmp, locked_page, &start, @@ -220,8 +224,8 @@ static int test_find_delalloc(void) * Now to test where we run into a page that is no longer dirty in the * range we want to find. */ - page = find_get_page(inode->i_mapping, (max_bytes + (1 * 1024 * 1024)) - >> PAGE_CACHE_SHIFT); + page = find_get_page(inode->i_mapping, + (max_bytes + SZ_1M) >> PAGE_CACHE_SHIFT); if (!page) { test_msg("Couldn't find our page\n"); goto out_bits; @@ -258,7 +262,7 @@ static int test_find_delalloc(void) } ret = 0; out_bits: - clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1, GFP_NOFS); + clear_extent_bits(&tmp, 0, total_dirty - 1, (unsigned)-1, GFP_KERNEL); out: if (locked_page) page_cache_release(locked_page); @@ -268,8 +272,139 @@ out: return ret; } +static int __test_eb_bitmaps(unsigned long *bitmap, struct extent_buffer *eb, + unsigned long len) +{ + unsigned long i, x; + + memset(bitmap, 0, len); + memset_extent_buffer(eb, 0, 0, len); + if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { + test_msg("Bitmap was not zeroed\n"); + return -EINVAL; + } + + bitmap_set(bitmap, 0, len * BITS_PER_BYTE); + extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE); + if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { + test_msg("Setting all bits failed\n"); + return -EINVAL; + } + + bitmap_clear(bitmap, 0, len * BITS_PER_BYTE); + extent_buffer_bitmap_clear(eb, 0, 0, len * BITS_PER_BYTE); + if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { + test_msg("Clearing all bits failed\n"); + return -EINVAL; + } + + bitmap_set(bitmap, (PAGE_CACHE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE, + sizeof(long) * BITS_PER_BYTE); + extent_buffer_bitmap_set(eb, PAGE_CACHE_SIZE - sizeof(long) / 2, 0, + sizeof(long) * BITS_PER_BYTE); + if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { + test_msg("Setting straddling pages failed\n"); + return -EINVAL; + } + + bitmap_set(bitmap, 0, len * BITS_PER_BYTE); + bitmap_clear(bitmap, + (PAGE_CACHE_SIZE - sizeof(long) / 2) * BITS_PER_BYTE, + sizeof(long) * BITS_PER_BYTE); + extent_buffer_bitmap_set(eb, 0, 0, len * BITS_PER_BYTE); + extent_buffer_bitmap_clear(eb, PAGE_CACHE_SIZE - sizeof(long) / 2, 0, + sizeof(long) * BITS_PER_BYTE); + if (memcmp_extent_buffer(eb, bitmap, 0, len) != 0) { + test_msg("Clearing straddling pages failed\n"); + return -EINVAL; + } + + /* + * Generate a wonky pseudo-random bit pattern for the sake of not using + * something repetitive that could miss some hypothetical off-by-n bug. + */ + x = 0; + for (i = 0; i < len / sizeof(long); i++) { + x = (0x19660dULL * (u64)x + 0x3c6ef35fULL) & 0xffffffffUL; + bitmap[i] = x; + } + write_extent_buffer(eb, bitmap, 0, len); + + for (i = 0; i < len * BITS_PER_BYTE; i++) { + int bit, bit1; + + bit = !!test_bit(i, bitmap); + bit1 = !!extent_buffer_test_bit(eb, 0, i); + if (bit1 != bit) { + test_msg("Testing bit pattern failed\n"); + return -EINVAL; + } + + bit1 = !!extent_buffer_test_bit(eb, i / BITS_PER_BYTE, + i % BITS_PER_BYTE); + if (bit1 != bit) { + test_msg("Testing bit pattern with offset failed\n"); + return -EINVAL; + } + } + + return 0; +} + +static int test_eb_bitmaps(void) +{ + unsigned long len = PAGE_CACHE_SIZE * 4; + unsigned long *bitmap; + struct extent_buffer *eb; + int ret; + + test_msg("Running extent buffer bitmap tests\n"); + + bitmap = kmalloc(len, GFP_KERNEL); + if (!bitmap) { + test_msg("Couldn't allocate test bitmap\n"); + return -ENOMEM; + } + + eb = __alloc_dummy_extent_buffer(NULL, 0, len); + if (!eb) { + test_msg("Couldn't allocate test extent buffer\n"); + kfree(bitmap); + return -ENOMEM; + } + + ret = __test_eb_bitmaps(bitmap, eb, len); + if (ret) + goto out; + + /* Do it over again with an extent buffer which isn't page-aligned. */ + free_extent_buffer(eb); + eb = __alloc_dummy_extent_buffer(NULL, PAGE_CACHE_SIZE / 2, len); + if (!eb) { + test_msg("Couldn't allocate test extent buffer\n"); + kfree(bitmap); + return -ENOMEM; + } + + ret = __test_eb_bitmaps(bitmap, eb, len); +out: + free_extent_buffer(eb); + kfree(bitmap); + return ret; +} + int btrfs_test_extent_io(void) { - test_msg("Running find delalloc tests\n"); - return test_find_delalloc(); + int ret; + + test_msg("Running extent I/O tests\n"); + + ret = test_find_delalloc(); + if (ret) + goto out; + + ret = test_eb_bitmaps(); +out: + test_msg("Extent I/O tests finished\n"); + return ret; } diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index 8b72b005b..c9ad97b1e 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -23,41 +23,6 @@ #include "../free-space-cache.h" #define BITS_PER_BITMAP (PAGE_CACHE_SIZE * 8) -static struct btrfs_block_group_cache *init_test_block_group(void) -{ - struct btrfs_block_group_cache *cache; - - cache = kzalloc(sizeof(*cache), GFP_NOFS); - if (!cache) - return NULL; - cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl), - GFP_NOFS); - if (!cache->free_space_ctl) { - kfree(cache); - return NULL; - } - cache->fs_info = btrfs_alloc_dummy_fs_info(); - if (!cache->fs_info) { - kfree(cache->free_space_ctl); - kfree(cache); - return NULL; - } - - cache->key.objectid = 0; - cache->key.offset = 1024 * 1024 * 1024; - cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; - cache->sectorsize = 4096; - cache->full_stripe_len = 4096; - - spin_lock_init(&cache->lock); - INIT_LIST_HEAD(&cache->list); - INIT_LIST_HEAD(&cache->cluster_list); - INIT_LIST_HEAD(&cache->bg_list); - - btrfs_init_free_space_ctl(cache); - - return cache; -} /* * This test just does basic sanity checking, making sure we can add an exten @@ -71,59 +36,59 @@ static int test_extents(struct btrfs_block_group_cache *cache) test_msg("Running extent only tests\n"); /* First just make sure we can remove an entire entry */ - ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024); + ret = btrfs_add_free_space(cache, 0, SZ_4M); if (ret) { test_msg("Error adding initial extents %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, 0, SZ_4M); if (ret) { test_msg("Error removing extent %d\n", ret); return ret; } - if (test_check_exists(cache, 0, 4 * 1024 * 1024)) { + if (test_check_exists(cache, 0, SZ_4M)) { test_msg("Full remove left some lingering space\n"); return -1; } /* Ok edge and middle cases now */ - ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024); + ret = btrfs_add_free_space(cache, 0, SZ_4M); if (ret) { test_msg("Error adding half extent %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 1 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, 3 * SZ_1M, SZ_1M); if (ret) { test_msg("Error removing tail end %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, 0, SZ_1M); if (ret) { test_msg("Error removing front end %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096); + ret = btrfs_remove_free_space(cache, SZ_2M, 4096); if (ret) { test_msg("Error removing middle piece %d\n", ret); return ret; } - if (test_check_exists(cache, 0, 1 * 1024 * 1024)) { + if (test_check_exists(cache, 0, SZ_1M)) { test_msg("Still have space at the front\n"); return -1; } - if (test_check_exists(cache, 2 * 1024 * 1024, 4096)) { + if (test_check_exists(cache, SZ_2M, 4096)) { test_msg("Still have space in the middle\n"); return -1; } - if (test_check_exists(cache, 3 * 1024 * 1024, 1 * 1024 * 1024)) { + if (test_check_exists(cache, 3 * SZ_1M, SZ_1M)) { test_msg("Still have space at the end\n"); return -1; } @@ -141,30 +106,30 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache) test_msg("Running bitmap only tests\n"); - ret = test_add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1); + ret = test_add_free_space_entry(cache, 0, SZ_4M, 1); if (ret) { test_msg("Couldn't create a bitmap entry %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, 0, SZ_4M); if (ret) { test_msg("Error removing bitmap full range %d\n", ret); return ret; } - if (test_check_exists(cache, 0, 4 * 1024 * 1024)) { + if (test_check_exists(cache, 0, SZ_4M)) { test_msg("Left some space in bitmap\n"); return -1; } - ret = test_add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1); + ret = test_add_free_space_entry(cache, 0, SZ_4M, 1); if (ret) { test_msg("Couldn't add to our bitmap entry %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 2 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, SZ_1M, SZ_2M); if (ret) { test_msg("Couldn't remove middle chunk %d\n", ret); return ret; @@ -177,23 +142,21 @@ static int test_bitmaps(struct btrfs_block_group_cache *cache) next_bitmap_offset = (u64)(BITS_PER_BITMAP * 4096); /* Test a bit straddling two bitmaps */ - ret = test_add_free_space_entry(cache, next_bitmap_offset - - (2 * 1024 * 1024), 4 * 1024 * 1024, 1); + ret = test_add_free_space_entry(cache, next_bitmap_offset - SZ_2M, + SZ_4M, 1); if (ret) { test_msg("Couldn't add space that straddles two bitmaps %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, next_bitmap_offset - - (1 * 1024 * 1024), 2 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, next_bitmap_offset - SZ_1M, SZ_2M); if (ret) { test_msg("Couldn't remove overlapping space %d\n", ret); return ret; } - if (test_check_exists(cache, next_bitmap_offset - (1 * 1024 * 1024), - 2 * 1024 * 1024)) { + if (test_check_exists(cache, next_bitmap_offset - SZ_1M, SZ_2M)) { test_msg("Left some space when removing overlapping\n"); return -1; } @@ -216,43 +179,43 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) * bitmap, but the free space completely in the extent and then * completely in the bitmap. */ - ret = test_add_free_space_entry(cache, 4 * 1024 * 1024, 1 * 1024 * 1024, 1); + ret = test_add_free_space_entry(cache, SZ_4M, SZ_1M, 1); if (ret) { test_msg("Couldn't create bitmap entry %d\n", ret); return ret; } - ret = test_add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0); + ret = test_add_free_space_entry(cache, 0, SZ_1M, 0); if (ret) { test_msg("Couldn't add extent entry %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, 0, SZ_1M); if (ret) { test_msg("Couldn't remove extent entry %d\n", ret); return ret; } - if (test_check_exists(cache, 0, 1 * 1024 * 1024)) { + if (test_check_exists(cache, 0, SZ_1M)) { test_msg("Left remnants after our remove\n"); return -1; } /* Now to add back the extent entry and remove from the bitmap */ - ret = test_add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0); + ret = test_add_free_space_entry(cache, 0, SZ_1M, 0); if (ret) { test_msg("Couldn't re-add extent entry %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 4 * 1024 * 1024, 1 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, SZ_4M, SZ_1M); if (ret) { test_msg("Couldn't remove from bitmap %d\n", ret); return ret; } - if (test_check_exists(cache, 4 * 1024 * 1024, 1 * 1024 * 1024)) { + if (test_check_exists(cache, SZ_4M, SZ_1M)) { test_msg("Left remnants in the bitmap\n"); return -1; } @@ -261,19 +224,19 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) * Ok so a little more evil, extent entry and bitmap at the same offset, * removing an overlapping chunk. */ - ret = test_add_free_space_entry(cache, 1 * 1024 * 1024, 4 * 1024 * 1024, 1); + ret = test_add_free_space_entry(cache, SZ_1M, SZ_4M, 1); if (ret) { test_msg("Couldn't add to a bitmap %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 512 * 1024, 3 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, SZ_512K, 3 * SZ_1M); if (ret) { test_msg("Couldn't remove overlapping space %d\n", ret); return ret; } - if (test_check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) { + if (test_check_exists(cache, SZ_512K, 3 * SZ_1M)) { test_msg("Left over pieces after removing overlapping\n"); return -1; } @@ -281,25 +244,25 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) __btrfs_remove_free_space_cache(cache->free_space_ctl); /* Now with the extent entry offset into the bitmap */ - ret = test_add_free_space_entry(cache, 4 * 1024 * 1024, 4 * 1024 * 1024, 1); + ret = test_add_free_space_entry(cache, SZ_4M, SZ_4M, 1); if (ret) { test_msg("Couldn't add space to the bitmap %d\n", ret); return ret; } - ret = test_add_free_space_entry(cache, 2 * 1024 * 1024, 2 * 1024 * 1024, 0); + ret = test_add_free_space_entry(cache, SZ_2M, SZ_2M, 0); if (ret) { test_msg("Couldn't add extent to the cache %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 4 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, 3 * SZ_1M, SZ_4M); if (ret) { test_msg("Problem removing overlapping space %d\n", ret); return ret; } - if (test_check_exists(cache, 3 * 1024 * 1024, 4 * 1024 * 1024)) { + if (test_check_exists(cache, 3 * SZ_1M, SZ_4M)) { test_msg("Left something behind when removing space"); return -1; } @@ -315,29 +278,26 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) * [ del ] */ __btrfs_remove_free_space_cache(cache->free_space_ctl); - ret = test_add_free_space_entry(cache, bitmap_offset + 4 * 1024 * 1024, - 4 * 1024 * 1024, 1); + ret = test_add_free_space_entry(cache, bitmap_offset + SZ_4M, SZ_4M, 1); if (ret) { test_msg("Couldn't add bitmap %d\n", ret); return ret; } - ret = test_add_free_space_entry(cache, bitmap_offset - 1 * 1024 * 1024, - 5 * 1024 * 1024, 0); + ret = test_add_free_space_entry(cache, bitmap_offset - SZ_1M, + 5 * SZ_1M, 0); if (ret) { test_msg("Couldn't add extent entry %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, bitmap_offset + 1 * 1024 * 1024, - 5 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, bitmap_offset + SZ_1M, 5 * SZ_1M); if (ret) { test_msg("Failed to free our space %d\n", ret); return ret; } - if (test_check_exists(cache, bitmap_offset + 1 * 1024 * 1024, - 5 * 1024 * 1024)) { + if (test_check_exists(cache, bitmap_offset + SZ_1M, 5 * SZ_1M)) { test_msg("Left stuff over\n"); return -1; } @@ -350,19 +310,19 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) * to return -EAGAIN back from btrfs_remove_extent, make sure this * doesn't happen. */ - ret = test_add_free_space_entry(cache, 1 * 1024 * 1024, 2 * 1024 * 1024, 1); + ret = test_add_free_space_entry(cache, SZ_1M, SZ_2M, 1); if (ret) { test_msg("Couldn't add bitmap entry %d\n", ret); return ret; } - ret = test_add_free_space_entry(cache, 3 * 1024 * 1024, 1 * 1024 * 1024, 0); + ret = test_add_free_space_entry(cache, 3 * SZ_1M, SZ_1M, 0); if (ret) { test_msg("Couldn't add extent entry %d\n", ret); return ret; } - ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 3 * 1024 * 1024); + ret = btrfs_remove_free_space(cache, SZ_1M, 3 * SZ_1M); if (ret) { test_msg("Error removing bitmap and extent overlapping %d\n", ret); return ret; @@ -445,9 +405,11 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) int ret; u64 offset; u64 max_extent_size; - - bool (*use_bitmap_op)(struct btrfs_free_space_ctl *, - struct btrfs_free_space *); + const struct btrfs_free_space_op test_free_space_ops = { + .recalc_thresholds = cache->free_space_ctl->op->recalc_thresholds, + .use_bitmap = test_use_bitmap, + }; + const struct btrfs_free_space_op *orig_free_space_ops; test_msg("Running space stealing from bitmap to extent\n"); @@ -469,22 +431,21 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * that forces use of bitmaps as soon as we have at least 1 * extent entry. */ - use_bitmap_op = cache->free_space_ctl->op->use_bitmap; - cache->free_space_ctl->op->use_bitmap = test_use_bitmap; + orig_free_space_ops = cache->free_space_ctl->op; + cache->free_space_ctl->op = &test_free_space_ops; /* * Extent entry covering free space range [128Mb - 256Kb, 128Mb - 128Kb[ */ - ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 - 256 * 1024, - 128 * 1024, 0); + ret = test_add_free_space_entry(cache, SZ_128M - SZ_256K, SZ_128K, 0); if (ret) { test_msg("Couldn't add extent entry %d\n", ret); return ret; } /* Bitmap entry covering free space range [128Mb + 512Kb, 256Mb[ */ - ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 + 512 * 1024, - 128 * 1024 * 1024 - 512 * 1024, 1); + ret = test_add_free_space_entry(cache, SZ_128M + SZ_512K, + SZ_128M - SZ_512K, 1); if (ret) { test_msg("Couldn't add bitmap entry %d\n", ret); return ret; @@ -502,21 +463,19 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * [128Mb + 512Kb, 128Mb + 768Kb[ */ ret = btrfs_remove_free_space(cache, - 128 * 1024 * 1024 + 768 * 1024, - 128 * 1024 * 1024 - 768 * 1024); + SZ_128M + 768 * SZ_1K, + SZ_128M - 768 * SZ_1K); if (ret) { test_msg("Failed to free part of bitmap space %d\n", ret); return ret; } /* Confirm that only those 2 ranges are marked as free. */ - if (!test_check_exists(cache, 128 * 1024 * 1024 - 256 * 1024, - 128 * 1024)) { + if (!test_check_exists(cache, SZ_128M - SZ_256K, SZ_128K)) { test_msg("Free space range missing\n"); return -ENOENT; } - if (!test_check_exists(cache, 128 * 1024 * 1024 + 512 * 1024, - 256 * 1024)) { + if (!test_check_exists(cache, SZ_128M + SZ_512K, SZ_256K)) { test_msg("Free space range missing\n"); return -ENOENT; } @@ -525,8 +484,8 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * Confirm that the bitmap range [128Mb + 768Kb, 256Mb[ isn't marked * as free anymore. */ - if (test_check_exists(cache, 128 * 1024 * 1024 + 768 * 1024, - 128 * 1024 * 1024 - 768 * 1024)) { + if (test_check_exists(cache, SZ_128M + 768 * SZ_1K, + SZ_128M - 768 * SZ_1K)) { test_msg("Bitmap region not removed from space cache\n"); return -EINVAL; } @@ -535,8 +494,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * Confirm that the region [128Mb + 256Kb, 128Mb + 512Kb[, which is * covered by the bitmap, isn't marked as free. */ - if (test_check_exists(cache, 128 * 1024 * 1024 + 256 * 1024, - 256 * 1024)) { + if (test_check_exists(cache, SZ_128M + SZ_256K, SZ_256K)) { test_msg("Invalid bitmap region marked as free\n"); return -EINVAL; } @@ -545,8 +503,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * Confirm that the region [128Mb, 128Mb + 256Kb[, which is covered * by the bitmap too, isn't marked as free either. */ - if (test_check_exists(cache, 128 * 1024 * 1024, - 256 * 1024)) { + if (test_check_exists(cache, SZ_128M, SZ_256K)) { test_msg("Invalid bitmap region marked as free\n"); return -EINVAL; } @@ -556,13 +513,13 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * lets make sure the free space cache marks it as free in the bitmap, * and doesn't insert a new extent entry to represent this region. */ - ret = btrfs_add_free_space(cache, 128 * 1024 * 1024, 512 * 1024); + ret = btrfs_add_free_space(cache, SZ_128M, SZ_512K); if (ret) { test_msg("Error adding free space: %d\n", ret); return ret; } /* Confirm the region is marked as free. */ - if (!test_check_exists(cache, 128 * 1024 * 1024, 512 * 1024)) { + if (!test_check_exists(cache, SZ_128M, SZ_512K)) { test_msg("Bitmap region not marked as free\n"); return -ENOENT; } @@ -581,8 +538,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * The goal is to test that the bitmap entry space stealing doesn't * steal this space region. */ - ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 + 16 * 1024 * 1024, - 4096); + ret = btrfs_add_free_space(cache, SZ_128M + SZ_16M, 4096); if (ret) { test_msg("Error adding free space: %d\n", ret); return ret; @@ -601,15 +557,13 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * expand the range covered by the existing extent entry that represents * the free space [128Mb - 256Kb, 128Mb - 128Kb[. */ - ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 - 128 * 1024, - 128 * 1024); + ret = btrfs_add_free_space(cache, SZ_128M - SZ_128K, SZ_128K); if (ret) { test_msg("Error adding free space: %d\n", ret); return ret; } /* Confirm the region is marked as free. */ - if (!test_check_exists(cache, 128 * 1024 * 1024 - 128 * 1024, - 128 * 1024)) { + if (!test_check_exists(cache, SZ_128M - SZ_128K, SZ_128K)) { test_msg("Extent region not marked as free\n"); return -ENOENT; } @@ -637,21 +591,20 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * that represents the 1Mb free space, and therefore we're able to * allocate the whole free space at once. */ - if (!test_check_exists(cache, 128 * 1024 * 1024 - 256 * 1024, - 1 * 1024 * 1024)) { + if (!test_check_exists(cache, SZ_128M - SZ_256K, SZ_1M)) { test_msg("Expected region not marked as free\n"); return -ENOENT; } - if (cache->free_space_ctl->free_space != (1 * 1024 * 1024 + 4096)) { + if (cache->free_space_ctl->free_space != (SZ_1M + 4096)) { test_msg("Cache free space is not 1Mb + 4Kb\n"); return -EINVAL; } offset = btrfs_find_space_for_alloc(cache, - 0, 1 * 1024 * 1024, 0, + 0, SZ_1M, 0, &max_extent_size); - if (offset != (128 * 1024 * 1024 - 256 * 1024)) { + if (offset != (SZ_128M - SZ_256K)) { test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n", offset); return -EINVAL; @@ -670,7 +623,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) offset = btrfs_find_space_for_alloc(cache, 0, 4096, 0, &max_extent_size); - if (offset != (128 * 1024 * 1024 + 16 * 1024 * 1024)) { + if (offset != (SZ_128M + SZ_16M)) { test_msg("Failed to allocate 4Kb from space cache, returned offset is: %llu\n", offset); return -EINVAL; @@ -691,16 +644,14 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) /* * Extent entry covering free space range [128Mb + 128Kb, 128Mb + 256Kb[ */ - ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 + 128 * 1024, - 128 * 1024, 0); + ret = test_add_free_space_entry(cache, SZ_128M + SZ_128K, SZ_128K, 0); if (ret) { test_msg("Couldn't add extent entry %d\n", ret); return ret; } /* Bitmap entry covering free space range [0, 128Mb - 512Kb[ */ - ret = test_add_free_space_entry(cache, 0, - 128 * 1024 * 1024 - 512 * 1024, 1); + ret = test_add_free_space_entry(cache, 0, SZ_128M - SZ_512K, 1); if (ret) { test_msg("Couldn't add bitmap entry %d\n", ret); return ret; @@ -717,22 +668,18 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * [128Mb + 128b, 128Mb + 256Kb[ * [128Mb - 768Kb, 128Mb - 512Kb[ */ - ret = btrfs_remove_free_space(cache, - 0, - 128 * 1024 * 1024 - 768 * 1024); + ret = btrfs_remove_free_space(cache, 0, SZ_128M - 768 * SZ_1K); if (ret) { test_msg("Failed to free part of bitmap space %d\n", ret); return ret; } /* Confirm that only those 2 ranges are marked as free. */ - if (!test_check_exists(cache, 128 * 1024 * 1024 + 128 * 1024, - 128 * 1024)) { + if (!test_check_exists(cache, SZ_128M + SZ_128K, SZ_128K)) { test_msg("Free space range missing\n"); return -ENOENT; } - if (!test_check_exists(cache, 128 * 1024 * 1024 - 768 * 1024, - 256 * 1024)) { + if (!test_check_exists(cache, SZ_128M - 768 * SZ_1K, SZ_256K)) { test_msg("Free space range missing\n"); return -ENOENT; } @@ -741,8 +688,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * Confirm that the bitmap range [0, 128Mb - 768Kb[ isn't marked * as free anymore. */ - if (test_check_exists(cache, 0, - 128 * 1024 * 1024 - 768 * 1024)) { + if (test_check_exists(cache, 0, SZ_128M - 768 * SZ_1K)) { test_msg("Bitmap region not removed from space cache\n"); return -EINVAL; } @@ -751,8 +697,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * Confirm that the region [128Mb - 512Kb, 128Mb[, which is * covered by the bitmap, isn't marked as free. */ - if (test_check_exists(cache, 128 * 1024 * 1024 - 512 * 1024, - 512 * 1024)) { + if (test_check_exists(cache, SZ_128M - SZ_512K, SZ_512K)) { test_msg("Invalid bitmap region marked as free\n"); return -EINVAL; } @@ -762,15 +707,13 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * lets make sure the free space cache marks it as free in the bitmap, * and doesn't insert a new extent entry to represent this region. */ - ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 - 512 * 1024, - 512 * 1024); + ret = btrfs_add_free_space(cache, SZ_128M - SZ_512K, SZ_512K); if (ret) { test_msg("Error adding free space: %d\n", ret); return ret; } /* Confirm the region is marked as free. */ - if (!test_check_exists(cache, 128 * 1024 * 1024 - 512 * 1024, - 512 * 1024)) { + if (!test_check_exists(cache, SZ_128M - SZ_512K, SZ_512K)) { test_msg("Bitmap region not marked as free\n"); return -ENOENT; } @@ -789,7 +732,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * The goal is to test that the bitmap entry space stealing doesn't * steal this space region. */ - ret = btrfs_add_free_space(cache, 32 * 1024 * 1024, 8192); + ret = btrfs_add_free_space(cache, SZ_32M, 8192); if (ret) { test_msg("Error adding free space: %d\n", ret); return ret; @@ -800,13 +743,13 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * expand the range covered by the existing extent entry that represents * the free space [128Mb + 128Kb, 128Mb + 256Kb[. */ - ret = btrfs_add_free_space(cache, 128 * 1024 * 1024, 128 * 1024); + ret = btrfs_add_free_space(cache, SZ_128M, SZ_128K); if (ret) { test_msg("Error adding free space: %d\n", ret); return ret; } /* Confirm the region is marked as free. */ - if (!test_check_exists(cache, 128 * 1024 * 1024, 128 * 1024)) { + if (!test_check_exists(cache, SZ_128M, SZ_128K)) { test_msg("Extent region not marked as free\n"); return -ENOENT; } @@ -834,21 +777,19 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) * that represents the 1Mb free space, and therefore we're able to * allocate the whole free space at once. */ - if (!test_check_exists(cache, 128 * 1024 * 1024 - 768 * 1024, - 1 * 1024 * 1024)) { + if (!test_check_exists(cache, SZ_128M - 768 * SZ_1K, SZ_1M)) { test_msg("Expected region not marked as free\n"); return -ENOENT; } - if (cache->free_space_ctl->free_space != (1 * 1024 * 1024 + 8192)) { + if (cache->free_space_ctl->free_space != (SZ_1M + 8192)) { test_msg("Cache free space is not 1Mb + 8Kb\n"); return -EINVAL; } - offset = btrfs_find_space_for_alloc(cache, - 0, 1 * 1024 * 1024, 0, + offset = btrfs_find_space_for_alloc(cache, 0, SZ_1M, 0, &max_extent_size); - if (offset != (128 * 1024 * 1024 - 768 * 1024)) { + if (offset != (SZ_128M - 768 * SZ_1K)) { test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n", offset); return -EINVAL; @@ -867,7 +808,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) offset = btrfs_find_space_for_alloc(cache, 0, 8192, 0, &max_extent_size); - if (offset != (32 * 1024 * 1024)) { + if (offset != SZ_32M) { test_msg("Failed to allocate 8Kb from space cache, returned offset is: %llu\n", offset); return -EINVAL; @@ -877,7 +818,7 @@ test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) if (ret) return ret; - cache->free_space_ctl->op->use_bitmap = use_bitmap_op; + cache->free_space_ctl->op = orig_free_space_ops; __btrfs_remove_free_space_cache(cache->free_space_ctl); return 0; @@ -891,7 +832,7 @@ int btrfs_test_free_space_cache(void) test_msg("Running btrfs free space cache tests\n"); - cache = init_test_block_group(); + cache = btrfs_alloc_dummy_block_group(1024 * 1024 * 1024); if (!cache) { test_msg("Couldn't run the tests\n"); return 0; @@ -922,9 +863,7 @@ int btrfs_test_free_space_cache(void) ret = test_steal_space_from_bitmap_to_extent(cache); out: - __btrfs_remove_free_space_cache(cache->free_space_ctl); - kfree(cache->free_space_ctl); - kfree(cache); + btrfs_free_dummy_block_group(cache); btrfs_free_dummy_root(root); test_msg("Free space cache tests finished\n"); return ret; diff --git a/fs/btrfs/tests/free-space-tree-tests.c b/fs/btrfs/tests/free-space-tree-tests.c new file mode 100644 index 000000000..d05fe1ab4 --- /dev/null +++ b/fs/btrfs/tests/free-space-tree-tests.c @@ -0,0 +1,571 @@ +/* + * Copyright (C) 2015 Facebook. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "btrfs-tests.h" +#include "../ctree.h" +#include "../disk-io.h" +#include "../free-space-tree.h" +#include "../transaction.h" + +struct free_space_extent { + u64 start, length; +}; + +/* + * The test cases align their operations to this in order to hit some of the + * edge cases in the bitmap code. + */ +#define BITMAP_RANGE (BTRFS_FREE_SPACE_BITMAP_BITS * 4096) + +static int __check_free_space_extents(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path, + struct free_space_extent *extents, + unsigned int num_extents) +{ + struct btrfs_free_space_info *info; + struct btrfs_key key; + int prev_bit = 0, bit; + u64 extent_start = 0, offset, end; + u32 flags, extent_count; + unsigned int i; + int ret; + + info = search_free_space_info(trans, fs_info, cache, path, 0); + if (IS_ERR(info)) { + test_msg("Could not find free space info\n"); + ret = PTR_ERR(info); + goto out; + } + flags = btrfs_free_space_flags(path->nodes[0], info); + extent_count = btrfs_free_space_extent_count(path->nodes[0], info); + + if (extent_count != num_extents) { + test_msg("Extent count is wrong\n"); + ret = -EINVAL; + goto out; + } + if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) { + if (path->slots[0] != 0) + goto invalid; + end = cache->key.objectid + cache->key.offset; + i = 0; + while (++path->slots[0] < btrfs_header_nritems(path->nodes[0])) { + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.type != BTRFS_FREE_SPACE_BITMAP_KEY) + goto invalid; + offset = key.objectid; + while (offset < key.objectid + key.offset) { + bit = free_space_test_bit(cache, path, offset); + if (prev_bit == 0 && bit == 1) { + extent_start = offset; + } else if (prev_bit == 1 && bit == 0) { + if (i >= num_extents) + goto invalid; + if (i >= num_extents || + extent_start != extents[i].start || + offset - extent_start != extents[i].length) + goto invalid; + i++; + } + prev_bit = bit; + offset += cache->sectorsize; + } + } + if (prev_bit == 1) { + if (i >= num_extents || + extent_start != extents[i].start || + end - extent_start != extents[i].length) + goto invalid; + i++; + } + if (i != num_extents) + goto invalid; + } else { + if (btrfs_header_nritems(path->nodes[0]) != num_extents + 1 || + path->slots[0] != 0) + goto invalid; + for (i = 0; i < num_extents; i++) { + path->slots[0]++; + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.type != BTRFS_FREE_SPACE_EXTENT_KEY || + key.objectid != extents[i].start || + key.offset != extents[i].length) + goto invalid; + } + } + + ret = 0; +out: + btrfs_release_path(path); + return ret; +invalid: + test_msg("Free space tree is invalid\n"); + ret = -EINVAL; + goto out; +} + +static int check_free_space_extents(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path, + struct free_space_extent *extents, + unsigned int num_extents) +{ + struct btrfs_free_space_info *info; + u32 flags; + int ret; + + info = search_free_space_info(trans, fs_info, cache, path, 0); + if (IS_ERR(info)) { + test_msg("Could not find free space info\n"); + btrfs_release_path(path); + return PTR_ERR(info); + } + flags = btrfs_free_space_flags(path->nodes[0], info); + btrfs_release_path(path); + + ret = __check_free_space_extents(trans, fs_info, cache, path, extents, + num_extents); + if (ret) + return ret; + + /* Flip it to the other format and check that for good measure. */ + if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) { + ret = convert_free_space_to_extents(trans, fs_info, cache, path); + if (ret) { + test_msg("Could not convert to extents\n"); + return ret; + } + } else { + ret = convert_free_space_to_bitmaps(trans, fs_info, cache, path); + if (ret) { + test_msg("Could not convert to bitmaps\n"); + return ret; + } + } + return __check_free_space_extents(trans, fs_info, cache, path, extents, + num_extents); +} + +static int test_empty_block_group(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path) +{ + struct free_space_extent extents[] = { + {cache->key.objectid, cache->key.offset}, + }; + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); +} + +static int test_remove_all(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path) +{ + struct free_space_extent extents[] = {}; + int ret; + + ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid, + cache->key.offset); + if (ret) { + test_msg("Could not remove free space\n"); + return ret; + } + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); +} + +static int test_remove_beginning(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path) +{ + struct free_space_extent extents[] = { + {cache->key.objectid + BITMAP_RANGE, + cache->key.offset - BITMAP_RANGE}, + }; + int ret; + + ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid, BITMAP_RANGE); + if (ret) { + test_msg("Could not remove free space\n"); + return ret; + } + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); + +} + +static int test_remove_end(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path) +{ + struct free_space_extent extents[] = { + {cache->key.objectid, cache->key.offset - BITMAP_RANGE}, + }; + int ret; + + ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid + + cache->key.offset - BITMAP_RANGE, + BITMAP_RANGE); + if (ret) { + test_msg("Could not remove free space\n"); + return ret; + } + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); +} + +static int test_remove_middle(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path) +{ + struct free_space_extent extents[] = { + {cache->key.objectid, BITMAP_RANGE}, + {cache->key.objectid + 2 * BITMAP_RANGE, + cache->key.offset - 2 * BITMAP_RANGE}, + }; + int ret; + + ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid + BITMAP_RANGE, + BITMAP_RANGE); + if (ret) { + test_msg("Could not remove free space\n"); + return ret; + } + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); +} + +static int test_merge_left(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path) +{ + struct free_space_extent extents[] = { + {cache->key.objectid, 2 * BITMAP_RANGE}, + }; + int ret; + + ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid, + cache->key.offset); + if (ret) { + test_msg("Could not remove free space\n"); + return ret; + } + + ret = __add_to_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid, BITMAP_RANGE); + if (ret) { + test_msg("Could not add free space\n"); + return ret; + } + + ret = __add_to_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid + BITMAP_RANGE, + BITMAP_RANGE); + if (ret) { + test_msg("Could not add free space\n"); + return ret; + } + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); +} + +static int test_merge_right(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path) +{ + struct free_space_extent extents[] = { + {cache->key.objectid + BITMAP_RANGE, 2 * BITMAP_RANGE}, + }; + int ret; + + ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid, + cache->key.offset); + if (ret) { + test_msg("Could not remove free space\n"); + return ret; + } + + ret = __add_to_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid + 2 * BITMAP_RANGE, + BITMAP_RANGE); + if (ret) { + test_msg("Could not add free space\n"); + return ret; + } + + ret = __add_to_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid + BITMAP_RANGE, + BITMAP_RANGE); + if (ret) { + test_msg("Could not add free space\n"); + return ret; + } + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); +} + +static int test_merge_both(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path) +{ + struct free_space_extent extents[] = { + {cache->key.objectid, 3 * BITMAP_RANGE}, + }; + int ret; + + ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid, + cache->key.offset); + if (ret) { + test_msg("Could not remove free space\n"); + return ret; + } + + ret = __add_to_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid, BITMAP_RANGE); + if (ret) { + test_msg("Could not add free space\n"); + return ret; + } + + ret = __add_to_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid + 2 * BITMAP_RANGE, + BITMAP_RANGE); + if (ret) { + test_msg("Could not add free space\n"); + return ret; + } + + ret = __add_to_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid + BITMAP_RANGE, + BITMAP_RANGE); + if (ret) { + test_msg("Could not add free space\n"); + return ret; + } + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); +} + +static int test_merge_none(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *cache, + struct btrfs_path *path) +{ + struct free_space_extent extents[] = { + {cache->key.objectid, BITMAP_RANGE}, + {cache->key.objectid + 2 * BITMAP_RANGE, BITMAP_RANGE}, + {cache->key.objectid + 4 * BITMAP_RANGE, BITMAP_RANGE}, + }; + int ret; + + ret = __remove_from_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid, + cache->key.offset); + if (ret) { + test_msg("Could not remove free space\n"); + return ret; + } + + ret = __add_to_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid, BITMAP_RANGE); + if (ret) { + test_msg("Could not add free space\n"); + return ret; + } + + ret = __add_to_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid + 4 * BITMAP_RANGE, + BITMAP_RANGE); + if (ret) { + test_msg("Could not add free space\n"); + return ret; + } + + ret = __add_to_free_space_tree(trans, fs_info, cache, path, + cache->key.objectid + 2 * BITMAP_RANGE, + BITMAP_RANGE); + if (ret) { + test_msg("Could not add free space\n"); + return ret; + } + + return check_free_space_extents(trans, fs_info, cache, path, + extents, ARRAY_SIZE(extents)); +} + +typedef int (*test_func_t)(struct btrfs_trans_handle *, + struct btrfs_fs_info *, + struct btrfs_block_group_cache *, + struct btrfs_path *); + +static int run_test(test_func_t test_func, int bitmaps) +{ + struct btrfs_root *root = NULL; + struct btrfs_block_group_cache *cache = NULL; + struct btrfs_trans_handle trans; + struct btrfs_path *path = NULL; + int ret; + + root = btrfs_alloc_dummy_root(); + if (IS_ERR(root)) { + test_msg("Couldn't allocate dummy root\n"); + ret = PTR_ERR(root); + goto out; + } + + root->fs_info = btrfs_alloc_dummy_fs_info(); + if (!root->fs_info) { + test_msg("Couldn't allocate dummy fs info\n"); + ret = -ENOMEM; + goto out; + } + + btrfs_set_super_compat_ro_flags(root->fs_info->super_copy, + BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE); + root->fs_info->free_space_root = root; + root->fs_info->tree_root = root; + + root->node = alloc_test_extent_buffer(root->fs_info, 4096); + if (!root->node) { + test_msg("Couldn't allocate dummy buffer\n"); + ret = -ENOMEM; + goto out; + } + btrfs_set_header_level(root->node, 0); + btrfs_set_header_nritems(root->node, 0); + root->alloc_bytenr += 8192; + + cache = btrfs_alloc_dummy_block_group(8 * BITMAP_RANGE); + if (!cache) { + test_msg("Couldn't allocate dummy block group cache\n"); + ret = -ENOMEM; + goto out; + } + cache->bitmap_low_thresh = 0; + cache->bitmap_high_thresh = (u32)-1; + cache->needs_free_space = 1; + + btrfs_init_dummy_trans(&trans); + + path = btrfs_alloc_path(); + if (!path) { + test_msg("Couldn't allocate path\n"); + return -ENOMEM; + } + + ret = add_block_group_free_space(&trans, root->fs_info, cache); + if (ret) { + test_msg("Could not add block group free space\n"); + goto out; + } + + if (bitmaps) { + ret = convert_free_space_to_bitmaps(&trans, root->fs_info, + cache, path); + if (ret) { + test_msg("Could not convert block group to bitmaps\n"); + goto out; + } + } + + ret = test_func(&trans, root->fs_info, cache, path); + if (ret) + goto out; + + ret = remove_block_group_free_space(&trans, root->fs_info, cache); + if (ret) { + test_msg("Could not remove block group free space\n"); + goto out; + } + + if (btrfs_header_nritems(root->node) != 0) { + test_msg("Free space tree has leftover items\n"); + ret = -EINVAL; + goto out; + } + + ret = 0; +out: + btrfs_free_path(path); + btrfs_free_dummy_block_group(cache); + btrfs_free_dummy_root(root); + return ret; +} + +static int run_test_both_formats(test_func_t test_func) +{ + int ret; + + ret = run_test(test_func, 0); + if (ret) + return ret; + return run_test(test_func, 1); +} + +int btrfs_test_free_space_tree(void) +{ + test_func_t tests[] = { + test_empty_block_group, + test_remove_all, + test_remove_beginning, + test_remove_end, + test_remove_middle, + test_merge_left, + test_merge_right, + test_merge_both, + test_merge_none, + }; + int i; + + test_msg("Running free space tree tests\n"); + for (i = 0; i < ARRAY_SIZE(tests); i++) { + int ret = run_test_both_formats(tests[i]); + if (ret) { + test_msg("%pf failed\n", tests[i]); + return ret; + } + } + + return 0; +} diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 054fc0d97..e2d3da02d 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -100,7 +100,7 @@ static void insert_inode_item_key(struct btrfs_root *root) static void setup_file_extents(struct btrfs_root *root) { int slot = 0; - u64 disk_bytenr = 1 * 1024 * 1024; + u64 disk_bytenr = SZ_1M; u64 offset = 0; /* First we want a hole */ @@ -974,7 +974,7 @@ static int test_extent_accounting(void) (BTRFS_MAX_EXTENT_SIZE >> 1) + 4095, EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE | EXTENT_DO_ACCOUNTING, 0, 0, - NULL, GFP_NOFS); + NULL, GFP_KERNEL); if (ret) { test_msg("clear_extent_bit returned %d\n", ret); goto out; @@ -1045,7 +1045,7 @@ static int test_extent_accounting(void) BTRFS_MAX_EXTENT_SIZE+8191, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, - NULL, GFP_NOFS); + NULL, GFP_KERNEL); if (ret) { test_msg("clear_extent_bit returned %d\n", ret); goto out; @@ -1079,7 +1079,7 @@ static int test_extent_accounting(void) ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, - NULL, GFP_NOFS); + NULL, GFP_KERNEL); if (ret) { test_msg("clear_extent_bit returned %d\n", ret); goto out; @@ -1096,7 +1096,7 @@ out: clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, EXTENT_DIRTY | EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_UPTODATE, 0, 0, - NULL, GFP_NOFS); + NULL, GFP_KERNEL); iput(inode); btrfs_free_dummy_root(root); return ret; diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c index 846d277b1..8ea5d34bc 100644 --- a/fs/btrfs/tests/qgroup-tests.c +++ b/fs/btrfs/tests/qgroup-tests.c @@ -23,14 +23,6 @@ #include "../qgroup.h" #include "../backref.h" -static void init_dummy_trans(struct btrfs_trans_handle *trans) -{ - memset(trans, 0, sizeof(*trans)); - trans->transid = 1; - INIT_LIST_HEAD(&trans->qgroup_ref_list); - trans->type = __TRANS_DUMMY; -} - static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid) { @@ -44,7 +36,7 @@ static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr, u32 size = sizeof(*item) + sizeof(*iref) + sizeof(*block_info); int ret; - init_dummy_trans(&trans); + btrfs_init_dummy_trans(&trans); ins.objectid = bytenr; ins.type = BTRFS_EXTENT_ITEM_KEY; @@ -94,7 +86,7 @@ static int add_tree_ref(struct btrfs_root *root, u64 bytenr, u64 num_bytes, u64 refs; int ret; - init_dummy_trans(&trans); + btrfs_init_dummy_trans(&trans); key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; @@ -144,7 +136,7 @@ static int remove_extent_item(struct btrfs_root *root, u64 bytenr, struct btrfs_path *path; int ret; - init_dummy_trans(&trans); + btrfs_init_dummy_trans(&trans); key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; @@ -178,7 +170,7 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr, u64 refs; int ret; - init_dummy_trans(&trans); + btrfs_init_dummy_trans(&trans); key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; @@ -232,7 +224,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root) struct ulist *new_roots = NULL; int ret; - init_dummy_trans(&trans); + btrfs_init_dummy_trans(&trans); test_msg("Qgroup basic add\n"); ret = btrfs_create_qgroup(NULL, fs_info, 5); @@ -326,7 +318,7 @@ static int test_multiple_refs(struct btrfs_root *root) struct ulist *new_roots = NULL; int ret; - init_dummy_trans(&trans); + btrfs_init_dummy_trans(&trans); test_msg("Qgroup multiple refs test\n"); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index be8eae80f..b6031ce47 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -75,6 +75,23 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction) list_del_init(&em->list); free_extent_map(em); } + /* + * If any block groups are found in ->deleted_bgs then it's + * because the transaction was aborted and a commit did not + * happen (things failed before writing the new superblock + * and calling btrfs_finish_extent_commit()), so we can not + * discard the physical locations of the block groups. + */ + while (!list_empty(&transaction->deleted_bgs)) { + struct btrfs_block_group_cache *cache; + + cache = list_first_entry(&transaction->deleted_bgs, + struct btrfs_block_group_cache, + bg_list); + list_del_init(&cache->bg_list); + btrfs_put_block_group_trimming(cache); + btrfs_put_block_group(cache); + } kmem_cache_free(btrfs_transaction_cachep, transaction); } } @@ -634,17 +651,20 @@ struct btrfs_trans_handle *btrfs_start_transaction_lflush( struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root) { - return start_transaction(root, 0, TRANS_JOIN, 0); + return start_transaction(root, 0, TRANS_JOIN, + BTRFS_RESERVE_NO_FLUSH); } struct btrfs_trans_handle *btrfs_join_transaction_nolock(struct btrfs_root *root) { - return start_transaction(root, 0, TRANS_JOIN_NOLOCK, 0); + return start_transaction(root, 0, TRANS_JOIN_NOLOCK, + BTRFS_RESERVE_NO_FLUSH); } struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root) { - return start_transaction(root, 0, TRANS_USERSPACE, 0); + return start_transaction(root, 0, TRANS_USERSPACE, + BTRFS_RESERVE_NO_FLUSH); } /* @@ -662,7 +682,8 @@ struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *root */ struct btrfs_trans_handle *btrfs_attach_transaction(struct btrfs_root *root) { - return start_transaction(root, 0, TRANS_ATTACH, 0); + return start_transaction(root, 0, TRANS_ATTACH, + BTRFS_RESERVE_NO_FLUSH); } /* @@ -677,7 +698,8 @@ btrfs_attach_transaction_barrier(struct btrfs_root *root) { struct btrfs_trans_handle *trans; - trans = start_transaction(root, 0, TRANS_ATTACH, 0); + trans = start_transaction(root, 0, TRANS_ATTACH, + BTRFS_RESERVE_NO_FLUSH); if (IS_ERR(trans) && PTR_ERR(trans) == -ENOENT) btrfs_wait_for_commit(root, 0); @@ -1319,17 +1341,11 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, u64 root_flags; uuid_le new_uuid; - path = btrfs_alloc_path(); - if (!path) { - pending->error = -ENOMEM; - return 0; - } + ASSERT(pending->path); + path = pending->path; - new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); - if (!new_root_item) { - pending->error = -ENOMEM; - goto root_item_alloc_fail; - } + ASSERT(pending->root_item); + new_root_item = pending->root_item; pending->error = btrfs_find_free_objectid(tree_root, &objectid); if (pending->error) @@ -1562,8 +1578,10 @@ clear_skip_qgroup: btrfs_clear_skip_qgroup(trans); no_free_objectid: kfree(new_root_item); -root_item_alloc_fail: + pending->root_item = NULL; btrfs_free_path(path); + pending->path = NULL; + return ret; } diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 64c8221b6..72be51f7c 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -137,8 +137,10 @@ struct btrfs_pending_snapshot { struct dentry *dentry; struct inode *dir; struct btrfs_root *root; + struct btrfs_root_item *root_item; struct btrfs_root *snap; struct btrfs_qgroup_inherit *inherit; + struct btrfs_path *path; /* block reservation for the operation */ struct btrfs_block_rsv block_rsv; u64 qgroup_reserved; diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c index f31db4325..cb6508912 100644 --- a/fs/btrfs/tree-defrag.c +++ b/fs/btrfs/tree-defrag.c @@ -89,6 +89,12 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, goto out; } btrfs_release_path(path); + /* + * We don't need a lock on a leaf. btrfs_realloc_node() will lock all + * leafs from path->nodes[1], so set lowest_level to 1 to avoid later + * a deadlock (attempting to write lock an already write locked leaf). + */ + path->lowest_level = 1; wret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (wret < 0) { @@ -99,9 +105,12 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, ret = 0; goto out; } - path->slots[1] = btrfs_header_nritems(path->nodes[1]); - next_key_ret = btrfs_find_next_key(root, path, &key, 1, - min_trans); + /* + * The node at level 1 must always be locked when our path has + * keep_locks set and lowest_level is 1, regardless of the value of + * path->slots[1]. + */ + BUG_ON(path->locks[1] == 0); ret = btrfs_realloc_node(trans, root, path->nodes[1], 0, &last_ret, @@ -110,6 +119,18 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, WARN_ON(ret == -EAGAIN); goto out; } + /* + * Now that we reallocated the node we can find the next key. Note that + * btrfs_find_next_key() can release our path and do another search + * without COWing, this is because even with path->keep_locks = 1, + * btrfs_search_slot() / ctree.c:unlock_up() does not keeps a lock on a + * node when path->slots[node_level - 1] does not point to the last + * item or a slot beyond the last item (ctree.c:unlock_up()). Therefore + * we search for the next key after reallocating our node. + */ + path->slots[1] = btrfs_header_nritems(path->nodes[1]); + next_key_ret = btrfs_find_next_key(root, path, &key, 1, + min_trans); if (next_key_ret == 0) { memcpy(&root->defrag_progress, &key, sizeof(key)); ret = -EAGAIN; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 323e12cc9..978c3a810 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4127,7 +4127,9 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_path *path, struct list_head *logged_list, - struct btrfs_log_ctx *ctx) + struct btrfs_log_ctx *ctx, + const u64 start, + const u64 end) { struct extent_map *em, *n; struct list_head extents; @@ -4166,7 +4168,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, } list_sort(NULL, &extents, extent_cmp); - + /* + * Collect any new ordered extents within the range. This is to + * prevent logging file extent items without waiting for the disk + * location they point to being written. We do this only to deal + * with races against concurrent lockless direct IO writes. + */ + btrfs_get_logged_extents(inode, logged_list, start, end); process: while (!list_empty(&extents)) { em = list_entry(extents.next, struct extent_map, list); @@ -4701,7 +4709,7 @@ log_extents: goto out_unlock; } ret = btrfs_log_changed_extents(trans, root, inode, dst_path, - &logged_list, ctx); + &logged_list, ctx, start, end); if (ret) { err = ret; goto out_unlock; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9c62a6f97..366b33594 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -108,7 +108,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { }, }; -const u64 const btrfs_raid_group[BTRFS_NR_RAID_TYPES] = { +const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10, [BTRFS_RAID_RAID1] = BTRFS_BLOCK_GROUP_RAID1, [BTRFS_RAID_DUP] = BTRFS_BLOCK_GROUP_DUP, @@ -125,6 +125,7 @@ static int btrfs_relocate_sys_chunks(struct btrfs_root *root); static void __btrfs_reset_dev_stats(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); +static void btrfs_close_one_device(struct btrfs_device *device); DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); @@ -1103,7 +1104,7 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = 2; + path->reada = READA_FORWARD; key.objectid = device->devid; key.offset = start; @@ -1183,7 +1184,7 @@ again: struct map_lookup *map; int i; - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; for (i = 0; i < map->num_stripes; i++) { u64 end; @@ -1281,7 +1282,7 @@ again: goto out; } - path->reada = 2; + path->reada = READA_FORWARD; path->search_commit_root = 1; path->skip_locking = 1; @@ -1643,7 +1644,6 @@ static void update_dev_time(char *path_name) return; file_update_time(filp); filp_close(filp, NULL); - return; } static int btrfs_rm_dev_item(struct btrfs_root *root, @@ -2756,7 +2756,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, free_extent_map(em); return -EINVAL; } - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; lock_chunks(root->fs_info->chunk_root); check_system_chunk(trans, extent_root, map->type); unlock_chunks(root->fs_info->chunk_root); @@ -3407,7 +3407,7 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) list_for_each_entry(device, devices, dev_list) { old_size = btrfs_device_get_total_bytes(device); size_to_free = div_factor(old_size, 1); - size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); + size_to_free = min_t(u64, size_to_free, SZ_1M); if (!device->writeable || btrfs_device_get_total_bytes(device) - btrfs_device_get_bytes_used(device) > size_to_free || @@ -3724,14 +3724,6 @@ int btrfs_balance(struct btrfs_balance_control *bctl, goto out; } - /* allow dup'ed data chunks only in mixed mode */ - if (!mixed && (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) && - (bctl->data.target & BTRFS_BLOCK_GROUP_DUP)) { - btrfs_err(fs_info, "dup for data is not allowed"); - ret = -EINVAL; - goto out; - } - /* allow to reduce meta or sys integrity only if force set */ allowed = BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10 | @@ -3757,6 +3749,13 @@ int btrfs_balance(struct btrfs_balance_control *bctl, } } while (read_seqretry(&fs_info->profiles_lock, seq)); + if (btrfs_get_num_tolerated_disk_barrier_failures(bctl->meta.target) < + btrfs_get_num_tolerated_disk_barrier_failures(bctl->data.target)) { + btrfs_warn(fs_info, + "metadata profile 0x%llx has lower redundancy than data profile 0x%llx", + bctl->meta.target, bctl->data.target); + } + if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) { fs_info->num_tolerated_disk_barrier_failures = min( btrfs_calc_num_tolerated_disk_barrier_failures(fs_info), @@ -4269,7 +4268,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) if (!path) return -ENOMEM; - path->reada = 2; + path->reada = READA_FORWARD; lock_chunks(root); @@ -4461,7 +4460,7 @@ static int btrfs_cmp_device_info(const void *a, const void *b) static u32 find_raid56_stripe_len(u32 data_devices, u32 dev_stripe_target) { /* TODO allow them to set a preferred stripe size */ - return 64 * 1024; + return SZ_64K; } static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type) @@ -4529,21 +4528,21 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, ncopies = btrfs_raid_array[index].ncopies; if (type & BTRFS_BLOCK_GROUP_DATA) { - max_stripe_size = 1024 * 1024 * 1024; + max_stripe_size = SZ_1G; max_chunk_size = 10 * max_stripe_size; if (!devs_max) devs_max = BTRFS_MAX_DEVS(info->chunk_root); } else if (type & BTRFS_BLOCK_GROUP_METADATA) { /* for larger filesystems, use larger metadata chunks */ - if (fs_devices->total_rw_bytes > 50ULL * 1024 * 1024 * 1024) - max_stripe_size = 1024 * 1024 * 1024; + if (fs_devices->total_rw_bytes > 50ULL * SZ_1G) + max_stripe_size = SZ_1G; else - max_stripe_size = 256 * 1024 * 1024; + max_stripe_size = SZ_256M; max_chunk_size = max_stripe_size; if (!devs_max) devs_max = BTRFS_MAX_DEVS(info->chunk_root); } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { - max_stripe_size = 32 * 1024 * 1024; + max_stripe_size = SZ_32M; max_chunk_size = 2 * max_stripe_size; if (!devs_max) devs_max = BTRFS_MAX_DEVS_SYS_CHUNK; @@ -4720,7 +4719,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, goto error; } set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); - em->bdev = (struct block_device *)map; + em->map_lookup = map; em->start = start; em->len = num_bytes; em->block_start = 0; @@ -4794,7 +4793,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, u64 dev_offset; u64 stripe_size; int i = 0; - int ret; + int ret = 0; em_tree = &extent_root->fs_info->mapping_tree.map_tree; read_lock(&em_tree->lock); @@ -4815,7 +4814,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, return -EINVAL; } - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; item_size = btrfs_chunk_item_size(map->num_stripes); stripe_size = em->orig_block_len; @@ -4825,20 +4824,32 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, goto out; } + /* + * Take the device list mutex to prevent races with the final phase of + * a device replace operation that replaces the device object associated + * with the map's stripes, because the device object's id can change + * at any time during that final phase of the device replace operation + * (dev-replace.c:btrfs_dev_replace_finishing()). + */ + mutex_lock(&chunk_root->fs_info->fs_devices->device_list_mutex); for (i = 0; i < map->num_stripes; i++) { device = map->stripes[i].dev; dev_offset = map->stripes[i].physical; ret = btrfs_update_device(trans, device); if (ret) - goto out; + break; ret = btrfs_alloc_dev_extent(trans, device, chunk_root->root_key.objectid, BTRFS_FIRST_CHUNK_TREE_OBJECTID, chunk_offset, dev_offset, stripe_size); if (ret) - goto out; + break; + } + if (ret) { + mutex_unlock(&chunk_root->fs_info->fs_devices->device_list_mutex); + goto out; } stripe = &chunk->stripe; @@ -4851,6 +4862,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); stripe++; } + mutex_unlock(&chunk_root->fs_info->fs_devices->device_list_mutex); btrfs_set_stack_chunk_length(chunk, chunk_size); btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); @@ -4957,7 +4969,7 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) if (!em) return 1; - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; for (i = 0; i < map->num_stripes; i++) { if (map->stripes[i].dev->missing) { miss_ndevs++; @@ -5037,7 +5049,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) return 1; } - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) ret = map->num_stripes; else if (map->type & BTRFS_BLOCK_GROUP_RAID10) @@ -5073,7 +5085,7 @@ unsigned long btrfs_full_stripe_len(struct btrfs_root *root, BUG_ON(!em); BUG_ON(em->start > logical || em->start + em->len < logical); - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) len = map->stripe_len * nr_data_stripes(map); free_extent_map(em); @@ -5094,7 +5106,7 @@ int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, BUG_ON(!em); BUG_ON(em->start > logical || em->start + em->len < logical); - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) ret = 1; free_extent_map(em); @@ -5253,7 +5265,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, return -EINVAL; } - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; offset = logical - em->start; stripe_len = map->stripe_len; @@ -5367,35 +5379,33 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, * target drive. */ for (i = 0; i < tmp_num_stripes; i++) { - if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) { - /* - * In case of DUP, in order to keep it - * simple, only add the mirror with the - * lowest physical address - */ - if (found && - physical_of_found <= - tmp_bbio->stripes[i].physical) - continue; - index_srcdev = i; - found = 1; - physical_of_found = - tmp_bbio->stripes[i].physical; - } + if (tmp_bbio->stripes[i].dev->devid != srcdev_devid) + continue; + + /* + * In case of DUP, in order to keep it simple, only add + * the mirror with the lowest physical address + */ + if (found && + physical_of_found <= tmp_bbio->stripes[i].physical) + continue; + + index_srcdev = i; + found = 1; + physical_of_found = tmp_bbio->stripes[i].physical; } - if (found) { - mirror_num = index_srcdev + 1; - patch_the_first_stripe_for_dev_replace = 1; - physical_to_patch_in_first_stripe = physical_of_found; - } else { + btrfs_put_bbio(tmp_bbio); + + if (!found) { WARN_ON(1); ret = -EIO; - btrfs_put_bbio(tmp_bbio); goto out; } - btrfs_put_bbio(tmp_bbio); + mirror_num = index_srcdev + 1; + patch_the_first_stripe_for_dev_replace = 1; + physical_to_patch_in_first_stripe = physical_of_found; } else if (mirror_num > map->num_stripes) { mirror_num = 0; } @@ -5795,7 +5805,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, free_extent_map(em); return -EIO; } - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; length = em->len; rmap_len = map->stripe_len; @@ -6058,7 +6068,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, bbio->fs_info = root->fs_info; atomic_set(&bbio->stripes_pending, bbio->num_stripes); - if (bbio->raid_map) { + if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && + ((rw & WRITE) || (mirror_num > 1))) { /* In this case, map_length has been set to the length of a single stripe; not the whole write */ if (rw & WRITE) { @@ -6199,6 +6210,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, struct extent_map *em; u64 logical; u64 length; + u64 stripe_len; u64 devid; u8 uuid[BTRFS_UUID_SIZE]; int num_stripes; @@ -6207,6 +6219,37 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, logical = key->offset; length = btrfs_chunk_length(leaf, chunk); + stripe_len = btrfs_chunk_stripe_len(leaf, chunk); + num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + /* Validation check */ + if (!num_stripes) { + btrfs_err(root->fs_info, "invalid chunk num_stripes: %u", + num_stripes); + return -EIO; + } + if (!IS_ALIGNED(logical, root->sectorsize)) { + btrfs_err(root->fs_info, + "invalid chunk logical %llu", logical); + return -EIO; + } + if (!length || !IS_ALIGNED(length, root->sectorsize)) { + btrfs_err(root->fs_info, + "invalid chunk length %llu", length); + return -EIO; + } + if (!is_power_of_2(stripe_len)) { + btrfs_err(root->fs_info, "invalid chunk stripe length: %llu", + stripe_len); + return -EIO; + } + if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & + btrfs_chunk_type(leaf, chunk)) { + btrfs_err(root->fs_info, "unrecognized chunk type: %llu", + ~(BTRFS_BLOCK_GROUP_TYPE_MASK | + BTRFS_BLOCK_GROUP_PROFILE_MASK) & + btrfs_chunk_type(leaf, chunk)); + return -EIO; + } read_lock(&map_tree->map_tree.lock); em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); @@ -6223,7 +6266,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, em = alloc_extent_map(); if (!em) return -ENOMEM; - num_stripes = btrfs_chunk_num_stripes(leaf, chunk); map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); if (!map) { free_extent_map(em); @@ -6231,7 +6273,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, } set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); - em->bdev = (struct block_device *)map; + em->map_lookup = map; em->start = logical; em->len = length; em->orig_start = 0; @@ -6466,11 +6508,11 @@ int btrfs_read_sys_array(struct btrfs_root *root) sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET); if (!sb) return -ENOMEM; - btrfs_set_buffer_uptodate(sb); + set_extent_buffer_uptodate(sb); btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0); /* * The sb extent buffer is artifical and just used to read the system array. - * btrfs_set_buffer_uptodate() call does not properly mark all it's + * set_extent_buffer_uptodate() call does not properly mark all it's * pages up-to-date when the page is larger: extent does not cover the * whole page and consequently check_page_uptodate does not find all * the page's extents up-to-date (the hole beyond sb), @@ -6529,6 +6571,9 @@ int btrfs_read_sys_array(struct btrfs_root *root) if (ret) break; } else { + printk(KERN_ERR + "BTRFS: unexpected item type %u in sys_array at offset %u\n", + (u32)key.type, cur_offset); ret = -EIO; break; } @@ -6930,7 +6975,7 @@ void btrfs_update_commit_device_bytes_used(struct btrfs_root *root, /* In order to kick the device replace finish process */ lock_chunks(root); list_for_each_entry(em, &transaction->pending_chunks, list) { - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; for (i = 0; i < map->num_stripes; i++) { dev = map->stripes[i].dev; @@ -6958,7 +7003,7 @@ void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info) } } -void btrfs_close_one_device(struct btrfs_device *device) +static void btrfs_close_one_device(struct btrfs_device *device) { struct btrfs_fs_devices *fs_devices = device->fs_devices; struct btrfs_device *new_device; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index d5c84f6b1..1939ebde6 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -26,7 +26,7 @@ extern struct mutex uuid_mutex; -#define BTRFS_STRIPE_LEN (64 * 1024) +#define BTRFS_STRIPE_LEN SZ_64K struct buffer_head; struct btrfs_pending_bios { @@ -566,6 +566,5 @@ static inline void unlock_chunks(struct btrfs_root *root) struct list_head *btrfs_get_fs_uuids(void); void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info); void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info); -void btrfs_close_one_device(struct btrfs_device *device); #endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 1fcd7b6e7..6c68d6356 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -126,7 +126,7 @@ static int do_setxattr(struct btrfs_trans_handle *trans, * locks the inode's i_mutex before calling setxattr or removexattr. */ if (flags & XATTR_REPLACE) { - ASSERT(mutex_is_locked(&inode->i_mutex)); + ASSERT(inode_is_locked(inode)); di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), name, name_len, 0); if (!di) @@ -283,7 +283,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = 2; + path->reada = READA_FORWARD; /* search for our xattrs */ ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); @@ -351,137 +351,89 @@ err: return ret; } -/* - * List of handlers for synthetic system.* attributes. All real ondisk - * attributes are handled directly. - */ -const struct xattr_handler *btrfs_xattr_handlers[] = { -#ifdef CONFIG_BTRFS_FS_POSIX_ACL - &posix_acl_access_xattr_handler, - &posix_acl_default_xattr_handler, -#endif - NULL, -}; - -/* - * Check if the attribute is in a supported namespace. - * - * This is applied after the check for the synthetic attributes in the system - * namespace. - */ -static int btrfs_is_valid_xattr(const char *name) +static int btrfs_xattr_handler_get(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + void *buffer, size_t size) { - int len = strlen(name); - int prefixlen = 0; - - if (!strncmp(name, XATTR_SECURITY_PREFIX, - XATTR_SECURITY_PREFIX_LEN)) - prefixlen = XATTR_SECURITY_PREFIX_LEN; - else if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - prefixlen = XATTR_SYSTEM_PREFIX_LEN; - else if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) - prefixlen = XATTR_TRUSTED_PREFIX_LEN; - else if (!strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) - prefixlen = XATTR_USER_PREFIX_LEN; - else if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) - prefixlen = XATTR_BTRFS_PREFIX_LEN; - else - return -EOPNOTSUPP; - - /* - * The name cannot consist of just prefix - */ - if (len <= prefixlen) - return -EINVAL; + struct inode *inode = d_inode(dentry); - return 0; + name = xattr_full_name(handler, name); + return __btrfs_getxattr(inode, name, buffer, size); } -ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, - void *buffer, size_t size) +static int btrfs_xattr_handler_set(const struct xattr_handler *handler, + struct dentry *dentry, const char *name, + const void *buffer, size_t size, + int flags) { - int ret; + struct inode *inode = d_inode(dentry); - /* - * If this is a request for a synthetic attribute in the system.* - * namespace use the generic infrastructure to resolve a handler - * for it via sb->s_xattr. - */ - if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return generic_getxattr(dentry, name, buffer, size); + name = xattr_full_name(handler, name); + return __btrfs_setxattr(NULL, inode, name, buffer, size, flags); +} - ret = btrfs_is_valid_xattr(name); - if (ret) - return ret; - return __btrfs_getxattr(d_inode(dentry), name, buffer, size); +static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, + struct dentry *dentry, + const char *name, const void *value, + size_t size, int flags) +{ + name = xattr_full_name(handler, name); + return btrfs_set_prop(d_inode(dentry), name, value, size, flags); } +static const struct xattr_handler btrfs_security_xattr_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .get = btrfs_xattr_handler_get, + .set = btrfs_xattr_handler_set, +}; + +static const struct xattr_handler btrfs_trusted_xattr_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .get = btrfs_xattr_handler_get, + .set = btrfs_xattr_handler_set, +}; + +static const struct xattr_handler btrfs_user_xattr_handler = { + .prefix = XATTR_USER_PREFIX, + .get = btrfs_xattr_handler_get, + .set = btrfs_xattr_handler_set, +}; + +static const struct xattr_handler btrfs_btrfs_xattr_handler = { + .prefix = XATTR_BTRFS_PREFIX, + .get = btrfs_xattr_handler_get, + .set = btrfs_xattr_handler_set_prop, +}; + +const struct xattr_handler *btrfs_xattr_handlers[] = { + &btrfs_security_xattr_handler, +#ifdef CONFIG_BTRFS_FS_POSIX_ACL + &posix_acl_access_xattr_handler, + &posix_acl_default_xattr_handler, +#endif + &btrfs_trusted_xattr_handler, + &btrfs_user_xattr_handler, + &btrfs_btrfs_xattr_handler, + NULL, +}; + int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root; - int ret; - /* - * The permission on security.* and system.* is not checked - * in permission(). - */ if (btrfs_root_readonly(root)) return -EROFS; - - /* - * If this is a request for a synthetic attribute in the system.* - * namespace use the generic infrastructure to resolve a handler - * for it via sb->s_xattr. - */ - if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return generic_setxattr(dentry, name, value, size, flags); - - ret = btrfs_is_valid_xattr(name); - if (ret) - return ret; - - if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) - return btrfs_set_prop(d_inode(dentry), name, - value, size, flags); - - if (size == 0) - value = ""; /* empty EA, do not remove */ - - return __btrfs_setxattr(NULL, d_inode(dentry), name, value, size, - flags); + return generic_setxattr(dentry, name, value, size, flags); } int btrfs_removexattr(struct dentry *dentry, const char *name) { struct btrfs_root *root = BTRFS_I(d_inode(dentry))->root; - int ret; - /* - * The permission on security.* and system.* is not checked - * in permission(). - */ if (btrfs_root_readonly(root)) return -EROFS; - - /* - * If this is a request for a synthetic attribute in the system.* - * namespace use the generic infrastructure to resolve a handler - * for it via sb->s_xattr. - */ - if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) - return generic_removexattr(dentry, name); - - ret = btrfs_is_valid_xattr(name); - if (ret) - return ret; - - if (!strncmp(name, XATTR_BTRFS_PREFIX, XATTR_BTRFS_PREFIX_LEN)) - return btrfs_set_prop(d_inode(dentry), name, - NULL, 0, XATTR_REPLACE); - - return __btrfs_setxattr(NULL, d_inode(dentry), name, NULL, 0, - XATTR_REPLACE); + return generic_removexattr(dentry, name); } static int btrfs_initxattrs(struct inode *inode, @@ -494,7 +446,7 @@ static int btrfs_initxattrs(struct inode *inode, for (xattr = xattr_array; xattr->name != NULL; xattr++) { name = kmalloc(XATTR_SECURITY_PREFIX_LEN + - strlen(xattr->name) + 1, GFP_NOFS); + strlen(xattr->name) + 1, GFP_KERNEL); if (!name) { err = -ENOMEM; break; diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index 5049608d1..96807b3d2 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -28,8 +28,6 @@ extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, extern int __btrfs_setxattr(struct btrfs_trans_handle *trans, struct inode *inode, const char *name, const void *value, size_t size, int flags); -extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, - void *buffer, size_t size); extern int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); extern int btrfs_removexattr(struct dentry *dentry, const char *name); diff --git a/fs/buffer.c b/fs/buffer.c index 4f4cd959d..e1632abb4 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -134,13 +134,10 @@ __clear_page_buffers(struct page *page) static void buffer_io_error(struct buffer_head *bh, char *msg) { - char b[BDEVNAME_SIZE]; - if (!test_bit(BH_Quiet, &bh->b_state)) printk_ratelimited(KERN_ERR - "Buffer I/O error on dev %s, logical block %llu%s\n", - bdevname(bh->b_bdev, b), - (unsigned long long)bh->b_blocknr, msg); + "Buffer I/O error on dev %pg, logical block %llu%s\n", + bh->b_bdev, (unsigned long long)bh->b_blocknr, msg); } /* @@ -237,15 +234,13 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) * elsewhere, don't buffer_error if we had some unmapped buffers */ if (all_mapped) { - char b[BDEVNAME_SIZE]; - printk("__find_get_block_slow() failed. " "block=%llu, b_blocknr=%llu\n", (unsigned long long)block, (unsigned long long)bh->b_blocknr); printk("b_state=0x%08lx, b_size=%zu\n", bh->b_state, bh->b_size); - printk("device %s blocksize: %d\n", bdevname(bdev, b), + printk("device %pg blocksize: %d\n", bdev, 1 << bd_inode->i_blkbits); } out_unlock: @@ -531,10 +526,8 @@ repeat: static void do_thaw_one(struct super_block *sb, void *unused) { - char b[BDEVNAME_SIZE]; while (sb->s_bdev && !thaw_bdev(sb->s_bdev, sb)) - printk(KERN_WARNING "Emergency Thaw on %s\n", - bdevname(sb->s_bdev, b)); + printk(KERN_WARNING "Emergency Thaw on %pg\n", sb->s_bdev); } static void do_thaw_all(struct work_struct *work) @@ -1074,12 +1067,10 @@ grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp) * pagecache index. (this comparison is done using sector_t types). */ if (unlikely(index != block >> sizebits)) { - char b[BDEVNAME_SIZE]; - printk(KERN_ERR "%s: requested out-of-range block %llu for " - "device %s\n", + "device %pg\n", __func__, (unsigned long long)block, - bdevname(bdev, b)); + bdev); return -EIO; } diff --git a/fs/cachefiles/daemon.c b/fs/cachefiles/daemon.c index f601def05..452e98dd7 100644 --- a/fs/cachefiles/daemon.c +++ b/fs/cachefiles/daemon.c @@ -226,15 +226,9 @@ static ssize_t cachefiles_daemon_write(struct file *file, return -EOPNOTSUPP; /* drag the command string into the kernel so we can parse it */ - data = kmalloc(datalen + 1, GFP_KERNEL); - if (!data) - return -ENOMEM; - - ret = -EFAULT; - if (copy_from_user(data, _data, datalen) != 0) - goto error; - - data[datalen] = '\0'; + data = memdup_user_nul(_data, datalen); + if (IS_ERR(data)) + return PTR_ERR(data); ret = -EINVAL; if (memchr(data, '\0', datalen)) diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index afa023dde..675a3332d 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -446,7 +446,7 @@ static int cachefiles_attr_changed(struct fscache_object *_object) return 0; cachefiles_begin_secure(cache, &saved_cred); - mutex_lock(&d_inode(object->backer)->i_mutex); + inode_lock(d_inode(object->backer)); /* if there's an extension to a partial page at the end of the backing * file, we need to discard the partial page so that we pick up new @@ -465,7 +465,7 @@ static int cachefiles_attr_changed(struct fscache_object *_object) ret = notify_change(object->backer, &newattrs, NULL); truncate_failed: - mutex_unlock(&d_inode(object->backer)->i_mutex); + inode_unlock(d_inode(object->backer)); cachefiles_end_secure(cache, saved_cred); if (ret == -EIO) { diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index c4b893453..1c2334c16 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -295,7 +295,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache, cachefiles_mark_object_buried(cache, rep, why); } - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); if (ret == -EIO) cachefiles_io_error(cache, "Unlink failed"); @@ -306,7 +306,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache, /* directories have to be moved to the graveyard */ _debug("move stale object to graveyard"); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); try_again: /* first step is to make up a grave dentry in the graveyard */ @@ -423,13 +423,13 @@ int cachefiles_delete_object(struct cachefiles_cache *cache, dir = dget_parent(object->dentry); - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); if (test_bit(FSCACHE_OBJECT_KILLED_BY_CACHE, &object->fscache.flags)) { /* object allocation for the same key preemptively deleted this * object's file so that it could create its own file */ _debug("object preemptively buried"); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); ret = 0; } else { /* we need to check that our parent is _still_ our parent - it @@ -442,7 +442,7 @@ int cachefiles_delete_object(struct cachefiles_cache *cache, /* it got moved, presumably by cachefilesd culling it, * so it's no longer in the key path and we can ignore * it */ - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); ret = 0; } } @@ -501,7 +501,7 @@ lookup_again: /* search the current directory for the element name */ _debug("lookup '%s'", name); - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); start = jiffies; next = lookup_one_len(name, dir, nlen); @@ -585,7 +585,7 @@ lookup_again: /* process the next component */ if (key) { _debug("advance"); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(dir); dir = next; next = NULL; @@ -623,7 +623,7 @@ lookup_again: /* note that we're now using this object */ ret = cachefiles_mark_object_active(cache, object); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(dir); dir = NULL; @@ -705,7 +705,7 @@ lookup_error: cachefiles_io_error(cache, "Lookup failed"); next = NULL; error: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(next); error_out2: dput(dir); @@ -729,7 +729,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, _enter(",,%s", dirname); /* search the current directory for the element name */ - mutex_lock(&d_inode(dir)->i_mutex); + inode_lock(d_inode(dir)); start = jiffies; subdir = lookup_one_len(dirname, dir, strlen(dirname)); @@ -768,7 +768,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, d_backing_inode(subdir)->i_ino); } - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); /* we need to make sure the subdir is a directory */ ASSERT(d_backing_inode(subdir)); @@ -800,19 +800,19 @@ check_error: return ERR_PTR(ret); mkdir_error: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(subdir); pr_err("mkdir %s failed with error %d\n", dirname, ret); return ERR_PTR(ret); lookup_error: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); ret = PTR_ERR(subdir); pr_err("Lookup %s failed with error %d\n", dirname, ret); return ERR_PTR(ret); nomem_d_alloc: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); _leave(" = -ENOMEM"); return ERR_PTR(-ENOMEM); } @@ -837,7 +837,7 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, // dir, filename); /* look up the victim */ - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); start = jiffies; victim = lookup_one_len(filename, dir, strlen(filename)); @@ -852,7 +852,7 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, * at the netfs's request whilst the cull was in progress */ if (d_is_negative(victim)) { - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(victim); _leave(" = -ENOENT [absent]"); return ERR_PTR(-ENOENT); @@ -881,13 +881,13 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, object_in_use: read_unlock(&cache->active_lock); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(victim); //_leave(" = -EBUSY [in use]"); return ERR_PTR(-EBUSY); lookup_error: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); ret = PTR_ERR(victim); if (ret == -ENOENT) { /* file or dir now absent - probably retired by netfs */ @@ -947,7 +947,7 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, return 0; error_unlock: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); error: dput(victim); if (ret == -ENOENT) { @@ -982,7 +982,7 @@ int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir, if (IS_ERR(victim)) return PTR_ERR(victim); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(victim); //_leave(" = 0"); return 0; diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index 8f84646f1..f19708487 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -49,10 +49,10 @@ struct posix_acl *ceph_get_acl(struct inode *inode, int type) switch (type) { case ACL_TYPE_ACCESS: - name = POSIX_ACL_XATTR_ACCESS; + name = XATTR_NAME_POSIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: - name = POSIX_ACL_XATTR_DEFAULT; + name = XATTR_NAME_POSIX_ACL_DEFAULT; break; default: BUG(); @@ -92,7 +92,7 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) switch (type) { case ACL_TYPE_ACCESS: - name = POSIX_ACL_XATTR_ACCESS; + name = XATTR_NAME_POSIX_ACL_ACCESS; if (acl) { ret = posix_acl_equiv_mode(acl, &new_mode); if (ret < 0) @@ -106,7 +106,7 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type) ret = acl ? -EINVAL : 0; goto out; } - name = POSIX_ACL_XATTR_DEFAULT; + name = XATTR_NAME_POSIX_ACL_DEFAULT; break; default: ret = -EINVAL; @@ -202,11 +202,11 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode, ceph_pagelist_encode_32(pagelist, acl && default_acl ? 2 : 1); if (acl) { - size_t len = strlen(POSIX_ACL_XATTR_ACCESS); + size_t len = strlen(XATTR_NAME_POSIX_ACL_ACCESS); err = ceph_pagelist_reserve(pagelist, len + val_size1 + 8); if (err) goto out_err; - ceph_pagelist_encode_string(pagelist, POSIX_ACL_XATTR_ACCESS, + ceph_pagelist_encode_string(pagelist, XATTR_NAME_POSIX_ACL_ACCESS, len); err = posix_acl_to_xattr(&init_user_ns, acl, tmp_buf, val_size1); @@ -216,12 +216,12 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode, ceph_pagelist_append(pagelist, tmp_buf, val_size1); } if (default_acl) { - size_t len = strlen(POSIX_ACL_XATTR_DEFAULT); + size_t len = strlen(XATTR_NAME_POSIX_ACL_DEFAULT); err = ceph_pagelist_reserve(pagelist, len + val_size2 + 8); if (err) goto out_err; err = ceph_pagelist_encode_string(pagelist, - POSIX_ACL_XATTR_DEFAULT, len); + XATTR_NAME_POSIX_ACL_DEFAULT, len); err = posix_acl_to_xattr(&init_user_ns, default_acl, tmp_buf, val_size2); if (err < 0) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index b7d218a16..19adeb0ef 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1108,7 +1108,7 @@ retry_locked: return 0; /* past end of file? */ - i_size = inode->i_size; /* caller holds i_mutex */ + i_size = i_size_read(inode); if (page_off >= i_size || (pos_in_page == 0 && (pos+len) >= i_size && @@ -1149,7 +1149,6 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, page = grab_cache_page_write_begin(mapping, index, 0); if (!page) return -ENOMEM; - *pagep = page; dout("write_begin file %p inode %p page %p %d~%d\n", file, inode, page, (int)pos, (int)len); @@ -1184,8 +1183,7 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, zero_user_segment(page, from+copied, len); /* did file size increase? */ - /* (no need for i_size_read(); we caller holds i_mutex */ - if (pos+copied > inode->i_size) + if (pos+copied > i_size_read(inode)) check_cap = ceph_inode_set_size(inode, pos+copied); if (!PageUptodate(page)) @@ -1378,11 +1376,13 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = VM_FAULT_NOPAGE; if ((off > size) || - (page->mapping != inode->i_mapping)) + (page->mapping != inode->i_mapping)) { + unlock_page(page); goto out; + } ret = ceph_update_writeable_page(vma->vm_file, off, len, page); - if (ret == 0) { + if (ret >= 0) { /* success. we'll keep the page locked. */ set_page_dirty(page); ret = VM_FAULT_LOCKED; @@ -1393,8 +1393,6 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = VM_FAULT_SIGBUS; } out: - if (ret != VM_FAULT_LOCKED) - unlock_page(page); if (ret == VM_FAULT_LOCKED || ci->i_inline_version != CEPH_INLINE_NONE) { int dirty; @@ -1758,6 +1756,10 @@ int ceph_pool_perm_check(struct ceph_inode_info *ci, int need) u32 pool; int ret, flags; + /* does not support pool namespace yet */ + if (ci->i_pool_ns_len) + return -EIO; + if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode), NOPOOLPERM)) return 0; diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index a4766ded1..a351480db 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -106,7 +106,7 @@ static uint16_t ceph_fscache_inode_get_aux(const void *cookie_netfs_data, memset(&aux, 0, sizeof(aux)); aux.mtime = inode->i_mtime; - aux.size = inode->i_size; + aux.size = i_size_read(inode); memcpy(buffer, &aux, sizeof(aux)); @@ -117,9 +117,7 @@ static void ceph_fscache_inode_get_attr(const void *cookie_netfs_data, uint64_t *size) { const struct ceph_inode_info* ci = cookie_netfs_data; - const struct inode* inode = &ci->vfs_inode; - - *size = inode->i_size; + *size = i_size_read(&ci->vfs_inode); } static enum fscache_checkaux ceph_fscache_inode_check_aux( @@ -134,7 +132,7 @@ static enum fscache_checkaux ceph_fscache_inode_check_aux( memset(&aux, 0, sizeof(aux)); aux.mtime = inode->i_mtime; - aux.size = inode->i_size; + aux.size = i_size_read(inode); if (memcmp(data, &aux, sizeof(aux)) != 0) return FSCACHE_CHECKAUX_OBSOLETE; @@ -197,7 +195,7 @@ void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc, return; /* Avoid multiple racing open requests */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (ci->fscache) goto done; @@ -207,7 +205,7 @@ void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc, ci, true); fscache_check_consistency(ci->fscache); done: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index c69e1253b..6fe0ad26a 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2030,7 +2030,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) if (datasync) goto out; - mutex_lock(&inode->i_mutex); + inode_lock(inode); dirty = try_flush_caps(inode, &flush_tid); dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); @@ -2046,7 +2046,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) ret = wait_event_interruptible(ci->i_cap_wq, caps_are_flushed(inode, flush_tid)); } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); out: dout("fsync %p%s result=%d\n", inode, datasync ? " datasync" : "", ret); return ret; @@ -2753,7 +2753,8 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, void *inline_data, int inline_len, struct ceph_buffer *xattr_buf, struct ceph_mds_session *session, - struct ceph_cap *cap, int issued) + struct ceph_cap *cap, int issued, + u32 pool_ns_len) __releases(ci->i_ceph_lock) __releases(mdsc->snap_rwsem) { @@ -2873,6 +2874,8 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, if (newcaps & (CEPH_CAP_ANY_FILE_RD | CEPH_CAP_ANY_FILE_WR)) { /* file layout may have changed */ ci->i_layout = grant->layout; + ci->i_pool_ns_len = pool_ns_len; + /* size/truncate_seq? */ queue_trunc = ceph_fill_file_size(inode, issued, le32_to_cpu(grant->truncate_seq), @@ -3411,6 +3414,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, u32 inline_len = 0; void *snaptrace; size_t snaptrace_len; + u32 pool_ns_len = 0; void *p, *end; dout("handle_caps from mds%d\n", mds); @@ -3463,6 +3467,21 @@ void ceph_handle_caps(struct ceph_mds_session *session, p += inline_len; } + if (le16_to_cpu(msg->hdr.version) >= 8) { + u64 flush_tid; + u32 caller_uid, caller_gid; + u32 osd_epoch_barrier; + /* version >= 5 */ + ceph_decode_32_safe(&p, end, osd_epoch_barrier, bad); + /* version >= 6 */ + ceph_decode_64_safe(&p, end, flush_tid, bad); + /* version >= 7 */ + ceph_decode_32_safe(&p, end, caller_uid, bad); + ceph_decode_32_safe(&p, end, caller_gid, bad); + /* version >= 8 */ + ceph_decode_32_safe(&p, end, pool_ns_len, bad); + } + /* lookup ino */ inode = ceph_find_inode(sb, vino); ci = ceph_inode(inode); @@ -3518,7 +3537,8 @@ void ceph_handle_caps(struct ceph_mds_session *session, &cap, &issued); handle_cap_grant(mdsc, inode, h, inline_version, inline_data, inline_len, - msg->middle, session, cap, issued); + msg->middle, session, cap, issued, + pool_ns_len); if (realm) ceph_put_snap_realm(mdsc, realm); goto done_unlocked; @@ -3542,7 +3562,8 @@ void ceph_handle_caps(struct ceph_mds_session *session, issued |= __ceph_caps_dirty(ci); handle_cap_grant(mdsc, inode, h, inline_version, inline_data, inline_len, - msg->middle, session, cap, issued); + msg->middle, session, cap, issued, + pool_ns_len); goto done_unlocked; case CEPH_CAP_OP_FLUSH_ACK: diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 9314b4ea2..fd11fb231 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -507,7 +507,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset); loff_t retval; - mutex_lock(&inode->i_mutex); + inode_lock(inode); retval = -EINVAL; switch (whence) { case SEEK_CUR: @@ -542,7 +542,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) } } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return retval; } diff --git a/fs/ceph/export.c b/fs/ceph/export.c index fe02ae7f0..3b3172357 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -215,7 +215,7 @@ static int ceph_get_name(struct dentry *parent, char *name, if (IS_ERR(req)) return PTR_ERR(req); - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); req->r_inode = d_inode(child); ihold(d_inode(child)); @@ -224,7 +224,7 @@ static int ceph_get_name(struct dentry *parent, char *name, req->r_num_caps = 2; err = ceph_mdsc_do_request(mdsc, NULL, req); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); if (!err) { struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 3c68e6aee..eb9028e8c 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -397,8 +397,9 @@ int ceph_release(struct inode *inode, struct file *file) } enum { - CHECK_EOF = 1, - READ_INLINE = 2, + HAVE_RETRIED = 1, + CHECK_EOF = 2, + READ_INLINE = 3, }; /* @@ -411,17 +412,15 @@ enum { static int striped_read(struct inode *inode, u64 off, u64 len, struct page **pages, int num_pages, - int *checkeof, bool o_direct, - unsigned long buf_align) + int *checkeof) { struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); u64 pos, this_len, left; - int io_align, page_align; - int pages_left; - int read; + loff_t i_size; + int page_align, pages_left; + int read, ret; struct page **page_pos; - int ret; bool hit_stripe, was_short; /* @@ -432,13 +431,9 @@ static int striped_read(struct inode *inode, page_pos = pages; pages_left = num_pages; read = 0; - io_align = off & ~PAGE_MASK; more: - if (o_direct) - page_align = (pos - io_align + buf_align) & ~PAGE_MASK; - else - page_align = pos & ~PAGE_MASK; + page_align = pos & ~PAGE_MASK; this_len = left; ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), &ci->i_layout, pos, &this_len, @@ -452,13 +447,12 @@ more: dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, left, read, ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : ""); + i_size = i_size_read(inode); if (ret >= 0) { int didpages; - if (was_short && (pos + ret < inode->i_size)) { - int zlen = min(this_len - ret, - inode->i_size - pos - ret); - int zoff = (o_direct ? buf_align : io_align) + - read + ret; + if (was_short && (pos + ret < i_size)) { + int zlen = min(this_len - ret, i_size - pos - ret); + int zoff = (off & ~PAGE_MASK) + read + ret; dout(" zero gap %llu to %llu\n", pos + ret, pos + ret + zlen); ceph_zero_page_vector_range(zoff, zlen, pages); @@ -473,14 +467,14 @@ more: pages_left -= didpages; /* hit stripe and need continue*/ - if (left && hit_stripe && pos < inode->i_size) + if (left && hit_stripe && pos < i_size) goto more; } if (read > 0) { ret = read; /* did we bounce off eof? */ - if (pos + left > inode->i_size) + if (pos + left > i_size) *checkeof = CHECK_EOF; } @@ -521,54 +515,28 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, if (ret < 0) return ret; - if (iocb->ki_flags & IOCB_DIRECT) { - while (iov_iter_count(i)) { - size_t start; - ssize_t n; - - n = dio_get_pagev_size(i); - pages = dio_get_pages_alloc(i, n, &start, &num_pages); - if (IS_ERR(pages)) - return PTR_ERR(pages); - - ret = striped_read(inode, off, n, - pages, num_pages, checkeof, - 1, start); - - ceph_put_page_vector(pages, num_pages, true); - - if (ret <= 0) - break; - off += ret; - iov_iter_advance(i, ret); - if (ret < n) + num_pages = calc_pages_for(off, len); + pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); + if (IS_ERR(pages)) + return PTR_ERR(pages); + ret = striped_read(inode, off, len, pages, + num_pages, checkeof); + if (ret > 0) { + int l, k = 0; + size_t left = ret; + + while (left) { + size_t page_off = off & ~PAGE_MASK; + size_t copy = min_t(size_t, left, + PAGE_SIZE - page_off); + l = copy_page_to_iter(pages[k++], page_off, copy, i); + off += l; + left -= l; + if (l < copy) break; } - } else { - num_pages = calc_pages_for(off, len); - pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); - if (IS_ERR(pages)) - return PTR_ERR(pages); - ret = striped_read(inode, off, len, pages, - num_pages, checkeof, 0, 0); - if (ret > 0) { - int l, k = 0; - size_t left = ret; - - while (left) { - size_t page_off = off & ~PAGE_MASK; - size_t copy = min_t(size_t, - PAGE_SIZE - page_off, left); - l = copy_page_to_iter(pages[k++], page_off, - copy, i); - off += l; - left -= l; - if (l < copy) - break; - } - } - ceph_release_page_vector(pages, num_pages); } + ceph_release_page_vector(pages, num_pages); if (off > iocb->ki_pos) { ret = off - iocb->ki_pos; @@ -579,6 +547,193 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, return ret; } +struct ceph_aio_request { + struct kiocb *iocb; + size_t total_len; + int write; + int error; + struct list_head osd_reqs; + unsigned num_reqs; + atomic_t pending_reqs; + struct timespec mtime; + struct ceph_cap_flush *prealloc_cf; +}; + +struct ceph_aio_work { + struct work_struct work; + struct ceph_osd_request *req; +}; + +static void ceph_aio_retry_work(struct work_struct *work); + +static void ceph_aio_complete(struct inode *inode, + struct ceph_aio_request *aio_req) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret; + + if (!atomic_dec_and_test(&aio_req->pending_reqs)) + return; + + ret = aio_req->error; + if (!ret) + ret = aio_req->total_len; + + dout("ceph_aio_complete %p rc %d\n", inode, ret); + + if (ret >= 0 && aio_req->write) { + int dirty; + + loff_t endoff = aio_req->iocb->ki_pos + aio_req->total_len; + if (endoff > i_size_read(inode)) { + if (ceph_inode_set_size(inode, endoff)) + ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); + } + + spin_lock(&ci->i_ceph_lock); + ci->i_inline_version = CEPH_INLINE_NONE; + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, + &aio_req->prealloc_cf); + spin_unlock(&ci->i_ceph_lock); + if (dirty) + __mark_inode_dirty(inode, dirty); + + } + + ceph_put_cap_refs(ci, (aio_req->write ? CEPH_CAP_FILE_WR : + CEPH_CAP_FILE_RD)); + + aio_req->iocb->ki_complete(aio_req->iocb, ret, 0); + + ceph_free_cap_flush(aio_req->prealloc_cf); + kfree(aio_req); +} + +static void ceph_aio_complete_req(struct ceph_osd_request *req, + struct ceph_msg *msg) +{ + int rc = req->r_result; + struct inode *inode = req->r_inode; + struct ceph_aio_request *aio_req = req->r_priv; + struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0); + int num_pages = calc_pages_for((u64)osd_data->alignment, + osd_data->length); + + dout("ceph_aio_complete_req %p rc %d bytes %llu\n", + inode, rc, osd_data->length); + + if (rc == -EOLDSNAPC) { + struct ceph_aio_work *aio_work; + BUG_ON(!aio_req->write); + + aio_work = kmalloc(sizeof(*aio_work), GFP_NOFS); + if (aio_work) { + INIT_WORK(&aio_work->work, ceph_aio_retry_work); + aio_work->req = req; + queue_work(ceph_inode_to_client(inode)->wb_wq, + &aio_work->work); + return; + } + rc = -ENOMEM; + } else if (!aio_req->write) { + if (rc == -ENOENT) + rc = 0; + if (rc >= 0 && osd_data->length > rc) { + int zoff = osd_data->alignment + rc; + int zlen = osd_data->length - rc; + /* + * If read is satisfied by single OSD request, + * it can pass EOF. Otherwise read is within + * i_size. + */ + if (aio_req->num_reqs == 1) { + loff_t i_size = i_size_read(inode); + loff_t endoff = aio_req->iocb->ki_pos + rc; + if (endoff < i_size) + zlen = min_t(size_t, zlen, + i_size - endoff); + aio_req->total_len = rc + zlen; + } + + if (zlen > 0) + ceph_zero_page_vector_range(zoff, zlen, + osd_data->pages); + } + } + + ceph_put_page_vector(osd_data->pages, num_pages, false); + ceph_osdc_put_request(req); + + if (rc < 0) + cmpxchg(&aio_req->error, 0, rc); + + ceph_aio_complete(inode, aio_req); + return; +} + +static void ceph_aio_retry_work(struct work_struct *work) +{ + struct ceph_aio_work *aio_work = + container_of(work, struct ceph_aio_work, work); + struct ceph_osd_request *orig_req = aio_work->req; + struct ceph_aio_request *aio_req = orig_req->r_priv; + struct inode *inode = orig_req->r_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_snap_context *snapc; + struct ceph_osd_request *req; + int ret; + + spin_lock(&ci->i_ceph_lock); + if (__ceph_have_pending_cap_snap(ci)) { + struct ceph_cap_snap *capsnap = + list_last_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, + ci_item); + snapc = ceph_get_snap_context(capsnap->context); + } else { + BUG_ON(!ci->i_head_snapc); + snapc = ceph_get_snap_context(ci->i_head_snapc); + } + spin_unlock(&ci->i_ceph_lock); + + req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 2, + false, GFP_NOFS); + if (!req) { + ret = -ENOMEM; + req = orig_req; + goto out; + } + + req->r_flags = CEPH_OSD_FLAG_ORDERSNAP | + CEPH_OSD_FLAG_ONDISK | + CEPH_OSD_FLAG_WRITE; + req->r_base_oloc = orig_req->r_base_oloc; + req->r_base_oid = orig_req->r_base_oid; + + req->r_ops[0] = orig_req->r_ops[0]; + osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); + + ceph_osdc_build_request(req, req->r_ops[0].extent.offset, + snapc, CEPH_NOSNAP, &aio_req->mtime); + + ceph_osdc_put_request(orig_req); + + req->r_callback = ceph_aio_complete_req; + req->r_inode = inode; + req->r_priv = aio_req; + + ret = ceph_osdc_start_request(req->r_osdc, req, false); +out: + if (ret < 0) { + BUG_ON(ret == -EOLDSNAPC); + req->r_result = ret; + ceph_aio_complete_req(req, NULL); + } + + ceph_put_snap_context(snapc); + kfree(aio_work); +} + /* * Write commit request unsafe callback, called to tell us when a * request is unsafe (that is, in flight--has been handed to the @@ -612,16 +767,10 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) } -/* - * Synchronous write, straight from __user pointer or user pages. - * - * If write spans object boundary, just do multiple writes. (For a - * correct atomic write, we should e.g. take write locks on all - * objects, rollback on failure, etc.) - */ static ssize_t -ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, - struct ceph_snap_context *snapc) +ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, + struct ceph_snap_context *snapc, + struct ceph_cap_flush **pcf) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); @@ -630,44 +779,52 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, struct ceph_vino vino; struct ceph_osd_request *req; struct page **pages; - int num_pages; - int written = 0; + struct ceph_aio_request *aio_req = NULL; + int num_pages = 0; int flags; - int check_caps = 0; int ret; struct timespec mtime = CURRENT_TIME; - size_t count = iov_iter_count(from); + size_t count = iov_iter_count(iter); + loff_t pos = iocb->ki_pos; + bool write = iov_iter_rw(iter) == WRITE; - if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) + if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP) return -EROFS; - dout("sync_direct_write on file %p %lld~%u\n", file, pos, - (unsigned)count); + dout("sync_direct_read_write (%s) on file %p %lld~%u\n", + (write ? "write" : "read"), file, pos, (unsigned)count); ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); if (ret < 0) return ret; - ret = invalidate_inode_pages2_range(inode->i_mapping, - pos >> PAGE_CACHE_SHIFT, - (pos + count) >> PAGE_CACHE_SHIFT); - if (ret < 0) - dout("invalidate_inode_pages2_range returned %d\n", ret); + if (write) { + ret = invalidate_inode_pages2_range(inode->i_mapping, + pos >> PAGE_CACHE_SHIFT, + (pos + count) >> PAGE_CACHE_SHIFT); + if (ret < 0) + dout("invalidate_inode_pages2_range returned %d\n", ret); - flags = CEPH_OSD_FLAG_ORDERSNAP | - CEPH_OSD_FLAG_ONDISK | - CEPH_OSD_FLAG_WRITE; + flags = CEPH_OSD_FLAG_ORDERSNAP | + CEPH_OSD_FLAG_ONDISK | + CEPH_OSD_FLAG_WRITE; + } else { + flags = CEPH_OSD_FLAG_READ; + } - while (iov_iter_count(from) > 0) { - u64 len = dio_get_pagev_size(from); - size_t start; - ssize_t n; + while (iov_iter_count(iter) > 0) { + u64 size = dio_get_pagev_size(iter); + size_t start = 0; + ssize_t len; vino = ceph_vino(inode); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, - vino, pos, &len, 0, - 2,/*include a 'startsync' command*/ - CEPH_OSD_OP_WRITE, flags, snapc, + vino, pos, &size, 0, + /*include a 'startsync' command*/ + write ? 2 : 1, + write ? CEPH_OSD_OP_WRITE : + CEPH_OSD_OP_READ, + flags, snapc, ci->i_truncate_seq, ci->i_truncate_size, false); @@ -676,10 +833,8 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, break; } - osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); - - n = len; - pages = dio_get_pages_alloc(from, len, &start, &num_pages); + len = size; + pages = dio_get_pages_alloc(iter, len, &start, &num_pages); if (IS_ERR(pages)) { ceph_osdc_put_request(req); ret = PTR_ERR(pages); @@ -687,47 +842,128 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, } /* - * throw out any page cache pages in this range. this - * may block. + * To simplify error handling, allow AIO when IO within i_size + * or IO can be satisfied by single OSD request. */ - truncate_inode_pages_range(inode->i_mapping, pos, - (pos+n) | (PAGE_CACHE_SIZE-1)); - osd_req_op_extent_osd_data_pages(req, 0, pages, n, start, - false, false); + if (pos == iocb->ki_pos && !is_sync_kiocb(iocb) && + (len == count || pos + count <= i_size_read(inode))) { + aio_req = kzalloc(sizeof(*aio_req), GFP_KERNEL); + if (aio_req) { + aio_req->iocb = iocb; + aio_req->write = write; + INIT_LIST_HEAD(&aio_req->osd_reqs); + if (write) { + aio_req->mtime = mtime; + swap(aio_req->prealloc_cf, *pcf); + } + } + /* ignore error */ + } + + if (write) { + /* + * throw out any page cache pages in this range. this + * may block. + */ + truncate_inode_pages_range(inode->i_mapping, pos, + (pos+len) | (PAGE_CACHE_SIZE - 1)); + + osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); + } + + + osd_req_op_extent_osd_data_pages(req, 0, pages, len, start, + false, false); - /* BUG_ON(vino.snap != CEPH_NOSNAP); */ ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); - ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); + if (aio_req) { + aio_req->total_len += len; + aio_req->num_reqs++; + atomic_inc(&aio_req->pending_reqs); + + req->r_callback = ceph_aio_complete_req; + req->r_inode = inode; + req->r_priv = aio_req; + list_add_tail(&req->r_unsafe_item, &aio_req->osd_reqs); + + pos += len; + iov_iter_advance(iter, len); + continue; + } + + ret = ceph_osdc_start_request(req->r_osdc, req, false); if (!ret) ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + size = i_size_read(inode); + if (!write) { + if (ret == -ENOENT) + ret = 0; + if (ret >= 0 && ret < len && pos + ret < size) { + int zlen = min_t(size_t, len - ret, + size - pos - ret); + ceph_zero_page_vector_range(start + ret, zlen, + pages); + ret += zlen; + } + if (ret >= 0) + len = ret; + } + ceph_put_page_vector(pages, num_pages, false); ceph_osdc_put_request(req); - if (ret) + if (ret < 0) + break; + + pos += len; + iov_iter_advance(iter, len); + + if (!write && pos >= size) break; - pos += n; - written += n; - iov_iter_advance(from, n); - if (pos > i_size_read(inode)) { - check_caps = ceph_inode_set_size(inode, pos); - if (check_caps) + if (write && pos > size) { + if (ceph_inode_set_size(inode, pos)) ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL); } } - if (ret != -EOLDSNAPC && written > 0) { + if (aio_req) { + if (aio_req->num_reqs == 0) { + kfree(aio_req); + return ret; + } + + ceph_get_cap_refs(ci, write ? CEPH_CAP_FILE_WR : + CEPH_CAP_FILE_RD); + + while (!list_empty(&aio_req->osd_reqs)) { + req = list_first_entry(&aio_req->osd_reqs, + struct ceph_osd_request, + r_unsafe_item); + list_del_init(&req->r_unsafe_item); + if (ret >= 0) + ret = ceph_osdc_start_request(req->r_osdc, + req, false); + if (ret < 0) { + BUG_ON(ret == -EOLDSNAPC); + req->r_result = ret; + ceph_aio_complete_req(req, NULL); + } + } + return -EIOCBQUEUED; + } + + if (ret != -EOLDSNAPC && pos > iocb->ki_pos) { + ret = pos - iocb->ki_pos; iocb->ki_pos = pos; - ret = written; } return ret; } - /* * Synchronous write, straight from __user pointer or user pages. * @@ -897,8 +1133,14 @@ again: ceph_cap_string(got)); if (ci->i_inline_version == CEPH_INLINE_NONE) { - /* hmm, this isn't really async... */ - ret = ceph_sync_read(iocb, to, &retry_op); + if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) { + ret = ceph_direct_read_write(iocb, to, + NULL, NULL); + if (ret >= 0 && ret < len) + retry_op = CHECK_EOF; + } else { + ret = ceph_sync_read(iocb, to, &retry_op); + } } else { retry_op = READ_INLINE; } @@ -916,7 +1158,7 @@ again: pinned_page = NULL; } ceph_put_cap_refs(ci, got); - if (retry_op && ret >= 0) { + if (retry_op > HAVE_RETRIED && ret >= 0) { int statret; struct page *page = NULL; loff_t i_size; @@ -968,12 +1210,11 @@ again: if (retry_op == CHECK_EOF && iocb->ki_pos < i_size && ret < len) { dout("sync_read hit hole, ppos %lld < size %lld" - ", reading more\n", iocb->ki_pos, - inode->i_size); + ", reading more\n", iocb->ki_pos, i_size); read += ret; len -= ret; - retry_op = 0; + retry_op = HAVE_RETRIED; goto again; } } @@ -1014,7 +1255,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) if (!prealloc_cf) return -ENOMEM; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* We can write back this queue in page reclaim */ current->backing_dev_info = inode_to_bdi(inode); @@ -1052,7 +1293,7 @@ retry_snap: } dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n", - inode, ceph_vinop(inode), pos, count, inode->i_size); + inode, ceph_vinop(inode), pos, count, i_size_read(inode)); if (fi->fmode & CEPH_FILE_MODE_LAZY) want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; else @@ -1070,7 +1311,7 @@ retry_snap: (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC)) { struct ceph_snap_context *snapc; struct iov_iter data; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); spin_lock(&ci->i_ceph_lock); if (__ceph_have_pending_cap_snap(ci)) { @@ -1088,8 +1329,8 @@ retry_snap: /* we might need to revert back to that point */ data = *from; if (iocb->ki_flags & IOCB_DIRECT) - written = ceph_sync_direct_write(iocb, &data, pos, - snapc); + written = ceph_direct_read_write(iocb, &data, snapc, + &prealloc_cf); else written = ceph_sync_write(iocb, &data, pos, snapc); if (written == -EOLDSNAPC) { @@ -1097,14 +1338,14 @@ retry_snap: "got EOLDSNAPC, retrying\n", inode, ceph_vinop(inode), pos, (unsigned)count); - mutex_lock(&inode->i_mutex); + inode_lock(inode); goto retry_snap; } if (written > 0) iov_iter_advance(from, written); ceph_put_snap_context(snapc); } else { - loff_t old_size = inode->i_size; + loff_t old_size = i_size_read(inode); /* * No need to acquire the i_truncate_mutex. Because * the MDS revokes Fwb caps before sending truncate @@ -1115,9 +1356,9 @@ retry_snap: written = generic_perform_write(file, from, pos); if (likely(written >= 0)) iocb->ki_pos = pos + written; - if (inode->i_size > old_size) + if (i_size_read(inode) > old_size) ceph_fscache_update_objectsize(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } if (written >= 0) { @@ -1147,7 +1388,7 @@ retry_snap: goto out_unlocked; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); out_unlocked: ceph_free_cap_flush(prealloc_cf); current->backing_dev_info = NULL; @@ -1160,9 +1401,10 @@ out_unlocked: static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; + loff_t i_size; int ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) { ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false); @@ -1172,9 +1414,10 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) } } + i_size = i_size_read(inode); switch (whence) { case SEEK_END: - offset += inode->i_size; + offset += i_size; break; case SEEK_CUR: /* @@ -1190,24 +1433,24 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) offset += file->f_pos; break; case SEEK_DATA: - if (offset >= inode->i_size) { + if (offset >= i_size) { ret = -ENXIO; goto out; } break; case SEEK_HOLE: - if (offset >= inode->i_size) { + if (offset >= i_size) { ret = -ENXIO; goto out; } - offset = inode->i_size; + offset = i_size; break; } offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return offset; } @@ -1363,7 +1606,7 @@ static long ceph_fallocate(struct file *file, int mode, if (!prealloc_cf) return -ENOMEM; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (ceph_snap(inode) != CEPH_NOSNAP) { ret = -EROFS; @@ -1418,7 +1661,7 @@ static long ceph_fallocate(struct file *file, int mode, ceph_put_cap_refs(ci, got); unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); ceph_free_cap_flush(prealloc_cf); return ret; } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 498dcfa2d..5849b88bb 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -396,6 +396,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_symlink = NULL; memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); + ci->i_pool_ns_len = 0; ci->i_fragtree = RB_ROOT; mutex_init(&ci->i_fragtree_mutex); @@ -548,7 +549,7 @@ int ceph_fill_file_size(struct inode *inode, int issued, if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 || (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) { dout("size %lld -> %llu\n", inode->i_size, size); - inode->i_size = size; + i_size_write(inode, size); inode->i_blocks = (size + (1<<9) - 1) >> 9; ci->i_reported_size = size; if (truncate_seq != ci->i_truncate_seq) { @@ -756,6 +757,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page, if (ci->i_layout.fl_pg_pool != info->layout.fl_pg_pool) ci->i_ceph_flags &= ~CEPH_I_POOL_PERM; ci->i_layout = info->layout; + ci->i_pool_ns_len = iinfo->pool_ns_len; queue_trunc = ceph_fill_file_size(inode, issued, le32_to_cpu(info->truncate_seq), @@ -808,7 +810,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page, spin_unlock(&ci->i_ceph_lock); err = -EINVAL; - if (WARN_ON(symlen != inode->i_size)) + if (WARN_ON(symlen != i_size_read(inode))) goto out; err = -ENOMEM; @@ -1549,7 +1551,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size) spin_lock(&ci->i_ceph_lock); dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size); - inode->i_size = size; + i_size_write(inode, size); inode->i_blocks = (size + (1 << 9) - 1) >> 9; /* tell the MDS if we are approaching max_size */ @@ -1756,7 +1758,7 @@ retry: */ static const struct inode_operations ceph_symlink_iops = { .readlink = generic_readlink, - .follow_link = simple_follow_link, + .get_link = simple_get_link, .setattr = ceph_setattr, .getattr = ceph_getattr, .setxattr = ceph_setxattr, @@ -1911,7 +1913,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) inode->i_size, attr->ia_size); if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size > inode->i_size) { - inode->i_size = attr->ia_size; + i_size_write(inode, attr->ia_size); inode->i_blocks = (attr->ia_size + (1 << 9) - 1) >> 9; inode->i_ctime = attr->ia_ctime; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index e7b130a63..911d64d86 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -100,6 +100,14 @@ static int parse_reply_info_in(void **p, void *end, } else info->inline_version = CEPH_INLINE_NONE; + if (features & CEPH_FEATURE_FS_FILE_LAYOUT_V2) { + ceph_decode_32_safe(p, end, info->pool_ns_len, bad); + ceph_decode_need(p, end, info->pool_ns_len, bad); + *p += info->pool_ns_len; + } else { + info->pool_ns_len = 0; + } + return 0; bad: return err; @@ -2298,6 +2306,14 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc, ceph_get_cap_refs(ceph_inode(req->r_old_dentry_dir), CEPH_CAP_PIN); + /* deny access to directories with pool_ns layouts */ + if (req->r_inode && S_ISDIR(req->r_inode->i_mode) && + ceph_inode(req->r_inode)->i_pool_ns_len) + return -EIO; + if (req->r_locked_dir && + ceph_inode(req->r_locked_dir)->i_pool_ns_len) + return -EIO; + /* issue */ mutex_lock(&mdsc->mutex); __register_request(mdsc, req, dir); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index ccf11ef0c..37712ccff 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -44,6 +44,7 @@ struct ceph_mds_reply_info_in { u64 inline_version; u32 inline_len; char *inline_data; + u32 pool_ns_len; }; /* diff --git a/fs/ceph/super.c b/fs/ceph/super.c index f446afada..ca4d5e845 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -639,8 +639,8 @@ static int __init init_caches(void) ceph_inode_cachep = kmem_cache_create("ceph_inode_info", sizeof(struct ceph_inode_info), __alignof__(struct ceph_inode_info), - (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD), - ceph_inode_init_once); + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| + SLAB_ACCOUNT, ceph_inode_init_once); if (ceph_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 75b7d125c..9c458eb52 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -287,6 +287,7 @@ struct ceph_inode_info { struct ceph_dir_layout i_dir_layout; struct ceph_file_layout i_layout; + size_t i_pool_ns_len; char *i_symlink; /* for dirs */ diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index 7dc886c9a..e956cba94 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -175,7 +175,7 @@ char *cifs_compose_mount_options(const char *sb_mountdata, * string to the length of the original string to allow for worst case. */ md_len = strlen(sb_mountdata) + INET6_ADDRSTRLEN; - mountdata = kzalloc(md_len + 1, GFP_KERNEL); + mountdata = kzalloc(md_len + sizeof("ip=") + 1, GFP_KERNEL); if (mountdata == NULL) { rc = -ENOMEM; goto compose_mount_options_err; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index cbc0f4bca..2eea40353 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -507,6 +507,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",rsize=%u", cifs_sb->rsize); seq_printf(s, ",wsize=%u", cifs_sb->wsize); + seq_printf(s, ",echo_interval=%lu", + tcon->ses->server->echo_interval / HZ); /* convert actimeo and display it in seconds */ seq_printf(s, ",actimeo=%lu", cifs_sb->actimeo / HZ); @@ -640,9 +642,9 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) while (*s && *s != sep) s++; - mutex_lock(&dir->i_mutex); + inode_lock(dir); child = lookup_one_len(p, dentry, s - p); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); dput(dentry); dentry = child; } while (!IS_ERR(dentry)); @@ -752,6 +754,9 @@ cifs_loose_read_iter(struct kiocb *iocb, struct iov_iter *iter) ssize_t rc; struct inode *inode = file_inode(iocb->ki_filp); + if (iocb->ki_filp->f_flags & O_DIRECT) + return cifs_user_readv(iocb, iter); + rc = cifs_revalidate_mapping(inode); if (rc) return rc; @@ -766,6 +771,18 @@ static ssize_t cifs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ssize_t written; int rc; + if (iocb->ki_filp->f_flags & O_DIRECT) { + written = cifs_user_writev(iocb, from); + if (written > 0 && CIFS_CACHE_READ(cinode)) { + cifs_zap_mapping(inode); + cifs_dbg(FYI, + "Set no oplock for inode=%p after a write operation\n", + inode); + cinode->oplock = 0; + } + return written; + } + written = cifs_get_writer(cinode); if (written) return written; @@ -900,8 +917,7 @@ const struct inode_operations cifs_file_inode_ops = { const struct inode_operations cifs_symlink_inode_ops = { .readlink = generic_readlink, - .follow_link = cifs_follow_link, - .put_link = kfree_put_link, + .get_link = cifs_get_link, .permission = cifs_permission, /* BB add the following two eventually */ /* revalidate: cifs_revalidate, @@ -914,6 +930,59 @@ const struct inode_operations cifs_symlink_inode_ops = { #endif }; +static int cifs_clone_file_range(struct file *src_file, loff_t off, + struct file *dst_file, loff_t destoff, u64 len) +{ + struct inode *src_inode = file_inode(src_file); + struct inode *target_inode = file_inode(dst_file); + struct cifsFileInfo *smb_file_src = src_file->private_data; + struct cifsFileInfo *smb_file_target = dst_file->private_data; + struct cifs_tcon *target_tcon = tlink_tcon(smb_file_target->tlink); + unsigned int xid; + int rc; + + cifs_dbg(FYI, "clone range\n"); + + xid = get_xid(); + + if (!src_file->private_data || !dst_file->private_data) { + rc = -EBADF; + cifs_dbg(VFS, "missing cifsFileInfo on copy range src file\n"); + goto out; + } + + /* + * Note: cifs case is easier than btrfs since server responsible for + * checks for proper open modes and file type and if it wants + * server could even support copy of range where source = target + */ + lock_two_nondirectories(target_inode, src_inode); + + if (len == 0) + len = src_inode->i_size - off; + + cifs_dbg(FYI, "about to flush pages\n"); + /* should we flush first and last page first */ + truncate_inode_pages_range(&target_inode->i_data, destoff, + PAGE_CACHE_ALIGN(destoff + len)-1); + + if (target_tcon->ses->server->ops->duplicate_extents) + rc = target_tcon->ses->server->ops->duplicate_extents(xid, + smb_file_src, smb_file_target, off, len, destoff); + else + rc = -EOPNOTSUPP; + + /* force revalidate of size and timestamps of target file now + that target is updated on the server */ + CIFS_I(target_inode)->time = 0; + /* although unlocking in the reverse order from locking is not + strictly necessary here it is a little cleaner to be consistent */ + unlock_two_nondirectories(src_inode, target_inode); +out: + free_xid(xid); + return rc; +} + const struct file_operations cifs_file_ops = { .read_iter = cifs_loose_read_iter, .write_iter = cifs_file_write_iter, @@ -926,6 +995,7 @@ const struct file_operations cifs_file_ops = { .splice_read = generic_file_splice_read, .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, + .clone_file_range = cifs_clone_file_range, .setlease = cifs_setlease, .fallocate = cifs_fallocate, }; @@ -942,6 +1012,7 @@ const struct file_operations cifs_file_strict_ops = { .splice_read = generic_file_splice_read, .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, + .clone_file_range = cifs_clone_file_range, .setlease = cifs_setlease, .fallocate = cifs_fallocate, }; @@ -958,6 +1029,7 @@ const struct file_operations cifs_file_direct_ops = { .mmap = cifs_file_mmap, .splice_read = generic_file_splice_read, .unlocked_ioctl = cifs_ioctl, + .clone_file_range = cifs_clone_file_range, .llseek = cifs_llseek, .setlease = cifs_setlease, .fallocate = cifs_fallocate, @@ -974,6 +1046,7 @@ const struct file_operations cifs_file_nobrl_ops = { .splice_read = generic_file_splice_read, .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, + .clone_file_range = cifs_clone_file_range, .setlease = cifs_setlease, .fallocate = cifs_fallocate, }; @@ -989,6 +1062,7 @@ const struct file_operations cifs_file_strict_nobrl_ops = { .splice_read = generic_file_splice_read, .llseek = cifs_llseek, .unlocked_ioctl = cifs_ioctl, + .clone_file_range = cifs_clone_file_range, .setlease = cifs_setlease, .fallocate = cifs_fallocate, }; @@ -1004,6 +1078,7 @@ const struct file_operations cifs_file_direct_nobrl_ops = { .mmap = cifs_file_mmap, .splice_read = generic_file_splice_read, .unlocked_ioctl = cifs_ioctl, + .clone_file_range = cifs_clone_file_range, .llseek = cifs_llseek, .setlease = cifs_setlease, .fallocate = cifs_fallocate, @@ -1014,6 +1089,7 @@ const struct file_operations cifs_dir_ops = { .release = cifs_closedir, .read = generic_read_dir, .unlocked_ioctl = cifs_ioctl, + .clone_file_range = cifs_clone_file_range, .llseek = generic_file_llseek, }; @@ -1032,7 +1108,7 @@ cifs_init_inodecache(void) cifs_inode_cachep = kmem_cache_create("cifs_inode_cache", sizeof(struct cifsInodeInfo), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), cifs_init_once); if (cifs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 44b3d4280..83aac8ba5 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -116,9 +116,8 @@ extern struct vfsmount *cifs_dfs_d_automount(struct path *path); #endif /* Functions related to symlinks */ -extern const char *cifs_follow_link(struct dentry *direntry, void **cookie); -extern int cifs_readlink(struct dentry *direntry, char __user *buffer, - int buflen); +extern const char *cifs_get_link(struct dentry *, struct inode *, + struct delayed_call *); extern int cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname); extern int cifs_removexattr(struct dentry *, const char *); @@ -127,7 +126,6 @@ extern int cifs_setxattr(struct dentry *, const char *, const void *, extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t); extern ssize_t cifs_listxattr(struct dentry *, char *, size_t); extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); - #ifdef CONFIG_CIFS_NFSD_EXPORT extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 2b510c537..a25b2513f 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -70,8 +70,10 @@ #define SERVER_NAME_LENGTH 40 #define SERVER_NAME_LEN_WITH_NULL (SERVER_NAME_LENGTH + 1) -/* SMB echo "timeout" -- FIXME: tunable? */ -#define SMB_ECHO_INTERVAL (60 * HZ) +/* echo interval in seconds */ +#define SMB_ECHO_INTERVAL_MIN 1 +#define SMB_ECHO_INTERVAL_MAX 600 +#define SMB_ECHO_INTERVAL_DEFAULT 60 #include "cifspdu.h" @@ -225,7 +227,7 @@ struct smb_version_operations { void (*print_stats)(struct seq_file *m, struct cifs_tcon *); void (*dump_share_caps)(struct seq_file *, struct cifs_tcon *); /* verify the message */ - int (*check_message)(char *, unsigned int); + int (*check_message)(char *, unsigned int, struct TCP_Server_Info *); bool (*is_oplock_break)(char *, struct TCP_Server_Info *); void (*downgrade_oplock)(struct TCP_Server_Info *, struct cifsInodeInfo *, bool); @@ -507,6 +509,7 @@ struct smb_vol { struct sockaddr_storage dstaddr; /* destination address */ struct sockaddr_storage srcaddr; /* allow binding to a local IP */ struct nls_table *local_nls; + unsigned int echo_interval; /* echo interval in secs */ }; #define CIFS_MOUNT_MASK (CIFS_MOUNT_NO_PERM | CIFS_MOUNT_SET_UID | \ @@ -627,7 +630,9 @@ struct TCP_Server_Info { #ifdef CONFIG_CIFS_SMB2 unsigned int max_read; unsigned int max_write; + __u8 preauth_hash[512]; #endif /* CONFIG_CIFS_SMB2 */ + unsigned long echo_interval; }; static inline unsigned int @@ -809,7 +814,10 @@ struct cifs_ses { bool need_reconnect:1; /* connection reset, uid now invalid */ #ifdef CONFIG_CIFS_SMB2 __u16 session_flags; - char smb3signingkey[SMB3_SIGN_KEY_SIZE]; /* for signing smb3 packets */ + __u8 smb3signingkey[SMB3_SIGN_KEY_SIZE]; + __u8 smb3encryptionkey[SMB3_SIGN_KEY_SIZE]; + __u8 smb3decryptionkey[SMB3_SIGN_KEY_SIZE]; + __u8 preauth_hash[512]; #endif /* CONFIG_CIFS_SMB2 */ }; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index c63fd1dde..eed7ff50f 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -102,7 +102,7 @@ extern int SendReceiveBlockingLock(const unsigned int xid, struct smb_hdr *out_buf, int *bytes_returned); extern int cifs_reconnect(struct TCP_Server_Info *server); -extern int checkSMB(char *buf, unsigned int length); +extern int checkSMB(char *buf, unsigned int len, struct TCP_Server_Info *srvr); extern bool is_valid_oplock_break(char *, struct TCP_Server_Info *); extern bool backup_cred(struct cifs_sb_info *); extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof); @@ -439,7 +439,8 @@ extern int setup_ntlm_response(struct cifs_ses *, const struct nls_table *); extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *); extern void cifs_crypto_shash_release(struct TCP_Server_Info *); extern int calc_seckey(struct cifs_ses *); -extern int generate_smb3signingkey(struct cifs_ses *); +extern int generate_smb30signingkey(struct cifs_ses *); +extern int generate_smb311signingkey(struct cifs_ses *); #ifdef CONFIG_CIFS_WEAK_PW_HASH extern int calc_lanman_hash(const char *password, const char *cryptkey, diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 3c194ff0d..a763cd3d9 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -95,6 +95,7 @@ enum { Opt_cruid, Opt_gid, Opt_file_mode, Opt_dirmode, Opt_port, Opt_rsize, Opt_wsize, Opt_actimeo, + Opt_echo_interval, /* Mount options which take string value */ Opt_user, Opt_pass, Opt_ip, @@ -188,6 +189,7 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_rsize, "rsize=%s" }, { Opt_wsize, "wsize=%s" }, { Opt_actimeo, "actimeo=%s" }, + { Opt_echo_interval, "echo_interval=%s" }, { Opt_blank_user, "user=" }, { Opt_blank_user, "username=" }, @@ -418,6 +420,7 @@ cifs_echo_request(struct work_struct *work) int rc; struct TCP_Server_Info *server = container_of(work, struct TCP_Server_Info, echo.work); + unsigned long echo_interval = server->echo_interval; /* * We cannot send an echo if it is disabled or until the @@ -427,7 +430,7 @@ cifs_echo_request(struct work_struct *work) */ if (!server->ops->need_neg || server->ops->need_neg(server) || (server->ops->can_echo && !server->ops->can_echo(server)) || - time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ)) + time_before(jiffies, server->lstrp + echo_interval - HZ)) goto requeue_echo; rc = server->ops->echo ? server->ops->echo(server) : -ENOSYS; @@ -436,7 +439,7 @@ cifs_echo_request(struct work_struct *work) server->hostname); requeue_echo: - queue_delayed_work(cifsiod_wq, &server->echo, SMB_ECHO_INTERVAL); + queue_delayed_work(cifsiod_wq, &server->echo, echo_interval); } static bool @@ -487,9 +490,9 @@ server_unresponsive(struct TCP_Server_Info *server) * a response in >60s. */ if (server->tcpStatus == CifsGood && - time_after(jiffies, server->lstrp + 2 * SMB_ECHO_INTERVAL)) { - cifs_dbg(VFS, "Server %s has not responded in %d seconds. Reconnecting...\n", - server->hostname, (2 * SMB_ECHO_INTERVAL) / HZ); + time_after(jiffies, server->lstrp + 2 * server->echo_interval)) { + cifs_dbg(VFS, "Server %s has not responded in %lu seconds. Reconnecting...\n", + server->hostname, (2 * server->echo_interval) / HZ); cifs_reconnect(server); wake_up(&server->response_q); return true; @@ -828,7 +831,7 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid) * 48 bytes is enough to display the header and a little bit * into the payload for debugging purposes. */ - length = server->ops->check_message(buf, server->total_read); + length = server->ops->check_message(buf, server->total_read, server); if (length != 0) cifs_dump_mem("Bad SMB: ", buf, min_t(unsigned int, server->total_read, 48)); @@ -1624,6 +1627,14 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, goto cifs_parse_mount_err; } break; + case Opt_echo_interval: + if (get_option_ul(args, &option)) { + cifs_dbg(VFS, "%s: Invalid echo interval value\n", + __func__); + goto cifs_parse_mount_err; + } + vol->echo_interval = option; + break; /* String Arguments */ @@ -2089,6 +2100,9 @@ static int match_server(struct TCP_Server_Info *server, struct smb_vol *vol) if (!match_security(server, vol)) return 0; + if (server->echo_interval != vol->echo_interval) + return 0; + return 1; } @@ -2208,6 +2222,12 @@ cifs_get_tcp_session(struct smb_vol *volume_info) tcp_ses->tcpStatus = CifsNew; ++tcp_ses->srv_count; + if (volume_info->echo_interval >= SMB_ECHO_INTERVAL_MIN && + volume_info->echo_interval <= SMB_ECHO_INTERVAL_MAX) + tcp_ses->echo_interval = volume_info->echo_interval * HZ; + else + tcp_ses->echo_interval = SMB_ECHO_INTERVAL_DEFAULT * HZ; + rc = ip_connect(tcp_ses); if (rc < 0) { cifs_dbg(VFS, "Error connecting to socket. Aborting operation.\n"); @@ -2237,7 +2257,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info) cifs_fscache_get_client_cookie(tcp_ses); /* queue echo request delayed work */ - queue_delayed_work(cifsiod_wq, &tcp_ses->echo, SMB_ECHO_INTERVAL); + queue_delayed_work(cifsiod_wq, &tcp_ses->echo, tcp_ses->echo_interval); return tcp_ses; @@ -2979,8 +2999,7 @@ ip_rfc1001_connect(struct TCP_Server_Info *server) if (ses_init_buf) { ses_init_buf->trailer.session_req.called_len = 32; - if (server->server_RFC1001_name && - server->server_RFC1001_name[0] != 0) + if (server->server_RFC1001_name[0] != 0) rfc1002mangle(ses_init_buf->trailer. session_req.called_name, server->server_RFC1001_name, diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 0068e8221..ff882aeac 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2267,7 +2267,7 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end, rc = filemap_write_and_wait_range(inode->i_mapping, start, end); if (rc) return rc; - mutex_lock(&inode->i_mutex); + inode_lock(inode); xid = get_xid(); @@ -2292,7 +2292,7 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end, } free_xid(xid); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return rc; } @@ -2309,7 +2309,7 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) rc = filemap_write_and_wait_range(inode->i_mapping, start, end); if (rc) return rc; - mutex_lock(&inode->i_mutex); + inode_lock(inode); xid = get_xid(); @@ -2326,7 +2326,7 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) } free_xid(xid); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return rc; } @@ -2672,7 +2672,7 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from) * with a brlock that prevents writing. */ down_read(&cinode->lock_sem); - mutex_lock(&inode->i_mutex); + inode_lock(inode); rc = generic_write_checks(iocb, from); if (rc <= 0) @@ -2685,7 +2685,7 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from) else rc = -EACCES; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (rc > 0) { ssize_t err = generic_write_sync(file, iocb->ki_pos - rc, rc); @@ -3391,13 +3391,13 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list, * should have access to this page, we're safe to simply set * PG_locked without checking it first. */ - __set_page_locked(page); + __SetPageLocked(page); rc = add_to_page_cache_locked(page, mapping, page->index, gfp); /* give up if we can't stick it in the cache */ if (rc) { - __clear_page_locked(page); + __ClearPageLocked(page); return rc; } @@ -3418,9 +3418,9 @@ readpages_get_pages(struct address_space *mapping, struct list_head *page_list, if (*bytes + PAGE_CACHE_SIZE > rsize) break; - __set_page_locked(page); + __SetPageLocked(page); if (add_to_page_cache_locked(page, mapping, page->index, gfp)) { - __clear_page_locked(page); + __ClearPageLocked(page); break; } list_move_tail(&page->lru, tmplist); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index a329f5ba3..aeb26dbfa 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -814,8 +814,21 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, } } else fattr.cf_uniqueid = iunique(sb, ROOT_I); - } else - fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid; + } else { + if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) && + validinum == false && server->ops->get_srv_inum) { + /* + * Pass a NULL tcon to ensure we don't make a round + * trip to the server. This only works for SMB2+. + */ + tmprc = server->ops->get_srv_inum(xid, + NULL, cifs_sb, full_path, + &fattr.cf_uniqueid, data); + if (tmprc) + fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid; + } else + fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid; + } /* query for SFU type info if supported and needed */ if (fattr.cf_cifsattrs & ATTR_SYSTEM && @@ -856,6 +869,13 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, } else { /* we already have inode, update it */ + /* if uniqueid is different, return error */ + if (unlikely(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM && + CIFS_I(*inode)->uniqueid != fattr.cf_uniqueid)) { + rc = -ESTALE; + goto cgii_exit; + } + /* if filetype is different, return error */ if (unlikely(((*inode)->i_mode & S_IFMT) != (fattr.cf_mode & S_IFMT))) { diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 35cf990f8..7a3b84e30 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -34,73 +34,36 @@ #include "cifs_ioctl.h" #include <linux/btrfs.h> -static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file, - unsigned long srcfd, u64 off, u64 len, u64 destoff, - bool dup_extents) +static int cifs_file_clone_range(unsigned int xid, struct file *src_file, + struct file *dst_file) { - int rc; - struct cifsFileInfo *smb_file_target = dst_file->private_data; + struct inode *src_inode = file_inode(src_file); struct inode *target_inode = file_inode(dst_file); - struct cifs_tcon *target_tcon; - struct fd src_file; struct cifsFileInfo *smb_file_src; - struct inode *src_inode; + struct cifsFileInfo *smb_file_target; struct cifs_tcon *src_tcon; + struct cifs_tcon *target_tcon; + int rc; cifs_dbg(FYI, "ioctl clone range\n"); - /* the destination must be opened for writing */ - if (!(dst_file->f_mode & FMODE_WRITE)) { - cifs_dbg(FYI, "file target not open for write\n"); - return -EINVAL; - } - /* check if target volume is readonly and take reference */ - rc = mnt_want_write_file(dst_file); - if (rc) { - cifs_dbg(FYI, "mnt_want_write failed with rc %d\n", rc); - return rc; - } - - src_file = fdget(srcfd); - if (!src_file.file) { - rc = -EBADF; - goto out_drop_write; - } - - if (src_file.file->f_op->unlocked_ioctl != cifs_ioctl) { - rc = -EBADF; - cifs_dbg(VFS, "src file seems to be from a different filesystem type\n"); - goto out_fput; - } - - if ((!src_file.file->private_data) || (!dst_file->private_data)) { + if (!src_file->private_data || !dst_file->private_data) { rc = -EBADF; cifs_dbg(VFS, "missing cifsFileInfo on copy range src file\n"); - goto out_fput; + goto out; } rc = -EXDEV; smb_file_target = dst_file->private_data; - smb_file_src = src_file.file->private_data; + smb_file_src = src_file->private_data; src_tcon = tlink_tcon(smb_file_src->tlink); target_tcon = tlink_tcon(smb_file_target->tlink); - /* check source and target on same server (or volume if dup_extents) */ - if (dup_extents && (src_tcon != target_tcon)) { - cifs_dbg(VFS, "source and target of copy not on same share\n"); - goto out_fput; - } - - if (!dup_extents && (src_tcon->ses != target_tcon->ses)) { + if (src_tcon->ses != target_tcon->ses) { cifs_dbg(VFS, "source and target of copy not on same server\n"); - goto out_fput; + goto out; } - src_inode = file_inode(src_file.file); - rc = -EINVAL; - if (S_ISDIR(src_inode->i_mode)) - goto out_fput; - /* * Note: cifs case is easier than btrfs since server responsible for * checks for proper open modes and file type and if it wants @@ -108,34 +71,66 @@ static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file, */ lock_two_nondirectories(target_inode, src_inode); - /* determine range to clone */ - rc = -EINVAL; - if (off + len > src_inode->i_size || off + len < off) - goto out_unlock; - if (len == 0) - len = src_inode->i_size - off; - cifs_dbg(FYI, "about to flush pages\n"); /* should we flush first and last page first */ - truncate_inode_pages_range(&target_inode->i_data, destoff, - PAGE_CACHE_ALIGN(destoff + len)-1); + truncate_inode_pages(&target_inode->i_data, 0); - if (dup_extents && target_tcon->ses->server->ops->duplicate_extents) - rc = target_tcon->ses->server->ops->duplicate_extents(xid, - smb_file_src, smb_file_target, off, len, destoff); - else if (!dup_extents && target_tcon->ses->server->ops->clone_range) + if (target_tcon->ses->server->ops->clone_range) rc = target_tcon->ses->server->ops->clone_range(xid, - smb_file_src, smb_file_target, off, len, destoff); + smb_file_src, smb_file_target, 0, src_inode->i_size, 0); else rc = -EOPNOTSUPP; /* force revalidate of size and timestamps of target file now that target is updated on the server */ CIFS_I(target_inode)->time = 0; -out_unlock: /* although unlocking in the reverse order from locking is not strictly necessary here it is a little cleaner to be consistent */ unlock_two_nondirectories(src_inode, target_inode); +out: + return rc; +} + +static long cifs_ioctl_clone(unsigned int xid, struct file *dst_file, + unsigned long srcfd) +{ + int rc; + struct fd src_file; + struct inode *src_inode; + + cifs_dbg(FYI, "ioctl clone range\n"); + /* the destination must be opened for writing */ + if (!(dst_file->f_mode & FMODE_WRITE)) { + cifs_dbg(FYI, "file target not open for write\n"); + return -EINVAL; + } + + /* check if target volume is readonly and take reference */ + rc = mnt_want_write_file(dst_file); + if (rc) { + cifs_dbg(FYI, "mnt_want_write failed with rc %d\n", rc); + return rc; + } + + src_file = fdget(srcfd); + if (!src_file.file) { + rc = -EBADF; + goto out_drop_write; + } + + if (src_file.file->f_op->unlocked_ioctl != cifs_ioctl) { + rc = -EBADF; + cifs_dbg(VFS, "src file seems to be from a different filesystem type\n"); + goto out_fput; + } + + src_inode = file_inode(src_file.file); + rc = -EINVAL; + if (S_ISDIR(src_inode->i_mode)) + goto out_fput; + + rc = cifs_file_clone_range(xid, src_file.file, dst_file); + out_fput: fdput(src_file); out_drop_write: @@ -256,10 +251,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) } break; case CIFS_IOC_COPYCHUNK_FILE: - rc = cifs_ioctl_clone(xid, filep, arg, 0, 0, 0, false); - break; - case BTRFS_IOC_CLONE: - rc = cifs_ioctl_clone(xid, filep, arg, 0, 0, 0, true); + rc = cifs_ioctl_clone(xid, filep, arg); break; case CIFS_IOC_SET_INTEGRITY: if (pSMBFile == NULL) diff --git a/fs/cifs/link.c b/fs/cifs/link.c index e3548f73b..062c23755 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -627,9 +627,9 @@ cifs_hl_exit: } const char * -cifs_follow_link(struct dentry *direntry, void **cookie) +cifs_get_link(struct dentry *direntry, struct inode *inode, + struct delayed_call *done) { - struct inode *inode = d_inode(direntry); int rc = -ENOMEM; unsigned int xid; char *full_path = NULL; @@ -639,6 +639,9 @@ cifs_follow_link(struct dentry *direntry, void **cookie) struct cifs_tcon *tcon; struct TCP_Server_Info *server; + if (!direntry) + return ERR_PTR(-ECHILD); + xid = get_xid(); tlink = cifs_sb_tlink(cifs_sb); @@ -678,7 +681,8 @@ cifs_follow_link(struct dentry *direntry, void **cookie) kfree(target_path); return ERR_PTR(rc); } - return *cookie = target_path; + set_delayed_call(done, kfree_link, target_path); + return target_path; } int diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 8442b8b8e..813fe13c2 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -310,7 +310,7 @@ check_smb_hdr(struct smb_hdr *smb) } int -checkSMB(char *buf, unsigned int total_read) +checkSMB(char *buf, unsigned int total_read, struct TCP_Server_Info *server) { struct smb_hdr *smb = (struct smb_hdr *)buf; __u32 rfclen = be32_to_cpu(smb->smb_buf_length); diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 1c5907019..389fb9f8c 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -38,7 +38,7 @@ check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid) * Make sure that this really is an SMB, that it is a response, * and that the message ids match. */ - if ((*(__le32 *)hdr->ProtocolId == SMB2_PROTO_NUMBER) && + if ((hdr->ProtocolId == SMB2_PROTO_NUMBER) && (mid == wire_mid)) { if (hdr->Flags & SMB2_FLAGS_SERVER_TO_REDIR) return 0; @@ -50,9 +50,9 @@ check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid) cifs_dbg(VFS, "Received Request not response\n"); } } else { /* bad signature or mid */ - if (*(__le32 *)hdr->ProtocolId != SMB2_PROTO_NUMBER) + if (hdr->ProtocolId != SMB2_PROTO_NUMBER) cifs_dbg(VFS, "Bad protocol string signature header %x\n", - *(unsigned int *) hdr->ProtocolId); + le32_to_cpu(hdr->ProtocolId)); if (mid != wire_mid) cifs_dbg(VFS, "Mids do not match: %llu and %llu\n", mid, wire_mid); @@ -93,11 +93,11 @@ static const __le16 smb2_rsp_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = { }; int -smb2_check_message(char *buf, unsigned int length) +smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr) { struct smb2_hdr *hdr = (struct smb2_hdr *)buf; struct smb2_pdu *pdu = (struct smb2_pdu *)hdr; - __u64 mid = le64_to_cpu(hdr->MessageId); + __u64 mid; __u32 len = get_rfc1002_length(buf); __u32 clc_len; /* calculated length */ int command; @@ -111,6 +111,30 @@ smb2_check_message(char *buf, unsigned int length) * ie Validate the wct via smb2_struct_sizes table above */ + if (hdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) { + struct smb2_transform_hdr *thdr = + (struct smb2_transform_hdr *)buf; + struct cifs_ses *ses = NULL; + struct list_head *tmp; + + /* decrypt frame now that it is completely read in */ + spin_lock(&cifs_tcp_ses_lock); + list_for_each(tmp, &srvr->smb_ses_list) { + ses = list_entry(tmp, struct cifs_ses, smb_ses_list); + if (ses->Suid == thdr->SessionId) + break; + + ses = NULL; + } + spin_unlock(&cifs_tcp_ses_lock); + if (ses == NULL) { + cifs_dbg(VFS, "no decryption - session id not found\n"); + return 1; + } + } + + + mid = le64_to_cpu(hdr->MessageId); if (length < sizeof(struct smb2_pdu)) { if ((length >= sizeof(struct smb2_hdr)) && (hdr->Status != 0)) { pdu->StructureSize2 = 0; @@ -322,7 +346,7 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr) /* return pointer to beginning of data area, ie offset from SMB start */ if ((*off != 0) && (*len != 0)) - return (char *)(&hdr->ProtocolId[0]) + *off; + return (char *)(&hdr->ProtocolId) + *off; else return NULL; } diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 53ccdde6f..3525ed756 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -182,6 +182,11 @@ smb2_find_mid(struct TCP_Server_Info *server, char *buf) struct smb2_hdr *hdr = (struct smb2_hdr *)buf; __u64 wire_mid = le64_to_cpu(hdr->MessageId); + if (hdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) { + cifs_dbg(VFS, "encrypted frame parsing not supported yet"); + return NULL; + } + spin_lock(&GlobalMid_Lock); list_for_each_entry(mid, &server->pending_mid_q, qhead) { if ((mid->mid == wire_mid) && @@ -1692,7 +1697,7 @@ struct smb_version_operations smb30_operations = { .get_lease_key = smb2_get_lease_key, .set_lease_key = smb2_set_lease_key, .new_lease_key = smb2_new_lease_key, - .generate_signingkey = generate_smb3signingkey, + .generate_signingkey = generate_smb30signingkey, .calc_signature = smb3_calc_signature, .set_integrity = smb3_set_integrity, .is_read_op = smb21_is_read_op, @@ -1779,7 +1784,7 @@ struct smb_version_operations smb311_operations = { .get_lease_key = smb2_get_lease_key, .set_lease_key = smb2_set_lease_key, .new_lease_key = smb2_new_lease_key, - .generate_signingkey = generate_smb3signingkey, + .generate_signingkey = generate_smb311signingkey, .calc_signature = smb3_calc_signature, .set_integrity = smb3_set_integrity, .is_read_op = smb21_is_read_op, @@ -1838,7 +1843,7 @@ struct smb_version_values smb21_values = { struct smb_version_values smb30_values = { .version_string = SMB30_VERSION_STRING, .protocol_id = SMB30_PROT_ID, - .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES, + .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION, .large_lock_type = 0, .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, @@ -1858,7 +1863,7 @@ struct smb_version_values smb30_values = { struct smb_version_values smb302_values = { .version_string = SMB302_VERSION_STRING, .protocol_id = SMB302_PROT_ID, - .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES, + .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION, .large_lock_type = 0, .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 373b5cd1c..42e1f440e 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -97,10 +97,7 @@ smb2_hdr_assemble(struct smb2_hdr *hdr, __le16 smb2_cmd /* command */ , hdr->smb2_buf_length = cpu_to_be32(parmsize + sizeof(struct smb2_hdr) - 4 /* RFC 1001 length field itself not counted */); - hdr->ProtocolId[0] = 0xFE; - hdr->ProtocolId[1] = 'S'; - hdr->ProtocolId[2] = 'M'; - hdr->ProtocolId[3] = 'B'; + hdr->ProtocolId = SMB2_PROTO_NUMBER; hdr->StructureSize = cpu_to_le16(64); hdr->Command = smb2_cmd; hdr->CreditRequest = cpu_to_le16(2); /* BB make this dynamic */ @@ -1577,7 +1574,8 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, goto ioctl_exit; } - memcpy(*out_data, rsp->hdr.ProtocolId + le32_to_cpu(rsp->OutputOffset), + memcpy(*out_data, + (char *)&rsp->hdr.ProtocolId + le32_to_cpu(rsp->OutputOffset), *plen); ioctl_exit: free_rsp_buf(resp_buftype, rsp); @@ -2097,7 +2095,7 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, } if (*buf) { - memcpy(*buf, (char *)rsp->hdr.ProtocolId + rsp->DataOffset, + memcpy(*buf, (char *)&rsp->hdr.ProtocolId + rsp->DataOffset, *nbytes); free_rsp_buf(resp_buftype, iov[0].iov_base); } else if (resp_buftype != CIFS_NO_BUFFER) { diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 4af52780e..ff88d9feb 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -86,6 +86,7 @@ #define MAX_SMB2_HDR_SIZE 0x78 /* 4 len + 64 hdr + (2*24 wct) + 2 bct + 2 pad */ #define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe) +#define SMB2_TRANSFORM_PROTO_NUM cpu_to_le32(0x424d53fd) /* * SMB2 Header Definition @@ -102,7 +103,7 @@ struct smb2_hdr { __be32 smb2_buf_length; /* big endian on wire */ /* length is only two or three bytes - with one or two byte type preceding it that MBZ */ - __u8 ProtocolId[4]; /* 0xFE 'S' 'M' 'B' */ + __le32 ProtocolId; /* 0xFE 'S' 'M' 'B' */ __le16 StructureSize; /* 64 */ __le16 CreditCharge; /* MBZ */ __le32 Status; /* Error from server */ @@ -128,11 +129,10 @@ struct smb2_transform_hdr { one or two byte type preceding it that MBZ */ __u8 ProtocolId[4]; /* 0xFD 'S' 'M' 'B' */ __u8 Signature[16]; - __u8 Nonce[11]; - __u8 Reserved[5]; + __u8 Nonce[16]; __le32 OriginalMessageSize; __u16 Reserved1; - __le16 EncryptionAlgorithm; + __le16 Flags; /* EncryptionAlgorithm */ __u64 SessionId; } __packed; diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 79dc650c1..4f07dc936 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -34,7 +34,8 @@ struct smb_rqst; ***************************************************************** */ extern int map_smb2_to_linux_error(char *buf, bool log_err); -extern int smb2_check_message(char *buf, unsigned int length); +extern int smb2_check_message(char *buf, unsigned int length, + struct TCP_Server_Info *server); extern unsigned int smb2_calc_size(void *buf); extern char *smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr); extern __le16 *cifs_convert_path_to_utf16(const char *from, diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index d4c5b6f10..8732a43b1 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -222,8 +222,8 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) return rc; } -int -generate_smb3signingkey(struct cifs_ses *ses) +static int generate_key(struct cifs_ses *ses, struct kvec label, + struct kvec context, __u8 *key, unsigned int key_size) { unsigned char zero = 0x0; __u8 i[4] = {0, 0, 0, 1}; @@ -233,7 +233,7 @@ generate_smb3signingkey(struct cifs_ses *ses) unsigned char *hashptr = prfhash; memset(prfhash, 0x0, SMB2_HMACSHA256_SIZE); - memset(ses->smb3signingkey, 0x0, SMB3_SIGNKEY_SIZE); + memset(key, 0x0, key_size); rc = smb3_crypto_shash_allocate(ses->server); if (rc) { @@ -262,7 +262,7 @@ generate_smb3signingkey(struct cifs_ses *ses) } rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash, - "SMB2AESCMAC", 12); + label.iov_base, label.iov_len); if (rc) { cifs_dbg(VFS, "%s: Could not update with label\n", __func__); goto smb3signkey_ret; @@ -276,7 +276,7 @@ generate_smb3signingkey(struct cifs_ses *ses) } rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash, - "SmbSign", 8); + context.iov_base, context.iov_len); if (rc) { cifs_dbg(VFS, "%s: Could not update with context\n", __func__); goto smb3signkey_ret; @@ -296,12 +296,102 @@ generate_smb3signingkey(struct cifs_ses *ses) goto smb3signkey_ret; } - memcpy(ses->smb3signingkey, hashptr, SMB3_SIGNKEY_SIZE); + memcpy(key, hashptr, key_size); smb3signkey_ret: return rc; } +struct derivation { + struct kvec label; + struct kvec context; +}; + +struct derivation_triplet { + struct derivation signing; + struct derivation encryption; + struct derivation decryption; +}; + +static int +generate_smb3signingkey(struct cifs_ses *ses, + const struct derivation_triplet *ptriplet) +{ + int rc; + + rc = generate_key(ses, ptriplet->signing.label, + ptriplet->signing.context, ses->smb3signingkey, + SMB3_SIGN_KEY_SIZE); + if (rc) + return rc; + + rc = generate_key(ses, ptriplet->encryption.label, + ptriplet->encryption.context, ses->smb3encryptionkey, + SMB3_SIGN_KEY_SIZE); + if (rc) + return rc; + + return generate_key(ses, ptriplet->decryption.label, + ptriplet->decryption.context, + ses->smb3decryptionkey, SMB3_SIGN_KEY_SIZE); +} + +int +generate_smb30signingkey(struct cifs_ses *ses) + +{ + struct derivation_triplet triplet; + struct derivation *d; + + d = &triplet.signing; + d->label.iov_base = "SMB2AESCMAC"; + d->label.iov_len = 12; + d->context.iov_base = "SmbSign"; + d->context.iov_len = 8; + + d = &triplet.encryption; + d->label.iov_base = "SMB2AESCCM"; + d->label.iov_len = 11; + d->context.iov_base = "ServerIn "; + d->context.iov_len = 10; + + d = &triplet.decryption; + d->label.iov_base = "SMB2AESCCM"; + d->label.iov_len = 11; + d->context.iov_base = "ServerOut"; + d->context.iov_len = 10; + + return generate_smb3signingkey(ses, &triplet); +} + +int +generate_smb311signingkey(struct cifs_ses *ses) + +{ + struct derivation_triplet triplet; + struct derivation *d; + + d = &triplet.signing; + d->label.iov_base = "SMB2AESCMAC"; + d->label.iov_len = 12; + d->context.iov_base = "SmbSign"; + d->context.iov_len = 8; + + d = &triplet.encryption; + d->label.iov_base = "SMB2AESCCM"; + d->label.iov_len = 11; + d->context.iov_base = "ServerIn "; + d->context.iov_len = 10; + + d = &triplet.decryption; + d->label.iov_base = "SMB2AESCCM"; + d->label.iov_len = 11; + d->context.iov_base = "ServerOut"; + d->context.iov_len = 10; + + return generate_smb3signingkey(ses, &triplet); +} + int smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) { diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index ff9e1f8b1..f5dc2f0df 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -190,8 +190,8 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, #endif /* CONFIG_CIFS_ACL */ } else { int temp; - temp = strncmp(ea_name, POSIX_ACL_XATTR_ACCESS, - strlen(POSIX_ACL_XATTR_ACCESS)); + temp = strncmp(ea_name, XATTR_NAME_POSIX_ACL_ACCESS, + strlen(XATTR_NAME_POSIX_ACL_ACCESS)); if (temp == 0) { #ifdef CONFIG_CIFS_POSIX if (sb->s_flags & MS_POSIXACL) @@ -203,8 +203,8 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, #else cifs_dbg(FYI, "set POSIX ACL not supported\n"); #endif - } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT, - strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) { + } else if (strncmp(ea_name, XATTR_NAME_POSIX_ACL_DEFAULT, + strlen(XATTR_NAME_POSIX_ACL_DEFAULT)) == 0) { #ifdef CONFIG_CIFS_POSIX if (sb->s_flags & MS_POSIXACL) rc = CIFSSMBSetPosixACL(xid, pTcon, full_path, @@ -292,8 +292,8 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon, full_path, ea_name, ea_value, buf_size, cifs_sb->local_nls, cifs_remap(cifs_sb)); - } else if (strncmp(ea_name, POSIX_ACL_XATTR_ACCESS, - strlen(POSIX_ACL_XATTR_ACCESS)) == 0) { + } else if (strncmp(ea_name, XATTR_NAME_POSIX_ACL_ACCESS, + strlen(XATTR_NAME_POSIX_ACL_ACCESS)) == 0) { #ifdef CONFIG_CIFS_POSIX if (sb->s_flags & MS_POSIXACL) rc = CIFSSMBGetPosixACL(xid, pTcon, full_path, @@ -303,8 +303,8 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, #else cifs_dbg(FYI, "Query POSIX ACL not supported yet\n"); #endif /* CONFIG_CIFS_POSIX */ - } else if (strncmp(ea_name, POSIX_ACL_XATTR_DEFAULT, - strlen(POSIX_ACL_XATTR_DEFAULT)) == 0) { + } else if (strncmp(ea_name, XATTR_NAME_POSIX_ACL_DEFAULT, + strlen(XATTR_NAME_POSIX_ACL_DEFAULT)) == 0) { #ifdef CONFIG_CIFS_POSIX if (sb->s_flags & MS_POSIXACL) rc = CIFSSMBGetPosixACL(xid, pTcon, full_path, diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c index 7740b1c87..1bfb7ba4e 100644 --- a/fs/coda/cnode.c +++ b/fs/coda/cnode.c @@ -8,6 +8,7 @@ #include <linux/coda.h> #include <linux/coda_psdev.h> +#include <linux/pagemap.h> #include "coda_linux.h" static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2) @@ -17,8 +18,7 @@ static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2) static const struct inode_operations coda_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, + .get_link = page_get_link, .setattr = coda_setattr, }; @@ -35,6 +35,7 @@ static void coda_fill_inode(struct inode *inode, struct coda_vattr *attr) inode->i_fop = &coda_dir_operations; } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &coda_symlink_inode_operations; + inode_nohighmem(inode); inode->i_data.a_ops = &coda_symlink_aops; inode->i_mapping = &inode->i_data; } else diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h index f829fe963..5104d84c4 100644 --- a/fs/coda/coda_linux.h +++ b/fs/coda/coda_linux.h @@ -72,8 +72,7 @@ void coda_sysctl_clean(void); } while (0) -#define CODA_FREE(ptr,size) \ - do { if (size < PAGE_SIZE) kfree((ptr)); else vfree((ptr)); } while (0) +#define CODA_FREE(ptr, size) kvfree((ptr)) /* inode to cnode access functions */ diff --git a/fs/coda/dir.c b/fs/coda/dir.c index fda9f4311..42e731b8c 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -427,13 +427,13 @@ static int coda_readdir(struct file *coda_file, struct dir_context *ctx) if (host_file->f_op->iterate) { struct inode *host_inode = file_inode(host_file); - mutex_lock(&host_inode->i_mutex); + inode_lock(host_inode); ret = -ENOENT; if (!IS_DEADDIR(host_inode)) { ret = host_file->f_op->iterate(host_file, ctx); file_accessed(host_file); } - mutex_unlock(&host_inode->i_mutex); + inode_unlock(host_inode); return ret; } /* Venus: we must read Venus dirents from a file */ diff --git a/fs/coda/file.c b/fs/coda/file.c index 1da3805f3..f47c74838 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -71,12 +71,12 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to) host_file = cfi->cfi_container; file_start_write(host_file); - mutex_lock(&coda_inode->i_mutex); + inode_lock(coda_inode); ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos); coda_inode->i_size = file_inode(host_file)->i_size; coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9; coda_inode->i_mtime = coda_inode->i_ctime = CURRENT_TIME_SEC; - mutex_unlock(&coda_inode->i_mutex); + inode_unlock(coda_inode); file_end_write(host_file); return ret; } @@ -203,7 +203,7 @@ int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync) err = filemap_write_and_wait_range(coda_inode->i_mapping, start, end); if (err) return err; - mutex_lock(&coda_inode->i_mutex); + inode_lock(coda_inode); cfi = CODA_FTOC(coda_file); BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); @@ -212,7 +212,7 @@ int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync) err = vfs_fsync(host_file, datasync); if (!err && !datasync) err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode)); - mutex_unlock(&coda_inode->i_mutex); + inode_unlock(coda_inode); return err; } diff --git a/fs/coda/inode.c b/fs/coda/inode.c index cac1390b8..57e81cbba 100644 --- a/fs/coda/inode.c +++ b/fs/coda/inode.c @@ -74,9 +74,9 @@ static void init_once(void *foo) int __init coda_init_inodecache(void) { coda_inode_cachep = kmem_cache_create("coda_inode_cache", - sizeof(struct coda_inode_info), - 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, - init_once); + sizeof(struct coda_inode_info), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| + SLAB_ACCOUNT, init_once); if (coda_inode_cachep == NULL) return -ENOMEM; return 0; diff --git a/fs/coda/symlink.c b/fs/coda/symlink.c index ab94ef63c..03736e20d 100644 --- a/fs/coda/symlink.c +++ b/fs/coda/symlink.c @@ -26,7 +26,7 @@ static int coda_symlink_filler(struct file *file, struct page *page) int error; struct coda_inode_info *cii; unsigned int len = PAGE_SIZE; - char *p = kmap(page); + char *p = page_address(page); cii = ITOC(inode); @@ -34,13 +34,11 @@ static int coda_symlink_filler(struct file *file, struct page *page) if (error) goto fail; SetPageUptodate(page); - kunmap(page); unlock_page(page); return 0; fail: SetPageError(page); - kunmap(page); unlock_page(page); return error; } diff --git a/fs/compat.c b/fs/compat.c index 6fd272d45..a71936a3f 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -792,7 +792,7 @@ COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name, const void __user *, data) { char *kernel_type; - unsigned long data_page; + void *options; char *kernel_dev; int retval; @@ -806,26 +806,25 @@ COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name, if (IS_ERR(kernel_dev)) goto out1; - retval = copy_mount_options(data, &data_page); - if (retval < 0) + options = copy_mount_options(data); + retval = PTR_ERR(options); + if (IS_ERR(options)) goto out2; - retval = -EINVAL; - - if (kernel_type && data_page) { + if (kernel_type && options) { if (!strcmp(kernel_type, NCPFS_NAME)) { - do_ncp_super_data_conv((void *)data_page); + do_ncp_super_data_conv(options); } else if (!strcmp(kernel_type, NFS4_NAME)) { - if (do_nfs4_super_data_conv((void *) data_page)) + retval = -EINVAL; + if (do_nfs4_super_data_conv(options)) goto out3; } } - retval = do_mount(kernel_dev, dir_name, kernel_type, - flags, (void*)data_page); + retval = do_mount(kernel_dev, dir_name, kernel_type, flags, options); out3: - free_page(data_page); + kfree(options); out2: kfree(kernel_dev); out1: diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index dcf26537c..6402eaf8a 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -58,6 +58,8 @@ #include <linux/atalk.h> #include <linux/gfp.h> +#include "internal.h" + #include <net/bluetooth/bluetooth.h> #include <net/bluetooth/hci_sock.h> #include <net/bluetooth/rfcomm.h> @@ -115,19 +117,38 @@ #include <asm/fbio.h> #endif -static int w_long(unsigned int fd, unsigned int cmd, - compat_ulong_t __user *argp) +#define convert_in_user(srcptr, dstptr) \ +({ \ + typeof(*srcptr) val; \ + \ + get_user(val, srcptr) || put_user(val, dstptr); \ +}) + +static int do_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - mm_segment_t old_fs = get_fs(); int err; - unsigned long val; - set_fs (KERNEL_DS); - err = sys_ioctl(fd, cmd, (unsigned long)&val); - set_fs (old_fs); - if (!err && put_user(val, argp)) + err = security_file_ioctl(file, cmd, arg); + if (err) + return err; + + return vfs_ioctl(file, cmd, arg); +} + +static int w_long(struct file *file, + unsigned int cmd, compat_ulong_t __user *argp) +{ + int err; + unsigned long __user *valp = compat_alloc_user_space(sizeof(*valp)); + + if (valp == NULL) return -EFAULT; - return err; + err = do_ioctl(file, cmd, (unsigned long)valp); + if (err) + return err; + if (convert_in_user(valp, argp)) + return -EFAULT; + return 0; } struct compat_video_event { @@ -139,23 +160,23 @@ struct compat_video_event { } u; }; -static int do_video_get_event(unsigned int fd, unsigned int cmd, - struct compat_video_event __user *up) +static int do_video_get_event(struct file *file, + unsigned int cmd, struct compat_video_event __user *up) { - struct video_event kevent; - mm_segment_t old_fs = get_fs(); + struct video_event __user *kevent = + compat_alloc_user_space(sizeof(*kevent)); int err; - set_fs(KERNEL_DS); - err = sys_ioctl(fd, cmd, (unsigned long) &kevent); - set_fs(old_fs); + if (kevent == NULL) + return -EFAULT; + err = do_ioctl(file, cmd, (unsigned long)kevent); if (!err) { - err = put_user(kevent.type, &up->type); - err |= put_user(kevent.timestamp, &up->timestamp); - err |= put_user(kevent.u.size.w, &up->u.size.w); - err |= put_user(kevent.u.size.h, &up->u.size.h); - err |= put_user(kevent.u.size.aspect_ratio, + err = convert_in_user(&kevent->type, &up->type); + err |= convert_in_user(&kevent->timestamp, &up->timestamp); + err |= convert_in_user(&kevent->u.size.w, &up->u.size.w); + err |= convert_in_user(&kevent->u.size.h, &up->u.size.h); + err |= convert_in_user(&kevent->u.size.aspect_ratio, &up->u.size.aspect_ratio); if (err) err = -EFAULT; @@ -169,8 +190,8 @@ struct compat_video_still_picture { int32_t size; }; -static int do_video_stillpicture(unsigned int fd, unsigned int cmd, - struct compat_video_still_picture __user *up) +static int do_video_stillpicture(struct file *file, + unsigned int cmd, struct compat_video_still_picture __user *up) { struct video_still_picture __user *up_native; compat_uptr_t fp; @@ -190,7 +211,7 @@ static int do_video_stillpicture(unsigned int fd, unsigned int cmd, if (err) return -EFAULT; - err = sys_ioctl(fd, cmd, (unsigned long) up_native); + err = do_ioctl(file, cmd, (unsigned long) up_native); return err; } @@ -200,8 +221,8 @@ struct compat_video_spu_palette { compat_uptr_t palette; }; -static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd, - struct compat_video_spu_palette __user *up) +static int do_video_set_spu_palette(struct file *file, + unsigned int cmd, struct compat_video_spu_palette __user *up) { struct video_spu_palette __user *up_native; compat_uptr_t palp; @@ -218,7 +239,7 @@ static int do_video_set_spu_palette(unsigned int fd, unsigned int cmd, if (err) return -EFAULT; - err = sys_ioctl(fd, cmd, (unsigned long) up_native); + err = do_ioctl(file, cmd, (unsigned long) up_native); return err; } @@ -276,7 +297,7 @@ static int sg_build_iovec(sg_io_hdr_t __user *sgio, void __user *dxferp, u16 iov return 0; } -static int sg_ioctl_trans(unsigned int fd, unsigned int cmd, +static int sg_ioctl_trans(struct file *file, unsigned int cmd, sg_io_hdr32_t __user *sgio32) { sg_io_hdr_t __user *sgio; @@ -289,7 +310,7 @@ static int sg_ioctl_trans(unsigned int fd, unsigned int cmd, if (get_user(interface_id, &sgio32->interface_id)) return -EFAULT; if (interface_id != 'S') - return sys_ioctl(fd, cmd, (unsigned long)sgio32); + return do_ioctl(file, cmd, (unsigned long)sgio32); if (get_user(iovec_count, &sgio32->iovec_count)) return -EFAULT; @@ -349,7 +370,7 @@ static int sg_ioctl_trans(unsigned int fd, unsigned int cmd, if (put_user(compat_ptr(data), &sgio->usr_ptr)) return -EFAULT; - err = sys_ioctl(fd, cmd, (unsigned long) sgio); + err = do_ioctl(file, cmd, (unsigned long) sgio); if (err >= 0) { void __user *datap; @@ -380,13 +401,13 @@ struct compat_sg_req_info { /* used by SG_GET_REQUEST_TABLE ioctl() */ int unused; }; -static int sg_grt_trans(unsigned int fd, unsigned int cmd, struct - compat_sg_req_info __user *o) +static int sg_grt_trans(struct file *file, + unsigned int cmd, struct compat_sg_req_info __user *o) { int err, i; sg_req_info_t __user *r; r = compat_alloc_user_space(sizeof(sg_req_info_t)*SG_MAX_QUEUE); - err = sys_ioctl(fd,cmd,(unsigned long)r); + err = do_ioctl(file, cmd, (unsigned long)r); if (err < 0) return err; for (i = 0; i < SG_MAX_QUEUE; i++) { @@ -412,8 +433,8 @@ struct sock_fprog32 { #define PPPIOCSPASS32 _IOW('t', 71, struct sock_fprog32) #define PPPIOCSACTIVE32 _IOW('t', 70, struct sock_fprog32) -static int ppp_sock_fprog_ioctl_trans(unsigned int fd, unsigned int cmd, - struct sock_fprog32 __user *u_fprog32) +static int ppp_sock_fprog_ioctl_trans(struct file *file, + unsigned int cmd, struct sock_fprog32 __user *u_fprog32) { struct sock_fprog __user *u_fprog64 = compat_alloc_user_space(sizeof(struct sock_fprog)); void __user *fptr64; @@ -435,7 +456,7 @@ static int ppp_sock_fprog_ioctl_trans(unsigned int fd, unsigned int cmd, else cmd = PPPIOCSACTIVE; - return sys_ioctl(fd, cmd, (unsigned long) u_fprog64); + return do_ioctl(file, cmd, (unsigned long) u_fprog64); } struct ppp_option_data32 { @@ -451,7 +472,7 @@ struct ppp_idle32 { }; #define PPPIOCGIDLE32 _IOR('t', 63, struct ppp_idle32) -static int ppp_gidle(unsigned int fd, unsigned int cmd, +static int ppp_gidle(struct file *file, unsigned int cmd, struct ppp_idle32 __user *idle32) { struct ppp_idle __user *idle; @@ -460,7 +481,7 @@ static int ppp_gidle(unsigned int fd, unsigned int cmd, idle = compat_alloc_user_space(sizeof(*idle)); - err = sys_ioctl(fd, PPPIOCGIDLE, (unsigned long) idle); + err = do_ioctl(file, PPPIOCGIDLE, (unsigned long) idle); if (!err) { if (get_user(xmit, &idle->xmit_idle) || @@ -472,7 +493,7 @@ static int ppp_gidle(unsigned int fd, unsigned int cmd, return err; } -static int ppp_scompress(unsigned int fd, unsigned int cmd, +static int ppp_scompress(struct file *file, unsigned int cmd, struct ppp_option_data32 __user *odata32) { struct ppp_option_data __user *odata; @@ -492,7 +513,7 @@ static int ppp_scompress(unsigned int fd, unsigned int cmd, sizeof(__u32) + sizeof(int))) return -EFAULT; - return sys_ioctl(fd, PPPIOCSCOMPRESS, (unsigned long) odata); + return do_ioctl(file, PPPIOCSCOMPRESS, (unsigned long) odata); } #ifdef CONFIG_BLOCK @@ -512,12 +533,13 @@ struct mtpos32 { }; #define MTIOCPOS32 _IOR('m', 3, struct mtpos32) -static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp) +static int mt_ioctl_trans(struct file *file, + unsigned int cmd, void __user *argp) { - mm_segment_t old_fs = get_fs(); - struct mtget get; + /* NULL initialization to make gcc shut up */ + struct mtget __user *get = NULL; struct mtget32 __user *umget32; - struct mtpos pos; + struct mtpos __user *pos = NULL; struct mtpos32 __user *upos32; unsigned long kcmd; void *karg; @@ -526,32 +548,34 @@ static int mt_ioctl_trans(unsigned int fd, unsigned int cmd, void __user *argp) switch(cmd) { case MTIOCPOS32: kcmd = MTIOCPOS; - karg = &pos; + pos = compat_alloc_user_space(sizeof(*pos)); + karg = pos; break; default: /* MTIOCGET32 */ kcmd = MTIOCGET; - karg = &get; + get = compat_alloc_user_space(sizeof(*get)); + karg = get; break; } - set_fs (KERNEL_DS); - err = sys_ioctl (fd, kcmd, (unsigned long)karg); - set_fs (old_fs); + if (karg == NULL) + return -EFAULT; + err = do_ioctl(file, kcmd, (unsigned long)karg); if (err) return err; switch (cmd) { case MTIOCPOS32: upos32 = argp; - err = __put_user(pos.mt_blkno, &upos32->mt_blkno); + err = convert_in_user(&pos->mt_blkno, &upos32->mt_blkno); break; case MTIOCGET32: umget32 = argp; - err = __put_user(get.mt_type, &umget32->mt_type); - err |= __put_user(get.mt_resid, &umget32->mt_resid); - err |= __put_user(get.mt_dsreg, &umget32->mt_dsreg); - err |= __put_user(get.mt_gstat, &umget32->mt_gstat); - err |= __put_user(get.mt_erreg, &umget32->mt_erreg); - err |= __put_user(get.mt_fileno, &umget32->mt_fileno); - err |= __put_user(get.mt_blkno, &umget32->mt_blkno); + err = convert_in_user(&get->mt_type, &umget32->mt_type); + err |= convert_in_user(&get->mt_resid, &umget32->mt_resid); + err |= convert_in_user(&get->mt_dsreg, &umget32->mt_dsreg); + err |= convert_in_user(&get->mt_gstat, &umget32->mt_gstat); + err |= convert_in_user(&get->mt_erreg, &umget32->mt_erreg); + err |= convert_in_user(&get->mt_fileno, &umget32->mt_fileno); + err |= convert_in_user(&get->mt_blkno, &umget32->mt_blkno); break; } return err ? -EFAULT: 0; @@ -605,42 +629,41 @@ struct serial_struct32 { compat_int_t reserved[1]; }; -static int serial_struct_ioctl(unsigned fd, unsigned cmd, - struct serial_struct32 __user *ss32) +static int serial_struct_ioctl(struct file *file, + unsigned cmd, struct serial_struct32 __user *ss32) { typedef struct serial_struct32 SS32; int err; - struct serial_struct ss; - mm_segment_t oldseg = get_fs(); + struct serial_struct __user *ss = compat_alloc_user_space(sizeof(*ss)); __u32 udata; unsigned int base; + unsigned char *iomem_base; + if (ss == NULL) + return -EFAULT; if (cmd == TIOCSSERIAL) { - if (!access_ok(VERIFY_READ, ss32, sizeof(SS32))) - return -EFAULT; - if (__copy_from_user(&ss, ss32, offsetof(SS32, iomem_base))) - return -EFAULT; - if (__get_user(udata, &ss32->iomem_base)) + if (copy_in_user(ss, ss32, offsetof(SS32, iomem_base)) || + get_user(udata, &ss32->iomem_base)) return -EFAULT; - ss.iomem_base = compat_ptr(udata); - if (__get_user(ss.iomem_reg_shift, &ss32->iomem_reg_shift) || - __get_user(ss.port_high, &ss32->port_high)) + iomem_base = compat_ptr(udata); + if (put_user(iomem_base, &ss->iomem_base) || + convert_in_user(&ss32->iomem_reg_shift, + &ss->iomem_reg_shift) || + convert_in_user(&ss32->port_high, &ss->port_high) || + put_user(0UL, &ss->iomap_base)) return -EFAULT; - ss.iomap_base = 0UL; } - set_fs(KERNEL_DS); - err = sys_ioctl(fd,cmd,(unsigned long)(&ss)); - set_fs(oldseg); + err = do_ioctl(file, cmd, (unsigned long)ss); if (cmd == TIOCGSERIAL && err >= 0) { - if (!access_ok(VERIFY_WRITE, ss32, sizeof(SS32))) - return -EFAULT; - if (__copy_to_user(ss32,&ss,offsetof(SS32,iomem_base))) + if (copy_in_user(ss32, ss, offsetof(SS32, iomem_base)) || + get_user(iomem_base, &ss->iomem_base)) return -EFAULT; - base = (unsigned long)ss.iomem_base >> 32 ? - 0xffffffff : (unsigned)(unsigned long)ss.iomem_base; - if (__put_user(base, &ss32->iomem_base) || - __put_user(ss.iomem_reg_shift, &ss32->iomem_reg_shift) || - __put_user(ss.port_high, &ss32->port_high)) + base = (unsigned long)iomem_base >> 32 ? + 0xffffffff : (unsigned)(unsigned long)iomem_base; + if (put_user(base, &ss32->iomem_base) || + convert_in_user(&ss->iomem_reg_shift, + &ss32->iomem_reg_shift) || + convert_in_user(&ss->port_high, &ss32->port_high)) return -EFAULT; } return err; @@ -674,8 +697,8 @@ struct i2c_rdwr_aligned { struct i2c_msg msgs[0]; }; -static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd, - struct i2c_rdwr_ioctl_data32 __user *udata) +static int do_i2c_rdwr_ioctl(struct file *file, + unsigned int cmd, struct i2c_rdwr_ioctl_data32 __user *udata) { struct i2c_rdwr_aligned __user *tdata; struct i2c_msg __user *tmsgs; @@ -708,11 +731,11 @@ static int do_i2c_rdwr_ioctl(unsigned int fd, unsigned int cmd, put_user(compat_ptr(datap), &tmsgs[i].buf)) return -EFAULT; } - return sys_ioctl(fd, cmd, (unsigned long)tdata); + return do_ioctl(file, cmd, (unsigned long)tdata); } -static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, - struct i2c_smbus_ioctl_data32 __user *udata) +static int do_i2c_smbus_ioctl(struct file *file, + unsigned int cmd, struct i2c_smbus_ioctl_data32 __user *udata) { struct i2c_smbus_ioctl_data __user *tdata; compat_caddr_t datap; @@ -734,7 +757,7 @@ static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, __put_user(compat_ptr(datap), &tdata->data)) return -EFAULT; - return sys_ioctl(fd, cmd, (unsigned long)tdata); + return do_ioctl(file, cmd, (unsigned long)tdata); } #define RTC_IRQP_READ32 _IOR('p', 0x0b, compat_ulong_t) @@ -742,29 +765,27 @@ static int do_i2c_smbus_ioctl(unsigned int fd, unsigned int cmd, #define RTC_EPOCH_READ32 _IOR('p', 0x0d, compat_ulong_t) #define RTC_EPOCH_SET32 _IOW('p', 0x0e, compat_ulong_t) -static int rtc_ioctl(unsigned fd, unsigned cmd, void __user *argp) +static int rtc_ioctl(struct file *file, + unsigned cmd, void __user *argp) { - mm_segment_t oldfs = get_fs(); - compat_ulong_t val32; - unsigned long kval; + unsigned long __user *valp = compat_alloc_user_space(sizeof(*valp)); int ret; + if (valp == NULL) + return -EFAULT; switch (cmd) { case RTC_IRQP_READ32: case RTC_EPOCH_READ32: - set_fs(KERNEL_DS); - ret = sys_ioctl(fd, (cmd == RTC_IRQP_READ32) ? + ret = do_ioctl(file, (cmd == RTC_IRQP_READ32) ? RTC_IRQP_READ : RTC_EPOCH_READ, - (unsigned long)&kval); - set_fs(oldfs); + (unsigned long)valp); if (ret) return ret; - val32 = kval; - return put_user(val32, (unsigned int __user *)argp); + return convert_in_user(valp, (unsigned int __user *)argp); case RTC_IRQP_SET32: - return sys_ioctl(fd, RTC_IRQP_SET, (unsigned long)argp); + return do_ioctl(file, RTC_IRQP_SET, (unsigned long)argp); case RTC_EPOCH_SET32: - return sys_ioctl(fd, RTC_EPOCH_SET, (unsigned long)argp); + return do_ioctl(file, RTC_EPOCH_SET, (unsigned long)argp); } return -ENOIOCTLCMD; @@ -1240,6 +1261,9 @@ COMPATIBLE_IOCTL(HCIUNBLOCKADDR) COMPATIBLE_IOCTL(HCIINQUIRY) COMPATIBLE_IOCTL(HCIUARTSETPROTO) COMPATIBLE_IOCTL(HCIUARTGETPROTO) +COMPATIBLE_IOCTL(HCIUARTGETDEVICE) +COMPATIBLE_IOCTL(HCIUARTSETFLAGS) +COMPATIBLE_IOCTL(HCIUARTGETFLAGS) COMPATIBLE_IOCTL(RFCOMMCREATEDEV) COMPATIBLE_IOCTL(RFCOMMRELEASEDEV) COMPATIBLE_IOCTL(RFCOMMGETDEVLIST) @@ -1284,12 +1308,6 @@ COMPATIBLE_IOCTL(PCIIOC_CONTROLLER) COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_IO) COMPATIBLE_IOCTL(PCIIOC_MMAP_IS_MEM) COMPATIBLE_IOCTL(PCIIOC_WRITE_COMBINE) -/* NBD */ -COMPATIBLE_IOCTL(NBD_DO_IT) -COMPATIBLE_IOCTL(NBD_CLEAR_SOCK) -COMPATIBLE_IOCTL(NBD_CLEAR_QUE) -COMPATIBLE_IOCTL(NBD_PRINT_DEBUG) -COMPATIBLE_IOCTL(NBD_DISCONNECT) /* i2c */ COMPATIBLE_IOCTL(I2C_SLAVE) COMPATIBLE_IOCTL(I2C_SLAVE_FORCE) @@ -1436,53 +1454,53 @@ IGNORE_IOCTL(FBIOGCURSOR32) * a compat_ioctl operation in the place that handleÑ• the * ioctl for the native case. */ -static long do_ioctl_trans(int fd, unsigned int cmd, +static long do_ioctl_trans(unsigned int cmd, unsigned long arg, struct file *file) { void __user *argp = compat_ptr(arg); switch (cmd) { case PPPIOCGIDLE32: - return ppp_gidle(fd, cmd, argp); + return ppp_gidle(file, cmd, argp); case PPPIOCSCOMPRESS32: - return ppp_scompress(fd, cmd, argp); + return ppp_scompress(file, cmd, argp); case PPPIOCSPASS32: case PPPIOCSACTIVE32: - return ppp_sock_fprog_ioctl_trans(fd, cmd, argp); + return ppp_sock_fprog_ioctl_trans(file, cmd, argp); #ifdef CONFIG_BLOCK case SG_IO: - return sg_ioctl_trans(fd, cmd, argp); + return sg_ioctl_trans(file, cmd, argp); case SG_GET_REQUEST_TABLE: - return sg_grt_trans(fd, cmd, argp); + return sg_grt_trans(file, cmd, argp); case MTIOCGET32: case MTIOCPOS32: - return mt_ioctl_trans(fd, cmd, argp); + return mt_ioctl_trans(file, cmd, argp); #endif /* Serial */ case TIOCGSERIAL: case TIOCSSERIAL: - return serial_struct_ioctl(fd, cmd, argp); + return serial_struct_ioctl(file, cmd, argp); /* i2c */ case I2C_FUNCS: - return w_long(fd, cmd, argp); + return w_long(file, cmd, argp); case I2C_RDWR: - return do_i2c_rdwr_ioctl(fd, cmd, argp); + return do_i2c_rdwr_ioctl(file, cmd, argp); case I2C_SMBUS: - return do_i2c_smbus_ioctl(fd, cmd, argp); + return do_i2c_smbus_ioctl(file, cmd, argp); /* Not implemented in the native kernel */ case RTC_IRQP_READ32: case RTC_IRQP_SET32: case RTC_EPOCH_READ32: case RTC_EPOCH_SET32: - return rtc_ioctl(fd, cmd, argp); + return rtc_ioctl(file, cmd, argp); /* dvb */ case VIDEO_GET_EVENT: - return do_video_get_event(fd, cmd, argp); + return do_video_get_event(file, cmd, argp); case VIDEO_STILLPICTURE: - return do_video_stillpicture(fd, cmd, argp); + return do_video_stillpicture(file, cmd, argp); case VIDEO_SET_SPU_PALETTE: - return do_video_set_spu_palette(fd, cmd, argp); + return do_video_set_spu_palette(file, cmd, argp); } /* @@ -1508,12 +1526,7 @@ static long do_ioctl_trans(int fd, unsigned int cmd, case KDSKBMETA: case KDSKBLED: case KDSETLED: - /* NBD */ - case NBD_SET_SOCK: - case NBD_SET_BLKSIZE: - case NBD_SET_SIZE: - case NBD_SET_SIZE_BLOCKS: - return do_vfs_ioctl(file, fd, cmd, arg); + return vfs_ioctl(file, cmd, arg); } return -ENOIOCTLCMD; @@ -1580,6 +1593,11 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, goto out_fput; #endif + case FICLONE: + case FICLONERANGE: + case FIDEDUPERANGE: + goto do_ioctl; + case FIBMAP: case FIGETBSZ: case FIONREAD: @@ -1602,7 +1620,7 @@ COMPAT_SYSCALL_DEFINE3(ioctl, unsigned int, fd, unsigned int, cmd, if (compat_ioctl_check_table(XFORM(cmd))) goto found_handler; - error = do_ioctl_trans(fd, cmd, arg, f.file); + error = do_ioctl_trans(cmd, arg, f.file); if (error == -ENOIOCTLCMD) error = -ENOTTY; diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index b65d1ef53..ccc31fa6f 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h @@ -53,13 +53,14 @@ struct configfs_dirent { #define CONFIGFS_ROOT 0x0001 #define CONFIGFS_DIR 0x0002 #define CONFIGFS_ITEM_ATTR 0x0004 +#define CONFIGFS_ITEM_BIN_ATTR 0x0008 #define CONFIGFS_ITEM_LINK 0x0020 #define CONFIGFS_USET_DIR 0x0040 #define CONFIGFS_USET_DEFAULT 0x0080 #define CONFIGFS_USET_DROPPING 0x0100 #define CONFIGFS_USET_IN_MKDIR 0x0200 #define CONFIGFS_USET_CREATING 0x0400 -#define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR) +#define CONFIGFS_NOT_PINNED (CONFIGFS_ITEM_ATTR | CONFIGFS_ITEM_BIN_ATTR) extern struct mutex configfs_symlink_mutex; extern spinlock_t configfs_dirent_lock; @@ -72,6 +73,8 @@ extern struct inode * configfs_new_inode(umode_t mode, struct configfs_dirent *, extern int configfs_create(struct dentry *, umode_t mode, void (*init)(struct inode *)); extern int configfs_create_file(struct config_item *, const struct configfs_attribute *); +extern int configfs_create_bin_file(struct config_item *, + const struct configfs_bin_attribute *); extern int configfs_make_dirent(struct configfs_dirent *, struct dentry *, void *, umode_t, int); extern int configfs_dirent_is_ready(struct configfs_dirent *); @@ -88,7 +91,7 @@ extern void configfs_release_fs(void); extern struct rw_semaphore configfs_rename_sem; extern const struct file_operations configfs_dir_operations; extern const struct file_operations configfs_file_operations; -extern const struct file_operations bin_fops; +extern const struct file_operations configfs_bin_file_operations; extern const struct inode_operations configfs_dir_inode_operations; extern const struct inode_operations configfs_root_inode_operations; extern const struct inode_operations configfs_symlink_inode_operations; @@ -119,6 +122,13 @@ static inline struct configfs_attribute * to_attr(struct dentry * dentry) return ((struct configfs_attribute *) sd->s_element); } +static inline struct configfs_bin_attribute *to_bin_attr(struct dentry *dentry) +{ + struct configfs_attribute *attr = to_attr(dentry); + + return container_of(attr, struct configfs_bin_attribute, cb_attr); +} + static inline struct config_item *configfs_get_config_item(struct dentry *dentry) { struct config_item * item = NULL; diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index a7a1b218f..f419519ec 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -255,6 +255,12 @@ static void configfs_init_file(struct inode * inode) inode->i_fop = &configfs_file_operations; } +static void configfs_init_bin_file(struct inode *inode) +{ + inode->i_size = 0; + inode->i_fop = &configfs_bin_file_operations; +} + static void init_symlink(struct inode * inode) { inode->i_op = &configfs_symlink_inode_operations; @@ -423,7 +429,9 @@ static int configfs_attach_attr(struct configfs_dirent * sd, struct dentry * den spin_unlock(&configfs_dirent_lock); error = configfs_create(dentry, (attr->ca_mode & S_IALLUGO) | S_IFREG, - configfs_init_file); + (sd->s_type & CONFIGFS_ITEM_BIN_ATTR) ? + configfs_init_bin_file : + configfs_init_file); if (error) { configfs_put(sd); return error; @@ -583,6 +591,7 @@ static int populate_attrs(struct config_item *item) { struct config_item_type *t = item->ci_type; struct configfs_attribute *attr; + struct configfs_bin_attribute *bin_attr; int error = 0; int i; @@ -594,6 +603,13 @@ static int populate_attrs(struct config_item *item) break; } } + if (t->ct_bin_attrs) { + for (i = 0; (bin_attr = t->ct_bin_attrs[i]) != NULL; i++) { + error = configfs_create_bin_file(item, bin_attr); + if (error) + break; + } + } if (error) detach_attrs(item); @@ -624,13 +640,13 @@ static void detach_groups(struct config_group *group) child = sd->s_dentry; - mutex_lock(&d_inode(child)->i_mutex); + inode_lock(d_inode(child)); configfs_detach_group(sd->s_element); d_inode(child)->i_flags |= S_DEAD; dont_mount(child); - mutex_unlock(&d_inode(child)->i_mutex); + inode_unlock(d_inode(child)); d_delete(child); dput(child); @@ -818,11 +834,11 @@ static int configfs_attach_item(struct config_item *parent_item, * the VFS may already have hit and used them. Thus, * we must lock them as rmdir() would. */ - mutex_lock(&d_inode(dentry)->i_mutex); + inode_lock(d_inode(dentry)); configfs_remove_dir(item); d_inode(dentry)->i_flags |= S_DEAD; dont_mount(dentry); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); d_delete(dentry); } } @@ -858,7 +874,7 @@ static int configfs_attach_group(struct config_item *parent_item, * We must also lock the inode to remove it safely in case of * error, as rmdir() would. */ - mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD); configfs_adjust_dir_dirent_depth_before_populate(sd); ret = populate_groups(to_config_group(item)); if (ret) { @@ -867,7 +883,7 @@ static int configfs_attach_group(struct config_item *parent_item, dont_mount(dentry); } configfs_adjust_dir_dirent_depth_after_populate(sd); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); if (ret) d_delete(dentry); } @@ -1054,11 +1070,55 @@ out: return ret; } +static int configfs_do_depend_item(struct dentry *subsys_dentry, + struct config_item *target) +{ + struct configfs_dirent *p; + int ret; + + spin_lock(&configfs_dirent_lock); + /* Scan the tree, return 0 if found */ + ret = configfs_depend_prep(subsys_dentry, target); + if (ret) + goto out_unlock_dirent_lock; + + /* + * We are sure that the item is not about to be removed by rmdir(), and + * not in the middle of attachment by mkdir(). + */ + p = target->ci_dentry->d_fsdata; + p->s_dependent_count += 1; + +out_unlock_dirent_lock: + spin_unlock(&configfs_dirent_lock); + + return ret; +} + +static inline struct configfs_dirent * +configfs_find_subsys_dentry(struct configfs_dirent *root_sd, + struct config_item *subsys_item) +{ + struct configfs_dirent *p; + struct configfs_dirent *ret = NULL; + + list_for_each_entry(p, &root_sd->s_children, s_sibling) { + if (p->s_type & CONFIGFS_DIR && + p->s_element == subsys_item) { + ret = p; + break; + } + } + + return ret; +} + + int configfs_depend_item(struct configfs_subsystem *subsys, struct config_item *target) { int ret; - struct configfs_dirent *p, *root_sd, *subsys_sd = NULL; + struct configfs_dirent *subsys_sd; struct config_item *s_item = &subsys->su_group.cg_item; struct dentry *root; @@ -1075,43 +1135,19 @@ int configfs_depend_item(struct configfs_subsystem *subsys, * subsystem is really registered, and so we need to lock out * configfs_[un]register_subsystem(). */ - mutex_lock(&d_inode(root)->i_mutex); - - root_sd = root->d_fsdata; - - list_for_each_entry(p, &root_sd->s_children, s_sibling) { - if (p->s_type & CONFIGFS_DIR) { - if (p->s_element == s_item) { - subsys_sd = p; - break; - } - } - } + inode_lock(d_inode(root)); + subsys_sd = configfs_find_subsys_dentry(root->d_fsdata, s_item); if (!subsys_sd) { ret = -ENOENT; goto out_unlock_fs; } /* Ok, now we can trust subsys/s_item */ + ret = configfs_do_depend_item(subsys_sd->s_dentry, target); - spin_lock(&configfs_dirent_lock); - /* Scan the tree, return 0 if found */ - ret = configfs_depend_prep(subsys_sd->s_dentry, target); - if (ret) - goto out_unlock_dirent_lock; - - /* - * We are sure that the item is not about to be removed by rmdir(), and - * not in the middle of attachment by mkdir(). - */ - p = target->ci_dentry->d_fsdata; - p->s_dependent_count += 1; - -out_unlock_dirent_lock: - spin_unlock(&configfs_dirent_lock); out_unlock_fs: - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); /* * If we succeeded, the fs is pinned via other methods. If not, @@ -1128,8 +1164,7 @@ EXPORT_SYMBOL(configfs_depend_item); * configfs_depend_item() because we know that that the client driver is * pinned, thus the subsystem is pinned, and therefore configfs is pinned. */ -void configfs_undepend_item(struct configfs_subsystem *subsys, - struct config_item *target) +void configfs_undepend_item(struct config_item *target) { struct configfs_dirent *sd; @@ -1152,6 +1187,79 @@ void configfs_undepend_item(struct configfs_subsystem *subsys, } EXPORT_SYMBOL(configfs_undepend_item); +/* + * caller_subsys is a caller's subsystem not target's. This is used to + * determine if we should lock root and check subsys or not. When we are + * in the same subsystem as our target there is no need to do locking as + * we know that subsys is valid and is not unregistered during this function + * as we are called from callback of one of his children and VFS holds a lock + * on some inode. Otherwise we have to lock our root to ensure that target's + * subsystem it is not unregistered during this function. + */ +int configfs_depend_item_unlocked(struct configfs_subsystem *caller_subsys, + struct config_item *target) +{ + struct configfs_subsystem *target_subsys; + struct config_group *root, *parent; + struct configfs_dirent *subsys_sd; + int ret = -ENOENT; + + /* Disallow this function for configfs root */ + if (configfs_is_root(target)) + return -EINVAL; + + parent = target->ci_group; + /* + * This may happen when someone is trying to depend root + * directory of some subsystem + */ + if (configfs_is_root(&parent->cg_item)) { + target_subsys = to_configfs_subsystem(to_config_group(target)); + root = parent; + } else { + target_subsys = parent->cg_subsys; + /* Find a cofnigfs root as we may need it for locking */ + for (root = parent; !configfs_is_root(&root->cg_item); + root = root->cg_item.ci_group) + ; + } + + if (target_subsys != caller_subsys) { + /* + * We are in other configfs subsystem, so we have to do + * additional locking to prevent other subsystem from being + * unregistered + */ + inode_lock(d_inode(root->cg_item.ci_dentry)); + + /* + * As we are trying to depend item from other subsystem + * we have to check if this subsystem is still registered + */ + subsys_sd = configfs_find_subsys_dentry( + root->cg_item.ci_dentry->d_fsdata, + &target_subsys->su_group.cg_item); + if (!subsys_sd) + goto out_root_unlock; + } else { + subsys_sd = target_subsys->su_group.cg_item.ci_dentry->d_fsdata; + } + + /* Now we can execute core of depend item */ + ret = configfs_do_depend_item(subsys_sd->s_dentry, target); + + if (target_subsys != caller_subsys) +out_root_unlock: + /* + * We were called from subsystem other than our target so we + * took some locks so now it's time to release them + */ + inode_unlock(d_inode(root->cg_item.ci_dentry)); + + return ret; +} +EXPORT_SYMBOL(configfs_depend_item_unlocked); + static int configfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { int ret = 0; @@ -1453,7 +1561,7 @@ int configfs_rename_dir(struct config_item * item, const char *new_name) down_write(&configfs_rename_sem); parent = item->parent->dentry; - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); new_dentry = lookup_one_len(new_name, parent, strlen(new_name)); if (!IS_ERR(new_dentry)) { @@ -1469,7 +1577,7 @@ int configfs_rename_dir(struct config_item * item, const char *new_name) error = -EEXIST; dput(new_dentry); } - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); up_write(&configfs_rename_sem); return error; @@ -1482,7 +1590,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file) struct configfs_dirent * parent_sd = dentry->d_fsdata; int err; - mutex_lock(&d_inode(dentry)->i_mutex); + inode_lock(d_inode(dentry)); /* * Fake invisibility if dir belongs to a group/default groups hierarchy * being attached @@ -1495,7 +1603,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file) else err = 0; } - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); return err; } @@ -1505,11 +1613,11 @@ static int configfs_dir_close(struct inode *inode, struct file *file) struct dentry * dentry = file->f_path.dentry; struct configfs_dirent * cursor = file->private_data; - mutex_lock(&d_inode(dentry)->i_mutex); + inode_lock(d_inode(dentry)); spin_lock(&configfs_dirent_lock); list_del_init(&cursor->s_sibling); spin_unlock(&configfs_dirent_lock); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); release_configfs_dirent(cursor); @@ -1590,7 +1698,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence) { struct dentry * dentry = file->f_path.dentry; - mutex_lock(&d_inode(dentry)->i_mutex); + inode_lock(d_inode(dentry)); switch (whence) { case 1: offset += file->f_pos; @@ -1598,7 +1706,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence) if (offset >= 0) break; default: - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); return -EINVAL; } if (offset != file->f_pos) { @@ -1624,7 +1732,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence) spin_unlock(&configfs_dirent_lock); } } - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); return offset; } @@ -1659,14 +1767,14 @@ int configfs_register_group(struct config_group *parent_group, parent = parent_group->cg_item.ci_dentry; - mutex_lock_nested(&d_inode(parent)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(parent), I_MUTEX_PARENT); ret = create_default_group(parent_group, group); if (!ret) { spin_lock(&configfs_dirent_lock); configfs_dir_set_ready(group->cg_item.ci_dentry->d_fsdata); spin_unlock(&configfs_dirent_lock); } - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); return ret; } EXPORT_SYMBOL(configfs_register_group); @@ -1683,7 +1791,7 @@ void configfs_unregister_group(struct config_group *group) struct dentry *dentry = group->cg_item.ci_dentry; struct dentry *parent = group->cg_item.ci_parent->ci_dentry; - mutex_lock_nested(&d_inode(parent)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(parent), I_MUTEX_PARENT); spin_lock(&configfs_dirent_lock); configfs_detach_prep(dentry, NULL); spin_unlock(&configfs_dirent_lock); @@ -1692,7 +1800,7 @@ void configfs_unregister_group(struct config_group *group) d_inode(dentry)->i_flags |= S_DEAD; dont_mount(dentry); d_delete(dentry); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); dput(dentry); @@ -1764,7 +1872,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) sd = root->d_fsdata; link_group(to_config_group(sd->s_element), group); - mutex_lock_nested(&d_inode(root)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(root), I_MUTEX_PARENT); err = -ENOMEM; dentry = d_alloc_name(root, group->cg_item.ci_name); @@ -1784,7 +1892,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) } } - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); if (err) { unlink_group(group); @@ -1805,9 +1913,9 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) return; } - mutex_lock_nested(&d_inode(root)->i_mutex, + inode_lock_nested(d_inode(root), I_MUTEX_PARENT); - mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD); mutex_lock(&configfs_symlink_mutex); spin_lock(&configfs_dirent_lock); if (configfs_detach_prep(dentry, NULL)) { @@ -1818,11 +1926,11 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) configfs_detach_group(&group->cg_item); d_inode(dentry)->i_flags |= S_DEAD; dont_mount(dentry); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); d_delete(dentry); - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); dput(dentry); diff --git a/fs/configfs/file.c b/fs/configfs/file.c index d39099ea7..33b7ee34e 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -28,6 +28,7 @@ #include <linux/module.h> #include <linux/slab.h> #include <linux/mutex.h> +#include <linux/vmalloc.h> #include <asm/uaccess.h> #include <linux/configfs.h> @@ -48,6 +49,10 @@ struct configfs_buffer { struct configfs_item_operations * ops; struct mutex mutex; int needs_read_fill; + bool read_in_progress; + bool write_in_progress; + char *bin_buffer; + int bin_buffer_size; }; @@ -123,6 +128,87 @@ out: return retval; } +/** + * configfs_read_bin_file - read a binary attribute. + * @file: file pointer. + * @buf: buffer to fill. + * @count: number of bytes to read. + * @ppos: starting offset in file. + * + * Userspace wants to read a binary attribute file. The attribute + * descriptor is in the file's ->d_fsdata. The target item is in the + * directory's ->d_fsdata. + * + * We check whether we need to refill the buffer. If so we will + * call the attributes' attr->read() twice. The first time we + * will pass a NULL as a buffer pointer, which the attributes' method + * will use to return the size of the buffer required. If no error + * occurs we will allocate the buffer using vmalloc and call + * attr->read() again passing that buffer as an argument. + * Then we just copy to user-space using simple_read_from_buffer. + */ + +static ssize_t +configfs_read_bin_file(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct configfs_buffer *buffer = file->private_data; + struct dentry *dentry = file->f_path.dentry; + struct config_item *item = to_item(dentry->d_parent); + struct configfs_bin_attribute *bin_attr = to_bin_attr(dentry); + ssize_t retval = 0; + ssize_t len = min_t(size_t, count, PAGE_SIZE); + + mutex_lock(&buffer->mutex); + + /* we don't support switching read/write modes */ + if (buffer->write_in_progress) { + retval = -ETXTBSY; + goto out; + } + buffer->read_in_progress = 1; + + if (buffer->needs_read_fill) { + /* perform first read with buf == NULL to get extent */ + len = bin_attr->read(item, NULL, 0); + if (len <= 0) { + retval = len; + goto out; + } + + /* do not exceed the maximum value */ + if (bin_attr->cb_max_size && len > bin_attr->cb_max_size) { + retval = -EFBIG; + goto out; + } + + buffer->bin_buffer = vmalloc(len); + if (buffer->bin_buffer == NULL) { + retval = -ENOMEM; + goto out; + } + buffer->bin_buffer_size = len; + + /* perform second read to fill buffer */ + len = bin_attr->read(item, buffer->bin_buffer, len); + if (len < 0) { + retval = len; + vfree(buffer->bin_buffer); + buffer->bin_buffer_size = 0; + buffer->bin_buffer = NULL; + goto out; + } + + buffer->needs_read_fill = 0; + } + + retval = simple_read_from_buffer(buf, count, ppos, buffer->bin_buffer, + buffer->bin_buffer_size); +out: + mutex_unlock(&buffer->mutex); + return retval; +} + /** * fill_write_buffer - copy buffer from userspace. @@ -209,10 +295,80 @@ configfs_write_file(struct file *file, const char __user *buf, size_t count, lof return len; } -static int check_perm(struct inode * inode, struct file * file) +/** + * configfs_write_bin_file - write a binary attribute. + * @file: file pointer + * @buf: data to write + * @count: number of bytes + * @ppos: starting offset + * + * Writing to a binary attribute file is similar to a normal read. + * We buffer the consecutive writes (binary attribute files do not + * support lseek) in a continuously growing buffer, but we don't + * commit until the close of the file. + */ + +static ssize_t +configfs_write_bin_file(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct configfs_buffer *buffer = file->private_data; + struct dentry *dentry = file->f_path.dentry; + struct configfs_bin_attribute *bin_attr = to_bin_attr(dentry); + void *tbuf = NULL; + ssize_t len; + + mutex_lock(&buffer->mutex); + + /* we don't support switching read/write modes */ + if (buffer->read_in_progress) { + len = -ETXTBSY; + goto out; + } + buffer->write_in_progress = 1; + + /* buffer grows? */ + if (*ppos + count > buffer->bin_buffer_size) { + + if (bin_attr->cb_max_size && + *ppos + count > bin_attr->cb_max_size) { + len = -EFBIG; + } + + tbuf = vmalloc(*ppos + count); + if (tbuf == NULL) { + len = -ENOMEM; + goto out; + } + + /* copy old contents */ + if (buffer->bin_buffer) { + memcpy(tbuf, buffer->bin_buffer, + buffer->bin_buffer_size); + vfree(buffer->bin_buffer); + } + + /* clear the new area */ + memset(tbuf + buffer->bin_buffer_size, 0, + *ppos + count - buffer->bin_buffer_size); + buffer->bin_buffer = tbuf; + buffer->bin_buffer_size = *ppos + count; + } + + len = simple_write_to_buffer(buffer->bin_buffer, + buffer->bin_buffer_size, ppos, buf, count); + if (len > 0) + *ppos += len; +out: + mutex_unlock(&buffer->mutex); + return len; +} + +static int check_perm(struct inode * inode, struct file * file, int type) { struct config_item *item = configfs_get_config_item(file->f_path.dentry->d_parent); struct configfs_attribute * attr = to_attr(file->f_path.dentry); + struct configfs_bin_attribute *bin_attr = NULL; struct configfs_buffer * buffer; struct configfs_item_operations * ops = NULL; int error = 0; @@ -220,6 +376,9 @@ static int check_perm(struct inode * inode, struct file * file) if (!item || !attr) goto Einval; + if (type & CONFIGFS_ITEM_BIN_ATTR) + bin_attr = to_bin_attr(file->f_path.dentry); + /* Grab the module reference for this attribute if we have one */ if (!try_module_get(attr->ca_owner)) { error = -ENODEV; @@ -236,9 +395,14 @@ static int check_perm(struct inode * inode, struct file * file) * and we must have a store method. */ if (file->f_mode & FMODE_WRITE) { - if (!(inode->i_mode & S_IWUGO) || !attr->store) + if (!(inode->i_mode & S_IWUGO)) + goto Eaccess; + + if ((type & CONFIGFS_ITEM_ATTR) && !attr->store) goto Eaccess; + if ((type & CONFIGFS_ITEM_BIN_ATTR) && !bin_attr->write) + goto Eaccess; } /* File needs read support. @@ -246,7 +410,13 @@ static int check_perm(struct inode * inode, struct file * file) * must be a show method for it. */ if (file->f_mode & FMODE_READ) { - if (!(inode->i_mode & S_IRUGO) || !attr->show) + if (!(inode->i_mode & S_IRUGO)) + goto Eaccess; + + if ((type & CONFIGFS_ITEM_ATTR) && !attr->show) + goto Eaccess; + + if ((type & CONFIGFS_ITEM_BIN_ATTR) && !bin_attr->read) goto Eaccess; } @@ -260,6 +430,8 @@ static int check_perm(struct inode * inode, struct file * file) } mutex_init(&buffer->mutex); buffer->needs_read_fill = 1; + buffer->read_in_progress = 0; + buffer->write_in_progress = 0; buffer->ops = ops; file->private_data = buffer; goto Done; @@ -277,12 +449,7 @@ static int check_perm(struct inode * inode, struct file * file) return error; } -static int configfs_open_file(struct inode * inode, struct file * filp) -{ - return check_perm(inode,filp); -} - -static int configfs_release(struct inode * inode, struct file * filp) +static int configfs_release(struct inode *inode, struct file *filp) { struct config_item * item = to_item(filp->f_path.dentry->d_parent); struct configfs_attribute * attr = to_attr(filp->f_path.dentry); @@ -303,6 +470,47 @@ static int configfs_release(struct inode * inode, struct file * filp) return 0; } +static int configfs_open_file(struct inode *inode, struct file *filp) +{ + return check_perm(inode, filp, CONFIGFS_ITEM_ATTR); +} + +static int configfs_open_bin_file(struct inode *inode, struct file *filp) +{ + return check_perm(inode, filp, CONFIGFS_ITEM_BIN_ATTR); +} + +static int configfs_release_bin_file(struct inode *inode, struct file *filp) +{ + struct configfs_buffer *buffer = filp->private_data; + struct dentry *dentry = filp->f_path.dentry; + struct config_item *item = to_item(dentry->d_parent); + struct configfs_bin_attribute *bin_attr = to_bin_attr(dentry); + ssize_t len = 0; + int ret; + + buffer->read_in_progress = 0; + + if (buffer->write_in_progress) { + buffer->write_in_progress = 0; + + len = bin_attr->write(item, buffer->bin_buffer, + buffer->bin_buffer_size); + + /* vfree on NULL is safe */ + vfree(buffer->bin_buffer); + buffer->bin_buffer = NULL; + buffer->bin_buffer_size = 0; + buffer->needs_read_fill = 1; + } + + ret = configfs_release(inode, filp); + if (len < 0) + return len; + return ret; +} + + const struct file_operations configfs_file_operations = { .read = configfs_read_file, .write = configfs_write_file, @@ -311,6 +519,14 @@ const struct file_operations configfs_file_operations = { .release = configfs_release, }; +const struct file_operations configfs_bin_file_operations = { + .read = configfs_read_bin_file, + .write = configfs_write_bin_file, + .llseek = NULL, /* bin file is not seekable */ + .open = configfs_open_bin_file, + .release = configfs_release_bin_file, +}; + /** * configfs_create_file - create an attribute file for an item. * @item: item we're creating for. @@ -324,11 +540,32 @@ int configfs_create_file(struct config_item * item, const struct configfs_attrib umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG; int error = 0; - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_NORMAL); + inode_lock_nested(d_inode(dir), I_MUTEX_NORMAL); error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, CONFIGFS_ITEM_ATTR); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); return error; } +/** + * configfs_create_bin_file - create a binary attribute file for an item. + * @item: item we're creating for. + * @attr: atrribute descriptor. + */ + +int configfs_create_bin_file(struct config_item *item, + const struct configfs_bin_attribute *bin_attr) +{ + struct dentry *dir = item->ci_dentry; + struct configfs_dirent *parent_sd = dir->d_fsdata; + umode_t mode = (bin_attr->cb_attr.ca_mode & S_IALLUGO) | S_IFREG; + int error = 0; + + inode_lock_nested(dir->d_inode, I_MUTEX_NORMAL); + error = configfs_make_dirent(parent_sd, NULL, (void *) bin_attr, mode, + CONFIGFS_ITEM_BIN_ATTR); + inode_unlock(dir->d_inode); + + return error; +} diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index eae87575e..cee087d8f 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -218,7 +218,7 @@ const unsigned char * configfs_get_name(struct configfs_dirent *sd) if (sd->s_type & (CONFIGFS_DIR | CONFIGFS_ITEM_LINK)) return sd->s_dentry->d_name.name; - if (sd->s_type & CONFIGFS_ITEM_ATTR) { + if (sd->s_type & (CONFIGFS_ITEM_ATTR | CONFIGFS_ITEM_BIN_ATTR)) { attr = sd->s_element; return attr->ca_name; } @@ -255,7 +255,7 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name) /* no inode means this hasn't been made visible yet */ return; - mutex_lock(&d_inode(dir)->i_mutex); + inode_lock(d_inode(dir)); list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { if (!sd->s_element) continue; @@ -268,5 +268,5 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name) break; } } - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); } diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index ec5c8325b..db6d69289 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -279,27 +279,33 @@ static int configfs_getlink(struct dentry *dentry, char * path) } -static const char *configfs_follow_link(struct dentry *dentry, void **cookie) +static const char *configfs_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { - unsigned long page = get_zeroed_page(GFP_KERNEL); + char *body; int error; - if (!page) + if (!dentry) + return ERR_PTR(-ECHILD); + + body = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!body) return ERR_PTR(-ENOMEM); - error = configfs_getlink(dentry, (char *)page); + error = configfs_getlink(dentry, body); if (!error) { - return *cookie = (void *)page; + set_delayed_call(done, kfree_link, body); + return body; } - free_page(page); + kfree(body); return ERR_PTR(error); } const struct inode_operations configfs_symlink_inode_operations = { - .follow_link = configfs_follow_link, + .get_link = configfs_get_link, .readlink = generic_readlink, - .put_link = free_page_put_link, .setattr = configfs_setattr, }; diff --git a/fs/coredump.c b/fs/coredump.c index 1777331ee..9ea87e9fd 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -32,6 +32,7 @@ #include <linux/pipe_fs_i.h> #include <linux/oom.h> #include <linux/compat.h> +#include <linux/timekeeping.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> @@ -117,6 +118,26 @@ int cn_esc_printf(struct core_name *cn, const char *fmt, ...) ret = cn_vprintf(cn, fmt, arg); va_end(arg); + if (ret == 0) { + /* + * Ensure that this coredump name component can't cause the + * resulting corefile path to consist of a ".." or ".". + */ + if ((cn->used - cur == 1 && cn->corename[cur] == '.') || + (cn->used - cur == 2 && cn->corename[cur] == '.' + && cn->corename[cur+1] == '.')) + cn->corename[cur] = '!'; + + /* + * Empty names are fishy and could be used to create a "//" in a + * corefile name, causing the coredump to happen one directory + * level too high. Enforce that all components of the core + * pattern are at least one character long. + */ + if (cn->used == cur) + ret = cn_printf(cn, "!"); + } + for (; cur < cn->used; ++cur) { if (cn->corename[cur] == '/') cn->corename[cur] = '!'; @@ -232,9 +253,10 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm) break; /* UNIX time of coredump */ case 't': { - struct timeval tv; - do_gettimeofday(&tv); - err = cn_printf(cn, "%lu", tv.tv_sec); + time64_t time; + + time = ktime_get_real_seconds(); + err = cn_printf(cn, "%lld", time); break; } /* hostname */ diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index 355c522f3..b862bc219 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -100,6 +100,7 @@ static struct inode *get_cramfs_inode(struct super_block *sb, break; case S_IFLNK: inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); inode->i_data.a_ops = &cramfs_aops; break; default: @@ -24,57 +24,91 @@ #include <linux/memcontrol.h> #include <linux/mm.h> #include <linux/mutex.h> +#include <linux/pagevec.h> #include <linux/pmem.h> #include <linux/sched.h> #include <linux/uio.h> #include <linux/vmstat.h> +#include <linux/pfn_t.h> +#include <linux/sizes.h> + +static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax) +{ + struct request_queue *q = bdev->bd_queue; + long rc = -EIO; + + dax->addr = (void __pmem *) ERR_PTR(-EIO); + if (blk_queue_enter(q, true) != 0) + return rc; + + rc = bdev_direct_access(bdev, dax); + if (rc < 0) { + dax->addr = (void __pmem *) ERR_PTR(rc); + blk_queue_exit(q); + return rc; + } + return rc; +} + +static void dax_unmap_atomic(struct block_device *bdev, + const struct blk_dax_ctl *dax) +{ + if (IS_ERR(dax->addr)) + return; + blk_queue_exit(bdev->bd_queue); +} + +struct page *read_dax_sector(struct block_device *bdev, sector_t n) +{ + struct page *page = alloc_pages(GFP_KERNEL, 0); + struct blk_dax_ctl dax = { + .size = PAGE_SIZE, + .sector = n & ~((((int) PAGE_SIZE) / 512) - 1), + }; + long rc; + + if (!page) + return ERR_PTR(-ENOMEM); + + rc = dax_map_atomic(bdev, &dax); + if (rc < 0) + return ERR_PTR(rc); + memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE); + dax_unmap_atomic(bdev, &dax); + return page; +} /* - * dax_clear_blocks() is called from within transaction context from XFS, + * dax_clear_sectors() is called from within transaction context from XFS, * and hence this means the stack from this point must follow GFP_NOFS * semantics for all operations. */ -int dax_clear_blocks(struct inode *inode, sector_t block, long size) +int dax_clear_sectors(struct block_device *bdev, sector_t _sector, long _size) { - struct block_device *bdev = inode->i_sb->s_bdev; - sector_t sector = block << (inode->i_blkbits - 9); + struct blk_dax_ctl dax = { + .sector = _sector, + .size = _size, + }; might_sleep(); do { - void __pmem *addr; - unsigned long pfn; - long count; + long count, sz; - count = bdev_direct_access(bdev, sector, &addr, &pfn, size); + count = dax_map_atomic(bdev, &dax); if (count < 0) return count; - BUG_ON(size < count); - while (count > 0) { - unsigned pgsz = PAGE_SIZE - offset_in_page(addr); - if (pgsz > count) - pgsz = count; - clear_pmem(addr, pgsz); - addr += pgsz; - size -= pgsz; - count -= pgsz; - BUG_ON(pgsz & 511); - sector += pgsz / 512; - cond_resched(); - } - } while (size); + sz = min_t(long, count, SZ_128K); + clear_pmem(dax.addr, sz); + dax.size -= sz; + dax.sector += sz / 512; + dax_unmap_atomic(bdev, &dax); + cond_resched(); + } while (dax.size); wmb_pmem(); return 0; } -EXPORT_SYMBOL_GPL(dax_clear_blocks); - -static long dax_get_addr(struct buffer_head *bh, void __pmem **addr, - unsigned blkbits) -{ - unsigned long pfn; - sector_t sector = bh->b_blocknr << (blkbits - 9); - return bdev_direct_access(bh->b_bdev, sector, addr, &pfn, bh->b_size); -} +EXPORT_SYMBOL_GPL(dax_clear_sectors); /* the clear_pmem() calls are ordered by a wmb_pmem() in the caller */ static void dax_new_buf(void __pmem *addr, unsigned size, unsigned first, @@ -105,19 +139,29 @@ static bool buffer_size_valid(struct buffer_head *bh) return bh->b_state != 0; } + +static sector_t to_sector(const struct buffer_head *bh, + const struct inode *inode) +{ + sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9); + + return sector; +} + static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, loff_t start, loff_t end, get_block_t get_block, struct buffer_head *bh) { - ssize_t retval = 0; - loff_t pos = start; - loff_t max = start; - loff_t bh_max = start; - void __pmem *addr; - bool hole = false; - bool need_wmb = false; - - if (iov_iter_rw(iter) != WRITE) + loff_t pos = start, max = start, bh_max = start; + bool hole = false, need_wmb = false; + struct block_device *bdev = NULL; + int rw = iov_iter_rw(iter), rc; + long map_len = 0; + struct blk_dax_ctl dax = { + .addr = (void __pmem *) ERR_PTR(-EIO), + }; + + if (rw == READ) end = min(end, i_size_read(inode)); while (pos < end) { @@ -132,13 +176,13 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, if (pos == bh_max) { bh->b_size = PAGE_ALIGN(end - pos); bh->b_state = 0; - retval = get_block(inode, block, bh, - iov_iter_rw(iter) == WRITE); - if (retval) + rc = get_block(inode, block, bh, rw == WRITE); + if (rc) break; if (!buffer_size_valid(bh)) bh->b_size = 1 << blkbits; bh_max = pos - first + bh->b_size; + bdev = bh->b_bdev; } else { unsigned done = bh->b_size - (bh_max - (pos - first)); @@ -146,47 +190,53 @@ static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, bh->b_size -= done; } - hole = iov_iter_rw(iter) != WRITE && !buffer_written(bh); + hole = rw == READ && !buffer_written(bh); if (hole) { - addr = NULL; size = bh->b_size - first; } else { - retval = dax_get_addr(bh, &addr, blkbits); - if (retval < 0) + dax_unmap_atomic(bdev, &dax); + dax.sector = to_sector(bh, inode); + dax.size = bh->b_size; + map_len = dax_map_atomic(bdev, &dax); + if (map_len < 0) { + rc = map_len; break; + } if (buffer_unwritten(bh) || buffer_new(bh)) { - dax_new_buf(addr, retval, first, pos, - end); + dax_new_buf(dax.addr, map_len, first, + pos, end); need_wmb = true; } - addr += first; - size = retval - first; + dax.addr += first; + size = map_len - first; } max = min(pos + size, end); } if (iov_iter_rw(iter) == WRITE) { - len = copy_from_iter_pmem(addr, max - pos, iter); + len = copy_from_iter_pmem(dax.addr, max - pos, iter); need_wmb = true; } else if (!hole) - len = copy_to_iter((void __force *)addr, max - pos, + len = copy_to_iter((void __force *) dax.addr, max - pos, iter); else len = iov_iter_zero(max - pos, iter); if (!len) { - retval = -EFAULT; + rc = -EFAULT; break; } pos += len; - addr += len; + if (!IS_ERR(dax.addr)) + dax.addr += len; } if (need_wmb) wmb_pmem(); + dax_unmap_atomic(bdev, &dax); - return (pos == start) ? retval : pos - start; + return (pos == start) ? rc : pos - start; } /** @@ -215,13 +265,14 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, loff_t end = pos + iov_iter_count(iter); memset(&bh, 0, sizeof(bh)); + bh.b_bdev = inode->i_sb->s_bdev; if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) { struct address_space *mapping = inode->i_mapping; - mutex_lock(&inode->i_mutex); + inode_lock(inode); retval = filemap_write_and_wait_range(mapping, pos, end - 1); if (retval) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out; } } @@ -233,7 +284,7 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, retval = dax_io(inode, iter, pos, end, get_block, &bh); if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if ((retval > 0) && end_io) end_io(iocb, pos, retval, bh.b_private); @@ -275,28 +326,231 @@ static int dax_load_hole(struct address_space *mapping, struct page *page, return VM_FAULT_LOCKED; } -static int copy_user_bh(struct page *to, struct buffer_head *bh, - unsigned blkbits, unsigned long vaddr) +static int copy_user_bh(struct page *to, struct inode *inode, + struct buffer_head *bh, unsigned long vaddr) { - void __pmem *vfrom; + struct blk_dax_ctl dax = { + .sector = to_sector(bh, inode), + .size = bh->b_size, + }; + struct block_device *bdev = bh->b_bdev; void *vto; - if (dax_get_addr(bh, &vfrom, blkbits) < 0) - return -EIO; + if (dax_map_atomic(bdev, &dax) < 0) + return PTR_ERR(dax.addr); vto = kmap_atomic(to); - copy_user_page(vto, (void __force *)vfrom, vaddr, to); + copy_user_page(vto, (void __force *)dax.addr, vaddr, to); kunmap_atomic(vto); + dax_unmap_atomic(bdev, &dax); return 0; } +#define NO_SECTOR -1 +#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_CACHE_SHIFT)) + +static int dax_radix_entry(struct address_space *mapping, pgoff_t index, + sector_t sector, bool pmd_entry, bool dirty) +{ + struct radix_tree_root *page_tree = &mapping->page_tree; + pgoff_t pmd_index = DAX_PMD_INDEX(index); + int type, error = 0; + void *entry; + + WARN_ON_ONCE(pmd_entry && !dirty); + if (dirty) + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + + spin_lock_irq(&mapping->tree_lock); + + entry = radix_tree_lookup(page_tree, pmd_index); + if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) { + index = pmd_index; + goto dirty; + } + + entry = radix_tree_lookup(page_tree, index); + if (entry) { + type = RADIX_DAX_TYPE(entry); + if (WARN_ON_ONCE(type != RADIX_DAX_PTE && + type != RADIX_DAX_PMD)) { + error = -EIO; + goto unlock; + } + + if (!pmd_entry || type == RADIX_DAX_PMD) + goto dirty; + + /* + * We only insert dirty PMD entries into the radix tree. This + * means we don't need to worry about removing a dirty PTE + * entry and inserting a clean PMD entry, thus reducing the + * range we would flush with a follow-up fsync/msync call. + */ + radix_tree_delete(&mapping->page_tree, index); + mapping->nrexceptional--; + } + + if (sector == NO_SECTOR) { + /* + * This can happen during correct operation if our pfn_mkwrite + * fault raced against a hole punch operation. If this + * happens the pte that was hole punched will have been + * unmapped and the radix tree entry will have been removed by + * the time we are called, but the call will still happen. We + * will return all the way up to wp_pfn_shared(), where the + * pte_same() check will fail, eventually causing page fault + * to be retried by the CPU. + */ + goto unlock; + } + + error = radix_tree_insert(page_tree, index, + RADIX_DAX_ENTRY(sector, pmd_entry)); + if (error) + goto unlock; + + mapping->nrexceptional++; + dirty: + if (dirty) + radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); + unlock: + spin_unlock_irq(&mapping->tree_lock); + return error; +} + +static int dax_writeback_one(struct block_device *bdev, + struct address_space *mapping, pgoff_t index, void *entry) +{ + struct radix_tree_root *page_tree = &mapping->page_tree; + int type = RADIX_DAX_TYPE(entry); + struct radix_tree_node *node; + struct blk_dax_ctl dax; + void **slot; + int ret = 0; + + spin_lock_irq(&mapping->tree_lock); + /* + * Regular page slots are stabilized by the page lock even + * without the tree itself locked. These unlocked entries + * need verification under the tree lock. + */ + if (!__radix_tree_lookup(page_tree, index, &node, &slot)) + goto unlock; + if (*slot != entry) + goto unlock; + + /* another fsync thread may have already written back this entry */ + if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) + goto unlock; + + if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) { + ret = -EIO; + goto unlock; + } + + dax.sector = RADIX_DAX_SECTOR(entry); + dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE); + spin_unlock_irq(&mapping->tree_lock); + + /* + * We cannot hold tree_lock while calling dax_map_atomic() because it + * eventually calls cond_resched(). + */ + ret = dax_map_atomic(bdev, &dax); + if (ret < 0) + return ret; + + if (WARN_ON_ONCE(ret < dax.size)) { + ret = -EIO; + goto unmap; + } + + wb_cache_pmem(dax.addr, dax.size); + + spin_lock_irq(&mapping->tree_lock); + radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE); + spin_unlock_irq(&mapping->tree_lock); + unmap: + dax_unmap_atomic(bdev, &dax); + return ret; + + unlock: + spin_unlock_irq(&mapping->tree_lock); + return ret; +} + +/* + * Flush the mapping to the persistent domain within the byte range of [start, + * end]. This is required by data integrity operations to ensure file data is + * on persistent storage prior to completion of the operation. + */ +int dax_writeback_mapping_range(struct address_space *mapping, + struct block_device *bdev, struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + pgoff_t start_index, end_index, pmd_index; + pgoff_t indices[PAGEVEC_SIZE]; + struct pagevec pvec; + bool done = false; + int i, ret = 0; + void *entry; + + if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) + return -EIO; + + if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL) + return 0; + + start_index = wbc->range_start >> PAGE_CACHE_SHIFT; + end_index = wbc->range_end >> PAGE_CACHE_SHIFT; + pmd_index = DAX_PMD_INDEX(start_index); + + rcu_read_lock(); + entry = radix_tree_lookup(&mapping->page_tree, pmd_index); + rcu_read_unlock(); + + /* see if the start of our range is covered by a PMD entry */ + if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) + start_index = pmd_index; + + tag_pages_for_writeback(mapping, start_index, end_index); + + pagevec_init(&pvec, 0); + while (!done) { + pvec.nr = find_get_entries_tag(mapping, start_index, + PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE, + pvec.pages, indices); + + if (pvec.nr == 0) + break; + + for (i = 0; i < pvec.nr; i++) { + if (indices[i] > end_index) { + done = true; + break; + } + + ret = dax_writeback_one(bdev, mapping, indices[i], + pvec.pages[i]); + if (ret < 0) + return ret; + } + } + wmb_pmem(); + return 0; +} +EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); + static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, struct vm_area_struct *vma, struct vm_fault *vmf) { - struct address_space *mapping = inode->i_mapping; - sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9); unsigned long vaddr = (unsigned long)vmf->virtual_address; - void __pmem *addr; - unsigned long pfn; + struct address_space *mapping = inode->i_mapping; + struct block_device *bdev = bh->b_bdev; + struct blk_dax_ctl dax = { + .sector = to_sector(bh, inode), + .size = bh->b_size, + }; pgoff_t size; int error; @@ -315,20 +569,23 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, goto out; } - error = bdev_direct_access(bh->b_bdev, sector, &addr, &pfn, bh->b_size); - if (error < 0) - goto out; - if (error < PAGE_SIZE) { - error = -EIO; + if (dax_map_atomic(bdev, &dax) < 0) { + error = PTR_ERR(dax.addr); goto out; } if (buffer_unwritten(bh) || buffer_new(bh)) { - clear_pmem(addr, PAGE_SIZE); + clear_pmem(dax.addr, PAGE_SIZE); wmb_pmem(); } + dax_unmap_atomic(bdev, &dax); - error = vm_insert_mixed(vma, vaddr, pfn); + error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false, + vmf->flags & FAULT_FLAG_WRITE); + if (error) + goto out; + + error = vm_insert_mixed(vma, vaddr, dax.pfn); out: i_mmap_unlock_read(mapping); @@ -373,6 +630,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, memset(&bh, 0, sizeof(bh)); block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits); + bh.b_bdev = inode->i_sb->s_bdev; bh.b_size = PAGE_SIZE; repeat: @@ -422,7 +680,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, if (vmf->cow_page) { struct page *new_page = vmf->cow_page; if (buffer_written(&bh)) - error = copy_user_bh(new_page, &bh, blkbits, vaddr); + error = copy_user_bh(new_page, inode, &bh, vaddr); else clear_user_highpage(new_page, vaddr); if (error) @@ -452,6 +710,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, delete_from_page_cache(page); unlock_page(page); page_cache_release(page); + page = NULL; } /* @@ -523,6 +782,24 @@ EXPORT_SYMBOL_GPL(dax_fault); */ #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) +static void __dax_dbg(struct buffer_head *bh, unsigned long address, + const char *reason, const char *fn) +{ + if (bh) { + char bname[BDEVNAME_SIZE]; + bdevname(bh->b_bdev, bname); + pr_debug("%s: %s addr: %lx dev %s state %lx start %lld " + "length %zd fallback: %s\n", fn, current->comm, + address, bname, bh->b_state, (u64)bh->b_blocknr, + bh->b_size, reason); + } else { + pr_debug("%s: %s addr: %lx fallback: %s\n", fn, + current->comm, address, reason); + } +} + +#define dax_pmd_dbg(bh, address, reason) __dax_dbg(bh, address, reason, "dax_pmd") + int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, unsigned int flags, get_block_t get_block, dax_iodone_t complete_unwritten) @@ -534,61 +811,83 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, unsigned blkbits = inode->i_blkbits; unsigned long pmd_addr = address & PMD_MASK; bool write = flags & FAULT_FLAG_WRITE; - long length; - void __pmem *kaddr; + struct block_device *bdev; pgoff_t size, pgoff; - sector_t block, sector; - unsigned long pfn; - int result = 0; + sector_t block; + int error, result = 0; + bool alloc = false; - /* dax pmd mappings are broken wrt gup and fork */ + /* dax pmd mappings require pfn_t_devmap() */ if (!IS_ENABLED(CONFIG_FS_DAX_PMD)) return VM_FAULT_FALLBACK; /* Fall back to PTEs if we're going to COW */ - if (write && !(vma->vm_flags & VM_SHARED)) + if (write && !(vma->vm_flags & VM_SHARED)) { + split_huge_pmd(vma, pmd, address); + dax_pmd_dbg(NULL, address, "cow write"); return VM_FAULT_FALLBACK; + } /* If the PMD would extend outside the VMA */ - if (pmd_addr < vma->vm_start) + if (pmd_addr < vma->vm_start) { + dax_pmd_dbg(NULL, address, "vma start unaligned"); return VM_FAULT_FALLBACK; - if ((pmd_addr + PMD_SIZE) > vma->vm_end) + } + if ((pmd_addr + PMD_SIZE) > vma->vm_end) { + dax_pmd_dbg(NULL, address, "vma end unaligned"); return VM_FAULT_FALLBACK; + } pgoff = linear_page_index(vma, pmd_addr); size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (pgoff >= size) return VM_FAULT_SIGBUS; /* If the PMD would cover blocks out of the file */ - if ((pgoff | PG_PMD_COLOUR) >= size) + if ((pgoff | PG_PMD_COLOUR) >= size) { + dax_pmd_dbg(NULL, address, + "offset + huge page size > file size"); return VM_FAULT_FALLBACK; + } memset(&bh, 0, sizeof(bh)); + bh.b_bdev = inode->i_sb->s_bdev; block = (sector_t)pgoff << (PAGE_SHIFT - blkbits); bh.b_size = PMD_SIZE; - length = get_block(inode, block, &bh, write); - if (length) + + if (get_block(inode, block, &bh, 0) != 0) return VM_FAULT_SIGBUS; - i_mmap_lock_read(mapping); + + if (!buffer_mapped(&bh) && write) { + if (get_block(inode, block, &bh, 1) != 0) + return VM_FAULT_SIGBUS; + alloc = true; + } + + bdev = bh.b_bdev; /* * If the filesystem isn't willing to tell us the length of a hole, * just fall back to PTEs. Calling get_block 512 times in a loop * would be silly. */ - if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) - goto fallback; + if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) { + dax_pmd_dbg(&bh, address, "allocated block too small"); + return VM_FAULT_FALLBACK; + } /* * If we allocated new storage, make sure no process has any * zero pages covering this hole */ - if (buffer_new(&bh)) { - i_mmap_unlock_read(mapping); - unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0); - i_mmap_lock_read(mapping); + if (alloc) { + loff_t lstart = pgoff << PAGE_SHIFT; + loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */ + + truncate_pagecache_range(inode, lstart, lend); } + i_mmap_lock_read(mapping); + /* * If a truncate happened while we were allocating blocks, we may * leave blocks allocated to the file that are beyond EOF. We can't @@ -600,57 +899,108 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, result = VM_FAULT_SIGBUS; goto out; } - if ((pgoff | PG_PMD_COLOUR) >= size) + if ((pgoff | PG_PMD_COLOUR) >= size) { + dax_pmd_dbg(&bh, address, + "offset + huge page size > file size"); goto fallback; + } if (!write && !buffer_mapped(&bh) && buffer_uptodate(&bh)) { spinlock_t *ptl; pmd_t entry; struct page *zero_page = get_huge_zero_page(); - if (unlikely(!zero_page)) + if (unlikely(!zero_page)) { + dax_pmd_dbg(&bh, address, "no zero page"); goto fallback; + } ptl = pmd_lock(vma->vm_mm, pmd); if (!pmd_none(*pmd)) { spin_unlock(ptl); + dax_pmd_dbg(&bh, address, "pmd already present"); goto fallback; } + dev_dbg(part_to_dev(bdev->bd_part), + "%s: %s addr: %lx pfn: <zero> sect: %llx\n", + __func__, current->comm, address, + (unsigned long long) to_sector(&bh, inode)); + entry = mk_pmd(zero_page, vma->vm_page_prot); entry = pmd_mkhuge(entry); set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry); result = VM_FAULT_NOPAGE; spin_unlock(ptl); } else { - sector = bh.b_blocknr << (blkbits - 9); - length = bdev_direct_access(bh.b_bdev, sector, &kaddr, &pfn, - bh.b_size); + struct blk_dax_ctl dax = { + .sector = to_sector(&bh, inode), + .size = PMD_SIZE, + }; + long length = dax_map_atomic(bdev, &dax); + if (length < 0) { result = VM_FAULT_SIGBUS; goto out; } - if ((length < PMD_SIZE) || (pfn & PG_PMD_COLOUR)) + if (length < PMD_SIZE) { + dax_pmd_dbg(&bh, address, "dax-length too small"); + dax_unmap_atomic(bdev, &dax); goto fallback; + } + if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) { + dax_pmd_dbg(&bh, address, "pfn unaligned"); + dax_unmap_atomic(bdev, &dax); + goto fallback; + } - /* - * TODO: teach vmf_insert_pfn_pmd() to support - * 'pte_special' for pmds - */ - if (pfn_valid(pfn)) + if (!pfn_t_devmap(dax.pfn)) { + dax_unmap_atomic(bdev, &dax); + dax_pmd_dbg(&bh, address, "pfn not in memmap"); goto fallback; + } if (buffer_unwritten(&bh) || buffer_new(&bh)) { - int i; - for (i = 0; i < PTRS_PER_PMD; i++) - clear_pmem(kaddr + i * PAGE_SIZE, PAGE_SIZE); + clear_pmem(dax.addr, PMD_SIZE); wmb_pmem(); count_vm_event(PGMAJFAULT); mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); result |= VM_FAULT_MAJOR; } + dax_unmap_atomic(bdev, &dax); - result |= vmf_insert_pfn_pmd(vma, address, pmd, pfn, write); + /* + * For PTE faults we insert a radix tree entry for reads, and + * leave it clean. Then on the first write we dirty the radix + * tree entry via the dax_pfn_mkwrite() path. This sequence + * allows the dax_pfn_mkwrite() call to be simpler and avoid a + * call into get_block() to translate the pgoff to a sector in + * order to be able to create a new radix tree entry. + * + * The PMD path doesn't have an equivalent to + * dax_pfn_mkwrite(), though, so for a read followed by a + * write we traverse all the way through __dax_pmd_fault() + * twice. This means we can just skip inserting a radix tree + * entry completely on the initial read and just wait until + * the write to insert a dirty entry. + */ + if (write) { + error = dax_radix_entry(mapping, pgoff, dax.sector, + true, true); + if (error) { + dax_pmd_dbg(&bh, address, + "PMD radix insertion failed"); + goto fallback; + } + } + + dev_dbg(part_to_dev(bdev->bd_part), + "%s: %s addr: %lx pfn: %lx sect: %llx\n", + __func__, current->comm, address, + pfn_t_to_pfn(dax.pfn), + (unsigned long long) dax.sector); + result |= vmf_insert_pfn_pmd(vma, address, pmd, + dax.pfn, write); } out: @@ -702,15 +1052,27 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault); * dax_pfn_mkwrite - handle first write to DAX page * @vma: The virtual memory area where the fault occurred * @vmf: The description of the fault - * */ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { - struct super_block *sb = file_inode(vma->vm_file)->i_sb; + struct file *file = vma->vm_file; + int error; + + /* + * We pass NO_SECTOR to dax_radix_entry() because we expect that a + * RADIX_DAX_PTE entry already exists in the radix tree from a + * previous call to __dax_fault(). We just want to look up that PTE + * entry using vmf->pgoff and make sure the dirty tag is set. This + * saves us from having to make a call to get_block() here to look + * up the sector. + */ + error = dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, + true); - sb_start_pagefault(sb); - file_update_time(vma->vm_file); - sb_end_pagefault(sb); + if (error == -ENOMEM) + return VM_FAULT_OOM; + if (error) + return VM_FAULT_SIGBUS; return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); @@ -747,17 +1109,23 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length, BUG_ON((offset + length) > PAGE_CACHE_SIZE); memset(&bh, 0, sizeof(bh)); + bh.b_bdev = inode->i_sb->s_bdev; bh.b_size = PAGE_CACHE_SIZE; err = get_block(inode, index, &bh, 0); if (err < 0) return err; if (buffer_written(&bh)) { - void __pmem *addr; - err = dax_get_addr(&bh, &addr, inode->i_blkbits); - if (err < 0) - return err; - clear_pmem(addr + offset, length); + struct block_device *bdev = bh.b_bdev; + struct blk_dax_ctl dax = { + .sector = to_sector(&bh, inode), + .size = PAGE_CACHE_SIZE, + }; + + if (dax_map_atomic(bdev, &dax) < 0) + return PTR_ERR(dax.addr); + clear_pmem(dax.addr + offset, length); wmb_pmem(); + dax_unmap_atomic(bdev, &dax); } return 0; diff --git a/fs/dcache.c b/fs/dcache.c index 927ed93af..2398f9f94 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1156,7 +1156,7 @@ enum d_walk_ret { * * The @enter() and @finish() callbacks are called with d_lock held. */ -void d_walk(struct dentry *parent, void *data, +static void d_walk(struct dentry *parent, void *data, enum d_walk_ret (*enter)(void *, struct dentry *), void (*finish)(void *)) { @@ -1261,7 +1261,6 @@ rename_retry: seq = 1; goto again; } -EXPORT_SYMBOL_GPL(d_walk); /* * Search for at least 1 mount point in the dentry's subdirs. @@ -1561,7 +1560,8 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) dentry->d_iname[DNAME_INLINE_LEN-1] = 0; if (name->len > DNAME_INLINE_LEN-1) { size_t size = offsetof(struct external_name, name[1]); - struct external_name *p = kmalloc(size + name->len, GFP_KERNEL); + struct external_name *p = kmalloc(size + name->len, + GFP_KERNEL_ACCOUNT); if (!p) { kmem_cache_free(dentry_cache, dentry); return NULL; @@ -1724,7 +1724,7 @@ static unsigned d_flags_for_inode(struct inode *inode) } if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) { - if (unlikely(inode->i_op->follow_link)) { + if (unlikely(inode->i_op->get_link)) { add_flags = DCACHE_SYMLINK_TYPE; goto type_determined; } @@ -2452,7 +2452,7 @@ EXPORT_SYMBOL(d_rehash); */ void dentry_update_name_case(struct dentry *dentry, struct qstr *name) { - BUG_ON(!mutex_is_locked(&dentry->d_parent->d_inode->i_mutex)); + BUG_ON(!inode_is_locked(dentry->d_parent->d_inode)); BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */ spin_lock(&dentry->d_lock); @@ -2728,7 +2728,7 @@ static int __d_unalias(struct inode *inode, if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex)) goto out_err; m1 = &dentry->d_sb->s_vfs_rename_mutex; - if (!mutex_trylock(&alias->d_parent->d_inode->i_mutex)) + if (!inode_trylock(alias->d_parent->d_inode)) goto out_err; m2 = &alias->d_parent->d_inode->i_mutex; out_unalias: @@ -3294,18 +3294,18 @@ out: * @new_dentry: new dentry * @old_dentry: old dentry * - * Returns 1 if new_dentry is a subdirectory of the parent (at any depth). - * Returns 0 otherwise. + * Returns true if new_dentry is a subdirectory of the parent (at any depth). + * Returns false otherwise. * Caller must ensure that "new_dentry" is pinned before calling is_subdir() */ -int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) +bool is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) { - int result; + bool result; unsigned seq; if (new_dentry == old_dentry) - return 1; + return true; do { /* for restarting inner loop in case of seq retry */ @@ -3316,9 +3316,9 @@ int is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) */ rcu_read_lock(); if (d_ancestor(old_dentry, new_dentry)) - result = 1; + result = true; else - result = 0; + result = false; rcu_read_unlock(); } while (read_seqretry(&rename_lock, seq)); @@ -3406,7 +3406,7 @@ static void __init dcache_init(void) * of the dcache. */ dentry_cache = KMEM_CACHE(dentry, - SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD); + SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT); /* Hash may have been set up in dcache_init_early */ if (!hashdist) diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index b7fcc0de0..bece948b3 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -265,7 +265,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) if (!parent) parent = debugfs_mount->mnt_root; - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); dentry = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(dentry) && d_really_is_positive(dentry)) { dput(dentry); @@ -273,7 +273,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) } if (IS_ERR(dentry)) { - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); simple_release_fs(&debugfs_mount, &debugfs_mount_count); } @@ -282,7 +282,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) static struct dentry *failed_creating(struct dentry *dentry) { - mutex_unlock(&d_inode(dentry->d_parent)->i_mutex); + inode_unlock(d_inode(dentry->d_parent)); dput(dentry); simple_release_fs(&debugfs_mount, &debugfs_mount_count); return NULL; @@ -290,7 +290,7 @@ static struct dentry *failed_creating(struct dentry *dentry) static struct dentry *end_creating(struct dentry *dentry) { - mutex_unlock(&d_inode(dentry->d_parent)->i_mutex); + inode_unlock(d_inode(dentry->d_parent)); return dentry; } @@ -560,9 +560,9 @@ void debugfs_remove(struct dentry *dentry) if (!parent || d_really_is_negative(parent)) return; - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); ret = __debugfs_remove(dentry, parent); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); if (!ret) simple_release_fs(&debugfs_mount, &debugfs_mount_count); } @@ -594,7 +594,7 @@ void debugfs_remove_recursive(struct dentry *dentry) parent = dentry; down: - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); loop: /* * The parent->d_subdirs is protected by the d_lock. Outside that @@ -609,7 +609,7 @@ void debugfs_remove_recursive(struct dentry *dentry) /* perhaps simple_empty(child) makes more sense */ if (!list_empty(&child->d_subdirs)) { spin_unlock(&parent->d_lock); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); parent = child; goto down; } @@ -630,10 +630,10 @@ void debugfs_remove_recursive(struct dentry *dentry) } spin_unlock(&parent->d_lock); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); child = parent; parent = parent->d_parent; - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); if (child != dentry) /* go up */ @@ -641,7 +641,7 @@ void debugfs_remove_recursive(struct dentry *dentry) if (!__debugfs_remove(child, parent)) simple_release_fs(&debugfs_mount, &debugfs_mount_count); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); } EXPORT_SYMBOL_GPL(debugfs_remove_recursive); diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 706de324f..655f21f99 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -255,7 +255,7 @@ static int mknod_ptmx(struct super_block *sb) if (!uid_valid(root_uid) || !gid_valid(root_gid)) return -EINVAL; - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); /* If we have already created ptmx node, return */ if (fsi->ptmx_dentry) { @@ -292,7 +292,7 @@ static int mknod_ptmx(struct super_block *sb) fsi->ptmx_dentry = dentry; rc = 0; out: - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); return rc; } @@ -635,7 +635,7 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, sprintf(s, "%d", index); - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); dentry = d_alloc_name(root, s); if (dentry) { @@ -646,7 +646,7 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, inode = ERR_PTR(-ENOMEM); } - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); return inode; } @@ -691,7 +691,7 @@ void devpts_pty_kill(struct inode *inode) BUG_ON(inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR)); - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); dentry = d_find_alias(inode); @@ -700,7 +700,7 @@ void devpts_pty_kill(struct inode *inode) dput(dentry); /* d_alloc_name() in devpts_pty_new() */ dput(dentry); /* d_find_alias above */ - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); } static int __init init_devpts_fs(void) diff --git a/fs/direct-io.c b/fs/direct-io.c index 01171d8a6..d6a9012d4 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1157,12 +1157,12 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, iocb->ki_filp->f_mapping; /* will be released by direct_io_worker */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); retval = filemap_write_and_wait_range(mapping, offset, end - 1); if (retval) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); kmem_cache_free(dio_cache, dio); goto out; } @@ -1173,7 +1173,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, dio->i_size = i_size_read(inode); if (iov_iter_rw(iter) == READ && offset >= dio->i_size) { if (dio->flags & DIO_LOCKING) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); kmem_cache_free(dio_cache, dio); retval = 0; goto out; @@ -1295,7 +1295,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, * of protecting us from looking up uninitialized blocks. */ if (iov_iter_rw(iter) == READ && (dio->flags & DIO_LOCKING)) - mutex_unlock(&dio->inode->i_mutex); + inode_unlock(dio->inode); /* * The only time we want to leave bios in flight is when a successful diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 173b3873a..58c2f4a21 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -515,14 +515,9 @@ static ssize_t device_write(struct file *file, const char __user *buf, if (count > sizeof(struct dlm_write_request) + DLM_RESNAME_MAXLEN) return -EINVAL; - kbuf = kzalloc(count + 1, GFP_NOFS); - if (!kbuf) - return -ENOMEM; - - if (copy_from_user(kbuf, buf, count)) { - error = -EFAULT; - goto out_free; - } + kbuf = memdup_user_nul(buf, count); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); if (check_version(kbuf)) { error = -EBADE; diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 4f591f190..d72d52b90 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -8,7 +8,6 @@ #include <linux/writeback.h> #include <linux/sysctl.h> #include <linux/gfp.h> -#include <linux/export.h> #include "internal.h" /* A global variable is a bit ugly, but it keeps the code simple */ @@ -40,12 +39,6 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) iput(toput_inode); } -/* For TuxOnIce */ -void drop_pagecache(void) -{ - iterate_supers(drop_pagecache_sb, NULL); -} - int drop_caches_sysctl_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index e2e47ba5d..4e685ac10 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -41,13 +41,13 @@ static struct dentry *lock_parent(struct dentry *dentry) struct dentry *dir; dir = dget_parent(dentry); - mutex_lock_nested(&(d_inode(dir)->i_mutex), I_MUTEX_PARENT); + inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); return dir; } static void unlock_dir(struct dentry *dir) { - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(dir); } @@ -282,9 +282,7 @@ ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry, if (rc) { ecryptfs_do_unlink(directory_inode, ecryptfs_dentry, ecryptfs_inode); - make_bad_inode(ecryptfs_inode); - unlock_new_inode(ecryptfs_inode); - iput(ecryptfs_inode); + iget_failed(ecryptfs_inode); goto out; } unlock_new_inode(ecryptfs_inode); @@ -399,11 +397,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, int rc = 0; lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); - mutex_lock(&d_inode(lower_dir_dentry)->i_mutex); + inode_lock(d_inode(lower_dir_dentry)); lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name, lower_dir_dentry, ecryptfs_dentry->d_name.len); - mutex_unlock(&d_inode(lower_dir_dentry)->i_mutex); + inode_unlock(d_inode(lower_dir_dentry)); if (IS_ERR(lower_dentry)) { rc = PTR_ERR(lower_dentry); ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " @@ -428,11 +426,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, "filename; rc = [%d]\n", __func__, rc); goto out; } - mutex_lock(&d_inode(lower_dir_dentry)->i_mutex); + inode_lock(d_inode(lower_dir_dentry)); lower_dentry = lookup_one_len(encrypted_and_encoded_name, lower_dir_dentry, encrypted_and_encoded_name_size); - mutex_unlock(&d_inode(lower_dir_dentry)->i_mutex); + inode_unlock(d_inode(lower_dir_dentry)); if (IS_ERR(lower_dentry)) { rc = PTR_ERR(lower_dentry); ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " @@ -674,16 +672,24 @@ out: return rc ? ERR_PTR(rc) : buf; } -static const char *ecryptfs_follow_link(struct dentry *dentry, void **cookie) +static const char *ecryptfs_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { size_t len; - char *buf = ecryptfs_readlink_lower(dentry, &len); + char *buf; + + if (!dentry) + return ERR_PTR(-ECHILD); + + buf = ecryptfs_readlink_lower(dentry, &len); if (IS_ERR(buf)) return buf; fsstack_copy_attr_atime(d_inode(dentry), d_inode(ecryptfs_dentry_to_lower(dentry))); buf[len] = '\0'; - return *cookie = buf; + set_delayed_call(done, kfree_link, buf); + return buf; } /** @@ -863,9 +869,9 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) if (!rc && lower_ia.ia_valid & ATTR_SIZE) { struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); - mutex_lock(&d_inode(lower_dentry)->i_mutex); + inode_lock(d_inode(lower_dentry)); rc = notify_change(lower_dentry, &lower_ia, NULL); - mutex_unlock(&d_inode(lower_dentry)->i_mutex); + inode_unlock(d_inode(lower_dentry)); } return rc; } @@ -964,9 +970,9 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) if (lower_ia.ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) lower_ia.ia_valid &= ~ATTR_MODE; - mutex_lock(&d_inode(lower_dentry)->i_mutex); + inode_lock(d_inode(lower_dentry)); rc = notify_change(lower_dentry, &lower_ia, NULL); - mutex_unlock(&d_inode(lower_dentry)->i_mutex); + inode_unlock(d_inode(lower_dentry)); out: fsstack_copy_attr_all(inode, lower_inode); return rc; @@ -1042,10 +1048,10 @@ ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name, rc = -EOPNOTSUPP; goto out; } - mutex_lock(&d_inode(lower_dentry)->i_mutex); + inode_lock(d_inode(lower_dentry)); rc = d_inode(lower_dentry)->i_op->getxattr(lower_dentry, name, value, size); - mutex_unlock(&d_inode(lower_dentry)->i_mutex); + inode_unlock(d_inode(lower_dentry)); out: return rc; } @@ -1069,9 +1075,9 @@ ecryptfs_listxattr(struct dentry *dentry, char *list, size_t size) rc = -EOPNOTSUPP; goto out; } - mutex_lock(&d_inode(lower_dentry)->i_mutex); + inode_lock(d_inode(lower_dentry)); rc = d_inode(lower_dentry)->i_op->listxattr(lower_dentry, list, size); - mutex_unlock(&d_inode(lower_dentry)->i_mutex); + inode_unlock(d_inode(lower_dentry)); out: return rc; } @@ -1086,17 +1092,16 @@ static int ecryptfs_removexattr(struct dentry *dentry, const char *name) rc = -EOPNOTSUPP; goto out; } - mutex_lock(&d_inode(lower_dentry)->i_mutex); + inode_lock(d_inode(lower_dentry)); rc = d_inode(lower_dentry)->i_op->removexattr(lower_dentry, name); - mutex_unlock(&d_inode(lower_dentry)->i_mutex); + inode_unlock(d_inode(lower_dentry)); out: return rc; } const struct inode_operations ecryptfs_symlink_iops = { .readlink = generic_readlink, - .follow_link = ecryptfs_follow_link, - .put_link = kfree_put_link, + .get_link = ecryptfs_get_link, .permission = ecryptfs_permission, .setattr = ecryptfs_setattr, .getattr = ecryptfs_getattr_link, diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 4f4d0474b..e25b6b06b 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -663,6 +663,7 @@ static struct ecryptfs_cache_info { struct kmem_cache **cache; const char *name; size_t size; + unsigned long flags; void (*ctor)(void *obj); } ecryptfs_cache_infos[] = { { @@ -684,6 +685,7 @@ static struct ecryptfs_cache_info { .cache = &ecryptfs_inode_info_cache, .name = "ecryptfs_inode_cache", .size = sizeof(struct ecryptfs_inode_info), + .flags = SLAB_ACCOUNT, .ctor = inode_info_init_once, }, { @@ -755,8 +757,8 @@ static int ecryptfs_init_kmem_caches(void) struct ecryptfs_cache_info *info; info = &ecryptfs_cache_infos[i]; - *(info->cache) = kmem_cache_create(info->name, info->size, - 0, SLAB_HWCACHE_ALIGN, info->ctor); + *(info->cache) = kmem_cache_create(info->name, info->size, 0, + SLAB_HWCACHE_ALIGN | info->flags, info->ctor); if (!*(info->cache)) { ecryptfs_free_kmem_caches(); ecryptfs_printk(KERN_WARNING, "%s: " diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index caba848ac..c6ced4cbf 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -436,7 +436,7 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode) rc = -ENOMEM; goto out; } - mutex_lock(&lower_inode->i_mutex); + inode_lock(lower_inode); size = lower_inode->i_op->getxattr(lower_dentry, ECRYPTFS_XATTR_NAME, xattr_virt, PAGE_CACHE_SIZE); if (size < 0) @@ -444,7 +444,7 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode) put_unaligned_be64(i_size_read(ecryptfs_inode), xattr_virt); rc = lower_inode->i_op->setxattr(lower_dentry, ECRYPTFS_XATTR_NAME, xattr_virt, size, 0); - mutex_unlock(&lower_inode->i_mutex); + inode_unlock(lower_inode); if (rc) printk(KERN_ERR "Error whilst attempting to write inode size " "to lower file xattr; rc = [%d]\n", rc); diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c index 66842e55c..d48e0d261 100644 --- a/fs/efivarfs/file.c +++ b/fs/efivarfs/file.c @@ -51,9 +51,9 @@ static ssize_t efivarfs_file_write(struct file *file, d_delete(file->f_path.dentry); dput(file->f_path.dentry); } else { - mutex_lock(&inode->i_mutex); + inode_lock(inode); i_size_write(inode, datasize + sizeof(attributes)); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } bytes = count; @@ -148,9 +148,9 @@ efivarfs_ioc_setxflags(struct file *file, void __user *arg) if (error) return error; - mutex_lock(&inode->i_mutex); + inode_lock(inode); inode_set_flags(inode, i_flags, S_IMMUTABLE); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); mnt_drop_write_file(file); diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index abb244b06..dd029d13e 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -164,10 +164,10 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, efivar_entry_size(entry, &size); efivar_entry_add(entry, &efivarfs_list); - mutex_lock(&inode->i_mutex); + inode_lock(inode); inode->i_private = entry; i_size_write(inode, size + sizeof(entry->var.Attributes)); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); d_add(dentry, inode); return 0; diff --git a/fs/efs/inode.c b/fs/efs/inode.c index 079d20306..cdf087238 100644 --- a/fs/efs/inode.c +++ b/fs/efs/inode.c @@ -151,6 +151,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) break; case S_IFLNK: inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); inode->i_data.a_ops = &efs_symlink_aops; break; case S_IFCHR: diff --git a/fs/efs/super.c b/fs/efs/super.c index c8411a30f..cb68dac4f 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -94,9 +94,9 @@ static void init_once(void *foo) static int __init init_inodecache(void) { efs_inode_cachep = kmem_cache_create("efs_inode_cache", - sizeof(struct efs_inode_info), - 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, - init_once); + sizeof(struct efs_inode_info), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| + SLAB_ACCOUNT, init_once); if (efs_inode_cachep == NULL) return -ENOMEM; return 0; diff --git a/fs/efs/symlink.c b/fs/efs/symlink.c index 75117d0da..4870cc82d 100644 --- a/fs/efs/symlink.c +++ b/fs/efs/symlink.c @@ -13,7 +13,7 @@ static int efs_symlink_readpage(struct file *file, struct page *page) { - char *link = kmap(page); + char *link = page_address(page); struct buffer_head * bh; struct inode * inode = page->mapping->host; efs_block_t size = inode->i_size; @@ -39,12 +39,10 @@ static int efs_symlink_readpage(struct file *file, struct page *page) } link[size] = '\0'; SetPageUptodate(page); - kunmap(page); unlock_page(page); return 0; fail: SetPageError(page); - kunmap(page); unlock_page(page); return err; } diff --git a/fs/eventfd.c b/fs/eventfd.c index 8d0c0df01..ed70cf9fd 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -45,10 +45,10 @@ struct eventfd_ctx { * * This function is supposed to be called by the kernel in paths that do not * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX - * value, and we signal this as overflow condition by returining a POLLERR + * value, and we signal this as overflow condition by returning a POLLERR * to poll(2). * - * Returns the amount by which the counter was incrememnted. This will be less + * Returns the amount by which the counter was incremented. This will be less * than @n if the counter has overflowed. */ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 1e009cad8..cde60741c 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -92,7 +92,12 @@ */ /* Epoll private bits inside the event mask */ -#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET) +#define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE) + +#define EPOLLINOUT_BITS (POLLIN | POLLOUT) + +#define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | POLLERR | POLLHUP | \ + EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE) /* Maximum number of nesting allowed inside epoll sets */ #define EP_MAX_NESTS 4 @@ -1002,6 +1007,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k unsigned long flags; struct epitem *epi = ep_item_from_wait(wait); struct eventpoll *ep = epi->ep; + int ewake = 0; if ((unsigned long)key & POLLFREE) { ep_pwq_from_wait(wait)->whead = NULL; @@ -1066,8 +1072,25 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k * Wake up ( if active ) both the eventpoll wait list and the ->poll() * wait list. */ - if (waitqueue_active(&ep->wq)) + if (waitqueue_active(&ep->wq)) { + if ((epi->event.events & EPOLLEXCLUSIVE) && + !((unsigned long)key & POLLFREE)) { + switch ((unsigned long)key & EPOLLINOUT_BITS) { + case POLLIN: + if (epi->event.events & POLLIN) + ewake = 1; + break; + case POLLOUT: + if (epi->event.events & POLLOUT) + ewake = 1; + break; + case 0: + ewake = 1; + break; + } + } wake_up_locked(&ep->wq); + } if (waitqueue_active(&ep->poll_wait)) pwake++; @@ -1078,6 +1101,9 @@ out_unlock: if (pwake) ep_poll_safewake(&ep->poll_wait); + if (epi->event.events & EPOLLEXCLUSIVE) + return ewake; + return 1; } @@ -1095,7 +1121,10 @@ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, init_waitqueue_func_entry(&pwq->wait, ep_poll_callback); pwq->whead = whead; pwq->base = epi; - add_wait_queue(whead, &pwq->wait); + if (epi->event.events & EPOLLEXCLUSIVE) + add_wait_queue_exclusive(whead, &pwq->wait); + else + add_wait_queue(whead, &pwq->wait); list_add_tail(&pwq->llink, &epi->pwqlist); epi->nwait++; } else { @@ -1862,6 +1891,19 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, goto error_tgt_fput; /* + * epoll adds to the wakeup queue at EPOLL_CTL_ADD time only, + * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation. + * Also, we do not currently supported nested exclusive wakeups. + */ + if (epds.events & EPOLLEXCLUSIVE) { + if (op == EPOLL_CTL_MOD) + goto error_tgt_fput; + if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) || + (epds.events & ~EPOLLEXCLUSIVE_OK_BITS))) + goto error_tgt_fput; + } + + /* * At this point it is safe to assume that the "private_data" contains * our own data structure. */ @@ -1932,8 +1974,10 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, break; case EPOLL_CTL_MOD: if (epi) { - epds.events |= POLLERR | POLLHUP; - error = ep_modify(ep, epi, &epds); + if (!(epi->event.events & EPOLLEXCLUSIVE)) { + epds.events |= POLLERR | POLLHUP; + error = ep_modify(ep, epi, &epds); + } } else error = -ENOENT; break; @@ -106,7 +106,6 @@ bool path_noexec(const struct path *path) return (path->mnt->mnt_flags & MNT_NOEXEC) || (path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC); } -EXPORT_SYMBOL_GPL(path_noexec); #ifdef CONFIG_USELIB /* @@ -123,7 +122,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) int error = PTR_ERR(tmp); static const struct open_flags uselib_flags = { .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, - .acc_mode = MAY_READ | MAY_EXEC | MAY_OPEN, + .acc_mode = MAY_READ | MAY_EXEC, .intent = LOOKUP_OPEN, .lookup_flags = LOOKUP_FOLLOW, }; @@ -767,7 +766,7 @@ static struct file *do_open_execat(int fd, struct filename *name, int flags) int err; struct open_flags open_exec_flags = { .open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC, - .acc_mode = MAY_EXEC | MAY_OPEN, + .acc_mode = MAY_EXEC, .intent = LOOKUP_OPEN, .lookup_flags = LOOKUP_FOLLOW, }; @@ -1314,13 +1313,13 @@ static void bprm_fill_uid(struct linux_binprm *bprm) return; /* Be careful if suid/sgid is set */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* reload atomically mode/uid/gid now that lock held */ mode = inode->i_mode; uid = inode->i_uid; gid = inode->i_gid; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* We ignore suid/sgid if there are no mappings for them in the ns */ if (!kuid_has_mapping(bprm->cred->user_ns, uid) || diff --git a/fs/exofs/file.c b/fs/exofs/file.c index 906de66e8..28645f064 100644 --- a/fs/exofs/file.c +++ b/fs/exofs/file.c @@ -52,9 +52,9 @@ static int exofs_file_fsync(struct file *filp, loff_t start, loff_t end, if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = sync_inode_metadata(filp->f_mapping->host, 1); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 60f03b789..9eaf595ae 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -1224,6 +1224,7 @@ struct inode *exofs_iget(struct super_block *sb, unsigned long ino) inode->i_link = (char *)oi->i_data; } else { inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &exofs_aops; } } else { diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c index 994e078da..c20d77df2 100644 --- a/fs/exofs/namei.c +++ b/fs/exofs/namei.c @@ -111,6 +111,7 @@ static int exofs_symlink(struct inode *dir, struct dentry *dentry, if (l > sizeof(oi->i_data)) { /* slow symlink */ inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &exofs_aops; memset(oi->i_data, 0, sizeof(oi->i_data)); diff --git a/fs/exofs/super.c b/fs/exofs/super.c index b795c567b..6658a5053 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -194,8 +194,8 @@ static int init_inodecache(void) { exofs_inode_cachep = kmem_cache_create("exofs_inode_cache", sizeof(struct exofs_i_info), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, - exofs_init_once); + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | + SLAB_ACCOUNT, exofs_init_once); if (exofs_inode_cachep == NULL) return -ENOMEM; return 0; diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 714cd37a6..c46f1a190 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -124,10 +124,10 @@ static struct dentry *reconnect_one(struct vfsmount *mnt, int err; parent = ERR_PTR(-EACCES); - mutex_lock(&dentry->d_inode->i_mutex); + inode_lock(dentry->d_inode); if (mnt->mnt_sb->s_export_op->get_parent) parent = mnt->mnt_sb->s_export_op->get_parent(dentry); - mutex_unlock(&dentry->d_inode->i_mutex); + inode_unlock(dentry->d_inode); if (IS_ERR(parent)) { dprintk("%s: get_parent of %ld failed, err %d\n", @@ -143,9 +143,9 @@ static struct dentry *reconnect_one(struct vfsmount *mnt, if (err) goto out_err; dprintk("%s: found name: %s\n", __func__, nbuf); - mutex_lock(&parent->d_inode->i_mutex); + inode_lock(parent->d_inode); tmp = lookup_one_len(nbuf, parent, strlen(nbuf)); - mutex_unlock(&parent->d_inode->i_mutex); + inode_unlock(parent->d_inode); if (IS_ERR(tmp)) { dprintk("%s: lookup failed: %d\n", __func__, PTR_ERR(tmp)); goto out_err; @@ -503,10 +503,10 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, */ err = exportfs_get_name(mnt, target_dir, nbuf, result); if (!err) { - mutex_lock(&target_dir->d_inode->i_mutex); + inode_lock(target_dir->d_inode); nresult = lookup_one_len(nbuf, target_dir, strlen(nbuf)); - mutex_unlock(&target_dir->d_inode->i_mutex); + inode_unlock(target_dir->d_inode); if (!IS_ERR(nresult)) { if (nresult->d_inode) { dput(result); diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 11a42c5a0..c1400b109 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -80,30 +80,13 @@ static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, return ret; } -static int ext2_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) -{ - struct inode *inode = file_inode(vma->vm_file); - struct ext2_inode_info *ei = EXT2_I(inode); - int ret; - - sb_start_pagefault(inode->i_sb); - file_update_time(vma->vm_file); - down_read(&ei->dax_sem); - - ret = __dax_mkwrite(vma, vmf, ext2_get_block, NULL); - - up_read(&ei->dax_sem); - sb_end_pagefault(inode->i_sb); - return ret; -} - static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct inode *inode = file_inode(vma->vm_file); struct ext2_inode_info *ei = EXT2_I(inode); - int ret = VM_FAULT_NOPAGE; loff_t size; + int ret; sb_start_pagefault(inode->i_sb); file_update_time(vma->vm_file); @@ -113,6 +96,8 @@ static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma, size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (vmf->pgoff >= size) ret = VM_FAULT_SIGBUS; + else + ret = dax_pfn_mkwrite(vma, vmf); up_read(&ei->dax_sem); sb_end_pagefault(inode->i_sb); @@ -122,7 +107,7 @@ static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma, static const struct vm_operations_struct ext2_dax_vm_ops = { .fault = ext2_dax_fault, .pmd_fault = ext2_dax_pmd_fault, - .page_mkwrite = ext2_dax_mkwrite, + .page_mkwrite = ext2_dax_fault, .pfn_mkwrite = ext2_dax_pfn_mkwrite, }; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 0aa9bf6e6..6bd58e6ff 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -737,8 +737,10 @@ static int ext2_get_blocks(struct inode *inode, * so that it's not found by another thread before it's * initialised */ - err = dax_clear_blocks(inode, le32_to_cpu(chain[depth-1].key), - 1 << inode->i_blkbits); + err = dax_clear_sectors(inode->i_sb->s_bdev, + le32_to_cpu(chain[depth-1].key) << + (inode->i_blkbits - 9), + 1 << inode->i_blkbits); if (err) { mutex_unlock(&ei->truncate_mutex); goto cleanup; @@ -874,6 +876,14 @@ ext2_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset) static int ext2_writepages(struct address_space *mapping, struct writeback_control *wbc) { +#ifdef CONFIG_FS_DAX + if (dax_mapping(mapping)) { + return dax_writeback_mapping_range(mapping, + mapping->host->i_sb->s_bdev, + wbc); + } +#endif + return mpage_writepages(mapping, wbc, ext2_get_block); } @@ -1296,7 +1306,7 @@ void ext2_set_inode_flags(struct inode *inode) inode->i_flags |= S_NOATIME; if (flags & EXT2_DIRSYNC_FL) inode->i_flags |= S_DIRSYNC; - if (test_opt(inode->i_sb, DAX)) + if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode)) inode->i_flags |= S_DAX; } @@ -1420,6 +1430,7 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) sizeof(ei->i_data) - 1); } else { inode->i_op = &ext2_symlink_inode_operations; + inode_nohighmem(inode); if (test_opt(inode->i_sb, NOBH)) inode->i_mapping->a_ops = &ext2_nobh_aops; else diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index 5d46c0986..b386af2e4 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -51,10 +51,10 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) flags = ext2_mask_flags(inode->i_mode, flags); - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* Is it quota file? Do not allow user to mess with it */ if (IS_NOQUOTA(inode)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); ret = -EPERM; goto setflags_out; } @@ -68,7 +68,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) */ if ((flags ^ oldflags) & (EXT2_APPEND_FL | EXT2_IMMUTABLE_FL)) { if (!capable(CAP_LINUX_IMMUTABLE)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); ret = -EPERM; goto setflags_out; } @@ -80,7 +80,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) ext2_set_inode_flags(inode); inode->i_ctime = CURRENT_TIME_SEC; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); mark_inode_dirty(inode); setflags_out: @@ -102,10 +102,10 @@ setflags_out: goto setversion_out; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); inode->i_ctime = CURRENT_TIME_SEC; inode->i_generation = generation; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); mark_inode_dirty(inode); setversion_out: diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 3267a80db..7a2be8f7f 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -183,6 +183,7 @@ static int ext2_symlink (struct inode * dir, struct dentry * dentry, if (l > sizeof (EXT2_I(inode)->i_data)) { /* slow symlink */ inode->i_op = &ext2_symlink_inode_operations; + inode_nohighmem(inode); if (test_opt(inode->i_sb, NOBH)) inode->i_mapping->a_ops = &ext2_nobh_aops; else diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 748d35afc..2a188413a 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -203,7 +203,7 @@ static int __init init_inodecache(void) ext2_inode_cachep = kmem_cache_create("ext2_inode_cache", sizeof(struct ext2_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (ext2_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c index ae17179f3..3495d8ae4 100644 --- a/fs/ext2/symlink.c +++ b/fs/ext2/symlink.c @@ -22,8 +22,7 @@ const struct inode_operations ext2_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, + .get_link = page_get_link, .setattr = ext2_setattr, #ifdef CONFIG_EXT2_FS_XATTR .setxattr = generic_setxattr, @@ -35,7 +34,7 @@ const struct inode_operations ext2_symlink_inode_operations = { const struct inode_operations ext2_fast_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = simple_follow_link, + .get_link = simple_get_link, .setattr = ext2_setattr, #ifdef CONFIG_EXT2_FS_XATTR .setxattr = generic_setxattr, diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index fa70848af..f57a7aba3 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -77,10 +77,8 @@ printk("\n"); \ } while (0) # define ea_bdebug(bh, f...) do { \ - char b[BDEVNAME_SIZE]; \ - printk(KERN_DEBUG "block %s:%lu: ", \ - bdevname(bh->b_bdev, b), \ - (unsigned long) bh->b_blocknr); \ + printk(KERN_DEBUG "block %pg:%lu: ", \ + bh->b_bdev, (unsigned long) bh->b_blocknr); \ printk(f); \ printk("\n"); \ } while (0) @@ -292,16 +290,21 @@ bad_block: ext2_error(inode->i_sb, "ext2_xattr_list", const struct xattr_handler *handler = ext2_xattr_handler(entry->e_name_index); - if (handler) { - size_t size = handler->list(handler, dentry, buffer, - rest, entry->e_name, - entry->e_name_len); + if (handler && (!handler->list || handler->list(dentry))) { + const char *prefix = handler->prefix ?: handler->name; + size_t prefix_len = strlen(prefix); + size_t size = prefix_len + entry->e_name_len + 1; + if (buffer) { if (size > rest) { error = -ERANGE; goto cleanup; } - buffer += size; + memcpy(buffer, prefix, prefix_len); + buffer += prefix_len; + memcpy(buffer, entry->e_name, entry->e_name_len); + buffer += entry->e_name_len; + *buffer++ = 0; } rest -= size; } diff --git a/fs/ext2/xattr_security.c b/fs/ext2/xattr_security.c index dfb087503..ba97f243b 100644 --- a/fs/ext2/xattr_security.c +++ b/fs/ext2/xattr_security.c @@ -7,29 +7,11 @@ #include <linux/security.h> #include "xattr.h" -static size_t -ext2_xattr_security_list(const struct xattr_handler *handler, - struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len) -{ - const int prefix_len = XATTR_SECURITY_PREFIX_LEN; - const size_t total_len = prefix_len + name_len + 1; - - if (list && total_len <= list_size) { - memcpy(list, XATTR_SECURITY_PREFIX, prefix_len); - memcpy(list+prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - return total_len; -} - static int ext2_xattr_security_get(const struct xattr_handler *handler, struct dentry *dentry, const char *name, void *buffer, size_t size) { - if (strcmp(name, "") == 0) - return -EINVAL; return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_SECURITY, name, buffer, size); } @@ -39,8 +21,6 @@ ext2_xattr_security_set(const struct xattr_handler *handler, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - if (strcmp(name, "") == 0) - return -EINVAL; return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_SECURITY, name, value, size, flags); } @@ -71,7 +51,6 @@ ext2_init_security(struct inode *inode, struct inode *dir, const struct xattr_handler ext2_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, - .list = ext2_xattr_security_list, .get = ext2_xattr_security_get, .set = ext2_xattr_security_set, }; diff --git a/fs/ext2/xattr_trusted.c b/fs/ext2/xattr_trusted.c index 3150dd3a7..2c94d1930 100644 --- a/fs/ext2/xattr_trusted.c +++ b/fs/ext2/xattr_trusted.c @@ -8,23 +8,10 @@ #include "ext2.h" #include "xattr.h" -static size_t -ext2_xattr_trusted_list(const struct xattr_handler *handler, - struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len) +static bool +ext2_xattr_trusted_list(struct dentry *dentry) { - const int prefix_len = XATTR_TRUSTED_PREFIX_LEN; - const size_t total_len = prefix_len + name_len + 1; - - if (!capable(CAP_SYS_ADMIN)) - return 0; - - if (list && total_len <= list_size) { - memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); - memcpy(list+prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - return total_len; + return capable(CAP_SYS_ADMIN); } static int @@ -32,8 +19,6 @@ ext2_xattr_trusted_get(const struct xattr_handler *handler, struct dentry *dentry, const char *name, void *buffer, size_t size) { - if (strcmp(name, "") == 0) - return -EINVAL; return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_TRUSTED, name, buffer, size); } @@ -43,8 +28,6 @@ ext2_xattr_trusted_set(const struct xattr_handler *handler, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - if (strcmp(name, "") == 0) - return -EINVAL; return ext2_xattr_set(d_inode(dentry), EXT2_XATTR_INDEX_TRUSTED, name, value, size, flags); } diff --git a/fs/ext2/xattr_user.c b/fs/ext2/xattr_user.c index 339a49bbb..72a2a96d6 100644 --- a/fs/ext2/xattr_user.c +++ b/fs/ext2/xattr_user.c @@ -10,23 +10,10 @@ #include "ext2.h" #include "xattr.h" -static size_t -ext2_xattr_user_list(const struct xattr_handler *handler, - struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len) +static bool +ext2_xattr_user_list(struct dentry *dentry) { - const size_t prefix_len = XATTR_USER_PREFIX_LEN; - const size_t total_len = prefix_len + name_len + 1; - - if (!test_opt(dentry->d_sb, XATTR_USER)) - return 0; - - if (list && total_len <= list_size) { - memcpy(list, XATTR_USER_PREFIX, prefix_len); - memcpy(list+prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - return total_len; + return test_opt(dentry->d_sb, XATTR_USER); } static int @@ -34,8 +21,6 @@ ext2_xattr_user_get(const struct xattr_handler *handler, struct dentry *dentry, const char *name, void *buffer, size_t size) { - if (strcmp(name, "") == 0) - return -EINVAL; if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; return ext2_xattr_get(d_inode(dentry), EXT2_XATTR_INDEX_USER, @@ -47,8 +32,6 @@ ext2_xattr_user_set(const struct xattr_handler *handler, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - if (strcmp(name, "") == 0) - return -EINVAL; if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index 1a0835073..38f756248 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -384,14 +384,12 @@ int ext4_decrypt(struct page *page) EXT4_DECRYPT, page->index, page, page); } -int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex) +int ext4_encrypted_zeroout(struct inode *inode, ext4_lblk_t lblk, + ext4_fsblk_t pblk, ext4_lblk_t len) { struct ext4_crypto_ctx *ctx; struct page *ciphertext_page = NULL; struct bio *bio; - ext4_lblk_t lblk = le32_to_cpu(ex->ee_block); - ext4_fsblk_t pblk = ext4_ext_pblock(ex); - unsigned int len = ext4_ext_get_actual_len(ex); int ret, err = 0; #if 0 @@ -469,3 +467,59 @@ uint32_t ext4_validate_encryption_key_size(uint32_t mode, uint32_t size) return size; return 0; } + +/* + * Validate dentries for encrypted directories to make sure we aren't + * potentially caching stale data after a key has been added or + * removed. + */ +static int ext4_d_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct inode *dir = d_inode(dentry->d_parent); + struct ext4_crypt_info *ci = EXT4_I(dir)->i_crypt_info; + int dir_has_key, cached_with_key; + + if (!ext4_encrypted_inode(dir)) + return 0; + + if (ci && ci->ci_keyring_key && + (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | + (1 << KEY_FLAG_REVOKED) | + (1 << KEY_FLAG_DEAD)))) + ci = NULL; + + /* this should eventually be an flag in d_flags */ + cached_with_key = dentry->d_fsdata != NULL; + dir_has_key = (ci != NULL); + + /* + * If the dentry was cached without the key, and it is a + * negative dentry, it might be a valid name. We can't check + * if the key has since been made available due to locking + * reasons, so we fail the validation so ext4_lookup() can do + * this check. + * + * We also fail the validation if the dentry was created with + * the key present, but we no longer have the key, or vice versa. + */ + if ((!cached_with_key && d_is_negative(dentry)) || + (!cached_with_key && dir_has_key) || + (cached_with_key && !dir_has_key)) { +#if 0 /* Revalidation debug */ + char buf[80]; + char *cp = simple_dname(dentry, buf, sizeof(buf)); + + if (IS_ERR(cp)) + cp = (char *) "???"; + pr_err("revalidate: %s %p %d %d %d\n", cp, dentry->d_fsdata, + cached_with_key, d_is_negative(dentry), + dir_has_key); +#endif + return 0; + } + return 1; +} + +const struct dentry_operations ext4_encrypted_d_ops = { + .d_revalidate = ext4_d_revalidate, +}; diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 1d1bca74f..33f5e2a50 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -111,6 +111,12 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) int dir_has_error = 0; struct ext4_str fname_crypto_str = {.name = NULL, .len = 0}; + if (ext4_encrypted_inode(inode)) { + err = ext4_get_encryption_info(inode); + if (err && err != -ENOKEY) + return err; + } + if (is_dx_dir(inode)) { err = ext4_dx_readdir(file, ctx); if (err != ERR_BAD_DX_DIR) { @@ -157,8 +163,11 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) index, 1); file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; bh = ext4_bread(NULL, inode, map.m_lblk, 0); - if (IS_ERR(bh)) - return PTR_ERR(bh); + if (IS_ERR(bh)) { + err = PTR_ERR(bh); + bh = NULL; + goto errout; + } } if (!bh) { diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index cc7ca4e87..157b458a6 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -378,14 +378,22 @@ struct flex_groups { #define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ -#define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */ -#define EXT4_FL_USER_MODIFIABLE 0x004380FF /* User modifiable flags */ +#define EXT4_FL_USER_VISIBLE 0x304BDFFF /* User visible flags */ +#define EXT4_FL_USER_MODIFIABLE 0x204380FF /* User modifiable flags */ + +#define EXT4_FL_XFLAG_VISIBLE (EXT4_SYNC_FL | \ + EXT4_IMMUTABLE_FL | \ + EXT4_APPEND_FL | \ + EXT4_NODUMP_FL | \ + EXT4_NOATIME_FL | \ + EXT4_PROJINHERIT_FL) /* Flags that should be inherited by new inodes from their parent. */ #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ - EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL) + EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\ + EXT4_PROJINHERIT_FL) /* Flags that are appropriate for regular files (all but dir-specific ones). */ #define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL)) @@ -555,10 +563,12 @@ enum { #define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 /* Request will not result in inode size update (user for fallocate) */ #define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080 - /* Do not take i_data_sem locking in ext4_map_blocks */ -#define EXT4_GET_BLOCKS_NO_LOCK 0x0100 /* Convert written extents to unwritten */ -#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0200 +#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0100 + /* Write zeros to newly created written extents */ +#define EXT4_GET_BLOCKS_ZERO 0x0200 +#define EXT4_GET_BLOCKS_CREATE_ZERO (EXT4_GET_BLOCKS_CREATE |\ + EXT4_GET_BLOCKS_ZERO) /* * The bit position of these flags must not overlap with any of the @@ -616,6 +626,46 @@ enum { #define EXT4_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16]) #define EXT4_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct ext4_encryption_policy) +#ifndef FS_IOC_FSGETXATTR +/* Until the uapi changes get merged for project quota... */ + +#define FS_IOC_FSGETXATTR _IOR('X', 31, struct fsxattr) +#define FS_IOC_FSSETXATTR _IOW('X', 32, struct fsxattr) + +/* + * Structure for FS_IOC_FSGETXATTR and FS_IOC_FSSETXATTR. + */ +struct fsxattr { + __u32 fsx_xflags; /* xflags field value (get/set) */ + __u32 fsx_extsize; /* extsize field value (get/set)*/ + __u32 fsx_nextents; /* nextents field value (get) */ + __u32 fsx_projid; /* project identifier (get/set) */ + unsigned char fsx_pad[12]; +}; + +/* + * Flags for the fsx_xflags field + */ +#define FS_XFLAG_REALTIME 0x00000001 /* data in realtime volume */ +#define FS_XFLAG_PREALLOC 0x00000002 /* preallocated file extents */ +#define FS_XFLAG_IMMUTABLE 0x00000008 /* file cannot be modified */ +#define FS_XFLAG_APPEND 0x00000010 /* all writes append */ +#define FS_XFLAG_SYNC 0x00000020 /* all writes synchronous */ +#define FS_XFLAG_NOATIME 0x00000040 /* do not update access time */ +#define FS_XFLAG_NODUMP 0x00000080 /* do not include in backups */ +#define FS_XFLAG_RTINHERIT 0x00000100 /* create with rt bit set */ +#define FS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */ +#define FS_XFLAG_NOSYMLINKS 0x00000400 /* disallow symlink creation */ +#define FS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */ +#define FS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */ +#define FS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */ +#define FS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */ +#define FS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ +#endif /* !defined(FS_IOC_FSGETXATTR) */ + +#define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR +#define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR + #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* * ioctl commands in 32 bit emulation @@ -910,6 +960,15 @@ struct ext4_inode_info { * by other means, so we have i_data_sem. */ struct rw_semaphore i_data_sem; + /* + * i_mmap_sem is for serializing page faults with truncate / punch hole + * operations. We have to make sure that new page cannot be faulted in + * a section of the inode that is being punched. We cannot easily use + * i_data_sem for this since we need protection for the whole punch + * operation and i_data_sem ranks below transaction start so we have + * to occasionally drop it. + */ + struct rw_semaphore i_mmap_sem; struct inode vfs_inode; struct jbd2_inode *jinode; @@ -993,6 +1052,7 @@ struct ext4_inode_info { /* Encryption params */ struct ext4_crypt_info *i_crypt_info; #endif + kprojid_t i_projid; }; /* @@ -1248,7 +1308,7 @@ struct ext4_super_block { #endif /* Number of quota types we support */ -#define EXT4_MAXQUOTAS 2 +#define EXT4_MAXQUOTAS 3 /* * fourth extended-fs super-block data in memory @@ -1754,7 +1814,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT) EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\ EXT4_FEATURE_RO_COMPAT_BIGALLOC |\ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\ - EXT4_FEATURE_RO_COMPAT_QUOTA) + EXT4_FEATURE_RO_COMPAT_QUOTA |\ + EXT4_FEATURE_RO_COMPAT_PROJECT) #define EXTN_FEATURE_FUNCS(ver) \ static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \ @@ -1796,6 +1857,11 @@ static inline bool ext4_has_incompat_features(struct super_block *sb) #define EXT4_DEF_RESUID 0 #define EXT4_DEF_RESGID 0 +/* + * Default project ID + */ +#define EXT4_DEF_PROJID 0 + #define EXT4_DEF_INODE_READAHEAD_BLKS 32 /* @@ -2234,7 +2300,9 @@ void ext4_restore_control_page(struct page *data_page); struct page *ext4_encrypt(struct inode *inode, struct page *plaintext_page); int ext4_decrypt(struct page *page); -int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex); +int ext4_encrypted_zeroout(struct inode *inode, ext4_lblk_t lblk, + ext4_fsblk_t pblk, ext4_lblk_t len); +extern const struct dentry_operations ext4_encrypted_d_ops; #ifdef CONFIG_EXT4_FS_ENCRYPTION int ext4_init_crypto(void); @@ -2440,8 +2508,8 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); int ext4_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); -int ext4_get_block_dax(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create); +int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, @@ -2484,9 +2552,13 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t lend); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); +extern int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); +extern int ext4_get_projid(struct inode *inode, kprojid_t *projid); extern void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim); +extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, + ext4_fsblk_t pblk, ext4_lblk_t len); /* indirect.c */ extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, @@ -2825,7 +2897,7 @@ do { \ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) { WARN_ON_ONCE(S_ISREG(inode->i_mode) && - !mutex_is_locked(&inode->i_mutex)); + !inode_is_locked(inode)); down_write(&EXT4_I(inode)->i_data_sem); if (newsize > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = newsize; @@ -2848,6 +2920,9 @@ static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize) return changed; } +int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, + loff_t len); + struct ext4_group_info { unsigned long bb_state; struct rb_root bb_free_root; @@ -2986,8 +3061,7 @@ extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, struct page *page); extern int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, - struct dentry *dentry, - struct inode *inode); + struct inode *dir, struct inode *inode); extern int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent, struct inode *inode); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 551353b1b..3753ceb0b 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3119,19 +3119,11 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) { ext4_fsblk_t ee_pblock; unsigned int ee_len; - int ret; ee_len = ext4_ext_get_actual_len(ex); ee_pblock = ext4_ext_pblock(ex); - - if (ext4_encrypted_inode(inode)) - return ext4_encrypted_zeroout(inode, ex); - - ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS); - if (ret > 0) - ret = 0; - - return ret; + return ext4_issue_zeroout(inode, le32_to_cpu(ex->ee_block), ee_pblock, + ee_len); } /* @@ -3936,7 +3928,7 @@ static int convert_initialized_extent(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, struct ext4_ext_path **ppath, int flags, - unsigned int allocated, ext4_fsblk_t newblock) + unsigned int allocated) { struct ext4_ext_path *path = *ppath; struct ext4_extent *ex; @@ -4052,6 +4044,14 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, } /* IO end_io complete, convert the filled extent to written */ if (flags & EXT4_GET_BLOCKS_CONVERT) { + if (flags & EXT4_GET_BLOCKS_ZERO) { + if (allocated > map->m_len) + allocated = map->m_len; + err = ext4_issue_zeroout(inode, map->m_lblk, newblock, + allocated); + if (err < 0) + goto out2; + } ret = ext4_convert_unwritten_extents_endio(handle, inode, map, ppath); if (ret >= 0) { @@ -4347,7 +4347,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { allocated = convert_initialized_extent( handle, inode, map, &path, - flags, allocated, newblock); + flags, allocated); goto out2; } else if (!ext4_ext_is_unwritten(ex)) goto out; @@ -4685,10 +4685,6 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, if (len <= EXT_UNWRITTEN_MAX_LEN) flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; - /* Wait all existing dio workers, newcomers will block on i_mutex */ - ext4_inode_block_unlocked_dio(inode); - inode_dio_wait(inode); - /* * credits to insert 1 extent into extent tree */ @@ -4752,8 +4748,6 @@ retry: goto retry; } - ext4_inode_resume_unlocked_dio(inode); - return ret > 0 ? ret2 : ret; } @@ -4770,7 +4764,6 @@ static long ext4_zero_range(struct file *file, loff_t offset, int partial_begin, partial_end; loff_t start, end; ext4_lblk_t lblk; - struct address_space *mapping = inode->i_mapping; unsigned int blkbits = inode->i_blkbits; trace_ext4_zero_range(inode, offset, len, mode); @@ -4786,17 +4779,6 @@ static long ext4_zero_range(struct file *file, loff_t offset, } /* - * Write out all dirty pages to avoid race conditions - * Then release them. - */ - if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { - ret = filemap_write_and_wait_range(mapping, offset, - offset + len - 1); - if (ret) - return ret; - } - - /* * Round up offset. This is not fallocate, we neet to zero out * blocks, so convert interior block aligned part of the range to * unwritten and possibly manually zero out unaligned parts of the @@ -4817,7 +4799,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, else max_blocks -= lblk; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * Indirect files do not support unwritten extnets @@ -4839,6 +4821,10 @@ static long ext4_zero_range(struct file *file, loff_t offset, if (mode & FALLOC_FL_KEEP_SIZE) flags |= EXT4_GET_BLOCKS_KEEP_SIZE; + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + /* Preallocate the range including the unaligned edges */ if (partial_begin || partial_end) { ret = ext4_alloc_file_blocks(file, @@ -4847,7 +4833,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, round_down(offset, 1 << blkbits)) >> blkbits, new_size, flags, mode); if (ret) - goto out_mutex; + goto out_dio; } @@ -4856,16 +4842,23 @@ static long ext4_zero_range(struct file *file, loff_t offset, flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | EXT4_EX_NOCACHE); - /* Now release the pages and zero block aligned part of pages*/ + /* + * Prevent page faults from reinstantiating pages we have + * released from page cache. + */ + down_write(&EXT4_I(inode)->i_mmap_sem); + ret = ext4_update_disksize_before_punch(inode, offset, len); + if (ret) { + up_write(&EXT4_I(inode)->i_mmap_sem); + goto out_dio; + } + /* Now release the pages and zero block aligned part of pages */ truncate_pagecache_range(inode, start, end - 1); inode->i_mtime = inode->i_ctime = ext4_current_time(inode); - /* Wait all existing dio workers, newcomers will block on i_mutex */ - ext4_inode_block_unlocked_dio(inode); - inode_dio_wait(inode); - ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags, mode); + up_write(&EXT4_I(inode)->i_mmap_sem); if (ret) goto out_dio; } @@ -4909,7 +4902,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, out_dio: ext4_inode_resume_unlocked_dio(inode); out_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -4980,7 +4973,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (mode & FALLOC_FL_KEEP_SIZE) flags |= EXT4_GET_BLOCKS_KEEP_SIZE; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * We only support preallocation for extent-based files only @@ -4998,8 +4991,13 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) goto out; } + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags, mode); + ext4_inode_resume_unlocked_dio(inode); if (ret) goto out; @@ -5008,7 +5006,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) EXT4_I(inode)->i_sync_tid); } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); return ret; } @@ -5494,21 +5492,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) return ret; } - /* - * Need to round down offset to be aligned with page size boundary - * for page size > block size. - */ - ioffset = round_down(offset, PAGE_SIZE); - - /* Write out all dirty pages */ - ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, - LLONG_MAX); - if (ret) - return ret; - - /* Take mutex lock */ - mutex_lock(&inode->i_mutex); - + inode_lock(inode); /* * There is no need to overlap collapse range with EOF, in which case * it is effectively a truncate operation @@ -5524,17 +5508,43 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) goto out_mutex; } - truncate_pagecache(inode, ioffset); - /* Wait for existing dio to complete */ ext4_inode_block_unlocked_dio(inode); inode_dio_wait(inode); + /* + * Prevent page faults from reinstantiating pages we have released from + * page cache. + */ + down_write(&EXT4_I(inode)->i_mmap_sem); + /* + * Need to round down offset to be aligned with page size boundary + * for page size > block size. + */ + ioffset = round_down(offset, PAGE_SIZE); + /* + * Write tail of the last page before removed range since it will get + * removed from the page cache below. + */ + ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset); + if (ret) + goto out_mmap; + /* + * Write data that will be shifted to preserve them when discarding + * page cache below. We are also protected from pages becoming dirty + * by i_mmap_sem. + */ + ret = filemap_write_and_wait_range(inode->i_mapping, offset + len, + LLONG_MAX); + if (ret) + goto out_mmap; + truncate_pagecache(inode, ioffset); + credits = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); - goto out_dio; + goto out_mmap; } down_write(&EXT4_I(inode)->i_data_sem); @@ -5573,10 +5583,11 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) out_stop: ext4_journal_stop(handle); -out_dio: +out_mmap: + up_write(&EXT4_I(inode)->i_mmap_sem); ext4_inode_resume_unlocked_dio(inode); out_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -5627,21 +5638,7 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) return ret; } - /* - * Need to round down to align start offset to page size boundary - * for page size > block size. - */ - ioffset = round_down(offset, PAGE_SIZE); - - /* Write out all dirty pages */ - ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, - LLONG_MAX); - if (ret) - return ret; - - /* Take mutex lock */ - mutex_lock(&inode->i_mutex); - + inode_lock(inode); /* Currently just for extent based files */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { ret = -EOPNOTSUPP; @@ -5660,17 +5657,32 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) goto out_mutex; } - truncate_pagecache(inode, ioffset); - /* Wait for existing dio to complete */ ext4_inode_block_unlocked_dio(inode); inode_dio_wait(inode); + /* + * Prevent page faults from reinstantiating pages we have released from + * page cache. + */ + down_write(&EXT4_I(inode)->i_mmap_sem); + /* + * Need to round down to align start offset to page size boundary + * for page size > block size. + */ + ioffset = round_down(offset, PAGE_SIZE); + /* Write out all dirty pages */ + ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, + LLONG_MAX); + if (ret) + goto out_mmap; + truncate_pagecache(inode, ioffset); + credits = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); - goto out_dio; + goto out_mmap; } /* Expand file to avoid data loss if there is error while shifting */ @@ -5741,10 +5753,11 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) out_stop: ext4_journal_stop(handle); -out_dio: +out_mmap: + up_write(&EXT4_I(inode)->i_mmap_sem); ext4_inode_resume_unlocked_dio(inode); out_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -5779,8 +5792,8 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem)); BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem)); - BUG_ON(!mutex_is_locked(&inode1->i_mutex)); - BUG_ON(!mutex_is_locked(&inode2->i_mutex)); + BUG_ON(!inode_is_locked(inode1)); + BUG_ON(!inode_is_locked(inode2)); *erp = ext4_es_remove_extent(inode1, lblk1, count); if (unlikely(*erp)) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 113837e7b..4cd318f31 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -113,7 +113,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ext4_unwritten_wait(inode); } - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret <= 0) goto out; @@ -169,7 +169,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) } ret = __generic_file_write_iter(iocb, from); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (ret > 0) { ssize_t err; @@ -186,50 +186,42 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) return ret; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (aio_mutex) mutex_unlock(aio_mutex); return ret; } #ifdef CONFIG_FS_DAX -static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate) -{ - struct inode *inode = bh->b_assoc_map->host; - /* XXX: breaks on 32-bit > 16TB. Is that even supported? */ - loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits; - int err; - if (!uptodate) - return; - WARN_ON(!buffer_unwritten(bh)); - err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size); -} - static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { int result; handle_t *handle = NULL; - struct super_block *sb = file_inode(vma->vm_file)->i_sb; + struct inode *inode = file_inode(vma->vm_file); + struct super_block *sb = inode->i_sb; bool write = vmf->flags & FAULT_FLAG_WRITE; if (write) { sb_start_pagefault(sb); file_update_time(vma->vm_file); + down_read(&EXT4_I(inode)->i_mmap_sem); handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, EXT4_DATA_TRANS_BLOCKS(sb)); - } + } else + down_read(&EXT4_I(inode)->i_mmap_sem); if (IS_ERR(handle)) result = VM_FAULT_SIGBUS; else - result = __dax_fault(vma, vmf, ext4_get_block_dax, - ext4_end_io_unwritten); + result = __dax_fault(vma, vmf, ext4_dax_mmap_get_block, NULL); if (write) { if (!IS_ERR(handle)) ext4_journal_stop(handle); + up_read(&EXT4_I(inode)->i_mmap_sem); sb_end_pagefault(sb); - } + } else + up_read(&EXT4_I(inode)->i_mmap_sem); return result; } @@ -246,44 +238,73 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, if (write) { sb_start_pagefault(sb); file_update_time(vma->vm_file); + down_read(&EXT4_I(inode)->i_mmap_sem); handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, ext4_chunk_trans_blocks(inode, PMD_SIZE / PAGE_SIZE)); - } + } else + down_read(&EXT4_I(inode)->i_mmap_sem); if (IS_ERR(handle)) result = VM_FAULT_SIGBUS; else result = __dax_pmd_fault(vma, addr, pmd, flags, - ext4_get_block_dax, ext4_end_io_unwritten); + ext4_dax_mmap_get_block, NULL); if (write) { if (!IS_ERR(handle)) ext4_journal_stop(handle); + up_read(&EXT4_I(inode)->i_mmap_sem); sb_end_pagefault(sb); - } + } else + up_read(&EXT4_I(inode)->i_mmap_sem); return result; } -static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +/* + * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_fault() + * handler we check for races agaist truncate. Note that since we cycle through + * i_mmap_sem, we are sure that also any hole punching that began before we + * were called is finished by now and so if it included part of the file we + * are working on, our pte will get unmapped and the check for pte_same() in + * wp_pfn_shared() fails. Thus fault gets retried and things work out as + * desired. + */ +static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma, + struct vm_fault *vmf) { - return dax_mkwrite(vma, vmf, ext4_get_block_dax, - ext4_end_io_unwritten); + struct inode *inode = file_inode(vma->vm_file); + struct super_block *sb = inode->i_sb; + loff_t size; + int ret; + + sb_start_pagefault(sb); + file_update_time(vma->vm_file); + down_read(&EXT4_I(inode)->i_mmap_sem); + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (vmf->pgoff >= size) + ret = VM_FAULT_SIGBUS; + else + ret = dax_pfn_mkwrite(vma, vmf); + up_read(&EXT4_I(inode)->i_mmap_sem); + sb_end_pagefault(sb); + + return ret; } static const struct vm_operations_struct ext4_dax_vm_ops = { .fault = ext4_dax_fault, .pmd_fault = ext4_dax_pmd_fault, - .page_mkwrite = ext4_dax_mkwrite, - .pfn_mkwrite = dax_pfn_mkwrite, + .page_mkwrite = ext4_dax_fault, + .pfn_mkwrite = ext4_dax_pfn_mkwrite, }; #else #define ext4_dax_vm_ops ext4_file_vm_ops #endif static const struct vm_operations_struct ext4_file_vm_ops = { - .fault = filemap_fault, + .fault = ext4_filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = ext4_page_mkwrite, }; @@ -314,6 +335,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp) struct super_block *sb = inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct vfsmount *mnt = filp->f_path.mnt; + struct inode *dir = filp->f_path.dentry->d_parent->d_inode; struct path path; char buf[64], *cp; int ret; @@ -357,6 +379,14 @@ static int ext4_file_open(struct inode * inode, struct file * filp) if (ext4_encryption_info(inode) == NULL) return -ENOKEY; } + if (ext4_encrypted_inode(dir) && + !ext4_is_child_context_consistent_with_parent(dir, inode)) { + ext4_warning(inode->i_sb, + "Inconsistent encryption contexts: %lu/%lu\n", + (unsigned long) dir->i_ino, + (unsigned long) inode->i_ino); + return -EPERM; + } /* * Set up the jbd2_inode if we are opening the inode for * writing and the journal is present @@ -527,11 +557,11 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) int blkbits; int ret = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); isize = i_size_read(inode); if (offset >= isize) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return -ENXIO; } @@ -579,7 +609,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) dataoff = (loff_t)last << blkbits; } while (last <= end); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (dataoff > isize) return -ENXIO; @@ -600,11 +630,11 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) int blkbits; int ret = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); isize = i_size_read(inode); if (offset >= isize) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return -ENXIO; } @@ -655,7 +685,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) break; } while (last <= end); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (holeoff > isize) holeoff = isize; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 53f2b98a6..acc0ad56b 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -801,6 +801,13 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, inode->i_gid = dir->i_gid; } else inode_init_owner(inode, dir, mode); + + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT) && + ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) + ei->i_projid = EXT4_I(dir)->i_projid; + else + ei->i_projid = make_kprojid(&init_user_ns, EXT4_DEF_PROJID); + err = dquot_initialize(inode); if (err) goto out; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index d884989cc..dfe3b9baf 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -995,12 +995,11 @@ void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh, */ static int ext4_add_dirent_to_inline(handle_t *handle, struct ext4_filename *fname, - struct dentry *dentry, + struct inode *dir, struct inode *inode, struct ext4_iloc *iloc, void *inline_start, int inline_size) { - struct inode *dir = d_inode(dentry->d_parent); int err; struct ext4_dir_entry_2 *de; @@ -1245,12 +1244,11 @@ out: * the new created block. */ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, - struct dentry *dentry, struct inode *inode) + struct inode *dir, struct inode *inode) { int ret, inline_size; void *inline_start; struct ext4_iloc iloc; - struct inode *dir = d_inode(dentry->d_parent); ret = ext4_get_inode_loc(dir, &iloc); if (ret) @@ -1264,7 +1262,7 @@ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, EXT4_INLINE_DOTDOT_SIZE; inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; - ret = ext4_add_dirent_to_inline(handle, fname, dentry, inode, &iloc, + ret = ext4_add_dirent_to_inline(handle, fname, dir, inode, &iloc, inline_start, inline_size); if (ret != -ENOSPC) goto out; @@ -1285,7 +1283,7 @@ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, if (inline_size) { inline_start = ext4_get_inline_xattr_pos(dir, &iloc); - ret = ext4_add_dirent_to_inline(handle, fname, dentry, + ret = ext4_add_dirent_to_inline(handle, fname, dir, inode, &iloc, inline_start, inline_size); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 06bda0361..aee960b1a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -383,6 +383,21 @@ static int __check_block_validity(struct inode *inode, const char *func, return 0; } +int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, + ext4_lblk_t len) +{ + int ret; + + if (ext4_encrypted_inode(inode)) + return ext4_encrypted_zeroout(inode, lblk, pblk, len); + + ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS); + if (ret > 0) + ret = 0; + + return ret; +} + #define check_block_validity(inode, map) \ __check_block_validity((inode), __func__, __LINE__, (map)) @@ -403,8 +418,7 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, * out taking i_data_sem. So at the time the unwritten extent * could be converted. */ - if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) - down_read(&EXT4_I(inode)->i_data_sem); + down_read(&EXT4_I(inode)->i_data_sem); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { retval = ext4_ext_map_blocks(handle, inode, map, flags & EXT4_GET_BLOCKS_KEEP_SIZE); @@ -412,8 +426,7 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, retval = ext4_ind_map_blocks(handle, inode, map, flags & EXT4_GET_BLOCKS_KEEP_SIZE); } - if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) - up_read((&EXT4_I(inode)->i_data_sem)); + up_read((&EXT4_I(inode)->i_data_sem)); /* * We don't check m_len because extent will be collpased in status @@ -509,8 +522,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, * Try to see if we can get the block without requesting a new * file system block. */ - if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) - down_read(&EXT4_I(inode)->i_data_sem); + down_read(&EXT4_I(inode)->i_data_sem); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { retval = ext4_ext_map_blocks(handle, inode, map, flags & EXT4_GET_BLOCKS_KEEP_SIZE); @@ -541,8 +553,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, if (ret < 0) retval = ret; } - if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) - up_read((&EXT4_I(inode)->i_data_sem)); + up_read((&EXT4_I(inode)->i_data_sem)); found: if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { @@ -626,13 +637,29 @@ found: } /* + * We have to zeroout blocks before inserting them into extent + * status tree. Otherwise someone could look them up there and + * use them before they are really zeroed. + */ + if (flags & EXT4_GET_BLOCKS_ZERO && + map->m_flags & EXT4_MAP_MAPPED && + map->m_flags & EXT4_MAP_NEW) { + ret = ext4_issue_zeroout(inode, map->m_lblk, + map->m_pblk, map->m_len); + if (ret) { + retval = ret; + goto out_sem; + } + } + + /* * If the extent has been zeroed out, we don't need to update * extent status tree. */ if ((flags & EXT4_GET_BLOCKS_PRE_IO) && ext4_es_lookup_extent(inode, map->m_lblk, &es)) { if (ext4_es_is_written(&es)) - goto has_zeroout; + goto out_sem; } status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; @@ -643,11 +670,13 @@ found: status |= EXTENT_STATUS_DELAYED; ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status); - if (ret < 0) + if (ret < 0) { retval = ret; + goto out_sem; + } } -has_zeroout: +out_sem: up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { ret = check_block_validity(inode, map); @@ -702,7 +731,7 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; - if (flags && !(flags & EXT4_GET_BLOCKS_NO_LOCK) && !handle) { + if (flags && !handle) { /* Direct IO write... */ if (map.m_len > DIO_MAX_BLOCKS) map.m_len = DIO_MAX_BLOCKS; @@ -722,16 +751,6 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, map_bh(bh, inode->i_sb, map.m_pblk); ext4_update_bh_state(bh, map.m_flags); - if (IS_DAX(inode) && buffer_unwritten(bh)) { - /* - * dgc: I suspect unwritten conversion on ext4+DAX is - * fundamentally broken here when there are concurrent - * read/write in progress on this inode. - */ - WARN_ON_ONCE(io_end); - bh->b_assoc_map = inode->i_mapping; - bh->b_private = (void *)(unsigned long)iblock; - } if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN) set_buffer_defer_completion(bh); bh->b_size = inode->i_sb->s_blocksize * map.m_len; @@ -907,9 +926,6 @@ int do_journal_get_write_access(handle_t *handle, return ret; } -static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create); - #ifdef CONFIG_EXT4_FS_ENCRYPTION static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, get_block_t *get_block) @@ -2462,6 +2478,10 @@ static int ext4_writepages(struct address_space *mapping, trace_ext4_writepages(inode, wbc); + if (dax_mapping(mapping)) + return dax_writeback_mapping_range(mapping, inode->i_sb->s_bdev, + wbc); + /* * No pages to write? This is mainly a kludge to avoid starting * a transaction for special inodes like journal inode on last iput() @@ -3082,25 +3102,96 @@ int ext4_get_block_write(struct inode *inode, sector_t iblock, EXT4_GET_BLOCKS_IO_CREATE_EXT); } -static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, +static int ext4_get_block_overwrite(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { - ext4_debug("ext4_get_block_write_nolock: inode %lu, create flag %d\n", + int ret; + + ext4_debug("ext4_get_block_overwrite: inode %lu, create flag %d\n", inode->i_ino, create); - return _ext4_get_block(inode, iblock, bh_result, - EXT4_GET_BLOCKS_NO_LOCK); + ret = _ext4_get_block(inode, iblock, bh_result, 0); + /* + * Blocks should have been preallocated! ext4_file_write_iter() checks + * that. + */ + WARN_ON_ONCE(!buffer_mapped(bh_result)); + + return ret; } -int ext4_get_block_dax(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +#ifdef CONFIG_FS_DAX +int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) { - int flags = EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_UNWRIT_EXT; - if (create) - flags |= EXT4_GET_BLOCKS_CREATE; - ext4_debug("ext4_get_block_dax: inode %lu, create flag %d\n", + int ret, err; + int credits; + struct ext4_map_blocks map; + handle_t *handle = NULL; + int flags = 0; + + ext4_debug("ext4_dax_mmap_get_block: inode %lu, create flag %d\n", inode->i_ino, create); - return _ext4_get_block(inode, iblock, bh_result, flags); + map.m_lblk = iblock; + map.m_len = bh_result->b_size >> inode->i_blkbits; + credits = ext4_chunk_trans_blocks(inode, map.m_len); + if (create) { + flags |= EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_CREATE_ZERO; + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + return ret; + } + } + + ret = ext4_map_blocks(handle, inode, &map, flags); + if (create) { + err = ext4_journal_stop(handle); + if (ret >= 0 && err < 0) + ret = err; + } + if (ret <= 0) + goto out; + if (map.m_flags & EXT4_MAP_UNWRITTEN) { + int err2; + + /* + * We are protected by i_mmap_sem so we know block cannot go + * away from under us even though we dropped i_data_sem. + * Convert extent to written and write zeros there. + * + * Note: We may get here even when create == 0. + */ + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + err = ext4_map_blocks(handle, inode, &map, + EXT4_GET_BLOCKS_CONVERT | EXT4_GET_BLOCKS_CREATE_ZERO); + if (err < 0) + ret = err; + err2 = ext4_journal_stop(handle); + if (err2 < 0 && ret > 0) + ret = err2; + } +out: + WARN_ON_ONCE(ret == 0 && create); + if (ret > 0) { + map_bh(bh_result, inode->i_sb, map.m_pblk); + bh_result->b_state = (bh_result->b_state & ~EXT4_MAP_FLAGS) | + map.m_flags; + /* + * At least for now we have to clear BH_New so that DAX code + * doesn't attempt to zero blocks again in a racy way. + */ + bh_result->b_state &= ~(1 << BH_New); + bh_result->b_size = map.m_len << inode->i_blkbits; + ret = 0; + } + return ret; } +#endif static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, ssize_t size, void *private) @@ -3171,10 +3262,8 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, /* If we do a overwrite dio, i_mutex locking can be released */ overwrite = *((int *)iocb->private); - if (overwrite) { - down_read(&EXT4_I(inode)->i_data_sem); - mutex_unlock(&inode->i_mutex); - } + if (overwrite) + inode_unlock(inode); /* * We could direct write to holes and fallocate. @@ -3196,29 +3285,29 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, * case, we allocate an io_end structure to hook to the iocb. */ iocb->private = NULL; - ext4_inode_aio_set(inode, NULL); - if (!is_sync_kiocb(iocb)) { - io_end = ext4_init_io_end(inode, GFP_NOFS); - if (!io_end) { - ret = -ENOMEM; - goto retake_lock; - } - /* - * Grab reference for DIO. Will be dropped in ext4_end_io_dio() - */ - iocb->private = ext4_get_io_end(io_end); - /* - * we save the io structure for current async direct - * IO, so that later ext4_map_blocks() could flag the - * io structure whether there is a unwritten extents - * needs to be converted when IO is completed. - */ - ext4_inode_aio_set(inode, io_end); - } - if (overwrite) { - get_block_func = ext4_get_block_write_nolock; + get_block_func = ext4_get_block_overwrite; } else { + ext4_inode_aio_set(inode, NULL); + if (!is_sync_kiocb(iocb)) { + io_end = ext4_init_io_end(inode, GFP_NOFS); + if (!io_end) { + ret = -ENOMEM; + goto retake_lock; + } + /* + * Grab reference for DIO. Will be dropped in + * ext4_end_io_dio() + */ + iocb->private = ext4_get_io_end(io_end); + /* + * we save the io structure for current async direct + * IO, so that later ext4_map_blocks() could flag the + * io structure whether there is a unwritten extents + * needs to be converted when IO is completed. + */ + ext4_inode_aio_set(inode, io_end); + } get_block_func = ext4_get_block_write; dio_flags = DIO_LOCKING; } @@ -3273,10 +3362,8 @@ retake_lock: if (iov_iter_rw(iter) == WRITE) inode_dio_end(inode); /* take i_mutex locking again if we do a ovewrite dio */ - if (overwrite) { - up_read(&EXT4_I(inode)->i_data_sem); - mutex_lock(&inode->i_mutex); - } + if (overwrite) + inode_lock(inode); return ret; } @@ -3587,6 +3674,35 @@ int ext4_can_truncate(struct inode *inode) } /* + * We have to make sure i_disksize gets properly updated before we truncate + * page cache due to hole punching or zero range. Otherwise i_disksize update + * can get lost as it may have been postponed to submission of writeback but + * that will never happen after we truncate page cache. + */ +int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, + loff_t len) +{ + handle_t *handle; + loff_t size = i_size_read(inode); + + WARN_ON(!inode_is_locked(inode)); + if (offset > size || offset + len < size) + return 0; + + if (EXT4_I(inode)->i_disksize >= size) + return 0; + + handle = ext4_journal_start(inode, EXT4_HT_MISC, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ext4_update_i_disksize(inode, size); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + + return 0; +} + +/* * ext4_punch_hole: punches a hole in a file by releaseing the blocks * associated with the given offset and length * @@ -3623,7 +3739,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) return ret; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* No need to punch hole beyond i_size */ if (offset >= inode->i_size) @@ -3651,17 +3767,26 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) } + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + + /* + * Prevent page faults from reinstantiating pages we have released from + * page cache. + */ + down_write(&EXT4_I(inode)->i_mmap_sem); first_block_offset = round_up(offset, sb->s_blocksize); last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; /* Now release the pages and zero block aligned part of pages*/ - if (last_block_offset > first_block_offset) + if (last_block_offset > first_block_offset) { + ret = ext4_update_disksize_before_punch(inode, offset, length); + if (ret) + goto out_dio; truncate_pagecache_range(inode, first_block_offset, last_block_offset); - - /* Wait all existing dio workers, newcomers will block on i_mutex */ - ext4_inode_block_unlocked_dio(inode); - inode_dio_wait(inode); + } if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) credits = ext4_writepage_trans_blocks(inode); @@ -3708,19 +3833,15 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) if (IS_SYNC(inode)) ext4_handle_sync(handle); - /* Now release the pages again to reduce race window */ - if (last_block_offset > first_block_offset) - truncate_pagecache_range(inode, first_block_offset, - last_block_offset); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); out_stop: ext4_journal_stop(handle); out_dio: + up_write(&EXT4_I(inode)->i_mmap_sem); ext4_inode_resume_unlocked_dio(inode); out_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -3790,7 +3911,7 @@ void ext4_truncate(struct inode *inode) * have i_mutex locked because it's not necessary. */ if (!(inode->i_state & (I_NEW|I_FREEING))) - WARN_ON(!mutex_is_locked(&inode->i_mutex)); + WARN_ON(!inode_is_locked(inode)); trace_ext4_truncate_enter(inode); if (!ext4_can_truncate(inode)) @@ -4038,7 +4159,7 @@ void ext4_set_inode_flags(struct inode *inode) new_fl |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) new_fl |= S_DIRSYNC; - if (test_opt(inode->i_sb, DAX)) + if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode)) new_fl |= S_DAX; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX); @@ -4104,6 +4225,14 @@ static inline void ext4_iget_extra_inode(struct inode *inode, EXT4_I(inode)->i_inline_off = 0; } +int ext4_get_projid(struct inode *inode, kprojid_t *projid) +{ + if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, EXT4_FEATURE_RO_COMPAT_PROJECT)) + return -EOPNOTSUPP; + *projid = EXT4_I(inode)->i_projid; + return 0; +} + struct inode *ext4_iget(struct super_block *sb, unsigned long ino) { struct ext4_iloc iloc; @@ -4115,6 +4244,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) int block; uid_t i_uid; gid_t i_gid; + projid_t i_projid; inode = iget_locked(sb, ino); if (!inode) @@ -4164,12 +4294,20 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) inode->i_mode = le16_to_cpu(raw_inode->i_mode); i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT) && + EXT4_INODE_SIZE(sb) > EXT4_GOOD_OLD_INODE_SIZE && + EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) + i_projid = (projid_t)le32_to_cpu(raw_inode->i_projid); + else + i_projid = EXT4_DEF_PROJID; + if (!(test_opt(inode->i_sb, NO_UID32))) { i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; } i_uid_write(inode, i_uid); i_gid_write(inode, i_gid); + ei->i_projid = make_kprojid(&init_user_ns, i_projid); set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ @@ -4311,6 +4449,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) inode->i_op = &ext4_symlink_inode_operations; ext4_set_aops(inode); } + inode_nohighmem(inode); } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { inode->i_op = &ext4_special_inode_operations; @@ -4467,6 +4606,7 @@ static int ext4_do_update_inode(handle_t *handle, int need_datasync = 0, set_large_file = 0; uid_t i_uid; gid_t i_gid; + projid_t i_projid; spin_lock(&ei->i_raw_lock); @@ -4479,6 +4619,7 @@ static int ext4_do_update_inode(handle_t *handle, raw_inode->i_mode = cpu_to_le16(inode->i_mode); i_uid = i_uid_read(inode); i_gid = i_gid_read(inode); + i_projid = from_kprojid(&init_user_ns, ei->i_projid); if (!(test_opt(inode->i_sb, NO_UID32))) { raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); @@ -4556,6 +4697,15 @@ static int ext4_do_update_inode(handle_t *handle, cpu_to_le16(ei->i_extra_isize); } } + + BUG_ON(!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_PROJECT) && + i_projid != EXT4_DEF_PROJID); + + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && + EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) + raw_inode->i_projid = cpu_to_le32(i_projid); + ext4_inode_csum_set(inode, raw_inode, ei); spin_unlock(&ei->i_raw_lock); if (inode->i_sb->s_flags & MS_LAZYTIME) @@ -4851,6 +5001,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) } else ext4_wait_for_tail_page_commit(inode); } + down_write(&EXT4_I(inode)->i_mmap_sem); /* * Truncate pagecache after we've waited for commit * in data=journal mode to make pages freeable. @@ -4858,6 +5009,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) truncate_pagecache(inode, inode->i_size); if (shrink) ext4_truncate(inode); + up_write(&EXT4_I(inode)->i_mmap_sem); } if (!rc) { @@ -5306,6 +5458,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) sb_start_pagefault(inode->i_sb); file_update_time(vma->vm_file); + + down_read(&EXT4_I(inode)->i_mmap_sem); /* Delalloc case is easy... */ if (test_opt(inode->i_sb, DELALLOC) && !ext4_should_journal_data(inode) && @@ -5375,6 +5529,19 @@ retry_alloc: out_ret: ret = block_page_mkwrite_return(ret); out: + up_read(&EXT4_I(inode)->i_mmap_sem); sb_end_pagefault(inode->i_sb); return ret; } + +int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vma->vm_file); + int err; + + down_read(&EXT4_I(inode)->i_mmap_sem); + err = filemap_fault(vma, vmf); + up_read(&EXT4_I(inode)->i_mmap_sem); + + return err; +} diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 5e872fd40..eae5917c5 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -14,6 +14,7 @@ #include <linux/mount.h> #include <linux/file.h> #include <linux/random.h> +#include <linux/quotaops.h> #include <asm/uaccess.h> #include "ext4_jbd2.h" #include "ext4.h" @@ -202,6 +203,238 @@ static int uuid_is_zero(__u8 u[16]) return 1; } +static int ext4_ioctl_setflags(struct inode *inode, + unsigned int flags) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + handle_t *handle = NULL; + int err = -EPERM, migrate = 0; + struct ext4_iloc iloc; + unsigned int oldflags, mask, i; + unsigned int jflag; + + /* Is it quota file? Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) + goto flags_out; + + oldflags = ei->i_flags; + + /* The JOURNAL_DATA flag is modifiable only by root */ + jflag = flags & EXT4_JOURNAL_DATA_FL; + + /* + * The IMMUTABLE and APPEND_ONLY flags can only be changed by + * the relevant capability. + * + * This test looks nicer. Thanks to Pauline Middelink + */ + if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) { + if (!capable(CAP_LINUX_IMMUTABLE)) + goto flags_out; + } + + /* + * The JOURNAL_DATA flag can only be changed by + * the relevant capability. + */ + if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { + if (!capable(CAP_SYS_RESOURCE)) + goto flags_out; + } + if ((flags ^ oldflags) & EXT4_EXTENTS_FL) + migrate = 1; + + if (flags & EXT4_EOFBLOCKS_FL) { + /* we don't support adding EOFBLOCKS flag */ + if (!(oldflags & EXT4_EOFBLOCKS_FL)) { + err = -EOPNOTSUPP; + goto flags_out; + } + } else if (oldflags & EXT4_EOFBLOCKS_FL) + ext4_truncate(inode); + + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto flags_out; + } + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + goto flags_err; + + for (i = 0, mask = 1; i < 32; i++, mask <<= 1) { + if (!(mask & EXT4_FL_USER_MODIFIABLE)) + continue; + if (mask & flags) + ext4_set_inode_flag(inode, i); + else + ext4_clear_inode_flag(inode, i); + } + + ext4_set_inode_flags(inode); + inode->i_ctime = ext4_current_time(inode); + + err = ext4_mark_iloc_dirty(handle, inode, &iloc); +flags_err: + ext4_journal_stop(handle); + if (err) + goto flags_out; + + if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) + err = ext4_change_inode_journal_flag(inode, jflag); + if (err) + goto flags_out; + if (migrate) { + if (flags & EXT4_EXTENTS_FL) + err = ext4_ext_migrate(inode); + else + err = ext4_ind_migrate(inode); + } + +flags_out: + return err; +} + +#ifdef CONFIG_QUOTA +static int ext4_ioctl_setproject(struct file *filp, __u32 projid) +{ + struct inode *inode = file_inode(filp); + struct super_block *sb = inode->i_sb; + struct ext4_inode_info *ei = EXT4_I(inode); + int err, rc; + handle_t *handle; + kprojid_t kprojid; + struct ext4_iloc iloc; + struct ext4_inode *raw_inode; + + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_PROJECT)) { + if (projid != EXT4_DEF_PROJID) + return -EOPNOTSUPP; + else + return 0; + } + + if (EXT4_INODE_SIZE(sb) <= EXT4_GOOD_OLD_INODE_SIZE) + return -EOPNOTSUPP; + + kprojid = make_kprojid(&init_user_ns, (projid_t)projid); + + if (projid_eq(kprojid, EXT4_I(inode)->i_projid)) + return 0; + + err = mnt_want_write_file(filp); + if (err) + return err; + + err = -EPERM; + inode_lock(inode); + /* Is it quota file? Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) + goto out_unlock; + + err = ext4_get_inode_loc(inode, &iloc); + if (err) + goto out_unlock; + + raw_inode = ext4_raw_inode(&iloc); + if (!EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) { + err = -EOVERFLOW; + brelse(iloc.bh); + goto out_unlock; + } + brelse(iloc.bh); + + dquot_initialize(inode); + + handle = ext4_journal_start(inode, EXT4_HT_QUOTA, + EXT4_QUOTA_INIT_BLOCKS(sb) + + EXT4_QUOTA_DEL_BLOCKS(sb) + 3); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto out_unlock; + } + + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + goto out_stop; + + if (sb_has_quota_limits_enabled(sb, PRJQUOTA)) { + struct dquot *transfer_to[MAXQUOTAS] = { }; + + transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid)); + if (transfer_to[PRJQUOTA]) { + err = __dquot_transfer(inode, transfer_to); + dqput(transfer_to[PRJQUOTA]); + if (err) + goto out_dirty; + } + } + EXT4_I(inode)->i_projid = kprojid; + inode->i_ctime = ext4_current_time(inode); +out_dirty: + rc = ext4_mark_iloc_dirty(handle, inode, &iloc); + if (!err) + err = rc; +out_stop: + ext4_journal_stop(handle); +out_unlock: + inode_unlock(inode); + mnt_drop_write_file(filp); + return err; +} +#else +static int ext4_ioctl_setproject(struct file *filp, __u32 projid) +{ + if (projid != EXT4_DEF_PROJID) + return -EOPNOTSUPP; + return 0; +} +#endif + +/* Transfer internal flags to xflags */ +static inline __u32 ext4_iflags_to_xflags(unsigned long iflags) +{ + __u32 xflags = 0; + + if (iflags & EXT4_SYNC_FL) + xflags |= FS_XFLAG_SYNC; + if (iflags & EXT4_IMMUTABLE_FL) + xflags |= FS_XFLAG_IMMUTABLE; + if (iflags & EXT4_APPEND_FL) + xflags |= FS_XFLAG_APPEND; + if (iflags & EXT4_NODUMP_FL) + xflags |= FS_XFLAG_NODUMP; + if (iflags & EXT4_NOATIME_FL) + xflags |= FS_XFLAG_NOATIME; + if (iflags & EXT4_PROJINHERIT_FL) + xflags |= FS_XFLAG_PROJINHERIT; + return xflags; +} + +/* Transfer xflags flags to internal */ +static inline unsigned long ext4_xflags_to_iflags(__u32 xflags) +{ + unsigned long iflags = 0; + + if (xflags & FS_XFLAG_SYNC) + iflags |= EXT4_SYNC_FL; + if (xflags & FS_XFLAG_IMMUTABLE) + iflags |= EXT4_IMMUTABLE_FL; + if (xflags & FS_XFLAG_APPEND) + iflags |= EXT4_APPEND_FL; + if (xflags & FS_XFLAG_NODUMP) + iflags |= EXT4_NODUMP_FL; + if (xflags & FS_XFLAG_NOATIME) + iflags |= EXT4_NOATIME_FL; + if (xflags & FS_XFLAG_PROJINHERIT) + iflags |= EXT4_PROJINHERIT_FL; + + return iflags; +} + long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -217,11 +450,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) flags = ei->i_flags & EXT4_FL_USER_VISIBLE; return put_user(flags, (int __user *) arg); case EXT4_IOC_SETFLAGS: { - handle_t *handle = NULL; - int err, migrate = 0; - struct ext4_iloc iloc; - unsigned int oldflags, mask, i; - unsigned int jflag; + int err; if (!inode_owner_or_capable(inode)) return -EACCES; @@ -235,90 +464,9 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) flags = ext4_mask_flags(inode->i_mode, flags); - err = -EPERM; - mutex_lock(&inode->i_mutex); - /* Is it quota file? Do not allow user to mess with it */ - if (IS_NOQUOTA(inode)) - goto flags_out; - - oldflags = ei->i_flags; - - /* The JOURNAL_DATA flag is modifiable only by root */ - jflag = flags & EXT4_JOURNAL_DATA_FL; - - /* - * The IMMUTABLE and APPEND_ONLY flags can only be changed by - * the relevant capability. - * - * This test looks nicer. Thanks to Pauline Middelink - */ - if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) { - if (!capable(CAP_LINUX_IMMUTABLE)) - goto flags_out; - } - - /* - * The JOURNAL_DATA flag can only be changed by - * the relevant capability. - */ - if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { - if (!capable(CAP_SYS_RESOURCE)) - goto flags_out; - } - if ((flags ^ oldflags) & EXT4_EXTENTS_FL) - migrate = 1; - - if (flags & EXT4_EOFBLOCKS_FL) { - /* we don't support adding EOFBLOCKS flag */ - if (!(oldflags & EXT4_EOFBLOCKS_FL)) { - err = -EOPNOTSUPP; - goto flags_out; - } - } else if (oldflags & EXT4_EOFBLOCKS_FL) - ext4_truncate(inode); - - handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); - goto flags_out; - } - if (IS_SYNC(inode)) - ext4_handle_sync(handle); - err = ext4_reserve_inode_write(handle, inode, &iloc); - if (err) - goto flags_err; - - for (i = 0, mask = 1; i < 32; i++, mask <<= 1) { - if (!(mask & EXT4_FL_USER_MODIFIABLE)) - continue; - if (mask & flags) - ext4_set_inode_flag(inode, i); - else - ext4_clear_inode_flag(inode, i); - } - - ext4_set_inode_flags(inode); - inode->i_ctime = ext4_current_time(inode); - - err = ext4_mark_iloc_dirty(handle, inode, &iloc); -flags_err: - ext4_journal_stop(handle); - if (err) - goto flags_out; - - if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) - err = ext4_change_inode_journal_flag(inode, jflag); - if (err) - goto flags_out; - if (migrate) { - if (flags & EXT4_EXTENTS_FL) - err = ext4_ext_migrate(inode); - else - err = ext4_ind_migrate(inode); - } - -flags_out: - mutex_unlock(&inode->i_mutex); + inode_lock(inode); + err = ext4_ioctl_setflags(inode, flags); + inode_unlock(inode); mnt_drop_write_file(filp); return err; } @@ -349,7 +497,7 @@ flags_out: goto setversion_out; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); @@ -364,7 +512,7 @@ flags_out: ext4_journal_stop(handle); unlock_out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); setversion_out: mnt_drop_write_file(filp); return err; @@ -435,6 +583,11 @@ group_extend_out: "Online defrag not supported with bigalloc"); err = -EOPNOTSUPP; goto mext_out; + } else if (IS_DAX(inode)) { + ext4_msg(sb, KERN_ERR, + "Online defrag not supported with DAX"); + err = -EOPNOTSUPP; + goto mext_out; } err = mnt_want_write_file(filp); @@ -510,9 +663,9 @@ group_add_out: * ext4_ext_swap_inode_data before we switch the * inode format to prevent read. */ - mutex_lock(&(inode->i_mutex)); + inode_lock((inode)); err = ext4_ext_migrate(inode); - mutex_unlock(&(inode->i_mutex)); + inode_unlock((inode)); mnt_drop_write_file(filp); return err; } @@ -689,6 +842,60 @@ encryption_policy_out: return -EOPNOTSUPP; #endif } + case EXT4_IOC_FSGETXATTR: + { + struct fsxattr fa; + + memset(&fa, 0, sizeof(struct fsxattr)); + ext4_get_inode_flags(ei); + fa.fsx_xflags = ext4_iflags_to_xflags(ei->i_flags & EXT4_FL_USER_VISIBLE); + + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_PROJECT)) { + fa.fsx_projid = (__u32)from_kprojid(&init_user_ns, + EXT4_I(inode)->i_projid); + } + + if (copy_to_user((struct fsxattr __user *)arg, + &fa, sizeof(fa))) + return -EFAULT; + return 0; + } + case EXT4_IOC_FSSETXATTR: + { + struct fsxattr fa; + int err; + + if (copy_from_user(&fa, (struct fsxattr __user *)arg, + sizeof(fa))) + return -EFAULT; + + /* Make sure caller has proper permission */ + if (!inode_owner_or_capable(inode)) + return -EACCES; + + err = mnt_want_write_file(filp); + if (err) + return err; + + flags = ext4_xflags_to_iflags(fa.fsx_xflags); + flags = ext4_mask_flags(inode->i_mode, flags); + + inode_lock(inode); + flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) | + (flags & EXT4_FL_XFLAG_VISIBLE); + err = ext4_ioctl_setflags(inode, flags); + inode_unlock(inode); + mnt_drop_write_file(filp); + if (err) + return err; + + err = ext4_ioctl_setproject(filp, fa.fsx_projid); + if (err) + return err; + + return 0; + } default: return -ENOTTY; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 61eaf74dc..4424b7bf8 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2285,7 +2285,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) if (group == 0) seq_puts(seq, "#group: free frags first [" " 2^0 2^1 2^2 2^3 2^4 2^5 2^6 " - " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]"); + " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]\n"); i = (sb->s_blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + sizeof(struct ext4_group_info); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index e032a0423..4098acc70 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -390,6 +390,7 @@ data_copy: *err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0); if (*err < 0) break; + bh = bh->b_this_page; } if (!*err) *err = block_commit_write(pagep[0], from, from + replaced_size); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index a969ab39f..48e4b8907 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -273,7 +273,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir); static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, - struct dentry *dentry, struct inode *inode); + struct inode *dir, struct inode *inode); /* checksumming functions */ void initialize_dirent_tail(struct ext4_dir_entry_tail *t, @@ -1558,6 +1558,24 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi struct ext4_dir_entry_2 *de; struct buffer_head *bh; + if (ext4_encrypted_inode(dir)) { + int res = ext4_get_encryption_info(dir); + + /* + * This should be a properly defined flag for + * dentry->d_flags when we uplift this to the VFS. + * d_fsdata is set to (void *) 1 if if the dentry is + * created while the directory was encrypted and we + * don't have access to the key. + */ + dentry->d_fsdata = NULL; + if (ext4_encryption_info(dir)) + dentry->d_fsdata = (void *) 1; + d_set_d_op(dentry, &ext4_encrypted_d_ops); + if (res && res != -ENOKEY) + return ERR_PTR(res); + } + if (dentry->d_name.len > EXT4_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); @@ -1585,11 +1603,15 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi return ERR_PTR(-EFSCORRUPTED); } if (!IS_ERR(inode) && ext4_encrypted_inode(dir) && - (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode)) && + (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && !ext4_is_child_context_consistent_with_parent(dir, inode)) { + int nokey = ext4_encrypted_inode(inode) && + !ext4_encryption_info(inode); + iput(inode); + if (nokey) + return ERR_PTR(-ENOKEY); ext4_warning(inode->i_sb, "Inconsistent encryption contexts: %lu/%lu\n", (unsigned long) dir->i_ino, @@ -1928,10 +1950,9 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, * directory, and adds the dentry to the indexed directory. */ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, - struct dentry *dentry, + struct inode *dir, struct inode *inode, struct buffer_head *bh) { - struct inode *dir = d_inode(dentry->d_parent); struct buffer_head *bh2; struct dx_root *root; struct dx_frame frames[2], *frame; @@ -2086,8 +2107,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, return retval; if (ext4_has_inline_data(dir)) { - retval = ext4_try_add_inline_entry(handle, &fname, - dentry, inode); + retval = ext4_try_add_inline_entry(handle, &fname, dir, inode); if (retval < 0) goto out; if (retval == 1) { @@ -2097,7 +2117,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, } if (is_dx(dir)) { - retval = ext4_dx_add_entry(handle, &fname, dentry, inode); + retval = ext4_dx_add_entry(handle, &fname, dir, inode); if (!retval || (retval != ERR_BAD_DX_DIR)) goto out; ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); @@ -2119,7 +2139,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, if (blocks == 1 && !dx_fallback && ext4_has_feature_dir_index(sb)) { - retval = make_indexed_dir(handle, &fname, dentry, + retval = make_indexed_dir(handle, &fname, dir, inode, bh); bh = NULL; /* make_indexed_dir releases bh */ goto out; @@ -2154,12 +2174,11 @@ out: * Returns 0 for success, or a negative error value */ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, - struct dentry *dentry, struct inode *inode) + struct inode *dir, struct inode *inode) { struct dx_frame frames[2], *frame; struct dx_entry *entries, *at; struct buffer_head *bh; - struct inode *dir = d_inode(dentry->d_parent); struct super_block *sb = dir->i_sb; struct ext4_dir_entry_2 *de; int err; @@ -2756,7 +2775,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) return 0; WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && - !mutex_is_locked(&inode->i_mutex)); + !inode_is_locked(inode)); /* * Exit early if inode already is on orphan list. This is a big speedup * since we don't have to contend on the global s_orphan_lock. @@ -2838,7 +2857,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) return 0; WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && - !mutex_is_locked(&inode->i_mutex)); + !inode_is_locked(inode)); /* Do this quick check before taking global s_orphan_lock. */ if (list_empty(&ei->i_orphan)) return 0; @@ -3132,6 +3151,7 @@ static int ext4_symlink(struct inode *dir, if ((disk_link.len > EXT4_N_BLOCKS * 4)) { if (!encryption_required) inode->i_op = &ext4_symlink_inode_operations; + inode_nohighmem(inode); ext4_set_aops(inode); /* * We cannot call page_symlink() with transaction started @@ -3211,6 +3231,12 @@ static int ext4_link(struct dentry *old_dentry, if (ext4_encrypted_inode(dir) && !ext4_is_child_context_consistent_with_parent(dir, inode)) return -EPERM; + + if ((ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) && + (!projid_eq(EXT4_I(dir)->i_projid, + EXT4_I(old_dentry->d_inode)->i_projid))) + return -EXDEV; + err = dquot_initialize(dir); if (err) return err; @@ -3491,6 +3517,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, int credits; u8 old_file_type; + if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT)) && + (!projid_eq(EXT4_I(new_dir)->i_projid, + EXT4_I(old_dentry->d_inode)->i_projid))) + return -EXDEV; + retval = dquot_initialize(old.dir); if (retval) return retval; @@ -3700,6 +3731,14 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, new.inode))) return -EPERM; + if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT) && + !projid_eq(EXT4_I(new_dir)->i_projid, + EXT4_I(old_dentry->d_inode)->i_projid)) || + (ext4_test_inode_flag(old_dir, EXT4_INODE_PROJINHERIT) && + !projid_eq(EXT4_I(old_dir)->i_projid, + EXT4_I(new_dentry->d_inode)->i_projid))) + return -EXDEV; + retval = dquot_initialize(old.dir); if (retval) return retval; diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 17fbe3882..090b34986 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -52,9 +52,8 @@ void ext4_exit_pageio(void) */ static void buffer_io_error(struct buffer_head *bh) { - char b[BDEVNAME_SIZE]; - printk_ratelimited(KERN_ERR "Buffer I/O error on device %s, logical block %llu\n", - bdevname(bh->b_bdev, b), + printk_ratelimited(KERN_ERR "Buffer I/O error on device %pg, logical block %llu\n", + bh->b_bdev, (unsigned long long)bh->b_blocknr); } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index c9ab67da6..3ed01ec01 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -80,6 +80,36 @@ static void ext4_destroy_lazyinit_thread(void); static void ext4_unregister_li_request(struct super_block *sb); static void ext4_clear_request_list(void); +/* + * Lock ordering + * + * Note the difference between i_mmap_sem (EXT4_I(inode)->i_mmap_sem) and + * i_mmap_rwsem (inode->i_mmap_rwsem)! + * + * page fault path: + * mmap_sem -> sb_start_pagefault -> i_mmap_sem (r) -> transaction start -> + * page lock -> i_data_sem (rw) + * + * buffered write path: + * sb_start_write -> i_mutex -> mmap_sem + * sb_start_write -> i_mutex -> transaction start -> page lock -> + * i_data_sem (rw) + * + * truncate: + * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (w) -> i_mmap_sem (w) -> + * i_mmap_rwsem (w) -> page lock + * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (w) -> i_mmap_sem (w) -> + * transaction start -> i_data_sem (rw) + * + * direct IO: + * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (r) -> mmap_sem + * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (r) -> + * transaction start -> i_data_sem (rw) + * + * writepages: + * transaction start -> page lock(s) -> i_data_sem (rw) + */ + #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2) static struct file_system_type ext2_fs_type = { .owner = THIS_MODULE, @@ -958,6 +988,7 @@ static void init_once(void *foo) INIT_LIST_HEAD(&ei->i_orphan); init_rwsem(&ei->xattr_sem); init_rwsem(&ei->i_data_sem); + init_rwsem(&ei->i_mmap_sem); inode_init_once(&ei->vfs_inode); } @@ -966,7 +997,7 @@ static int __init init_inodecache(void) ext4_inode_cachep = kmem_cache_create("ext4_inode_cache", sizeof(struct ext4_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (ext4_inode_cachep == NULL) return -ENOMEM; @@ -1066,8 +1097,8 @@ static int bdev_try_to_free_page(struct super_block *sb, struct page *page, } #ifdef CONFIG_QUOTA -#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group") -#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) +static char *quotatypes[] = INITQFNAMES; +#define QTYPE2NAME(t) (quotatypes[t]) static int ext4_write_dquot(struct dquot *dquot); static int ext4_acquire_dquot(struct dquot *dquot); @@ -1100,6 +1131,7 @@ static const struct dquot_operations ext4_quota_operations = { .write_info = ext4_write_info, .alloc_dquot = dquot_alloc, .destroy_dquot = dquot_destroy, + .get_projid = ext4_get_projid, }; static const struct quotactl_ops ext4_qctl_operations = { @@ -2254,10 +2286,10 @@ static void ext4_orphan_cleanup(struct super_block *sb, __func__, inode->i_ino, inode->i_size); jbd_debug(2, "truncating inode %lu to %lld bytes\n", inode->i_ino, inode->i_size); - mutex_lock(&inode->i_mutex); + inode_lock(inode); truncate_inode_pages(inode->i_mapping, inode->i_size); ext4_truncate(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); nr_truncates++; } else { if (test_opt(sb, DEBUG)) @@ -2526,6 +2558,12 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly) "without CONFIG_QUOTA"); return 0; } + if (ext4_has_feature_project(sb) && !readonly) { + ext4_msg(sb, KERN_ERR, + "Filesystem with project quota feature cannot be mounted RDWR " + "without CONFIG_QUOTA"); + return 0; + } #endif /* CONFIG_QUOTA */ return 1; } @@ -3654,7 +3692,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_qcop = &dquot_quotactl_sysfile_ops; else sb->s_qcop = &ext4_qctl_operations; - sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; #endif memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); @@ -4790,6 +4828,48 @@ restore_opts: return err; } +#ifdef CONFIG_QUOTA +static int ext4_statfs_project(struct super_block *sb, + kprojid_t projid, struct kstatfs *buf) +{ + struct kqid qid; + struct dquot *dquot; + u64 limit; + u64 curblock; + + qid = make_kqid_projid(projid); + dquot = dqget(sb, qid); + if (IS_ERR(dquot)) + return PTR_ERR(dquot); + spin_lock(&dq_data_lock); + + limit = (dquot->dq_dqb.dqb_bsoftlimit ? + dquot->dq_dqb.dqb_bsoftlimit : + dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits; + if (limit && buf->f_blocks > limit) { + curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits; + buf->f_blocks = limit; + buf->f_bfree = buf->f_bavail = + (buf->f_blocks > curblock) ? + (buf->f_blocks - curblock) : 0; + } + + limit = dquot->dq_dqb.dqb_isoftlimit ? + dquot->dq_dqb.dqb_isoftlimit : + dquot->dq_dqb.dqb_ihardlimit; + if (limit && buf->f_files > limit) { + buf->f_files = limit; + buf->f_ffree = + (buf->f_files > dquot->dq_dqb.dqb_curinodes) ? + (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0; + } + + spin_unlock(&dq_data_lock); + dqput(dquot); + return 0; +} +#endif + static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; @@ -4822,6 +4902,11 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; +#ifdef CONFIG_QUOTA + if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) && + sb_has_quota_limits_enabled(sb, PRJQUOTA)) + ext4_statfs_project(sb, EXT4_I(dentry->d_inode)->i_projid, buf); +#endif return 0; } @@ -4986,7 +5071,8 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, struct inode *qf_inode; unsigned long qf_inums[EXT4_MAXQUOTAS] = { le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), - le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) + le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum), + le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum) }; BUG_ON(!ext4_has_feature_quota(sb)); @@ -5014,7 +5100,8 @@ static int ext4_enable_quotas(struct super_block *sb) int type, err = 0; unsigned long qf_inums[EXT4_MAXQUOTAS] = { le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), - le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) + le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum), + le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum) }; sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index e8e7af62a..6f7ee30a8 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -23,17 +23,21 @@ #include "xattr.h" #ifdef CONFIG_EXT4_FS_ENCRYPTION -static const char *ext4_encrypted_follow_link(struct dentry *dentry, void **cookie) +static const char *ext4_encrypted_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { struct page *cpage = NULL; char *caddr, *paddr = NULL; struct ext4_str cstr, pstr; - struct inode *inode = d_inode(dentry); struct ext4_encrypted_symlink_data *sd; loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1); int res; u32 plen, max_size = inode->i_sb->s_blocksize; + if (!dentry) + return ERR_PTR(-ECHILD); + res = ext4_get_encryption_info(inode); if (res) return ERR_PTR(res); @@ -45,7 +49,7 @@ static const char *ext4_encrypted_follow_link(struct dentry *dentry, void **cook cpage = read_mapping_page(inode->i_mapping, 0, NULL); if (IS_ERR(cpage)) return ERR_CAST(cpage); - caddr = kmap(cpage); + caddr = page_address(cpage); caddr[size] = 0; } @@ -75,24 +79,20 @@ static const char *ext4_encrypted_follow_link(struct dentry *dentry, void **cook /* Null-terminate the name */ if (res <= plen) paddr[res] = '\0'; - if (cpage) { - kunmap(cpage); + if (cpage) page_cache_release(cpage); - } - return *cookie = paddr; + set_delayed_call(done, kfree_link, paddr); + return paddr; errout: - if (cpage) { - kunmap(cpage); + if (cpage) page_cache_release(cpage); - } kfree(paddr); return ERR_PTR(res); } const struct inode_operations ext4_encrypted_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = ext4_encrypted_follow_link, - .put_link = kfree_put_link, + .get_link = ext4_encrypted_get_link, .setattr = ext4_setattr, .setxattr = generic_setxattr, .getxattr = generic_getxattr, @@ -103,8 +103,7 @@ const struct inode_operations ext4_encrypted_symlink_inode_operations = { const struct inode_operations ext4_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, + .get_link = page_get_link, .setattr = ext4_setattr, .setxattr = generic_setxattr, .getxattr = generic_getxattr, @@ -114,7 +113,7 @@ const struct inode_operations ext4_symlink_inode_operations = { const struct inode_operations ext4_fast_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = simple_follow_link, + .get_link = simple_get_link, .setattr = ext4_setattr, .setxattr = generic_setxattr, .getxattr = generic_getxattr, diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h index 011ba6670..c70d06a38 100644 --- a/fs/ext4/truncate.h +++ b/fs/ext4/truncate.h @@ -10,8 +10,10 @@ */ static inline void ext4_truncate_failed_write(struct inode *inode) { + down_write(&EXT4_I(inode)->i_mmap_sem); truncate_inode_pages(inode->i_mapping, inode->i_size); ext4_truncate(inode); + up_write(&EXT4_I(inode)->i_mmap_sem); } /* diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 6b6b3e751..a95151e87 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -68,10 +68,8 @@ printk("\n"); \ } while (0) # define ea_bdebug(bh, f...) do { \ - char b[BDEVNAME_SIZE]; \ - printk(KERN_DEBUG "block %s:%lu: ", \ - bdevname(bh->b_bdev, b), \ - (unsigned long) bh->b_blocknr); \ + printk(KERN_DEBUG "block %pg:%lu: ", \ + bh->b_bdev, (unsigned long) bh->b_blocknr); \ printk(f); \ printk("\n"); \ } while (0) @@ -404,19 +402,24 @@ ext4_xattr_list_entries(struct dentry *dentry, struct ext4_xattr_entry *entry, const struct xattr_handler *handler = ext4_xattr_handler(entry->e_name_index); - if (handler) { - size_t size = handler->list(handler, dentry, buffer, - rest, entry->e_name, - entry->e_name_len); + if (handler && (!handler->list || handler->list(dentry))) { + const char *prefix = handler->prefix ?: handler->name; + size_t prefix_len = strlen(prefix); + size_t size = prefix_len + entry->e_name_len + 1; + if (buffer) { if (size > rest) return -ERANGE; - buffer += size; + memcpy(buffer, prefix, prefix_len); + buffer += prefix_len; + memcpy(buffer, entry->e_name, entry->e_name_len); + buffer += entry->e_name_len; + *buffer++ = 0; } rest -= size; } } - return buffer_size - rest; + return buffer_size - rest; /* total size */ } static int diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index 36f4c1a84..3e81bdca0 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -11,30 +11,11 @@ #include "ext4.h" #include "xattr.h" -static size_t -ext4_xattr_security_list(const struct xattr_handler *handler, - struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len) -{ - const size_t prefix_len = sizeof(XATTR_SECURITY_PREFIX)-1; - const size_t total_len = prefix_len + name_len + 1; - - - if (list && total_len <= list_size) { - memcpy(list, XATTR_SECURITY_PREFIX, prefix_len); - memcpy(list+prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - return total_len; -} - static int ext4_xattr_security_get(const struct xattr_handler *handler, struct dentry *dentry, const char *name, void *buffer, size_t size) { - if (strcmp(name, "") == 0) - return -EINVAL; return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_SECURITY, name, buffer, size); } @@ -44,8 +25,6 @@ ext4_xattr_security_set(const struct xattr_handler *handler, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - if (strcmp(name, "") == 0) - return -EINVAL; return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_SECURITY, name, value, size, flags); } @@ -79,7 +58,6 @@ ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir, const struct xattr_handler ext4_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, - .list = ext4_xattr_security_list, .get = ext4_xattr_security_get, .set = ext4_xattr_security_set, }; diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c index 488089053..2a3c6f9b8 100644 --- a/fs/ext4/xattr_trusted.c +++ b/fs/ext4/xattr_trusted.c @@ -12,23 +12,10 @@ #include "ext4.h" #include "xattr.h" -static size_t -ext4_xattr_trusted_list(const struct xattr_handler *handler, - struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len) +static bool +ext4_xattr_trusted_list(struct dentry *dentry) { - const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; - const size_t total_len = prefix_len + name_len + 1; - - if (!capable(CAP_SYS_ADMIN)) - return 0; - - if (list && total_len <= list_size) { - memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); - memcpy(list+prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - return total_len; + return capable(CAP_SYS_ADMIN); } static int @@ -36,8 +23,6 @@ ext4_xattr_trusted_get(const struct xattr_handler *handler, struct dentry *dentry, const char *name, void *buffer, size_t size) { - if (strcmp(name, "") == 0) - return -EINVAL; return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_TRUSTED, name, buffer, size); } @@ -47,8 +32,6 @@ ext4_xattr_trusted_set(const struct xattr_handler *handler, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - if (strcmp(name, "") == 0) - return -EINVAL; return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_TRUSTED, name, value, size, flags); } diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c index d2dec3364..d152f431e 100644 --- a/fs/ext4/xattr_user.c +++ b/fs/ext4/xattr_user.c @@ -11,23 +11,10 @@ #include "ext4.h" #include "xattr.h" -static size_t -ext4_xattr_user_list(const struct xattr_handler *handler, - struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len) +static bool +ext4_xattr_user_list(struct dentry *dentry) { - const size_t prefix_len = XATTR_USER_PREFIX_LEN; - const size_t total_len = prefix_len + name_len + 1; - - if (!test_opt(dentry->d_sb, XATTR_USER)) - return 0; - - if (list && total_len <= list_size) { - memcpy(list, XATTR_USER_PREFIX, prefix_len); - memcpy(list+prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - return total_len; + return test_opt(dentry->d_sb, XATTR_USER); } static int @@ -35,8 +22,6 @@ ext4_xattr_user_get(const struct xattr_handler *handler, struct dentry *dentry, const char *name, void *buffer, size_t size) { - if (strcmp(name, "") == 0) - return -EINVAL; if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; return ext4_xattr_get(d_inode(dentry), EXT4_XATTR_INDEX_USER, @@ -48,8 +33,6 @@ ext4_xattr_user_set(const struct xattr_handler *handler, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - if (strcmp(name, "") == 0) - return -EINVAL; if (!test_opt(dentry->d_sb, XATTR_USER)) return -EOPNOTSUPP; return ext4_xattr_set(d_inode(dentry), EXT4_XATTR_INDEX_USER, diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index f661d8047..3842af954 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -237,7 +237,7 @@ static int f2fs_write_meta_page(struct page *page, dec_page_count(sbi, F2FS_DIRTY_META); unlock_page(page); - if (wbc->for_reclaim) + if (wbc->for_reclaim || unlikely(f2fs_cp_error(sbi))) f2fs_submit_merged_bio(sbi, META, WRITE); return 0; @@ -410,13 +410,13 @@ static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) spin_unlock(&im->ino_lock); } -void add_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type) +void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) { /* add new dirty ino entry into list */ __add_ino_entry(sbi, ino, type); } -void remove_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type) +void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) { /* remove dirty ino entry from list */ __remove_ino_entry(sbi, ino, type); @@ -434,7 +434,7 @@ bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode) return e ? true : false; } -void release_dirty_inode(struct f2fs_sb_info *sbi) +void release_ino_entry(struct f2fs_sb_info *sbi) { struct ino_entry *e, *tmp; int i; @@ -722,47 +722,48 @@ fail_no_cp: return -EINVAL; } -static int __add_dirty_inode(struct inode *inode, struct inode_entry *new) +static void __add_dirty_inode(struct inode *inode, enum inode_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE; - if (is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) - return -EEXIST; + if (is_inode_flag_set(fi, flag)) + return; - set_inode_flag(F2FS_I(inode), FI_DIRTY_DIR); - F2FS_I(inode)->dirty_dir = new; - list_add_tail(&new->list, &sbi->dir_inode_list); - stat_inc_dirty_dir(sbi); - return 0; + set_inode_flag(fi, flag); + list_add_tail(&fi->dirty_list, &sbi->inode_list[type]); + stat_inc_dirty_inode(sbi, type); +} + +static void __remove_dirty_inode(struct inode *inode, enum inode_type type) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE; + + if (get_dirty_pages(inode) || + !is_inode_flag_set(F2FS_I(inode), flag)) + return; + + list_del_init(&fi->dirty_list); + clear_inode_flag(fi, flag); + stat_dec_dirty_inode(F2FS_I_SB(inode), type); } void update_dirty_page(struct inode *inode, struct page *page) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct inode_entry *new; - int ret = 0; + enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE; if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode)) return; - if (!S_ISDIR(inode->i_mode)) { - inode_inc_dirty_pages(inode); - goto out; - } - - new = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); - new->inode = inode; - INIT_LIST_HEAD(&new->list); - - spin_lock(&sbi->dir_inode_lock); - ret = __add_dirty_inode(inode, new); + spin_lock(&sbi->inode_lock[type]); + __add_dirty_inode(inode, type); inode_inc_dirty_pages(inode); - spin_unlock(&sbi->dir_inode_lock); + spin_unlock(&sbi->inode_lock[type]); - if (ret) - kmem_cache_free(inode_entry_slab, new); -out: SetPagePrivate(page); f2fs_trace_pid(page); } @@ -770,70 +771,60 @@ out: void add_dirty_dir_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct inode_entry *new = - f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); - int ret = 0; - - new->inode = inode; - INIT_LIST_HEAD(&new->list); - spin_lock(&sbi->dir_inode_lock); - ret = __add_dirty_inode(inode, new); - spin_unlock(&sbi->dir_inode_lock); - - if (ret) - kmem_cache_free(inode_entry_slab, new); + spin_lock(&sbi->inode_lock[DIR_INODE]); + __add_dirty_inode(inode, DIR_INODE); + spin_unlock(&sbi->inode_lock[DIR_INODE]); } -void remove_dirty_dir_inode(struct inode *inode) +void remove_dirty_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct inode_entry *entry; - - if (!S_ISDIR(inode->i_mode)) - return; + struct f2fs_inode_info *fi = F2FS_I(inode); + enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE; - spin_lock(&sbi->dir_inode_lock); - if (get_dirty_pages(inode) || - !is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) { - spin_unlock(&sbi->dir_inode_lock); + if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) && + !S_ISLNK(inode->i_mode)) return; - } - entry = F2FS_I(inode)->dirty_dir; - list_del(&entry->list); - F2FS_I(inode)->dirty_dir = NULL; - clear_inode_flag(F2FS_I(inode), FI_DIRTY_DIR); - stat_dec_dirty_dir(sbi); - spin_unlock(&sbi->dir_inode_lock); - kmem_cache_free(inode_entry_slab, entry); + spin_lock(&sbi->inode_lock[type]); + __remove_dirty_inode(inode, type); + spin_unlock(&sbi->inode_lock[type]); /* Only from the recovery routine */ - if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) { - clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT); + if (is_inode_flag_set(fi, FI_DELAY_IPUT)) { + clear_inode_flag(fi, FI_DELAY_IPUT); iput(inode); } } -void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi) +int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type) { struct list_head *head; - struct inode_entry *entry; struct inode *inode; + struct f2fs_inode_info *fi; + bool is_dir = (type == DIR_INODE); + + trace_f2fs_sync_dirty_inodes_enter(sbi->sb, is_dir, + get_pages(sbi, is_dir ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); retry: if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; - spin_lock(&sbi->dir_inode_lock); + spin_lock(&sbi->inode_lock[type]); - head = &sbi->dir_inode_list; + head = &sbi->inode_list[type]; if (list_empty(head)) { - spin_unlock(&sbi->dir_inode_lock); - return; + spin_unlock(&sbi->inode_lock[type]); + trace_f2fs_sync_dirty_inodes_exit(sbi->sb, is_dir, + get_pages(sbi, is_dir ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); + return 0; } - entry = list_entry(head->next, struct inode_entry, list); - inode = igrab(entry->inode); - spin_unlock(&sbi->dir_inode_lock); + fi = list_entry(head->next, struct f2fs_inode_info, dirty_list); + inode = igrab(&fi->vfs_inode); + spin_unlock(&sbi->inode_lock[type]); if (inode) { filemap_fdatawrite(inode->i_mapping); iput(inode); @@ -868,11 +859,9 @@ retry_flush_dents: /* write all the dirty dentry pages */ if (get_pages(sbi, F2FS_DIRTY_DENTS)) { f2fs_unlock_all(sbi); - sync_dirty_dir_inodes(sbi); - if (unlikely(f2fs_cp_error(sbi))) { - err = -EIO; + err = sync_dirty_inodes(sbi, DIR_INODE); + if (err) goto out; - } goto retry_flush_dents; } @@ -885,10 +874,9 @@ retry_flush_nodes: if (get_pages(sbi, F2FS_DIRTY_NODES)) { up_write(&sbi->node_write); - sync_node_pages(sbi, 0, &wbc); - if (unlikely(f2fs_cp_error(sbi))) { + err = sync_node_pages(sbi, 0, &wbc); + if (err) { f2fs_unlock_all(sbi); - err = -EIO; goto out; } goto retry_flush_nodes; @@ -919,7 +907,7 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) finish_wait(&sbi->cp_wait, &wait); } -static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) +static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); @@ -945,7 +933,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) while (get_pages(sbi, F2FS_DIRTY_META)) { sync_meta_pages(sbi, META, LONG_MAX); if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; } next_free_nid(sbi, &last_nid); @@ -1030,7 +1018,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* need to wait for end_io results */ wait_on_all_pages_writeback(sbi); if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; /* write out checkpoint buffer at block 0 */ update_meta_page(sbi, ckpt, start_blk++); @@ -1058,7 +1046,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) wait_on_all_pages_writeback(sbi); if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX); filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX); @@ -1081,22 +1069,25 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) invalidate_mapping_pages(META_MAPPING(sbi), discard_blk, discard_blk); - release_dirty_inode(sbi); + release_ino_entry(sbi); if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; clear_prefree_segments(sbi, cpc); clear_sbi_flag(sbi, SBI_IS_DIRTY); + + return 0; } /* * We guarantee that this checkpoint procedure will not fail. */ -void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) +int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned long long ckpt_ver; + int err = 0; mutex_lock(&sbi->cp_mutex); @@ -1104,14 +1095,19 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) (cpc->reason == CP_FASTBOOT || cpc->reason == CP_SYNC || (cpc->reason == CP_DISCARD && !sbi->discard_blks))) goto out; - if (unlikely(f2fs_cp_error(sbi))) + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; goto out; - if (f2fs_readonly(sbi->sb)) + } + if (f2fs_readonly(sbi->sb)) { + err = -EROFS; goto out; + } trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops"); - if (block_operations(sbi)) + err = block_operations(sbi); + if (err) goto out; trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops"); @@ -1133,7 +1129,7 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) flush_sit_entries(sbi, cpc); /* unlock all the fs_lock[] in do_checkpoint() */ - do_checkpoint(sbi, cpc); + err = do_checkpoint(sbi, cpc); unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); @@ -1143,10 +1139,11 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) "checkpoint: version = %llx", ckpt_ver); /* do checkpoint periodically */ - sbi->cp_expires = round_jiffies_up(jiffies + HZ * sbi->cp_interval); + f2fs_update_time(sbi, CP_TIME); + trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint"); out: mutex_unlock(&sbi->cp_mutex); - trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint"); + return err; } void init_ino_entry_info(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 972eab7ac..5c06db17e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -225,7 +225,8 @@ void set_data_blkaddr(struct dnode_of_data *dn) /* Get physical address of data block */ addr_array = blkaddr_in_node(rn); addr_array[ofs_in_node] = cpu_to_le32(dn->data_blkaddr); - set_page_dirty(node_page); + if (set_page_dirty(node_page)) + dn->node_changed = true; } int reserve_new_block(struct dnode_of_data *dn) @@ -412,7 +413,7 @@ struct page *get_new_data_page(struct inode *inode, struct page *page; struct dnode_of_data dn; int err; -repeat: + page = f2fs_grab_cache_page(mapping, index, true); if (!page) { /* @@ -441,12 +442,11 @@ repeat: } else { f2fs_put_page(page, 1); - page = get_read_data_page(inode, index, READ_SYNC, true); + /* if ipage exists, blkaddr should be NEW_ADDR */ + f2fs_bug_on(F2FS_I_SB(inode), ipage); + page = get_lock_data_page(inode, index, true); if (IS_ERR(page)) - goto repeat; - - /* wait for read completion */ - lock_page(page); + return page; } got_it: if (new_i_size && i_size_read(inode) < @@ -494,14 +494,10 @@ alloc: if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT)) i_size_write(dn->inode, ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT)); - - /* direct IO doesn't use extent cache to maximize the performance */ - f2fs_drop_largest_extent(dn->inode, fofs); - return 0; } -static void __allocate_data_blocks(struct inode *inode, loff_t offset, +static int __allocate_data_blocks(struct inode *inode, loff_t offset, size_t count) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -510,14 +506,15 @@ static void __allocate_data_blocks(struct inode *inode, loff_t offset, u64 len = F2FS_BYTES_TO_BLK(count); bool allocated; u64 end_offset; + int err = 0; while (len) { - f2fs_balance_fs(sbi); f2fs_lock_op(sbi); /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); - if (get_dnode_of_data(&dn, start, ALLOC_NODE)) + err = get_dnode_of_data(&dn, start, ALLOC_NODE); + if (err) goto out; allocated = false; @@ -526,12 +523,15 @@ static void __allocate_data_blocks(struct inode *inode, loff_t offset, while (dn.ofs_in_node < end_offset && len) { block_t blkaddr; - if (unlikely(f2fs_cp_error(sbi))) + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; goto sync_out; + } blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); if (blkaddr == NULL_ADDR || blkaddr == NEW_ADDR) { - if (__allocate_data_block(&dn)) + err = __allocate_data_block(&dn); + if (err) goto sync_out; allocated = true; } @@ -545,8 +545,10 @@ static void __allocate_data_blocks(struct inode *inode, loff_t offset, f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); + + f2fs_balance_fs(sbi, dn.node_changed); } - return; + return err; sync_out: if (allocated) @@ -554,7 +556,8 @@ sync_out: f2fs_put_dnode(&dn); out: f2fs_unlock_op(sbi); - return; + f2fs_balance_fs(sbi, dn.node_changed); + return err; } /* @@ -566,7 +569,7 @@ out: * b. do not use extent cache for better performance * c. give the block addresses to blockdev */ -static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, +int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int create, int flag) { unsigned int maxblocks = map->m_len; @@ -577,6 +580,7 @@ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int err = 0, ofs = 1; struct extent_info ei; bool allocated = false; + block_t blkaddr; map->m_len = 0; map->m_flags = 0; @@ -592,7 +596,7 @@ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, } if (create) - f2fs_lock_op(F2FS_I_SB(inode)); + f2fs_lock_op(sbi); /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); @@ -640,12 +644,21 @@ static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, pgofs++; get_next: + if (map->m_len >= maxblocks) + goto sync_out; + if (dn.ofs_in_node >= end_offset) { if (allocated) sync_inode_page(&dn); allocated = false; f2fs_put_dnode(&dn); + if (create) { + f2fs_unlock_op(sbi); + f2fs_balance_fs(sbi, dn.node_changed); + f2fs_lock_op(sbi); + } + set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, pgofs, mode); if (err) { @@ -657,52 +670,53 @@ get_next: end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); } - if (maxblocks > map->m_len) { - block_t blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); - if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) { - if (create) { - if (unlikely(f2fs_cp_error(sbi))) { - err = -EIO; - goto sync_out; - } - err = __allocate_data_block(&dn); - if (err) - goto sync_out; - allocated = true; - map->m_flags |= F2FS_MAP_NEW; - blkaddr = dn.data_blkaddr; - } else { - /* - * we only merge preallocated unwritten blocks - * for fiemap. - */ - if (flag != F2FS_GET_BLOCK_FIEMAP || - blkaddr != NEW_ADDR) - goto sync_out; + if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) { + if (create) { + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; + goto sync_out; } + err = __allocate_data_block(&dn); + if (err) + goto sync_out; + allocated = true; + map->m_flags |= F2FS_MAP_NEW; + blkaddr = dn.data_blkaddr; + } else { + /* + * we only merge preallocated unwritten blocks + * for fiemap. + */ + if (flag != F2FS_GET_BLOCK_FIEMAP || + blkaddr != NEW_ADDR) + goto sync_out; } + } - /* Give more consecutive addresses for the readahead */ - if ((map->m_pblk != NEW_ADDR && - blkaddr == (map->m_pblk + ofs)) || - (map->m_pblk == NEW_ADDR && - blkaddr == NEW_ADDR)) { - ofs++; - dn.ofs_in_node++; - pgofs++; - map->m_len++; - goto get_next; - } + /* Give more consecutive addresses for the readahead */ + if ((map->m_pblk != NEW_ADDR && + blkaddr == (map->m_pblk + ofs)) || + (map->m_pblk == NEW_ADDR && + blkaddr == NEW_ADDR)) { + ofs++; + dn.ofs_in_node++; + pgofs++; + map->m_len++; + goto get_next; } + sync_out: if (allocated) sync_inode_page(&dn); put_out: f2fs_put_dnode(&dn); unlock_out: - if (create) - f2fs_unlock_op(F2FS_I_SB(inode)); + if (create) { + f2fs_unlock_op(sbi); + f2fs_balance_fs(sbi, dn.node_changed); + } out: trace_f2fs_map_blocks(inode, map, err); return err; @@ -742,6 +756,10 @@ static int get_data_block_dio(struct inode *inode, sector_t iblock, static int get_data_block_bmap(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { + /* Block number less than F2FS MAX BLOCKS */ + if (unlikely(iblock >= F2FS_I_SB(inode)->max_file_blocks)) + return -EFBIG; + return __get_data_block(inode, iblock, bh_result, create, F2FS_GET_BLOCK_BMAP); } @@ -761,10 +779,9 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, { struct buffer_head map_bh; sector_t start_blk, last_blk; - loff_t isize = i_size_read(inode); + loff_t isize; u64 logical = 0, phys = 0, size = 0; u32 flags = 0; - bool past_eof = false, whole_file = false; int ret = 0; ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); @@ -777,18 +794,21 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return ret; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); - if (len >= isize) { - whole_file = true; - len = isize; - } + isize = i_size_read(inode); + if (start >= isize) + goto out; + + if (start + len > isize) + len = isize - start; if (logical_to_blk(inode, len) == 0) len = blk_to_logical(inode, 1); start_blk = logical_to_blk(inode, start); last_blk = logical_to_blk(inode, start + len - 1); + next: memset(&map_bh, 0, sizeof(struct buffer_head)); map_bh.b_size = len; @@ -800,59 +820,37 @@ next: /* HOLE */ if (!buffer_mapped(&map_bh)) { - start_blk++; - - if (!past_eof && blk_to_logical(inode, start_blk) >= isize) - past_eof = 1; - - if (past_eof && size) { - flags |= FIEMAP_EXTENT_LAST; - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, flags); - } else if (size) { - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, flags); - size = 0; - } + /* Go through holes util pass the EOF */ + if (blk_to_logical(inode, start_blk++) < isize) + goto prep_next; + /* Found a hole beyond isize means no more extents. + * Note that the premise is that filesystems don't + * punch holes beyond isize and keep size unchanged. + */ + flags |= FIEMAP_EXTENT_LAST; + } - /* if we have holes up to/past EOF then we're done */ - if (start_blk > last_blk || past_eof || ret) - goto out; - } else { - if (start_blk > last_blk && !whole_file) { - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, flags); - goto out; - } + if (size) { + if (f2fs_encrypted_inode(inode)) + flags |= FIEMAP_EXTENT_DATA_ENCRYPTED; - /* - * if size != 0 then we know we already have an extent - * to add, so add it. - */ - if (size) { - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, flags); - if (ret) - goto out; - } + ret = fiemap_fill_next_extent(fieinfo, logical, + phys, size, flags); + } - logical = blk_to_logical(inode, start_blk); - phys = blk_to_logical(inode, map_bh.b_blocknr); - size = map_bh.b_size; - flags = 0; - if (buffer_unwritten(&map_bh)) - flags = FIEMAP_EXTENT_UNWRITTEN; + if (start_blk > last_blk || ret) + goto out; - start_blk += logical_to_blk(inode, size); + logical = blk_to_logical(inode, start_blk); + phys = blk_to_logical(inode, map_bh.b_blocknr); + size = map_bh.b_size; + flags = 0; + if (buffer_unwritten(&map_bh)) + flags = FIEMAP_EXTENT_UNWRITTEN; - /* - * If we are past the EOF, then we need to make sure as - * soon as we find a hole that the last extent we found - * is marked with FIEMAP_EXTENT_LAST - */ - if (!past_eof && logical + size >= isize) - past_eof = true; - } + start_blk += logical_to_blk(inode, size); + +prep_next: cond_resched(); if (fatal_signal_pending(current)) ret = -EINTR; @@ -862,7 +860,7 @@ out: if (ret == 1) ret = 0; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -1083,6 +1081,7 @@ int do_write_data_page(struct f2fs_io_info *fio) */ if (unlikely(fio->blk_addr != NEW_ADDR && !is_cold_data(page) && + !IS_ATOMIC_WRITTEN_PAGE(page) && need_inplace_update(inode))) { rewrite_data_page(fio); set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE); @@ -1179,10 +1178,11 @@ out: if (err) ClearPageUptodate(page); unlock_page(page); - if (need_balance_fs) - f2fs_balance_fs(sbi); - if (wbc->for_reclaim) + f2fs_balance_fs(sbi, need_balance_fs); + if (wbc->for_reclaim || unlikely(f2fs_cp_error(sbi))) { f2fs_submit_merged_bio(sbi, DATA, WRITE); + remove_dirty_inode(inode); + } return 0; redirty_out: @@ -1354,6 +1354,10 @@ static int f2fs_write_data_pages(struct address_space *mapping, available_free_memory(sbi, DIRTY_DENTS)) goto skip_write; + /* skip writing during file defragment */ + if (is_inode_flag_set(F2FS_I(inode), FI_DO_DEFRAG)) + goto skip_write; + /* during POR, we don't need to trigger writepage at all. */ if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto skip_write; @@ -1369,7 +1373,7 @@ static int f2fs_write_data_pages(struct address_space *mapping, if (locked) mutex_unlock(&sbi->writepages); - remove_dirty_dir_inode(inode); + remove_dirty_inode(inode); wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); return ret; @@ -1382,13 +1386,85 @@ skip_write: static void f2fs_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; + loff_t i_size = i_size_read(inode); - if (to > inode->i_size) { - truncate_pagecache(inode, inode->i_size); - truncate_blocks(inode, inode->i_size, true); + if (to > i_size) { + truncate_pagecache(inode, i_size); + truncate_blocks(inode, i_size, true); } } +static int prepare_write_begin(struct f2fs_sb_info *sbi, + struct page *page, loff_t pos, unsigned len, + block_t *blk_addr, bool *node_changed) +{ + struct inode *inode = page->mapping->host; + pgoff_t index = page->index; + struct dnode_of_data dn; + struct page *ipage; + bool locked = false; + struct extent_info ei; + int err = 0; + + if (f2fs_has_inline_data(inode) || + (pos & PAGE_CACHE_MASK) >= i_size_read(inode)) { + f2fs_lock_op(sbi); + locked = true; + } +restart: + /* check inline_data */ + ipage = get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto unlock_out; + } + + set_new_dnode(&dn, inode, ipage, ipage, 0); + + if (f2fs_has_inline_data(inode)) { + if (pos + len <= MAX_INLINE_DATA) { + read_inline_data(page, ipage); + set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); + sync_inode_page(&dn); + } else { + err = f2fs_convert_inline_page(&dn, page); + if (err) + goto out; + if (dn.data_blkaddr == NULL_ADDR) + err = f2fs_get_block(&dn, index); + } + } else if (locked) { + err = f2fs_get_block(&dn, index); + } else { + if (f2fs_lookup_extent_cache(inode, index, &ei)) { + dn.data_blkaddr = ei.blk + index - ei.fofs; + } else { + bool restart = false; + + /* hole case */ + err = get_dnode_of_data(&dn, index, LOOKUP_NODE); + if (err || (!err && dn.data_blkaddr == NULL_ADDR)) + restart = true; + if (restart) { + f2fs_put_dnode(&dn); + f2fs_lock_op(sbi); + locked = true; + goto restart; + } + } + } + + /* convert_inline_page can make node_changed */ + *blk_addr = dn.data_blkaddr; + *node_changed = dn.node_changed; +out: + f2fs_put_dnode(&dn); +unlock_out: + if (locked) + f2fs_unlock_op(sbi); + return err; +} + static int f2fs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -1396,15 +1472,13 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *page = NULL; - struct page *ipage; pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT; - struct dnode_of_data dn; + bool need_balance = false; + block_t blkaddr = NULL_ADDR; int err = 0; trace_f2fs_write_begin(inode, pos, len, flags); - f2fs_balance_fs(sbi); - /* * We should check this at this moment to avoid deadlock on inode page * and #0 page. The locking rule for inline_data conversion should be: @@ -1424,41 +1498,27 @@ repeat: *pagep = page; - f2fs_lock_op(sbi); - - /* check inline_data */ - ipage = get_node_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) { - err = PTR_ERR(ipage); - goto unlock_fail; - } - - set_new_dnode(&dn, inode, ipage, ipage, 0); + err = prepare_write_begin(sbi, page, pos, len, + &blkaddr, &need_balance); + if (err) + goto fail; - if (f2fs_has_inline_data(inode)) { - if (pos + len <= MAX_INLINE_DATA) { - read_inline_data(page, ipage); - set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); - sync_inode_page(&dn); - goto put_next; + if (need_balance && has_not_enough_free_secs(sbi, 0)) { + unlock_page(page); + f2fs_balance_fs(sbi, true); + lock_page(page); + if (page->mapping != mapping) { + /* The page got truncated from under us */ + f2fs_put_page(page, 1); + goto repeat; } - err = f2fs_convert_inline_page(&dn, page); - if (err) - goto put_fail; } - err = f2fs_get_block(&dn, index); - if (err) - goto put_fail; -put_next: - f2fs_put_dnode(&dn); - f2fs_unlock_op(sbi); - f2fs_wait_on_page_writeback(page, DATA); /* wait for GCed encrypted page writeback */ if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr); + f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr); if (len == PAGE_CACHE_SIZE) goto out_update; @@ -1474,14 +1534,14 @@ put_next: goto out_update; } - if (dn.data_blkaddr == NEW_ADDR) { + if (blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_CACHE_SIZE); } else { struct f2fs_io_info fio = { .sbi = sbi, .type = DATA, .rw = READ_SYNC, - .blk_addr = dn.data_blkaddr, + .blk_addr = blkaddr, .page = page, .encrypted_page = NULL, }; @@ -1512,10 +1572,6 @@ out_clear: clear_cold_data(page); return 0; -put_fail: - f2fs_put_dnode(&dn); -unlock_fail: - f2fs_unlock_op(sbi); fail: f2fs_put_page(page, 1); f2fs_write_failed(mapping, pos + len); @@ -1540,6 +1596,7 @@ static int f2fs_write_end(struct file *file, } f2fs_put_page(page, 1); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return copied; } @@ -1567,11 +1624,9 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, int err; /* we don't need to use inline_data strictly */ - if (f2fs_has_inline_data(inode)) { - err = f2fs_convert_inline_inode(inode); - if (err) - return err; - } + err = f2fs_convert_inline_inode(inode); + if (err) + return err; if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) return 0; @@ -1583,11 +1638,9 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); if (iov_iter_rw(iter) == WRITE) { - __allocate_data_blocks(inode, offset, count); - if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) { - err = -EIO; + err = __allocate_data_blocks(inode, offset, count); + if (err) goto out; - } } err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block_dio); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 478e5d541..4fb6ef88a 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -38,12 +38,15 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->hit_rbtree = atomic64_read(&sbi->read_hit_rbtree); si->hit_total = si->hit_largest + si->hit_cached + si->hit_rbtree; si->total_ext = atomic64_read(&sbi->total_hit_ext); - si->ext_tree = sbi->total_ext_tree; + si->ext_tree = atomic_read(&sbi->total_ext_tree); + si->zombie_tree = atomic_read(&sbi->total_zombie_tree); si->ext_node = atomic_read(&sbi->total_ext_node); si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES); si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS); - si->ndirty_dirs = sbi->n_dirty_dirs; si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META); + si->ndirty_data = get_pages(sbi, F2FS_DIRTY_DATA); + si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE]; + si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); si->wb_pages = get_pages(sbi, F2FS_WRITEBACK); si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; @@ -105,7 +108,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi) bimodal = 0; total_vblocks = 0; - blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg); + blks_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg; hblks_per_sec = blks_per_sec / 2; for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); @@ -189,10 +192,10 @@ get_cache: si->cache_mem += NM_I(sbi)->dirty_nat_cnt * sizeof(struct nat_entry_set); si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages); - si->cache_mem += sbi->n_dirty_dirs * sizeof(struct inode_entry); for (i = 0; i <= UPDATE_INO; i++) si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); - si->cache_mem += sbi->total_ext_tree * sizeof(struct extent_tree); + si->cache_mem += atomic_read(&sbi->total_ext_tree) * + sizeof(struct extent_tree); si->cache_mem += atomic_read(&sbi->total_ext_node) * sizeof(struct extent_node); @@ -211,12 +214,10 @@ static int stat_show(struct seq_file *s, void *v) mutex_lock(&f2fs_stat_mutex); list_for_each_entry(si, &f2fs_stat_list, stat_list) { - char devname[BDEVNAME_SIZE]; - update_general_status(si->sbi); - seq_printf(s, "\n=====[ partition info(%s). #%d ]=====\n", - bdevname(si->sbi->sb->s_bdev, devname), i++); + seq_printf(s, "\n=====[ partition info(%pg). #%d ]=====\n", + si->sbi->sb->s_bdev, i++); seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ", si->sit_area_segs, si->nat_area_segs); seq_printf(s, "[SSA: %d] [MAIN: %d", @@ -269,7 +270,8 @@ static int stat_show(struct seq_file *s, void *v) si->dirty_count); seq_printf(s, " - Prefree: %d\n - Free: %d (%d)\n\n", si->prefree_count, si->free_segs, si->free_secs); - seq_printf(s, "CP calls: %d\n", si->cp_count); + seq_printf(s, "CP calls: %d (BG: %d)\n", + si->cp_count, si->bg_cp_count); seq_printf(s, "GC calls: %d (BG: %d)\n", si->call_count, si->bg_gc); seq_printf(s, " - data segments : %d (%d)\n", @@ -290,8 +292,8 @@ static int stat_show(struct seq_file *s, void *v) !si->total_ext ? 0 : div64_u64(si->hit_total * 100, si->total_ext), si->hit_total, si->total_ext); - seq_printf(s, " - Inner Struct Count: tree: %d, node: %d\n", - si->ext_tree, si->ext_node); + seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", + si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); seq_printf(s, " - inmem: %4d, wb: %4d\n", si->inmem_pages, si->wb_pages); @@ -299,6 +301,8 @@ static int stat_show(struct seq_file *s, void *v) si->ndirty_node, si->node_pages); seq_printf(s, " - dents: %4d in dirs:%4d\n", si->ndirty_dent, si->ndirty_dirs); + seq_printf(s, " - datas: %4d in files:%4d\n", + si->ndirty_data, si->ndirty_files); seq_printf(s, " - meta: %4d in %4d\n", si->ndirty_meta, si->meta_pages); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", @@ -406,20 +410,23 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi) kfree(si); } -void __init f2fs_create_root_stats(void) +int __init f2fs_create_root_stats(void) { struct dentry *file; f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL); if (!f2fs_debugfs_root) - return; + return -ENOMEM; file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, NULL, &stat_fops); if (!file) { debugfs_remove(f2fs_debugfs_root); f2fs_debugfs_root = NULL; + return -ENOMEM; } + + return 0; } void f2fs_destroy_root_stats(void) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 7c1678ba8..faa7495e2 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -172,8 +172,6 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, namehash = f2fs_dentry_hash(&name); - f2fs_bug_on(F2FS_I_SB(dir), level > MAX_DIR_HASH_DEPTH); - nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); @@ -238,6 +236,14 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, goto out; max_depth = F2FS_I(dir)->i_current_depth; + if (unlikely(max_depth > MAX_DIR_HASH_DEPTH)) { + f2fs_msg(F2FS_I_SB(dir)->sb, KERN_WARNING, + "Corrupted max_depth of %lu: %u", + dir->i_ino, max_depth); + max_depth = MAX_DIR_HASH_DEPTH; + F2FS_I(dir)->i_current_depth = max_depth; + mark_inode_dirty(dir); + } for (level = 0; level < max_depth; level++) { de = find_in_level(dir, level, &fname, res_page); @@ -444,7 +450,7 @@ error: /* once the failed inode becomes a bad inode, i_mode is S_IFREG */ truncate_inode_pages(&inode->i_data, 0); truncate_blocks(inode, 0, false); - remove_dirty_dir_inode(inode); + remove_dirty_inode(inode); remove_inode_page(inode); return ERR_PTR(err); } @@ -630,6 +636,7 @@ fail: f2fs_put_page(dentry_page, 1); out: f2fs_fname_free_filename(&fname); + f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); return err; } @@ -651,6 +658,7 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir) clear_inode_flag(F2FS_I(inode), FI_NEW_INODE); fail: up_write(&F2FS_I(inode)->i_sem); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return err; } @@ -695,6 +703,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); int i; + f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); + if (f2fs_has_inline_dentry(dir)) return f2fs_delete_inline_entry(dentry, page, dir, inode); @@ -855,25 +865,27 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) for (; n < npages; n++) { dentry_page = get_lock_data_page(inode, n, false); - if (IS_ERR(dentry_page)) - continue; + if (IS_ERR(dentry_page)) { + err = PTR_ERR(dentry_page); + if (err == -ENOENT) + continue; + else + goto out; + } dentry_blk = kmap(dentry_page); make_dentry_ptr(inode, &d, (void *)dentry_blk, 1); - if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr)) - goto stop; + if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr)) { + kunmap(dentry_page); + f2fs_put_page(dentry_page, 1); + break; + } ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK; kunmap(dentry_page); f2fs_put_page(dentry_page, 1); - dentry_page = NULL; - } -stop: - if (dentry_page && !IS_ERR(dentry_page)) { - kunmap(dentry_page); - f2fs_put_page(dentry_page, 1); } out: f2fs_fname_crypto_free_buffer(&fstr); diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 7ddba812e..ccd5c636d 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -36,7 +36,7 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, rb_link_node(&en->rb_node, parent, p); rb_insert_color(&en->rb_node, &et->root); - et->count++; + atomic_inc(&et->node_cnt); atomic_inc(&sbi->total_ext_node); return en; } @@ -45,7 +45,7 @@ static void __detach_extent_node(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_node *en) { rb_erase(&en->rb_node, &et->root); - et->count--; + atomic_dec(&et->node_cnt); atomic_dec(&sbi->total_ext_node); if (et->cached_en == en) @@ -68,11 +68,13 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) et->root = RB_ROOT; et->cached_en = NULL; rwlock_init(&et->lock); - atomic_set(&et->refcount, 0); - et->count = 0; - sbi->total_ext_tree++; + INIT_LIST_HEAD(&et->list); + atomic_set(&et->node_cnt, 0); + atomic_inc(&sbi->total_ext_tree); + } else { + atomic_dec(&sbi->total_zombie_tree); + list_del_init(&et->list); } - atomic_inc(&et->refcount); up_write(&sbi->extent_tree_lock); /* never died until evict_inode */ @@ -131,7 +133,7 @@ static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, { struct rb_node *node, *next; struct extent_node *en; - unsigned int count = et->count; + unsigned int count = atomic_read(&et->node_cnt); node = rb_first(&et->root); while (node) { @@ -152,7 +154,7 @@ static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, node = next; } - return count - et->count; + return count - atomic_read(&et->node_cnt); } static void __drop_largest_extent(struct inode *inode, @@ -164,34 +166,33 @@ static void __drop_largest_extent(struct inode *inode, largest->len = 0; } -void f2fs_drop_largest_extent(struct inode *inode, pgoff_t fofs) -{ - if (!f2fs_may_extent_tree(inode)) - return; - - __drop_largest_extent(inode, fofs, 1); -} - -void f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) +/* return true, if inode page is changed */ +bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_tree *et; struct extent_node *en; struct extent_info ei; - if (!f2fs_may_extent_tree(inode)) - return; + if (!f2fs_may_extent_tree(inode)) { + /* drop largest extent */ + if (i_ext && i_ext->len) { + i_ext->len = 0; + return true; + } + return false; + } et = __grab_extent_tree(inode); - if (!i_ext || le32_to_cpu(i_ext->len) < F2FS_MIN_EXTENT_LEN) - return; + if (!i_ext || !i_ext->len) + return false; set_extent_info(&ei, le32_to_cpu(i_ext->fofs), le32_to_cpu(i_ext->blk), le32_to_cpu(i_ext->len)); write_lock(&et->lock); - if (et->count) + if (atomic_read(&et->node_cnt)) goto out; en = __init_extent_tree(sbi, et, &ei); @@ -202,6 +203,7 @@ void f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) } out: write_unlock(&et->lock); + return false; } static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, @@ -549,45 +551,44 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) { struct extent_tree *treevec[EXT_TREE_VEC_SIZE]; + struct extent_tree *et, *next; struct extent_node *en, *tmp; unsigned long ino = F2FS_ROOT_INO(sbi); - struct radix_tree_root *root = &sbi->extent_tree_root; unsigned int found; unsigned int node_cnt = 0, tree_cnt = 0; int remained; + bool do_free = false; if (!test_opt(sbi, EXTENT_CACHE)) return 0; + if (!atomic_read(&sbi->total_zombie_tree)) + goto free_node; + if (!down_write_trylock(&sbi->extent_tree_lock)) goto out; /* 1. remove unreferenced extent tree */ - while ((found = radix_tree_gang_lookup(root, - (void **)treevec, ino, EXT_TREE_VEC_SIZE))) { - unsigned i; - - ino = treevec[found - 1]->ino + 1; - for (i = 0; i < found; i++) { - struct extent_tree *et = treevec[i]; - - if (!atomic_read(&et->refcount)) { - write_lock(&et->lock); - node_cnt += __free_extent_tree(sbi, et, true); - write_unlock(&et->lock); + list_for_each_entry_safe(et, next, &sbi->zombie_list, list) { + if (atomic_read(&et->node_cnt)) { + write_lock(&et->lock); + node_cnt += __free_extent_tree(sbi, et, true); + write_unlock(&et->lock); + } - radix_tree_delete(root, et->ino); - kmem_cache_free(extent_tree_slab, et); - sbi->total_ext_tree--; - tree_cnt++; + list_del_init(&et->list); + radix_tree_delete(&sbi->extent_tree_root, et->ino); + kmem_cache_free(extent_tree_slab, et); + atomic_dec(&sbi->total_ext_tree); + atomic_dec(&sbi->total_zombie_tree); + tree_cnt++; - if (node_cnt + tree_cnt >= nr_shrink) - goto unlock_out; - } - } + if (node_cnt + tree_cnt >= nr_shrink) + goto unlock_out; } up_write(&sbi->extent_tree_lock); +free_node: /* 2. remove LRU extent entries */ if (!down_write_trylock(&sbi->extent_tree_lock)) goto out; @@ -599,15 +600,19 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) if (!remained--) break; list_del_init(&en->list); + do_free = true; } spin_unlock(&sbi->extent_lock); + if (do_free == false) + goto unlock_out; + /* * reset ino for searching victims from beginning of global extent tree. */ ino = F2FS_ROOT_INO(sbi); - while ((found = radix_tree_gang_lookup(root, + while ((found = radix_tree_gang_lookup(&sbi->extent_tree_root, (void **)treevec, ino, EXT_TREE_VEC_SIZE))) { unsigned i; @@ -615,9 +620,13 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) for (i = 0; i < found; i++) { struct extent_tree *et = treevec[i]; - write_lock(&et->lock); - node_cnt += __free_extent_tree(sbi, et, false); - write_unlock(&et->lock); + if (!atomic_read(&et->node_cnt)) + continue; + + if (write_trylock(&et->lock)) { + node_cnt += __free_extent_tree(sbi, et, false); + write_unlock(&et->lock); + } if (node_cnt + tree_cnt >= nr_shrink) goto unlock_out; @@ -637,7 +646,7 @@ unsigned int f2fs_destroy_extent_node(struct inode *inode) struct extent_tree *et = F2FS_I(inode)->extent_tree; unsigned int node_cnt = 0; - if (!et) + if (!et || !atomic_read(&et->node_cnt)) return 0; write_lock(&et->lock); @@ -656,8 +665,12 @@ void f2fs_destroy_extent_tree(struct inode *inode) if (!et) return; - if (inode->i_nlink && !is_bad_inode(inode) && et->count) { - atomic_dec(&et->refcount); + if (inode->i_nlink && !is_bad_inode(inode) && + atomic_read(&et->node_cnt)) { + down_write(&sbi->extent_tree_lock); + list_add_tail(&et->list, &sbi->zombie_list); + atomic_inc(&sbi->total_zombie_tree); + up_write(&sbi->extent_tree_lock); return; } @@ -666,11 +679,10 @@ void f2fs_destroy_extent_tree(struct inode *inode) /* delete extent tree entry in radix tree */ down_write(&sbi->extent_tree_lock); - atomic_dec(&et->refcount); - f2fs_bug_on(sbi, atomic_read(&et->refcount) || et->count); + f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); radix_tree_delete(&sbi->extent_tree_root, inode->i_ino); kmem_cache_free(extent_tree_slab, et); - sbi->total_ext_tree--; + atomic_dec(&sbi->total_ext_tree); up_write(&sbi->extent_tree_lock); F2FS_I(inode)->extent_tree = NULL; @@ -722,7 +734,9 @@ void init_extent_cache_info(struct f2fs_sb_info *sbi) init_rwsem(&sbi->extent_tree_lock); INIT_LIST_HEAD(&sbi->extent_list); spin_lock_init(&sbi->extent_lock); - sbi->total_ext_tree = 0; + atomic_set(&sbi->total_ext_tree, 0); + INIT_LIST_HEAD(&sbi->zombie_list); + atomic_set(&sbi->total_zombie_tree, 0); atomic_set(&sbi->total_ext_node, 0); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9db5500d6..ff79054c6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -21,6 +21,7 @@ #include <linux/sched.h> #include <linux/vmalloc.h> #include <linux/bio.h> +#include <linux/blkdev.h> #ifdef CONFIG_F2FS_CHECK_FS #define f2fs_bug_on(sbi, condition) BUG_ON(condition) @@ -54,6 +55,7 @@ #define F2FS_MOUNT_FASTBOOT 0x00001000 #define F2FS_MOUNT_EXTENT_CACHE 0x00002000 #define F2FS_MOUNT_FORCE_FG_GC 0x00004000 +#define F2FS_MOUNT_DATA_FLUSH 0x00008000 #define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) @@ -125,6 +127,7 @@ enum { #define BATCHED_TRIM_BLOCKS(sbi) \ (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) #define DEF_CP_INTERVAL 60 /* 60 secs */ +#define DEF_IDLE_INTERVAL 120 /* 2 mins */ struct cp_control { int reason; @@ -158,13 +161,7 @@ struct ino_entry { nid_t ino; /* inode number */ }; -/* - * for the list of directory inodes or gc inodes. - * NOTE: there are two slab users for this structure, if we add/modify/delete - * fields in structure for one of slab users, it may affect fields or size of - * other one, in this condition, it's better to split both of slab and related - * data structure. - */ +/* for the list of inodes to be GCed */ struct inode_entry { struct list_head list; /* list head */ struct inode *inode; /* vfs inode pointer */ @@ -234,6 +231,7 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, #define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5) #define F2FS_IOC_GARBAGE_COLLECT _IO(F2FS_IOCTL_MAGIC, 6) #define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7) +#define F2FS_IOC_DEFRAGMENT _IO(F2FS_IOCTL_MAGIC, 8) #define F2FS_IOC_SET_ENCRYPTION_POLICY \ _IOR('f', 19, struct f2fs_encryption_policy) @@ -256,10 +254,16 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, /* * ioctl commands in 32 bit emulation */ -#define F2FS_IOC32_GETFLAGS FS_IOC32_GETFLAGS -#define F2FS_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#define F2FS_IOC32_GETFLAGS FS_IOC32_GETFLAGS +#define F2FS_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#define F2FS_IOC32_GETVERSION FS_IOC32_GETVERSION #endif +struct f2fs_defragment { + u64 start; + u64 len; +}; + /* * For INODE and NODE manager */ @@ -357,9 +361,9 @@ struct extent_tree { struct rb_root root; /* root of extent info rb-tree */ struct extent_node *cached_en; /* recently accessed extent node */ struct extent_info largest; /* largested extent info */ + struct list_head list; /* to be used by sbi->zombie_list */ rwlock_t lock; /* protect extent info rb-tree */ - atomic_t refcount; /* reference count of rb-tree */ - unsigned int count; /* # of extent node in rb-tree*/ + atomic_t node_cnt; /* # of extent node in rb-tree*/ }; /* @@ -434,8 +438,8 @@ struct f2fs_inode_info { unsigned int clevel; /* maximum level of given file name */ nid_t i_xattr_nid; /* node id that contains xattrs */ unsigned long long xattr_ver; /* cp version of xattr modification */ - struct inode_entry *dirty_dir; /* the pointer of dirty dir */ + struct list_head dirty_list; /* linked in global dirty list */ struct list_head inmem_pages; /* inmemory pages managed by f2fs */ struct mutex inmem_lock; /* lock for inmemory pages */ @@ -544,6 +548,7 @@ struct dnode_of_data { nid_t nid; /* node id of the direct node block */ unsigned int ofs_in_node; /* data offset in the node page */ bool inode_page_locked; /* inode page is locked or not */ + bool node_changed; /* is node block changed */ block_t data_blkaddr; /* block address of the node block */ }; @@ -647,6 +652,7 @@ struct f2fs_sm_info { enum count_type { F2FS_WRITEBACK, F2FS_DIRTY_DENTS, + F2FS_DIRTY_DATA, F2FS_DIRTY_NODES, F2FS_DIRTY_META, F2FS_INMEM_PAGES, @@ -695,6 +701,12 @@ struct f2fs_bio_info { struct rw_semaphore io_rwsem; /* blocking op for bio */ }; +enum inode_type { + DIR_INODE, /* for dirty dir inode */ + FILE_INODE, /* for dirty regular/symlink inode */ + NR_INODE_TYPE, +}; + /* for inner inode cache management */ struct inode_management { struct radix_tree_root ino_root; /* ino entry array */ @@ -711,11 +723,17 @@ enum { SBI_POR_DOING, /* recovery is doing or not */ }; +enum { + CP_TIME, + REQ_TIME, + MAX_TIME, +}; + struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ - struct buffer_head *raw_super_buf; /* buffer head of raw sb */ struct f2fs_super_block *raw_super; /* raw super block pointer */ + int valid_super_block; /* valid super block no */ int s_flag; /* flags for sbi */ /* for node-related operations */ @@ -737,23 +755,26 @@ struct f2fs_sb_info { struct rw_semaphore node_write; /* locking node writes */ struct mutex writepages; /* mutex for writepages() */ wait_queue_head_t cp_wait; - long cp_expires, cp_interval; /* next expected periodic cp */ + unsigned long last_time[MAX_TIME]; /* to store time in jiffies */ + long interval_time[MAX_TIME]; /* to store thresholds */ struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */ /* for orphan inode, use 0'th array */ unsigned int max_orphans; /* max orphan inodes */ - /* for directory inode management */ - struct list_head dir_inode_list; /* dir inode list */ - spinlock_t dir_inode_lock; /* for dir inode list lock */ + /* for inode management */ + struct list_head inode_list[NR_INODE_TYPE]; /* dirty inode list */ + spinlock_t inode_lock[NR_INODE_TYPE]; /* for dirty inode list lock */ /* for extent tree cache */ struct radix_tree_root extent_tree_root;/* cache extent cache entries */ struct rw_semaphore extent_tree_lock; /* locking extent radix tree */ struct list_head extent_list; /* lru list for shrinker */ spinlock_t extent_lock; /* locking extent lru list */ - int total_ext_tree; /* extent tree count */ + atomic_t total_ext_tree; /* extent tree count */ + struct list_head zombie_list; /* extent zombie tree list */ + atomic_t total_zombie_tree; /* extent zombie tree count */ atomic_t total_ext_node; /* extent info count */ /* basic filesystem units */ @@ -771,6 +792,7 @@ struct f2fs_sb_info { unsigned int total_node_count; /* total node block count */ unsigned int total_valid_node_count; /* valid node block count */ unsigned int total_valid_inode_count; /* valid inode count */ + loff_t max_file_blocks; /* max block index of file */ int active_logs; /* # of active logs */ int dir_level; /* directory level */ @@ -809,7 +831,7 @@ struct f2fs_sb_info { atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ int bg_gc; /* background gc calls */ - unsigned int n_dirty_dirs; /* # of dir inodes */ + unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ #endif unsigned int last_victim[2]; /* last victim segment # */ spinlock_t stat_lock; /* lock for stat operations */ @@ -824,6 +846,31 @@ struct f2fs_sb_info { unsigned int shrinker_run_no; }; +static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type) +{ + sbi->last_time[type] = jiffies; +} + +static inline bool f2fs_time_over(struct f2fs_sb_info *sbi, int type) +{ + struct timespec ts = {sbi->interval_time[type], 0}; + unsigned long interval = timespec_to_jiffies(&ts); + + return time_after(jiffies, sbi->last_time[type] + interval); +} + +static inline bool is_idle(struct f2fs_sb_info *sbi) +{ + struct block_device *bdev = sbi->sb->s_bdev; + struct request_queue *q = bdev_get_queue(bdev); + struct request_list *rl = &q->root_rl; + + if (rl->count[BLK_RW_SYNC] || rl->count[BLK_RW_ASYNC]) + return 0; + + return f2fs_time_over(sbi, REQ_TIME); +} + /* * Inline functions */ @@ -1059,8 +1106,8 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) static inline void inode_inc_dirty_pages(struct inode *inode) { atomic_inc(&F2FS_I(inode)->dirty_pages); - if (S_ISDIR(inode->i_mode)) - inc_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS); + inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); } static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) @@ -1075,9 +1122,8 @@ static inline void inode_dec_dirty_pages(struct inode *inode) return; atomic_dec(&F2FS_I(inode)->dirty_pages); - - if (S_ISDIR(inode->i_mode)) - dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS); + dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); } static inline int get_pages(struct f2fs_sb_info *sbi, int count_type) @@ -1092,8 +1138,7 @@ static inline int get_dirty_pages(struct inode *inode) static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) { - unsigned int pages_per_sec = sbi->segs_per_sec * - (1 << sbi->log_blocks_per_seg); + unsigned int pages_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg; return ((get_pages(sbi, block_type) + pages_per_sec - 1) >> sbi->log_blocks_per_seg) / sbi->segs_per_sec; } @@ -1416,6 +1461,8 @@ enum { FI_DROP_CACHE, /* drop dirty page cache */ FI_DATA_EXIST, /* indicate data exists */ FI_INLINE_DOTS, /* indicate inline dot dentries */ + FI_DO_DEFRAG, /* indicate defragment is running */ + FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ }; static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) @@ -1602,13 +1649,11 @@ static inline bool is_dot_dotdot(const struct qstr *str) static inline bool f2fs_may_extent_tree(struct inode *inode) { - mode_t mode = inode->i_mode; - if (!test_opt(F2FS_I_SB(inode), EXTENT_CACHE) || is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) return false; - return S_ISREG(mode); + return S_ISREG(inode->i_mode); } static inline void *f2fs_kvmalloc(size_t size, gfp_t flags) @@ -1661,8 +1706,8 @@ long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long); void f2fs_set_inode_flags(struct inode *); struct inode *f2fs_iget(struct super_block *, unsigned long); int try_to_free_nats(struct f2fs_sb_info *, int); -void update_inode(struct inode *, struct page *); -void update_inode_page(struct inode *); +int update_inode(struct inode *, struct page *); +int update_inode_page(struct inode *); int f2fs_write_inode(struct inode *, struct writeback_control *); void f2fs_evict_inode(struct inode *); void handle_failed_inode(struct inode *); @@ -1767,7 +1812,7 @@ void destroy_node_manager_caches(void); */ void register_inmem_page(struct inode *, struct page *); int commit_inmem_pages(struct inode *, bool); -void f2fs_balance_fs(struct f2fs_sb_info *); +void f2fs_balance_fs(struct f2fs_sb_info *, bool); void f2fs_balance_fs_bg(struct f2fs_sb_info *); int f2fs_issue_flush(struct f2fs_sb_info *); int create_flush_cmd_control(struct f2fs_sb_info *); @@ -1813,9 +1858,9 @@ bool is_valid_blkaddr(struct f2fs_sb_info *, block_t, int); int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int, bool); void ra_meta_pages_cond(struct f2fs_sb_info *, pgoff_t); long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); -void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type); -void remove_dirty_inode(struct f2fs_sb_info *, nid_t, int type); -void release_dirty_inode(struct f2fs_sb_info *); +void add_ino_entry(struct f2fs_sb_info *, nid_t, int type); +void remove_ino_entry(struct f2fs_sb_info *, nid_t, int type); +void release_ino_entry(struct f2fs_sb_info *); bool exist_written_data(struct f2fs_sb_info *, nid_t, int); int acquire_orphan_inode(struct f2fs_sb_info *); void release_orphan_inode(struct f2fs_sb_info *); @@ -1825,9 +1870,9 @@ int recover_orphan_inodes(struct f2fs_sb_info *); int get_valid_checkpoint(struct f2fs_sb_info *); void update_dirty_page(struct inode *, struct page *); void add_dirty_dir_inode(struct inode *); -void remove_dirty_dir_inode(struct inode *); -void sync_dirty_dir_inodes(struct f2fs_sb_info *); -void write_checkpoint(struct f2fs_sb_info *, struct cp_control *); +void remove_dirty_inode(struct inode *); +int sync_dirty_inodes(struct f2fs_sb_info *, enum inode_type); +int write_checkpoint(struct f2fs_sb_info *, struct cp_control *); void init_ino_entry_info(struct f2fs_sb_info *); int __init create_checkpoint_caches(void); void destroy_checkpoint_caches(void); @@ -1847,6 +1892,7 @@ struct page *find_data_page(struct inode *, pgoff_t); struct page *get_lock_data_page(struct inode *, pgoff_t, bool); struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); int do_write_data_page(struct f2fs_io_info *); +int f2fs_map_blocks(struct inode *, struct f2fs_map_blocks *, int, int); int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64); void f2fs_invalidate_page(struct page *, unsigned int, unsigned int); int f2fs_release_page(struct page *, gfp_t); @@ -1877,8 +1923,9 @@ struct f2fs_stat_info { int main_area_segs, main_area_sections, main_area_zones; unsigned long long hit_largest, hit_cached, hit_rbtree; unsigned long long hit_total, total_ext; - int ext_tree, ext_node; - int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta; + int ext_tree, zombie_tree, ext_node; + int ndirty_node, ndirty_meta; + int ndirty_dent, ndirty_dirs, ndirty_data, ndirty_files; int nats, dirty_nats, sits, dirty_sits, fnids; int total_count, utilization; int bg_gc, inmem_pages, wb_pages; @@ -1888,7 +1935,7 @@ struct f2fs_stat_info { int util_free, util_valid, util_invalid; int rsvd_segs, overp_segs; int dirty_count, node_pages, meta_pages; - int prefree_count, call_count, cp_count; + int prefree_count, call_count, cp_count, bg_cp_count; int tot_segs, node_segs, data_segs, free_segs, free_secs; int bg_node_segs, bg_data_segs; int tot_blks, data_blks, node_blks; @@ -1909,10 +1956,11 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) } #define stat_inc_cp_count(si) ((si)->cp_count++) +#define stat_inc_bg_cp_count(si) ((si)->bg_cp_count++) #define stat_inc_call_count(si) ((si)->call_count++) #define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++) -#define stat_inc_dirty_dir(sbi) ((sbi)->n_dirty_dirs++) -#define stat_dec_dirty_dir(sbi) ((sbi)->n_dirty_dirs--) +#define stat_inc_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]++) +#define stat_dec_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]--) #define stat_inc_total_hit(sbi) (atomic64_inc(&(sbi)->total_hit_ext)) #define stat_inc_rbtree_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_rbtree)) #define stat_inc_largest_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_largest)) @@ -1987,14 +2035,15 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) int f2fs_build_stats(struct f2fs_sb_info *); void f2fs_destroy_stats(struct f2fs_sb_info *); -void __init f2fs_create_root_stats(void); +int __init f2fs_create_root_stats(void); void f2fs_destroy_root_stats(void); #else #define stat_inc_cp_count(si) +#define stat_inc_bg_cp_count(si) #define stat_inc_call_count(si) #define stat_inc_bggc_count(si) -#define stat_inc_dirty_dir(sbi) -#define stat_dec_dirty_dir(sbi) +#define stat_inc_dirty_inode(sbi, type) +#define stat_dec_dirty_inode(sbi, type) #define stat_inc_total_hit(sb) #define stat_inc_rbtree_node_hit(sb) #define stat_inc_largest_node_hit(sbi) @@ -2015,7 +2064,7 @@ void f2fs_destroy_root_stats(void); static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } -static inline void __init f2fs_create_root_stats(void) { } +static inline int __init f2fs_create_root_stats(void) { return 0; } static inline void f2fs_destroy_root_stats(void) { } #endif @@ -2069,8 +2118,7 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *); * extent_cache.c */ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *, int); -void f2fs_drop_largest_extent(struct inode *, pgoff_t); -void f2fs_init_extent_tree(struct inode *, struct f2fs_extent *); +bool f2fs_init_extent_tree(struct inode *, struct f2fs_extent *); unsigned int f2fs_destroy_extent_node(struct inode *); void f2fs_destroy_extent_tree(struct inode *); bool f2fs_lookup_extent_cache(struct inode *, pgoff_t, struct extent_info *); @@ -2121,7 +2169,7 @@ static inline int f2fs_sb_has_crypto(struct super_block *sb) static inline bool f2fs_may_encrypt(struct inode *inode) { #ifdef CONFIG_F2FS_FS_ENCRYPTION - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)); #else diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index a197215ad..ea272be62 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -40,8 +40,6 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, struct dnode_of_data dn; int err; - f2fs_balance_fs(sbi); - sb_start_pagefault(inode->i_sb); f2fs_bug_on(sbi, f2fs_has_inline_data(inode)); @@ -57,6 +55,8 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); + f2fs_balance_fs(sbi, dn.node_changed); + file_update_time(vma->vm_file); lock_page(page); if (unlikely(page->mapping != inode->i_mapping || @@ -96,6 +96,7 @@ mapped: clear_cold_data(page); out: sb_end_pagefault(inode->i_sb); + f2fs_update_time(sbi, REQ_TIME); return block_page_mkwrite_return(err); } @@ -201,7 +202,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trace_f2fs_sync_file_enter(inode); /* if fdatasync is triggered, let's do in-place-update */ - if (get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks) + if (datasync || get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks) set_inode_flag(fi, FI_NEED_IPU); ret = filemap_write_and_wait_range(inode->i_mapping, start, end); clear_inode_flag(fi, FI_NEED_IPU); @@ -233,9 +234,6 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) goto out; } go_write: - /* guarantee free sections for fsync */ - f2fs_balance_fs(sbi); - /* * Both of fdatasync() and fsync() are able to be recovered from * sudden-power-off. @@ -261,8 +259,10 @@ sync_nodes: sync_node_pages(sbi, ino, &wbc); /* if cp_error was enabled, we should avoid infinite loop */ - if (unlikely(f2fs_cp_error(sbi))) + if (unlikely(f2fs_cp_error(sbi))) { + ret = -EIO; goto out; + } if (need_inode_block_update(sbi, ino)) { mark_inode_dirty_sync(inode); @@ -275,12 +275,13 @@ sync_nodes: goto out; /* once recovery info is written, don't need to tack this */ - remove_dirty_inode(sbi, ino, APPEND_INO); + remove_ino_entry(sbi, ino, APPEND_INO); clear_inode_flag(fi, FI_APPEND_WRITE); flush_out: - remove_dirty_inode(sbi, ino, UPDATE_INO); + remove_ino_entry(sbi, ino, UPDATE_INO); clear_inode_flag(fi, FI_UPDATE_WRITE); ret = f2fs_issue_flush(sbi); + f2fs_update_time(sbi, REQ_TIME); out: trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); f2fs_trace_ios(NULL, 1); @@ -332,7 +333,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) loff_t isize; int err = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); isize = i_size_read(inode); if (offset >= isize) @@ -387,10 +388,10 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) found: if (whence == SEEK_HOLE && data_ofs > isize) data_ofs = isize; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return vfs_setpos(file, data_ofs, maxbytes); fail: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return -ENXIO; } @@ -418,19 +419,18 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence) static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); + int err; if (f2fs_encrypted_inode(inode)) { - int err = f2fs_get_encryption_info(inode); + err = f2fs_get_encryption_info(inode); if (err) return 0; } /* we don't need to use inline_data strictly */ - if (f2fs_has_inline_data(inode)) { - int err = f2fs_convert_inline_inode(inode); - if (err) - return err; - } + err = f2fs_convert_inline_inode(inode); + if (err) + return err; file_accessed(file); vma->vm_ops = &f2fs_file_vm_ops; @@ -483,11 +483,11 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count) F2FS_I(dn->inode)) + ofs; f2fs_update_extent_cache_range(dn, fofs, 0, len); dec_valid_block_count(sbi, dn->inode, nr_free); - set_page_dirty(dn->node_page); sync_inode_page(dn); } dn->ofs_in_node = ofs; + f2fs_update_time(sbi, REQ_TIME); trace_f2fs_truncate_data_blocks_range(dn->inode, dn->nid, dn->ofs_in_node, nr_free); return nr_free; @@ -604,7 +604,7 @@ int f2fs_truncate(struct inode *inode, bool lock) trace_f2fs_truncate(inode); /* we should check inline_data size */ - if (f2fs_has_inline_data(inode) && !f2fs_may_inline_data(inode)) { + if (!f2fs_may_inline_data(inode)) { err = f2fs_convert_inline_inode(inode); if (err) return err; @@ -679,13 +679,20 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) err = f2fs_truncate(inode, true); if (err) return err; - f2fs_balance_fs(F2FS_I_SB(inode)); + f2fs_balance_fs(F2FS_I_SB(inode), true); } else { /* * do not trim all blocks after i_size if target size is * larger than i_size. */ truncate_setsize(inode, attr->ia_size); + + /* should convert inline inode here */ + if (!f2fs_may_inline_data(inode)) { + err = f2fs_convert_inline_inode(inode); + if (err) + return err; + } inode->i_mtime = inode->i_ctime = CURRENT_TIME; } } @@ -727,7 +734,7 @@ static int fill_zero(struct inode *inode, pgoff_t index, if (!len) return 0; - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); page = get_new_data_page(inode, NULL, index, false); @@ -778,13 +785,11 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) { pgoff_t pg_start, pg_end; loff_t off_start, off_end; - int ret = 0; + int ret; - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; - } + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; @@ -815,7 +820,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) loff_t blk_start, blk_end; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); blk_start = (loff_t)pg_start << PAGE_CACHE_SHIFT; blk_end = (loff_t)pg_end << PAGE_CACHE_SHIFT; @@ -918,7 +923,7 @@ static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) int ret = 0; for (; end < nrpages; start++, end++) { - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); ret = __exchange_data_block(inode, end, start, true); f2fs_unlock_op(sbi); @@ -941,13 +946,9 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1)) return -EINVAL; - f2fs_balance_fs(F2FS_I_SB(inode)); - - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; - } + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; pg_start = offset >> PAGE_CACHE_SHIFT; pg_end = (offset + len) >> PAGE_CACHE_SHIFT; @@ -991,13 +992,9 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, if (ret) return ret; - f2fs_balance_fs(sbi); - - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; - } + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; ret = filemap_write_and_wait_range(mapping, offset, offset + len - 1); if (ret) @@ -1104,13 +1101,11 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1)) return -EINVAL; - f2fs_balance_fs(sbi); + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; - } + f2fs_balance_fs(sbi, true); ret = truncate_blocks(inode, i_size_read(inode), true); if (ret) @@ -1154,17 +1149,15 @@ static int expand_inode_data(struct inode *inode, loff_t offset, loff_t off_start, off_end; int ret = 0; - f2fs_balance_fs(sbi); - ret = inode_newsize_ok(inode, (len + offset)); if (ret) return ret; - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; - } + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + + f2fs_balance_fs(sbi, true); pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; @@ -1226,7 +1219,7 @@ static long f2fs_fallocate(struct file *file, int mode, FALLOC_FL_INSERT_RANGE)) return -EOPNOTSUPP; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (mode & FALLOC_FL_PUNCH_HOLE) { if (offset >= inode->i_size) @@ -1246,10 +1239,11 @@ static long f2fs_fallocate(struct file *file, int mode, if (!ret) { inode->i_mtime = inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); trace_f2fs_fallocate(inode, mode, offset, len, ret); return ret; @@ -1313,13 +1307,13 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) flags = f2fs_mask_flags(inode->i_mode, flags); - mutex_lock(&inode->i_mutex); + inode_lock(inode); oldflags = fi->i_flags; if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { if (!capable(CAP_LINUX_IMMUTABLE)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); ret = -EPERM; goto out; } @@ -1328,7 +1322,7 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) flags = flags & FS_FL_USER_MODIFIABLE; flags |= oldflags & ~FS_FL_USER_MODIFIABLE; fi->i_flags = flags; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); f2fs_set_inode_flags(inode); inode->i_ctime = CURRENT_TIME; @@ -1353,8 +1347,6 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; - f2fs_balance_fs(F2FS_I_SB(inode)); - if (f2fs_is_atomic_file(inode)) return 0; @@ -1363,6 +1355,8 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) return ret; set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + return 0; } @@ -1384,8 +1378,10 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) if (f2fs_is_atomic_file(inode)) { clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); ret = commit_inmem_pages(inode, false); - if (ret) + if (ret) { + set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); goto err_out; + } } ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0); @@ -1410,6 +1406,7 @@ static int f2fs_ioc_start_volatile_write(struct file *filp) return ret; set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return 0; } @@ -1441,13 +1438,17 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) if (ret) return ret; - f2fs_balance_fs(F2FS_I_SB(inode)); - - clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); - clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); - commit_inmem_pages(inode, true); + if (f2fs_is_atomic_file(inode)) { + clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); + commit_inmem_pages(inode, true); + } + if (f2fs_is_volatile_file(inode)) { + clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); + ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0); + } mnt_drop_write_file(filp); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return ret; } @@ -1487,6 +1488,7 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) default: return -EINVAL; } + f2fs_update_time(sbi, REQ_TIME); return 0; } @@ -1517,6 +1519,7 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) if (copy_to_user((struct fstrim_range __user *)arg, &range, sizeof(range))) return -EFAULT; + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return 0; } @@ -1540,6 +1543,7 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) sizeof(policy))) return -EFAULT; + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return f2fs_process_policy(&policy, inode); #else return -EOPNOTSUPP; @@ -1586,13 +1590,13 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) generate_random_uuid(sbi->raw_super->encrypt_pw_salt); err = f2fs_commit_super(sbi, false); - - mnt_drop_write_file(filp); if (err) { /* undo new data */ memset(sbi->raw_super->encrypt_pw_salt, 0, 16); + mnt_drop_write_file(filp); return err; } + mnt_drop_write_file(filp); got_it: if (copy_to_user((__u8 __user *)arg, sbi->raw_super->encrypt_pw_salt, 16)) @@ -1629,7 +1633,6 @@ static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct cp_control cpc; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1637,13 +1640,196 @@ static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) if (f2fs_readonly(sbi->sb)) return -EROFS; - cpc.reason = __get_cp_reason(sbi); + return f2fs_sync_fs(sbi->sb, 1); +} - mutex_lock(&sbi->gc_mutex); - write_checkpoint(sbi, &cpc); - mutex_unlock(&sbi->gc_mutex); +static int f2fs_defragment_range(struct f2fs_sb_info *sbi, + struct file *filp, + struct f2fs_defragment *range) +{ + struct inode *inode = file_inode(filp); + struct f2fs_map_blocks map; + struct extent_info ei; + pgoff_t pg_start, pg_end; + unsigned int blk_per_seg = sbi->blocks_per_seg; + unsigned int total = 0, sec_num; + unsigned int pages_per_sec = sbi->segs_per_sec * blk_per_seg; + block_t blk_end = 0; + bool fragmented = false; + int err; - return 0; + /* if in-place-update policy is enabled, don't waste time here */ + if (need_inplace_update(inode)) + return -EINVAL; + + pg_start = range->start >> PAGE_CACHE_SHIFT; + pg_end = (range->start + range->len) >> PAGE_CACHE_SHIFT; + + f2fs_balance_fs(sbi, true); + + inode_lock(inode); + + /* writeback all dirty pages in the range */ + err = filemap_write_and_wait_range(inode->i_mapping, range->start, + range->start + range->len - 1); + if (err) + goto out; + + /* + * lookup mapping info in extent cache, skip defragmenting if physical + * block addresses are continuous. + */ + if (f2fs_lookup_extent_cache(inode, pg_start, &ei)) { + if (ei.fofs + ei.len >= pg_end) + goto out; + } + + map.m_lblk = pg_start; + + /* + * lookup mapping info in dnode page cache, skip defragmenting if all + * physical block addresses are continuous even if there are hole(s) + * in logical blocks. + */ + while (map.m_lblk < pg_end) { + map.m_len = pg_end - map.m_lblk; + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ); + if (err) + goto out; + + if (!(map.m_flags & F2FS_MAP_FLAGS)) { + map.m_lblk++; + continue; + } + + if (blk_end && blk_end != map.m_pblk) { + fragmented = true; + break; + } + blk_end = map.m_pblk + map.m_len; + + map.m_lblk += map.m_len; + } + + if (!fragmented) + goto out; + + map.m_lblk = pg_start; + map.m_len = pg_end - pg_start; + + sec_num = (map.m_len + pages_per_sec - 1) / pages_per_sec; + + /* + * make sure there are enough free section for LFS allocation, this can + * avoid defragment running in SSR mode when free section are allocated + * intensively + */ + if (has_not_enough_free_secs(sbi, sec_num)) { + err = -EAGAIN; + goto out; + } + + while (map.m_lblk < pg_end) { + pgoff_t idx; + int cnt = 0; + +do_map: + map.m_len = pg_end - map.m_lblk; + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ); + if (err) + goto clear_out; + + if (!(map.m_flags & F2FS_MAP_FLAGS)) { + map.m_lblk++; + continue; + } + + set_inode_flag(F2FS_I(inode), FI_DO_DEFRAG); + + idx = map.m_lblk; + while (idx < map.m_lblk + map.m_len && cnt < blk_per_seg) { + struct page *page; + + page = get_lock_data_page(inode, idx, true); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto clear_out; + } + + set_page_dirty(page); + f2fs_put_page(page, 1); + + idx++; + cnt++; + total++; + } + + map.m_lblk = idx; + + if (idx < pg_end && cnt < blk_per_seg) + goto do_map; + + clear_inode_flag(F2FS_I(inode), FI_DO_DEFRAG); + + err = filemap_fdatawrite(inode->i_mapping); + if (err) + goto out; + } +clear_out: + clear_inode_flag(F2FS_I(inode), FI_DO_DEFRAG); +out: + inode_unlock(inode); + if (!err) + range->len = (u64)total << PAGE_CACHE_SHIFT; + return err; +} + +static int f2fs_ioc_defragment(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_defragment range; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + err = mnt_want_write_file(filp); + if (err) + return err; + + if (f2fs_readonly(sbi->sb)) { + err = -EROFS; + goto out; + } + + if (copy_from_user(&range, (struct f2fs_defragment __user *)arg, + sizeof(range))) { + err = -EFAULT; + goto out; + } + + /* verify alignment of offset & size */ + if (range.start & (F2FS_BLKSIZE - 1) || + range.len & (F2FS_BLKSIZE - 1)) { + err = -EINVAL; + goto out; + } + + err = f2fs_defragment_range(sbi, filp, &range); + f2fs_update_time(sbi, REQ_TIME); + if (err < 0) + goto out; + + if (copy_to_user((struct f2fs_defragment __user *)arg, &range, + sizeof(range))) + err = -EFAULT; +out: + mnt_drop_write_file(filp); + return err; } long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) @@ -1679,6 +1865,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_gc(filp, arg); case F2FS_IOC_WRITE_CHECKPOINT: return f2fs_ioc_write_checkpoint(filp, arg); + case F2FS_IOC_DEFRAGMENT: + return f2fs_ioc_defragment(filp, arg); default: return -ENOTTY; } @@ -1706,6 +1894,22 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC32_SETFLAGS: cmd = F2FS_IOC_SETFLAGS; break; + case F2FS_IOC32_GETVERSION: + cmd = F2FS_IOC_GETVERSION; + break; + case F2FS_IOC_START_ATOMIC_WRITE: + case F2FS_IOC_COMMIT_ATOMIC_WRITE: + case F2FS_IOC_START_VOLATILE_WRITE: + case F2FS_IOC_RELEASE_VOLATILE_WRITE: + case F2FS_IOC_ABORT_VOLATILE_WRITE: + case F2FS_IOC_SHUTDOWN: + case F2FS_IOC_SET_ENCRYPTION_POLICY: + case F2FS_IOC_GET_ENCRYPTION_PWSALT: + case F2FS_IOC_GET_ENCRYPTION_POLICY: + case F2FS_IOC_GARBAGE_COLLECT: + case F2FS_IOC_WRITE_CHECKPOINT: + case F2FS_IOC_DEFRAGMENT: + break; default: return -ENOIOCTLCMD; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index fedbf67a0..f610c2a9b 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -16,7 +16,6 @@ #include <linux/kthread.h> #include <linux/delay.h> #include <linux/freezer.h> -#include <linux/blkdev.h> #include "f2fs.h" #include "node.h" @@ -173,9 +172,9 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi, { /* SSR allocates in a segment unit */ if (p->alloc_mode == SSR) - return 1 << sbi->log_blocks_per_seg; + return sbi->blocks_per_seg; if (p->gc_mode == GC_GREEDY) - return (1 << sbi->log_blocks_per_seg) * p->ofs_unit; + return sbi->blocks_per_seg * p->ofs_unit; else if (p->gc_mode == GC_CB) return UINT_MAX; else /* No other gc_mode */ @@ -832,8 +831,10 @@ gc_more: if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) goto stop; - if (unlikely(f2fs_cp_error(sbi))) + if (unlikely(f2fs_cp_error(sbi))) { + ret = -EIO; goto stop; + } if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed)) { gc_type = FG_GC; diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index b4a65be9f..a993967dc 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -100,11 +100,3 @@ static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi) return true; return false; } - -static inline int is_idle(struct f2fs_sb_info *sbi) -{ - struct block_device *bdev = sbi->sb->s_bdev; - struct request_queue *q = bdev_get_queue(bdev); - struct request_list *rl = &q->root_rl; - return !(rl->count[BLK_RW_SYNC]) && !(rl->count[BLK_RW_ASYNC]); -} diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index bda712646..c3f0b7d4c 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -16,9 +16,6 @@ bool f2fs_may_inline_data(struct inode *inode) { - if (!test_opt(F2FS_I_SB(inode), INLINE_DATA)) - return false; - if (f2fs_is_atomic_file(inode)) return false; @@ -177,6 +174,9 @@ int f2fs_convert_inline_inode(struct inode *inode) struct page *ipage, *page; int err = 0; + if (!f2fs_has_inline_data(inode)) + return 0; + page = grab_cache_page(inode->i_mapping, 0); if (!page) return -ENOMEM; @@ -199,6 +199,9 @@ out: f2fs_unlock_op(sbi); f2fs_put_page(page, 1); + + f2fs_balance_fs(sbi, dn.node_changed); + return err; } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 97e20deca..2adeff26b 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -138,7 +138,8 @@ static int do_read_inode(struct inode *inode) fi->i_pino = le32_to_cpu(ri->i_pino); fi->i_dir_level = ri->i_dir_level; - f2fs_init_extent_tree(inode, &ri->i_ext); + if (f2fs_init_extent_tree(inode, &ri->i_ext)) + set_page_dirty(node_page); get_inline_info(fi, ri); @@ -202,6 +203,7 @@ make_now: inode->i_op = &f2fs_encrypted_symlink_inode_operations; else inode->i_op = &f2fs_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &f2fs_dblock_aops; } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { @@ -221,7 +223,7 @@ bad_inode: return ERR_PTR(ret); } -void update_inode(struct inode *inode, struct page *node_page) +int update_inode(struct inode *inode, struct page *node_page) { struct f2fs_inode *ri; @@ -259,15 +261,16 @@ void update_inode(struct inode *inode, struct page *node_page) __set_inode_rdev(inode, ri); set_cold_node(inode, node_page); - set_page_dirty(node_page); - clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE); + + return set_page_dirty(node_page); } -void update_inode_page(struct inode *inode) +int update_inode_page(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *node_page; + int ret = 0; retry: node_page = get_node_page(sbi, inode->i_ino); if (IS_ERR(node_page)) { @@ -278,10 +281,11 @@ retry: } else if (err != -ENOENT) { f2fs_stop_checkpoint(sbi); } - return; + return 0; } - update_inode(inode, node_page); + ret = update_inode(inode, node_page); f2fs_put_page(node_page, 1); + return ret; } int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) @@ -299,9 +303,8 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) * We need to balance fs here to prevent from producing dirty node pages * during the urgent cleaning time when runing out of free sections. */ - update_inode_page(inode); - - f2fs_balance_fs(sbi); + if (update_inode_page(inode)) + f2fs_balance_fs(sbi, true); return 0; } @@ -327,7 +330,7 @@ void f2fs_evict_inode(struct inode *inode) goto out_clear; f2fs_bug_on(sbi, get_dirty_pages(inode)); - remove_dirty_dir_inode(inode); + remove_dirty_inode(inode); f2fs_destroy_extent_tree(inode); @@ -357,9 +360,9 @@ no_delete: if (xnid) invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); if (is_inode_flag_set(fi, FI_APPEND_WRITE)) - add_dirty_inode(sbi, inode->i_ino, APPEND_INO); + add_ino_entry(sbi, inode->i_ino, APPEND_INO); if (is_inode_flag_set(fi, FI_UPDATE_WRITE)) - add_dirty_inode(sbi, inode->i_ino, UPDATE_INO); + add_ino_entry(sbi, inode->i_ino, UPDATE_INO); if (is_inode_flag_set(fi, FI_FREE_NID)) { if (err && err != -ENOENT) alloc_nid_done(sbi, inode->i_ino); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 2c32110f9..6f944e5eb 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -60,7 +60,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) f2fs_set_encrypted_inode(inode); - if (f2fs_may_inline_data(inode)) + if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); if (f2fs_may_inline_dentry(inode)) set_inode_flag(F2FS_I(inode), FI_INLINE_DENTRY); @@ -128,8 +128,6 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, nid_t ino = 0; int err; - f2fs_balance_fs(sbi); - inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -142,6 +140,8 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, inode->i_mapping->a_ops = &f2fs_dblock_aops; ino = inode->i_ino; + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -172,7 +172,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, !f2fs_is_child_context_consistent_with_parent(dir, inode)) return -EPERM; - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); inode->i_ctime = CURRENT_TIME; ihold(inode); @@ -214,6 +214,15 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) struct page *page; int err = 0; + if (f2fs_readonly(sbi->sb)) { + f2fs_msg(sbi->sb, KERN_INFO, + "skip recovering inline_dots inode (ino:%lu, pino:%u) " + "in readonly mountpoint", dir->i_ino, pino); + return 0; + } + + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); de = f2fs_find_entry(dir, &dot, &page); @@ -288,12 +297,13 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) int err = -ENOENT; trace_f2fs_unlink_enter(dir, dentry); - f2fs_balance_fs(sbi); de = f2fs_find_entry(dir, &dentry->d_name, &page); if (!de) goto fail; + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = acquire_orphan_inode(sbi); if (err) { @@ -315,12 +325,15 @@ fail: return err; } -static const char *f2fs_follow_link(struct dentry *dentry, void **cookie) +static const char *f2fs_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { - const char *link = page_follow_link_light(dentry, cookie); + const char *link = page_get_link(dentry, inode, done); if (!IS_ERR(link) && !*link) { /* this is broken symlink case */ - page_put_link(NULL, *cookie); + do_delayed_call(done); + clear_delayed_call(done); link = ERR_PTR(-ENOENT); } return link; @@ -341,8 +354,6 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, if (len > dir->i_sb->s_blocksize) return -ENAMETOOLONG; - f2fs_balance_fs(sbi); - inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -351,8 +362,11 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, inode->i_op = &f2fs_encrypted_symlink_inode_operations; else inode->i_op = &f2fs_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &f2fs_dblock_aops; + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -433,8 +447,6 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct inode *inode; int err; - f2fs_balance_fs(sbi); - inode = f2fs_new_inode(dir, S_IFDIR | mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -444,6 +456,8 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_mapping->a_ops = &f2fs_dblock_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO); + f2fs_balance_fs(sbi, true); + set_inode_flag(F2FS_I(inode), FI_INC_LINK); f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); @@ -481,8 +495,6 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, struct inode *inode; int err = 0; - f2fs_balance_fs(sbi); - inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -490,6 +502,8 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &f2fs_special_inode_operations; + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -516,9 +530,6 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, struct inode *inode; int err; - if (!whiteout) - f2fs_balance_fs(sbi); - inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -532,6 +543,8 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, inode->i_mapping->a_ops = &f2fs_dblock_aops; } + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = acquire_orphan_inode(sbi); if (err) @@ -604,8 +617,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; } - f2fs_balance_fs(sbi); - old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_entry) goto out; @@ -635,6 +646,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (!new_entry) goto out_whiteout; + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = acquire_orphan_inode(sbi); @@ -666,6 +679,8 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, update_inode_page(old_inode); update_inode_page(new_inode); } else { + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = f2fs_add_link(new_dentry, old_inode); @@ -763,8 +778,6 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, new_inode))) return -EPERM; - f2fs_balance_fs(sbi); - old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_entry) goto out; @@ -807,6 +820,8 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_new_dir; } + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = update_dent_inode(old_inode, new_inode, &new_dentry->d_name); @@ -923,18 +938,22 @@ static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry, } #ifdef CONFIG_F2FS_FS_ENCRYPTION -static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cookie) +static const char *f2fs_encrypted_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { struct page *cpage = NULL; char *caddr, *paddr = NULL; - struct f2fs_str cstr; + struct f2fs_str cstr = FSTR_INIT(NULL, 0); struct f2fs_str pstr = FSTR_INIT(NULL, 0); - struct inode *inode = d_inode(dentry); struct f2fs_encrypted_symlink_data *sd; loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1); u32 max_size = inode->i_sb->s_blocksize; int res; + if (!dentry) + return ERR_PTR(-ECHILD); + res = f2fs_get_encryption_info(inode); if (res) return ERR_PTR(res); @@ -942,12 +961,18 @@ static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cook cpage = read_mapping_page(inode->i_mapping, 0, NULL); if (IS_ERR(cpage)) return ERR_CAST(cpage); - caddr = kmap(cpage); + caddr = page_address(cpage); caddr[size] = 0; /* Symlink is encrypted */ sd = (struct f2fs_encrypted_symlink_data *)caddr; cstr.len = le16_to_cpu(sd->len); + + /* this is broken symlink case */ + if (unlikely(cstr.len == 0)) { + res = -ENOENT; + goto errout; + } cstr.name = kmalloc(cstr.len, GFP_NOFS); if (!cstr.name) { res = -ENOMEM; @@ -956,7 +981,7 @@ static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cook memcpy(cstr.name, sd->encrypted_path, cstr.len); /* this is broken symlink case */ - if (cstr.name[0] == 0 && cstr.len == 0) { + if (unlikely(cstr.name[0] == 0)) { res = -ENOENT; goto errout; } @@ -982,27 +1007,27 @@ static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cook /* Null-terminate the name */ paddr[res] = '\0'; - kunmap(cpage); page_cache_release(cpage); - return *cookie = paddr; + set_delayed_call(done, kfree_link, paddr); + return paddr; errout: kfree(cstr.name); f2fs_fname_crypto_free_buffer(&pstr); - kunmap(cpage); page_cache_release(cpage); return ERR_PTR(res); } const struct inode_operations f2fs_encrypted_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = f2fs_encrypted_follow_link, - .put_link = kfree_put_link, + .get_link = f2fs_encrypted_get_link, .getattr = f2fs_getattr, .setattr = f2fs_setattr, +#ifdef CONFIG_F2FS_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = f2fs_listxattr, .removexattr = generic_removexattr, +#endif }; #endif @@ -1031,8 +1056,7 @@ const struct inode_operations f2fs_dir_inode_operations = { const struct inode_operations f2fs_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = f2fs_follow_link, - .put_link = page_put_link, + .get_link = f2fs_get_link, .getattr = f2fs_getattr, .setattr = f2fs_setattr, #ifdef CONFIG_F2FS_FS_XATTR diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 7bcbc6e9c..342597a58 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -65,13 +65,14 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) sizeof(struct ino_entry)) >> PAGE_CACHE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else if (type == EXTENT_CACHE) { - mem_size = (sbi->total_ext_tree * sizeof(struct extent_tree) + + mem_size = (atomic_read(&sbi->total_ext_tree) * + sizeof(struct extent_tree) + atomic_read(&sbi->total_ext_node) * sizeof(struct extent_node)) >> PAGE_CACHE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else { - if (sbi->sb->s_bdi->wb.dirty_exceeded) - return false; + if (!sbi->sb->s_bdi->wb.dirty_exceeded) + return true; } return res; } @@ -261,13 +262,11 @@ static void cache_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid, { struct nat_entry *e; - down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); if (!e) { e = grab_nat_entry(nm_i, nid); node_info_from_raw_nat(&e->ni, ne); } - up_write(&nm_i->nat_tree_lock); } static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, @@ -379,6 +378,8 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) memset(&ne, 0, sizeof(struct f2fs_nat_entry)); + down_write(&nm_i->nat_tree_lock); + /* Check current segment summary */ mutex_lock(&curseg->curseg_mutex); i = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 0); @@ -399,6 +400,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) cache: /* cache nat entry */ cache_nat_entry(NM_I(sbi), nid, &ne); + up_write(&nm_i->nat_tree_lock); } /* @@ -676,7 +678,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, ret = truncate_dnode(&rdn); if (ret < 0) goto out_err; - set_nid(page, i, 0, false); + if (set_nid(page, i, 0, false)) + dn->node_changed = true; } } else { child_nofs = nofs + ofs * (NIDS_PER_BLOCK + 1) + 1; @@ -689,7 +692,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, rdn.nid = child_nid; ret = truncate_nodes(&rdn, child_nofs, 0, depth - 1); if (ret == (NIDS_PER_BLOCK + 1)) { - set_nid(page, i, 0, false); + if (set_nid(page, i, 0, false)) + dn->node_changed = true; child_nofs += ret; } else if (ret < 0 && ret != -ENOENT) { goto out_err; @@ -750,7 +754,8 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, err = truncate_dnode(dn); if (err < 0) goto fail; - set_nid(pages[idx], i, 0, false); + if (set_nid(pages[idx], i, 0, false)) + dn->node_changed = true; } if (offset[idx + 1] == 0) { @@ -975,7 +980,8 @@ struct page *new_node_page(struct dnode_of_data *dn, fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true); set_cold_node(dn->inode, page); SetPageUptodate(page); - set_page_dirty(page); + if (set_page_dirty(page)) + dn->node_changed = true; if (f2fs_has_xattr_block(ofs)) F2FS_I(dn->inode)->i_xattr_nid = dn->nid; @@ -1035,6 +1041,10 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) struct page *apage; int err; + if (!nid) + return; + f2fs_bug_on(sbi, check_nid_range(sbi, nid)); + apage = find_get_page(NODE_MAPPING(sbi), nid); if (apage && PageUptodate(apage)) { f2fs_put_page(apage, 0); @@ -1050,51 +1060,38 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) f2fs_put_page(apage, err ? 1 : 0); } -struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) +/* + * readahead MAX_RA_NODE number of node pages. + */ +void ra_node_pages(struct page *parent, int start) { - struct page *page; - int err; -repeat: - page = grab_cache_page(NODE_MAPPING(sbi), nid); - if (!page) - return ERR_PTR(-ENOMEM); + struct f2fs_sb_info *sbi = F2FS_P_SB(parent); + struct blk_plug plug; + int i, end; + nid_t nid; - err = read_node_page(page, READ_SYNC); - if (err < 0) { - f2fs_put_page(page, 1); - return ERR_PTR(err); - } else if (err != LOCKED_PAGE) { - lock_page(page); - } + blk_start_plug(&plug); - if (unlikely(!PageUptodate(page) || nid != nid_of_node(page))) { - ClearPageUptodate(page); - f2fs_put_page(page, 1); - return ERR_PTR(-EIO); - } - if (unlikely(page->mapping != NODE_MAPPING(sbi))) { - f2fs_put_page(page, 1); - goto repeat; + /* Then, try readahead for siblings of the desired node */ + end = start + MAX_RA_NODE; + end = min(end, NIDS_PER_BLOCK); + for (i = start; i < end; i++) { + nid = get_nid(parent, i, false); + ra_node_page(sbi, nid); } - return page; + + blk_finish_plug(&plug); } -/* - * Return a locked page for the desired node page. - * And, readahead MAX_RA_NODE number of node pages. - */ -struct page *get_node_page_ra(struct page *parent, int start) +struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, + struct page *parent, int start) { - struct f2fs_sb_info *sbi = F2FS_P_SB(parent); - struct blk_plug plug; struct page *page; - int err, i, end; - nid_t nid; + int err; - /* First, try getting the desired direct node. */ - nid = get_nid(parent, start, false); if (!nid) return ERR_PTR(-ENOENT); + f2fs_bug_on(sbi, check_nid_range(sbi, nid)); repeat: page = grab_cache_page(NODE_MAPPING(sbi), nid); if (!page) @@ -1108,46 +1105,53 @@ repeat: goto page_hit; } - blk_start_plug(&plug); - - /* Then, try readahead for siblings of the desired node */ - end = start + MAX_RA_NODE; - end = min(end, NIDS_PER_BLOCK); - for (i = start + 1; i < end; i++) { - nid = get_nid(parent, i, false); - if (!nid) - continue; - ra_node_page(sbi, nid); - } - - blk_finish_plug(&plug); + if (parent) + ra_node_pages(parent, start + 1); lock_page(page); + + if (unlikely(!PageUptodate(page))) { + f2fs_put_page(page, 1); + return ERR_PTR(-EIO); + } if (unlikely(page->mapping != NODE_MAPPING(sbi))) { f2fs_put_page(page, 1); goto repeat; } page_hit: - if (unlikely(!PageUptodate(page))) { - f2fs_put_page(page, 1); - return ERR_PTR(-EIO); - } + f2fs_bug_on(sbi, nid != nid_of_node(page)); return page; } +struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) +{ + return __get_node_page(sbi, nid, NULL, 0); +} + +struct page *get_node_page_ra(struct page *parent, int start) +{ + struct f2fs_sb_info *sbi = F2FS_P_SB(parent); + nid_t nid = get_nid(parent, start, false); + + return __get_node_page(sbi, nid, parent, start); +} + void sync_inode_page(struct dnode_of_data *dn) { + int ret = 0; + if (IS_INODE(dn->node_page) || dn->inode_page == dn->node_page) { - update_inode(dn->inode, dn->node_page); + ret = update_inode(dn->inode, dn->node_page); } else if (dn->inode_page) { if (!dn->inode_page_locked) lock_page(dn->inode_page); - update_inode(dn->inode, dn->inode_page); + ret = update_inode(dn->inode, dn->inode_page); if (!dn->inode_page_locked) unlock_page(dn->inode_page); } else { - update_inode_page(dn->inode); + ret = update_inode_page(dn->inode); } + dn->node_changed = ret ? true: false; } int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino, @@ -1175,6 +1179,11 @@ next_step: for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + if (unlikely(f2fs_cp_error(sbi))) { + pagevec_release(&pvec); + return -EIO; + } + /* * flushing sequence with step: * 0. indirect nodes @@ -1349,7 +1358,7 @@ static int f2fs_write_node_page(struct page *page, up_read(&sbi->node_write); unlock_page(page); - if (wbc->for_reclaim) + if (wbc->for_reclaim || unlikely(f2fs_cp_error(sbi))) f2fs_submit_merged_bio(sbi, NODE, WRITE); return 0; @@ -1440,13 +1449,10 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) if (build) { /* do not add allocated nids */ - down_read(&nm_i->nat_tree_lock); ne = __lookup_nat_cache(nm_i, nid); - if (ne && - (!get_nat_flag(ne, IS_CHECKPOINTED) || + if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || nat_get_blkaddr(ne) != NULL_ADDR)) allocated = true; - up_read(&nm_i->nat_tree_lock); if (allocated) return 0; } @@ -1532,6 +1538,8 @@ static void build_free_nids(struct f2fs_sb_info *sbi) ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT, true); + down_read(&nm_i->nat_tree_lock); + while (1) { struct page *page = get_current_nat_page(sbi, nid); @@ -1560,6 +1568,7 @@ static void build_free_nids(struct f2fs_sb_info *sbi) remove_free_nid(nm_i, nid); } mutex_unlock(&curseg->curseg_mutex); + up_read(&nm_i->nat_tree_lock); ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid), nm_i->ra_nid_pages, META_NAT, false); @@ -1582,8 +1591,6 @@ retry: /* We should not use stale free nids created by build_free_nids */ if (nm_i->fcnt && !on_build_free_nids(nm_i)) { - struct node_info ni; - f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list)); list_for_each_entry(i, &nm_i->free_nid_list, list) if (i->state == NID_NEW) @@ -1594,13 +1601,6 @@ retry: i->state = NID_ALLOC; nm_i->fcnt--; spin_unlock(&nm_i->free_nid_list_lock); - - /* check nid is allocated already */ - get_node_info(sbi, *nid, &ni); - if (ni.blk_addr != NULL_ADDR) { - alloc_nid_done(sbi, *nid); - goto retry; - } return true; } spin_unlock(&nm_i->free_nid_list_lock); @@ -1842,14 +1842,12 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) raw_ne = nat_in_journal(sum, i); - down_write(&nm_i->nat_tree_lock); ne = __lookup_nat_cache(nm_i, nid); if (!ne) { ne = grab_nat_entry(nm_i, nid); node_info_from_raw_nat(&ne->ni, &raw_ne); } __set_nat_cache_dirty(nm_i, ne); - up_write(&nm_i->nat_tree_lock); } update_nats_in_cursum(sum, -i); mutex_unlock(&curseg->curseg_mutex); @@ -1883,7 +1881,6 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, struct f2fs_nat_block *nat_blk; struct nat_entry *ne, *cur; struct page *page = NULL; - struct f2fs_nm_info *nm_i = NM_I(sbi); /* * there are two steps to flush nat entries: @@ -1920,12 +1917,8 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, raw_ne = &nat_blk->entries[nid - start_nid]; } raw_nat_from_node_info(raw_ne, &ne->ni); - - down_write(&NM_I(sbi)->nat_tree_lock); nat_reset_flag(ne); __clear_nat_cache_dirty(NM_I(sbi), ne); - up_write(&NM_I(sbi)->nat_tree_lock); - if (nat_get_blkaddr(ne) == NULL_ADDR) add_free_nid(sbi, nid, false); } @@ -1937,9 +1930,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, set->entry_cnt); - down_write(&nm_i->nat_tree_lock); radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); - up_write(&nm_i->nat_tree_lock); kmem_cache_free(nat_entry_set_slab, set); } @@ -1959,6 +1950,9 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) if (!nm_i->dirty_nat_cnt) return; + + down_write(&nm_i->nat_tree_lock); + /* * if there are no enough space in journal to store dirty nat * entries, remove all entries from journal and merge them @@ -1967,7 +1961,6 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL)) remove_nats_in_journal(sbi); - down_write(&nm_i->nat_tree_lock); while ((found = __gang_lookup_nat_set(nm_i, set_idx, SETVEC_SIZE, setvec))) { unsigned idx; @@ -1976,12 +1969,13 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) __adjust_nat_entry_set(setvec[idx], &sets, MAX_NAT_JENTRIES(sum)); } - up_write(&nm_i->nat_tree_lock); /* flush dirty nats in nat entry set */ list_for_each_entry_safe(set, tmp, &sets, set_list) __flush_nat_entry_set(sbi, set); + up_write(&nm_i->nat_tree_lock); + f2fs_bug_on(sbi, nm_i->dirty_nat_cnt); } diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index e4fffd2d9..d4d1f636f 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -183,7 +183,7 @@ static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start) block_addr = (pgoff_t)(nm_i->nat_blkaddr + (seg_off << sbi->log_blocks_per_seg << 1) + - (block_off & ((1 << sbi->log_blocks_per_seg) - 1))); + (block_off & (sbi->blocks_per_seg - 1))); if (f2fs_test_bit(block_off, nm_i->nat_bitmap)) block_addr += sbi->blocks_per_seg; @@ -317,7 +317,7 @@ static inline bool IS_DNODE(struct page *node_page) return true; } -static inline void set_nid(struct page *p, int off, nid_t nid, bool i) +static inline int set_nid(struct page *p, int off, nid_t nid, bool i) { struct f2fs_node *rn = F2FS_NODE(p); @@ -327,7 +327,7 @@ static inline void set_nid(struct page *p, int off, nid_t nid, bool i) rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid); else rn->in.nid[off] = cpu_to_le32(nid); - set_page_dirty(p); + return set_page_dirty(p); } static inline nid_t get_nid(struct page *p, int off, bool i) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index cbf74f47c..589b20b86 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -168,6 +168,32 @@ static void recover_inode(struct inode *inode, struct page *page) ino_of_node(page), name); } +static bool is_same_inode(struct inode *inode, struct page *ipage) +{ + struct f2fs_inode *ri = F2FS_INODE(ipage); + struct timespec disk; + + if (!IS_INODE(ipage)) + return true; + + disk.tv_sec = le64_to_cpu(ri->i_ctime); + disk.tv_nsec = le32_to_cpu(ri->i_ctime_nsec); + if (timespec_compare(&inode->i_ctime, &disk) > 0) + return false; + + disk.tv_sec = le64_to_cpu(ri->i_atime); + disk.tv_nsec = le32_to_cpu(ri->i_atime_nsec); + if (timespec_compare(&inode->i_atime, &disk) > 0) + return false; + + disk.tv_sec = le64_to_cpu(ri->i_mtime); + disk.tv_nsec = le32_to_cpu(ri->i_mtime_nsec); + if (timespec_compare(&inode->i_mtime, &disk) > 0) + return false; + + return true; +} + static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) { unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); @@ -197,7 +223,10 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) goto next; entry = get_fsync_inode(head, ino_of_node(page)); - if (!entry) { + if (entry) { + if (!is_same_inode(entry->inode, page)) + goto next; + } else { if (IS_INODE(page) && is_dent_dnode(page)) { err = recover_inode_page(sbi, page); if (err) @@ -459,8 +488,7 @@ out: return err; } -static int recover_data(struct f2fs_sb_info *sbi, - struct list_head *head, int type) +static int recover_data(struct f2fs_sb_info *sbi, struct list_head *head) { unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); struct curseg_info *curseg; @@ -469,7 +497,7 @@ static int recover_data(struct f2fs_sb_info *sbi, block_t blkaddr; /* get node pages in the current segment */ - curseg = CURSEG_I(sbi, type); + curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); while (1) { @@ -556,7 +584,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) need_writecp = true; /* step #2: recover data */ - err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE); + err = recover_data(sbi, &inode_list); if (!err) f2fs_bug_on(sbi, !list_empty(&inode_list)); out: @@ -595,7 +623,7 @@ out: .reason = CP_RECOVERY, }; mutex_unlock(&sbi->cp_mutex); - write_checkpoint(sbi, &cpc); + err = write_checkpoint(sbi, &cpc); } else { mutex_unlock(&sbi->cp_mutex); } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f77b32584..5904a411c 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -86,6 +86,7 @@ static inline unsigned long __reverse_ffs(unsigned long word) /* * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because * f2fs_set_bit makes MSB and LSB reversed in a byte. + * @size must be integral times of unsigned long. * Example: * MSB <--> LSB * f2fs_set_bit(0, bitmap) => 1000 0000 @@ -95,94 +96,73 @@ static unsigned long __find_rev_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { const unsigned long *p = addr + BIT_WORD(offset); - unsigned long result = offset & ~(BITS_PER_LONG - 1); + unsigned long result = size; unsigned long tmp; if (offset >= size) return size; - size -= result; + size -= (offset & ~(BITS_PER_LONG - 1)); offset %= BITS_PER_LONG; - if (!offset) - goto aligned; - - tmp = __reverse_ulong((unsigned char *)p); - tmp &= ~0UL >> offset; - - if (size < BITS_PER_LONG) - goto found_first; - if (tmp) - goto found_middle; - - size -= BITS_PER_LONG; - result += BITS_PER_LONG; - p++; -aligned: - while (size & ~(BITS_PER_LONG-1)) { + + while (1) { + if (*p == 0) + goto pass; + tmp = __reverse_ulong((unsigned char *)p); + + tmp &= ~0UL >> offset; + if (size < BITS_PER_LONG) + tmp &= (~0UL << (BITS_PER_LONG - size)); if (tmp) - goto found_middle; - result += BITS_PER_LONG; + goto found; +pass: + if (size <= BITS_PER_LONG) + break; size -= BITS_PER_LONG; + offset = 0; p++; } - if (!size) - return result; - - tmp = __reverse_ulong((unsigned char *)p); -found_first: - tmp &= (~0UL << (BITS_PER_LONG - size)); - if (!tmp) /* Are any bits set? */ - return result + size; /* Nope. */ -found_middle: - return result + __reverse_ffs(tmp); + return result; +found: + return result - size + __reverse_ffs(tmp); } static unsigned long __find_rev_next_zero_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { const unsigned long *p = addr + BIT_WORD(offset); - unsigned long result = offset & ~(BITS_PER_LONG - 1); + unsigned long result = size; unsigned long tmp; if (offset >= size) return size; - size -= result; + size -= (offset & ~(BITS_PER_LONG - 1)); offset %= BITS_PER_LONG; - if (!offset) - goto aligned; - - tmp = __reverse_ulong((unsigned char *)p); - tmp |= ~((~0UL << offset) >> offset); - - if (size < BITS_PER_LONG) - goto found_first; - if (tmp != ~0UL) - goto found_middle; - - size -= BITS_PER_LONG; - result += BITS_PER_LONG; - p++; -aligned: - while (size & ~(BITS_PER_LONG - 1)) { + + while (1) { + if (*p == ~0UL) + goto pass; + tmp = __reverse_ulong((unsigned char *)p); + + if (offset) + tmp |= ~0UL << (BITS_PER_LONG - offset); + if (size < BITS_PER_LONG) + tmp |= ~0UL >> size; if (tmp != ~0UL) - goto found_middle; - result += BITS_PER_LONG; + goto found; +pass: + if (size <= BITS_PER_LONG) + break; size -= BITS_PER_LONG; + offset = 0; p++; } - if (!size) - return result; - - tmp = __reverse_ulong((unsigned char *)p); -found_first: - tmp |= ~(~0UL << (BITS_PER_LONG - size)); - if (tmp == ~0UL) /* Are any bits zero? */ - return result + size; /* Nope. */ -found_middle: - return result + __reverse_ffz(tmp); + return result; +found: + return result - size + __reverse_ffz(tmp); } void register_inmem_page(struct inode *inode, struct page *page) @@ -233,7 +213,7 @@ int commit_inmem_pages(struct inode *inode, bool abort) * inode becomes free by iget_locked in f2fs_iget. */ if (!abort) { - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); } @@ -257,6 +237,7 @@ int commit_inmem_pages(struct inode *inode, bool abort) submit_bio = true; } } else { + ClearPageUptodate(cur->page); trace_f2fs_commit_inmem_page(cur->page, INMEM_DROP); } set_page_private(cur->page, 0); @@ -281,8 +262,10 @@ int commit_inmem_pages(struct inode *inode, bool abort) * This function balances dirty node and dentry pages. * In addition, it controls garbage collection. */ -void f2fs_balance_fs(struct f2fs_sb_info *sbi) +void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) { + if (!need) + return; /* * We should do GC or end up with checkpoint, if there are so many dirty * dir/node pages without enough free segments. @@ -310,8 +293,12 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) if (!available_free_memory(sbi, NAT_ENTRIES) || excess_prefree_segs(sbi) || !available_free_memory(sbi, INO_ENTRIES) || - jiffies > sbi->cp_expires) + (is_idle(sbi) && f2fs_time_over(sbi, CP_TIME))) { + if (test_opt(sbi, DATA_FLUSH)) + sync_dirty_inodes(sbi, FILE_INODE); f2fs_sync_fs(sbi->sb, true); + stat_inc_bg_cp_count(sbi->stat_info); + } } static int issue_flush_thread(void *data) @@ -1134,6 +1121,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) __u64 end = start + F2FS_BYTES_TO_BLK(range->len) - 1; unsigned int start_segno, end_segno; struct cp_control cpc; + int err = 0; if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize) return -EINVAL; @@ -1164,12 +1152,12 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) sbi->segs_per_sec) - 1, end_segno); mutex_lock(&sbi->gc_mutex); - write_checkpoint(sbi, &cpc); + err = write_checkpoint(sbi, &cpc); mutex_unlock(&sbi->gc_mutex); } out: range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); - return 0; + return err; } static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) @@ -1749,13 +1737,13 @@ int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type, if (le32_to_cpu(nid_in_journal(sum, i)) == val) return i; } - if (alloc && nats_in_cursum(sum) < NAT_JOURNAL_ENTRIES) + if (alloc && __has_cursum_space(sum, 1, NAT_JOURNAL)) return update_nats_in_cursum(sum, 1); } else if (type == SIT_JOURNAL) { for (i = 0; i < sits_in_cursum(sum); i++) if (le32_to_cpu(segno_in_journal(sum, i)) == val) return i; - if (alloc && sits_in_cursum(sum) < SIT_JOURNAL_ENTRIES) + if (alloc && __has_cursum_space(sum, 1, SIT_JOURNAL)) return update_sits_in_cursum(sum, 1); } return -1; diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index da0d8e0b5..93606f281 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -32,7 +32,8 @@ static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi) { - return sbi->total_ext_tree + atomic_read(&sbi->total_ext_node); + return atomic_read(&sbi->total_zombie_tree) + + atomic_read(&sbi->total_ext_node); } unsigned long f2fs_shrink_count(struct shrinker *shrink, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3a65e0132..6134832ba 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -67,6 +67,7 @@ enum { Opt_extent_cache, Opt_noextent_cache, Opt_noinline_data, + Opt_data_flush, Opt_err, }; @@ -91,6 +92,7 @@ static match_table_t f2fs_tokens = { {Opt_extent_cache, "extent_cache"}, {Opt_noextent_cache, "noextent_cache"}, {Opt_noinline_data, "noinline_data"}, + {Opt_data_flush, "data_flush"}, {Opt_err, NULL}, }; @@ -216,7 +218,8 @@ F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, cp_interval); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -235,6 +238,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(ram_thresh), ATTR_LIST(ra_nid_pages), ATTR_LIST(cp_interval), + ATTR_LIST(idle_interval), NULL, }; @@ -406,6 +410,9 @@ static int parse_options(struct super_block *sb, char *options) case Opt_noinline_data: clear_opt(sbi, INLINE_DATA); break; + case Opt_data_flush: + set_opt(sbi, DATA_FLUSH); + break; default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -432,6 +439,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) fi->i_current_depth = 1; fi->i_advise = 0; init_rwsem(&fi->i_sem); + INIT_LIST_HEAD(&fi->dirty_list); INIT_LIST_HEAD(&fi->inmem_pages); mutex_init(&fi->inmem_lock); @@ -548,7 +556,7 @@ static void f2fs_put_super(struct super_block *sb) * normally superblock is clean, so we need to release this. * In addition, EIO will skip do checkpoint, we need this as well. */ - release_dirty_inode(sbi); + release_ino_entry(sbi); release_discard_addrs(sbi); f2fs_leave_shrinker(sbi); @@ -566,13 +574,14 @@ static void f2fs_put_super(struct super_block *sb) wait_for_completion(&sbi->s_kobj_unregister); sb->s_fs_info = NULL; - brelse(sbi->raw_super_buf); + kfree(sbi->raw_super); kfree(sbi); } int f2fs_sync_fs(struct super_block *sb, int sync) { struct f2fs_sb_info *sbi = F2FS_SB(sb); + int err = 0; trace_f2fs_sync_fs(sb, sync); @@ -582,14 +591,12 @@ int f2fs_sync_fs(struct super_block *sb, int sync) cpc.reason = __get_cp_reason(sbi); mutex_lock(&sbi->gc_mutex); - write_checkpoint(sbi, &cpc); + err = write_checkpoint(sbi, &cpc); mutex_unlock(&sbi->gc_mutex); - } else { - f2fs_balance_fs(sbi); } f2fs_trace_ios(NULL, 1); - return 0; + return err; } static int f2fs_freeze(struct super_block *sb) @@ -686,6 +693,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",extent_cache"); else seq_puts(seq, ",noextent_cache"); + if (test_opt(sbi, DATA_FLUSH)) + seq_puts(seq, ",data_flush"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); return 0; @@ -898,7 +907,7 @@ static const struct export_operations f2fs_export_ops = { .get_parent = f2fs_get_parent, }; -static loff_t max_file_size(unsigned bits) +static loff_t max_file_blocks(void) { loff_t result = (DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS); loff_t leaf_count = ADDRS_PER_BLOCK; @@ -914,10 +923,82 @@ static loff_t max_file_size(unsigned bits) leaf_count *= NIDS_PER_BLOCK; result += leaf_count; - result <<= bits; return result; } +static inline bool sanity_check_area_boundary(struct super_block *sb, + struct f2fs_super_block *raw_super) +{ + u32 segment0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr); + u32 cp_blkaddr = le32_to_cpu(raw_super->cp_blkaddr); + u32 sit_blkaddr = le32_to_cpu(raw_super->sit_blkaddr); + u32 nat_blkaddr = le32_to_cpu(raw_super->nat_blkaddr); + u32 ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); + u32 main_blkaddr = le32_to_cpu(raw_super->main_blkaddr); + u32 segment_count_ckpt = le32_to_cpu(raw_super->segment_count_ckpt); + u32 segment_count_sit = le32_to_cpu(raw_super->segment_count_sit); + u32 segment_count_nat = le32_to_cpu(raw_super->segment_count_nat); + u32 segment_count_ssa = le32_to_cpu(raw_super->segment_count_ssa); + u32 segment_count_main = le32_to_cpu(raw_super->segment_count_main); + u32 segment_count = le32_to_cpu(raw_super->segment_count); + u32 log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg); + + if (segment0_blkaddr != cp_blkaddr) { + f2fs_msg(sb, KERN_INFO, + "Mismatch start address, segment0(%u) cp_blkaddr(%u)", + segment0_blkaddr, cp_blkaddr); + return true; + } + + if (cp_blkaddr + (segment_count_ckpt << log_blocks_per_seg) != + sit_blkaddr) { + f2fs_msg(sb, KERN_INFO, + "Wrong CP boundary, start(%u) end(%u) blocks(%u)", + cp_blkaddr, sit_blkaddr, + segment_count_ckpt << log_blocks_per_seg); + return true; + } + + if (sit_blkaddr + (segment_count_sit << log_blocks_per_seg) != + nat_blkaddr) { + f2fs_msg(sb, KERN_INFO, + "Wrong SIT boundary, start(%u) end(%u) blocks(%u)", + sit_blkaddr, nat_blkaddr, + segment_count_sit << log_blocks_per_seg); + return true; + } + + if (nat_blkaddr + (segment_count_nat << log_blocks_per_seg) != + ssa_blkaddr) { + f2fs_msg(sb, KERN_INFO, + "Wrong NAT boundary, start(%u) end(%u) blocks(%u)", + nat_blkaddr, ssa_blkaddr, + segment_count_nat << log_blocks_per_seg); + return true; + } + + if (ssa_blkaddr + (segment_count_ssa << log_blocks_per_seg) != + main_blkaddr) { + f2fs_msg(sb, KERN_INFO, + "Wrong SSA boundary, start(%u) end(%u) blocks(%u)", + ssa_blkaddr, main_blkaddr, + segment_count_ssa << log_blocks_per_seg); + return true; + } + + if (main_blkaddr + (segment_count_main << log_blocks_per_seg) != + segment0_blkaddr + (segment_count << log_blocks_per_seg)) { + f2fs_msg(sb, KERN_INFO, + "Wrong MAIN_AREA boundary, start(%u) end(%u) blocks(%u)", + main_blkaddr, + segment0_blkaddr + (segment_count << log_blocks_per_seg), + segment_count_main << log_blocks_per_seg); + return true; + } + + return false; +} + static int sanity_check_raw_super(struct super_block *sb, struct f2fs_super_block *raw_super) { @@ -947,6 +1028,14 @@ static int sanity_check_raw_super(struct super_block *sb, return 1; } + /* check log blocks per segment */ + if (le32_to_cpu(raw_super->log_blocks_per_seg) != 9) { + f2fs_msg(sb, KERN_INFO, + "Invalid log blocks per segment (%u)\n", + le32_to_cpu(raw_super->log_blocks_per_seg)); + return 1; + } + /* Currently, support 512/1024/2048/4096 bytes sector size */ if (le32_to_cpu(raw_super->log_sectorsize) > F2FS_MAX_LOG_SECTOR_SIZE || @@ -965,6 +1054,23 @@ static int sanity_check_raw_super(struct super_block *sb, le32_to_cpu(raw_super->log_sectorsize)); return 1; } + + /* check reserved ino info */ + if (le32_to_cpu(raw_super->node_ino) != 1 || + le32_to_cpu(raw_super->meta_ino) != 2 || + le32_to_cpu(raw_super->root_ino) != 3) { + f2fs_msg(sb, KERN_INFO, + "Invalid Fs Meta Ino: node(%u) meta(%u) root(%u)", + le32_to_cpu(raw_super->node_ino), + le32_to_cpu(raw_super->meta_ino), + le32_to_cpu(raw_super->root_ino)); + return 1; + } + + /* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */ + if (sanity_check_area_boundary(sb, raw_super)) + return 1; + return 0; } @@ -1018,7 +1124,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi) atomic_set(&sbi->nr_pages[i], 0); sbi->dir_level = DEF_DIR_LEVEL; - sbi->cp_interval = DEF_CP_INTERVAL; + sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL; + sbi->interval_time[REQ_TIME] = DEF_IDLE_INTERVAL; clear_sbi_flag(sbi, SBI_NEED_FSCK); INIT_LIST_HEAD(&sbi->s_list); @@ -1032,111 +1139,114 @@ static void init_sb_info(struct f2fs_sb_info *sbi) */ static int read_raw_super_block(struct super_block *sb, struct f2fs_super_block **raw_super, - struct buffer_head **raw_super_buf, - int *recovery) + int *valid_super_block, int *recovery) { int block = 0; - struct buffer_head *buffer; - struct f2fs_super_block *super; + struct buffer_head *bh; + struct f2fs_super_block *super, *buf; int err = 0; + super = kzalloc(sizeof(struct f2fs_super_block), GFP_KERNEL); + if (!super) + return -ENOMEM; retry: - buffer = sb_bread(sb, block); - if (!buffer) { + bh = sb_bread(sb, block); + if (!bh) { *recovery = 1; f2fs_msg(sb, KERN_ERR, "Unable to read %dth superblock", block + 1); - if (block == 0) { - block++; - goto retry; - } else { - err = -EIO; - goto out; - } + err = -EIO; + goto next; } - super = (struct f2fs_super_block *) - ((char *)(buffer)->b_data + F2FS_SUPER_OFFSET); + buf = (struct f2fs_super_block *)(bh->b_data + F2FS_SUPER_OFFSET); /* sanity checking of raw super */ - if (sanity_check_raw_super(sb, super)) { - brelse(buffer); + if (sanity_check_raw_super(sb, buf)) { + brelse(bh); *recovery = 1; f2fs_msg(sb, KERN_ERR, "Can't find valid F2FS filesystem in %dth superblock", block + 1); - if (block == 0) { - block++; - goto retry; - } else { - err = -EINVAL; - goto out; - } + err = -EINVAL; + goto next; } if (!*raw_super) { - *raw_super_buf = buffer; + memcpy(super, buf, sizeof(*super)); + *valid_super_block = block; *raw_super = super; - } else { - /* already have a valid superblock */ - brelse(buffer); } + brelse(bh); +next: /* check the validity of the second superblock */ if (block == 0) { block++; goto retry; } -out: /* No valid superblock */ - if (!*raw_super) + if (!*raw_super) { + kfree(super); return err; + } return 0; } +static int __f2fs_commit_super(struct f2fs_sb_info *sbi, int block) +{ + struct f2fs_super_block *super = F2FS_RAW_SUPER(sbi); + struct buffer_head *bh; + int err; + + bh = sb_getblk(sbi->sb, block); + if (!bh) + return -EIO; + + lock_buffer(bh); + memcpy(bh->b_data + F2FS_SUPER_OFFSET, super, sizeof(*super)); + set_buffer_uptodate(bh); + set_buffer_dirty(bh); + unlock_buffer(bh); + + /* it's rare case, we can do fua all the time */ + err = __sync_dirty_buffer(bh, WRITE_FLUSH_FUA); + brelse(bh); + + return err; +} + int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) { - struct buffer_head *sbh = sbi->raw_super_buf; - sector_t block = sbh->b_blocknr; int err; /* write back-up superblock first */ - sbh->b_blocknr = block ? 0 : 1; - mark_buffer_dirty(sbh); - err = sync_dirty_buffer(sbh); - - sbh->b_blocknr = block; + err = __f2fs_commit_super(sbi, sbi->valid_super_block ? 0 : 1); /* if we are in recovery path, skip writing valid superblock */ if (recover || err) - goto out; + return err; /* write current valid superblock */ - mark_buffer_dirty(sbh); - err = sync_dirty_buffer(sbh); -out: - clear_buffer_write_io_error(sbh); - set_buffer_uptodate(sbh); - return err; + return __f2fs_commit_super(sbi, sbi->valid_super_block); } static int f2fs_fill_super(struct super_block *sb, void *data, int silent) { struct f2fs_sb_info *sbi; struct f2fs_super_block *raw_super; - struct buffer_head *raw_super_buf; struct inode *root; long err; bool retry = true, need_fsck = false; char *options = NULL; - int recovery, i; + int recovery, i, valid_super_block; try_onemore: err = -EINVAL; raw_super = NULL; - raw_super_buf = NULL; + valid_super_block = -1; recovery = 0; /* allocate memory for f2fs-specific super block info */ @@ -1150,7 +1260,8 @@ try_onemore: goto free_sbi; } - err = read_raw_super_block(sb, &raw_super, &raw_super_buf, &recovery); + err = read_raw_super_block(sb, &raw_super, &valid_super_block, + &recovery); if (err) goto free_sbi; @@ -1167,7 +1278,9 @@ try_onemore: if (err) goto free_options; - sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize)); + sbi->max_file_blocks = max_file_blocks(); + sb->s_maxbytes = sbi->max_file_blocks << + le32_to_cpu(raw_super->log_blocksize); sb->s_max_links = F2FS_LINK_MAX; get_random_bytes(&sbi->s_next_generation, sizeof(u32)); @@ -1183,7 +1296,7 @@ try_onemore: /* init f2fs-specific super block info */ sbi->sb = sb; sbi->raw_super = raw_super; - sbi->raw_super_buf = raw_super_buf; + sbi->valid_super_block = valid_super_block; mutex_init(&sbi->gc_mutex); mutex_init(&sbi->writepages); mutex_init(&sbi->cp_mutex); @@ -1236,8 +1349,10 @@ try_onemore: le64_to_cpu(sbi->ckpt->valid_block_count); sbi->last_valid_block_count = sbi->total_valid_block_count; sbi->alloc_valid_block_count = 0; - INIT_LIST_HEAD(&sbi->dir_inode_list); - spin_lock_init(&sbi->dir_inode_lock); + for (i = 0; i < NR_INODE_TYPE; i++) { + INIT_LIST_HEAD(&sbi->inode_list[i]); + spin_lock_init(&sbi->inode_lock[i]); + } init_extent_cache_info(sbi); @@ -1355,12 +1470,14 @@ try_onemore: f2fs_commit_super(sbi, true); } - sbi->cp_expires = round_jiffies_up(jiffies); - + f2fs_update_time(sbi, CP_TIME); + f2fs_update_time(sbi, REQ_TIME); return 0; free_kobj: kobject_del(&sbi->s_kobj); + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); free_proc: if (sbi->s_proc) { remove_proc_entry("segment_info", sbi->s_proc); @@ -1387,7 +1504,7 @@ free_meta_inode: free_options: kfree(options); free_sb_buf: - brelse(raw_super_buf); + kfree(raw_super); free_sbi: kfree(sbi); @@ -1424,8 +1541,9 @@ MODULE_ALIAS_FS("f2fs"); static int __init init_inodecache(void) { - f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache", - sizeof(struct f2fs_inode_info)); + f2fs_inode_cachep = kmem_cache_create("f2fs_inode_cache", + sizeof(struct f2fs_inode_info), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, NULL); if (!f2fs_inode_cachep) return -ENOMEM; return 0; @@ -1478,10 +1596,14 @@ static int __init init_f2fs_fs(void) err = register_filesystem(&f2fs_fs_type); if (err) goto free_shrinker; - f2fs_create_root_stats(); + err = f2fs_create_root_stats(); + if (err) + goto free_filesystem; f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); return 0; +free_filesystem: + unregister_filesystem(&f2fs_fs_type); free_shrinker: unregister_shrinker(&f2fs_shrinker_info); free_crypto: diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 862368a32..10f1e784f 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -25,38 +25,6 @@ #include "f2fs.h" #include "xattr.h" -static size_t f2fs_xattr_generic_list(const struct xattr_handler *handler, - struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t len) -{ - struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); - int total_len, prefix_len; - - switch (handler->flags) { - case F2FS_XATTR_INDEX_USER: - if (!test_opt(sbi, XATTR_USER)) - return -EOPNOTSUPP; - break; - case F2FS_XATTR_INDEX_TRUSTED: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - break; - case F2FS_XATTR_INDEX_SECURITY: - break; - default: - return -EINVAL; - } - - prefix_len = strlen(handler->prefix); - total_len = prefix_len + len + 1; - if (list && total_len <= list_size) { - memcpy(list, handler->prefix, prefix_len); - memcpy(list + prefix_len, name, len); - list[prefix_len + len] = '\0'; - } - return total_len; -} - static int f2fs_xattr_generic_get(const struct xattr_handler *handler, struct dentry *dentry, const char *name, void *buffer, size_t size) @@ -77,8 +45,6 @@ static int f2fs_xattr_generic_get(const struct xattr_handler *handler, default: return -EINVAL; } - if (strcmp(name, "") == 0) - return -EINVAL; return f2fs_getxattr(d_inode(dentry), handler->flags, name, buffer, size, NULL); } @@ -103,24 +69,20 @@ static int f2fs_xattr_generic_set(const struct xattr_handler *handler, default: return -EINVAL; } - if (strcmp(name, "") == 0) - return -EINVAL; - return f2fs_setxattr(d_inode(dentry), handler->flags, name, value, size, NULL, flags); } -static size_t f2fs_xattr_advise_list(const struct xattr_handler *handler, - struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t len) +static bool f2fs_xattr_user_list(struct dentry *dentry) { - const char *xname = F2FS_SYSTEM_ADVISE_PREFIX; - size_t size; + struct f2fs_sb_info *sbi = F2FS_SB(dentry->d_sb); - size = strlen(xname) + 1; - if (list && size <= list_size) - memcpy(list, xname, size); - return size; + return test_opt(sbi, XATTR_USER); +} + +static bool f2fs_xattr_trusted_list(struct dentry *dentry) +{ + return capable(CAP_SYS_ADMIN); } static int f2fs_xattr_advise_get(const struct xattr_handler *handler, @@ -129,9 +91,6 @@ static int f2fs_xattr_advise_get(const struct xattr_handler *handler, { struct inode *inode = d_inode(dentry); - if (strcmp(name, "") != 0) - return -EINVAL; - if (buffer) *((char *)buffer) = F2FS_I(inode)->i_advise; return sizeof(char); @@ -143,8 +102,6 @@ static int f2fs_xattr_advise_set(const struct xattr_handler *handler, { struct inode *inode = d_inode(dentry); - if (strcmp(name, "") != 0) - return -EINVAL; if (!inode_owner_or_capable(inode)) return -EPERM; if (value == NULL) @@ -183,7 +140,7 @@ int f2fs_init_security(struct inode *inode, struct inode *dir, const struct xattr_handler f2fs_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .flags = F2FS_XATTR_INDEX_USER, - .list = f2fs_xattr_generic_list, + .list = f2fs_xattr_user_list, .get = f2fs_xattr_generic_get, .set = f2fs_xattr_generic_set, }; @@ -191,15 +148,14 @@ const struct xattr_handler f2fs_xattr_user_handler = { const struct xattr_handler f2fs_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, .flags = F2FS_XATTR_INDEX_TRUSTED, - .list = f2fs_xattr_generic_list, + .list = f2fs_xattr_trusted_list, .get = f2fs_xattr_generic_get, .set = f2fs_xattr_generic_set, }; const struct xattr_handler f2fs_xattr_advise_handler = { - .prefix = F2FS_SYSTEM_ADVISE_PREFIX, + .name = F2FS_SYSTEM_ADVISE_NAME, .flags = F2FS_XATTR_INDEX_ADVISE, - .list = f2fs_xattr_advise_list, .get = f2fs_xattr_advise_get, .set = f2fs_xattr_advise_set, }; @@ -207,7 +163,6 @@ const struct xattr_handler f2fs_xattr_advise_handler = { const struct xattr_handler f2fs_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .flags = F2FS_XATTR_INDEX_SECURITY, - .list = f2fs_xattr_generic_list, .get = f2fs_xattr_generic_get, .set = f2fs_xattr_generic_set, }; @@ -455,20 +410,27 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) list_for_each_xattr(entry, base_addr) { const struct xattr_handler *handler = f2fs_xattr_handler(entry->e_name_index); + const char *prefix; + size_t prefix_len; size_t size; - if (!handler) + if (!handler || (handler->list && !handler->list(dentry))) continue; - size = handler->list(handler, dentry, buffer, rest, - entry->e_name, entry->e_name_len); - if (buffer && size > rest) { - error = -ERANGE; - goto cleanup; + prefix = handler->prefix ?: handler->name; + prefix_len = strlen(prefix); + size = prefix_len + entry->e_name_len + 1; + if (buffer) { + if (size > rest) { + error = -ERANGE; + goto cleanup; + } + memcpy(buffer, prefix, prefix_len); + buffer += prefix_len; + memcpy(buffer, entry->e_name, entry->e_name_len); + buffer += entry->e_name_len; + *buffer++ = 0; } - - if (buffer) - buffer += size; rest -= size; } error = buffer_size - rest; @@ -609,7 +571,7 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, if (ipage) return __f2fs_setxattr(inode, index, name, value, size, ipage, flags); - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); /* protect xattr_ver */ @@ -618,5 +580,6 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, up_write(&F2FS_I(inode)->i_sem); f2fs_unlock_op(sbi); + f2fs_update_time(sbi, REQ_TIME); return err; } diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index 71a7100d5..79dccc825 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -27,7 +27,7 @@ #define F2FS_XATTR_REFCOUNT_MAX 1024 /* Name indexes */ -#define F2FS_SYSTEM_ADVISE_PREFIX "system.advise" +#define F2FS_SYSTEM_ADVISE_NAME "system.advise" #define F2FS_XATTR_INDEX_USER 1 #define F2FS_XATTR_INDEX_POSIX_ACL_ACCESS 2 #define F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT 3 diff --git a/fs/fat/cache.c b/fs/fat/cache.c index 93fc62232..5d3849215 100644 --- a/fs/fat/cache.c +++ b/fs/fat/cache.c @@ -301,15 +301,59 @@ static int fat_bmap_cluster(struct inode *inode, int cluster) return dclus; } -int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, - unsigned long *mapped_blocks, int create) +int fat_get_mapped_cluster(struct inode *inode, sector_t sector, + sector_t last_block, + unsigned long *mapped_blocks, sector_t *bmap) { struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); + int cluster, offset; + + cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits); + offset = sector & (sbi->sec_per_clus - 1); + cluster = fat_bmap_cluster(inode, cluster); + if (cluster < 0) + return cluster; + else if (cluster) { + *bmap = fat_clus_to_blknr(sbi, cluster) + offset; + *mapped_blocks = sbi->sec_per_clus - offset; + if (*mapped_blocks > last_block - sector) + *mapped_blocks = last_block - sector; + } + + return 0; +} + +static int is_exceed_eof(struct inode *inode, sector_t sector, + sector_t *last_block, int create) +{ + struct super_block *sb = inode->i_sb; const unsigned long blocksize = sb->s_blocksize; const unsigned char blocksize_bits = sb->s_blocksize_bits; + + *last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits; + if (sector >= *last_block) { + if (!create) + return 1; + + /* + * ->mmu_private can access on only allocation path. + * (caller must hold ->i_mutex) + */ + *last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1)) + >> blocksize_bits; + if (sector >= *last_block) + return 1; + } + + return 0; +} + +int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, + unsigned long *mapped_blocks, int create, bool from_bmap) +{ + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); sector_t last_block; - int cluster, offset; *phys = 0; *mapped_blocks = 0; @@ -321,31 +365,16 @@ int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, return 0; } - last_block = (i_size_read(inode) + (blocksize - 1)) >> blocksize_bits; - if (sector >= last_block) { - if (!create) + if (!from_bmap) { + if (is_exceed_eof(inode, sector, &last_block, create)) return 0; - - /* - * ->mmu_private can access on only allocation path. - * (caller must hold ->i_mutex) - */ - last_block = (MSDOS_I(inode)->mmu_private + (blocksize - 1)) - >> blocksize_bits; + } else { + last_block = inode->i_blocks >> + (inode->i_sb->s_blocksize_bits - 9); if (sector >= last_block) return 0; } - cluster = sector >> (sbi->cluster_bits - sb->s_blocksize_bits); - offset = sector & (sbi->sec_per_clus - 1); - cluster = fat_bmap_cluster(inode, cluster); - if (cluster < 0) - return cluster; - else if (cluster) { - *phys = fat_clus_to_blknr(sbi, cluster) + offset; - *mapped_blocks = sbi->sec_per_clus - offset; - if (*mapped_blocks > last_block - sector) - *mapped_blocks = last_block - sector; - } - return 0; + return fat_get_mapped_cluster(inode, sector, last_block, mapped_blocks, + phys); } diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 8b2127ffb..d0b95c950 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -91,7 +91,7 @@ next: *bh = NULL; iblock = *pos >> sb->s_blocksize_bits; - err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0); + err = fat_bmap(dir, iblock, &phys, &mapped_blocks, 0, false); if (err || !phys) return -1; /* beyond EOF or error */ @@ -769,7 +769,7 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *file, buf.dirent = dirent; buf.result = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); buf.ctx.pos = file->f_pos; ret = -ENOENT; if (!IS_DEADDIR(inode)) { @@ -777,7 +777,7 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *file, short_only, both ? &buf : NULL); file->f_pos = buf.ctx.pos; } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (ret >= 0) ret = buf.result; return ret; diff --git a/fs/fat/fat.h b/fs/fat/fat.h index be5e15323..e6b764a17 100644 --- a/fs/fat/fat.h +++ b/fs/fat/fat.h @@ -87,7 +87,7 @@ struct msdos_sb_info { unsigned int vol_id; /*volume ID*/ int fatent_shift; - struct fatent_operations *fatent_ops; + const struct fatent_operations *fatent_ops; struct inode *fat_inode; struct inode *fsinfo_inode; @@ -285,8 +285,11 @@ static inline void fatwchar_to16(__u8 *dst, const wchar_t *src, size_t len) extern void fat_cache_inval_inode(struct inode *inode); extern int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus); +extern int fat_get_mapped_cluster(struct inode *inode, sector_t sector, + sector_t last_block, + unsigned long *mapped_blocks, sector_t *bmap); extern int fat_bmap(struct inode *inode, sector_t sector, sector_t *phys, - unsigned long *mapped_blocks, int create); + unsigned long *mapped_blocks, int create, bool from_bmap); /* fat/dir.c */ extern const struct file_operations fat_dir_operations; @@ -384,6 +387,7 @@ static inline unsigned long fat_dir_hash(int logstart) { return hash_32(logstart, FAT_HASH_BITS); } +extern int fat_add_cluster(struct inode *inode); /* fat/misc.c */ extern __printf(3, 4) __cold diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 822655713..1d9a8c4e9 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -99,7 +99,7 @@ err: static int fat_ent_bread(struct super_block *sb, struct fat_entry *fatent, int offset, sector_t blocknr) { - struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; + const struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; WARN_ON(blocknr < MSDOS_SB(sb)->fat_start); fatent->fat_inode = MSDOS_SB(sb)->fat_inode; @@ -246,7 +246,7 @@ static int fat32_ent_next(struct fat_entry *fatent) return 0; } -static struct fatent_operations fat12_ops = { +static const struct fatent_operations fat12_ops = { .ent_blocknr = fat12_ent_blocknr, .ent_set_ptr = fat12_ent_set_ptr, .ent_bread = fat12_ent_bread, @@ -255,7 +255,7 @@ static struct fatent_operations fat12_ops = { .ent_next = fat12_ent_next, }; -static struct fatent_operations fat16_ops = { +static const struct fatent_operations fat16_ops = { .ent_blocknr = fat_ent_blocknr, .ent_set_ptr = fat16_ent_set_ptr, .ent_bread = fat_ent_bread, @@ -264,7 +264,7 @@ static struct fatent_operations fat16_ops = { .ent_next = fat16_ent_next, }; -static struct fatent_operations fat32_ops = { +static const struct fatent_operations fat32_ops = { .ent_blocknr = fat_ent_blocknr, .ent_set_ptr = fat32_ent_set_ptr, .ent_bread = fat_ent_bread, @@ -320,7 +320,7 @@ static inline int fat_ent_update_ptr(struct super_block *sb, int offset, sector_t blocknr) { struct msdos_sb_info *sbi = MSDOS_SB(sb); - struct fatent_operations *ops = sbi->fatent_ops; + const struct fatent_operations *ops = sbi->fatent_ops; struct buffer_head **bhs = fatent->bhs; /* Is this fatent's blocks including this entry? */ @@ -349,7 +349,7 @@ int fat_ent_read(struct inode *inode, struct fat_entry *fatent, int entry) { struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); - struct fatent_operations *ops = sbi->fatent_ops; + const struct fatent_operations *ops = sbi->fatent_ops; int err, offset; sector_t blocknr; @@ -407,7 +407,7 @@ int fat_ent_write(struct inode *inode, struct fat_entry *fatent, int new, int wait) { struct super_block *sb = inode->i_sb; - struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; + const struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; int err; ops->ent_put(fatent, new); @@ -432,7 +432,7 @@ static inline int fat_ent_next(struct msdos_sb_info *sbi, static inline int fat_ent_read_block(struct super_block *sb, struct fat_entry *fatent) { - struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; + const struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; sector_t blocknr; int offset; @@ -463,7 +463,7 @@ int fat_alloc_clusters(struct inode *inode, int *cluster, int nr_cluster) { struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); - struct fatent_operations *ops = sbi->fatent_ops; + const struct fatent_operations *ops = sbi->fatent_ops; struct fat_entry fatent, prev_ent; struct buffer_head *bhs[MAX_BUF_PER_PAGE]; int i, count, err, nr_bhs, idx_clus; @@ -551,7 +551,7 @@ int fat_free_clusters(struct inode *inode, int cluster) { struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); - struct fatent_operations *ops = sbi->fatent_ops; + const struct fatent_operations *ops = sbi->fatent_ops; struct fat_entry fatent; struct buffer_head *bhs[MAX_BUF_PER_PAGE]; int i, err, nr_bhs; @@ -636,7 +636,7 @@ EXPORT_SYMBOL_GPL(fat_free_clusters); static void fat_ent_reada(struct super_block *sb, struct fat_entry *fatent, unsigned long reada_blocks) { - struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; + const struct fatent_operations *ops = MSDOS_SB(sb)->fatent_ops; sector_t blocknr; int i, offset; @@ -649,7 +649,7 @@ static void fat_ent_reada(struct super_block *sb, struct fat_entry *fatent, int fat_count_free_clusters(struct super_block *sb) { struct msdos_sb_info *sbi = MSDOS_SB(sb); - struct fatent_operations *ops = sbi->fatent_ops; + const struct fatent_operations *ops = sbi->fatent_ops; struct fat_entry fatent; unsigned long reada_blocks, reada_mask, cur_block; int err = 0, free; diff --git a/fs/fat/file.c b/fs/fat/file.c index a08f10399..f70185668 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -14,15 +14,19 @@ #include <linux/backing-dev.h> #include <linux/fsnotify.h> #include <linux/security.h> +#include <linux/falloc.h> #include "fat.h" +static long fat_fallocate(struct file *file, int mode, + loff_t offset, loff_t len); + static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr) { u32 attr; - mutex_lock(&inode->i_mutex); + inode_lock(inode); attr = fat_make_attrs(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return put_user(attr, user_attr); } @@ -43,7 +47,7 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) err = mnt_want_write_file(file); if (err) goto out; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * ATTR_VOLUME and ATTR_DIR cannot be changed; this also @@ -105,7 +109,7 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) fat_save_attrs(inode, attr); mark_inode_dirty(inode); out_unlock_inode: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); mnt_drop_write_file(file); out: return err; @@ -177,6 +181,7 @@ const struct file_operations fat_file_operations = { #endif .fsync = fat_file_fsync, .splice_read = generic_file_splice_read, + .fallocate = fat_fallocate, }; static int fat_cont_expand(struct inode *inode, loff_t size) @@ -215,6 +220,62 @@ out: return err; } +/* + * Preallocate space for a file. This implements fat's fallocate file + * operation, which gets called from sys_fallocate system call. User + * space requests len bytes at offset. If FALLOC_FL_KEEP_SIZE is set + * we just allocate clusters without zeroing them out. Otherwise we + * allocate and zero out clusters via an expanding truncate. + */ +static long fat_fallocate(struct file *file, int mode, + loff_t offset, loff_t len) +{ + int nr_cluster; /* Number of clusters to be allocated */ + loff_t mm_bytes; /* Number of bytes to be allocated for file */ + loff_t ondisksize; /* block aligned on-disk size in bytes*/ + struct inode *inode = file->f_mapping->host; + struct super_block *sb = inode->i_sb; + struct msdos_sb_info *sbi = MSDOS_SB(sb); + int err = 0; + + /* No support for hole punch or other fallocate flags. */ + if (mode & ~FALLOC_FL_KEEP_SIZE) + return -EOPNOTSUPP; + + /* No support for dir */ + if (!S_ISREG(inode->i_mode)) + return -EOPNOTSUPP; + + inode_lock(inode); + if (mode & FALLOC_FL_KEEP_SIZE) { + ondisksize = inode->i_blocks << 9; + if ((offset + len) <= ondisksize) + goto error; + + /* First compute the number of clusters to be allocated */ + mm_bytes = offset + len - ondisksize; + nr_cluster = (mm_bytes + (sbi->cluster_size - 1)) >> + sbi->cluster_bits; + + /* Start the allocation.We are not zeroing out the clusters */ + while (nr_cluster-- > 0) { + err = fat_add_cluster(inode); + if (err) + goto error; + } + } else { + if ((offset + len) <= i_size_read(inode)) + goto error; + + /* This is just an expanding truncate */ + err = fat_cont_expand(inode, (offset + len)); + } + +error: + inode_unlock(inode); + return err; +} + /* Free all clusters after the skip'th cluster. */ static int fat_free(struct inode *inode, int skip) { diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 509411dd3..a55990521 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -93,7 +93,7 @@ static struct fat_floppy_defaults { }, }; -static int fat_add_cluster(struct inode *inode) +int fat_add_cluster(struct inode *inode) { int err, cluster; @@ -115,10 +115,10 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock, struct super_block *sb = inode->i_sb; struct msdos_sb_info *sbi = MSDOS_SB(sb); unsigned long mapped_blocks; - sector_t phys; + sector_t phys, last_block; int err, offset; - err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create); + err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create, false); if (err) return err; if (phys) { @@ -135,8 +135,14 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock, return -EIO; } + last_block = inode->i_blocks >> (sb->s_blocksize_bits - 9); offset = (unsigned long)iblock & (sbi->sec_per_clus - 1); - if (!offset) { + /* + * allocate a cluster according to the following. + * 1) no more available blocks + * 2) not part of fallocate region + */ + if (!offset && !(iblock < last_block)) { /* TODO: multiple cluster allocation would be desirable. */ err = fat_add_cluster(inode); if (err) @@ -148,7 +154,7 @@ static inline int __fat_get_block(struct inode *inode, sector_t iblock, *max_blocks = min(mapped_blocks, *max_blocks); MSDOS_I(inode)->mmu_private += *max_blocks << sb->s_blocksize_bits; - err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create); + err = fat_bmap(inode, iblock, &phys, &mapped_blocks, create, false); if (err) return err; @@ -273,13 +279,38 @@ static ssize_t fat_direct_IO(struct kiocb *iocb, struct iov_iter *iter, return ret; } +static int fat_get_block_bmap(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) +{ + struct super_block *sb = inode->i_sb; + unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits; + int err; + sector_t bmap; + unsigned long mapped_blocks; + + BUG_ON(create != 0); + + err = fat_bmap(inode, iblock, &bmap, &mapped_blocks, create, true); + if (err) + return err; + + if (bmap) { + map_bh(bh_result, sb, bmap); + max_blocks = min(mapped_blocks, max_blocks); + } + + bh_result->b_size = max_blocks << sb->s_blocksize_bits; + + return 0; +} + static sector_t _fat_bmap(struct address_space *mapping, sector_t block) { sector_t blocknr; /* fat_get_cluster() assumes the requested blocknr isn't truncated. */ down_read(&MSDOS_I(mapping->host)->truncate_lock); - blocknr = generic_block_bmap(mapping, block, fat_get_block); + blocknr = generic_block_bmap(mapping, block, fat_get_block_bmap); up_read(&MSDOS_I(mapping->host)->truncate_lock); return blocknr; @@ -449,6 +480,24 @@ static int fat_calc_dir_size(struct inode *inode) return 0; } +static int fat_validate_dir(struct inode *dir) +{ + struct super_block *sb = dir->i_sb; + + if (dir->i_nlink < 2) { + /* Directory should have "."/".." entries at least. */ + fat_fs_error(sb, "corrupted directory (invalid entries)"); + return -EIO; + } + if (MSDOS_I(dir)->i_start == 0 || + MSDOS_I(dir)->i_start == MSDOS_SB(sb)->root_cluster) { + /* Directory should point valid cluster. */ + fat_fs_error(sb, "corrupted directory (invalid i_start)"); + return -EIO; + } + return 0; +} + /* doesn't deal with root inode */ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) { @@ -475,6 +524,10 @@ int fat_fill_inode(struct inode *inode, struct msdos_dir_entry *de) MSDOS_I(inode)->mmu_private = inode->i_size; set_nlink(inode, fat_subdirs(inode)); + + error = fat_validate_dir(inode); + if (error < 0) + return error; } else { /* not a directory */ inode->i_generation |= 1; inode->i_mode = fat_make_mode(sbi, de->attr, @@ -553,13 +606,43 @@ out: EXPORT_SYMBOL_GPL(fat_build_inode); +static int __fat_write_inode(struct inode *inode, int wait); + +static void fat_free_eofblocks(struct inode *inode) +{ + /* Release unwritten fallocated blocks on inode eviction. */ + if ((inode->i_blocks << 9) > + round_up(MSDOS_I(inode)->mmu_private, + MSDOS_SB(inode->i_sb)->cluster_size)) { + int err; + + fat_truncate_blocks(inode, MSDOS_I(inode)->mmu_private); + /* Fallocate results in updating the i_start/iogstart + * for the zero byte file. So, make it return to + * original state during evict and commit it to avoid + * any corruption on the next access to the cluster + * chain for the file. + */ + err = __fat_write_inode(inode, inode_needs_sync(inode)); + if (err) { + fat_msg(inode->i_sb, KERN_WARNING, "Failed to " + "update on disk inode for unused " + "fallocated blocks, inode could be " + "corrupted. Please run fsck"); + } + + } +} + static void fat_evict_inode(struct inode *inode) { truncate_inode_pages_final(&inode->i_data); if (!inode->i_nlink) { inode->i_size = 0; fat_truncate_blocks(inode, 0); - } + } else + fat_free_eofblocks(inode); + invalidate_inode_buffers(inode); clear_inode(inode); fat_cache_inval_inode(inode); @@ -677,7 +760,7 @@ static int __init fat_init_inodecache(void) fat_inode_cachep = kmem_cache_create("fat_inode_cache", sizeof(struct msdos_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (fat_inode_cachep == NULL) return -ENOMEM; @@ -1146,7 +1229,12 @@ static int parse_options(struct super_block *sb, char *options, int is_vfat, case Opt_time_offset: if (match_int(&args[0], &option)) return -EINVAL; - if (option < -12 * 60 || option > 12 * 60) + /* + * GMT+-12 zones may have DST corrections so at least + * 13 hours difference is needed. Make the limit 24 + * just in case someone invents something unusual. + */ + if (option < -24 * 60 || option > 24 * 60) return -EINVAL; opts->tz_set = 1; opts->time_offset = option; diff --git a/fs/fcntl.c b/fs/fcntl.c index 8abb9f814..350a2c8cf 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -29,7 +29,7 @@ #define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME) -int setfl(int fd, struct file * filp, unsigned long arg) +static int setfl(int fd, struct file * filp, unsigned long arg) { struct inode * inode = file_inode(filp); int error = 0; @@ -51,7 +51,8 @@ int setfl(int fd, struct file * filp, unsigned long arg) if (arg & O_NDELAY) arg |= O_NONBLOCK; - if (arg & O_DIRECT) { + /* Pipe packetized mode is controlled by O_DIRECT flag */ + if (!S_ISFIFO(filp->f_inode->i_mode) && (arg & O_DIRECT)) { if (!filp->f_mapping || !filp->f_mapping->a_ops || !filp->f_mapping->a_ops->direct_IO) return -EINVAL; @@ -59,8 +60,6 @@ int setfl(int fd, struct file * filp, unsigned long arg) if (filp->f_op->check_flags) error = filp->f_op->check_flags(arg); - if (!error && filp->f_op->setfl) - error = filp->f_op->setfl(filp, arg); if (error) return error; @@ -81,7 +80,6 @@ int setfl(int fd, struct file * filp, unsigned long arg) out: return error; } -EXPORT_SYMBOL_GPL(setfl); static void f_modown(struct file *filp, struct pid *pid, enum pid_type type, int force) @@ -25,9 +25,9 @@ int sysctl_nr_open __read_mostly = 1024*1024; int sysctl_nr_open_min = BITS_PER_LONG; -/* our max() is unusable in constant expressions ;-/ */ -#define __const_max(x, y) ((x) < (y) ? (x) : (y)) -int sysctl_nr_open_max = __const_max(INT_MAX, ~(size_t)0/sizeof(void *)) & +/* our min() is unusable in constant expressions ;-/ */ +#define __const_min(x, y) ((x) < (y) ? (x) : (y)) +int sysctl_nr_open_max = __const_min(INT_MAX, ~(size_t)0/sizeof(void *)) & -BITS_PER_LONG; static void *alloc_fdmem(size_t size) @@ -37,11 +37,12 @@ static void *alloc_fdmem(size_t size) * vmalloc() if the allocation size will be considered "large" by the VM. */ if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) { - void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN|__GFP_NORETRY); + void *data = kmalloc(size, GFP_KERNEL_ACCOUNT | + __GFP_NOWARN | __GFP_NORETRY); if (data != NULL) return data; } - return vmalloc(size); + return __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM, PAGE_KERNEL); } static void __free_fdtable(struct fdtable *fdt) @@ -126,7 +127,7 @@ static struct fdtable * alloc_fdtable(unsigned int nr) if (unlikely(nr > sysctl_nr_open)) nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1; - fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL); + fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT); if (!fdt) goto out; fdt->max_fds = nr; diff --git a/fs/file_table.c b/fs/file_table.c index ae9f2676d..ad17e05eb 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -147,7 +147,6 @@ over: } return ERR_PTR(-ENFILE); } -EXPORT_SYMBOL_GPL(get_empty_filp); /** * alloc_file - allocate and initialize a 'struct file' @@ -259,7 +258,6 @@ void flush_delayed_fput(void) { delayed_fput(NULL); } -EXPORT_SYMBOL_GPL(flush_delayed_fput); static DECLARE_DELAYED_WORK(delayed_fput_work, delayed_fput); @@ -302,7 +300,6 @@ void __fput_sync(struct file *file) } EXPORT_SYMBOL(fput); -EXPORT_SYMBOL_GPL(__fput_sync); void put_filp(struct file *file) { @@ -311,7 +308,6 @@ void put_filp(struct file *file) file_free(file); } } -EXPORT_SYMBOL_GPL(put_filp); void __init files_init(void) { diff --git a/fs/filesystems.c b/fs/filesystems.c index 5797d45a7..c5618db11 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -46,9 +46,9 @@ void put_filesystem(struct file_system_type *fs) static struct file_system_type **find_filesystem(const char *name, unsigned len) { struct file_system_type **p; - for (p=&file_systems; *p; p=&(*p)->next) - if (strlen((*p)->name) == len && - strncmp((*p)->name, name, len) == 0) + for (p = &file_systems; *p; p = &(*p)->next) + if (strncmp((*p)->name, name, len) == 0 && + !(*p)->name[len]) break; return p; } diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c index ef73ed674..3e2ccade6 100644 --- a/fs/freevxfs/vxfs_inode.c +++ b/fs/freevxfs/vxfs_inode.c @@ -326,6 +326,7 @@ vxfs_iget(struct super_block *sbp, ino_t ino) } else if (S_ISLNK(ip->i_mode)) { if (!VXFS_ISIMMED(vip)) { ip->i_op = &page_symlink_inode_operations; + inode_nohighmem(ip); ip->i_mapping->a_ops = &vxfs_aops; } else { ip->i_op = &simple_symlink_inode_operations; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 7a8ea1351..5c46ed9f3 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -685,9 +685,7 @@ void wbc_account_io(struct writeback_control *wbc, struct page *page, if (!wbc->wb) return; - rcu_read_lock(); id = mem_cgroup_css_from_page(page)->id; - rcu_read_unlock(); if (id == wbc->wb_id) { wbc->wb_bytes += bytes; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 5e2e08712..4b855b65d 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -944,7 +944,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, if (!parent) return -ENOENT; - mutex_lock(&parent->i_mutex); + inode_lock(parent); if (!S_ISDIR(parent->i_mode)) goto unlock; @@ -962,7 +962,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, fuse_invalidate_entry(entry); if (child_nodeid != 0 && d_really_is_positive(entry)) { - mutex_lock(&d_inode(entry)->i_mutex); + inode_lock(d_inode(entry)); if (get_node_id(d_inode(entry)) != child_nodeid) { err = -ENOENT; goto badentry; @@ -983,7 +983,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, clear_nlink(d_inode(entry)); err = 0; badentry: - mutex_unlock(&d_inode(entry)->i_mutex); + inode_unlock(d_inode(entry)); if (!err) d_delete(entry); } else { @@ -992,7 +992,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, dput(entry); unlock: - mutex_unlock(&parent->i_mutex); + inode_unlock(parent); iput(parent); return err; } @@ -1365,15 +1365,19 @@ static int fuse_readdir(struct file *file, struct dir_context *ctx) return err; } -static const char *fuse_follow_link(struct dentry *dentry, void **cookie) +static const char *fuse_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { - struct inode *inode = d_inode(dentry); struct fuse_conn *fc = get_fuse_conn(inode); FUSE_ARGS(args); char *link; ssize_t ret; - link = (char *) __get_free_page(GFP_KERNEL); + if (!dentry) + return ERR_PTR(-ECHILD); + + link = kmalloc(PAGE_SIZE, GFP_KERNEL); if (!link) return ERR_PTR(-ENOMEM); @@ -1385,11 +1389,11 @@ static const char *fuse_follow_link(struct dentry *dentry, void **cookie) args.out.args[0].value = link; ret = fuse_simple_request(fc, &args); if (ret < 0) { - free_page((unsigned long) link); + kfree(link); link = ERR_PTR(ret); } else { link[ret] = '\0'; - *cookie = link; + set_delayed_call(done, kfree_link, link); } fuse_invalidate_atime(inode); return link; @@ -1500,7 +1504,7 @@ void fuse_set_nowrite(struct inode *inode) struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); - BUG_ON(!mutex_is_locked(&inode->i_mutex)); + BUG_ON(!inode_is_locked(inode)); spin_lock(&fc->lock); BUG_ON(fi->writectr < 0); @@ -1909,8 +1913,7 @@ static const struct inode_operations fuse_common_inode_operations = { static const struct inode_operations fuse_symlink_inode_operations = { .setattr = fuse_setattr, - .follow_link = fuse_follow_link, - .put_link = free_page_put_link, + .get_link = fuse_get_link, .readlink = generic_readlink, .getattr = fuse_getattr, .setxattr = fuse_setxattr, diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 570ca4053..b03d253ec 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -207,7 +207,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) return err; if (lock_inode) - mutex_lock(&inode->i_mutex); + inode_lock(inode); err = fuse_do_open(fc, get_node_id(inode), file, isdir); @@ -215,7 +215,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) fuse_finish_open(inode, file); if (lock_inode) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } @@ -413,9 +413,9 @@ static int fuse_flush(struct file *file, fl_owner_t id) if (err) return err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); fuse_sync_writes(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); req = fuse_get_req_nofail_nopages(fc, file); memset(&inarg, 0, sizeof(inarg)); @@ -450,7 +450,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, if (is_bad_inode(inode)) return -EIO; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * Start writeback against all dirty pages of the inode, then @@ -486,7 +486,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, err = 0; } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } @@ -1160,7 +1160,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) return generic_file_write_iter(iocb, from); } - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* We can write back this queue in page reclaim */ current->backing_dev_info = inode_to_bdi(inode); @@ -1210,7 +1210,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) } out: current->backing_dev_info = NULL; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return written ? written : err; } @@ -1322,10 +1322,10 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) { if (!write) - mutex_lock(&inode->i_mutex); + inode_lock(inode); fuse_sync_writes(inode); if (!write) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } while (count) { @@ -1413,14 +1413,14 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) return -EIO; /* Don't allow parallel writes to the same file */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); res = generic_write_checks(iocb, from); if (res > 0) res = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE); fuse_invalidate_attr(inode); if (res > 0) fuse_write_update_size(inode, iocb->ki_pos); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return res; } @@ -2231,20 +2231,77 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block) return err ? 0 : outarg.block; } +static loff_t fuse_lseek(struct file *file, loff_t offset, int whence) +{ + struct inode *inode = file->f_mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_file *ff = file->private_data; + FUSE_ARGS(args); + struct fuse_lseek_in inarg = { + .fh = ff->fh, + .offset = offset, + .whence = whence + }; + struct fuse_lseek_out outarg; + int err; + + if (fc->no_lseek) + goto fallback; + + args.in.h.opcode = FUSE_LSEEK; + args.in.h.nodeid = ff->nodeid; + args.in.numargs = 1; + args.in.args[0].size = sizeof(inarg); + args.in.args[0].value = &inarg; + args.out.numargs = 1; + args.out.args[0].size = sizeof(outarg); + args.out.args[0].value = &outarg; + err = fuse_simple_request(fc, &args); + if (err) { + if (err == -ENOSYS) { + fc->no_lseek = 1; + goto fallback; + } + return err; + } + + return vfs_setpos(file, outarg.offset, inode->i_sb->s_maxbytes); + +fallback: + err = fuse_update_attributes(inode, NULL, file, NULL); + if (!err) + return generic_file_llseek(file, offset, whence); + else + return err; +} + static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence) { loff_t retval; struct inode *inode = file_inode(file); - /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */ - if (whence == SEEK_CUR || whence == SEEK_SET) - return generic_file_llseek(file, offset, whence); - - mutex_lock(&inode->i_mutex); - retval = fuse_update_attributes(inode, NULL, file, NULL); - if (!retval) + switch (whence) { + case SEEK_SET: + case SEEK_CUR: + /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */ retval = generic_file_llseek(file, offset, whence); - mutex_unlock(&inode->i_mutex); + break; + case SEEK_END: + inode_lock(inode); + retval = fuse_update_attributes(inode, NULL, file, NULL); + if (!retval) + retval = generic_file_llseek(file, offset, whence); + inode_unlock(inode); + break; + case SEEK_HOLE: + case SEEK_DATA: + inode_lock(inode); + retval = fuse_lseek(file, offset, whence); + inode_unlock(inode); + break; + default: + retval = -EINVAL; + } return retval; } @@ -2887,7 +2944,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, return -EOPNOTSUPP; if (lock_inode) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (mode & FALLOC_FL_PUNCH_HOLE) { loff_t endbyte = offset + length - 1; err = filemap_write_and_wait_range(inode->i_mapping, @@ -2933,7 +2990,7 @@ out: clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); if (lock_inode) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 405113101..ce394b5fe 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -605,6 +605,9 @@ struct fuse_conn { /** Does the filesystem support asynchronous direct-IO submission? */ unsigned async_dio:1; + /** Is lseek not implemented by fs? */ + unsigned no_lseek:1; + /** The number of requests waiting for completion */ atomic_t num_waiting; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 2913db2a5..4d69d5c0b 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1255,8 +1255,8 @@ static int __init fuse_fs_init(void) int err; fuse_inode_cachep = kmem_cache_create("fuse_inode", - sizeof(struct fuse_inode), - 0, SLAB_HWCACHE_ALIGN, + sizeof(struct fuse_inode), 0, + SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, fuse_inode_init_once); err = -ENOMEM; if (!fuse_inode_cachep) diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 1be3b061c..791932617 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -31,9 +31,9 @@ static const char *gfs2_acl_name(int type) { switch (type) { case ACL_TYPE_ACCESS: - return GFS2_POSIX_ACL_ACCESS; + return XATTR_POSIX_ACL_ACCESS; case ACL_TYPE_DEFAULT: - return GFS2_POSIX_ACL_DEFAULT; + return XATTR_POSIX_ACL_DEFAULT; } return NULL; } diff --git a/fs/gfs2/acl.h b/fs/gfs2/acl.h index 2d65ec4cd..3af4f407a 100644 --- a/fs/gfs2/acl.h +++ b/fs/gfs2/acl.h @@ -12,8 +12,6 @@ #include "incore.h" -#define GFS2_POSIX_ACL_ACCESS "posix_acl_access" -#define GFS2_POSIX_ACL_DEFAULT "posix_acl_default" #define GFS2_ACL_MAX_ENTRIES(sdp) ((300 << (sdp)->sd_sb.sb_bsize_shift) >> 12) extern struct posix_acl *gfs2_get_acl(struct inode *inode, int type); diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 1caee0534..93f07465e 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -914,7 +914,7 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, failed: gfs2_trans_end(sdp); gfs2_inplace_release(ip); - if (ip->i_res->rs_qa_qd_num) + if (ip->i_qadata && ip->i_qadata->qa_qd_num) gfs2_quota_unlock(ip); if (inode == sdp->sd_rindex) { gfs2_glock_dq(&m_ip->i_gh); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 61296ecbd..0860f0b5b 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -787,8 +787,8 @@ static int do_strip(struct gfs2_inode *ip, struct buffer_head *dibh, if (error) goto out_rlist; - if (gfs2_rs_active(ip->i_res)) /* needs to be done with the rgrp glock held */ - gfs2_rs_deltree(ip->i_res); + if (gfs2_rs_active(&ip->i_res)) /* needs to be done with the rgrp glock held */ + gfs2_rs_deltree(&ip->i_res); error = gfs2_trans_begin(sdp, rg_blocks + RES_DINODE + RES_INDIRECT + RES_STATFS + RES_QUOTA, @@ -1291,13 +1291,9 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize) if (ret) return ret; - ret = get_write_access(inode); - if (ret) - return ret; - inode_dio_wait(inode); - ret = gfs2_rs_alloc(ip); + ret = gfs2_rsqa_alloc(ip); if (ret) goto out; @@ -1307,10 +1303,9 @@ int gfs2_setattr_size(struct inode *inode, u64 newsize) goto out; } - gfs2_rs_deltree(ip->i_res); ret = do_shrink(inode, oldsize, newsize); out: - put_write_access(inode); + gfs2_rsqa_delete(ip, NULL); return ret; } diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index ad8a5b757..6a9259230 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -82,6 +82,8 @@ #define gfs2_disk_hash2offset(h) (((u64)(h)) >> 1) #define gfs2_dir_offset2hash(p) ((u32)(((u64)(p)) << 1)) +#define GFS2_HASH_INDEX_MASK 0xffffc000 +#define GFS2_USE_HASH_FLAG 0x2000 struct qstr gfs2_qdot __read_mostly; struct qstr gfs2_qdotdot __read_mostly; @@ -108,7 +110,7 @@ static int gfs2_dir_get_existing_buffer(struct gfs2_inode *ip, u64 block, struct buffer_head *bh; int error; - error = gfs2_meta_read(ip->i_gl, block, DIO_WAIT, &bh); + error = gfs2_meta_read(ip->i_gl, block, DIO_WAIT, 0, &bh); if (error) return error; if (gfs2_metatype_check(GFS2_SB(&ip->i_inode), bh, GFS2_METATYPE_JD)) { @@ -305,7 +307,7 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, __be64 *buf, BUG_ON(extlen < 1); bh = gfs2_meta_ra(ip->i_gl, dblock, extlen); } else { - error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, &bh); + error = gfs2_meta_read(ip->i_gl, dblock, DIO_WAIT, 0, &bh); if (error) goto fail; } @@ -443,6 +445,27 @@ static int gfs2_dirent_last(const struct gfs2_dirent *dent, return 0; } +/* Look for the dirent that contains the offset specified in data. Once we + * find that dirent, there must be space available there for the new dirent */ +static int gfs2_dirent_find_offset(const struct gfs2_dirent *dent, + const struct qstr *name, + void *ptr) +{ + unsigned required = GFS2_DIRENT_SIZE(name->len); + unsigned actual = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len)); + unsigned totlen = be16_to_cpu(dent->de_rec_len); + + if (ptr < (void *)dent || ptr >= (void *)dent + totlen) + return 0; + if (gfs2_dirent_sentinel(dent)) + actual = 0; + if (ptr < (void *)dent + actual) + return -1; + if ((void *)dent + totlen >= ptr + required) + return 1; + return -1; +} + static int gfs2_dirent_find_space(const struct gfs2_dirent *dent, const struct qstr *name, void *opaque) @@ -682,6 +705,27 @@ static void dirent_del(struct gfs2_inode *dip, struct buffer_head *bh, prev->de_rec_len = cpu_to_be16(prev_rec_len); } + +static struct gfs2_dirent *do_init_dirent(struct inode *inode, + struct gfs2_dirent *dent, + const struct qstr *name, + struct buffer_head *bh, + unsigned offset) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_dirent *ndent; + unsigned totlen; + + totlen = be16_to_cpu(dent->de_rec_len); + BUG_ON(offset + name->len > totlen); + gfs2_trans_add_meta(ip->i_gl, bh); + ndent = (struct gfs2_dirent *)((char *)dent + offset); + dent->de_rec_len = cpu_to_be16(offset); + gfs2_qstr2dirent(name, totlen - offset, ndent); + return ndent; +} + + /* * Takes a dent from which to grab space as an argument. Returns the * newly created dent. @@ -691,31 +735,25 @@ static struct gfs2_dirent *gfs2_init_dirent(struct inode *inode, const struct qstr *name, struct buffer_head *bh) { - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_dirent *ndent; - unsigned offset = 0, totlen; + unsigned offset = 0; if (!gfs2_dirent_sentinel(dent)) offset = GFS2_DIRENT_SIZE(be16_to_cpu(dent->de_name_len)); - totlen = be16_to_cpu(dent->de_rec_len); - BUG_ON(offset + name->len > totlen); - gfs2_trans_add_meta(ip->i_gl, bh); - ndent = (struct gfs2_dirent *)((char *)dent + offset); - dent->de_rec_len = cpu_to_be16(offset); - gfs2_qstr2dirent(name, totlen - offset, ndent); - return ndent; + return do_init_dirent(inode, dent, name, bh, offset); } -static struct gfs2_dirent *gfs2_dirent_alloc(struct inode *inode, - struct buffer_head *bh, - const struct qstr *name) +static struct gfs2_dirent *gfs2_dirent_split_alloc(struct inode *inode, + struct buffer_head *bh, + const struct qstr *name, + void *ptr) { struct gfs2_dirent *dent; dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, - gfs2_dirent_find_space, name, NULL); + gfs2_dirent_find_offset, name, ptr); if (!dent || IS_ERR(dent)) return dent; - return gfs2_init_dirent(inode, dent, name, bh); + return do_init_dirent(inode, dent, name, bh, + (unsigned)(ptr - (void *)dent)); } static int get_leaf(struct gfs2_inode *dip, u64 leaf_no, @@ -723,7 +761,7 @@ static int get_leaf(struct gfs2_inode *dip, u64 leaf_no, { int error; - error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, bhp); + error = gfs2_meta_read(dip->i_gl, leaf_no, DIO_WAIT, 0, bhp); if (!error && gfs2_metatype_check(GFS2_SB(&dip->i_inode), *bhp, GFS2_METATYPE_LF)) { /* pr_info("block num=%llu\n", leaf_no); */ error = -EIO; @@ -1051,10 +1089,11 @@ static int dir_split_leaf(struct inode *inode, const struct qstr *name) if (!gfs2_dirent_sentinel(dent) && be32_to_cpu(dent->de_hash) < divider) { struct qstr str; + void *ptr = ((char *)dent - obh->b_data) + nbh->b_data; str.name = (char*)(dent+1); str.len = be16_to_cpu(dent->de_name_len); str.hash = be32_to_cpu(dent->de_hash); - new = gfs2_dirent_alloc(inode, nbh, &str); + new = gfs2_dirent_split_alloc(inode, nbh, &str, ptr); if (IS_ERR(new)) { error = PTR_ERR(new); break; @@ -1186,10 +1225,10 @@ static int compare_dents(const void *a, const void *b) int ret = 0; dent_a = *(const struct gfs2_dirent **)a; - hash_a = be32_to_cpu(dent_a->de_hash); + hash_a = dent_a->de_cookie; dent_b = *(const struct gfs2_dirent **)b; - hash_b = be32_to_cpu(dent_b->de_hash); + hash_b = dent_b->de_cookie; if (hash_a > hash_b) ret = 1; @@ -1227,19 +1266,20 @@ static int compare_dents(const void *a, const void *b) */ static int do_filldir_main(struct gfs2_inode *dip, struct dir_context *ctx, - const struct gfs2_dirent **darr, u32 entries, - int *copied) + struct gfs2_dirent **darr, u32 entries, + u32 sort_start, int *copied) { const struct gfs2_dirent *dent, *dent_next; u64 off, off_next; unsigned int x, y; int run = 0; - sort(darr, entries, sizeof(struct gfs2_dirent *), compare_dents, NULL); + if (sort_start < entries) + sort(&darr[sort_start], entries - sort_start, + sizeof(struct gfs2_dirent *), compare_dents, NULL); dent_next = darr[0]; - off_next = be32_to_cpu(dent_next->de_hash); - off_next = gfs2_disk_hash2offset(off_next); + off_next = dent_next->de_cookie; for (x = 0, y = 1; x < entries; x++, y++) { dent = dent_next; @@ -1247,8 +1287,7 @@ static int do_filldir_main(struct gfs2_inode *dip, struct dir_context *ctx, if (y < entries) { dent_next = darr[y]; - off_next = be32_to_cpu(dent_next->de_hash); - off_next = gfs2_disk_hash2offset(off_next); + off_next = dent_next->de_cookie; if (off < ctx->pos) continue; @@ -1295,6 +1334,40 @@ static void *gfs2_alloc_sort_buffer(unsigned size) return ptr; } + +static int gfs2_set_cookies(struct gfs2_sbd *sdp, struct buffer_head *bh, + unsigned leaf_nr, struct gfs2_dirent **darr, + unsigned entries) +{ + int sort_id = -1; + int i; + + for (i = 0; i < entries; i++) { + unsigned offset; + + darr[i]->de_cookie = be32_to_cpu(darr[i]->de_hash); + darr[i]->de_cookie = gfs2_disk_hash2offset(darr[i]->de_cookie); + + if (!sdp->sd_args.ar_loccookie) + continue; + offset = (char *)(darr[i]) - + (bh->b_data + gfs2_dirent_offset(bh->b_data)); + offset /= GFS2_MIN_DIRENT_SIZE; + offset += leaf_nr * sdp->sd_max_dents_per_leaf; + if (offset >= GFS2_USE_HASH_FLAG || + leaf_nr >= GFS2_USE_HASH_FLAG) { + darr[i]->de_cookie |= GFS2_USE_HASH_FLAG; + if (sort_id < 0) + sort_id = i; + continue; + } + darr[i]->de_cookie &= GFS2_HASH_INDEX_MASK; + darr[i]->de_cookie |= offset; + } + return sort_id; +} + + static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx, int *copied, unsigned *depth, u64 leaf_no) @@ -1304,12 +1377,11 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx, struct buffer_head *bh; struct gfs2_leaf *lf; unsigned entries = 0, entries2 = 0; - unsigned leaves = 0; - const struct gfs2_dirent **darr, *dent; + unsigned leaves = 0, leaf = 0, offset, sort_offset; + struct gfs2_dirent **darr, *dent; struct dirent_gather g; struct buffer_head **larr; - int leaf = 0; - int error, i; + int error, i, need_sort = 0, sort_id; u64 lfn = leaf_no; do { @@ -1325,6 +1397,11 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx, brelse(bh); } while(lfn); + if (*depth < GFS2_DIR_MAX_DEPTH || !sdp->sd_args.ar_loccookie) { + need_sort = 1; + sort_offset = 0; + } + if (!entries) return 0; @@ -1338,8 +1415,8 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx, larr = gfs2_alloc_sort_buffer((leaves + entries + 99) * sizeof(void *)); if (!larr) goto out; - darr = (const struct gfs2_dirent **)(larr + leaves); - g.pdent = darr; + darr = (struct gfs2_dirent **)(larr + leaves); + g.pdent = (const struct gfs2_dirent **)darr; g.offset = 0; lfn = leaf_no; @@ -1350,6 +1427,7 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx, lf = (struct gfs2_leaf *)bh->b_data; lfn = be64_to_cpu(lf->lf_next); if (lf->lf_entries) { + offset = g.offset; entries2 += be16_to_cpu(lf->lf_entries); dent = gfs2_dirent_scan(inode, bh->b_data, bh->b_size, gfs2_dirent_gather, NULL, &g); @@ -1367,17 +1445,26 @@ static int gfs2_dir_read_leaf(struct inode *inode, struct dir_context *ctx, goto out_free; } error = 0; + sort_id = gfs2_set_cookies(sdp, bh, leaf, &darr[offset], + be16_to_cpu(lf->lf_entries)); + if (!need_sort && sort_id >= 0) { + need_sort = 1; + sort_offset = offset + sort_id; + } larr[leaf++] = bh; } else { + larr[leaf++] = NULL; brelse(bh); } } while(lfn); BUG_ON(entries2 != entries); - error = do_filldir_main(ip, ctx, darr, entries, copied); + error = do_filldir_main(ip, ctx, darr, entries, need_sort ? + sort_offset : entries, copied); out_free: for(i = 0; i < leaf; i++) - brelse(larr[i]); + if (larr[i]) + brelse(larr[i]); kvfree(larr); out: return error; @@ -1483,7 +1570,7 @@ int gfs2_dir_read(struct inode *inode, struct dir_context *ctx, struct gfs2_inode *dip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); struct dirent_gather g; - const struct gfs2_dirent **darr, *dent; + struct gfs2_dirent **darr, *dent; struct buffer_head *dibh; int copied = 0; int error; @@ -1507,7 +1594,7 @@ int gfs2_dir_read(struct inode *inode, struct dir_context *ctx, /* 96 is max number of dirents which can be stuffed into an inode */ darr = kmalloc(96 * sizeof(struct gfs2_dirent *), GFP_NOFS); if (darr) { - g.pdent = darr; + g.pdent = (const struct gfs2_dirent **)darr; g.offset = 0; dent = gfs2_dirent_scan(inode, dibh->b_data, dibh->b_size, gfs2_dirent_gather, NULL, &g); @@ -1524,8 +1611,9 @@ int gfs2_dir_read(struct inode *inode, struct dir_context *ctx, error = -EIO; goto out; } + gfs2_set_cookies(sdp, dibh, 0, darr, dip->i_entries); error = do_filldir_main(dip, ctx, darr, - dip->i_entries, &copied); + dip->i_entries, 0, &copied); out: kfree(darr); } @@ -1560,15 +1648,22 @@ struct inode *gfs2_dir_search(struct inode *dir, const struct qstr *name, dent = gfs2_dirent_search(dir, name, gfs2_dirent_find, &bh); if (dent) { + struct inode *inode; + u16 rahead; + if (IS_ERR(dent)) return ERR_CAST(dent); dtype = be16_to_cpu(dent->de_type); + rahead = be16_to_cpu(dent->de_rahead); addr = be64_to_cpu(dent->de_inum.no_addr); formal_ino = be64_to_cpu(dent->de_inum.no_formal_ino); brelse(bh); if (fail_on_exist) return ERR_PTR(-EEXIST); - return gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino, 0); + inode = gfs2_inode_lookup(dir->i_sb, dtype, addr, formal_ino, 0); + if (!IS_ERR(inode)) + GFS2_I(inode)->i_rahead = rahead; + return inode; } return ERR_PTR(-ENOENT); } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 5e425469f..c9384f932 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -298,9 +298,9 @@ static int gfs2_set_flags(struct file *filp, u32 __user *ptr) gfsflags &= ~GFS2_DIF_TOPDIR; if (gfsflags & GFS2_DIF_INHERIT_JDATA) gfsflags ^= (GFS2_DIF_JDATA | GFS2_DIF_INHERIT_JDATA); - return do_gfs2_set_flags(filp, gfsflags, ~0); + return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_SYSTEM); } - return do_gfs2_set_flags(filp, gfsflags, ~GFS2_DIF_JDATA); + return do_gfs2_set_flags(filp, gfsflags, ~(GFS2_DIF_SYSTEM | GFS2_DIF_JDATA)); } static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) @@ -336,8 +336,8 @@ static void gfs2_size_hint(struct file *filep, loff_t offset, size_t size) size_t blks = (size + sdp->sd_sb.sb_bsize - 1) >> sdp->sd_sb.sb_bsize_shift; int hint = min_t(size_t, INT_MAX, blks); - if (hint > atomic_read(&ip->i_res->rs_sizehint)) - atomic_set(&ip->i_res->rs_sizehint, hint); + if (hint > atomic_read(&ip->i_res.rs_sizehint)) + atomic_set(&ip->i_res.rs_sizehint, hint); } /** @@ -397,14 +397,10 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) /* Update file times before taking page lock */ file_update_time(vma->vm_file); - ret = get_write_access(inode); + ret = gfs2_rsqa_alloc(ip); if (ret) goto out; - ret = gfs2_rs_alloc(ip); - if (ret) - goto out_write_access; - gfs2_size_hint(vma->vm_file, pos, PAGE_CACHE_SIZE); gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); @@ -486,8 +482,6 @@ out_uninit: set_page_dirty(page); wait_for_stable_page(page); } -out_write_access: - put_write_access(inode); out: sb_end_pagefault(inode->i_sb); return block_page_mkwrite_return(ret); @@ -623,7 +617,7 @@ static int gfs2_release(struct inode *inode, struct file *file) if (!(file->f_mode & FMODE_WRITE)) return 0; - gfs2_rs_delete(ip, &inode->i_writecount); + gfs2_rsqa_delete(ip, &inode->i_writecount); return 0; } @@ -703,7 +697,7 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct gfs2_inode *ip = GFS2_I(file_inode(file)); int ret; - ret = gfs2_rs_alloc(ip); + ret = gfs2_rsqa_alloc(ip); if (ret) return ret; @@ -920,7 +914,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t le if ((mode & ~FALLOC_FL_KEEP_SIZE) || gfs2_is_jdata(ip)) return -EOPNOTSUPP; - mutex_lock(&inode->i_mutex); + inode_lock(inode); gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); ret = gfs2_glock_nq(&gh); @@ -938,20 +932,21 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t le if (ret) goto out_unlock; - ret = gfs2_rs_alloc(ip); + ret = gfs2_rsqa_alloc(ip); if (ret) goto out_putw; ret = __gfs2_fallocate(file, mode, offset, len); if (ret) - gfs2_rs_deltree(ip->i_res); + gfs2_rs_deltree(&ip->i_res); + out_putw: put_write_access(inode); out_unlock: gfs2_glock_dq(&gh); out_uninit: gfs2_holder_uninit(&gh); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -962,7 +957,7 @@ static ssize_t gfs2_file_splice_write(struct pipe_inode_info *pipe, int error; struct gfs2_inode *ip = GFS2_I(out->f_mapping->host); - error = gfs2_rs_alloc(ip); + error = gfs2_rsqa_alloc(ip); if (error) return (ssize_t)error; @@ -1018,7 +1013,7 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) struct gfs2_inode *ip = GFS2_I(file_inode(file)); struct gfs2_glock *gl; unsigned int state; - int flags; + u16 flags; int error = 0; int sleeptime; @@ -1032,7 +1027,10 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) if (fl_gh->gh_state == state) goto out; locks_lock_file_wait(file, - &(struct file_lock){.fl_type = F_UNLCK}); + &(struct file_lock) { + .fl_type = F_UNLCK, + .fl_flags = FL_FLOCK + }); gfs2_glock_dq(fl_gh); gfs2_holder_reinit(state, flags, fl_gh); } else { diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 32e74710b..a4ff7b56f 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -446,7 +446,7 @@ __acquires(&gl->gl_lockref.lock) { const struct gfs2_glock_operations *glops = gl->gl_ops; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - unsigned int lck_flags = gh ? gh->gh_flags : 0; + unsigned int lck_flags = (unsigned int)(gh ? gh->gh_flags : 0); int ret; lck_flags &= (LM_FLAG_TRY | LM_FLAG_TRY_1CB | LM_FLAG_NOEXP | @@ -750,7 +750,7 @@ again: * */ -void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, +void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, u16 flags, struct gfs2_holder *gh) { INIT_LIST_HEAD(&gh->gh_list); @@ -774,7 +774,7 @@ void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, * */ -void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder *gh) +void gfs2_holder_reinit(unsigned int state, u16 flags, struct gfs2_holder *gh) { gh->gh_state = state; gh->gh_flags = flags; @@ -1080,7 +1080,7 @@ void gfs2_glock_dq_uninit(struct gfs2_holder *gh) int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number, const struct gfs2_glock_operations *glops, - unsigned int state, int flags, struct gfs2_holder *gh) + unsigned int state, u16 flags, struct gfs2_holder *gh) { struct gfs2_glock *gl; int error; @@ -1417,14 +1417,14 @@ static struct shrinker glock_shrinker = { static void glock_hash_walk(glock_examiner examiner, const struct gfs2_sbd *sdp) { struct gfs2_glock *gl; - struct rhash_head *pos, *next; + struct rhash_head *pos; const struct bucket_table *tbl; int i; rcu_read_lock(); tbl = rht_dereference_rcu(gl_hash_table.tbl, &gl_hash_table); for (i = 0; i < tbl->size; i++) { - rht_for_each_entry_safe(gl, pos, next, tbl, i, gl_node) { + rht_for_each_entry_rcu(gl, pos, tbl, i, gl_node) { if ((gl->gl_name.ln_sbd == sdp) && lockref_get_not_dead(&gl->gl_lockref)) examiner(gl); @@ -1506,7 +1506,9 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) flush_workqueue(glock_workqueue); glock_hash_walk(clear_glock, sdp); flush_workqueue(glock_workqueue); - wait_event(sdp->sd_glock_wait, atomic_read(&sdp->sd_glock_disposal) == 0); + wait_event_timeout(sdp->sd_glock_wait, + atomic_read(&sdp->sd_glock_disposal) == 0, + HZ * 600); glock_hash_walk(dump_glock_func, sdp); } @@ -1539,7 +1541,7 @@ static const char *state2str(unsigned state) return "??"; } -static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags) +static const char *hflags2str(char *buf, u16 flags, unsigned long iflags) { char *p = buf; if (flags & LM_FLAG_TRY) diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index f7cdaa8b4..46ab67fc1 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -79,15 +79,15 @@ enum { * requested had acquired and released the lock. */ -#define LM_FLAG_TRY 0x00000001 -#define LM_FLAG_TRY_1CB 0x00000002 -#define LM_FLAG_NOEXP 0x00000004 -#define LM_FLAG_ANY 0x00000008 -#define LM_FLAG_PRIORITY 0x00000010 -#define GL_ASYNC 0x00000040 -#define GL_EXACT 0x00000080 -#define GL_SKIP 0x00000100 -#define GL_NOCACHE 0x00000400 +#define LM_FLAG_TRY 0x0001 +#define LM_FLAG_TRY_1CB 0x0002 +#define LM_FLAG_NOEXP 0x0004 +#define LM_FLAG_ANY 0x0008 +#define LM_FLAG_PRIORITY 0x0010 +#define GL_ASYNC 0x0040 +#define GL_EXACT 0x0080 +#define GL_SKIP 0x0100 +#define GL_NOCACHE 0x0400 /* * lm_async_cb return flags @@ -183,8 +183,8 @@ extern int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, int create, struct gfs2_glock **glp); extern void gfs2_glock_put(struct gfs2_glock *gl); extern void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, - unsigned flags, struct gfs2_holder *gh); -extern void gfs2_holder_reinit(unsigned int state, unsigned flags, + u16 flags, struct gfs2_holder *gh); +extern void gfs2_holder_reinit(unsigned int state, u16 flags, struct gfs2_holder *gh); extern void gfs2_holder_uninit(struct gfs2_holder *gh); extern int gfs2_glock_nq(struct gfs2_holder *gh); @@ -195,7 +195,7 @@ extern void gfs2_glock_dq_wait(struct gfs2_holder *gh); extern void gfs2_glock_dq_uninit(struct gfs2_holder *gh); extern int gfs2_glock_nq_num(struct gfs2_sbd *sdp, u64 number, const struct gfs2_glock_operations *glops, - unsigned int state, int flags, + unsigned int state, u16 flags, struct gfs2_holder *gh); extern int gfs2_glock_nq_m(unsigned int num_gh, struct gfs2_holder *ghs); extern void gfs2_glock_dq_m(unsigned int num_gh, struct gfs2_holder *ghs); @@ -215,7 +215,7 @@ void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...); */ static inline int gfs2_glock_nq_init(struct gfs2_glock *gl, - unsigned int state, int flags, + unsigned int state, u16 flags, struct gfs2_holder *gh) { int error; diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index f348cfb6b..437fd73e3 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -13,6 +13,7 @@ #include <linux/gfs2_ondisk.h> #include <linux/bio.h> #include <linux/posix_acl.h> +#include <linux/security.h> #include "gfs2.h" #include "incore.h" @@ -262,6 +263,7 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags) if (ip) { set_bit(GIF_INVALID, &ip->i_flags); forget_all_cached_acls(&ip->i_inode); + security_inode_invalidate_secctx(&ip->i_inode); gfs2_dir_hash_inval(ip); } } diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index de7b4f97a..845fb09cc 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -259,8 +259,8 @@ struct gfs2_holder { struct gfs2_glock *gh_gl; struct pid *gh_owner_pid; - unsigned int gh_state; - unsigned gh_flags; + u16 gh_flags; + u16 gh_state; int gh_error; unsigned long gh_iflags; /* HIF_... */ @@ -270,6 +270,13 @@ struct gfs2_holder { /* Number of quota types we support */ #define GFS2_MAXQUOTAS 2 +struct gfs2_qadata { /* quota allocation data */ + /* Quota stuff */ + struct gfs2_quota_data *qa_qd[2 * GFS2_MAXQUOTAS]; + struct gfs2_holder qa_qd_ghs[2 * GFS2_MAXQUOTAS]; + unsigned int qa_qd_num; +}; + /* Resource group multi-block reservation, in order of appearance: Step 1. Function prepares to write, allocates a mb, sets the size hint. @@ -288,11 +295,6 @@ struct gfs2_blkreserv { struct gfs2_rbm rs_rbm; /* Start of reservation */ u32 rs_free; /* how many blocks are still free */ u64 rs_inum; /* Inode number for reservation */ - - /* ancillary quota stuff */ - struct gfs2_quota_data *rs_qa_qd[2 * GFS2_MAXQUOTAS]; - struct gfs2_holder rs_qa_qd_ghs[2 * GFS2_MAXQUOTAS]; - unsigned int rs_qa_qd_num; }; /* @@ -391,7 +393,8 @@ struct gfs2_inode { struct gfs2_glock *i_gl; /* Move into i_gh? */ struct gfs2_holder i_iopen_gh; struct gfs2_holder i_gh; /* for prepare/commit_write only */ - struct gfs2_blkreserv *i_res; /* rgrp multi-block reservation */ + struct gfs2_qadata *i_qadata; /* quota allocation data */ + struct gfs2_blkreserv i_res; /* rgrp multi-block reservation */ struct gfs2_rgrpd *i_rgd; u64 i_goal; /* goal block for allocations */ struct rw_semaphore i_rw_mutex; @@ -402,6 +405,7 @@ struct gfs2_inode { u32 i_diskflags; u8 i_height; u8 i_depth; + u16 i_rahead; }; /* @@ -558,6 +562,8 @@ struct gfs2_args { unsigned int ar_errors:2; /* errors=withdraw | panic */ unsigned int ar_nobarrier:1; /* do not send barriers */ unsigned int ar_rgrplvb:1; /* use lvbs for rgrp info */ + unsigned int ar_loccookie:1; /* use location based readdir + cookies */ int ar_commit; /* Commit interval */ int ar_statfs_quantum; /* The fast statfs interval */ int ar_quota_quantum; /* The quota interval */ @@ -685,6 +691,7 @@ struct gfs2_sbd { u64 sd_heightsize[GFS2_MAX_META_HEIGHT + 1]; u32 sd_max_jheight; /* Max height of journaled file's meta tree */ u64 sd_jheightsize[GFS2_MAX_META_HEIGHT + 1]; + u32 sd_max_dents_per_leaf; /* Max number of dirents in a leaf block */ struct gfs2_args sd_args; /* Mount arguments */ struct gfs2_tune sd_tune; /* Filesystem tuning structure */ diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 063fdfcf8..352f95876 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -191,13 +191,13 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned int type, fail_refresh: ip->i_iopen_gh.gh_flags |= GL_NOCACHE; ip->i_iopen_gh.gh_gl->gl_object = NULL; - gfs2_glock_dq_uninit(&ip->i_iopen_gh); + gfs2_glock_dq_wait(&ip->i_iopen_gh); + gfs2_holder_uninit(&ip->i_iopen_gh); fail_iopen: if (io_gl) gfs2_glock_put(io_gl); fail_put: ip->i_gl->gl_object = NULL; - gfs2_glock_put(ip->i_gl); fail: iget_failed(inode); return ERR_PTR(error); @@ -593,7 +593,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, struct gfs2_inode *dip = GFS2_I(dir), *ip; struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); struct gfs2_glock *io_gl; - int error, free_vfs_inode = 0; + int error, free_vfs_inode = 1; u32 aflags = 0; unsigned blocks = 1; struct gfs2_diradd da = { .bh = NULL, .save_loc = 1, }; @@ -601,7 +601,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (!name->len || name->len > GFS2_FNAMESIZE) return -ENAMETOOLONG; - error = gfs2_rs_alloc(dip); + error = gfs2_rsqa_alloc(dip); if (error) return error; @@ -650,10 +650,10 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, error = posix_acl_create(dir, &mode, &default_acl, &acl); if (error) - goto fail_free_vfs_inode; + goto fail_gunlock; ip = GFS2_I(inode); - error = gfs2_rs_alloc(ip); + error = gfs2_rsqa_alloc(ip); if (error) goto fail_free_acls; @@ -685,6 +685,11 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, ip->i_entries = 2; break; } + + /* Force SYSTEM flag on all files and subdirs of a SYSTEM directory */ + if (dip->i_diskflags & GFS2_DIF_SYSTEM) + ip->i_diskflags |= GFS2_DIF_SYSTEM; + gfs2_set_inode_flags(inode); if ((GFS2_I(d_inode(sdp->sd_root_dir)) == dip) || @@ -733,6 +738,9 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, gfs2_set_iop(inode); insert_inode_hash(inode); + free_vfs_inode = 0; /* After this point, the inode is no longer + considered free. Any failures need to undo + the gfs2 structures. */ if (default_acl) { error = gfs2_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); posix_acl_release(default_acl); @@ -766,24 +774,19 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, return error; fail_gunlock3: - gfs2_glock_dq_uninit(ghs + 1); - if (ip->i_gl) - gfs2_glock_put(ip->i_gl); - goto fail_gunlock; - + gfs2_glock_dq_uninit(&ip->i_iopen_gh); + gfs2_glock_put(io_gl); fail_gunlock2: gfs2_glock_dq_uninit(ghs + 1); fail_free_inode: if (ip->i_gl) gfs2_glock_put(ip->i_gl); - gfs2_rs_delete(ip, NULL); + gfs2_rsqa_delete(ip, NULL); fail_free_acls: if (default_acl) posix_acl_release(default_acl); if (acl) posix_acl_release(acl); -fail_free_vfs_inode: - free_vfs_inode = 1; fail_gunlock: gfs2_dir_no_add(&da); gfs2_glock_dq_uninit(ghs); @@ -898,7 +901,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, if (S_ISDIR(inode->i_mode)) return -EPERM; - error = gfs2_rs_alloc(dip); + error = gfs2_rsqa_alloc(dip); if (error) return error; @@ -1371,7 +1374,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, if (error) return error; - error = gfs2_rs_alloc(ndip); + error = gfs2_rsqa_alloc(ndip); if (error) return error; @@ -1712,24 +1715,30 @@ static int gfs2_rename2(struct inode *odir, struct dentry *odentry, } /** - * gfs2_follow_link - Follow a symbolic link + * gfs2_get_link - Follow a symbolic link * @dentry: The dentry of the link - * @nd: Data that we pass to vfs_follow_link() + * @inode: The inode of the link + * @done: destructor for return value * * This can handle symlinks of any size. * * Returns: 0 on success or error code */ -static const char *gfs2_follow_link(struct dentry *dentry, void **cookie) +static const char *gfs2_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { - struct gfs2_inode *ip = GFS2_I(d_inode(dentry)); + struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder i_gh; struct buffer_head *dibh; unsigned int size; char *buf; int error; + if (!dentry) + return ERR_PTR(-ECHILD); + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh); error = gfs2_glock_nq(&i_gh); if (error) { @@ -1759,7 +1768,7 @@ static const char *gfs2_follow_link(struct dentry *dentry, void **cookie) out: gfs2_glock_dq_uninit(&i_gh); if (!IS_ERR(buf)) - *cookie = buf; + set_delayed_call(done, kfree_link, buf); return buf; } @@ -1854,11 +1863,7 @@ static int setattr_chown(struct inode *inode, struct iattr *attr) if (!(attr->ia_valid & ATTR_GID) || gid_eq(ogid, ngid)) ogid = ngid = NO_GID_QUOTA_CHANGE; - error = get_write_access(inode); - if (error) - return error; - - error = gfs2_rs_alloc(ip); + error = gfs2_rsqa_alloc(ip); if (error) goto out; @@ -1898,7 +1903,6 @@ out_end_trans: out_gunlock_q: gfs2_quota_unlock(ip); out: - put_write_access(inode); return error; } @@ -1920,7 +1924,7 @@ static int gfs2_setattr(struct dentry *dentry, struct iattr *attr) struct gfs2_holder i_gh; int error; - error = gfs2_rs_alloc(ip); + error = gfs2_rsqa_alloc(ip); if (error) return error; @@ -2002,7 +2006,7 @@ static int gfs2_setxattr(struct dentry *dentry, const char *name, gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); ret = gfs2_glock_nq(&gh); if (ret == 0) { - ret = gfs2_rs_alloc(ip); + ret = gfs2_rsqa_alloc(ip); if (ret == 0) ret = generic_setxattr(dentry, name, data, size, flags); gfs2_glock_dq(&gh); @@ -2043,7 +2047,7 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name) gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); ret = gfs2_glock_nq(&gh); if (ret == 0) { - ret = gfs2_rs_alloc(ip); + ret = gfs2_rsqa_alloc(ip); if (ret == 0) ret = generic_removexattr(dentry, name); gfs2_glock_dq(&gh); @@ -2063,7 +2067,7 @@ static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); if (ret) @@ -2090,7 +2094,7 @@ static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, gfs2_glock_dq_uninit(&gh); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -2132,8 +2136,7 @@ const struct inode_operations gfs2_dir_iops = { const struct inode_operations gfs2_symlink_iops = { .readlink = generic_readlink, - .follow_link = gfs2_follow_link, - .put_link = kfree_put_link, + .get_link = gfs2_get_link, .permission = gfs2_permission, .setattr = gfs2_setattr, .getattr = gfs2_getattr, diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 536e7a625..0ff028c15 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -716,6 +716,9 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl, } trace_gfs2_log_flush(sdp, 1); + if (type == SHUTDOWN_FLUSH) + clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); + sdp->sd_log_flush_head = sdp->sd_log_head; sdp->sd_log_flush_wrapped = 0; tr = sdp->sd_log_tr; diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index fb2b42cf4..f99f8e94d 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -41,7 +41,9 @@ static void gfs2_init_inode_once(void *foo) inode_init_once(&ip->i_inode); init_rwsem(&ip->i_rw_mutex); INIT_LIST_HEAD(&ip->i_trunc_list); - ip->i_res = NULL; + ip->i_qadata = NULL; + memset(&ip->i_res, 0, sizeof(ip->i_res)); + RB_CLEAR_NODE(&ip->i_res.rs_node); ip->i_hash_cache = NULL; } @@ -112,7 +114,8 @@ static int __init init_gfs2_fs(void) gfs2_inode_cachep = kmem_cache_create("gfs2_inode", sizeof(struct gfs2_inode), 0, SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD, + SLAB_MEM_SPREAD| + SLAB_ACCOUNT, gfs2_init_inode_once); if (!gfs2_inode_cachep) goto fail; @@ -135,10 +138,10 @@ static int __init init_gfs2_fs(void) if (!gfs2_quotad_cachep) goto fail; - gfs2_rsrv_cachep = kmem_cache_create("gfs2_mblk", - sizeof(struct gfs2_blkreserv), + gfs2_qadata_cachep = kmem_cache_create("gfs2_qadata", + sizeof(struct gfs2_qadata), 0, 0, NULL); - if (!gfs2_rsrv_cachep) + if (!gfs2_qadata_cachep) goto fail; register_shrinker(&gfs2_qd_shrinker); @@ -193,8 +196,8 @@ fail_lru: unregister_shrinker(&gfs2_qd_shrinker); gfs2_glock_exit(); - if (gfs2_rsrv_cachep) - kmem_cache_destroy(gfs2_rsrv_cachep); + if (gfs2_qadata_cachep) + kmem_cache_destroy(gfs2_qadata_cachep); if (gfs2_quotad_cachep) kmem_cache_destroy(gfs2_quotad_cachep); @@ -238,7 +241,7 @@ static void __exit exit_gfs2_fs(void) rcu_barrier(); mempool_destroy(gfs2_page_pool); - kmem_cache_destroy(gfs2_rsrv_cachep); + kmem_cache_destroy(gfs2_qadata_cachep); kmem_cache_destroy(gfs2_quotad_cachep); kmem_cache_destroy(gfs2_rgrpd_cachep); kmem_cache_destroy(gfs2_bufdata_cachep); diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 0e1d4be58..e137d96f1 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -187,6 +187,52 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno) return bh; } +static void gfs2_meta_read_endio(struct bio *bio) +{ + struct bio_vec *bvec; + int i; + + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; + struct buffer_head *bh = page_buffers(page); + unsigned int len = bvec->bv_len; + + while (bh_offset(bh) < bvec->bv_offset) + bh = bh->b_this_page; + do { + struct buffer_head *next = bh->b_this_page; + len -= bh->b_size; + bh->b_end_io(bh, !bio->bi_error); + bh = next; + } while (bh && len); + } + bio_put(bio); +} + +/* + * Submit several consecutive buffer head I/O requests as a single bio I/O + * request. (See submit_bh_wbc.) + */ +static void gfs2_submit_bhs(int rw, struct buffer_head *bhs[], int num) +{ + struct buffer_head *bh = bhs[0]; + struct bio *bio; + int i; + + if (!num) + return; + + bio = bio_alloc(GFP_NOIO, num); + bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); + bio->bi_bdev = bh->b_bdev; + for (i = 0; i < num; i++) { + bh = bhs[i]; + bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh)); + } + bio->bi_end_io = gfs2_meta_read_endio; + submit_bio(rw, bio); +} + /** * gfs2_meta_read - Read a block from disk * @gl: The glock covering the block @@ -198,10 +244,11 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno) */ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, - struct buffer_head **bhp) + int rahead, struct buffer_head **bhp) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - struct buffer_head *bh; + struct buffer_head *bh, *bhs[2]; + int num = 0; if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) { *bhp = NULL; @@ -213,14 +260,31 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, lock_buffer(bh); if (buffer_uptodate(bh)) { unlock_buffer(bh); - return 0; + flags &= ~DIO_WAIT; + } else { + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + bhs[num++] = bh; } - bh->b_end_io = end_buffer_read_sync; - get_bh(bh); - submit_bh(READ_SYNC | REQ_META | REQ_PRIO, bh); + + if (rahead) { + bh = gfs2_getbuf(gl, blkno + 1, CREATE); + + lock_buffer(bh); + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + brelse(bh); + } else { + bh->b_end_io = end_buffer_read_sync; + bhs[num++] = bh; + } + } + + gfs2_submit_bhs(READ_SYNC | REQ_META | REQ_PRIO, bhs, num); if (!(flags & DIO_WAIT)) return 0; + bh = *bhp; wait_on_buffer(bh); if (unlikely(!buffer_uptodate(bh))) { struct gfs2_trans *tr = current->journal_info; @@ -341,8 +405,12 @@ int gfs2_meta_indirect_buffer(struct gfs2_inode *ip, int height, u64 num, struct buffer_head *bh; int ret = 0; u32 mtype = height ? GFS2_METATYPE_IN : GFS2_METATYPE_DI; + int rahead = 0; + + if (num == ip->i_no_addr) + rahead = ip->i_rahead; - ret = gfs2_meta_read(gl, num, DIO_WAIT, &bh); + ret = gfs2_meta_read(gl, num, DIO_WAIT, rahead, &bh); if (ret == 0 && gfs2_metatype_check(sdp, bh, mtype)) { brelse(bh); ret = -EIO; diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h index 8ca161567..c5086c8af 100644 --- a/fs/gfs2/meta_io.h +++ b/fs/gfs2/meta_io.h @@ -53,7 +53,7 @@ static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping) extern struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno); extern int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, - struct buffer_head **bhp); + int rahead, struct buffer_head **bhp); extern int gfs2_meta_wait(struct gfs2_sbd *sdp, struct buffer_head *bh); extern struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index baab99b69..dbed9e243 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -352,6 +352,9 @@ static int gfs2_read_sb(struct gfs2_sbd *sdp, int silent) sdp->sd_jheightsize[x] = ~0; gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT); + sdp->sd_max_dents_per_leaf = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_leaf)) / + GFS2_MIN_DIRENT_SIZE; return 0; } @@ -910,8 +913,7 @@ fail_qc_i: fail_ut_i: iput(sdp->sd_sc_inode); fail: - if (pn) - iput(pn); + iput(pn); return error; } @@ -1315,9 +1317,7 @@ static struct dentry *gfs2_mount(struct file_system_type *fs_type, int flags, if ((flags ^ s->s_flags) & MS_RDONLY) goto error_super; } else { - char b[BDEVNAME_SIZE]; - - strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); + snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); sb_set_blocksize(s, block_size(bdev)); error = fill_super(s, &args, flags & MS_SILENT ? 1 : 0); if (error) diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 3a3122653..a39891344 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -388,7 +388,7 @@ static int bh_get(struct gfs2_quota_data *qd) error = gfs2_block_map(&ip->i_inode, block, &bh_map, 0); if (error) goto fail; - error = gfs2_meta_read(ip->i_gl, bh_map.b_blocknr, DIO_WAIT, &bh); + error = gfs2_meta_read(ip->i_gl, bh_map.b_blocknr, DIO_WAIT, 0, &bh); if (error) goto fail; error = -EIO; @@ -527,37 +527,70 @@ static void qdsb_put(struct gfs2_quota_data *qd) qd_put(qd); } +/** + * gfs2_qa_alloc - make sure we have a quota allocations data structure, + * if necessary + * @ip: the inode for this reservation + */ +int gfs2_qa_alloc(struct gfs2_inode *ip) +{ + int error = 0; + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + + if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) + return 0; + + down_write(&ip->i_rw_mutex); + if (ip->i_qadata == NULL) { + ip->i_qadata = kmem_cache_zalloc(gfs2_qadata_cachep, GFP_NOFS); + if (!ip->i_qadata) + error = -ENOMEM; + } + up_write(&ip->i_rw_mutex); + return error; +} + +void gfs2_qa_delete(struct gfs2_inode *ip, atomic_t *wcount) +{ + down_write(&ip->i_rw_mutex); + if (ip->i_qadata && ((wcount == NULL) || (atomic_read(wcount) <= 1))) { + kmem_cache_free(gfs2_qadata_cachep, ip->i_qadata); + ip->i_qadata = NULL; + } + up_write(&ip->i_rw_mutex); +} + int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_quota_data **qd; int error; - if (ip->i_res == NULL) { - error = gfs2_rs_alloc(ip); + if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) + return 0; + + if (ip->i_qadata == NULL) { + error = gfs2_rsqa_alloc(ip); if (error) return error; } - qd = ip->i_res->rs_qa_qd; + qd = ip->i_qadata->qa_qd; - if (gfs2_assert_warn(sdp, !ip->i_res->rs_qa_qd_num) || + if (gfs2_assert_warn(sdp, !ip->i_qadata->qa_qd_num) || gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags))) return -EIO; - if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF) - return 0; - error = qdsb_get(sdp, make_kqid_uid(ip->i_inode.i_uid), qd); if (error) goto out; - ip->i_res->rs_qa_qd_num++; + ip->i_qadata->qa_qd_num++; qd++; error = qdsb_get(sdp, make_kqid_gid(ip->i_inode.i_gid), qd); if (error) goto out; - ip->i_res->rs_qa_qd_num++; + ip->i_qadata->qa_qd_num++; qd++; if (!uid_eq(uid, NO_UID_QUOTA_CHANGE) && @@ -565,7 +598,7 @@ int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) error = qdsb_get(sdp, make_kqid_uid(uid), qd); if (error) goto out; - ip->i_res->rs_qa_qd_num++; + ip->i_qadata->qa_qd_num++; qd++; } @@ -574,7 +607,7 @@ int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) error = qdsb_get(sdp, make_kqid_gid(gid), qd); if (error) goto out; - ip->i_res->rs_qa_qd_num++; + ip->i_qadata->qa_qd_num++; qd++; } @@ -587,17 +620,17 @@ out: void gfs2_quota_unhold(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - unsigned int x; + u32 x; - if (ip->i_res == NULL) + if (ip->i_qadata == NULL) return; gfs2_assert_warn(sdp, !test_bit(GIF_QD_LOCKED, &ip->i_flags)); - for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) { - qdsb_put(ip->i_res->rs_qa_qd[x]); - ip->i_res->rs_qa_qd[x] = NULL; + for (x = 0; x < ip->i_qadata->qa_qd_num; x++) { + qdsb_put(ip->i_qadata->qa_qd[x]); + ip->i_qadata->qa_qd[x] = NULL; } - ip->i_res->rs_qa_qd_num = 0; + ip->i_qadata->qa_qd_num = 0; } static int sort_qd(const void *a, const void *b) @@ -843,7 +876,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) unsigned int nalloc = 0, blocks; int error; - error = gfs2_rs_alloc(ip); + error = gfs2_rsqa_alloc(ip); if (error) return error; @@ -855,7 +888,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) return -ENOMEM; sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL); - mutex_lock(&ip->i_inode.i_mutex); + inode_lock(&ip->i_inode); for (qx = 0; qx < num_qd; qx++) { error = gfs2_glock_nq_init(qda[qx]->qd_gl, LM_ST_EXCLUSIVE, GL_NOCACHE, &ghs[qx]); @@ -920,7 +953,7 @@ out_alloc: out: while (qx--) gfs2_glock_dq_uninit(&ghs[qx]); - mutex_unlock(&ip->i_inode.i_mutex); + inode_unlock(&ip->i_inode); kfree(ghs); gfs2_log_flush(ip->i_gl->gl_name.ln_sbd, ip->i_gl, NORMAL_FLUSH); return error; @@ -1003,23 +1036,23 @@ int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_quota_data *qd; - unsigned int x; + u32 x; int error = 0; - error = gfs2_quota_hold(ip, uid, gid); - if (error) - return error; - if (capable(CAP_SYS_RESOURCE) || sdp->sd_args.ar_quota != GFS2_QUOTA_ON) return 0; - sort(ip->i_res->rs_qa_qd, ip->i_res->rs_qa_qd_num, + error = gfs2_quota_hold(ip, uid, gid); + if (error) + return error; + + sort(ip->i_qadata->qa_qd, ip->i_qadata->qa_qd_num, sizeof(struct gfs2_quota_data *), sort_qd, NULL); - for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) { - qd = ip->i_res->rs_qa_qd[x]; - error = do_glock(qd, NO_FORCE, &ip->i_res->rs_qa_qd_ghs[x]); + for (x = 0; x < ip->i_qadata->qa_qd_num; x++) { + qd = ip->i_qadata->qa_qd[x]; + error = do_glock(qd, NO_FORCE, &ip->i_qadata->qa_qd_ghs[x]); if (error) break; } @@ -1028,7 +1061,7 @@ int gfs2_quota_lock(struct gfs2_inode *ip, kuid_t uid, kgid_t gid) set_bit(GIF_QD_LOCKED, &ip->i_flags); else { while (x--) - gfs2_glock_dq_uninit(&ip->i_res->rs_qa_qd_ghs[x]); + gfs2_glock_dq_uninit(&ip->i_qadata->qa_qd_ghs[x]); gfs2_quota_unhold(ip); } @@ -1076,20 +1109,20 @@ void gfs2_quota_unlock(struct gfs2_inode *ip) struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_quota_data *qda[4]; unsigned int count = 0; - unsigned int x; + u32 x; int found; if (!test_and_clear_bit(GIF_QD_LOCKED, &ip->i_flags)) goto out; - for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) { + for (x = 0; x < ip->i_qadata->qa_qd_num; x++) { struct gfs2_quota_data *qd; int sync; - qd = ip->i_res->rs_qa_qd[x]; + qd = ip->i_qadata->qa_qd[x]; sync = need_sync(qd); - gfs2_glock_dq_uninit(&ip->i_res->rs_qa_qd_ghs[x]); + gfs2_glock_dq_uninit(&ip->i_qadata->qa_qd_ghs[x]); if (!sync) continue; @@ -1158,7 +1191,7 @@ int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid, struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_quota_data *qd; s64 value, warn, limit; - unsigned int x; + u32 x; int error = 0; ap->allowed = UINT_MAX; /* Assume we are permitted a whole lot */ @@ -1168,8 +1201,8 @@ int gfs2_quota_check(struct gfs2_inode *ip, kuid_t uid, kgid_t gid, if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON) return 0; - for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) { - qd = ip->i_res->rs_qa_qd[x]; + for (x = 0; x < ip->i_qadata->qa_qd_num; x++) { + qd = ip->i_qadata->qa_qd[x]; if (!(qid_eq(qd->qd_id, make_kqid_uid(uid)) || qid_eq(qd->qd_id, make_kqid_gid(gid)))) @@ -1216,15 +1249,17 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change, kuid_t uid, kgid_t gid) { struct gfs2_quota_data *qd; - unsigned int x; + u32 x; + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change)) + if (sdp->sd_args.ar_quota != GFS2_QUOTA_ON || + gfs2_assert_warn(sdp, change)) return; if (ip->i_diskflags & GFS2_DIF_SYSTEM) return; - for (x = 0; x < ip->i_res->rs_qa_qd_num; x++) { - qd = ip->i_res->rs_qa_qd[x]; + for (x = 0; x < ip->i_qadata->qa_qd_num; x++) { + qd = ip->i_qadata->qa_qd[x]; if (qid_eq(qd->qd_id, make_kqid_uid(uid)) || qid_eq(qd->qd_id, make_kqid_gid(gid))) { @@ -1635,11 +1670,11 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid, if (error) return error; - error = gfs2_rs_alloc(ip); + error = gfs2_rsqa_alloc(ip); if (error) goto out_put; - mutex_lock(&ip->i_inode.i_mutex); + inode_lock(&ip->i_inode); error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE, 0, &q_gh); if (error) goto out_unlockput; @@ -1704,7 +1739,7 @@ out_i: out_q: gfs2_glock_dq_uninit(&q_gh); out_unlockput: - mutex_unlock(&ip->i_inode.i_mutex); + inode_unlock(&ip->i_inode); out_put: qd_put(qd); return error; diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index ad04b3aca..5e47c935a 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -18,6 +18,8 @@ struct gfs2_sbd; #define NO_UID_QUOTA_CHANGE INVALID_UID #define NO_GID_QUOTA_CHANGE INVALID_GID +extern int gfs2_qa_alloc(struct gfs2_inode *ip); +extern void gfs2_qa_delete(struct gfs2_inode *ip, atomic_t *wcount); extern int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid); extern void gfs2_quota_unhold(struct gfs2_inode *ip); diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index c134c0462..07c0265aa 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -596,27 +596,13 @@ void gfs2_free_clones(struct gfs2_rgrpd *rgd) } /** - * gfs2_rs_alloc - make sure we have a reservation assigned to the inode + * gfs2_rsqa_alloc - make sure we have a reservation assigned to the inode + * plus a quota allocations data structure, if necessary * @ip: the inode for this reservation */ -int gfs2_rs_alloc(struct gfs2_inode *ip) +int gfs2_rsqa_alloc(struct gfs2_inode *ip) { - int error = 0; - - down_write(&ip->i_rw_mutex); - if (ip->i_res) - goto out; - - ip->i_res = kmem_cache_zalloc(gfs2_rsrv_cachep, GFP_NOFS); - if (!ip->i_res) { - error = -ENOMEM; - goto out; - } - - RB_CLEAR_NODE(&ip->i_res->rs_node); -out: - up_write(&ip->i_rw_mutex); - return error; + return gfs2_qa_alloc(ip); } static void dump_rs(struct seq_file *seq, const struct gfs2_blkreserv *rs) @@ -678,21 +664,20 @@ void gfs2_rs_deltree(struct gfs2_blkreserv *rs) } /** - * gfs2_rs_delete - delete a multi-block reservation + * gfs2_rsqa_delete - delete a multi-block reservation and quota allocation * @ip: The inode for this reservation * @wcount: The inode's write count, or NULL * */ -void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount) +void gfs2_rsqa_delete(struct gfs2_inode *ip, atomic_t *wcount) { down_write(&ip->i_rw_mutex); - if (ip->i_res && ((wcount == NULL) || (atomic_read(wcount) <= 1))) { - gfs2_rs_deltree(ip->i_res); - BUG_ON(ip->i_res->rs_free); - kmem_cache_free(gfs2_rsrv_cachep, ip->i_res); - ip->i_res = NULL; + if ((wcount == NULL) || (atomic_read(wcount) <= 1)) { + gfs2_rs_deltree(&ip->i_res); + BUG_ON(ip->i_res.rs_free); } up_write(&ip->i_rw_mutex); + gfs2_qa_delete(ip, wcount); } /** @@ -1158,7 +1143,7 @@ static int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) for (x = 0; x < length; x++) { bi = rgd->rd_bits + x; - error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, &bi->bi_bh); + error = gfs2_meta_read(gl, rgd->rd_addr + x, 0, 0, &bi->bi_bh); if (error) goto fail; } @@ -1456,7 +1441,7 @@ static void rs_insert(struct gfs2_inode *ip) { struct rb_node **newn, *parent = NULL; int rc; - struct gfs2_blkreserv *rs = ip->i_res; + struct gfs2_blkreserv *rs = &ip->i_res; struct gfs2_rgrpd *rgd = rs->rs_rbm.rgd; u64 fsblock = gfs2_rbm_to_block(&rs->rs_rbm); @@ -1503,7 +1488,7 @@ static void rg_mblk_search(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip, { struct gfs2_rbm rbm = { .rgd = rgd, }; u64 goal; - struct gfs2_blkreserv *rs = ip->i_res; + struct gfs2_blkreserv *rs = &ip->i_res; u32 extlen; u32 free_blocks = rgd->rd_free_clone - rgd->rd_reserved; int ret; @@ -1574,7 +1559,7 @@ static u64 gfs2_next_unreserved_block(struct gfs2_rgrpd *rgd, u64 block, } if (n) { - while ((rs_cmp(block, length, rs) == 0) && (ip->i_res != rs)) { + while ((rs_cmp(block, length, rs) == 0) && (&ip->i_res != rs)) { block = gfs2_rbm_to_block(&rs->rs_rbm) + rs->rs_free; n = n->rb_right; if (n == NULL) @@ -1804,7 +1789,7 @@ static void try_rgrp_unlink(struct gfs2_rgrpd *rgd, u64 *last_unlinked, u64 skip continue; *last_unlinked = block; - error = gfs2_glock_get(sdp, block, &gfs2_inode_glops, CREATE, &gl); + error = gfs2_glock_get(sdp, block, &gfs2_iopen_glops, CREATE, &gl); if (error) continue; @@ -1984,7 +1969,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, struct gfs2_alloc_parms *ap) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *begin = NULL; - struct gfs2_blkreserv *rs = ip->i_res; + struct gfs2_blkreserv *rs = &ip->i_res; int error = 0, rg_locked, flags = 0; u64 last_unlinked = NO_BLOCK; int loops = 0; @@ -2113,7 +2098,7 @@ next_rgrp: void gfs2_inplace_release(struct gfs2_inode *ip) { - struct gfs2_blkreserv *rs = ip->i_res; + struct gfs2_blkreserv *rs = &ip->i_res; if (rs->rs_rgd_gh.gh_gl) gfs2_glock_dq_uninit(&rs->rs_rgd_gh); @@ -2267,7 +2252,7 @@ static void gfs2_rgrp_error(struct gfs2_rgrpd *rgd) static void gfs2_adjust_reservation(struct gfs2_inode *ip, const struct gfs2_rbm *rbm, unsigned len) { - struct gfs2_blkreserv *rs = ip->i_res; + struct gfs2_blkreserv *rs = &ip->i_res; struct gfs2_rgrpd *rgd = rbm->rgd; unsigned rlen; u64 block; @@ -2310,8 +2295,8 @@ static void gfs2_set_alloc_start(struct gfs2_rbm *rbm, { u64 goal; - if (gfs2_rs_active(ip->i_res)) { - *rbm = ip->i_res->rs_rbm; + if (gfs2_rs_active(&ip->i_res)) { + *rbm = ip->i_res.rs_rbm; return; } @@ -2365,7 +2350,7 @@ int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *nblocks, gfs2_alloc_extent(&rbm, dinode, nblocks); block = gfs2_rbm_to_block(&rbm); rbm.rgd->rd_last_alloc = block - rbm.rgd->rd_data0; - if (gfs2_rs_active(ip->i_res)) + if (gfs2_rs_active(&ip->i_res)) gfs2_adjust_reservation(ip, &rbm, *nblocks); ndata = *nblocks; if (dinode) diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index c0ab33fa3..66b51cf66 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -49,9 +49,9 @@ extern void gfs2_inplace_release(struct gfs2_inode *ip); extern int gfs2_alloc_blocks(struct gfs2_inode *ip, u64 *bn, unsigned int *n, bool dinode, u64 *generation); -extern int gfs2_rs_alloc(struct gfs2_inode *ip); +extern int gfs2_rsqa_alloc(struct gfs2_inode *ip); extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs); -extern void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount); +extern void gfs2_rsqa_delete(struct gfs2_inode *ip, atomic_t *wcount); extern void __gfs2_free_blocks(struct gfs2_inode *ip, u64 bstart, u32 blen, int meta); extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); @@ -78,7 +78,7 @@ extern int gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, extern int gfs2_fitrim(struct file *filp, void __user *argp); /* This is how to tell if a reservation is in the rgrp tree: */ -static inline bool gfs2_rs_active(struct gfs2_blkreserv *rs) +static inline bool gfs2_rs_active(const struct gfs2_blkreserv *rs) { return rs && !RB_EMPTY_NODE(&rs->rs_node); } diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 894fb01a9..8f960a51a 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -83,6 +83,8 @@ enum { Opt_nobarrier, Opt_rgrplvb, Opt_norgrplvb, + Opt_loccookie, + Opt_noloccookie, Opt_error, }; @@ -122,6 +124,8 @@ static const match_table_t tokens = { {Opt_nobarrier, "nobarrier"}, {Opt_rgrplvb, "rgrplvb"}, {Opt_norgrplvb, "norgrplvb"}, + {Opt_loccookie, "loccookie"}, + {Opt_noloccookie, "noloccookie"}, {Opt_error, NULL} }; @@ -278,6 +282,12 @@ int gfs2_mount_args(struct gfs2_args *args, char *options) case Opt_norgrplvb: args->ar_rgrplvb = 0; break; + case Opt_loccookie: + args->ar_loccookie = 1; + break; + case Opt_noloccookie: + args->ar_loccookie = 0; + break; case Opt_error: default: pr_warn("invalid mount option: %s\n", o); @@ -556,6 +566,7 @@ void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; gfs2_trans_add_meta(l_ip->i_gl, l_bh); + gfs2_trans_add_meta(m_ip->i_gl, m_bh); spin_lock(&sdp->sd_statfs_spin); m_sc->sc_total += l_sc->sc_total; @@ -564,10 +575,8 @@ void update_statfs(struct gfs2_sbd *sdp, struct buffer_head *m_bh, memset(l_sc, 0, sizeof(struct gfs2_statfs_change)); memset(l_bh->b_data + sizeof(struct gfs2_dinode), 0, sizeof(struct gfs2_statfs_change)); - spin_unlock(&sdp->sd_statfs_spin); - - gfs2_trans_add_meta(m_ip->i_gl, m_bh); gfs2_statfs_change_out(m_sc, m_bh->b_data + sizeof(struct gfs2_dinode)); + spin_unlock(&sdp->sd_statfs_spin); } int gfs2_statfs_sync(struct super_block *sb, int type) @@ -842,10 +851,6 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) gfs2_quota_sync(sdp->sd_vfs, 0); gfs2_statfs_sync(sdp->sd_vfs, 0); - down_write(&sdp->sd_log_flush_lock); - clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); - up_write(&sdp->sd_log_flush_lock); - gfs2_log_flush(sdp, NULL, SHUTDOWN_FLUSH); wait_event(sdp->sd_reserving_log_wait, atomic_read(&sdp->sd_reserving_log) == 0); gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks); @@ -1419,6 +1424,8 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root) seq_puts(s, ",demote_interface_used"); if (args->ar_rgrplvb) seq_puts(s, ",rgrplvb"); + if (args->ar_loccookie) + seq_puts(s, ",loccookie"); return 0; } @@ -1512,6 +1519,7 @@ static void gfs2_evict_inode(struct inode *inode) struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder gh; + struct address_space *metamapping; int error; if (test_bit(GIF_FREE_VFS_INODE, &ip->i_flags)) { @@ -1526,7 +1534,8 @@ static void gfs2_evict_inode(struct inode *inode) error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh); if (unlikely(error)) { ip->i_iopen_gh.gh_flags |= GL_NOCACHE; - gfs2_glock_dq_uninit(&ip->i_iopen_gh); + gfs2_glock_dq_wait(&ip->i_iopen_gh); + gfs2_holder_uninit(&ip->i_iopen_gh); goto out; } @@ -1575,8 +1584,8 @@ static void gfs2_evict_inode(struct inode *inode) out_truncate: gfs2_log_flush(sdp, ip->i_gl, NORMAL_FLUSH); + metamapping = gfs2_glock2aspace(ip->i_gl); if (test_bit(GLF_DIRTY, &ip->i_gl->gl_flags)) { - struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl); filemap_fdatawrite(metamapping); filemap_fdatawait(metamapping); } @@ -1589,16 +1598,17 @@ out_truncate: goto out_unlock; /* Needs to be done before glock release & also in a transaction */ truncate_inode_pages(&inode->i_data, 0); + truncate_inode_pages(metamapping, 0); gfs2_trans_end(sdp); out_unlock: /* Error path for case 1 */ - if (gfs2_rs_active(ip->i_res)) - gfs2_rs_deltree(ip->i_res); + if (gfs2_rs_active(&ip->i_res)) + gfs2_rs_deltree(&ip->i_res); if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { ip->i_iopen_gh.gh_flags |= GL_NOCACHE; - gfs2_glock_dq(&ip->i_iopen_gh); + gfs2_glock_dq_wait(&ip->i_iopen_gh); } gfs2_holder_uninit(&ip->i_iopen_gh); gfs2_glock_dq_uninit(&gh); @@ -1607,7 +1617,7 @@ out_unlock: out: /* Case 3 starts here */ truncate_inode_pages_final(&inode->i_data); - gfs2_rs_delete(ip, NULL); + gfs2_rsqa_delete(ip, NULL); gfs2_ordered_del_inode(ip); clear_inode(inode); gfs2_dir_hash_inval(ip); @@ -1619,7 +1629,8 @@ out: if (ip->i_iopen_gh.gh_gl) { ip->i_iopen_gh.gh_gl->gl_object = NULL; ip->i_iopen_gh.gh_flags |= GL_NOCACHE; - gfs2_glock_dq_uninit(&ip->i_iopen_gh); + gfs2_glock_dq_wait(&ip->i_iopen_gh); + gfs2_holder_uninit(&ip->i_iopen_gh); } } @@ -1632,7 +1643,9 @@ static struct inode *gfs2_alloc_inode(struct super_block *sb) ip->i_flags = 0; ip->i_gl = NULL; ip->i_rgd = NULL; - ip->i_res = NULL; + memset(&ip->i_res, 0, sizeof(ip->i_res)); + RB_CLEAR_NODE(&ip->i_res.rs_node); + ip->i_rahead = 0; } return &ip->i_inode; } diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 86d2035ac..cf6458357 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -27,7 +27,7 @@ struct kmem_cache *gfs2_inode_cachep __read_mostly; struct kmem_cache *gfs2_bufdata_cachep __read_mostly; struct kmem_cache *gfs2_rgrpd_cachep __read_mostly; struct kmem_cache *gfs2_quotad_cachep __read_mostly; -struct kmem_cache *gfs2_rsrv_cachep __read_mostly; +struct kmem_cache *gfs2_qadata_cachep __read_mostly; mempool_t *gfs2_page_pool __read_mostly; void gfs2_assert_i(struct gfs2_sbd *sdp) diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index cbdcbdf39..c81295f40 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -149,7 +149,7 @@ extern struct kmem_cache *gfs2_inode_cachep; extern struct kmem_cache *gfs2_bufdata_cachep; extern struct kmem_cache *gfs2_rgrpd_cachep; extern struct kmem_cache *gfs2_quotad_cachep; -extern struct kmem_cache *gfs2_rsrv_cachep; +extern struct kmem_cache *gfs2_qadata_cachep; extern mempool_t *gfs2_page_pool; static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt, diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 53ce76a37..e8dfb4740 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -119,7 +119,7 @@ static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data) __be64 *eablk, *end; int error; - error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &bh); + error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, 0, &bh); if (error) return error; @@ -143,7 +143,7 @@ static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data) break; bn = be64_to_cpu(*eablk); - error = gfs2_meta_read(ip->i_gl, bn, DIO_WAIT, &eabh); + error = gfs2_meta_read(ip->i_gl, bn, DIO_WAIT, 0, &eabh); if (error) break; error = ea_foreach_i(ip, eabh, ea_call, data); @@ -477,7 +477,7 @@ static int gfs2_iter_unstuffed(struct gfs2_inode *ip, struct gfs2_ea_header *ea, return -ENOMEM; for (x = 0; x < nptrs; x++) { - error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0, + error = gfs2_meta_read(ip->i_gl, be64_to_cpu(*dataptrs), 0, 0, bh + x); if (error) { while (x--) @@ -979,7 +979,7 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er, if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) { __be64 *end; - error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, + error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, 0, &indbh); if (error) return error; @@ -1237,56 +1237,6 @@ static int gfs2_xattr_set(const struct xattr_handler *handler, size, flags, handler->flags); } - -static int ea_acl_chmod_unstuffed(struct gfs2_inode *ip, - struct gfs2_ea_header *ea, char *data) -{ - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - unsigned int amount = GFS2_EA_DATA_LEN(ea); - unsigned int nptrs = DIV_ROUND_UP(amount, sdp->sd_jbsize); - int ret; - - ret = gfs2_trans_begin(sdp, nptrs + RES_DINODE, 0); - if (ret) - return ret; - - ret = gfs2_iter_unstuffed(ip, ea, data, NULL); - gfs2_trans_end(sdp); - - return ret; -} - -int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data) -{ - struct inode *inode = &ip->i_inode; - struct gfs2_sbd *sdp = GFS2_SB(inode); - struct gfs2_ea_location el; - int error; - - error = gfs2_ea_find(ip, GFS2_EATYPE_SYS, GFS2_POSIX_ACL_ACCESS, &el); - if (error) - return error; - - if (GFS2_EA_IS_STUFFED(el.el_ea)) { - error = gfs2_trans_begin(sdp, RES_DINODE + RES_EATTR, 0); - if (error == 0) { - gfs2_trans_add_meta(ip->i_gl, el.el_bh); - memcpy(GFS2_EA2DATA(el.el_ea), data, - GFS2_EA_DATA_LEN(el.el_ea)); - } - } else { - error = ea_acl_chmod_unstuffed(ip, el.el_ea, data); - } - - brelse(el.el_bh); - if (error) - return error; - - error = gfs2_setattr_simple(inode, attr); - gfs2_trans_end(sdp); - return error; -} - static int ea_dealloc_indirect(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); @@ -1306,7 +1256,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); - error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &indbh); + error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, 0, &indbh); if (error) return error; diff --git a/fs/gfs2/xattr.h b/fs/gfs2/xattr.h index d392f8358..2d887c88e 100644 --- a/fs/gfs2/xattr.h +++ b/fs/gfs2/xattr.h @@ -62,6 +62,5 @@ extern int gfs2_ea_dealloc(struct gfs2_inode *ip); /* Exported to acl.c */ extern int gfs2_xattr_acl_get(struct gfs2_inode *ip, const char *name, char **data); -extern int gfs2_xattr_acl_chmod(struct gfs2_inode *ip, struct iattr *attr, char *data); #endif /* __EATTR_DOT_H__ */ diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c index db458ee3a..1eb5d415d 100644 --- a/fs/hfs/catalog.c +++ b/fs/hfs/catalog.c @@ -214,7 +214,7 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, struct qstr *str) { struct super_block *sb; struct hfs_find_data fd; - struct list_head *pos; + struct hfs_readdir_data *rd; int res, type; hfs_dbg(CAT_MOD, "delete_cat: %s,%u\n", str ? str->name : NULL, cnid); @@ -240,9 +240,7 @@ int hfs_cat_delete(u32 cnid, struct inode *dir, struct qstr *str) } } - list_for_each(pos, &HFS_I(dir)->open_dir_list) { - struct hfs_readdir_data *rd = - list_entry(pos, struct hfs_readdir_data, list); + list_for_each_entry(rd, &HFS_I(dir)->open_dir_list, list) { if (fd.tree->keycmp(fd.search_key, (void *)&rd->key) < 0) rd->file->f_pos--; } diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index 70788e038..e9f2b855f 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -173,9 +173,9 @@ static int hfs_dir_release(struct inode *inode, struct file *file) { struct hfs_readdir_data *rd = file->private_data; if (rd) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); list_del(&rd->list); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); kfree(rd); } return 0; diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index b99ebddb1..6686bf39a 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -570,13 +570,13 @@ static int hfs_file_release(struct inode *inode, struct file *file) if (HFS_IS_RSRC(inode)) inode = HFS_I(inode)->rsrc_inode; if (atomic_dec_and_test(&HFS_I(inode)->opencnt)) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); hfs_file_truncate(inode); //if (inode->i_flags & S_DEAD) { // hfs_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL); // hfs_delete_inode(inode); //} - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } return 0; } @@ -656,7 +656,7 @@ static int hfs_file_fsync(struct file *filp, loff_t start, loff_t end, ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* sync the inode to buffers */ ret = write_inode_now(inode, 0); @@ -668,7 +668,7 @@ static int hfs_file_fsync(struct file *filp, loff_t start, loff_t end, err = sync_blockdev(sb->s_bdev); if (!ret) ret = err; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c index aa3f0d6d0..a3ec3ae7d 100644 --- a/fs/hfs/mdb.c +++ b/fs/hfs/mdb.c @@ -166,7 +166,7 @@ int hfs_mdb_get(struct super_block *sb) pr_warn("continuing without an alternate MDB\n"); } - HFS_SB(sb)->bitmap = (__be32 *)__get_free_pages(GFP_KERNEL, PAGE_SIZE < 8192 ? 1 : 0); + HFS_SB(sb)->bitmap = kmalloc(8192, GFP_KERNEL); if (!HFS_SB(sb)->bitmap) goto out; @@ -360,7 +360,7 @@ void hfs_mdb_put(struct super_block *sb) unload_nls(HFS_SB(sb)->nls_io); unload_nls(HFS_SB(sb)->nls_disk); - free_pages((unsigned long)HFS_SB(sb)->bitmap, PAGE_SIZE < 8192 ? 1 : 0); + kfree(HFS_SB(sb)->bitmap); kfree(HFS_SB(sb)); sb->s_fs_info = NULL; } diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 4574fdd3d..1ca95c232 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -483,8 +483,8 @@ static int __init init_hfs_fs(void) int err; hfs_inode_cachep = kmem_cache_create("hfs_inode_cache", - sizeof(struct hfs_inode_info), 0, SLAB_HWCACHE_ALIGN, - hfs_init_once); + sizeof(struct hfs_inode_info), 0, + SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, hfs_init_once); if (!hfs_inode_cachep) return -ENOMEM; err = register_filesystem(&hfs_fs_type); diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index d0f39dcbb..a4e867e08 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -284,9 +284,9 @@ static int hfsplus_dir_release(struct inode *inode, struct file *file) { struct hfsplus_readdir_data *rd = file->private_data; if (rd) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); list_del(&rd->list); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); kfree(rd); } return 0; diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 6dd107d74..1a6394cdb 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -229,14 +229,14 @@ static int hfsplus_file_release(struct inode *inode, struct file *file) if (HFSPLUS_IS_RSRC(inode)) inode = HFSPLUS_I(inode)->rsrc_inode; if (atomic_dec_and_test(&HFSPLUS_I(inode)->opencnt)) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); hfsplus_file_truncate(inode); if (inode->i_flags & S_DEAD) { hfsplus_delete_cat(inode->i_ino, HFSPLUS_SB(sb)->hidden_dir, NULL); hfsplus_delete_inode(inode); } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } return 0; } @@ -286,7 +286,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, error = filemap_write_and_wait_range(inode->i_mapping, start, end); if (error) return error; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * Sync inode metadata into the catalog and extent trees. @@ -327,7 +327,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return error; } @@ -403,6 +403,7 @@ struct inode *hfsplus_new_inode(struct super_block *sb, umode_t mode) } else if (S_ISLNK(inode->i_mode)) { sbi->file_count++; inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &hfsplus_aops; hip->clump_blocks = 1; } else @@ -526,6 +527,7 @@ int hfsplus_cat_read_inode(struct inode *inode, struct hfs_find_data *fd) inode->i_mapping->a_ops = &hfsplus_aops; } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &hfsplus_aops; } else { init_special_inode(inode, inode->i_mode, diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c index 0624ce4e0..32a49e292 100644 --- a/fs/hfsplus/ioctl.c +++ b/fs/hfsplus/ioctl.c @@ -93,7 +93,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags) goto out_drop_write; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); if ((flags & (FS_IMMUTABLE_FL|FS_APPEND_FL)) || inode->i_flags & (S_IMMUTABLE|S_APPEND)) { @@ -126,7 +126,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags) mark_inode_dirty(inode); out_unlock_inode: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); out_drop_write: mnt_drop_write_file(file); out: diff --git a/fs/hfsplus/posix_acl.c b/fs/hfsplus/posix_acl.c index df0c9af68..afb33eda6 100644 --- a/fs/hfsplus/posix_acl.c +++ b/fs/hfsplus/posix_acl.c @@ -21,10 +21,10 @@ struct posix_acl *hfsplus_get_posix_acl(struct inode *inode, int type) switch (type) { case ACL_TYPE_ACCESS: - xattr_name = POSIX_ACL_XATTR_ACCESS; + xattr_name = XATTR_NAME_POSIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: - xattr_name = POSIX_ACL_XATTR_DEFAULT; + xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT; break; default: return ERR_PTR(-EINVAL); @@ -66,7 +66,7 @@ int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl, switch (type) { case ACL_TYPE_ACCESS: - xattr_name = POSIX_ACL_XATTR_ACCESS; + xattr_name = XATTR_NAME_POSIX_ACL_ACCESS; if (acl) { err = posix_acl_equiv_mode(acl, &inode->i_mode); if (err < 0) @@ -76,7 +76,7 @@ int hfsplus_set_posix_acl(struct inode *inode, struct posix_acl *acl, break; case ACL_TYPE_DEFAULT: - xattr_name = POSIX_ACL_XATTR_DEFAULT; + xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT; if (!S_ISDIR(inode->i_mode)) return acl ? -EACCES : 0; break; diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 7302d96ae..5d54490a1 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -663,7 +663,7 @@ static int __init init_hfsplus_fs(void) int err; hfsplus_inode_cachep = kmem_cache_create("hfsplus_icache", - HFSPLUS_INODE_SIZE, 0, SLAB_HWCACHE_ALIGN, + HFSPLUS_INODE_SIZE, 0, SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, hfsplus_init_once); if (!hfsplus_inode_cachep) return -ENOMEM; diff --git a/fs/hfsplus/xattr.c b/fs/hfsplus/xattr.c index e41a010cd..ab01530b4 100644 --- a/fs/hfsplus/xattr.c +++ b/fs/hfsplus/xattr.c @@ -431,9 +431,6 @@ int hfsplus_setxattr(struct dentry *dentry, const char *name, char *xattr_name; int res; - if (!strcmp(name, "")) - return -EINVAL; - xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1, GFP_KERNEL); if (!xattr_name) @@ -589,9 +586,6 @@ ssize_t hfsplus_getxattr(struct dentry *dentry, const char *name, int res; char *xattr_name; - if (!strcmp(name, "")) - return -EINVAL; - xattr_name = kmalloc(NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN + 1, GFP_KERNEL); if (!xattr_name) @@ -853,9 +847,6 @@ static int hfsplus_osx_getxattr(const struct xattr_handler *handler, struct dentry *dentry, const char *name, void *buffer, size_t size) { - if (!strcmp(name, "")) - return -EINVAL; - /* * Don't allow retrieving properly prefixed attributes * by prepending them with "osx." @@ -876,9 +867,6 @@ static int hfsplus_osx_setxattr(const struct xattr_handler *handler, struct dentry *dentry, const char *name, const void *buffer, size_t size, int flags) { - if (!strcmp(name, "")) - return -EINVAL; - /* * Don't allow setting properly prefixed attributes * by prepending them with "osx." diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 5a7b3229b..d1abbee28 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -223,7 +223,7 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb) { struct hostfs_inode_info *hi; - hi = kmalloc(sizeof(*hi), GFP_KERNEL); + hi = kmalloc(sizeof(*hi), GFP_KERNEL_ACCOUNT); if (hi == NULL) return NULL; hi->fd = -1; @@ -378,9 +378,9 @@ static int hostfs_fsync(struct file *file, loff_t start, loff_t end, if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = fsync_file(HOSTFS_I(inode)->fd, datasync); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -890,9 +890,14 @@ static const struct inode_operations hostfs_dir_iops = { .setattr = hostfs_setattr, }; -static const char *hostfs_follow_link(struct dentry *dentry, void **cookie) +static const char *hostfs_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { - char *link = __getname(); + char *link; + if (!dentry) + return ERR_PTR(-ECHILD); + link = kmalloc(PATH_MAX, GFP_KERNEL); if (link) { char *path = dentry_name(dentry); int err = -ENOMEM; @@ -903,25 +908,20 @@ static const char *hostfs_follow_link(struct dentry *dentry, void **cookie) __putname(path); } if (err < 0) { - __putname(link); + kfree(link); return ERR_PTR(err); } } else { return ERR_PTR(-ENOMEM); } - return *cookie = link; -} - -static void hostfs_put_link(struct inode *unused, void *cookie) -{ - __putname(cookie); + set_delayed_call(done, kfree_link, link); + return link; } static const struct inode_operations hostfs_link_iops = { .readlink = generic_readlink, - .follow_link = hostfs_follow_link, - .put_link = hostfs_put_link, + .get_link = hostfs_get_link, }; static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent) diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index dc540bfce..e57a53c13 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -33,7 +33,7 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence) if (whence == SEEK_DATA || whence == SEEK_HOLE) return -EINVAL; - mutex_lock(&i->i_mutex); + inode_lock(i); hpfs_lock(s); /*pr_info("dir lseek\n");*/ @@ -48,12 +48,12 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence) ok: filp->f_pos = new_off; hpfs_unlock(s); - mutex_unlock(&i->i_mutex); + inode_unlock(i); return new_off; fail: /*pr_warn("illegal lseek: %016llx\n", new_off);*/ hpfs_unlock(s); - mutex_unlock(&i->i_mutex); + inode_unlock(i); return -ESPIPE; } diff --git a/fs/hpfs/inode.c b/fs/hpfs/inode.c index 933c73780..1f3c6d762 100644 --- a/fs/hpfs/inode.c +++ b/fs/hpfs/inode.c @@ -77,6 +77,7 @@ void hpfs_read_inode(struct inode *i) kfree(ea); i->i_mode = S_IFLNK | 0777; i->i_op = &page_symlink_inode_operations; + inode_nohighmem(i); i->i_data.a_ops = &hpfs_symlink_aops; set_nlink(i, 1); i->i_size = ea_size; diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c index a69bbc1e8..a13692918 100644 --- a/fs/hpfs/map.c +++ b/fs/hpfs/map.c @@ -133,7 +133,7 @@ __le32 *hpfs_load_bitmap_directory(struct super_block *s, secno bmp) void hpfs_load_hotfix_map(struct super_block *s, struct hpfs_spare_block *spareblock) { struct quad_buffer_head qbh; - u32 *directory; + __le32 *directory; u32 n_hotfixes, n_used_hotfixes; unsigned i; diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index bffb908ac..bb8d67e27 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -332,6 +332,7 @@ static int hpfs_symlink(struct inode *dir, struct dentry *dentry, const char *sy result->i_blocks = 1; set_nlink(result, 1); result->i_size = strlen(symlink); + inode_nohighmem(result); result->i_op = &page_symlink_inode_operations; result->i_data.a_ops = &hpfs_symlink_aops; @@ -475,7 +476,7 @@ out: static int hpfs_symlink_readpage(struct file *file, struct page *page) { - char *link = kmap(page); + char *link = page_address(page); struct inode *i = page->mapping->host; struct fnode *fnode; struct buffer_head *bh; @@ -491,14 +492,12 @@ static int hpfs_symlink_readpage(struct file *file, struct page *page) goto fail; hpfs_unlock(i->i_sb); SetPageUptodate(page); - kunmap(page); unlock_page(page); return 0; fail: hpfs_unlock(i->i_sb); SetPageError(page); - kunmap(page); unlock_page(page); return err; } diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index a56159189..458cf4630 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -261,7 +261,7 @@ static int init_inodecache(void) hpfs_inode_cachep = kmem_cache_create("hpfs_inode_cache", sizeof(struct hpfs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (hpfs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 595ebdb41..e1f465a38 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -4,11 +4,11 @@ * Nadia Yvette Chambers, 2002 * * Copyright (C) 2002 Linus Torvalds. + * License: GPL */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/module.h> #include <linux/thread_info.h> #include <asm/current.h> #include <linux/sched.h> /* remove ASAP */ @@ -141,7 +141,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) vma_len = (loff_t)(vma->vm_end - vma->vm_start); - mutex_lock(&inode->i_mutex); + inode_lock(inode); file_accessed(file); ret = -ENOMEM; @@ -157,7 +157,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) if (vma->vm_flags & VM_WRITE && inode->i_size < len) inode->i_size = len; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -324,11 +324,48 @@ static void remove_huge_page(struct page *page) delete_from_page_cache(page); } +static void +hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end) +{ + struct vm_area_struct *vma; + + /* + * end == 0 indicates that the entire range after + * start should be unmapped. + */ + vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) { + unsigned long v_offset; + unsigned long v_end; + + /* + * Can the expression below overflow on 32-bit arches? + * No, because the interval tree returns us only those vmas + * which overlap the truncated area starting at pgoff, + * and no vma on a 32-bit arch can span beyond the 4GB. + */ + if (vma->vm_pgoff < start) + v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT; + else + v_offset = 0; + + if (!end) + v_end = vma->vm_end; + else { + v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT) + + vma->vm_start; + if (v_end > vma->vm_end) + v_end = vma->vm_end; + } + + unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end, + NULL); + } +} /* * remove_inode_hugepages handles two distinct cases: truncation and hole * punch. There are subtle differences in operation for each case. - + * * truncation is indicated by end of range being LLONG_MAX * In this case, we first scan the range and release found pages. * After releasing pages, hugetlb_unreserve_pages cleans up region/reserv @@ -379,6 +416,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, for (i = 0; i < pagevec_count(&pvec); ++i) { struct page *page = pvec.pages[i]; + bool rsv_on_error; u32 hash; /* @@ -395,37 +433,43 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, mapping, next, 0); mutex_lock(&hugetlb_fault_mutex_table[hash]); - lock_page(page); - if (likely(!page_mapped(page))) { - bool rsv_on_error = !PagePrivate(page); - /* - * We must free the huge page and remove - * from page cache (remove_huge_page) BEFORE - * removing the region/reserve map - * (hugetlb_unreserve_pages). In rare out - * of memory conditions, removal of the - * region/reserve map could fail. Before - * free'ing the page, note PagePrivate which - * is used in case of error. - */ - remove_huge_page(page); - freed++; - if (!truncate_op) { - if (unlikely(hugetlb_unreserve_pages( - inode, next, - next + 1, 1))) - hugetlb_fix_reserve_counts( - inode, rsv_on_error); - } - } else { - /* - * If page is mapped, it was faulted in after - * being unmapped. It indicates a race between - * hole punch and page fault. Do nothing in - * this case. Getting here in a truncate - * operation is a bug. - */ + /* + * If page is mapped, it was faulted in after being + * unmapped in caller. Unmap (again) now after taking + * the fault mutex. The mutex will prevent faults + * until we finish removing the page. + * + * This race can only happen in the hole punch case. + * Getting here in a truncate operation is a bug. + */ + if (unlikely(page_mapped(page))) { BUG_ON(truncate_op); + + i_mmap_lock_write(mapping); + hugetlb_vmdelete_list(&mapping->i_mmap, + next * pages_per_huge_page(h), + (next + 1) * pages_per_huge_page(h)); + i_mmap_unlock_write(mapping); + } + + lock_page(page); + /* + * We must free the huge page and remove from page + * cache (remove_huge_page) BEFORE removing the + * region/reserve map (hugetlb_unreserve_pages). In + * rare out of memory conditions, removal of the + * region/reserve map could fail. Before free'ing + * the page, note PagePrivate which is used in case + * of error. + */ + rsv_on_error = !PagePrivate(page); + remove_huge_page(page); + freed++; + if (!truncate_op) { + if (unlikely(hugetlb_unreserve_pages(inode, + next, next + 1, 1))) + hugetlb_fix_reserve_counts(inode, + rsv_on_error); } unlock_page(page); @@ -452,44 +496,6 @@ static void hugetlbfs_evict_inode(struct inode *inode) clear_inode(inode); } -static inline void -hugetlb_vmdelete_list(struct rb_root *root, pgoff_t start, pgoff_t end) -{ - struct vm_area_struct *vma; - - /* - * end == 0 indicates that the entire range after - * start should be unmapped. - */ - vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) { - unsigned long v_offset; - unsigned long v_end; - - /* - * Can the expression below overflow on 32-bit arches? - * No, because the interval tree returns us only those vmas - * which overlap the truncated area starting at pgoff, - * and no vma on a 32-bit arch can span beyond the 4GB. - */ - if (vma->vm_pgoff < start) - v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT; - else - v_offset = 0; - - if (!end) - v_end = vma->vm_end; - else { - v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT) - + vma->vm_start; - if (v_end > vma->vm_end) - v_end = vma->vm_end; - } - - unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end, - NULL); - } -} - static int hugetlb_vmtruncate(struct inode *inode, loff_t offset) { pgoff_t pgoff; @@ -524,7 +530,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (hole_end > hole_start) { struct address_space *mapping = inode->i_mapping; - mutex_lock(&inode->i_mutex); + inode_lock(inode); i_mmap_lock_write(mapping); if (!RB_EMPTY_ROOT(&mapping->i_mmap)) hugetlb_vmdelete_list(&mapping->i_mmap, @@ -532,7 +538,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) hole_end >> PAGE_SHIFT); i_mmap_unlock_write(mapping); remove_inode_hugepages(inode, hole_start, hole_end); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } return 0; @@ -566,7 +572,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, start = offset >> hpage_shift; end = (offset + len + hpage_size - 1) >> hpage_shift; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ error = inode_newsize_ok(inode, offset + len); @@ -653,7 +659,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, i_size_write(inode, offset + len); inode->i_ctime = CURRENT_TIME; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return error; } @@ -711,7 +717,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb, /* * Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never * be taken from reclaim -- unlike regular filesystems. This needs an - * annotation because huge_pmd_share() does an allocation under + * annotation because huge_pmd_share() does an allocation under hugetlb's * i_mmap_rwsem. */ static struct lock_class_key hugetlbfs_i_mmap_rwsem_key; @@ -741,7 +747,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, /* * The policy is initialized here even if we are creating a * private inode because initialization simply creates an - * an empty rb tree and calls spin_lock_init(), later when we + * an empty rb tree and calls rwlock_init(), later when we * call mpol_free_shared_policy() it will just return because * the rb tree will still be empty. */ @@ -763,6 +769,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, break; case S_IFLNK: inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); break; } lockdep_annotate_inode_mutex_key(inode); @@ -1204,7 +1211,6 @@ static struct file_system_type hugetlbfs_fs_type = { .mount = hugetlbfs_mount, .kill_sb = kill_litter_super, }; -MODULE_ALIAS_FS("hugetlbfs"); static struct vfsmount *hugetlbfs_vfsmount[HUGE_MAX_HSTATE]; @@ -1324,7 +1330,7 @@ static int __init init_hugetlbfs_fs(void) error = -ENOMEM; hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache", sizeof(struct hugetlbfs_inode_info), - 0, 0, init_once); + 0, SLAB_ACCOUNT, init_once); if (hugetlbfs_inode_cachep == NULL) goto out2; @@ -1358,26 +1364,4 @@ static int __init init_hugetlbfs_fs(void) out2: return error; } - -static void __exit exit_hugetlbfs_fs(void) -{ - struct hstate *h; - int i; - - - /* - * Make sure all delayed rcu free inodes are flushed before we - * destroy cache. - */ - rcu_barrier(); - kmem_cache_destroy(hugetlbfs_inode_cachep); - i = 0; - for_each_hstate(h) - kern_unmount(hugetlbfs_vfsmount[i++]); - unregister_filesystem(&hugetlbfs_fs_type); -} - -module_init(init_hugetlbfs_fs) -module_exit(exit_hugetlbfs_fs) - -MODULE_LICENSE("GPL"); +fs_initcall(init_hugetlbfs_fs) diff --git a/fs/inode.c b/fs/inode.c index 1be5f9003..69b8b526c 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -154,6 +154,12 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_rdev = 0; inode->dirtied_when = 0; +#ifdef CONFIG_CGROUP_WRITEBACK + inode->i_wb_frn_winner = 0; + inode->i_wb_frn_avg_time = 0; + inode->i_wb_frn_history = 0; +#endif + if (security_inode_alloc(inode)) goto out; spin_lock_init(&inode->i_lock); @@ -225,7 +231,7 @@ void __destroy_inode(struct inode *inode) inode_detach_wb(inode); security_inode_free(inode); fsnotify_inode_delete(inode); - locks_free_lock_context(inode->i_flctx); + locks_free_lock_context(inode); if (!inode->i_nlink) { WARN_ON(atomic_long_read(&inode->i_sb->s_remove_count) == 0); atomic_long_dec(&inode->i_sb->s_remove_count); @@ -495,7 +501,7 @@ void clear_inode(struct inode *inode) */ spin_lock_irq(&inode->i_data.tree_lock); BUG_ON(inode->i_data.nrpages); - BUG_ON(inode->i_data.nrshadows); + BUG_ON(inode->i_data.nrexceptional); spin_unlock_irq(&inode->i_data.tree_lock); BUG_ON(!list_empty(&inode->i_data.private_list)); BUG_ON(!(inode->i_state & I_FREEING)); @@ -966,9 +972,9 @@ void lock_two_nondirectories(struct inode *inode1, struct inode *inode2) swap(inode1, inode2); if (inode1 && !S_ISDIR(inode1->i_mode)) - mutex_lock(&inode1->i_mutex); + inode_lock(inode1); if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1) - mutex_lock_nested(&inode2->i_mutex, I_MUTEX_NONDIR2); + inode_lock_nested(inode2, I_MUTEX_NONDIR2); } EXPORT_SYMBOL(lock_two_nondirectories); @@ -980,9 +986,9 @@ EXPORT_SYMBOL(lock_two_nondirectories); void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2) { if (inode1 && !S_ISDIR(inode1->i_mode)) - mutex_unlock(&inode1->i_mutex); + inode_unlock(inode1); if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1) - mutex_unlock(&inode2->i_mutex); + inode_unlock(inode2); } EXPORT_SYMBOL(unlock_two_nondirectories); @@ -1883,7 +1889,7 @@ void __init inode_init(void) sizeof(struct inode), 0, (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); /* Hash may have been set up in inode_init_early */ @@ -2028,3 +2034,9 @@ void inode_set_flags(struct inode *inode, unsigned int flags, new_flags) != old_flags)); } EXPORT_SYMBOL(inode_set_flags); + +void inode_nohighmem(struct inode *inode) +{ + mapping_set_gfp_mask(inode->i_mapping, GFP_USER); +} +EXPORT_SYMBOL(inode_nohighmem); diff --git a/fs/internal.h b/fs/internal.h index 71859c4d0..b71deeece 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -55,7 +55,7 @@ extern int vfs_path_lookup(struct dentry *, struct vfsmount *, /* * namespace.c */ -extern int copy_mount_options(const void __user *, unsigned long *); +extern void *copy_mount_options(const void __user *); extern char *copy_mount_string(const void __user *); extern struct vfsmount *lookup_mnt(struct path *); @@ -151,3 +151,10 @@ extern void mnt_pin_kill(struct mount *m); * fs/nsfs.c */ extern struct dentry_operations ns_dentry_operations; + +/* + * fs/ioctl.c + */ +extern int do_vfs_ioctl(struct file *file, unsigned int fd, unsigned int cmd, + unsigned long arg); +extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); diff --git a/fs/ioctl.c b/fs/ioctl.c index 5d01d2638..116a333e9 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -15,6 +15,7 @@ #include <linux/writeback.h> #include <linux/buffer_head.h> #include <linux/falloc.h> +#include "internal.h" #include <asm/ioctls.h> @@ -32,8 +33,7 @@ * * Returns 0 on success, -errno on error. */ -static long vfs_ioctl(struct file *filp, unsigned int cmd, - unsigned long arg) +long vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { int error = -ENOTTY; @@ -215,6 +215,29 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg) return error; } +static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd, + u64 off, u64 olen, u64 destoff) +{ + struct fd src_file = fdget(srcfd); + int ret; + + if (!src_file.file) + return -EBADF; + ret = vfs_clone_file_range(src_file.file, off, dst_file, destoff, olen); + fdput(src_file); + return ret; +} + +static long ioctl_file_clone_range(struct file *file, void __user *argp) +{ + struct file_clone_range args; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + return ioctl_file_clone(file, args.src_fd, args.src_offset, + args.src_length, args.dest_offset); +} + #ifdef CONFIG_BLOCK static inline sector_t logical_to_blk(struct inode *inode, loff_t offset) @@ -411,9 +434,9 @@ int generic_block_fiemap(struct inode *inode, u64 len, get_block_t *get_block) { int ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = __generic_block_fiemap(inode, fieinfo, start, len, get_block); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } EXPORT_SYMBOL(generic_block_fiemap); @@ -545,6 +568,41 @@ static int ioctl_fsthaw(struct file *filp) return thaw_super(sb); } +static long ioctl_file_dedupe_range(struct file *file, void __user *arg) +{ + struct file_dedupe_range __user *argp = arg; + struct file_dedupe_range *same = NULL; + int ret; + unsigned long size; + u16 count; + + if (get_user(count, &argp->dest_count)) { + ret = -EFAULT; + goto out; + } + + size = offsetof(struct file_dedupe_range __user, info[count]); + + same = memdup_user(argp, size); + if (IS_ERR(same)) { + ret = PTR_ERR(same); + same = NULL; + goto out; + } + + ret = vfs_dedupe_file_range(file, same); + if (ret) + goto out; + + ret = copy_to_user(argp, same, size); + if (ret) + ret = -EFAULT; + +out: + kfree(same); + return ret; +} + /* * When you add any new common ioctls to the switches above and below * please update compat_sys_ioctl() too. @@ -600,6 +658,15 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, case FIGETBSZ: return put_user(inode->i_sb->s_blocksize, argp); + case FICLONE: + return ioctl_file_clone(filp, arg, 0, 0, 0); + + case FICLONERANGE: + return ioctl_file_clone_range(filp, argp); + + case FIDEDUPERANGE: + return ioctl_file_dedupe_range(filp, argp); + default: if (S_ISREG(inode->i_mode)) error = file_ioctl(filp, cmd, arg); diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index d67a16f2a..bcd2d41b3 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -94,7 +94,7 @@ static int __init init_inodecache(void) isofs_inode_cachep = kmem_cache_create("isofs_inode_cache", sizeof(struct iso_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (isofs_inode_cachep == NULL) return -ENOMEM; @@ -1417,6 +1417,7 @@ static int isofs_read_inode(struct inode *inode, int relocated) inode->i_fop = &isofs_dir_operations; } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); inode->i_data.a_ops = &isofs_symlink_aops; } else /* XXX - parse_rock_ridge_inode() had already set i_rdev. */ diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c index 735d7522a..5384ceb35 100644 --- a/fs/isofs/rock.c +++ b/fs/isofs/rock.c @@ -687,7 +687,7 @@ static int rock_ridge_symlink_readpage(struct file *file, struct page *page) struct inode *inode = page->mapping->host; struct iso_inode_info *ei = ISOFS_I(inode); struct isofs_sb_info *sbi = ISOFS_SB(inode->i_sb); - char *link = kmap(page); + char *link = page_address(page); unsigned long bufsize = ISOFS_BUFFER_SIZE(inode); struct buffer_head *bh; char *rpnt = link; @@ -774,7 +774,6 @@ repeat: brelse(bh); *rpnt = '\0'; SetPageUptodate(page); - kunmap(page); unlock_page(page); return 0; @@ -791,7 +790,6 @@ fail: brelse(bh); error: SetPageError(page); - kunmap(page); unlock_page(page); return -EIO; } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index ca181e81c..081dff087 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -764,13 +764,11 @@ void jbd2_journal_unlock_updates (journal_t *journal) static void warn_dirty_buffer(struct buffer_head *bh) { - char b[BDEVNAME_SIZE]; - printk(KERN_WARNING - "JBD2: Spotted dirty metadata buffer (dev = %s, blocknr = %llu). " + "JBD2: Spotted dirty metadata buffer (dev = %pg, blocknr = %llu). " "There's a risk of filesystem corruption in case of system " "crash.\n", - bdevname(bh->b_bdev, b), (unsigned long long)bh->b_blocknr); + bh->b_bdev, (unsigned long long)bh->b_blocknr); } /* Call t_frozen trigger and copy buffer data into jh->b_frozen_data. */ diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c index c1f04947d..b288c8ae1 100644 --- a/fs/jffs2/build.c +++ b/fs/jffs2/build.c @@ -17,6 +17,7 @@ #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/mtd/mtd.h> +#include <linux/mm.h> /* kvfree() */ #include "nodelist.h" static void jffs2_build_remove_unlinked_inode(struct jffs2_sb_info *, @@ -422,12 +423,7 @@ int jffs2_do_mount_fs(struct jffs2_sb_info *c) return 0; out_free: -#ifndef __ECOS - if (jffs2_blocks_use_vmalloc(c)) - vfree(c->blocks); - else -#endif - kfree(c->blocks); + kvfree(c->blocks); return ret; } diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index d211b8e18..30c4c9ebb 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -843,9 +843,14 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, pr_notice("%s(): Link succeeded, unlink failed (err %d). You now have a hard link\n", __func__, ret); - /* Might as well let the VFS know */ - d_instantiate(new_dentry, d_inode(old_dentry)); - ihold(d_inode(old_dentry)); + /* + * We can't keep the target in dcache after that. + * For one thing, we can't afford dentry aliases for directories. + * For another, if there was a victim, we _can't_ set new inode + * for that sucker and we have to trigger mount eviction - the + * caller won't do it on its own since we are returning an error. + */ + d_invalidate(new_dentry); new_dir_i->i_mtime = new_dir_i->i_ctime = ITIME(now); return ret; } diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 3361979d7..cad86bac3 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -39,10 +39,10 @@ int jffs2_fsync(struct file *filp, loff_t start, loff_t end, int datasync) if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* Trigger GC to flush any pending writes for this inode */ jffs2_flush_wbuf_gc(c, inode->i_ino); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return 0; } diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 2caf16820..bead25ae8 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -596,10 +596,7 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent) out_root: jffs2_free_ino_caches(c); jffs2_free_raw_node_refs(c); - if (jffs2_blocks_use_vmalloc(c)) - vfree(c->blocks); - else - kfree(c->blocks); + kvfree(c->blocks); out_inohash: jffs2_clear_xattr_subsystem(c); kfree(c->inocache_list); diff --git a/fs/jffs2/security.c b/fs/jffs2/security.c index bf12fe5f8..7a28facd7 100644 --- a/fs/jffs2/security.c +++ b/fs/jffs2/security.c @@ -52,9 +52,6 @@ static int jffs2_security_getxattr(const struct xattr_handler *handler, struct dentry *dentry, const char *name, void *buffer, size_t size) { - if (!strcmp(name, "")) - return -EINVAL; - return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_SECURITY, name, buffer, size); } @@ -63,31 +60,12 @@ static int jffs2_security_setxattr(const struct xattr_handler *handler, struct dentry *dentry, const char *name, const void *buffer, size_t size, int flags) { - if (!strcmp(name, "")) - return -EINVAL; - return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_SECURITY, name, buffer, size, flags); } -static size_t jffs2_security_listxattr(const struct xattr_handler *handler, - struct dentry *dentry, char *list, - size_t list_size, const char *name, - size_t name_len) -{ - size_t retlen = XATTR_SECURITY_PREFIX_LEN + name_len + 1; - - if (list && retlen <= list_size) { - strcpy(list, XATTR_SECURITY_PREFIX); - strcpy(list + XATTR_SECURITY_PREFIX_LEN, name); - } - - return retlen; -} - const struct xattr_handler jffs2_security_xattr_handler = { .prefix = XATTR_SECURITY_PREFIX, - .list = jffs2_security_listxattr, .set = jffs2_security_setxattr, .get = jffs2_security_getxattr }; diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index d86c5e317..0a9a114bb 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -331,10 +331,7 @@ static void jffs2_put_super (struct super_block *sb) jffs2_free_ino_caches(c); jffs2_free_raw_node_refs(c); - if (jffs2_blocks_use_vmalloc(c)) - vfree(c->blocks); - else - kfree(c->blocks); + kvfree(c->blocks); jffs2_flash_cleanup(c); kfree(c->inocache_list); jffs2_clear_xattr_subsystem(c); @@ -387,7 +384,7 @@ static int __init init_jffs2_fs(void) jffs2_inode_cachep = kmem_cache_create("jffs2_i", sizeof(struct jffs2_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), jffs2_i_init_once); if (!jffs2_inode_cachep) { pr_err("error: Failed to initialise inode cache\n"); diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c index 8ce2f2401..2cabd649d 100644 --- a/fs/jffs2/symlink.c +++ b/fs/jffs2/symlink.c @@ -14,7 +14,7 @@ const struct inode_operations jffs2_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = simple_follow_link, + .get_link = simple_get_link, .setattr = jffs2_setattr, .setxattr = jffs2_setxattr, .getxattr = jffs2_getxattr, diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index f3a4857ff..5a3da3f52 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -1153,7 +1153,7 @@ static struct jffs2_sb_info *work_to_sb(struct work_struct *work) { struct delayed_work *dwork; - dwork = container_of(work, struct delayed_work, work); + dwork = to_delayed_work(work); return container_of(dwork, struct jffs2_sb_info, wbuf_dwork); } diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c index 4c2c03663..da3e18503 100644 --- a/fs/jffs2/xattr.c +++ b/fs/jffs2/xattr.c @@ -967,7 +967,8 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size) struct jffs2_xattr_ref *ref, **pref; struct jffs2_xattr_datum *xd; const struct xattr_handler *xhandle; - ssize_t len, rc; + const char *prefix; + ssize_t prefix_len, len, rc; int retry = 0; rc = check_xattr_ref_inode(c, ic); @@ -998,18 +999,23 @@ ssize_t jffs2_listxattr(struct dentry *dentry, char *buffer, size_t size) } } xhandle = xprefix_to_handler(xd->xprefix); - if (!xhandle) + if (!xhandle || (xhandle->list && !xhandle->list(dentry))) continue; + prefix = xhandle->prefix ?: xhandle->name; + prefix_len = strlen(prefix); + rc = prefix_len + xd->name_len + 1; + if (buffer) { - rc = xhandle->list(xhandle, dentry, buffer + len, - size - len, xd->xname, - xd->name_len); - } else { - rc = xhandle->list(xhandle, dentry, NULL, 0, - xd->xname, xd->name_len); + if (rc > size - len) { + rc = -ERANGE; + goto out; + } + memcpy(buffer, prefix, prefix_len); + buffer += prefix_len; + memcpy(buffer, xd->xname, xd->name_len); + buffer += xd->name_len; + *buffer++ = 0; } - if (rc < 0) - goto out; len += rc; } rc = len; diff --git a/fs/jffs2/xattr_trusted.c b/fs/jffs2/xattr_trusted.c index a562da0d6..b2555ef07 100644 --- a/fs/jffs2/xattr_trusted.c +++ b/fs/jffs2/xattr_trusted.c @@ -20,8 +20,6 @@ static int jffs2_trusted_getxattr(const struct xattr_handler *handler, struct dentry *dentry, const char *name, void *buffer, size_t size) { - if (!strcmp(name, "")) - return -EINVAL; return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_TRUSTED, name, buffer, size); } @@ -30,28 +28,13 @@ static int jffs2_trusted_setxattr(const struct xattr_handler *handler, struct dentry *dentry, const char *name, const void *buffer, size_t size, int flags) { - if (!strcmp(name, "")) - return -EINVAL; return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_TRUSTED, name, buffer, size, flags); } -static size_t jffs2_trusted_listxattr(const struct xattr_handler *handler, - struct dentry *dentry, char *list, - size_t list_size, const char *name, - size_t name_len) +static bool jffs2_trusted_listxattr(struct dentry *dentry) { - size_t retlen = XATTR_TRUSTED_PREFIX_LEN + name_len + 1; - - if (!capable(CAP_SYS_ADMIN)) - return 0; - - if (list && retlen<=list_size) { - strcpy(list, XATTR_TRUSTED_PREFIX); - strcpy(list + XATTR_TRUSTED_PREFIX_LEN, name); - } - - return retlen; + return capable(CAP_SYS_ADMIN); } const struct xattr_handler jffs2_trusted_xattr_handler = { diff --git a/fs/jffs2/xattr_user.c b/fs/jffs2/xattr_user.c index cbc0472e5..539bd630b 100644 --- a/fs/jffs2/xattr_user.c +++ b/fs/jffs2/xattr_user.c @@ -20,8 +20,6 @@ static int jffs2_user_getxattr(const struct xattr_handler *handler, struct dentry *dentry, const char *name, void *buffer, size_t size) { - if (!strcmp(name, "")) - return -EINVAL; return do_jffs2_getxattr(d_inode(dentry), JFFS2_XPREFIX_USER, name, buffer, size); } @@ -30,30 +28,12 @@ static int jffs2_user_setxattr(const struct xattr_handler *handler, struct dentry *dentry, const char *name, const void *buffer, size_t size, int flags) { - if (!strcmp(name, "")) - return -EINVAL; return do_jffs2_setxattr(d_inode(dentry), JFFS2_XPREFIX_USER, name, buffer, size, flags); } -static size_t jffs2_user_listxattr(const struct xattr_handler *handler, - struct dentry *dentry, char *list, - size_t list_size, const char *name, - size_t name_len) -{ - size_t retlen = XATTR_USER_PREFIX_LEN + name_len + 1; - - if (list && retlen <= list_size) { - strcpy(list, XATTR_USER_PREFIX); - strcpy(list + XATTR_USER_PREFIX_LEN, name); - } - - return retlen; -} - const struct xattr_handler jffs2_user_xattr_handler = { .prefix = XATTR_USER_PREFIX, - .list = jffs2_user_listxattr, .set = jffs2_user_setxattr, .get = jffs2_user_getxattr }; diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c index 0c8ca830b..49456853e 100644 --- a/fs/jfs/acl.c +++ b/fs/jfs/acl.c @@ -40,10 +40,10 @@ struct posix_acl *jfs_get_acl(struct inode *inode, int type) switch(type) { case ACL_TYPE_ACCESS: - ea_name = POSIX_ACL_XATTR_ACCESS; + ea_name = XATTR_NAME_POSIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: - ea_name = POSIX_ACL_XATTR_DEFAULT; + ea_name = XATTR_NAME_POSIX_ACL_DEFAULT; break; default: return ERR_PTR(-EINVAL); @@ -82,7 +82,7 @@ static int __jfs_set_acl(tid_t tid, struct inode *inode, int type, switch (type) { case ACL_TYPE_ACCESS: - ea_name = POSIX_ACL_XATTR_ACCESS; + ea_name = XATTR_NAME_POSIX_ACL_ACCESS; if (acl) { rc = posix_acl_equiv_mode(acl, &inode->i_mode); if (rc < 0) @@ -94,7 +94,7 @@ static int __jfs_set_acl(tid_t tid, struct inode *inode, int type, } break; case ACL_TYPE_DEFAULT: - ea_name = POSIX_ACL_XATTR_DEFAULT; + ea_name = XATTR_NAME_POSIX_ACL_DEFAULT; break; default: return -EINVAL; diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 0e026a7bd..4ce7735dd 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -38,17 +38,17 @@ int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) if (rc) return rc; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (!(inode->i_state & I_DIRTY_ALL) || (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) { /* Make sure committed changes hit the disk */ jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return rc; } rc |= jfs_commit_inode(inode, 1); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return rc ? -EIO : 0; } diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 41aa3ca6a..9d9bae63a 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -60,6 +60,7 @@ struct inode *jfs_iget(struct super_block *sb, unsigned long ino) } else if (S_ISLNK(inode->i_mode)) { if (inode->i_size >= IDATASIZE) { inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &jfs_aops; } else { inode->i_op = &jfs_fast_symlink_inode_operations; diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c index 8db8b7d61..8653cac7e 100644 --- a/fs/jfs/ioctl.c +++ b/fs/jfs/ioctl.c @@ -96,7 +96,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } /* Lock against other parallel changes of flags */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); jfs_get_inode_flags(jfs_inode); oldflags = jfs_inode->mode2; @@ -109,7 +109,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) ((flags ^ oldflags) & (JFS_APPEND_FL | JFS_IMMUTABLE_FL))) { if (!capable(CAP_LINUX_IMMUTABLE)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); err = -EPERM; goto setflags_out; } @@ -120,7 +120,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) jfs_inode->mode2 = flags; jfs_set_inode_flags(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); setflags_out: diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index a69bdf2a1..a270cb7ff 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1835,17 +1835,16 @@ static int lbmLogInit(struct jfs_log * log) for (i = 0; i < LOGPAGES;) { char *buffer; uint offset; - struct page *page; + struct page *page = alloc_page(GFP_KERNEL | __GFP_ZERO); - buffer = (char *) get_zeroed_page(GFP_KERNEL); - if (buffer == NULL) + if (!page) goto error; - page = virt_to_page(buffer); + buffer = page_address(page); for (offset = 0; offset < PAGE_SIZE; offset += LOGPSIZE) { lbuf = kmalloc(sizeof(struct lbuf), GFP_KERNEL); if (lbuf == NULL) { if (offset == 0) - free_page((unsigned long) buffer); + __free_page(page); goto error; } if (offset) /* we already have one reference */ diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index 9d7551f5c..701f89370 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -983,6 +983,7 @@ static int jfs_symlink(struct inode *dip, struct dentry *dentry, jfs_info("jfs_symlink: allocate extent ip:0x%p", ip); ip->i_op = &jfs_symlink_inode_operations; + inode_nohighmem(ip); ip->i_mapping->a_ops = &jfs_aops; /* diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 8f9176caf..4f5d85ba8 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -792,7 +792,7 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type, struct buffer_head tmp_bh; struct buffer_head *bh; - mutex_lock(&inode->i_mutex); + inode_lock(inode); while (towrite > 0) { tocopy = sb->s_blocksize - offset < towrite ? sb->s_blocksize - offset : towrite; @@ -824,7 +824,7 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type, } out: if (len == towrite) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } if (inode->i_size < off+len-towrite) @@ -832,7 +832,7 @@ out: inode->i_version++; inode->i_mtime = inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return len - towrite; } @@ -898,7 +898,7 @@ static int __init init_jfs_fs(void) jfs_inode_cachep = kmem_cache_create("jfs_ip", sizeof(struct jfs_inode_info), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT, init_once); if (jfs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c index 5929e2363..f8db4fde0 100644 --- a/fs/jfs/symlink.c +++ b/fs/jfs/symlink.c @@ -23,7 +23,7 @@ const struct inode_operations jfs_fast_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = simple_follow_link, + .get_link = simple_get_link, .setattr = jfs_setattr, .setxattr = jfs_setxattr, .getxattr = jfs_getxattr, @@ -33,8 +33,7 @@ const struct inode_operations jfs_fast_symlink_inode_operations = { const struct inode_operations jfs_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, + .get_link = page_get_link, .setattr = jfs_setattr, .setxattr = jfs_setxattr, .getxattr = jfs_getxattr, diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 91e004518..996b7742c 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -541,14 +541,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, if (!kn) goto err_out1; - /* - * If the ino of the sysfs entry created for a kmem cache gets - * allocated from an ida layer, which is accounted to the memcg that - * owns the cache, the memcg will get pinned forever. So do not account - * ino ida allocations. - */ - ret = ida_simple_get(&root->ino_ida, 1, 0, - GFP_KERNEL | __GFP_NOACCOUNT); + ret = ida_simple_get(&root->ino_ida, 1, 0, GFP_KERNEL); if (ret < 0) goto err_out2; kn->ino = ret; @@ -694,6 +687,29 @@ static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent, return NULL; } +static struct kernfs_node *kernfs_walk_ns(struct kernfs_node *parent, + const unsigned char *path, + const void *ns) +{ + static char path_buf[PATH_MAX]; /* protected by kernfs_mutex */ + size_t len = strlcpy(path_buf, path, PATH_MAX); + char *p = path_buf; + char *name; + + lockdep_assert_held(&kernfs_mutex); + + if (len >= PATH_MAX) + return NULL; + + while ((name = strsep(&p, "/")) && parent) { + if (*name == '\0') + continue; + parent = kernfs_find_ns(parent, name, ns); + } + + return parent; +} + /** * kernfs_find_and_get_ns - find and get kernfs_node with the given name * @parent: kernfs_node to search under @@ -719,6 +735,29 @@ struct kernfs_node *kernfs_find_and_get_ns(struct kernfs_node *parent, EXPORT_SYMBOL_GPL(kernfs_find_and_get_ns); /** + * kernfs_walk_and_get_ns - find and get kernfs_node with the given path + * @parent: kernfs_node to search under + * @path: path to look for + * @ns: the namespace tag to use + * + * Look for kernfs_node with path @path under @parent and get a reference + * if found. This function may sleep and returns pointer to the found + * kernfs_node on success, %NULL on failure. + */ +struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent, + const char *path, const void *ns) +{ + struct kernfs_node *kn; + + mutex_lock(&kernfs_mutex); + kn = kernfs_walk_ns(parent, path, ns); + kernfs_get(kn); + mutex_unlock(&kernfs_mutex); + + return kn; +} + +/** * kernfs_create_root - create a new kernfs hierarchy * @scops: optional syscall operations for the hierarchy * @flags: KERNFS_ROOT_* flags @@ -1472,9 +1511,9 @@ static loff_t kernfs_dir_fop_llseek(struct file *file, loff_t offset, struct inode *inode = file_inode(file); loff_t ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = generic_file_llseek(file, offset, whence); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c index 756dd56aa..16405ae88 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -205,7 +205,7 @@ int kernfs_iop_removexattr(struct dentry *dentry, const char *name) if (!attrs) return -ENOMEM; - return simple_xattr_remove(&attrs->xattrs, name); + return simple_xattr_set(&attrs->xattrs, name, NULL, 0, XATTR_REPLACE); } ssize_t kernfs_iop_getxattr(struct dentry *dentry, const char *name, void *buf, @@ -230,7 +230,7 @@ ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size) if (!attrs) return -ENOMEM; - return simple_xattr_list(&attrs->xattrs, buf, size); + return simple_xattr_list(d_inode(dentry), &attrs->xattrs, buf, size); } static inline void set_default_inode_attr(struct inode *inode, umode_t mode) diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index db272528a..117b8b341 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -112,18 +112,25 @@ static int kernfs_getlink(struct dentry *dentry, char *path) return error; } -static const char *kernfs_iop_follow_link(struct dentry *dentry, void **cookie) +static const char *kernfs_iop_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { - int error = -ENOMEM; - unsigned long page = get_zeroed_page(GFP_KERNEL); - if (!page) + char *body; + int error; + + if (!dentry) + return ERR_PTR(-ECHILD); + body = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!body) return ERR_PTR(-ENOMEM); - error = kernfs_getlink(dentry, (char *)page); + error = kernfs_getlink(dentry, body); if (unlikely(error < 0)) { - free_page((unsigned long)page); + kfree(body); return ERR_PTR(error); } - return *cookie = (char *)page; + set_delayed_call(done, kfree_link, body); + return body; } const struct inode_operations kernfs_symlink_iops = { @@ -132,8 +139,7 @@ const struct inode_operations kernfs_symlink_iops = { .getxattr = kernfs_iop_getxattr, .listxattr = kernfs_iop_listxattr, .readlink = generic_readlink, - .follow_link = kernfs_iop_follow_link, - .put_link = free_page_put_link, + .get_link = kernfs_iop_get_link, .setattr = kernfs_iop_setattr, .getattr = kernfs_iop_getattr, .permission = kernfs_iop_permission, diff --git a/fs/libfs.c b/fs/libfs.c index c7cbfb092..0ca80b2af 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -89,7 +89,7 @@ EXPORT_SYMBOL(dcache_dir_close); loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) { struct dentry *dentry = file->f_path.dentry; - mutex_lock(&d_inode(dentry)->i_mutex); + inode_lock(d_inode(dentry)); switch (whence) { case 1: offset += file->f_pos; @@ -97,7 +97,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) if (offset >= 0) break; default: - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); return -EINVAL; } if (offset != file->f_pos) { @@ -124,7 +124,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) spin_unlock(&dentry->d_lock); } } - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); return offset; } EXPORT_SYMBOL(dcache_dir_lseek); @@ -941,7 +941,7 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end, if (err) return err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = sync_mapping_buffers(inode->i_mapping); if (!(inode->i_state & I_DIRTY_ALL)) goto out; @@ -953,7 +953,7 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end, ret = err; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } EXPORT_SYMBOL(__generic_file_fsync); @@ -1019,17 +1019,12 @@ int noop_fsync(struct file *file, loff_t start, loff_t end, int datasync) } EXPORT_SYMBOL(noop_fsync); -void kfree_put_link(struct inode *unused, void *cookie) +/* Because kfree isn't assignment-compatible with void(void*) ;-/ */ +void kfree_link(void *p) { - kfree(cookie); + kfree(p); } -EXPORT_SYMBOL(kfree_put_link); - -void free_page_put_link(struct inode *unused, void *cookie) -{ - free_page((unsigned long) cookie); -} -EXPORT_SYMBOL(free_page_put_link); +EXPORT_SYMBOL(kfree_link); /* * nop .set_page_dirty method so that people can use .page_mkwrite on @@ -1092,14 +1087,15 @@ simple_nosetlease(struct file *filp, long arg, struct file_lock **flp, } EXPORT_SYMBOL(simple_nosetlease); -const char *simple_follow_link(struct dentry *dentry, void **cookie) +const char *simple_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *done) { - return d_inode(dentry)->i_link; + return inode->i_link; } -EXPORT_SYMBOL(simple_follow_link); +EXPORT_SYMBOL(simple_get_link); const struct inode_operations simple_symlink_inode_operations = { - .follow_link = simple_follow_link, + .get_link = simple_get_link, .readlink = generic_readlink }; EXPORT_SYMBOL(simple_symlink_inode_operations); diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 5f31ebd96..154a107cd 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -25,13 +25,17 @@ #include <linux/mutex.h> #include <linux/kthread.h> #include <linux/freezer.h> +#include <linux/inetdevice.h> #include <linux/sunrpc/types.h> #include <linux/sunrpc/stats.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/svc.h> #include <linux/sunrpc/svcsock.h> +#include <linux/sunrpc/svc_xprt.h> #include <net/ip.h> +#include <net/addrconf.h> +#include <net/ipv6.h> #include <linux/lockd/lockd.h> #include <linux/nfs.h> @@ -44,7 +48,7 @@ static struct svc_program nlmsvc_program; -struct nlmsvc_binding * nlmsvc_ops; +const struct nlmsvc_binding *nlmsvc_ops; EXPORT_SYMBOL_GPL(nlmsvc_ops); static DEFINE_MUTEX(nlmsvc_mutex); @@ -90,8 +94,7 @@ static unsigned long get_lockd_grace_period(void) static void grace_ender(struct work_struct *grace) { - struct delayed_work *dwork = container_of(grace, struct delayed_work, - work); + struct delayed_work *dwork = to_delayed_work(grace); struct lockd_net *ln = container_of(dwork, struct lockd_net, grace_period_end); @@ -279,6 +282,68 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net) } } +static int lockd_inetaddr_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; + struct sockaddr_in sin; + + if (event != NETDEV_DOWN) + goto out; + + if (nlmsvc_rqst) { + dprintk("lockd_inetaddr_event: removed %pI4\n", + &ifa->ifa_local); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = ifa->ifa_local; + svc_age_temp_xprts_now(nlmsvc_rqst->rq_server, + (struct sockaddr *)&sin); + } + +out: + return NOTIFY_DONE; +} + +static struct notifier_block lockd_inetaddr_notifier = { + .notifier_call = lockd_inetaddr_event, +}; + +#if IS_ENABLED(CONFIG_IPV6) +static int lockd_inet6addr_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr; + struct sockaddr_in6 sin6; + + if (event != NETDEV_DOWN) + goto out; + + if (nlmsvc_rqst) { + dprintk("lockd_inet6addr_event: removed %pI6\n", &ifa->addr); + sin6.sin6_family = AF_INET6; + sin6.sin6_addr = ifa->addr; + svc_age_temp_xprts_now(nlmsvc_rqst->rq_server, + (struct sockaddr *)&sin6); + } + +out: + return NOTIFY_DONE; +} + +static struct notifier_block lockd_inet6addr_notifier = { + .notifier_call = lockd_inet6addr_event, +}; +#endif + +static void lockd_svc_exit_thread(void) +{ + unregister_inetaddr_notifier(&lockd_inetaddr_notifier); +#if IS_ENABLED(CONFIG_IPV6) + unregister_inet6addr_notifier(&lockd_inet6addr_notifier); +#endif + svc_exit_thread(nlmsvc_rqst); +} + static int lockd_start_svc(struct svc_serv *serv) { int error; @@ -315,7 +380,7 @@ static int lockd_start_svc(struct svc_serv *serv) return 0; out_task: - svc_exit_thread(nlmsvc_rqst); + lockd_svc_exit_thread(); nlmsvc_task = NULL; out_rqst: nlmsvc_rqst = NULL; @@ -360,6 +425,10 @@ static struct svc_serv *lockd_create_svc(void) printk(KERN_WARNING "lockd_up: create service failed\n"); return ERR_PTR(-ENOMEM); } + register_inetaddr_notifier(&lockd_inetaddr_notifier); +#if IS_ENABLED(CONFIG_IPV6) + register_inet6addr_notifier(&lockd_inet6addr_notifier); +#endif dprintk("lockd_up: service created\n"); return serv; } @@ -428,7 +497,7 @@ lockd_down(struct net *net) } kthread_stop(nlmsvc_task); dprintk("lockd_down: service stopped\n"); - svc_exit_thread(nlmsvc_rqst); + lockd_svc_exit_thread(); dprintk("lockd_down: service destroyed\n"); nlmsvc_task = NULL; nlmsvc_rqst = NULL; diff --git a/fs/locks.c b/fs/locks.c index 6333263b7..7c5f91be9 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -119,7 +119,6 @@ #include <linux/fdtable.h> #include <linux/fs.h> #include <linux/init.h> -#include <linux/module.h> #include <linux/security.h> #include <linux/slab.h> #include <linux/syscalls.h> @@ -230,16 +229,44 @@ locks_get_lock_context(struct inode *inode, int type) ctx = smp_load_acquire(&inode->i_flctx); } out: + trace_locks_get_lock_context(inode, type, ctx); return ctx; } +static void +locks_dump_ctx_list(struct list_head *list, char *list_type) +{ + struct file_lock *fl; + + list_for_each_entry(fl, list, fl_list) { + pr_warn("%s: fl_owner=%p fl_flags=0x%x fl_type=0x%x fl_pid=%u\n", list_type, fl->fl_owner, fl->fl_flags, fl->fl_type, fl->fl_pid); + } +} + +static void +locks_check_ctx_lists(struct inode *inode) +{ + struct file_lock_context *ctx = inode->i_flctx; + + if (unlikely(!list_empty(&ctx->flc_flock) || + !list_empty(&ctx->flc_posix) || + !list_empty(&ctx->flc_lease))) { + pr_warn("Leaked locks on dev=0x%x:0x%x ino=0x%lx:\n", + MAJOR(inode->i_sb->s_dev), MINOR(inode->i_sb->s_dev), + inode->i_ino); + locks_dump_ctx_list(&ctx->flc_flock, "FLOCK"); + locks_dump_ctx_list(&ctx->flc_posix, "POSIX"); + locks_dump_ctx_list(&ctx->flc_lease, "LEASE"); + } +} + void -locks_free_lock_context(struct file_lock_context *ctx) +locks_free_lock_context(struct inode *inode) { - if (ctx) { - WARN_ON_ONCE(!list_empty(&ctx->flc_flock)); - WARN_ON_ONCE(!list_empty(&ctx->flc_posix)); - WARN_ON_ONCE(!list_empty(&ctx->flc_lease)); + struct file_lock_context *ctx = inode->i_flctx; + + if (unlikely(ctx)) { + locks_check_ctx_lists(inode); kmem_cache_free(flctx_cache, ctx); } } @@ -934,7 +961,8 @@ out: return error; } -static int __posix_lock_file(struct inode *inode, struct file_lock *request, struct file_lock *conflock) +static int posix_lock_inode(struct inode *inode, struct file_lock *request, + struct file_lock *conflock) { struct file_lock *fl, *tmp; struct file_lock *new_fl = NULL; @@ -1142,6 +1170,8 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str if (new_fl2) locks_free_lock(new_fl2); locks_dispose_list(&dispose); + trace_posix_lock_inode(inode, request, error); + return error; } @@ -1162,7 +1192,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str int posix_lock_file(struct file *filp, struct file_lock *fl, struct file_lock *conflock) { - return __posix_lock_file(file_inode(filp), fl, conflock); + return posix_lock_inode(file_inode(filp), fl, conflock); } EXPORT_SYMBOL(posix_lock_file); @@ -1178,7 +1208,7 @@ static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl) int error; might_sleep (); for (;;) { - error = __posix_lock_file(inode, fl, NULL); + error = posix_lock_inode(inode, fl, NULL); if (error != FILE_LOCK_DEFERRED) break; error = wait_event_interruptible(fl->fl_wait, !fl->fl_next); @@ -1191,6 +1221,7 @@ static int posix_lock_inode_wait(struct inode *inode, struct file_lock *fl) return error; } +#ifdef CONFIG_MANDATORY_FILE_LOCKING /** * locks_mandatory_locked - Check for an active lock * @file: the file to check @@ -1227,20 +1258,16 @@ int locks_mandatory_locked(struct file *file) /** * locks_mandatory_area - Check for a conflicting lock - * @read_write: %FLOCK_VERIFY_WRITE for exclusive access, %FLOCK_VERIFY_READ - * for shared - * @inode: the file to check + * @inode: the file to check * @filp: how the file was opened (if it was) - * @offset: start of area to check - * @count: length of area to check + * @start: first byte in the file to check + * @end: lastbyte in the file to check + * @type: %F_WRLCK for a write lock, else %F_RDLCK * * Searches the inode's list of locks to find any POSIX locks which conflict. - * This function is called from rw_verify_area() and - * locks_verify_truncate(). */ -int locks_mandatory_area(int read_write, struct inode *inode, - struct file *filp, loff_t offset, - size_t count) +int locks_mandatory_area(struct inode *inode, struct file *filp, loff_t start, + loff_t end, unsigned char type) { struct file_lock fl; int error; @@ -1252,15 +1279,15 @@ int locks_mandatory_area(int read_write, struct inode *inode, fl.fl_flags = FL_POSIX | FL_ACCESS; if (filp && !(filp->f_flags & O_NONBLOCK)) sleep = true; - fl.fl_type = (read_write == FLOCK_VERIFY_WRITE) ? F_WRLCK : F_RDLCK; - fl.fl_start = offset; - fl.fl_end = offset + count - 1; + fl.fl_type = type; + fl.fl_start = start; + fl.fl_end = end; for (;;) { if (filp) { fl.fl_owner = filp; fl.fl_flags &= ~FL_SLEEP; - error = __posix_lock_file(inode, &fl, NULL); + error = posix_lock_inode(inode, &fl, NULL); if (!error) break; } @@ -1268,7 +1295,7 @@ int locks_mandatory_area(int read_write, struct inode *inode, if (sleep) fl.fl_flags |= FL_SLEEP; fl.fl_owner = current->files; - error = __posix_lock_file(inode, &fl, NULL); + error = posix_lock_inode(inode, &fl, NULL); if (error != FILE_LOCK_DEFERRED) break; error = wait_event_interruptible(fl.fl_wait, !fl.fl_next); @@ -1289,6 +1316,7 @@ int locks_mandatory_area(int read_write, struct inode *inode, } EXPORT_SYMBOL(locks_mandatory_area); +#endif /* CONFIG_MANDATORY_FILE_LOCKING */ static void lease_clear_pending(struct file_lock *fl, int arg) { @@ -1503,12 +1531,10 @@ void lease_get_mtime(struct inode *inode, struct timespec *time) ctx = smp_load_acquire(&inode->i_flctx); if (ctx && !list_empty_careful(&ctx->flc_lease)) { spin_lock(&ctx->flc_lock); - if (!list_empty(&ctx->flc_lease)) { - fl = list_first_entry(&ctx->flc_lease, - struct file_lock, fl_list); - if (fl->fl_type == F_WRLCK) - has_lease = true; - } + fl = list_first_entry_or_null(&ctx->flc_lease, + struct file_lock, fl_list); + if (fl && (fl->fl_type == F_WRLCK)) + has_lease = true; spin_unlock(&ctx->flc_lock); } @@ -1624,12 +1650,12 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr * bother, maybe that's a sign this just isn't a good file to * hand out a delegation on. */ - if (is_deleg && !mutex_trylock(&inode->i_mutex)) + if (is_deleg && !inode_trylock(inode)) return -EAGAIN; if (is_deleg && arg == F_WRLCK) { /* Write delegations are not currently supported: */ - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); WARN_ON_ONCE(1); return -EINVAL; } @@ -1706,7 +1732,7 @@ out: spin_unlock(&ctx->flc_lock); locks_dispose_list(&dispose); if (is_deleg) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!error && !my_fl) *flp = NULL; return error; @@ -2165,6 +2191,8 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, if (file_lock == NULL) return -ENOLCK; + inode = file_inode(filp); + /* * This might block, so we do it before checking the inode. */ @@ -2172,8 +2200,6 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, if (copy_from_user(&flock, l, sizeof(flock))) goto out; - inode = file_inode(filp); - /* Don't allow mandatory locks on files that may be memory mapped * and shared. */ @@ -2220,10 +2246,12 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, error = do_lock_file_wait(filp, cmd, file_lock); /* - * Attempt to detect a close/fcntl race and recover by - * releasing the lock that was just acquired. + * Attempt to detect a close/fcntl race and recover by releasing the + * lock that was just acquired. There is no need to do that when we're + * unlocking though, or for OFD locks. */ - if (!error && file_lock->fl_type != F_UNLCK) { + if (!error && file_lock->fl_type != F_UNLCK && + !(file_lock->fl_flags & FL_OFDLCK)) { /* * We need that spin_lock here - it prevents reordering between * update of i_flctx->flc_posix and check for it done in @@ -2240,6 +2268,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, } } out: + trace_fcntl_setlk(inode, file_lock, error); locks_free_lock(file_lock); return error; } @@ -2362,10 +2391,12 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, error = do_lock_file_wait(filp, cmd, file_lock); /* - * Attempt to detect a close/fcntl race and recover by - * releasing the lock that was just acquired. + * Attempt to detect a close/fcntl race and recover by releasing the + * lock that was just acquired. There is no need to do that when we're + * unlocking though, or for OFD locks. */ - if (!error && file_lock->fl_type != F_UNLCK) { + if (!error && file_lock->fl_type != F_UNLCK && + !(file_lock->fl_flags & FL_OFDLCK)) { /* * We need that spin_lock here - it prevents reordering between * update of i_flctx->flc_posix and check for it done in @@ -2394,6 +2425,7 @@ out: */ void locks_remove_posix(struct file *filp, fl_owner_t owner) { + int error; struct file_lock lock; struct file_lock_context *ctx; @@ -2416,10 +2448,11 @@ void locks_remove_posix(struct file *filp, fl_owner_t owner) lock.fl_ops = NULL; lock.fl_lmops = NULL; - vfs_lock_file(filp, F_SETLK, &lock, NULL); + error = vfs_lock_file(filp, F_SETLK, &lock, NULL); if (lock.fl_ops && lock.fl_ops->fl_release_private) lock.fl_ops->fl_release_private(&lock); + trace_locks_remove_posix(file_inode(filp), &lock, error); } EXPORT_SYMBOL(locks_remove_posix); @@ -2715,7 +2748,7 @@ static int __init proc_locks_init(void) proc_create("locks", 0, NULL, &proc_locks_operations); return 0; } -module_init(proc_locks_init); +fs_initcall(proc_locks_init); #endif static int __init filelock_init(void) diff --git a/fs/logfs/Kconfig b/fs/logfs/Kconfig index 09ed066c0..2b4503163 100644 --- a/fs/logfs/Kconfig +++ b/fs/logfs/Kconfig @@ -1,6 +1,6 @@ config LOGFS tristate "LogFS file system" - depends on (MTD || BLOCK) + depends on MTD || (!MTD && BLOCK) select ZLIB_INFLATE select ZLIB_DEFLATE select CRC32 diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c index f9b45d46d..542468e9b 100644 --- a/fs/logfs/dir.c +++ b/fs/logfs/dir.c @@ -528,7 +528,8 @@ static int logfs_symlink(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) return PTR_ERR(inode); - inode->i_op = &logfs_symlink_iops; + inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &logfs_reg_aops; return __logfs_create(dir, dentry, inode, target, destlen); @@ -776,12 +777,6 @@ fail: return -EIO; } -const struct inode_operations logfs_symlink_iops = { - .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, -}; - const struct inode_operations logfs_dir_iops = { .create = logfs_create, .link = logfs_link, diff --git a/fs/logfs/file.c b/fs/logfs/file.c index 1a6f0167b..61eaeb1b6 100644 --- a/fs/logfs/file.c +++ b/fs/logfs/file.c @@ -204,12 +204,12 @@ long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) if (err) return err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); oldflags = li->li_flags; flags &= LOGFS_FL_USER_MODIFIABLE; flags |= oldflags & ~LOGFS_FL_USER_MODIFIABLE; li->li_flags = flags; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); inode->i_ctime = CURRENT_TIME; mark_inode_dirty_sync(inode); @@ -230,11 +230,11 @@ int logfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); logfs_get_wblocks(sb, NULL, WF_LOCK); logfs_write_anchor(sb); logfs_put_wblocks(sb, NULL, WF_LOCK); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return 0; } diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c index af49e2d69..db9cfc598 100644 --- a/fs/logfs/inode.c +++ b/fs/logfs/inode.c @@ -64,7 +64,8 @@ static void logfs_inode_setops(struct inode *inode) inode->i_mapping->a_ops = &logfs_reg_aops; break; case S_IFLNK: - inode->i_op = &logfs_symlink_iops; + inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &logfs_reg_aops; break; case S_IFSOCK: /* fall through */ @@ -408,7 +409,8 @@ const struct super_operations logfs_super_operations = { int logfs_init_inode_cache(void) { logfs_inode_cache = kmem_cache_create("logfs_inode_cache", - sizeof(struct logfs_inode), 0, SLAB_RECLAIM_ACCOUNT, + sizeof(struct logfs_inode), 0, + SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, logfs_init_once); if (!logfs_inode_cache) return -ENOMEM; diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h index 5f0937609..27d040e35 100644 --- a/fs/logfs/logfs.h +++ b/fs/logfs/logfs.h @@ -302,7 +302,7 @@ struct logfs_block { struct inode *inode; struct logfs_transaction *ta; unsigned long alias_map[LOGFS_BLOCK_FACTOR / BITS_PER_LONG]; - struct logfs_block_ops *ops; + const struct logfs_block_ops *ops; int full; int partial; int reserved_bytes; @@ -485,7 +485,7 @@ static inline int logfs_get_sb_bdev(struct logfs_super *s, #endif /* dev_mtd.c */ -#ifdef CONFIG_MTD +#if IS_ENABLED(CONFIG_MTD) int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr); #else static inline int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr) @@ -495,7 +495,6 @@ static inline int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr) #endif /* dir.c */ -extern const struct inode_operations logfs_symlink_iops; extern const struct inode_operations logfs_dir_iops; extern const struct file_operations logfs_dir_fops; int logfs_replay_journal(struct super_block *sb); @@ -579,7 +578,7 @@ int logfs_exist_block(struct inode *inode, u64 bix); int get_page_reserve(struct inode *inode, struct page *page); void logfs_get_wblocks(struct super_block *sb, struct page *page, int lock); void logfs_put_wblocks(struct super_block *sb, struct page *page, int lock); -extern struct logfs_block_ops indirect_block_ops; +extern const struct logfs_block_ops indirect_block_ops; /* segment.c */ int logfs_erase_segment(struct super_block *sb, u32 ofs, int ensure_erase); diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c index 380d86e1a..20973c9e5 100644 --- a/fs/logfs/readwrite.c +++ b/fs/logfs/readwrite.c @@ -569,13 +569,13 @@ static void indirect_free_block(struct super_block *sb, } -static struct logfs_block_ops inode_block_ops = { +static const struct logfs_block_ops inode_block_ops = { .write_block = inode_write_block, .free_block = inode_free_block, .write_alias = inode_write_alias, }; -struct logfs_block_ops indirect_block_ops = { +const struct logfs_block_ops indirect_block_ops = { .write_block = indirect_write_block, .free_block = indirect_free_block, .write_alias = indirect_write_alias, diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c index 6de0fbfc6..d270e4b2a 100644 --- a/fs/logfs/segment.c +++ b/fs/logfs/segment.c @@ -197,7 +197,7 @@ static int btree_write_alias(struct super_block *sb, struct logfs_block *block, return 0; } -static struct logfs_block_ops btree_block_ops = { +static const struct logfs_block_ops btree_block_ops = { .write_block = btree_write_block, .free_block = __free_block, .write_alias = btree_write_alias, diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 086cd0a61..f975d667c 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -91,7 +91,7 @@ static int __init init_inodecache(void) minix_inode_cachep = kmem_cache_create("minix_inode_cache", sizeof(struct minix_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (minix_inode_cachep == NULL) return -ENOMEM; @@ -435,8 +435,7 @@ static const struct address_space_operations minix_aops = { static const struct inode_operations minix_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, + .get_link = page_get_link, .getattr = minix_getattr, }; @@ -452,6 +451,7 @@ void minix_set_inode(struct inode *inode, dev_t rdev) inode->i_mapping->a_ops = &minix_aops; } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &minix_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &minix_aops; } else init_special_inode(inode, inode->i_mode, rdev); diff --git a/fs/minix/itree_v1.c b/fs/minix/itree_v1.c index 282e15ad8..46ca39d6c 100644 --- a/fs/minix/itree_v1.c +++ b/fs/minix/itree_v1.c @@ -24,16 +24,15 @@ static inline block_t *i_data(struct inode *inode) static int block_to_path(struct inode * inode, long block, int offsets[DEPTH]) { int n = 0; - char b[BDEVNAME_SIZE]; if (block < 0) { - printk("MINIX-fs: block_to_path: block %ld < 0 on dev %s\n", - block, bdevname(inode->i_sb->s_bdev, b)); + printk("MINIX-fs: block_to_path: block %ld < 0 on dev %pg\n", + block, inode->i_sb->s_bdev); } else if (block >= (minix_sb(inode->i_sb)->s_max_size/BLOCK_SIZE)) { if (printk_ratelimit()) printk("MINIX-fs: block_to_path: " - "block %ld too big on dev %s\n", - block, bdevname(inode->i_sb->s_bdev, b)); + "block %ld too big on dev %pg\n", + block, inode->i_sb->s_bdev); } else if (block < 7) { offsets[n++] = block; } else if ((block -= 7) < 512) { diff --git a/fs/minix/itree_v2.c b/fs/minix/itree_v2.c index 78e2d93e5..1ee101352 100644 --- a/fs/minix/itree_v2.c +++ b/fs/minix/itree_v2.c @@ -26,18 +26,17 @@ static inline block_t *i_data(struct inode *inode) static int block_to_path(struct inode * inode, long block, int offsets[DEPTH]) { int n = 0; - char b[BDEVNAME_SIZE]; struct super_block *sb = inode->i_sb; if (block < 0) { - printk("MINIX-fs: block_to_path: block %ld < 0 on dev %s\n", - block, bdevname(sb->s_bdev, b)); + printk("MINIX-fs: block_to_path: block %ld < 0 on dev %pg\n", + block, sb->s_bdev); } else if ((u64)block * (u64)sb->s_blocksize >= minix_sb(sb)->s_max_size) { if (printk_ratelimit()) printk("MINIX-fs: block_to_path: " - "block %ld too big on dev %s\n", - block, bdevname(sb->s_bdev, b)); + "block %ld too big on dev %pg\n", + block, sb->s_bdev); } else if (block < DIRCOUNT) { offsets[n++] = block; } else if ((block -= DIRCOUNT) < INDIRCOUNT(sb)) { diff --git a/fs/namei.c b/fs/namei.c index d8ee4da93..9c590e0f6 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -505,13 +505,13 @@ struct nameidata { int total_link_count; struct saved { struct path link; - void *cookie; + struct delayed_call done; const char *name; - struct inode *inode; unsigned seq; } *stack, internal[EMBEDDED_LEVELS]; struct filename *name; struct nameidata *saved; + struct inode *link_inode; unsigned root_seq; int dfd; }; @@ -534,10 +534,8 @@ static void restore_nameidata(void) current->nameidata = old; if (old) old->total_link_count = now->total_link_count; - if (now->stack != now->internal) { + if (now->stack != now->internal) kfree(now->stack); - now->stack = now->internal; - } } static int __nd_alloc_stack(struct nameidata *nd) @@ -592,11 +590,8 @@ static void drop_links(struct nameidata *nd) int i = nd->depth; while (i--) { struct saved *last = nd->stack + i; - struct inode *inode = last->inode; - if (last->cookie && inode->i_op->put_link) { - inode->i_op->put_link(inode, last->cookie); - last->cookie = NULL; - } + do_delayed_call(&last->done); + clear_delayed_call(&last->done); } } @@ -657,7 +652,7 @@ static bool legitimize_links(struct nameidata *nd) * Path walking has 2 modes, rcu-walk and ref-walk (see * Documentation/filesystems/path-lookup.txt). In situations when we can't * continue in RCU mode, we attempt to drop out of rcu-walk mode and grab - * normal reference counts on dentries and vfsmounts to transition to rcu-walk + * normal reference counts on dentries and vfsmounts to transition to ref-walk * mode. Refcounts are grabbed at the last known good point before rcu-walk * got stuck, so ref-walk may continue from there. If this is not successful * (eg. a seqcount has changed), then failure is returned and it's up to caller @@ -807,19 +802,19 @@ static int complete_walk(struct nameidata *nd) static void set_root(struct nameidata *nd) { - get_fs_root(current->fs, &nd->root); -} - -static void set_root_rcu(struct nameidata *nd) -{ struct fs_struct *fs = current->fs; - unsigned seq; - do { - seq = read_seqcount_begin(&fs->seq); - nd->root = fs->root; - nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq); - } while (read_seqcount_retry(&fs->seq, seq)); + if (nd->flags & LOOKUP_RCU) { + unsigned seq; + + do { + seq = read_seqcount_begin(&fs->seq); + nd->root = fs->root; + nd->root_seq = __read_seqcount_begin(&nd->root.dentry->d_seq); + } while (read_seqcount_retry(&fs->seq, seq)); + } else { + get_fs_root(fs, &nd->root); + } } static void path_put_conditional(struct path *path, struct nameidata *nd) @@ -841,8 +836,28 @@ static inline void path_to_nameidata(const struct path *path, nd->path.dentry = path->dentry; } +static int nd_jump_root(struct nameidata *nd) +{ + if (nd->flags & LOOKUP_RCU) { + struct dentry *d; + nd->path = nd->root; + d = nd->path.dentry; + nd->inode = d->d_inode; + nd->seq = nd->root_seq; + if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq))) + return -ECHILD; + } else { + path_put(&nd->path); + nd->path = nd->root; + path_get(&nd->path); + nd->inode = nd->path.dentry->d_inode; + } + nd->flags |= LOOKUP_JUMPED; + return 0; +} + /* - * Helper to directly jump to a known parsed path from ->follow_link, + * Helper to directly jump to a known parsed path from ->get_link, * caller must have taken a reference to path beforehand. */ void nd_jump_link(struct path *path) @@ -858,9 +873,7 @@ void nd_jump_link(struct path *path) static inline void put_link(struct nameidata *nd) { struct saved *last = nd->stack + --nd->depth; - struct inode *inode = last->inode; - if (last->cookie && inode->i_op->put_link) - inode->i_op->put_link(inode, last->cookie); + do_delayed_call(&last->done); if (!(nd->flags & LOOKUP_RCU)) path_put(&last->link); } @@ -892,7 +905,7 @@ static inline int may_follow_link(struct nameidata *nd) return 0; /* Allowed if owner and follower match. */ - inode = nd->stack[0].inode; + inode = nd->link_inode; if (uid_eq(current_cred()->fsuid, inode->i_uid)) return 0; @@ -983,7 +996,7 @@ const char *get_link(struct nameidata *nd) { struct saved *last = nd->stack + nd->depth - 1; struct dentry *dentry = last->link.dentry; - struct inode *inode = last->inode; + struct inode *inode = nd->link_inode; int error; const char *res; @@ -1004,36 +1017,27 @@ const char *get_link(struct nameidata *nd) nd->last_type = LAST_BIND; res = inode->i_link; if (!res) { + const char * (*get)(struct dentry *, struct inode *, + struct delayed_call *); + get = inode->i_op->get_link; if (nd->flags & LOOKUP_RCU) { - if (unlikely(unlazy_walk(nd, NULL, 0))) - return ERR_PTR(-ECHILD); + res = get(NULL, inode, &last->done); + if (res == ERR_PTR(-ECHILD)) { + if (unlikely(unlazy_walk(nd, NULL, 0))) + return ERR_PTR(-ECHILD); + res = get(dentry, inode, &last->done); + } + } else { + res = get(dentry, inode, &last->done); } - res = inode->i_op->follow_link(dentry, &last->cookie); - if (IS_ERR_OR_NULL(res)) { - last->cookie = NULL; + if (IS_ERR_OR_NULL(res)) return res; - } } if (*res == '/') { - if (nd->flags & LOOKUP_RCU) { - struct dentry *d; - if (!nd->root.mnt) - set_root_rcu(nd); - nd->path = nd->root; - d = nd->path.dentry; - nd->inode = d->d_inode; - nd->seq = nd->root_seq; - if (unlikely(read_seqcount_retry(&d->d_seq, nd->seq))) - return ERR_PTR(-ECHILD); - } else { - if (!nd->root.mnt) - set_root(nd); - path_put(&nd->path); - nd->path = nd->root; - path_get(&nd->root); - nd->inode = nd->path.dentry->d_inode; - } - nd->flags |= LOOKUP_JUMPED; + if (!nd->root.mnt) + set_root(nd); + if (unlikely(nd_jump_root(nd))) + return ERR_PTR(-ECHILD); while (unlikely(*++res == '/')) ; } @@ -1294,8 +1298,6 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, static int follow_dotdot_rcu(struct nameidata *nd) { struct inode *inode = nd->inode; - if (!nd->root.mnt) - set_root_rcu(nd); while (1) { if (path_equal(&nd->path, &nd->root)) @@ -1415,9 +1417,6 @@ static void follow_mount(struct path *path) static int follow_dotdot(struct nameidata *nd) { - if (!nd->root.mnt) - set_root(nd); - while(1) { struct dentry *old = nd->path.dentry; @@ -1630,9 +1629,9 @@ static int lookup_slow(struct nameidata *nd, struct path *path) parent = nd->path.dentry; BUG_ON(nd->inode != parent->d_inode); - mutex_lock(&parent->d_inode->i_mutex); + inode_lock(parent->d_inode); dentry = __lookup_hash(&nd->last, parent, nd->flags); - mutex_unlock(&parent->d_inode->i_mutex); + inode_unlock(parent->d_inode); if (IS_ERR(dentry)) return PTR_ERR(dentry); path->mnt = nd->path.mnt; @@ -1655,6 +1654,8 @@ static inline int may_lookup(struct nameidata *nd) static inline int handle_dots(struct nameidata *nd, int type) { if (type == LAST_DOTDOT) { + if (!nd->root.mnt) + set_root(nd); if (nd->flags & LOOKUP_RCU) { return follow_dotdot_rcu(nd); } else @@ -1691,8 +1692,8 @@ static int pick_link(struct nameidata *nd, struct path *link, last = nd->stack + nd->depth++; last->link = *link; - last->cookie = NULL; - last->inode = inode; + clear_delayed_call(&last->done); + nd->link_inode = inode; last->seq = seq; return 1; } @@ -2025,18 +2026,19 @@ static const char *path_init(struct nameidata *nd, unsigned flags) } nd->root.mnt = NULL; + nd->path.mnt = NULL; + nd->path.dentry = NULL; nd->m_seq = read_seqbegin(&mount_lock); if (*s == '/') { - if (flags & LOOKUP_RCU) { + if (flags & LOOKUP_RCU) rcu_read_lock(); - set_root_rcu(nd); - nd->seq = nd->root_seq; - } else { - set_root(nd); - path_get(&nd->root); - } - nd->path = nd->root; + set_root(nd); + if (likely(!nd_jump_root(nd))) + return s; + nd->root.mnt = NULL; + rcu_read_unlock(); + return ERR_PTR(-ECHILD); } else if (nd->dfd == AT_FDCWD) { if (flags & LOOKUP_RCU) { struct fs_struct *fs = current->fs; @@ -2047,11 +2049,14 @@ static const char *path_init(struct nameidata *nd, unsigned flags) do { seq = read_seqcount_begin(&fs->seq); nd->path = fs->pwd; + nd->inode = nd->path.dentry->d_inode; nd->seq = __read_seqcount_begin(&nd->path.dentry->d_seq); } while (read_seqcount_retry(&fs->seq, seq)); } else { get_fs_pwd(current->fs, &nd->path); + nd->inode = nd->path.dentry->d_inode; } + return s; } else { /* Caller must check execute permissions on the starting path component */ struct fd f = fdget_raw(nd->dfd); @@ -2081,16 +2086,6 @@ static const char *path_init(struct nameidata *nd, unsigned flags) fdput(f); return s; } - - nd->inode = nd->path.dentry->d_inode; - if (!(flags & LOOKUP_RCU)) - return s; - if (likely(!read_seqcount_retry(&nd->path.dentry->d_seq, nd->seq))) - return s; - if (!(nd->flags & LOOKUP_ROOT)) - nd->root.mnt = NULL; - rcu_read_unlock(); - return ERR_PTR(-ECHILD); } static const char *trailing_symlink(struct nameidata *nd) @@ -2239,10 +2234,10 @@ struct dentry *kern_path_locked(const char *name, struct path *path) putname(filename); return ERR_PTR(-EINVAL); } - mutex_lock_nested(&path->dentry->d_inode->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); d = __lookup_hash(&last, path->dentry, 0); if (IS_ERR(d)) { - mutex_unlock(&path->dentry->d_inode->i_mutex); + inode_unlock(path->dentry->d_inode); path_put(path); } putname(filename); @@ -2283,6 +2278,8 @@ EXPORT_SYMBOL(vfs_path_lookup); * * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. + * + * The caller must hold base->i_mutex. */ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) { @@ -2290,7 +2287,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) unsigned int c; int err; - WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex)); + WARN_ON_ONCE(!inode_is_locked(base->d_inode)); this.name = name; this.len = len; @@ -2326,6 +2323,75 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) } EXPORT_SYMBOL(lookup_one_len); +/** + * lookup_one_len_unlocked - filesystem helper to lookup single pathname component + * @name: pathname component to lookup + * @base: base directory to lookup from + * @len: maximum length @len should be interpreted to + * + * Note that this routine is purely a helper for filesystem usage and should + * not be called by generic code. + * + * Unlike lookup_one_len, it should be called without the parent + * i_mutex held, and will take the i_mutex itself if necessary. + */ +struct dentry *lookup_one_len_unlocked(const char *name, + struct dentry *base, int len) +{ + struct qstr this; + unsigned int c; + int err; + struct dentry *ret; + + this.name = name; + this.len = len; + this.hash = full_name_hash(name, len); + if (!len) + return ERR_PTR(-EACCES); + + if (unlikely(name[0] == '.')) { + if (len < 2 || (len == 2 && name[1] == '.')) + return ERR_PTR(-EACCES); + } + + while (len--) { + c = *(const unsigned char *)name++; + if (c == '/' || c == '\0') + return ERR_PTR(-EACCES); + } + /* + * See if the low-level filesystem might want + * to use its own hash.. + */ + if (base->d_flags & DCACHE_OP_HASH) { + int err = base->d_op->d_hash(base, &this); + if (err < 0) + return ERR_PTR(err); + } + + err = inode_permission(base->d_inode, MAY_EXEC); + if (err) + return ERR_PTR(err); + + /* + * __d_lookup() is used to try to get a quick answer and avoid the + * mutex. A false-negative does no harm. + */ + ret = __d_lookup(base, &this); + if (ret && unlikely(ret->d_flags & DCACHE_OP_REVALIDATE)) { + dput(ret); + ret = NULL; + } + if (ret) + return ret; + + inode_lock(base->d_inode); + ret = __lookup_hash(&this, base, 0); + inode_unlock(base->d_inode); + return ret; +} +EXPORT_SYMBOL(lookup_one_len_unlocked); + int user_path_at_empty(int dfd, const char __user *name, unsigned flags, struct path *path, int *empty) { @@ -2402,7 +2468,7 @@ mountpoint_last(struct nameidata *nd, struct path *path) goto done; } - mutex_lock(&dir->d_inode->i_mutex); + inode_lock(dir->d_inode); dentry = d_lookup(dir, &nd->last); if (!dentry) { /* @@ -2412,16 +2478,16 @@ mountpoint_last(struct nameidata *nd, struct path *path) */ dentry = d_alloc(dir, &nd->last); if (!dentry) { - mutex_unlock(&dir->d_inode->i_mutex); + inode_unlock(dir->d_inode); return -ENOMEM; } dentry = lookup_real(dir->d_inode, dentry, nd->flags); if (IS_ERR(dentry)) { - mutex_unlock(&dir->d_inode->i_mutex); + inode_unlock(dir->d_inode); return PTR_ERR(dentry); } } - mutex_unlock(&dir->d_inode->i_mutex); + inode_unlock(dir->d_inode); done: if (d_is_negative(dentry)) { @@ -2611,7 +2677,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) struct dentry *p; if (p1 == p2) { - mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); return NULL; } @@ -2619,29 +2685,29 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) p = d_ancestor(p2, p1); if (p) { - mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); + inode_lock_nested(p1->d_inode, I_MUTEX_CHILD); return p; } p = d_ancestor(p1, p2); if (p) { - mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); + inode_lock_nested(p2->d_inode, I_MUTEX_CHILD); return p; } - mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT2); + inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); + inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2); return NULL; } EXPORT_SYMBOL(lock_rename); void unlock_rename(struct dentry *p1, struct dentry *p2) { - mutex_unlock(&p1->d_inode->i_mutex); + inode_unlock(p1->d_inode); if (p1 != p2) { - mutex_unlock(&p2->d_inode->i_mutex); + inode_unlock(p2->d_inode); mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex); } } @@ -2674,10 +2740,6 @@ static int may_open(struct path *path, int acc_mode, int flag) struct inode *inode = dentry->d_inode; int error; - /* O_PATH? */ - if (!acc_mode) - return 0; - if (!inode) return -ENOENT; @@ -2699,7 +2761,7 @@ static int may_open(struct path *path, int acc_mode, int flag) break; } - error = inode_permission(inode, acc_mode); + error = inode_permission(inode, MAY_OPEN | acc_mode); if (error) return error; @@ -2891,7 +2953,7 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry, if (*opened & FILE_CREATED) { WARN_ON(!(open_flag & O_CREAT)); fsnotify_create(dir, dentry); - acc_mode = MAY_OPEN; + acc_mode = 0; } error = may_open(&file->f_path, acc_mode, open_flag); if (error) @@ -3084,9 +3146,9 @@ retry_lookup: * dropping this one anyway. */ } - mutex_lock(&dir->d_inode->i_mutex); + inode_lock(dir->d_inode); error = lookup_open(nd, &path, file, op, got_write, opened); - mutex_unlock(&dir->d_inode->i_mutex); + inode_unlock(dir->d_inode); if (error <= 0) { if (error) @@ -3104,7 +3166,7 @@ retry_lookup: /* Don't check for write permission, don't truncate */ open_flag &= ~O_TRUNC; will_truncate = false; - acc_mode = MAY_OPEN; + acc_mode = 0; path_to_nameidata(&path, nd); goto finish_open_created; } @@ -3187,10 +3249,11 @@ finish_open: got_write = true; } finish_open_created: - error = may_open(&nd->path, acc_mode, open_flag); - if (error) - goto out; - + if (likely(!(open_flag & O_PATH))) { + error = may_open(&nd->path, acc_mode, open_flag); + if (error) + goto out; + } BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */ error = vfs_open(&nd->path, file, current_cred()); if (!error) { @@ -3281,7 +3344,7 @@ static int do_tmpfile(struct nameidata *nd, unsigned flags, goto out2; audit_inode(nd->name, child, 0); /* Don't check for other permissions, the inode was just created */ - error = may_open(&path, MAY_OPEN, op->open_flag); + error = may_open(&path, 0, op->open_flag); if (error) goto out2; file->f_path.mnt = path.mnt; @@ -3434,7 +3497,7 @@ static struct dentry *filename_create(int dfd, struct filename *name, * Do the final lookup. */ lookup_flags |= LOOKUP_CREATE | LOOKUP_EXCL; - mutex_lock_nested(&path->dentry->d_inode->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); dentry = __lookup_hash(&last, path->dentry, lookup_flags); if (IS_ERR(dentry)) goto unlock; @@ -3463,7 +3526,7 @@ fail: dput(dentry); dentry = ERR_PTR(error); unlock: - mutex_unlock(&path->dentry->d_inode->i_mutex); + inode_unlock(path->dentry->d_inode); if (!err2) mnt_drop_write(path->mnt); out: @@ -3483,7 +3546,7 @@ EXPORT_SYMBOL(kern_path_create); void done_path_create(struct path *path, struct dentry *dentry) { dput(dentry); - mutex_unlock(&path->dentry->d_inode->i_mutex); + inode_unlock(path->dentry->d_inode); mnt_drop_write(path->mnt); path_put(path); } @@ -3680,7 +3743,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) return -EPERM; dget(dentry); - mutex_lock(&dentry->d_inode->i_mutex); + inode_lock(dentry->d_inode); error = -EBUSY; if (is_local_mountpoint(dentry)) @@ -3700,7 +3763,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) detach_mounts(dentry); out: - mutex_unlock(&dentry->d_inode->i_mutex); + inode_unlock(dentry->d_inode); dput(dentry); if (!error) d_delete(dentry); @@ -3739,7 +3802,7 @@ retry: if (error) goto exit1; - mutex_lock_nested(&path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); dentry = __lookup_hash(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) @@ -3755,7 +3818,7 @@ retry: exit3: dput(dentry); exit2: - mutex_unlock(&path.dentry->d_inode->i_mutex); + inode_unlock(path.dentry->d_inode); mnt_drop_write(path.mnt); exit1: path_put(&path); @@ -3801,7 +3864,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegate if (!dir->i_op->unlink) return -EPERM; - mutex_lock(&target->i_mutex); + inode_lock(target); if (is_local_mountpoint(dentry)) error = -EBUSY; else { @@ -3818,7 +3881,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegate } } out: - mutex_unlock(&target->i_mutex); + inode_unlock(target); /* We don't d_delete() NFS sillyrenamed files--they still exist. */ if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) { @@ -3861,7 +3924,7 @@ retry: if (error) goto exit1; retry_deleg: - mutex_lock_nested(&path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); dentry = __lookup_hash(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { @@ -3879,7 +3942,7 @@ retry_deleg: exit2: dput(dentry); } - mutex_unlock(&path.dentry->d_inode->i_mutex); + inode_unlock(path.dentry->d_inode); if (inode) iput(inode); /* truncate the inode here */ inode = NULL; @@ -4031,7 +4094,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de if (error) return error; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* Make sure we don't allow creating hardlink to an unlinked file */ if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) error = -ENOENT; @@ -4048,7 +4111,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de inode->i_state &= ~I_LINKABLE; spin_unlock(&inode->i_lock); } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!error) fsnotify_link(dir, inode, new_dentry); return error; @@ -4248,7 +4311,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (!is_dir || (flags & RENAME_EXCHANGE)) lock_two_nondirectories(source, target); else if (target) - mutex_lock(&target->i_mutex); + inode_lock(target); error = -EBUSY; if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry)) @@ -4301,7 +4364,7 @@ out: if (!is_dir || (flags & RENAME_EXCHANGE)) unlock_two_nondirectories(source, target); else if (target) - mutex_unlock(&target->i_mutex); + inode_unlock(target); dput(new_dentry); if (!error) { fsnotify_move(old_dir, new_dir, old_name, is_dir, @@ -4503,72 +4566,73 @@ EXPORT_SYMBOL(readlink_copy); /* * A helper for ->readlink(). This should be used *ONLY* for symlinks that - * have ->follow_link() touching nd only in nd_set_link(). Using (or not - * using) it for any given inode is up to filesystem. + * have ->get_link() not calling nd_jump_link(). Using (or not using) it + * for any given inode is up to filesystem. */ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) { - void *cookie; + DEFINE_DELAYED_CALL(done); struct inode *inode = d_inode(dentry); const char *link = inode->i_link; int res; if (!link) { - link = inode->i_op->follow_link(dentry, &cookie); + link = inode->i_op->get_link(dentry, inode, &done); if (IS_ERR(link)) return PTR_ERR(link); } res = readlink_copy(buffer, buflen, link); - if (inode->i_op->put_link) - inode->i_op->put_link(inode, cookie); + do_delayed_call(&done); return res; } EXPORT_SYMBOL(generic_readlink); /* get the link contents into pagecache */ -static char *page_getlink(struct dentry * dentry, struct page **ppage) +const char *page_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *callback) { char *kaddr; struct page *page; - struct address_space *mapping = dentry->d_inode->i_mapping; - page = read_mapping_page(mapping, 0, NULL); - if (IS_ERR(page)) - return (char*)page; - *ppage = page; - kaddr = kmap(page); - nd_terminate_link(kaddr, dentry->d_inode->i_size, PAGE_SIZE - 1); + struct address_space *mapping = inode->i_mapping; + + if (!dentry) { + page = find_get_page(mapping, 0); + if (!page) + return ERR_PTR(-ECHILD); + if (!PageUptodate(page)) { + put_page(page); + return ERR_PTR(-ECHILD); + } + } else { + page = read_mapping_page(mapping, 0, NULL); + if (IS_ERR(page)) + return (char*)page; + } + set_delayed_call(callback, page_put_link, page); + BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM); + kaddr = page_address(page); + nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1); return kaddr; } -int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) -{ - struct page *page = NULL; - int res = readlink_copy(buffer, buflen, page_getlink(dentry, &page)); - if (page) { - kunmap(page); - page_cache_release(page); - } - return res; -} -EXPORT_SYMBOL(page_readlink); +EXPORT_SYMBOL(page_get_link); -const char *page_follow_link_light(struct dentry *dentry, void **cookie) +void page_put_link(void *arg) { - struct page *page = NULL; - char *res = page_getlink(dentry, &page); - if (!IS_ERR(res)) - *cookie = page; - return res; + put_page(arg); } -EXPORT_SYMBOL(page_follow_link_light); +EXPORT_SYMBOL(page_put_link); -void page_put_link(struct inode *unused, void *cookie) +int page_readlink(struct dentry *dentry, char __user *buffer, int buflen) { - struct page *page = cookie; - kunmap(page); - page_cache_release(page); + DEFINE_DELAYED_CALL(done); + int res = readlink_copy(buffer, buflen, + page_get_link(dentry, d_inode(dentry), + &done)); + do_delayed_call(&done); + return res; } -EXPORT_SYMBOL(page_put_link); +EXPORT_SYMBOL(page_readlink); /* * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS @@ -4579,7 +4643,6 @@ int __page_symlink(struct inode *inode, const char *symname, int len, int nofs) struct page *page; void *fsdata; int err; - char *kaddr; unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE; if (nofs) flags |= AOP_FLAG_NOFS; @@ -4590,9 +4653,7 @@ retry: if (err) goto fail; - kaddr = kmap_atomic(page); - memcpy(kaddr, symname, len-1); - kunmap_atomic(kaddr); + memcpy(page_address(page), symname, len-1); err = pagecache_write_end(NULL, mapping, 0, len-1, len-1, page, fsdata); @@ -4617,7 +4678,6 @@ EXPORT_SYMBOL(page_symlink); const struct inode_operations page_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, + .get_link = page_get_link, }; EXPORT_SYMBOL(page_symlink_inode_operations); diff --git a/fs/namespace.c b/fs/namespace.c index fc5002819..4fb1691b4 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -463,7 +463,6 @@ void __mnt_drop_write(struct vfsmount *mnt) mnt_dec_writers(real_mount(mnt)); preempt_enable(); } -EXPORT_SYMBOL_GPL(__mnt_drop_write); /** * mnt_drop_write - give up write access to a mount @@ -1585,6 +1584,14 @@ static inline bool may_mount(void) return ns_capable(current->nsproxy->mnt_ns->user_ns, CAP_SYS_ADMIN); } +static inline bool may_mandlock(void) +{ +#ifndef CONFIG_MANDATORY_FILE_LOCKING + return false; +#endif + return capable(CAP_SYS_ADMIN); +} + /* * Now umount can handle mount points as well as block devices. * This is important for filesystems which use unnamed block devices. @@ -1804,7 +1811,6 @@ int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, } return 0; } -EXPORT_SYMBOL_GPL(iterate_mounts); static void cleanup_group_ids(struct mount *mnt, struct mount *end) { @@ -1955,9 +1961,9 @@ static struct mountpoint *lock_mount(struct path *path) struct vfsmount *mnt; struct dentry *dentry = path->dentry; retry: - mutex_lock(&dentry->d_inode->i_mutex); + inode_lock(dentry->d_inode); if (unlikely(cant_mount(dentry))) { - mutex_unlock(&dentry->d_inode->i_mutex); + inode_unlock(dentry->d_inode); return ERR_PTR(-ENOENT); } namespace_lock(); @@ -1968,13 +1974,13 @@ retry: mp = new_mountpoint(dentry); if (IS_ERR(mp)) { namespace_unlock(); - mutex_unlock(&dentry->d_inode->i_mutex); + inode_unlock(dentry->d_inode); return mp; } return mp; } namespace_unlock(); - mutex_unlock(&path->dentry->d_inode->i_mutex); + inode_unlock(path->dentry->d_inode); path_put(path); path->mnt = mnt; dentry = path->dentry = dget(mnt->mnt_root); @@ -1986,7 +1992,7 @@ static void unlock_mount(struct mountpoint *where) struct dentry *dentry = where->m_dentry; put_mountpoint(where); namespace_unlock(); - mutex_unlock(&dentry->d_inode->i_mutex); + inode_unlock(dentry->d_inode); } static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) @@ -2603,18 +2609,18 @@ static long exact_copy_from_user(void *to, const void __user * from, return n; } -int copy_mount_options(const void __user * data, unsigned long *where) +void *copy_mount_options(const void __user * data) { int i; - unsigned long page; unsigned long size; + char *copy; - *where = 0; if (!data) - return 0; + return NULL; - if (!(page = __get_free_page(GFP_KERNEL))) - return -ENOMEM; + copy = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!copy) + return ERR_PTR(-ENOMEM); /* We only care that *some* data at the address the user * gave us is valid. Just in case, we'll zero @@ -2625,15 +2631,14 @@ int copy_mount_options(const void __user * data, unsigned long *where) if (size > PAGE_SIZE) size = PAGE_SIZE; - i = size - exact_copy_from_user((void *)page, data, size); + i = size - exact_copy_from_user(copy, data, size); if (!i) { - free_page(page); - return -EFAULT; + kfree(copy); + return ERR_PTR(-EFAULT); } if (i != PAGE_SIZE) - memset((char *)page + i, 0, PAGE_SIZE - i); - *where = page; - return 0; + memset(copy + i, 0, PAGE_SIZE - i); + return copy; } char *copy_mount_string(const void __user *data) @@ -2679,6 +2684,8 @@ long do_mount(const char *dev_name, const char __user *dir_name, type_page, flags, data_page); if (!retval && !may_mount()) retval = -EPERM; + if (!retval && (flags & MS_MANDLOCK) && !may_mandlock()) + retval = -EPERM; if (retval) goto dput_out; @@ -2898,7 +2905,7 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, int ret; char *kernel_type; char *kernel_dev; - unsigned long data_page; + void *options; kernel_type = copy_mount_string(type); ret = PTR_ERR(kernel_type); @@ -2910,14 +2917,14 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, if (IS_ERR(kernel_dev)) goto out_dev; - ret = copy_mount_options(data, &data_page); - if (ret < 0) + options = copy_mount_options(data); + ret = PTR_ERR(options); + if (IS_ERR(options)) goto out_data; - ret = do_mount(kernel_dev, dir_name, kernel_type, flags, - (void *) data_page); + ret = do_mount(kernel_dev, dir_name, kernel_type, flags, options); - free_page(data_page); + kfree(options); out_data: kfree(kernel_dev); out_dev: @@ -2941,9 +2948,9 @@ bool is_path_reachable(struct mount *mnt, struct dentry *dentry, return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry); } -int path_is_under(struct path *path1, struct path *path2) +bool path_is_under(struct path *path1, struct path *path2) { - int res; + bool res; read_seqlock_excl(&mount_lock); res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2); read_sequnlock_excl(&mount_lock); diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index f0e3e9e74..b7f8eaeea 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -369,7 +369,7 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags) if (!res) { struct inode *inode = d_inode(dentry); - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (finfo.i.dirEntNum == NCP_FINFO(inode)->dirEntNum) { ncp_new_dentry(dentry); val=1; @@ -377,7 +377,7 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags) ncp_dbg(2, "found, but dirEntNum changed\n"); ncp_update_inode2(inode, &finfo); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } finished: @@ -633,15 +633,15 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx, d_rehash(newdent); } else { spin_lock(&dentry->d_lock); - NCP_FINFO(inode)->flags &= ~NCPI_DIR_CACHE; + NCP_FINFO(dir)->flags &= ~NCPI_DIR_CACHE; spin_unlock(&dentry->d_lock); } } else { struct inode *inode = d_inode(newdent); - mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(inode, I_MUTEX_CHILD); ncp_update_inode2(inode, entry); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } if (ctl.idx >= NCP_DIRCACHE_SIZE) { diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index 011324ce9..dd38ca1f2 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -224,10 +224,10 @@ ncp_file_write_iter(struct kiocb *iocb, struct iov_iter *from) iocb->ki_pos = pos; if (pos > i_size_read(inode)) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (pos > i_size_read(inode)) i_size_write(inode, pos); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } ncp_dbg(1, "exit %pD2\n", file); outrel: diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index 9605a2f63..1af15fcbe 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -82,7 +82,7 @@ static int init_inodecache(void) ncp_inode_cachep = kmem_cache_create("ncp_inode_cache", sizeof(struct ncp_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (ncp_inode_cachep == NULL) return -ENOMEM; @@ -244,8 +244,7 @@ static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo) #if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS) static const struct inode_operations ncp_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, + .get_link = page_get_link, .setattr = ncp_notify_change, }; #endif @@ -283,6 +282,7 @@ ncp_iget(struct super_block *sb, struct ncp_entry_info *info) #if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS) } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &ncp_symlink_inode_operations; + inode_nohighmem(inode); inode->i_data.a_ops = &ncp_symlink_aops; #endif } else { diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c index c59a59c37..35ab51c04 100644 --- a/fs/nfs/blocklayout/extent_tree.c +++ b/fs/nfs/blocklayout/extent_tree.c @@ -476,6 +476,7 @@ static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg, for (i = 0; i < nr_pages; i++) put_page(arg->layoutupdate_pages[i]); + vfree(arg->start_p); kfree(arg->layoutupdate_pages); } else { put_page(arg->layoutupdate_page); @@ -559,10 +560,15 @@ retry: if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) { void *p = start_p, *end = p + arg->layoutupdate_len; + struct page *page = NULL; int i = 0; - for ( ; p < end; p += PAGE_SIZE) - arg->layoutupdate_pages[i++] = vmalloc_to_page(p); + arg->start_p = start_p; + for ( ; p < end; p += PAGE_SIZE) { + page = vmalloc_to_page(p); + arg->layoutupdate_pages[i++] = page; + get_page(page); + } } dprintk("%s found %zu ranges\n", __func__, count); diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 807eb6ef4..f0939d097 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -83,8 +83,11 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy, res = htonl(NFS4ERR_BADHANDLE); inode = nfs_delegation_find_inode(cps->clp, &args->fh); - if (inode == NULL) + if (inode == NULL) { + trace_nfs4_cb_recall(cps->clp, &args->fh, NULL, + &args->stateid, -ntohl(res)); goto out; + } /* Set up a helper thread to actually return the delegation */ switch (nfs_async_inode_return_delegation(inode, &args->stateid)) { case 0: @@ -96,7 +99,8 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy, default: res = htonl(NFS4ERR_RESOURCE); } - trace_nfs4_recall_delegation(inode, -ntohl(res)); + trace_nfs4_cb_recall(cps->clp, &args->fh, inode, + &args->stateid, -ntohl(res)); iput(inode); out: dprintk("%s: exit with status = %d\n", __func__, ntohl(res)); @@ -160,6 +164,22 @@ static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp, return lo; } +/* + * Enforce RFC5661 section 12.5.5.2.1. (Layout Recall and Return Sequencing) + */ +static bool pnfs_check_stateid_sequence(struct pnfs_layout_hdr *lo, + const nfs4_stateid *new) +{ + u32 oldseq, newseq; + + oldseq = be32_to_cpu(lo->plh_stateid.seqid); + newseq = be32_to_cpu(new->seqid); + + if (newseq > oldseq + 1) + return false; + return true; +} + static u32 initiate_file_draining(struct nfs_client *clp, struct cb_layoutrecallargs *args) { @@ -169,34 +189,52 @@ static u32 initiate_file_draining(struct nfs_client *clp, LIST_HEAD(free_me_list); lo = get_layout_by_fh(clp, &args->cbl_fh, &args->cbl_stateid); - if (!lo) + if (!lo) { + trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, NULL, + &args->cbl_stateid, -rv); goto out; + } ino = lo->plh_inode; spin_lock(&ino->i_lock); + if (!pnfs_check_stateid_sequence(lo, &args->cbl_stateid)) { + rv = NFS4ERR_DELAY; + goto unlock; + } pnfs_set_layout_stateid(lo, &args->cbl_stateid, true); spin_unlock(&ino->i_lock); pnfs_layoutcommit_inode(ino, false); spin_lock(&ino->i_lock); - if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || - pnfs_mark_matching_lsegs_invalid(lo, &free_me_list, - &args->cbl_range)) { + /* + * Enforce RFC5661 Section 12.5.5.2.1.5 (Bulk Recall and Return) + */ + if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { rv = NFS4ERR_DELAY; goto unlock; } + if (pnfs_mark_matching_lsegs_return(lo, &free_me_list, + &args->cbl_range)) { + rv = NFS4_OK; + goto unlock; + } + if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) { NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &args->cbl_range); } + pnfs_mark_layout_returned_if_empty(lo); unlock: spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&free_me_list); + /* Free all lsegs that are attached to commit buckets */ + nfs_commit_inode(ino, 0); pnfs_put_layout_hdr(lo); - trace_nfs4_cb_layoutrecall_inode(clp, &args->cbl_fh, ino, -rv); + trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, ino, + &args->cbl_stateid, -rv); iput(ino); out: return rv; diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index ce5a21861..9cce67043 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -940,7 +940,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) dfprintk(FILE, "NFS: llseek dir(%pD2, %lld, %d)\n", filp, offset, whence); - mutex_lock(&inode->i_mutex); + inode_lock(inode); switch (whence) { case 1: offset += filp->f_pos; @@ -957,7 +957,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) dir_ctx->duped = 0; } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return offset; } @@ -972,9 +972,9 @@ static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end, dfprintk(FILE, "NFS: fsync dir(%pD2) datasync %d\n", filp, datasync); - mutex_lock(&inode->i_mutex); + inode_lock(inode); nfs_inc_stats(inode, NFSIOS_VFSFSYNC); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return 0; } @@ -1894,15 +1894,14 @@ int nfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) attr.ia_mode = S_IFLNK | S_IRWXUGO; attr.ia_valid = ATTR_MODE; - page = alloc_page(GFP_HIGHUSER); + page = alloc_page(GFP_USER); if (!page) return -ENOMEM; - kaddr = kmap_atomic(page); + kaddr = page_address(page); memcpy(kaddr, symname, pathlen); if (pathlen < PAGE_SIZE) memset(kaddr + pathlen, 0, PAGE_SIZE - pathlen); - kunmap_atomic(kaddr); trace_nfs_symlink_enter(dir, dentry); error = NFS_PROTO(dir)->symlink(dir, dentry, page, pathlen, &attr); @@ -2432,6 +2431,20 @@ int nfs_may_open(struct inode *inode, struct rpc_cred *cred, int openflags) } EXPORT_SYMBOL_GPL(nfs_may_open); +static int nfs_execute_ok(struct inode *inode, int mask) +{ + struct nfs_server *server = NFS_SERVER(inode); + int ret; + + if (mask & MAY_NOT_BLOCK) + ret = nfs_revalidate_inode_rcu(server, inode); + else + ret = nfs_revalidate_inode(server, inode); + if (ret == 0 && !execute_ok(inode)) + ret = -EACCES; + return ret; +} + int nfs_permission(struct inode *inode, int mask) { struct rpc_cred *cred; @@ -2449,6 +2462,9 @@ int nfs_permission(struct inode *inode, int mask) case S_IFLNK: goto out; case S_IFREG: + if ((mask & MAY_OPEN) && + nfs_server_capable(inode, NFS_CAP_ATOMIC_OPEN)) + return 0; break; case S_IFDIR: /* @@ -2481,8 +2497,8 @@ force_lookup: res = PTR_ERR(cred); } out: - if (!res && (mask & MAY_EXEC) && !execute_ok(inode)) - res = -EACCES; + if (!res && (mask & MAY_EXEC)) + res = nfs_execute_ok(inode, mask); dfprintk(VFS, "NFS: permission(%s/%lu), mask=0x%x, res=%d\n", inode->i_sb->s_id, inode->i_ino, mask, res); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 4b1d08f56..7a0cfd326 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -117,12 +117,6 @@ static inline int put_dreq(struct nfs_direct_req *dreq) return atomic_dec_and_test(&dreq->io_count); } -void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq) -{ - dreq->flags = NFS_ODIRECT_RESCHED_WRITES; -} -EXPORT_SYMBOL_GPL(nfs_direct_set_resched_writes); - static void nfs_direct_good_bytes(struct nfs_direct_req *dreq, struct nfs_pgio_header *hdr) { @@ -586,7 +580,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, if (!count) goto out; - mutex_lock(&inode->i_mutex); + inode_lock(inode); result = nfs_sync_mapping(mapping); if (result) goto out_unlock; @@ -614,7 +608,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, NFS_I(inode)->read_io += count; result = nfs_direct_read_schedule_iovec(dreq, iter, pos); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!result) { result = nfs_direct_wait(dreq); @@ -628,7 +622,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, out_release: nfs_direct_req_release(dreq); out_unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); out: return result; } @@ -670,6 +664,10 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) req = nfs_list_entry(reqs.next); nfs_direct_setup_mirroring(dreq, &desc, req); + if (desc.pg_error < 0) { + list_splice_init(&reqs, &failed); + goto out_failed; + } list_for_each_entry_safe(req, tmp, &reqs, wb_list) { if (!nfs_pageio_add_request(&desc, req)) { @@ -677,13 +675,17 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) nfs_list_add_request(req, &failed); spin_lock(cinfo.lock); dreq->flags = 0; - dreq->error = -EIO; + if (desc.pg_error < 0) + dreq->error = desc.pg_error; + else + dreq->error = -EIO; spin_unlock(cinfo.lock); } nfs_release_request(req); } nfs_pageio_complete(&desc); +out_failed: while (!list_empty(&failed)) { req = nfs_list_entry(failed.next); nfs_list_remove_request(req); @@ -727,14 +729,20 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data) nfs_direct_write_complete(dreq, data->inode); } -static void nfs_direct_error_cleanup(struct nfs_inode *nfsi) +static void nfs_direct_resched_write(struct nfs_commit_info *cinfo, + struct nfs_page *req) { - /* There is no lock to clear */ + struct nfs_direct_req *dreq = cinfo->dreq; + + spin_lock(&dreq->lock); + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + spin_unlock(&dreq->lock); + nfs_mark_request_commit(req, NULL, cinfo, 0); } static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops = { .completion = nfs_direct_commit_complete, - .error_cleanup = nfs_direct_error_cleanup, + .resched_write = nfs_direct_resched_write, }; static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) @@ -839,10 +847,25 @@ static void nfs_write_sync_pgio_error(struct list_head *head) } } +static void nfs_direct_write_reschedule_io(struct nfs_pgio_header *hdr) +{ + struct nfs_direct_req *dreq = hdr->dreq; + + spin_lock(&dreq->lock); + if (dreq->error == 0) { + dreq->flags = NFS_ODIRECT_RESCHED_WRITES; + /* fake unstable write to let common nfs resend pages */ + hdr->verf.committed = NFS_UNSTABLE; + hdr->good_bytes = hdr->args.count; + } + spin_unlock(&dreq->lock); +} + static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = { .error_cleanup = nfs_write_sync_pgio_error, .init_hdr = nfs_direct_pgio_init, .completion = nfs_direct_write_completion, + .reschedule_io = nfs_direct_write_reschedule_io, }; @@ -900,6 +923,11 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, } nfs_direct_setup_mirroring(dreq, &desc, req); + if (desc.pg_error < 0) { + nfs_free_request(req); + result = desc.pg_error; + break; + } nfs_lock_request(req); req->wb_index = pos >> PAGE_SHIFT; @@ -977,7 +1005,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) pos = iocb->ki_pos; end = (pos + iov_iter_count(iter) - 1) >> PAGE_CACHE_SHIFT; - mutex_lock(&inode->i_mutex); + inode_lock(inode); result = nfs_sync_mapping(mapping); if (result) @@ -1017,7 +1045,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) pos >> PAGE_CACHE_SHIFT, end); } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!result) { result = nfs_direct_wait(dreq); @@ -1038,7 +1066,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) out_release: nfs_direct_req_release(dreq); out_unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return result; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 93e236429..748bb813b 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -278,9 +278,9 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret != 0) break; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = nfs_file_fsync_commit(file, start, end, datasync); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* * If nfs_file_fsync_commit detected a server reboot, then * resend all dirty pages that might have been covered by @@ -514,7 +514,7 @@ static void nfs_check_dirty_writeback(struct page *page, * so it will not block due to pages that will shortly be freeable. */ nfsi = NFS_I(mapping->host); - if (test_bit(NFS_INO_COMMIT, &nfsi->flags)) { + if (atomic_read(&nfsi->commit_info.rpcs_out)) { *writeback = true; return; } @@ -545,7 +545,7 @@ static int nfs_launder_page(struct page *page) inode->i_ino, (long long)page_offset(page)); nfs_fscache_wait_on_page_write(nfsi, page); - return nfs_wb_page(inode, page); + return nfs_wb_launder_page(inode, page); } static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, @@ -756,7 +756,7 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local) l_ctx = nfs_get_lock_context(nfs_file_open_context(filp)); if (!IS_ERR(l_ctx)) { - status = nfs_iocounter_wait(&l_ctx->io_count); + status = nfs_iocounter_wait(l_ctx); nfs_put_lock_context(l_ctx); if (status < 0) return status; diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 02ec07973..3384dc8e6 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -202,6 +202,7 @@ static int filelayout_async_handle_error(struct rpc_task *task, task->tk_status); nfs4_mark_deviceid_unavailable(devid); pnfs_error_mark_layout_for_return(inode, lseg); + pnfs_set_lo_fail(lseg); rpc_wake_up(&tbl->slot_tbl_waitq); /* fall through */ default: @@ -883,13 +884,19 @@ static void filelayout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { - if (!pgio->pg_lseg) + if (!pgio->pg_lseg) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, 0, NFS4_MAX_UINT64, IOMODE_READ, GFP_KERNEL); + if (IS_ERR(pgio->pg_lseg)) { + pgio->pg_error = PTR_ERR(pgio->pg_lseg); + pgio->pg_lseg = NULL; + return; + } + } /* If no lseg, fall back to read through mds */ if (pgio->pg_lseg == NULL) nfs_pageio_reset_read_mds(pgio); @@ -902,13 +909,20 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_commit_info cinfo; int status; - if (!pgio->pg_lseg) + if (!pgio->pg_lseg) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, 0, NFS4_MAX_UINT64, IOMODE_RW, GFP_NOFS); + if (IS_ERR(pgio->pg_lseg)) { + pgio->pg_error = PTR_ERR(pgio->pg_lseg); + pgio->pg_lseg = NULL; + return; + } + } + /* If no lseg, fall back to write through mds */ if (pgio->pg_lseg == NULL) goto out_mds; @@ -957,7 +971,7 @@ filelayout_mark_request_commit(struct nfs_page *req, u32 i, j; if (fl->commit_through_mds) { - nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo); + nfs_request_add_commit_list(req, cinfo); } else { /* Note that we are calling nfs4_fl_calc_j_index on each page * that ends up being committed to a data server. An attractive diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 2a2e2d8dd..0cb1abd53 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -505,9 +505,17 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, } p = xdr_inline_decode(&stream, 4); - if (p) - fls->flags = be32_to_cpup(p); + if (!p) + goto out_sort_mirrors; + fls->flags = be32_to_cpup(p); + + p = xdr_inline_decode(&stream, 4); + if (!p) + goto out_sort_mirrors; + for (i=0; i < fls->mirror_array_cnt; i++) + fls->mirror_array[i]->report_interval = be32_to_cpup(p); +out_sort_mirrors: ff_layout_sort_mirrors(fls); rc = ff_layout_check_layout(lgr); if (rc) @@ -603,7 +611,9 @@ nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror, mirror->start_time = now; if (ktime_equal(mirror->last_report_time, notime)) mirror->last_report_time = now; - if (layoutstats_timer != 0) + if (mirror->report_interval != 0) + report_interval = (s64)mirror->report_interval * 1000LL; + else if (layoutstats_timer != 0) report_interval = (s64)layoutstats_timer * 1000LL; if (ktime_to_ms(ktime_sub(now, mirror->last_report_time)) >= report_interval) { @@ -785,13 +795,19 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, int ds_idx; /* Use full layout for now */ - if (!pgio->pg_lseg) + if (!pgio->pg_lseg) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, 0, NFS4_MAX_UINT64, IOMODE_READ, GFP_KERNEL); + if (IS_ERR(pgio->pg_lseg)) { + pgio->pg_error = PTR_ERR(pgio->pg_lseg); + pgio->pg_lseg = NULL; + return; + } + } /* If no lseg, fall back to read through mds */ if (pgio->pg_lseg == NULL) goto out_mds; @@ -825,13 +841,19 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio, int i; int status; - if (!pgio->pg_lseg) + if (!pgio->pg_lseg) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, 0, NFS4_MAX_UINT64, IOMODE_RW, GFP_NOFS); + if (IS_ERR(pgio->pg_lseg)) { + pgio->pg_error = PTR_ERR(pgio->pg_lseg); + pgio->pg_lseg = NULL; + return; + } + } /* If no lseg, fall back to write through mds */ if (pgio->pg_lseg == NULL) goto out_mds; @@ -867,18 +889,25 @@ static unsigned int ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { - if (!pgio->pg_lseg) + if (!pgio->pg_lseg) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, 0, NFS4_MAX_UINT64, IOMODE_RW, GFP_NOFS); + if (IS_ERR(pgio->pg_lseg)) { + pgio->pg_error = PTR_ERR(pgio->pg_lseg); + pgio->pg_lseg = NULL; + goto out; + } + } if (pgio->pg_lseg) return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg); /* no lseg means that pnfs is not in use, so no mirroring here */ nfs_pageio_reset_write_mds(pgio); +out: return 1; } @@ -912,18 +941,7 @@ static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs) hdr->args.count, (unsigned long long)hdr->args.offset); - if (!hdr->dreq) { - struct nfs_open_context *ctx; - - ctx = nfs_list_entry(hdr->pages.next)->wb_context; - set_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags); - hdr->completion_ops->error_cleanup(&hdr->pages); - } else { - nfs_direct_set_resched_writes(hdr->dreq); - /* fake unstable write to let common nfs resend pages */ - hdr->verf.committed = NFS_UNSTABLE; - hdr->good_bytes = hdr->args.count; - } + hdr->completion_ops->reschedule_io(hdr); return; } @@ -1101,7 +1119,7 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task, return -NFS4ERR_RESET_TO_PNFS; out_retry: task->tk_status = 0; - rpc_restart_call(task); + rpc_restart_call_prepare(task); rpc_delay(task, NFS_JUKEBOX_RETRY_TIME); return -EAGAIN; } @@ -1159,6 +1177,14 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, } } + switch (status) { + case NFS4ERR_DELAY: + case NFS4ERR_GRACE: + return; + default: + break; + } + mirror = FF_LAYOUT_COMP(lseg, idx); err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), mirror, offset, length, status, opnum, @@ -1189,7 +1215,7 @@ static int ff_layout_read_done_cb(struct rpc_task *task, hdr->pgio_mirror_idx + 1, &hdr->pgio_mirror_idx)) goto out_eagain; - set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, + set_bit(NFS_LAYOUT_RETURN_REQUESTED, &hdr->lseg->pls_layout->plh_flags); pnfs_read_resend_pnfs(hdr); return task->tk_status; @@ -1242,14 +1268,31 @@ ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx) return ff_layout_test_devid_unavailable(node); } -static int ff_layout_read_prepare_common(struct rpc_task *task, - struct nfs_pgio_header *hdr) +static void ff_layout_read_record_layoutstats_start(struct rpc_task *task, + struct nfs_pgio_header *hdr) { + if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags)) + return; nfs4_ff_layout_stat_io_start_read(hdr->inode, FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), hdr->args.count, task->tk_start); +} + +static void ff_layout_read_record_layoutstats_done(struct rpc_task *task, + struct nfs_pgio_header *hdr) +{ + if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags)) + return; + nfs4_ff_layout_stat_io_end_read(task, + FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), + hdr->args.count, + hdr->res.count); +} +static int ff_layout_read_prepare_common(struct rpc_task *task, + struct nfs_pgio_header *hdr) +{ if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) { rpc_exit(task, -EIO); return -EIO; @@ -1265,6 +1308,7 @@ static int ff_layout_read_prepare_common(struct rpc_task *task, } hdr->pgio_done_cb = ff_layout_read_done_cb; + ff_layout_read_record_layoutstats_start(task, hdr); return 0; } @@ -1323,10 +1367,6 @@ static void ff_layout_read_call_done(struct rpc_task *task, void *data) dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status); - nfs4_ff_layout_stat_io_end_read(task, - FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), - hdr->args.count, hdr->res.count); - if (test_bit(NFS_IOHDR_REDO, &hdr->flags) && task->tk_status == 0) { nfs4_sequence_done(task, &hdr->res.seq_res); @@ -1341,10 +1381,20 @@ static void ff_layout_read_count_stats(struct rpc_task *task, void *data) { struct nfs_pgio_header *hdr = data; + ff_layout_read_record_layoutstats_done(task, hdr); rpc_count_iostats_metrics(task, &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_READ]); } +static void ff_layout_read_release(void *data) +{ + struct nfs_pgio_header *hdr = data; + + ff_layout_read_record_layoutstats_done(&hdr->task, hdr); + pnfs_generic_rw_release(data); +} + + static int ff_layout_write_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) { @@ -1362,15 +1412,12 @@ static int ff_layout_write_done_cb(struct rpc_task *task, switch (err) { case -NFS4ERR_RESET_TO_PNFS: - pnfs_set_retry_layoutget(hdr->lseg->pls_layout); ff_layout_reset_write(hdr, true); return task->tk_status; case -NFS4ERR_RESET_TO_MDS: - pnfs_clear_retry_layoutget(hdr->lseg->pls_layout); ff_layout_reset_write(hdr, false); return task->tk_status; case -EAGAIN: - rpc_restart_call_prepare(task); return -EAGAIN; } @@ -1402,11 +1449,9 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, switch (err) { case -NFS4ERR_RESET_TO_PNFS: - pnfs_set_retry_layoutget(data->lseg->pls_layout); pnfs_generic_prepare_to_resend_writes(data); return -EAGAIN; case -NFS4ERR_RESET_TO_MDS: - pnfs_clear_retry_layoutget(data->lseg->pls_layout); pnfs_generic_prepare_to_resend_writes(data); return -EAGAIN; case -EAGAIN: @@ -1421,14 +1466,31 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, return 0; } -static int ff_layout_write_prepare_common(struct rpc_task *task, - struct nfs_pgio_header *hdr) +static void ff_layout_write_record_layoutstats_start(struct rpc_task *task, + struct nfs_pgio_header *hdr) { + if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags)) + return; nfs4_ff_layout_stat_io_start_write(hdr->inode, FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), hdr->args.count, task->tk_start); +} + +static void ff_layout_write_record_layoutstats_done(struct rpc_task *task, + struct nfs_pgio_header *hdr) +{ + if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags)) + return; + nfs4_ff_layout_stat_io_end_write(task, + FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), + hdr->args.count, hdr->res.count, + hdr->res.verf->committed); +} +static int ff_layout_write_prepare_common(struct rpc_task *task, + struct nfs_pgio_header *hdr) +{ if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) { rpc_exit(task, -EIO); return -EIO; @@ -1445,6 +1507,7 @@ static int ff_layout_write_prepare_common(struct rpc_task *task, return -EAGAIN; } + ff_layout_write_record_layoutstats_start(task, hdr); return 0; } @@ -1480,11 +1543,6 @@ static void ff_layout_write_call_done(struct rpc_task *task, void *data) { struct nfs_pgio_header *hdr = data; - nfs4_ff_layout_stat_io_end_write(task, - FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), - hdr->args.count, hdr->res.count, - hdr->res.verf->committed); - if (test_bit(NFS_IOHDR_REDO, &hdr->flags) && task->tk_status == 0) { nfs4_sequence_done(task, &hdr->res.seq_res); @@ -1499,18 +1557,53 @@ static void ff_layout_write_count_stats(struct rpc_task *task, void *data) { struct nfs_pgio_header *hdr = data; + ff_layout_write_record_layoutstats_done(task, hdr); rpc_count_iostats_metrics(task, &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_WRITE]); } -static void ff_layout_commit_prepare_common(struct rpc_task *task, +static void ff_layout_write_release(void *data) +{ + struct nfs_pgio_header *hdr = data; + + ff_layout_write_record_layoutstats_done(&hdr->task, hdr); + pnfs_generic_rw_release(data); +} + +static void ff_layout_commit_record_layoutstats_start(struct rpc_task *task, struct nfs_commit_data *cdata) { + if (test_and_set_bit(NFS_IOHDR_STAT, &cdata->flags)) + return; nfs4_ff_layout_stat_io_start_write(cdata->inode, FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index), 0, task->tk_start); } +static void ff_layout_commit_record_layoutstats_done(struct rpc_task *task, + struct nfs_commit_data *cdata) +{ + struct nfs_page *req; + __u64 count = 0; + + if (!test_and_clear_bit(NFS_IOHDR_STAT, &cdata->flags)) + return; + + if (task->tk_status == 0) { + list_for_each_entry(req, &cdata->pages, wb_list) + count += req->wb_bytes; + } + nfs4_ff_layout_stat_io_end_write(task, + FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index), + count, count, NFS_FILE_SYNC); +} + +static void ff_layout_commit_prepare_common(struct rpc_task *task, + struct nfs_commit_data *cdata) +{ + ff_layout_commit_record_layoutstats_start(task, cdata); +} + static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data) { ff_layout_commit_prepare_common(task, data); @@ -1531,19 +1624,6 @@ static void ff_layout_commit_prepare_v4(struct rpc_task *task, void *data) static void ff_layout_commit_done(struct rpc_task *task, void *data) { - struct nfs_commit_data *cdata = data; - struct nfs_page *req; - __u64 count = 0; - - if (task->tk_status == 0) { - list_for_each_entry(req, &cdata->pages, wb_list) - count += req->wb_bytes; - } - - nfs4_ff_layout_stat_io_end_write(task, - FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index), - count, count, NFS_FILE_SYNC); - pnfs_generic_write_commit_done(task, data); } @@ -1551,50 +1631,59 @@ static void ff_layout_commit_count_stats(struct rpc_task *task, void *data) { struct nfs_commit_data *cdata = data; + ff_layout_commit_record_layoutstats_done(task, cdata); rpc_count_iostats_metrics(task, &NFS_CLIENT(cdata->inode)->cl_metrics[NFSPROC4_CLNT_COMMIT]); } +static void ff_layout_commit_release(void *data) +{ + struct nfs_commit_data *cdata = data; + + ff_layout_commit_record_layoutstats_done(&cdata->task, cdata); + pnfs_generic_commit_release(data); +} + static const struct rpc_call_ops ff_layout_read_call_ops_v3 = { .rpc_call_prepare = ff_layout_read_prepare_v3, .rpc_call_done = ff_layout_read_call_done, .rpc_count_stats = ff_layout_read_count_stats, - .rpc_release = pnfs_generic_rw_release, + .rpc_release = ff_layout_read_release, }; static const struct rpc_call_ops ff_layout_read_call_ops_v4 = { .rpc_call_prepare = ff_layout_read_prepare_v4, .rpc_call_done = ff_layout_read_call_done, .rpc_count_stats = ff_layout_read_count_stats, - .rpc_release = pnfs_generic_rw_release, + .rpc_release = ff_layout_read_release, }; static const struct rpc_call_ops ff_layout_write_call_ops_v3 = { .rpc_call_prepare = ff_layout_write_prepare_v3, .rpc_call_done = ff_layout_write_call_done, .rpc_count_stats = ff_layout_write_count_stats, - .rpc_release = pnfs_generic_rw_release, + .rpc_release = ff_layout_write_release, }; static const struct rpc_call_ops ff_layout_write_call_ops_v4 = { .rpc_call_prepare = ff_layout_write_prepare_v4, .rpc_call_done = ff_layout_write_call_done, .rpc_count_stats = ff_layout_write_count_stats, - .rpc_release = pnfs_generic_rw_release, + .rpc_release = ff_layout_write_release, }; static const struct rpc_call_ops ff_layout_commit_call_ops_v3 = { .rpc_call_prepare = ff_layout_commit_prepare_v3, .rpc_call_done = ff_layout_commit_done, .rpc_count_stats = ff_layout_commit_count_stats, - .rpc_release = pnfs_generic_commit_release, + .rpc_release = ff_layout_commit_release, }; static const struct rpc_call_ops ff_layout_commit_call_ops_v4 = { .rpc_call_prepare = ff_layout_commit_prepare_v4, .rpc_call_done = ff_layout_commit_done, .rpc_count_stats = ff_layout_commit_count_stats, - .rpc_release = pnfs_generic_commit_release, + .rpc_release = ff_layout_commit_release, }; static enum pnfs_try_status diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h index 2bb08bc6a..dd353bb7d 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -85,6 +85,7 @@ struct nfs4_ff_layout_mirror { struct nfs4_ff_layoutstat write_stat; ktime_t start_time; ktime_t last_report_time; + u32 report_interval; }; struct nfs4_ff_layout_segment { diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index e125e55de..eb370460c 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -218,63 +218,55 @@ static void extend_ds_error(struct nfs4_ff_layout_ds_err *err, err->length = end - err->offset; } -static bool ds_error_can_merge(struct nfs4_ff_layout_ds_err *err, u64 offset, - u64 length, int status, enum nfs_opnum4 opnum, - nfs4_stateid *stateid, - struct nfs4_deviceid *deviceid) +static int +ff_ds_error_match(const struct nfs4_ff_layout_ds_err *e1, + const struct nfs4_ff_layout_ds_err *e2) { - return err->status == status && err->opnum == opnum && - nfs4_stateid_match(&err->stateid, stateid) && - !memcmp(&err->deviceid, deviceid, sizeof(*deviceid)) && - end_offset(err->offset, err->length) >= offset && - err->offset <= end_offset(offset, length); -} - -static bool merge_ds_error(struct nfs4_ff_layout_ds_err *old, - struct nfs4_ff_layout_ds_err *new) -{ - if (!ds_error_can_merge(old, new->offset, new->length, new->status, - new->opnum, &new->stateid, &new->deviceid)) - return false; - - extend_ds_error(old, new->offset, new->length); - return true; + int ret; + + if (e1->opnum != e2->opnum) + return e1->opnum < e2->opnum ? -1 : 1; + if (e1->status != e2->status) + return e1->status < e2->status ? -1 : 1; + ret = memcmp(&e1->stateid, &e2->stateid, sizeof(e1->stateid)); + if (ret != 0) + return ret; + ret = memcmp(&e1->deviceid, &e2->deviceid, sizeof(e1->deviceid)); + if (ret != 0) + return ret; + if (end_offset(e1->offset, e1->length) < e2->offset) + return -1; + if (e1->offset > end_offset(e2->offset, e2->length)) + return 1; + /* If ranges overlap or are contiguous, they are the same */ + return 0; } -static bool +static void ff_layout_add_ds_error_locked(struct nfs4_flexfile_layout *flo, struct nfs4_ff_layout_ds_err *dserr) { - struct nfs4_ff_layout_ds_err *err; - - list_for_each_entry(err, &flo->error_list, list) { - if (merge_ds_error(err, dserr)) { - return true; - } - } - - list_add(&dserr->list, &flo->error_list); - return false; -} - -static bool -ff_layout_update_ds_error(struct nfs4_flexfile_layout *flo, u64 offset, - u64 length, int status, enum nfs_opnum4 opnum, - nfs4_stateid *stateid, struct nfs4_deviceid *deviceid) -{ - bool found = false; - struct nfs4_ff_layout_ds_err *err; - - list_for_each_entry(err, &flo->error_list, list) { - if (ds_error_can_merge(err, offset, length, status, opnum, - stateid, deviceid)) { - found = true; - extend_ds_error(err, offset, length); + struct nfs4_ff_layout_ds_err *err, *tmp; + struct list_head *head = &flo->error_list; + int match; + + /* Do insertion sort w/ merges */ + list_for_each_entry_safe(err, tmp, &flo->error_list, list) { + match = ff_ds_error_match(err, dserr); + if (match < 0) + continue; + if (match > 0) { + /* Add entry "dserr" _before_ entry "err" */ + head = &err->list; break; } + /* Entries match, so merge "err" into "dserr" */ + extend_ds_error(dserr, err->offset, err->length); + list_del(&err->list); + kfree(err); } - return found; + list_add_tail(&dserr->list, head); } int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, @@ -283,7 +275,6 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, gfp_t gfp_flags) { struct nfs4_ff_layout_ds_err *dserr; - bool needfree; if (status == 0) return 0; @@ -291,14 +282,6 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, if (mirror->mirror_ds == NULL) return -EINVAL; - spin_lock(&flo->generic_hdr.plh_inode->i_lock); - if (ff_layout_update_ds_error(flo, offset, length, status, opnum, - &mirror->stateid, - &mirror->mirror_ds->id_node.deviceid)) { - spin_unlock(&flo->generic_hdr.plh_inode->i_lock); - return 0; - } - spin_unlock(&flo->generic_hdr.plh_inode->i_lock); dserr = kmalloc(sizeof(*dserr), gfp_flags); if (!dserr) return -ENOMEM; @@ -313,10 +296,8 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, NFS4_DEVICEID4_SIZE); spin_lock(&flo->generic_hdr.plh_inode->i_lock); - needfree = ff_layout_add_ds_error_locked(flo, dserr); + ff_layout_add_ds_error_locked(flo, dserr); spin_unlock(&flo->generic_hdr.plh_inode->i_lock); - if (needfree) - kfree(dserr); return 0; } @@ -429,22 +410,14 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, mirror, lseg->pls_range.offset, lseg->pls_range.length, NFS4ERR_NXIO, OP_ILLEGAL, GFP_NOIO); - if (fail_return) { - pnfs_error_mark_layout_for_return(ino, lseg); + if (!fail_return) { if (ff_layout_has_available_ds(lseg)) - pnfs_set_retry_layoutget(lseg->pls_layout); - else - pnfs_clear_retry_layoutget(lseg->pls_layout); - - } else { - if (ff_layout_has_available_ds(lseg)) - set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, + set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lseg->pls_layout->plh_flags); - else { + else pnfs_error_mark_layout_for_return(ino, lseg); - pnfs_clear_retry_layoutget(lseg->pls_layout); - } - } + } else + pnfs_error_mark_layout_for_return(ino, lseg); } out_update_creds: if (ff_layout_update_mirror_cred(mirror, ds)) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 3e2071a17..86faecf8f 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -71,19 +71,25 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr) return nfs_fileid_to_ino_t(fattr->fileid); } -/** - * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks - * @word: long word containing the bit lock - */ -int nfs_wait_bit_killable(struct wait_bit_key *key, int mode) +static int nfs_wait_killable(int mode) { freezable_schedule_unsafe(); if (signal_pending_state(mode, current)) return -ERESTARTSYS; return 0; } + +int nfs_wait_bit_killable(struct wait_bit_key *key, int mode) +{ + return nfs_wait_killable(mode); +} EXPORT_SYMBOL_GPL(nfs_wait_bit_killable); +int nfs_wait_atomic_killable(atomic_t *p) +{ + return nfs_wait_killable(TASK_KILLABLE); +} + /** * nfs_compat_user_ino64 - returns the user-visible inode number * @fileid: 64-bit fileid @@ -408,9 +414,10 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st inode->i_fop = NULL; inode->i_flags |= S_AUTOMOUNT; } - } else if (S_ISLNK(inode->i_mode)) + } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &nfs_symlink_inode_operations; - else + inode_nohighmem(inode); + } else init_special_inode(inode, inode->i_mode, fattr->rdev); memset(&inode->i_atime, 0, sizeof(inode->i_atime)); @@ -654,9 +661,9 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) trace_nfs_getattr_enter(inode); /* Flush out writes to the server in order to update c/mtime. */ if (S_ISREG(inode->i_mode)) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); err = nfs_sync_inode(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (err) goto out; } @@ -699,7 +706,7 @@ static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) l_ctx->lockowner.l_owner = current->files; l_ctx->lockowner.l_pid = current->tgid; INIT_LIST_HEAD(&l_ctx->list); - nfs_iocounter_init(&l_ctx->io_count); + atomic_set(&l_ctx->io_count, 0); } static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx) @@ -912,6 +919,12 @@ void nfs_file_clear_open_context(struct file *filp) if (ctx) { struct inode *inode = d_inode(ctx->dentry); + /* + * We fatal error on write before. Try to writeback + * every page again. + */ + if (ctx->error < 0) + invalidate_inode_pages2(inode->i_mapping); filp->private_data = NULL; spin_lock(&inode->i_lock); list_move_tail(&ctx->list, &NFS_I(inode)->open_files); @@ -1086,6 +1099,27 @@ static bool nfs_mapping_need_revalidate_inode(struct inode *inode) || NFS_STALE(inode); } +int nfs_revalidate_mapping_rcu(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + unsigned long *bitlock = &nfsi->flags; + int ret = 0; + + if (IS_SWAPFILE(inode)) + goto out; + if (nfs_mapping_need_revalidate_inode(inode)) { + ret = -ECHILD; + goto out; + } + spin_lock(&inode->i_lock); + if (test_bit(NFS_INO_INVALIDATING, bitlock) || + (nfsi->cache_validity & NFS_INO_INVALID_DATA)) + ret = -ECHILD; + spin_unlock(&inode->i_lock); +out: + return ret; +} + /** * __nfs_revalidate_mapping - Revalidate the pagecache * @inode - pointer to host inode @@ -1144,9 +1178,9 @@ static int __nfs_revalidate_mapping(struct inode *inode, spin_unlock(&inode->i_lock); trace_nfs_invalidate_mapping_enter(inode); if (may_lock) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = nfs_invalidate_mapping(inode, mapping); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } else ret = nfs_invalidate_mapping(inode, mapping); trace_nfs_invalidate_mapping_exit(inode, ret); @@ -1935,7 +1969,7 @@ static int __init nfs_init_inodecache(void) nfs_inode_cachep = kmem_cache_create("nfs_inode_cache", sizeof(struct nfs_inode), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (nfs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 9dea85f7f..9a547aa3e 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -238,7 +238,7 @@ extern void nfs_pgheader_init(struct nfs_pageio_descriptor *desc, struct nfs_pgio_header *hdr, void (*release)(struct nfs_pgio_header *hdr)); void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos); -int nfs_iocounter_wait(struct nfs_io_counter *c); +int nfs_iocounter_wait(struct nfs_lock_context *l_ctx); extern const struct nfs_pageio_ops nfs_pgio_rw_ops; struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *); @@ -252,18 +252,18 @@ void nfs_free_request(struct nfs_page *req); struct nfs_pgio_mirror * nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc); -static inline void nfs_iocounter_init(struct nfs_io_counter *c) -{ - c->flags = 0; - atomic_set(&c->io_count, 0); -} - static inline bool nfs_pgio_has_mirroring(struct nfs_pageio_descriptor *desc) { WARN_ON_ONCE(desc->pg_mirror_count < 1); return desc->pg_mirror_count > 1; } +static inline bool nfs_match_open_context(const struct nfs_open_context *ctx1, + const struct nfs_open_context *ctx2) +{ + return ctx1->cred == ctx2->cred && ctx1->state == ctx2->state; +} + /* nfs2xdr.c */ extern struct rpc_procinfo nfs_procedures[]; extern int nfs2_decode_dirent(struct xdr_stream *, @@ -380,6 +380,7 @@ extern void nfs_clear_inode(struct inode *); extern void nfs_evict_inode(struct inode *); void nfs_zap_acl_cache(struct inode *inode); extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode); +extern int nfs_wait_atomic_killable(atomic_t *p); /* super.c */ extern const struct super_operations nfs_sops; @@ -483,7 +484,7 @@ void nfs_retry_commit(struct list_head *page_list, struct nfs_commit_info *cinfo, u32 ds_commit_idx); void nfs_commitdata_release(struct nfs_commit_data *data); -void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst, +void nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo); void nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst, @@ -519,7 +520,6 @@ static inline void nfs_inode_dio_wait(struct inode *inode) inode_dio_wait(inode); } extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq); -extern void nfs_direct_set_resched_writes(struct nfs_direct_req *dreq); /* nfs4proc.c */ extern void __nfs4_read_done_cb(struct nfs_pgio_header *); @@ -696,9 +696,32 @@ static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh) { return ~crc32_le(0xFFFFFFFF, &fh->data[0], fh->size); } +static inline u32 nfs_stateid_hash(const nfs4_stateid *stateid) +{ + return ~crc32_le(0xFFFFFFFF, &stateid->other[0], + NFS4_STATEID_OTHER_SIZE); +} #else static inline u32 nfs_fhandle_hash(const struct nfs_fh *fh) { return 0; } +static inline u32 nfs_stateid_hash(nfs4_stateid *stateid) +{ + return 0; +} #endif + +static inline bool nfs_error_is_fatal(int err) +{ + switch (err) { + case -ERESTARTSYS: + case -EIO: + case -ENOSPC: + case -EROFS: + case -E2BIG: + return true; + default: + return false; + } +} diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 1ebe2fc7c..17c0fa1ec 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -284,12 +284,12 @@ nfs3_listxattr(struct dentry *dentry, char *data, size_t size) int error; error = nfs3_list_one_acl(inode, ACL_TYPE_ACCESS, - POSIX_ACL_XATTR_ACCESS, data, size, &result); + XATTR_NAME_POSIX_ACL_ACCESS, data, size, &result); if (error) return error; error = nfs3_list_one_acl(inode, ACL_TYPE_DEFAULT, - POSIX_ACL_XATTR_DEFAULT, data, size, &result); + XATTR_NAME_POSIX_ACL_DEFAULT, data, size, &result); if (error) return error; return result; diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 6b1ce9825..dff83460e 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -16,29 +16,8 @@ #define NFSDBG_FACILITY NFSDBG_PROC -static int nfs42_set_rw_stateid(nfs4_stateid *dst, struct file *file, - fmode_t fmode) -{ - struct nfs_open_context *open; - struct nfs_lock_context *lock; - int ret; - - open = get_nfs_open_context(nfs_file_open_context(file)); - lock = nfs_get_lock_context(open); - if (IS_ERR(lock)) { - put_nfs_open_context(open); - return PTR_ERR(lock); - } - - ret = nfs4_set_rw_stateid(dst, open, lock, fmode); - - nfs_put_lock_context(lock); - put_nfs_open_context(open); - return ret; -} - static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, - loff_t offset, loff_t len) + struct nfs_lock_context *lock, loff_t offset, loff_t len) { struct inode *inode = file_inode(filep); struct nfs_server *server = NFS_SERVER(inode); @@ -56,7 +35,8 @@ static int _nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, msg->rpc_argp = &args; msg->rpc_resp = &res; - status = nfs42_set_rw_stateid(&args.falloc_stateid, filep, FMODE_WRITE); + status = nfs4_set_rw_stateid(&args.falloc_stateid, lock->open_context, + lock, FMODE_WRITE); if (status) return status; @@ -78,15 +58,26 @@ static int nfs42_proc_fallocate(struct rpc_message *msg, struct file *filep, { struct nfs_server *server = NFS_SERVER(file_inode(filep)); struct nfs4_exception exception = { }; + struct nfs_lock_context *lock; int err; + lock = nfs_get_lock_context(nfs_file_open_context(filep)); + if (IS_ERR(lock)) + return PTR_ERR(lock); + + exception.inode = file_inode(filep); + exception.state = lock->open_context->state; + do { - err = _nfs42_proc_fallocate(msg, filep, offset, len); - if (err == -ENOTSUPP) - return -EOPNOTSUPP; + err = _nfs42_proc_fallocate(msg, filep, lock, offset, len); + if (err == -ENOTSUPP) { + err = -EOPNOTSUPP; + break; + } err = nfs4_handle_exception(server, err, &exception); } while (exception.retry); + nfs_put_lock_context(lock); return err; } @@ -101,13 +92,13 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len) if (!nfs_server_capable(inode, NFS_CAP_ALLOCATE)) return -EOPNOTSUPP; - mutex_lock(&inode->i_mutex); + inode_lock(inode); err = nfs42_proc_fallocate(&msg, filep, offset, len); if (err == -EOPNOTSUPP) NFS_SERVER(inode)->caps &= ~NFS_CAP_ALLOCATE; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } @@ -123,7 +114,7 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) return -EOPNOTSUPP; nfs_wb_all(inode); - mutex_lock(&inode->i_mutex); + inode_lock(inode); err = nfs42_proc_fallocate(&msg, filep, offset, len); if (err == 0) @@ -131,11 +122,12 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) if (err == -EOPNOTSUPP) NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } -static loff_t _nfs42_proc_llseek(struct file *filep, loff_t offset, int whence) +static loff_t _nfs42_proc_llseek(struct file *filep, + struct nfs_lock_context *lock, loff_t offset, int whence) { struct inode *inode = file_inode(filep); struct nfs42_seek_args args = { @@ -156,7 +148,8 @@ static loff_t _nfs42_proc_llseek(struct file *filep, loff_t offset, int whence) if (!nfs_server_capable(inode, NFS_CAP_SEEK)) return -ENOTSUPP; - status = nfs42_set_rw_stateid(&args.sa_stateid, filep, FMODE_READ); + status = nfs4_set_rw_stateid(&args.sa_stateid, lock->open_context, + lock, FMODE_READ); if (status) return status; @@ -175,17 +168,28 @@ loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence) { struct nfs_server *server = NFS_SERVER(file_inode(filep)); struct nfs4_exception exception = { }; + struct nfs_lock_context *lock; loff_t err; + lock = nfs_get_lock_context(nfs_file_open_context(filep)); + if (IS_ERR(lock)) + return PTR_ERR(lock); + + exception.inode = file_inode(filep); + exception.state = lock->open_context->state; + do { - err = _nfs42_proc_llseek(filep, offset, whence); + err = _nfs42_proc_llseek(filep, lock, offset, whence); if (err >= 0) break; - if (err == -ENOTSUPP) - return -EOPNOTSUPP; + if (err == -ENOTSUPP) { + err = -EOPNOTSUPP; + break; + } err = nfs4_handle_exception(server, err, &exception); } while (exception.retry); + nfs_put_lock_context(lock); return err; } @@ -204,6 +208,8 @@ static void nfs42_layoutstat_done(struct rpc_task *task, void *calldata) { struct nfs42_layoutstat_data *data = calldata; + struct inode *inode = data->inode; + struct pnfs_layout_hdr *lo; if (!nfs4_sequence_done(task, &data->res.seq_res)) return; @@ -211,12 +217,35 @@ nfs42_layoutstat_done(struct rpc_task *task, void *calldata) switch (task->tk_status) { case 0: break; + case -NFS4ERR_EXPIRED: + case -NFS4ERR_STALE_STATEID: + case -NFS4ERR_OLD_STATEID: + case -NFS4ERR_BAD_STATEID: + spin_lock(&inode->i_lock); + lo = NFS_I(inode)->layout; + if (lo && nfs4_stateid_match(&data->args.stateid, + &lo->plh_stateid)) { + LIST_HEAD(head); + + /* + * Mark the bad layout state as invalid, then retry + * with the current stateid. + */ + set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); + pnfs_mark_matching_lsegs_invalid(lo, &head, NULL); + spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&head); + } else + spin_unlock(&inode->i_lock); + break; case -ENOTSUPP: case -EOPNOTSUPP: - NFS_SERVER(data->inode)->caps &= ~NFS_CAP_LAYOUTSTATS; + NFS_SERVER(inode)->caps &= ~NFS_CAP_LAYOUTSTATS; default: - dprintk("%s server returns %d\n", __func__, task->tk_status); + break; } + + dprintk("%s server returns %d\n", __func__, task->tk_status); } static void @@ -273,8 +302,9 @@ int nfs42_proc_layoutstats_generic(struct nfs_server *server, } static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f, - struct file *dst_f, loff_t src_offset, - loff_t dst_offset, loff_t count) + struct file *dst_f, struct nfs_lock_context *src_lock, + struct nfs_lock_context *dst_lock, loff_t src_offset, + loff_t dst_offset, loff_t count) { struct inode *src_inode = file_inode(src_f); struct inode *dst_inode = file_inode(dst_f); @@ -295,11 +325,13 @@ static int _nfs42_proc_clone(struct rpc_message *msg, struct file *src_f, msg->rpc_argp = &args; msg->rpc_resp = &res; - status = nfs42_set_rw_stateid(&args.src_stateid, src_f, FMODE_READ); + status = nfs4_set_rw_stateid(&args.src_stateid, src_lock->open_context, + src_lock, FMODE_READ); if (status) return status; - status = nfs42_set_rw_stateid(&args.dst_stateid, dst_f, FMODE_WRITE); + status = nfs4_set_rw_stateid(&args.dst_stateid, dst_lock->open_context, + dst_lock, FMODE_WRITE); if (status) return status; @@ -324,22 +356,48 @@ int nfs42_proc_clone(struct file *src_f, struct file *dst_f, }; struct inode *inode = file_inode(src_f); struct nfs_server *server = NFS_SERVER(file_inode(src_f)); - struct nfs4_exception exception = { }; - int err; + struct nfs_lock_context *src_lock; + struct nfs_lock_context *dst_lock; + struct nfs4_exception src_exception = { }; + struct nfs4_exception dst_exception = { }; + int err, err2; if (!nfs_server_capable(inode, NFS_CAP_CLONE)) return -EOPNOTSUPP; + src_lock = nfs_get_lock_context(nfs_file_open_context(src_f)); + if (IS_ERR(src_lock)) + return PTR_ERR(src_lock); + + src_exception.inode = file_inode(src_f); + src_exception.state = src_lock->open_context->state; + + dst_lock = nfs_get_lock_context(nfs_file_open_context(dst_f)); + if (IS_ERR(dst_lock)) { + err = PTR_ERR(dst_lock); + goto out_put_src_lock; + } + + dst_exception.inode = file_inode(dst_f); + dst_exception.state = dst_lock->open_context->state; + do { - err = _nfs42_proc_clone(&msg, src_f, dst_f, src_offset, - dst_offset, count); + err = _nfs42_proc_clone(&msg, src_f, dst_f, src_lock, dst_lock, + src_offset, dst_offset, count); if (err == -ENOTSUPP || err == -EOPNOTSUPP) { NFS_SERVER(inode)->caps &= ~NFS_CAP_CLONE; - return -EOPNOTSUPP; + err = -EOPNOTSUPP; + break; } - err = nfs4_handle_exception(server, err, &exception); - } while (exception.retry); - return err; + err2 = nfs4_handle_exception(server, err, &src_exception); + err = nfs4_handle_exception(server, err, &dst_exception); + if (!err) + err = err2; + } while (src_exception.retry || dst_exception.retry); + nfs_put_lock_context(dst_lock); +out_put_src_lock: + nfs_put_lock_context(src_lock); + return err; } diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index db9b5fea5..57ca1c803 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -141,11 +141,11 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret != 0) break; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = nfs_file_fsync_commit(file, start, end, datasync); if (!ret) ret = pnfs_sync_inode(inode, !!datasync); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* * If nfs_file_fsync_commit detected a server reboot, then * resend all dirty pages that might have been covered by @@ -195,75 +195,37 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t return nfs42_proc_allocate(filep, offset, len); } -static noinline long -nfs42_ioctl_clone(struct file *dst_file, unsigned long srcfd, - u64 src_off, u64 dst_off, u64 count) +static int nfs42_clone_file_range(struct file *src_file, loff_t src_off, + struct file *dst_file, loff_t dst_off, u64 count) { struct inode *dst_inode = file_inode(dst_file); struct nfs_server *server = NFS_SERVER(dst_inode); - struct fd src_file; - struct inode *src_inode; + struct inode *src_inode = file_inode(src_file); unsigned int bs = server->clone_blksize; bool same_inode = false; int ret; - /* dst file must be opened for writing */ - if (!(dst_file->f_mode & FMODE_WRITE)) - return -EINVAL; - - ret = mnt_want_write_file(dst_file); - if (ret) - return ret; - - src_file = fdget(srcfd); - if (!src_file.file) { - ret = -EBADF; - goto out_drop_write; - } - - src_inode = file_inode(src_file.file); - - if (src_inode == dst_inode) - same_inode = true; - - /* src file must be opened for reading */ - if (!(src_file.file->f_mode & FMODE_READ)) - goto out_fput; - - /* src and dst must be regular files */ - ret = -EISDIR; - if (!S_ISREG(src_inode->i_mode) || !S_ISREG(dst_inode->i_mode)) - goto out_fput; - - ret = -EXDEV; - if (src_file.file->f_path.mnt != dst_file->f_path.mnt || - src_inode->i_sb != dst_inode->i_sb) - goto out_fput; - /* check alignment w.r.t. clone_blksize */ ret = -EINVAL; if (bs) { if (!IS_ALIGNED(src_off, bs) || !IS_ALIGNED(dst_off, bs)) - goto out_fput; + goto out; if (!IS_ALIGNED(count, bs) && i_size_read(src_inode) != (src_off + count)) - goto out_fput; + goto out; } - /* verify if ranges are overlapped within the same file */ - if (same_inode) { - if (dst_off + count > src_off && dst_off < src_off + count) - goto out_fput; - } + if (src_inode == dst_inode) + same_inode = true; /* XXX: do we lock at all? what if server needs CB_RECALL_LAYOUT? */ if (same_inode) { - mutex_lock(&src_inode->i_mutex); + inode_lock(src_inode); } else if (dst_inode < src_inode) { - mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(dst_inode, I_MUTEX_PARENT); + inode_lock_nested(src_inode, I_MUTEX_CHILD); } else { - mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(src_inode, I_MUTEX_PARENT); + inode_lock_nested(dst_inode, I_MUTEX_CHILD); } /* flush all pending writes on both src and dst so that server @@ -275,7 +237,7 @@ nfs42_ioctl_clone(struct file *dst_file, unsigned long srcfd, if (ret) goto out_unlock; - ret = nfs42_proc_clone(src_file.file, dst_file, src_off, dst_off, count); + ret = nfs42_proc_clone(src_file, dst_file, src_off, dst_off, count); /* truncate inode page cache of the dst range so that future reads can fetch * new data from server */ @@ -284,45 +246,17 @@ nfs42_ioctl_clone(struct file *dst_file, unsigned long srcfd, out_unlock: if (same_inode) { - mutex_unlock(&src_inode->i_mutex); + inode_unlock(src_inode); } else if (dst_inode < src_inode) { - mutex_unlock(&src_inode->i_mutex); - mutex_unlock(&dst_inode->i_mutex); + inode_unlock(src_inode); + inode_unlock(dst_inode); } else { - mutex_unlock(&dst_inode->i_mutex); - mutex_unlock(&src_inode->i_mutex); + inode_unlock(dst_inode); + inode_unlock(src_inode); } -out_fput: - fdput(src_file); -out_drop_write: - mnt_drop_write_file(dst_file); +out: return ret; } - -static long nfs42_ioctl_clone_range(struct file *dst_file, void __user *argp) -{ - struct btrfs_ioctl_clone_range_args args; - - if (copy_from_user(&args, argp, sizeof(args))) - return -EFAULT; - - return nfs42_ioctl_clone(dst_file, args.src_fd, args.src_offset, - args.dest_offset, args.src_length); -} - -long nfs4_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - void __user *argp = (void __user *)arg; - - switch (cmd) { - case BTRFS_IOC_CLONE: - return nfs42_ioctl_clone(file, arg, 0, 0, 0); - case BTRFS_IOC_CLONE_RANGE: - return nfs42_ioctl_clone_range(file, argp); - } - - return -ENOTTY; -} #endif /* CONFIG_NFS_V4_2 */ const struct file_operations nfs4_file_operations = { @@ -342,8 +276,7 @@ const struct file_operations nfs4_file_operations = { #ifdef CONFIG_NFS_V4_2 .llseek = nfs4_file_llseek, .fallocate = nfs42_fallocate, - .unlocked_ioctl = nfs4_ioctl, - .compat_ioctl = nfs4_ioctl, + .clone_file_range = nfs42_clone_file_range, #else .llseek = nfs_file_llseek, #endif diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 98a441573..14881594d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -208,6 +208,9 @@ static const u32 nfs4_pnfs_open_bitmap[3] = { | FATTR4_WORD1_TIME_METADATA | FATTR4_WORD1_TIME_MODIFY, FATTR4_WORD2_MDSTHRESHOLD +#ifdef CONFIG_NFS_V4_SECURITY_LABEL + | FATTR4_WORD2_SECURITY_LABEL +#endif }; static const u32 nfs4_open_noattr_bitmap[3] = { @@ -1598,6 +1601,7 @@ _nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data) if (!data->rpc_done) { state = nfs4_try_open_cached(data); + trace_nfs4_cached_open(data->state); goto out; } @@ -2015,6 +2019,7 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) } return; unlock_no_action: + trace_nfs4_cached_open(data->state); rcu_read_unlock(); out_no_action: task->tk_action = NULL; @@ -2703,6 +2708,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); if (status == 0 && state != NULL) renew_lease(server, timestamp); + trace_nfs4_setattr(inode, &arg.stateid, status); return status; } @@ -2719,7 +2725,6 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, int err; do { err = _nfs4_do_setattr(inode, cred, fattr, sattr, state, ilabel, olabel); - trace_nfs4_setattr(inode, err); switch (err) { case -NFS4ERR_OPENMODE: if (!(sattr->ia_valid & ATTR_SIZE)) { @@ -5048,7 +5053,6 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp, static int nfs4_init_nonuniform_client_string(struct nfs_client *clp) { - int result; size_t len; char *str; @@ -5076,7 +5080,7 @@ nfs4_init_nonuniform_client_string(struct nfs_client *clp) return -ENOMEM; rcu_read_lock(); - result = scnprintf(str, len, "Linux NFSv4.0 %s/%s %s", + scnprintf(str, len, "Linux NFSv4.0 %s/%s %s", clp->cl_ipaddr, rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_ADDR), rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO)); @@ -5089,7 +5093,6 @@ nfs4_init_nonuniform_client_string(struct nfs_client *clp) static int nfs4_init_uniquifier_client_string(struct nfs_client *clp) { - int result; size_t len; char *str; @@ -5109,7 +5112,7 @@ nfs4_init_uniquifier_client_string(struct nfs_client *clp) if (!str) return -ENOMEM; - result = scnprintf(str, len, "Linux NFSv%u.%u %s/%s", + scnprintf(str, len, "Linux NFSv%u.%u %s/%s", clp->rpc_ops->version, clp->cl_minorversion, nfs4_client_id_uniquifier, clp->cl_rpcclient->cl_nodename); @@ -5120,7 +5123,6 @@ nfs4_init_uniquifier_client_string(struct nfs_client *clp) static int nfs4_init_uniform_client_string(struct nfs_client *clp) { - int result; size_t len; char *str; @@ -5145,7 +5147,7 @@ nfs4_init_uniform_client_string(struct nfs_client *clp) if (!str) return -ENOMEM; - result = scnprintf(str, len, "Linux NFSv%u.%u %s", + scnprintf(str, len, "Linux NFSv%u.%u %s", clp->rpc_ops->version, clp->cl_minorversion, clp->cl_rpcclient->cl_nodename); clp->cl_owner_id = str; @@ -5384,6 +5386,11 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co if (data == NULL) return -ENOMEM; nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1); + + nfs4_state_protect(server->nfs_client, + NFS_SP4_MACH_CRED_CLEANUP, + &task_setup_data.rpc_client, &msg); + data->args.fhandle = &data->fh; data->args.stateid = &data->stateid; data->args.bitmask = server->cache_consistency_bitmask; @@ -5426,7 +5433,7 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4 int err; do { err = _nfs4_proc_delegreturn(inode, cred, stateid, issync); - trace_nfs4_delegreturn(inode, err); + trace_nfs4_delegreturn(inode, stateid, err); switch (err) { case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: @@ -5936,6 +5943,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f data->cancelled = 1; rpc_put_task(task); dprintk("%s: done, ret = %d!\n", __func__, ret); + trace_nfs4_set_lock(fl, state, &data->res.stateid, cmd, ret); return ret; } @@ -5952,7 +5960,6 @@ static int nfs4_lock_reclaim(struct nfs4_state *state, struct file_lock *request if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) return 0; err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_RECLAIM); - trace_nfs4_lock_reclaim(request, state, F_SETLK, err); if (err != -NFS4ERR_DELAY) break; nfs4_handle_exception(server, err, &exception); @@ -5979,7 +5986,6 @@ static int nfs4_lock_expired(struct nfs4_state *state, struct file_lock *request if (test_bit(NFS_DELEGATED_STATE, &state->flags) != 0) return 0; err = _nfs4_do_setlk(state, F_SETLK, request, NFS_LOCK_EXPIRED); - trace_nfs4_lock_expired(request, state, F_SETLK, err); switch (err) { default: goto out; @@ -6087,7 +6093,6 @@ static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock * do { err = _nfs4_proc_setlk(state, cmd, request); - trace_nfs4_set_lock(request, state, cmd, err); if (err == -NFS4ERR_DENIED) err = -EAGAIN; err = nfs4_handle_exception(NFS_SERVER(state->inode), @@ -6253,9 +6258,6 @@ static int nfs4_xattr_set_nfs4_acl(const struct xattr_handler *handler, const void *buf, size_t buflen, int flags) { - if (strcmp(key, "") != 0) - return -EINVAL; - return nfs4_proc_set_acl(d_inode(dentry), buf, buflen); } @@ -6263,32 +6265,15 @@ static int nfs4_xattr_get_nfs4_acl(const struct xattr_handler *handler, struct dentry *dentry, const char *key, void *buf, size_t buflen) { - if (strcmp(key, "") != 0) - return -EINVAL; - return nfs4_proc_get_acl(d_inode(dentry), buf, buflen); } -static size_t nfs4_xattr_list_nfs4_acl(const struct xattr_handler *handler, - struct dentry *dentry, char *list, - size_t list_len, const char *name, - size_t name_len) +static bool nfs4_xattr_list_nfs4_acl(struct dentry *dentry) { - size_t len = sizeof(XATTR_NAME_NFSV4_ACL); - - if (!nfs4_server_supports_acls(NFS_SERVER(d_inode(dentry)))) - return 0; - - if (list && len <= list_len) - memcpy(list, XATTR_NAME_NFSV4_ACL, len); - return len; + return nfs4_server_supports_acls(NFS_SERVER(d_inode(dentry))); } #ifdef CONFIG_NFS_V4_SECURITY_LABEL -static inline int nfs4_server_supports_labels(struct nfs_server *server) -{ - return server->caps & NFS_CAP_SECURITY_LABEL; -} static int nfs4_xattr_set_nfs4_label(const struct xattr_handler *handler, struct dentry *dentry, const char *key, @@ -6310,29 +6295,34 @@ static int nfs4_xattr_get_nfs4_label(const struct xattr_handler *handler, return -EOPNOTSUPP; } -static size_t nfs4_xattr_list_nfs4_label(const struct xattr_handler *handler, - struct dentry *dentry, char *list, - size_t list_len, const char *name, - size_t name_len) +static ssize_t +nfs4_listxattr_nfs4_label(struct inode *inode, char *list, size_t list_len) { - size_t len = 0; + int len = 0; - if (nfs_server_capable(d_inode(dentry), NFS_CAP_SECURITY_LABEL)) { - len = security_inode_listsecurity(d_inode(dentry), NULL, 0); - if (list && len <= list_len) - security_inode_listsecurity(d_inode(dentry), list, len); + if (nfs_server_capable(inode, NFS_CAP_SECURITY_LABEL)) { + len = security_inode_listsecurity(inode, list, list_len); + if (list_len && len > list_len) + return -ERANGE; } return len; } static const struct xattr_handler nfs4_xattr_nfs4_label_handler = { .prefix = XATTR_SECURITY_PREFIX, - .list = nfs4_xattr_list_nfs4_label, .get = nfs4_xattr_get_nfs4_label, .set = nfs4_xattr_set_nfs4_label, }; -#endif +#else + +static ssize_t +nfs4_listxattr_nfs4_label(struct inode *inode, char *list, size_t list_len) +{ + return 0; +} + +#endif /* * nfs_fhget will use either the mounted_on_fileid or the fileid @@ -6862,10 +6852,13 @@ static const struct nfs41_state_protection nfs4_sp4_mach_cred_request = { }, .allow.u.words = { [0] = 1 << (OP_CLOSE) | + 1 << (OP_OPEN_DOWNGRADE) | 1 << (OP_LOCKU) | + 1 << (OP_DELEGRETURN) | 1 << (OP_COMMIT), [1] = 1 << (OP_SECINFO - 32) | 1 << (OP_SECINFO_NO_NAME - 32) | + 1 << (OP_LAYOUTRETURN - 32) | 1 << (OP_TEST_STATEID - 32) | 1 << (OP_FREE_STATEID - 32) | 1 << (OP_WRITE - 32) @@ -6930,11 +6923,19 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp, } if (test_bit(OP_CLOSE, sp->allow.u.longs) && + test_bit(OP_OPEN_DOWNGRADE, sp->allow.u.longs) && + test_bit(OP_DELEGRETURN, sp->allow.u.longs) && test_bit(OP_LOCKU, sp->allow.u.longs)) { dfprintk(MOUNT, " cleanup mode enabled\n"); set_bit(NFS_SP4_MACH_CRED_CLEANUP, &clp->cl_sp4_flags); } + if (test_bit(OP_LAYOUTRETURN, sp->allow.u.longs)) { + dfprintk(MOUNT, " pnfs cleanup mode enabled\n"); + set_bit(NFS_SP4_MACH_CRED_PNFS_CLEANUP, + &clp->cl_sp4_flags); + } + if (test_bit(OP_SECINFO, sp->allow.u.longs) && test_bit(OP_SECINFO_NO_NAME, sp->allow.u.longs)) { dfprintk(MOUNT, " secinfo mode enabled\n"); @@ -7763,6 +7764,7 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) struct nfs4_layoutget *lgp = calldata; struct nfs_server *server = NFS_SERVER(lgp->args.inode); struct nfs4_session *session = nfs4_get_session(server); + int ret; dprintk("--> %s\n", __func__); /* Note the is a race here, where a CB_LAYOUTRECALL can come in @@ -7773,12 +7775,12 @@ nfs4_layoutget_prepare(struct rpc_task *task, void *calldata) if (nfs41_setup_sequence(session, &lgp->args.seq_args, &lgp->res.seq_res, task)) return; - if (pnfs_choose_layoutget_stateid(&lgp->args.stateid, + ret = pnfs_choose_layoutget_stateid(&lgp->args.stateid, NFS_I(lgp->args.inode)->layout, &lgp->args.range, - lgp->args.ctx->state)) { - rpc_exit(task, NFS4_OK); - } + lgp->args.ctx->state); + if (ret < 0) + rpc_exit(task, ret); } static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) @@ -7798,6 +7800,15 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) switch (task->tk_status) { case 0: goto out; + + /* + * NFS4ERR_LAYOUTUNAVAILABLE means we are not supposed to use pnfs + * on the file. set tk_status to -ENODATA to tell upper layer to + * retry go inband. + */ + case -NFS4ERR_LAYOUTUNAVAILABLE: + task->tk_status = -ENODATA; + goto out; /* * NFS4ERR_BADLAYOUT means the MDS cannot return a layout of * length lgp->args.minlength != 0 (see RFC5661 section 18.43.3). @@ -7994,6 +8005,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, gfp_t gfp_flags) trace_nfs4_layoutget(lgp->args.ctx, &lgp->args.range, &lgp->res.range, + &lgp->res.stateid, status); /* if layoutp->len is 0, nfs4_layoutget_prepare called rpc_exit */ if (status == 0 && lgp->res.layoutp->len) @@ -8050,9 +8062,10 @@ static void nfs4_layoutreturn_release(void *calldata) dprintk("--> %s\n", __func__); spin_lock(&lo->plh_inode->i_lock); + pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range); + pnfs_mark_layout_returned_if_empty(lo); if (lrp->res.lrs_present) pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); - pnfs_mark_matching_lsegs_invalid(lo, &freeme, &lrp->args.range); pnfs_clear_layoutreturn_waitbit(lo); spin_unlock(&lo->plh_inode->i_lock); pnfs_free_lseg_list(&freeme); @@ -8085,6 +8098,10 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync) }; int status = 0; + nfs4_state_protect(NFS_SERVER(lrp->args.inode)->nfs_client, + NFS_SP4_MACH_CRED_PNFS_CLEANUP, + &task_setup_data.rpc_client, &msg); + dprintk("--> %s\n", __func__); if (!sync) { lrp->inode = nfs_igrab_and_active(lrp->args.inode); @@ -8100,7 +8117,7 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp, bool sync) return PTR_ERR(task); if (sync) status = task->tk_status; - trace_nfs4_layoutreturn(lrp->args.inode, status); + trace_nfs4_layoutreturn(lrp->args.inode, &lrp->args.stateid, status); dprintk("<-- %s status=%d\n", __func__, status); rpc_put_task(task); return status; @@ -8248,7 +8265,7 @@ nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data, bool sync) return PTR_ERR(task); if (sync) status = task->tk_status; - trace_nfs4_layoutcommit(data->args.inode, status); + trace_nfs4_layoutcommit(data->args.inode, &data->args.stateid, status); dprintk("%s: status %d\n", __func__, status); rpc_put_task(task); return status; @@ -8748,6 +8765,24 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = { #endif }; +ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size) +{ + ssize_t error, error2; + + error = generic_listxattr(dentry, list, size); + if (error < 0) + return error; + if (list) { + list += error; + size -= error; + } + + error2 = nfs4_listxattr_nfs4_label(d_inode(dentry), list, size); + if (error2 < 0) + return error2; + return error + error2; +} + static const struct inode_operations nfs4_dir_inode_operations = { .create = nfs_create, .lookup = nfs_lookup, @@ -8764,7 +8799,7 @@ static const struct inode_operations nfs4_dir_inode_operations = { .setattr = nfs_setattr, .getxattr = generic_getxattr, .setxattr = generic_setxattr, - .listxattr = generic_listxattr, + .listxattr = nfs4_listxattr, .removexattr = generic_removexattr, }; @@ -8774,7 +8809,7 @@ static const struct inode_operations nfs4_file_inode_operations = { .setattr = nfs_setattr, .getxattr = generic_getxattr, .setxattr = generic_setxattr, - .listxattr = generic_listxattr, + .listxattr = nfs4_listxattr, .removexattr = generic_removexattr, }; @@ -8833,7 +8868,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = { }; static const struct xattr_handler nfs4_xattr_nfs4_acl_handler = { - .prefix = XATTR_NAME_NFSV4_ACL, + .name = XATTR_NAME_NFSV4_ACL, .list = nfs4_xattr_list_nfs4_acl, .get = nfs4_xattr_get_nfs4_acl, .set = nfs4_xattr_set_nfs4_acl, diff --git a/fs/nfs/nfs4sysctl.c b/fs/nfs/nfs4sysctl.c index 0fbd3ab1b..8693d77c4 100644 --- a/fs/nfs/nfs4sysctl.c +++ b/fs/nfs/nfs4sysctl.c @@ -12,7 +12,7 @@ #include "nfs4idmap.h" #include "callback.h" -static const int nfs_set_port_min = 0; +static const int nfs_set_port_min; static const int nfs_set_port_max = 65535; static struct ctl_table_header *nfs4_callback_sysctl_table; diff --git a/fs/nfs/nfs4trace.c b/fs/nfs/nfs4trace.c index d774335cc..2850bce19 100644 --- a/fs/nfs/nfs4trace.c +++ b/fs/nfs/nfs4trace.c @@ -6,6 +6,7 @@ #include "internal.h" #include "nfs4session.h" #include "callback.h" +#include "pnfs.h" #define CREATE_TRACE_POINTS #include "nfs4trace.h" diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h index 671cf68fe..2c8d05dae 100644 --- a/fs/nfs/nfs4trace.h +++ b/fs/nfs/nfs4trace.h @@ -321,6 +321,7 @@ TRACE_EVENT(nfs4_sequence_done, __entry->highest_slotid = res->sr_highest_slotid; __entry->target_highest_slotid = res->sr_target_highest_slotid; + __entry->status_flags = res->sr_status_flags; __entry->error = res->sr_status; ), TP_printk( @@ -399,6 +400,10 @@ DECLARE_EVENT_CLASS(nfs4_open_event, __field(u64, fileid) __field(u64, dir) __string(name, ctx->dentry->d_name.name) + __field(int, stateid_seq) + __field(u32, stateid_hash) + __field(int, openstateid_seq) + __field(u32, openstateid_hash) ), TP_fast_assign( @@ -409,8 +414,22 @@ DECLARE_EVENT_CLASS(nfs4_open_event, __entry->flags = flags; __entry->fmode = (__force unsigned int)ctx->mode; __entry->dev = ctx->dentry->d_sb->s_dev; - if (!IS_ERR_OR_NULL(state)) + if (!IS_ERR_OR_NULL(state)) { inode = state->inode; + __entry->stateid_seq = + be32_to_cpu(state->stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&state->stateid); + __entry->openstateid_seq = + be32_to_cpu(state->open_stateid.seqid); + __entry->openstateid_hash = + nfs_stateid_hash(&state->open_stateid); + } else { + __entry->stateid_seq = 0; + __entry->stateid_hash = 0; + __entry->openstateid_seq = 0; + __entry->openstateid_hash = 0; + } if (inode != NULL) { __entry->fileid = NFS_FILEID(inode); __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); @@ -425,7 +444,8 @@ DECLARE_EVENT_CLASS(nfs4_open_event, TP_printk( "error=%d (%s) flags=%d (%s) fmode=%s " "fileid=%02x:%02x:%llu fhandle=0x%08x " - "name=%02x:%02x:%llu/%s", + "name=%02x:%02x:%llu/%s stateid=%d:0x%08x " + "openstateid=%d:0x%08x", __entry->error, show_nfsv4_errors(__entry->error), __entry->flags, @@ -436,7 +456,9 @@ DECLARE_EVENT_CLASS(nfs4_open_event, __entry->fhandle, MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->dir, - __get_str(name) + __get_str(name), + __entry->stateid_seq, __entry->stateid_hash, + __entry->openstateid_seq, __entry->openstateid_hash ) ); @@ -452,6 +474,45 @@ DEFINE_NFS4_OPEN_EVENT(nfs4_open_reclaim); DEFINE_NFS4_OPEN_EVENT(nfs4_open_expired); DEFINE_NFS4_OPEN_EVENT(nfs4_open_file); +TRACE_EVENT(nfs4_cached_open, + TP_PROTO( + const struct nfs4_state *state + ), + TP_ARGS(state), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(unsigned int, fmode) + __field(int, stateid_seq) + __field(u32, stateid_hash) + ), + + TP_fast_assign( + const struct inode *inode = state->inode; + + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->fmode = (__force unsigned int)state->state; + __entry->stateid_seq = + be32_to_cpu(state->stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&state->stateid); + ), + + TP_printk( + "fmode=%s fileid=%02x:%02x:%llu " + "fhandle=0x%08x stateid=%d:0x%08x", + __entry->fmode ? show_fmode_flags(__entry->fmode) : + "closed", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + __entry->stateid_seq, __entry->stateid_hash + ) +); + TRACE_EVENT(nfs4_close, TP_PROTO( const struct nfs4_state *state, @@ -468,6 +529,8 @@ TRACE_EVENT(nfs4_close, __field(u64, fileid) __field(unsigned int, fmode) __field(int, error) + __field(int, stateid_seq) + __field(u32, stateid_hash) ), TP_fast_assign( @@ -478,18 +541,23 @@ TRACE_EVENT(nfs4_close, __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); __entry->fmode = (__force unsigned int)state->state; __entry->error = error; + __entry->stateid_seq = + be32_to_cpu(args->stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&args->stateid); ), TP_printk( "error=%d (%s) fmode=%s fileid=%02x:%02x:%llu " - "fhandle=0x%08x", + "fhandle=0x%08x openstateid=%d:0x%08x", __entry->error, show_nfsv4_errors(__entry->error), __entry->fmode ? show_fmode_flags(__entry->fmode) : "closed", MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, - __entry->fhandle + __entry->fhandle, + __entry->stateid_seq, __entry->stateid_hash ) ); @@ -523,6 +591,8 @@ DECLARE_EVENT_CLASS(nfs4_lock_event, __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) + __field(int, stateid_seq) + __field(u32, stateid_hash) ), TP_fast_assign( @@ -536,11 +606,16 @@ DECLARE_EVENT_CLASS(nfs4_lock_event, __entry->dev = inode->i_sb->s_dev; __entry->fileid = NFS_FILEID(inode); __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->stateid_seq = + be32_to_cpu(state->stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&state->stateid); ), TP_printk( "error=%d (%s) cmd=%s:%s range=%lld:%lld " - "fileid=%02x:%02x:%llu fhandle=0x%08x", + "fileid=%02x:%02x:%llu fhandle=0x%08x " + "stateid=%d:0x%08x", __entry->error, show_nfsv4_errors(__entry->error), show_lock_cmd(__entry->cmd), @@ -549,7 +624,8 @@ DECLARE_EVENT_CLASS(nfs4_lock_event, (long long)__entry->end, MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, - __entry->fhandle + __entry->fhandle, + __entry->stateid_seq, __entry->stateid_hash ) ); @@ -563,11 +639,73 @@ DECLARE_EVENT_CLASS(nfs4_lock_event, ), \ TP_ARGS(request, state, cmd, error)) DEFINE_NFS4_LOCK_EVENT(nfs4_get_lock); -DEFINE_NFS4_LOCK_EVENT(nfs4_set_lock); -DEFINE_NFS4_LOCK_EVENT(nfs4_lock_reclaim); -DEFINE_NFS4_LOCK_EVENT(nfs4_lock_expired); DEFINE_NFS4_LOCK_EVENT(nfs4_unlock); +TRACE_EVENT(nfs4_set_lock, + TP_PROTO( + const struct file_lock *request, + const struct nfs4_state *state, + const nfs4_stateid *lockstateid, + int cmd, + int error + ), + + TP_ARGS(request, state, lockstateid, cmd, error), + + TP_STRUCT__entry( + __field(int, error) + __field(int, cmd) + __field(char, type) + __field(loff_t, start) + __field(loff_t, end) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(int, stateid_seq) + __field(u32, stateid_hash) + __field(int, lockstateid_seq) + __field(u32, lockstateid_hash) + ), + + TP_fast_assign( + const struct inode *inode = state->inode; + + __entry->error = error; + __entry->cmd = cmd; + __entry->type = request->fl_type; + __entry->start = request->fl_start; + __entry->end = request->fl_end; + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->stateid_seq = + be32_to_cpu(state->stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&state->stateid); + __entry->lockstateid_seq = + be32_to_cpu(lockstateid->seqid); + __entry->lockstateid_hash = + nfs_stateid_hash(lockstateid); + ), + + TP_printk( + "error=%d (%s) cmd=%s:%s range=%lld:%lld " + "fileid=%02x:%02x:%llu fhandle=0x%08x " + "stateid=%d:0x%08x lockstateid=%d:0x%08x", + __entry->error, + show_nfsv4_errors(__entry->error), + show_lock_cmd(__entry->cmd), + show_lock_type(__entry->type), + (long long)__entry->start, + (long long)__entry->end, + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + __entry->stateid_seq, __entry->stateid_hash, + __entry->lockstateid_seq, __entry->lockstateid_hash + ) +); + DECLARE_EVENT_CLASS(nfs4_set_delegation_event, TP_PROTO( const struct inode *inode, @@ -621,20 +759,28 @@ TRACE_EVENT(nfs4_delegreturn_exit, __field(dev_t, dev) __field(u32, fhandle) __field(int, error) + __field(int, stateid_seq) + __field(u32, stateid_hash) ), TP_fast_assign( __entry->dev = res->server->s_dev; __entry->fhandle = nfs_fhandle_hash(args->fhandle); __entry->error = error; + __entry->stateid_seq = + be32_to_cpu(args->stateid->seqid); + __entry->stateid_hash = + nfs_stateid_hash(args->stateid); ), TP_printk( - "error=%d (%s) dev=%02x:%02x fhandle=0x%08x", + "error=%d (%s) dev=%02x:%02x fhandle=0x%08x " + "stateid=%d:0x%08x", __entry->error, show_nfsv4_errors(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->fhandle + __entry->fhandle, + __entry->stateid_seq, __entry->stateid_hash ) ); @@ -653,6 +799,8 @@ DECLARE_EVENT_CLASS(nfs4_test_stateid_event, __field(dev_t, dev) __field(u32, fhandle) __field(u64, fileid) + __field(int, stateid_seq) + __field(u32, stateid_hash) ), TP_fast_assign( @@ -662,15 +810,21 @@ DECLARE_EVENT_CLASS(nfs4_test_stateid_event, __entry->dev = inode->i_sb->s_dev; __entry->fileid = NFS_FILEID(inode); __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->stateid_seq = + be32_to_cpu(state->stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&state->stateid); ), TP_printk( - "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x", + "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "stateid=%d:0x%08x", __entry->error, show_nfsv4_errors(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, - __entry->fhandle + __entry->fhandle, + __entry->stateid_seq, __entry->stateid_hash ) ); @@ -820,7 +974,6 @@ DECLARE_EVENT_CLASS(nfs4_inode_event, ), \ TP_ARGS(inode, error)) -DEFINE_NFS4_INODE_EVENT(nfs4_setattr); DEFINE_NFS4_INODE_EVENT(nfs4_access); DEFINE_NFS4_INODE_EVENT(nfs4_readlink); DEFINE_NFS4_INODE_EVENT(nfs4_readdir); @@ -830,8 +983,59 @@ DEFINE_NFS4_INODE_EVENT(nfs4_set_acl); DEFINE_NFS4_INODE_EVENT(nfs4_get_security_label); DEFINE_NFS4_INODE_EVENT(nfs4_set_security_label); #endif /* CONFIG_NFS_V4_SECURITY_LABEL */ -DEFINE_NFS4_INODE_EVENT(nfs4_recall_delegation); -DEFINE_NFS4_INODE_EVENT(nfs4_delegreturn); + +DECLARE_EVENT_CLASS(nfs4_inode_stateid_event, + TP_PROTO( + const struct inode *inode, + const nfs4_stateid *stateid, + int error + ), + + TP_ARGS(inode, stateid, error), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(int, error) + __field(int, stateid_seq) + __field(u32, stateid_hash) + ), + + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->error = error; + __entry->stateid_seq = + be32_to_cpu(stateid->seqid); + __entry->stateid_hash = + nfs_stateid_hash(stateid); + ), + + TP_printk( + "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "stateid=%d:0x%08x", + __entry->error, + show_nfsv4_errors(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + __entry->stateid_seq, __entry->stateid_hash + ) +); + +#define DEFINE_NFS4_INODE_STATEID_EVENT(name) \ + DEFINE_EVENT(nfs4_inode_stateid_event, name, \ + TP_PROTO( \ + const struct inode *inode, \ + const nfs4_stateid *stateid, \ + int error \ + ), \ + TP_ARGS(inode, stateid, error)) + +DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_setattr); +DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_delegreturn); DECLARE_EVENT_CLASS(nfs4_getattr_event, TP_PROTO( @@ -941,8 +1145,74 @@ DECLARE_EVENT_CLASS(nfs4_inode_callback_event, ), \ TP_ARGS(clp, fhandle, inode, error)) DEFINE_NFS4_INODE_CALLBACK_EVENT(nfs4_cb_getattr); -DEFINE_NFS4_INODE_CALLBACK_EVENT(nfs4_cb_layoutrecall_inode); +DECLARE_EVENT_CLASS(nfs4_inode_stateid_callback_event, + TP_PROTO( + const struct nfs_client *clp, + const struct nfs_fh *fhandle, + const struct inode *inode, + const nfs4_stateid *stateid, + int error + ), + + TP_ARGS(clp, fhandle, inode, stateid, error), + + TP_STRUCT__entry( + __field(int, error) + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __string(dstaddr, clp ? + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_ADDR) : "unknown") + __field(int, stateid_seq) + __field(u32, stateid_hash) + ), + + TP_fast_assign( + __entry->error = error; + __entry->fhandle = nfs_fhandle_hash(fhandle); + if (inode != NULL) { + __entry->fileid = NFS_FILEID(inode); + __entry->dev = inode->i_sb->s_dev; + } else { + __entry->fileid = 0; + __entry->dev = 0; + } + __assign_str(dstaddr, clp ? + rpc_peeraddr2str(clp->cl_rpcclient, + RPC_DISPLAY_ADDR) : "unknown") + __entry->stateid_seq = + be32_to_cpu(stateid->seqid); + __entry->stateid_hash = + nfs_stateid_hash(stateid); + ), + + TP_printk( + "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " + "stateid=%d:0x%08x dstaddr=%s", + __entry->error, + show_nfsv4_errors(__entry->error), + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + __entry->stateid_seq, __entry->stateid_hash, + __get_str(dstaddr) + ) +); + +#define DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(name) \ + DEFINE_EVENT(nfs4_inode_stateid_callback_event, name, \ + TP_PROTO( \ + const struct nfs_client *clp, \ + const struct nfs_fh *fhandle, \ + const struct inode *inode, \ + const nfs4_stateid *stateid, \ + int error \ + ), \ + TP_ARGS(clp, fhandle, inode, stateid, error)) +DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(nfs4_cb_recall); +DEFINE_NFS4_INODE_STATEID_CALLBACK_EVENT(nfs4_cb_layoutrecall_file); DECLARE_EVENT_CLASS(nfs4_idmap_event, TP_PROTO( @@ -1005,28 +1275,37 @@ DECLARE_EVENT_CLASS(nfs4_read_event, __field(loff_t, offset) __field(size_t, count) __field(int, error) + __field(int, stateid_seq) + __field(u32, stateid_hash) ), TP_fast_assign( const struct inode *inode = hdr->inode; + const struct nfs4_state *state = + hdr->args.context->state; __entry->dev = inode->i_sb->s_dev; __entry->fileid = NFS_FILEID(inode); __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); __entry->offset = hdr->args.offset; __entry->count = hdr->args.count; __entry->error = error; + __entry->stateid_seq = + be32_to_cpu(state->stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&state->stateid); ), TP_printk( "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%lld count=%zu", + "offset=%lld count=%zu stateid=%d:0x%08x", __entry->error, show_nfsv4_errors(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, (long long)__entry->offset, - __entry->count + __entry->count, + __entry->stateid_seq, __entry->stateid_hash ) ); #define DEFINE_NFS4_READ_EVENT(name) \ @@ -1056,28 +1335,37 @@ DECLARE_EVENT_CLASS(nfs4_write_event, __field(loff_t, offset) __field(size_t, count) __field(int, error) + __field(int, stateid_seq) + __field(u32, stateid_hash) ), TP_fast_assign( const struct inode *inode = hdr->inode; + const struct nfs4_state *state = + hdr->args.context->state; __entry->dev = inode->i_sb->s_dev; __entry->fileid = NFS_FILEID(inode); __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); __entry->offset = hdr->args.offset; __entry->count = hdr->args.count; __entry->error = error; + __entry->stateid_seq = + be32_to_cpu(state->stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&state->stateid); ), TP_printk( "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " - "offset=%lld count=%zu", + "offset=%lld count=%zu stateid=%d:0x%08x", __entry->error, show_nfsv4_errors(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long long)__entry->fileid, __entry->fhandle, (long long)__entry->offset, - __entry->count + __entry->count, + __entry->stateid_seq, __entry->stateid_hash ) ); @@ -1154,10 +1442,11 @@ TRACE_EVENT(nfs4_layoutget, const struct nfs_open_context *ctx, const struct pnfs_layout_range *args, const struct pnfs_layout_range *res, + const nfs4_stateid *layout_stateid, int error ), - TP_ARGS(ctx, args, res, error), + TP_ARGS(ctx, args, res, layout_stateid, error), TP_STRUCT__entry( __field(dev_t, dev) @@ -1167,10 +1456,15 @@ TRACE_EVENT(nfs4_layoutget, __field(u64, offset) __field(u64, count) __field(int, error) + __field(int, stateid_seq) + __field(u32, stateid_hash) + __field(int, layoutstateid_seq) + __field(u32, layoutstateid_hash) ), TP_fast_assign( const struct inode *inode = d_inode(ctx->dentry); + const struct nfs4_state *state = ctx->state; __entry->dev = inode->i_sb->s_dev; __entry->fileid = NFS_FILEID(inode); __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); @@ -1178,11 +1472,25 @@ TRACE_EVENT(nfs4_layoutget, __entry->offset = args->offset; __entry->count = args->length; __entry->error = error; + __entry->stateid_seq = + be32_to_cpu(state->stateid.seqid); + __entry->stateid_hash = + nfs_stateid_hash(&state->stateid); + if (!error) { + __entry->layoutstateid_seq = + be32_to_cpu(layout_stateid->seqid); + __entry->layoutstateid_hash = + nfs_stateid_hash(layout_stateid); + } else { + __entry->layoutstateid_seq = 0; + __entry->layoutstateid_hash = 0; + } ), TP_printk( "error=%d (%s) fileid=%02x:%02x:%llu fhandle=0x%08x " - "iomode=%s offset=%llu count=%llu", + "iomode=%s offset=%llu count=%llu stateid=%d:0x%08x " + "layoutstateid=%d:0x%08x", __entry->error, show_nfsv4_errors(__entry->error), MAJOR(__entry->dev), MINOR(__entry->dev), @@ -1190,14 +1498,83 @@ TRACE_EVENT(nfs4_layoutget, __entry->fhandle, show_pnfs_iomode(__entry->iomode), (unsigned long long)__entry->offset, - (unsigned long long)__entry->count + (unsigned long long)__entry->count, + __entry->stateid_seq, __entry->stateid_hash, + __entry->layoutstateid_seq, __entry->layoutstateid_hash ) ); -DEFINE_NFS4_INODE_EVENT(nfs4_layoutcommit); -DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn); +DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutcommit); +DEFINE_NFS4_INODE_STATEID_EVENT(nfs4_layoutreturn); DEFINE_NFS4_INODE_EVENT(nfs4_layoutreturn_on_close); +#define show_pnfs_update_layout_reason(reason) \ + __print_symbolic(reason, \ + { PNFS_UPDATE_LAYOUT_UNKNOWN, "unknown" }, \ + { PNFS_UPDATE_LAYOUT_NO_PNFS, "no pnfs" }, \ + { PNFS_UPDATE_LAYOUT_RD_ZEROLEN, "read+zerolen" }, \ + { PNFS_UPDATE_LAYOUT_MDSTHRESH, "mdsthresh" }, \ + { PNFS_UPDATE_LAYOUT_NOMEM, "nomem" }, \ + { PNFS_UPDATE_LAYOUT_BULK_RECALL, "bulk recall" }, \ + { PNFS_UPDATE_LAYOUT_IO_TEST_FAIL, "io test fail" }, \ + { PNFS_UPDATE_LAYOUT_FOUND_CACHED, "found cached" }, \ + { PNFS_UPDATE_LAYOUT_RETURN, "layoutreturn" }, \ + { PNFS_UPDATE_LAYOUT_BLOCKED, "layouts blocked" }, \ + { PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET, "sent layoutget" }) + +TRACE_EVENT(pnfs_update_layout, + TP_PROTO(struct inode *inode, + loff_t pos, + u64 count, + enum pnfs_iomode iomode, + struct pnfs_layout_hdr *lo, + enum pnfs_update_layout_reason reason + ), + TP_ARGS(inode, pos, count, iomode, lo, reason), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u64, fileid) + __field(u32, fhandle) + __field(loff_t, pos) + __field(u64, count) + __field(enum pnfs_iomode, iomode) + __field(int, layoutstateid_seq) + __field(u32, layoutstateid_hash) + __field(enum pnfs_update_layout_reason, reason) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = NFS_FILEID(inode); + __entry->fhandle = nfs_fhandle_hash(NFS_FH(inode)); + __entry->pos = pos; + __entry->count = count; + __entry->iomode = iomode; + __entry->reason = reason; + if (lo != NULL) { + __entry->layoutstateid_seq = + be32_to_cpu(lo->plh_stateid.seqid); + __entry->layoutstateid_hash = + nfs_stateid_hash(&lo->plh_stateid); + } else { + __entry->layoutstateid_seq = 0; + __entry->layoutstateid_hash = 0; + } + ), + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x " + "iomode=%s pos=%llu count=%llu " + "layoutstateid=%d:0x%08x (%s)", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, + show_pnfs_iomode(__entry->iomode), + (unsigned long long)__entry->pos, + (unsigned long long)__entry->count, + __entry->layoutstateid_seq, __entry->layoutstateid_hash, + show_pnfs_update_layout_reason(__entry->reason) + ) +); + #endif /* CONFIG_NFS_V4_1 */ #endif /* _TRACE_NFS4_H */ diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 59f838cdc..9f80a086b 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -39,7 +39,6 @@ { 1 << NFS_INO_INVALIDATING, "INVALIDATING" }, \ { 1 << NFS_INO_FLUSHING, "FLUSHING" }, \ { 1 << NFS_INO_FSCACHE, "FSCACHE" }, \ - { 1 << NFS_INO_COMMIT, "COMMIT" }, \ { 1 << NFS_INO_LAYOUTCOMMIT, "NEED_LAYOUTCOMMIT" }, \ { 1 << NFS_INO_LAYOUTCOMMITTING, "LAYOUTCOMMIT" }) diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 452a011ba..8ce4f61cb 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -101,53 +101,18 @@ nfs_page_free(struct nfs_page *p) kmem_cache_free(nfs_page_cachep, p); } -static void -nfs_iocounter_inc(struct nfs_io_counter *c) -{ - atomic_inc(&c->io_count); -} - -static void -nfs_iocounter_dec(struct nfs_io_counter *c) -{ - if (atomic_dec_and_test(&c->io_count)) { - clear_bit(NFS_IO_INPROGRESS, &c->flags); - smp_mb__after_atomic(); - wake_up_bit(&c->flags, NFS_IO_INPROGRESS); - } -} - -static int -__nfs_iocounter_wait(struct nfs_io_counter *c) -{ - wait_queue_head_t *wq = bit_waitqueue(&c->flags, NFS_IO_INPROGRESS); - DEFINE_WAIT_BIT(q, &c->flags, NFS_IO_INPROGRESS); - int ret = 0; - - do { - prepare_to_wait(wq, &q.wait, TASK_KILLABLE); - set_bit(NFS_IO_INPROGRESS, &c->flags); - if (atomic_read(&c->io_count) == 0) - break; - ret = nfs_wait_bit_killable(&q.key, TASK_KILLABLE); - } while (atomic_read(&c->io_count) != 0 && !ret); - finish_wait(wq, &q.wait); - return ret; -} - /** * nfs_iocounter_wait - wait for i/o to complete - * @c: nfs_io_counter to use + * @l_ctx: nfs_lock_context with io_counter to use * * returns -ERESTARTSYS if interrupted by a fatal signal. * Otherwise returns 0 once the io_count hits 0. */ int -nfs_iocounter_wait(struct nfs_io_counter *c) +nfs_iocounter_wait(struct nfs_lock_context *l_ctx) { - if (atomic_read(&c->io_count) == 0) - return 0; - return __nfs_iocounter_wait(c); + return wait_on_atomic_t(&l_ctx->io_count, nfs_wait_atomic_killable, + TASK_KILLABLE); } /* @@ -370,7 +335,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page, return ERR_CAST(l_ctx); } req->wb_lock_context = l_ctx; - nfs_iocounter_inc(&l_ctx->io_count); + atomic_inc(&l_ctx->io_count); /* Initialize the request struct. Initially, we assume a * long write-back delay. This will be adjusted in @@ -431,7 +396,8 @@ static void nfs_clear_request(struct nfs_page *req) req->wb_page = NULL; } if (l_ctx != NULL) { - nfs_iocounter_dec(&l_ctx->io_count); + if (atomic_dec_and_test(&l_ctx->io_count)) + wake_up_atomic_t(&l_ctx->io_count); nfs_put_lock_context(l_ctx); req->wb_lock_context = NULL; } @@ -664,22 +630,11 @@ EXPORT_SYMBOL_GPL(nfs_initiate_pgio); * @desc: IO descriptor * @hdr: pageio header */ -static int nfs_pgio_error(struct nfs_pageio_descriptor *desc, - struct nfs_pgio_header *hdr) +static void nfs_pgio_error(struct nfs_pgio_header *hdr) { - struct nfs_pgio_mirror *mirror; - u32 midx; - set_bit(NFS_IOHDR_REDO, &hdr->flags); nfs_pgio_data_destroy(hdr); hdr->completion_ops->completion(hdr); - /* TODO: Make sure it's right to clean up all mirrors here - * and not just hdr->pgio_mirror_idx */ - for (midx = 0; midx < desc->pg_mirror_count; midx++) { - mirror = &desc->pg_mirrors[midx]; - desc->pg_completion_ops->error_cleanup(&mirror->pg_list); - } - return -ENOMEM; } /** @@ -800,8 +755,11 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, unsigned int pagecount, pageused; pagecount = nfs_page_array_len(mirror->pg_base, mirror->pg_count); - if (!nfs_pgarray_set(&hdr->page_array, pagecount)) - return nfs_pgio_error(desc, hdr); + if (!nfs_pgarray_set(&hdr->page_array, pagecount)) { + nfs_pgio_error(hdr); + desc->pg_error = -ENOMEM; + return desc->pg_error; + } nfs_init_cinfo(&cinfo, desc->pg_inode, desc->pg_dreq); pages = hdr->page_array.pagevec; @@ -819,8 +777,11 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, *pages++ = last_page = req->wb_page; } } - if (WARN_ON_ONCE(pageused != pagecount)) - return nfs_pgio_error(desc, hdr); + if (WARN_ON_ONCE(pageused != pagecount)) { + nfs_pgio_error(hdr); + desc->pg_error = -EINVAL; + return desc->pg_error; + } if ((desc->pg_ioflags & FLUSH_COND_STABLE) && (desc->pg_moreio || nfs_reqs_to_commit(&cinfo))) @@ -835,18 +796,13 @@ EXPORT_SYMBOL_GPL(nfs_generic_pgio); static int nfs_generic_pg_pgios(struct nfs_pageio_descriptor *desc) { - struct nfs_pgio_mirror *mirror; struct nfs_pgio_header *hdr; int ret; - mirror = nfs_pgio_current_mirror(desc); - hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); if (!hdr) { - /* TODO: make sure this is right with mirroring - or - * should it back out all mirrors? */ - desc->pg_completion_ops->error_cleanup(&mirror->pg_list); - return -ENOMEM; + desc->pg_error = -ENOMEM; + return desc->pg_error; } nfs_pgheader_init(desc, hdr, nfs_pgio_header_free); ret = nfs_generic_pgio(desc, hdr); @@ -874,6 +830,9 @@ static int nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio, mirror_count = pgio->pg_ops->pg_get_mirror_count(pgio, req); + if (pgio->pg_error < 0) + return pgio->pg_error; + if (!mirror_count || mirror_count > NFS_PAGEIO_DESCRIPTOR_MIRROR_MAX) return -EINVAL; @@ -903,12 +862,6 @@ static void nfs_pageio_cleanup_mirroring(struct nfs_pageio_descriptor *pgio) pgio->pg_mirrors_dynamic = NULL; } -static bool nfs_match_open_context(const struct nfs_open_context *ctx1, - const struct nfs_open_context *ctx2) -{ - return ctx1->cred == ctx2->cred && ctx1->state == ctx2->state; -} - static bool nfs_match_lock_context(const struct nfs_lock_context *l1, const struct nfs_lock_context *l2) { @@ -982,6 +935,8 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc, } else { if (desc->pg_ops->pg_init) desc->pg_ops->pg_init(desc, req); + if (desc->pg_error < 0) + return 0; mirror->pg_base = req->wb_pgbase; } if (!nfs_can_coalesce_requests(prev, req, desc)) @@ -1147,6 +1102,8 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, bytes = req->wb_bytes; nfs_pageio_setup_mirroring(desc, req); + if (desc->pg_error < 0) + goto out_failed; for (midx = 0; midx < desc->pg_mirror_count; midx++) { if (midx) { @@ -1163,7 +1120,8 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, if (IS_ERR(dupreq)) { nfs_page_group_unlock(req); - return 0; + desc->pg_error = PTR_ERR(dupreq); + goto out_failed; } nfs_lock_request(dupreq); @@ -1176,10 +1134,32 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, if (nfs_pgio_has_mirroring(desc)) desc->pg_mirror_idx = midx; if (!nfs_pageio_add_request_mirror(desc, dupreq)) - return 0; + goto out_failed; } return 1; + +out_failed: + /* + * We might have failed before sending any reqs over wire. + * Clean up rest of the reqs in mirror pg_list. + */ + if (desc->pg_error) { + struct nfs_pgio_mirror *mirror; + void (*func)(struct list_head *); + + /* remember fatal errors */ + if (nfs_error_is_fatal(desc->pg_error)) + mapping_set_error(desc->pg_inode->i_mapping, + desc->pg_error); + + func = desc->pg_completion_ops->error_cleanup; + for (midx = 0; midx < desc->pg_mirror_count; midx++) { + mirror = &desc->pg_mirrors[midx]; + func(&mirror->pg_list); + } + } + return 0; } /* @@ -1232,7 +1212,7 @@ int nfs_pageio_resend(struct nfs_pageio_descriptor *desc, nfs_pageio_complete(desc); if (!list_empty(&failed)) { list_move(&failed, &hdr->pages); - return -EIO; + return desc->pg_error < 0 ? desc->pg_error : -EIO; } return 0; } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index bec038449..2fa483e6d 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -52,9 +52,7 @@ static DEFINE_SPINLOCK(pnfs_spinlock); */ static LIST_HEAD(pnfs_modules_tbl); -static int -pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid, - enum pnfs_iomode iomode, bool sync); +static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo); /* Return the registered pnfs layout driver module matching given id */ static struct pnfs_layoutdriver_type * @@ -243,6 +241,8 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) { struct inode *inode = lo->plh_inode; + pnfs_layoutreturn_before_put_layout_hdr(lo); + if (atomic_dec_and_lock(&lo->plh_refcount, &inode->i_lock)) { if (!list_empty(&lo->plh_segs)) WARN_ONCE(1, "NFS: BUG unfreed layout segments.\n"); @@ -252,6 +252,27 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) } } +/* + * Mark a pnfs_layout_hdr and all associated layout segments as invalid + * + * In order to continue using the pnfs_layout_hdr, a full recovery + * is required. + * Note that caller must hold inode->i_lock. + */ +static int +pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, + struct list_head *lseg_list) +{ + struct pnfs_layout_range range = { + .iomode = IOMODE_ANY, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + + set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); + return pnfs_mark_matching_lsegs_invalid(lo, lseg_list, &range); +} + static int pnfs_iomode_to_fail_bit(u32 iomode) { @@ -345,58 +366,6 @@ pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo, rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq); } -/* Return true if layoutreturn is needed */ -static bool -pnfs_layout_need_return(struct pnfs_layout_hdr *lo, - struct pnfs_layout_segment *lseg) -{ - struct pnfs_layout_segment *s; - - if (!test_and_clear_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)) - return false; - - list_for_each_entry(s, &lo->plh_segs, pls_list) - if (s != lseg && test_bit(NFS_LSEG_LAYOUTRETURN, &s->pls_flags)) - return false; - - return true; -} - -static bool -pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo) -{ - if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) - return false; - lo->plh_return_iomode = 0; - pnfs_get_layout_hdr(lo); - clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, &lo->plh_flags); - return true; -} - -static void pnfs_layoutreturn_before_put_lseg(struct pnfs_layout_segment *lseg, - struct pnfs_layout_hdr *lo, struct inode *inode) -{ - lo = lseg->pls_layout; - inode = lo->plh_inode; - - spin_lock(&inode->i_lock); - if (pnfs_layout_need_return(lo, lseg)) { - nfs4_stateid stateid; - enum pnfs_iomode iomode; - bool send; - - stateid = lo->plh_stateid; - iomode = lo->plh_return_iomode; - send = pnfs_prepare_layoutreturn(lo); - spin_unlock(&inode->i_lock); - if (send) { - /* Send an async layoutreturn so we dont deadlock */ - pnfs_send_layoutreturn(lo, stateid, iomode, false); - } - } else - spin_unlock(&inode->i_lock); -} - void pnfs_put_lseg(struct pnfs_layout_segment *lseg) { @@ -410,15 +379,8 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg) atomic_read(&lseg->pls_refcount), test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); - /* Handle the case where refcount != 1 */ - if (atomic_add_unless(&lseg->pls_refcount, -1, 1)) - return; - lo = lseg->pls_layout; inode = lo->plh_inode; - /* Do we need a layoutreturn? */ - if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)) - pnfs_layoutreturn_before_put_lseg(lseg, lo, inode); if (atomic_dec_and_lock(&lseg->pls_refcount, &inode->i_lock)) { if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) { @@ -566,10 +528,10 @@ static int mark_lseg_invalid(struct pnfs_layout_segment *lseg, int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, - struct pnfs_layout_range *recall_range) + const struct pnfs_layout_range *recall_range) { struct pnfs_layout_segment *lseg, *next; - int invalid = 0, removed = 0; + int remaining = 0; dprintk("%s:Begin lo %p\n", __func__, lo); @@ -582,11 +544,11 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, "offset %llu length %llu\n", __func__, lseg, lseg->pls_range.iomode, lseg->pls_range.offset, lseg->pls_range.length); - invalid++; - removed += mark_lseg_invalid(lseg, tmp_list); + if (!mark_lseg_invalid(lseg, tmp_list)) + remaining++; } - dprintk("%s:Return %i\n", __func__, invalid - removed); - return invalid - removed; + dprintk("%s:Return %i\n", __func__, remaining); + return remaining; } /* note free_me must contain lsegs from a single layout_hdr */ @@ -613,12 +575,10 @@ pnfs_destroy_layout(struct nfs_inode *nfsi) spin_lock(&nfsi->vfs_inode.i_lock); lo = nfsi->layout; if (lo) { - lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ - pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL); pnfs_get_layout_hdr(lo); + pnfs_mark_layout_stateid_invalid(lo, &tmp_list); pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RO_FAILED); pnfs_layout_clear_fail_bit(lo, NFS_LAYOUT_RW_FAILED); - pnfs_clear_retry_layoutget(lo); spin_unlock(&nfsi->vfs_inode.i_lock); pnfs_free_lseg_list(&tmp_list); pnfs_put_layout_hdr(lo); @@ -677,11 +637,6 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list, { struct pnfs_layout_hdr *lo; struct inode *inode; - struct pnfs_layout_range range = { - .iomode = IOMODE_ANY, - .offset = 0, - .length = NFS4_MAX_UINT64, - }; LIST_HEAD(lseg_list); int ret = 0; @@ -696,13 +651,15 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list, spin_lock(&inode->i_lock); list_del_init(&lo->plh_bulk_destroy); - lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ - if (is_bulk_recall) - set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); - if (pnfs_mark_matching_lsegs_invalid(lo, &lseg_list, &range)) + if (pnfs_mark_layout_stateid_invalid(lo, &lseg_list)) { + if (is_bulk_recall) + set_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); ret = -EAGAIN; + } spin_unlock(&inode->i_lock); pnfs_free_lseg_list(&lseg_list); + /* Free all lsegs that are attached to commit buckets */ + nfs_commit_inode(inode, 0); pnfs_put_layout_hdr(lo); iput(inode); } @@ -826,7 +783,7 @@ pnfs_layoutgets_blocked(const struct pnfs_layout_hdr *lo) int pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, - struct pnfs_layout_range *range, + const struct pnfs_layout_range *range, struct nfs4_state *open_state) { int status = 0; @@ -861,7 +818,7 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, static struct pnfs_layout_segment * send_layoutget(struct pnfs_layout_hdr *lo, struct nfs_open_context *ctx, - struct pnfs_layout_range *range, + const struct pnfs_layout_range *range, gfp_t gfp_flags) { struct inode *ino = lo->plh_inode; @@ -894,7 +851,7 @@ send_layoutget(struct pnfs_layout_hdr *lo, lgp->args.minlength = i_size - range->offset; } lgp->args.maxcount = PNFS_LAYOUT_MAXSIZE; - lgp->args.range = *range; + pnfs_copy_range(&lgp->args.range, range); lgp->args.type = server->pnfs_curr_ld->id; lgp->args.inode = ino; lgp->args.ctx = get_nfs_open_context(ctx); @@ -904,17 +861,9 @@ send_layoutget(struct pnfs_layout_hdr *lo, lseg = nfs4_proc_layoutget(lgp, gfp_flags); } while (lseg == ERR_PTR(-EAGAIN)); - if (IS_ERR(lseg)) { - switch (PTR_ERR(lseg)) { - case -ENOMEM: - case -ERESTARTSYS: - break; - default: - /* remember that LAYOUTGET failed and suspend trying */ - pnfs_layout_io_set_failed(lo, range->iomode); - } - return NULL; - } else + if (IS_ERR(lseg) && !nfs_error_is_fatal(PTR_ERR(lseg))) + lseg = NULL; + else pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(range->iomode)); @@ -944,8 +893,19 @@ void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo) rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq); } +static bool +pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo) +{ + if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) + return false; + lo->plh_return_iomode = 0; + pnfs_get_layout_hdr(lo); + clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); + return true; +} + static int -pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid, +pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid, enum pnfs_iomode iomode, bool sync) { struct inode *ino = lo->plh_inode; @@ -962,7 +922,7 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, nfs4_stateid stateid, goto out; } - lrp->args.stateid = stateid; + nfs4_stateid_copy(&lrp->args.stateid, stateid); lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id; lrp->args.inode = ino; lrp->args.range.iomode = iomode; @@ -978,6 +938,48 @@ out: return status; } +/* Return true if layoutreturn is needed */ +static bool +pnfs_layout_need_return(struct pnfs_layout_hdr *lo) +{ + struct pnfs_layout_segment *s; + + if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) + return false; + + /* Defer layoutreturn until all lsegs are done */ + list_for_each_entry(s, &lo->plh_segs, pls_list) { + if (test_bit(NFS_LSEG_LAYOUTRETURN, &s->pls_flags)) + return false; + } + + return true; +} + +static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo) +{ + struct inode *inode= lo->plh_inode; + + if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) + return; + spin_lock(&inode->i_lock); + if (pnfs_layout_need_return(lo)) { + nfs4_stateid stateid; + enum pnfs_iomode iomode; + bool send; + + nfs4_stateid_copy(&stateid, &lo->plh_stateid); + iomode = lo->plh_return_iomode; + send = pnfs_prepare_layoutreturn(lo); + spin_unlock(&inode->i_lock); + if (send) { + /* Send an async layoutreturn so we dont deadlock */ + pnfs_send_layoutreturn(lo, &stateid, iomode, false); + } + } else + spin_unlock(&inode->i_lock); +} + /* * Initiates a LAYOUTRETURN(FILE), and removes the pnfs_layout_hdr * when the layout segment list is empty. @@ -1005,7 +1007,7 @@ _pnfs_return_layout(struct inode *ino) dprintk("NFS: %s no layout to return\n", __func__); goto out; } - stateid = nfsi->layout->plh_stateid; + nfs4_stateid_copy(&stateid, &nfsi->layout->plh_stateid); /* Reference matched in nfs4_layoutreturn_release */ pnfs_get_layout_hdr(lo); empty = list_empty(&lo->plh_segs); @@ -1033,7 +1035,7 @@ _pnfs_return_layout(struct inode *ino) spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&tmp_list); if (send) - status = pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true); + status = pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true); out_put_layout_hdr: pnfs_put_layout_hdr(lo); out: @@ -1096,13 +1098,12 @@ bool pnfs_roc(struct inode *ino) goto out_noroc; } - stateid = lo->plh_stateid; + nfs4_stateid_copy(&stateid, &lo->plh_stateid); /* always send layoutreturn if being marked so */ - if (test_and_clear_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, + if (test_and_clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) layoutreturn = pnfs_prepare_layoutreturn(lo); - pnfs_clear_retry_layoutget(lo); list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list) /* If we are sending layoutreturn, invalidate all valid lsegs */ if (layoutreturn || test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { @@ -1124,7 +1125,7 @@ out_noroc: pnfs_free_lseg_list(&tmp_list); pnfs_layoutcommit_inode(ino, true); if (layoutreturn) - pnfs_send_layoutreturn(lo, stateid, IOMODE_ANY, true); + pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true); return roc; } @@ -1149,6 +1150,7 @@ void pnfs_roc_set_barrier(struct inode *ino, u32 barrier) spin_lock(&ino->i_lock); lo = NFS_I(ino)->layout; + pnfs_mark_layout_returned_if_empty(lo); if (pnfs_seqid_is_newer(barrier, lo->plh_barrier)) lo->plh_barrier = barrier; spin_unlock(&ino->i_lock); @@ -1465,25 +1467,15 @@ static bool pnfs_within_mdsthreshold(struct nfs_open_context *ctx, return ret; } -/* stop waiting if someone clears NFS_LAYOUT_RETRY_LAYOUTGET bit. */ -static int pnfs_layoutget_retry_bit_wait(struct wait_bit_key *key, int mode) -{ - if (!test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, key->flags)) - return 1; - return nfs_wait_bit_killable(key, mode); -} - static bool pnfs_prepare_to_retry_layoutget(struct pnfs_layout_hdr *lo) { - if (!pnfs_should_retry_layoutget(lo)) - return false; /* * send layoutcommit as it can hold up layoutreturn due to lseg * reference */ pnfs_layoutcommit_inode(lo->plh_inode, false); return !wait_on_bit_action(&lo->plh_flags, NFS_LAYOUT_RETURN, - pnfs_layoutget_retry_bit_wait, + nfs_wait_bit_killable, TASK_UNINTERRUPTIBLE); } @@ -1520,14 +1512,23 @@ pnfs_update_layout(struct inode *ino, struct pnfs_layout_segment *lseg = NULL; bool first; - if (!pnfs_enabled_sb(NFS_SERVER(ino))) + if (!pnfs_enabled_sb(NFS_SERVER(ino))) { + trace_pnfs_update_layout(ino, pos, count, iomode, NULL, + PNFS_UPDATE_LAYOUT_NO_PNFS); goto out; + } - if (iomode == IOMODE_READ && i_size_read(ino) == 0) + if (iomode == IOMODE_READ && i_size_read(ino) == 0) { + trace_pnfs_update_layout(ino, pos, count, iomode, NULL, + PNFS_UPDATE_LAYOUT_RD_ZEROLEN); goto out; + } - if (pnfs_within_mdsthreshold(ctx, ino, iomode)) + if (pnfs_within_mdsthreshold(ctx, ino, iomode)) { + trace_pnfs_update_layout(ino, pos, count, iomode, NULL, + PNFS_UPDATE_LAYOUT_MDSTHRESH); goto out; + } lookup_again: first = false; @@ -1535,19 +1536,25 @@ lookup_again: lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags); if (lo == NULL) { spin_unlock(&ino->i_lock); + trace_pnfs_update_layout(ino, pos, count, iomode, NULL, + PNFS_UPDATE_LAYOUT_NOMEM); goto out; } /* Do we even need to bother with this? */ if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { + trace_pnfs_update_layout(ino, pos, count, iomode, lo, + PNFS_UPDATE_LAYOUT_BULK_RECALL); dprintk("%s matches recall, use MDS\n", __func__); goto out_unlock; } /* if LAYOUTGET already failed once we don't try again */ - if (pnfs_layout_io_test_failed(lo, iomode) && - !pnfs_should_retry_layoutget(lo)) + if (pnfs_layout_io_test_failed(lo, iomode)) { + trace_pnfs_update_layout(ino, pos, count, iomode, lo, + PNFS_UPDATE_LAYOUT_IO_TEST_FAIL); goto out_unlock; + } first = list_empty(&lo->plh_segs); if (first) { @@ -1567,8 +1574,11 @@ lookup_again: * already exists */ lseg = pnfs_find_lseg(lo, &arg); - if (lseg) + if (lseg) { + trace_pnfs_update_layout(ino, pos, count, iomode, lo, + PNFS_UPDATE_LAYOUT_FOUND_CACHED); goto out_unlock; + } } /* @@ -1585,11 +1595,16 @@ lookup_again: dprintk("%s retrying\n", __func__); goto lookup_again; } + trace_pnfs_update_layout(ino, pos, count, iomode, lo, + PNFS_UPDATE_LAYOUT_RETURN); goto out_put_layout_hdr; } - if (pnfs_layoutgets_blocked(lo)) + if (pnfs_layoutgets_blocked(lo)) { + trace_pnfs_update_layout(ino, pos, count, iomode, lo, + PNFS_UPDATE_LAYOUT_BLOCKED); goto out_unlock; + } atomic_inc(&lo->plh_outstanding); spin_unlock(&ino->i_lock); @@ -1612,8 +1627,9 @@ lookup_again: arg.length = PAGE_CACHE_ALIGN(arg.length); lseg = send_layoutget(lo, ctx, &arg, gfp_flags); - pnfs_clear_retry_layoutget(lo); atomic_dec(&lo->plh_outstanding); + trace_pnfs_update_layout(ino, pos, count, iomode, lo, + PNFS_UPDATE_LAYOUT_SEND_LAYOUTGET); out_put_layout_hdr: if (first) pnfs_clear_first_layoutget(lo); @@ -1623,7 +1639,7 @@ out: "(%s, offset: %llu, length: %llu)\n", __func__, ino->i_sb->s_id, (unsigned long long)NFS_FILEID(ino), - lseg == NULL ? "not found" : "found", + IS_ERR_OR_NULL(lseg) ? "not found" : "found", iomode==IOMODE_RW ? "read/write" : "read-only", (unsigned long long)pos, (unsigned long long)count); @@ -1730,16 +1746,40 @@ out_forget_reply: } static void +pnfs_set_plh_return_iomode(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode) +{ + if (lo->plh_return_iomode == iomode) + return; + if (lo->plh_return_iomode != 0) + iomode = IOMODE_ANY; + lo->plh_return_iomode = iomode; + set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); +} + +/** + * pnfs_mark_matching_lsegs_return - Free or return matching layout segments + * @lo: pointer to layout header + * @tmp_list: list header to be used with pnfs_free_lseg_list() + * @return_range: describe layout segment ranges to be returned + * + * This function is mainly intended for use by layoutrecall. It attempts + * to free the layout segment immediately, or else to mark it for return + * as soon as its reference count drops to zero. + */ +int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, - struct pnfs_layout_range *return_range) + const struct pnfs_layout_range *return_range) { struct pnfs_layout_segment *lseg, *next; + int remaining = 0; dprintk("%s:Begin lo %p\n", __func__, lo); if (list_empty(&lo->plh_segs)) - return; + return 0; + + assert_spin_locked(&lo->plh_inode->i_lock); list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) if (should_free_lseg(&lseg->pls_range, return_range)) { @@ -1748,39 +1788,47 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, lseg, lseg->pls_range.iomode, lseg->pls_range.offset, lseg->pls_range.length); + if (mark_lseg_invalid(lseg, tmp_list)) + continue; + remaining++; set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags); - mark_lseg_invalid(lseg, tmp_list); - set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, - &lo->plh_flags); + pnfs_set_plh_return_iomode(lo, return_range->iomode); } + return remaining; } void pnfs_error_mark_layout_for_return(struct inode *inode, struct pnfs_layout_segment *lseg) { struct pnfs_layout_hdr *lo = NFS_I(inode)->layout; - int iomode = pnfs_iomode_to_fail_bit(lseg->pls_range.iomode); struct pnfs_layout_range range = { .iomode = lseg->pls_range.iomode, .offset = 0, .length = NFS4_MAX_UINT64, }; LIST_HEAD(free_me); + bool return_now = false; spin_lock(&inode->i_lock); - /* set failure bit so that pnfs path will be retried later */ - pnfs_layout_set_fail_bit(lo, iomode); - if (lo->plh_return_iomode == 0) - lo->plh_return_iomode = range.iomode; - else if (lo->plh_return_iomode != range.iomode) - lo->plh_return_iomode = IOMODE_ANY; + pnfs_set_plh_return_iomode(lo, range.iomode); /* * mark all matching lsegs so that we are sure to have no live * segments at hand when sending layoutreturn. See pnfs_put_lseg() * for how it works. */ - pnfs_mark_matching_lsegs_return(lo, &free_me, &range); - spin_unlock(&inode->i_lock); + if (!pnfs_mark_matching_lsegs_return(lo, &free_me, &range)) { + nfs4_stateid stateid; + enum pnfs_iomode iomode = lo->plh_return_iomode; + + nfs4_stateid_copy(&stateid, &lo->plh_stateid); + return_now = pnfs_prepare_layoutreturn(lo); + spin_unlock(&inode->i_lock); + if (return_now) + pnfs_send_layoutreturn(lo, &stateid, iomode, false); + } else { + spin_unlock(&inode->i_lock); + nfs_commit_inode(inode, 0); + } pnfs_free_lseg_list(&free_me); } EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return); @@ -1802,6 +1850,11 @@ pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *r rd_size, IOMODE_READ, GFP_KERNEL); + if (IS_ERR(pgio->pg_lseg)) { + pgio->pg_error = PTR_ERR(pgio->pg_lseg); + pgio->pg_lseg = NULL; + return; + } } /* If no lseg, fall back to read through mds */ if (pgio->pg_lseg == NULL) @@ -1814,13 +1867,19 @@ void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req, u64 wb_size) { - if (pgio->pg_lseg == NULL) + if (pgio->pg_lseg == NULL) { pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, req->wb_context, req_offset(req), wb_size, IOMODE_RW, GFP_NOFS); + if (IS_ERR(pgio->pg_lseg)) { + pgio->pg_error = PTR_ERR(pgio->pg_lseg); + pgio->pg_lseg = NULL; + return; + } + } /* If no lseg, fall back to write through mds */ if (pgio->pg_lseg == NULL) nfs_pageio_reset_write_mds(pgio); @@ -1988,15 +2047,13 @@ static void pnfs_writehdr_free(struct nfs_pgio_header *hdr) int pnfs_generic_pg_writepages(struct nfs_pageio_descriptor *desc) { - struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); - struct nfs_pgio_header *hdr; int ret; hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); if (!hdr) { - desc->pg_completion_ops->error_cleanup(&mirror->pg_list); - return -ENOMEM; + desc->pg_error = -ENOMEM; + return desc->pg_error; } nfs_pgheader_init(desc, hdr, pnfs_writehdr_free); @@ -2119,15 +2176,13 @@ static void pnfs_readhdr_free(struct nfs_pgio_header *hdr) int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc) { - struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc); - struct nfs_pgio_header *hdr; int ret; hdr = nfs_pgio_header_alloc(desc->pg_rw_ops); if (!hdr) { - desc->pg_completion_ops->error_cleanup(&mirror->pg_list); - return -ENOMEM; + desc->pg_error = -ENOMEM; + return desc->pg_error; } nfs_pgheader_init(desc, hdr, pnfs_readhdr_free); hdr->lseg = pnfs_get_lseg(desc->pg_lseg); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index d1990e90e..1ac1db5f6 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -94,11 +94,10 @@ enum { NFS_LAYOUT_RO_FAILED = 0, /* get ro layout failed stop trying */ NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */ NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */ - NFS_LAYOUT_RETURN, /* Return this layout ASAP */ - NFS_LAYOUT_RETURN_BEFORE_CLOSE, /* Return this layout before close */ + NFS_LAYOUT_RETURN, /* layoutreturn in progress */ + NFS_LAYOUT_RETURN_REQUESTED, /* Return this layout ASAP */ NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */ NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */ - NFS_LAYOUT_RETRY_LAYOUTGET, /* Retry layoutget */ }; enum layoutdriver_policy_flags { @@ -261,11 +260,14 @@ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, bool update_barrier); int pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, - struct pnfs_layout_range *range, + const struct pnfs_layout_range *range, struct nfs4_state *open_state); int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, struct list_head *tmp_list, - struct pnfs_layout_range *recall_range); + const struct pnfs_layout_range *recall_range); +int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, + struct list_head *tmp_list, + const struct pnfs_layout_range *recall_range); bool pnfs_roc(struct inode *ino); void pnfs_roc_release(struct inode *ino); void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); @@ -379,26 +381,6 @@ nfs4_get_deviceid(struct nfs4_deviceid_node *d) return d; } -static inline void pnfs_set_retry_layoutget(struct pnfs_layout_hdr *lo) -{ - if (!test_and_set_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags)) - atomic_inc(&lo->plh_refcount); -} - -static inline void pnfs_clear_retry_layoutget(struct pnfs_layout_hdr *lo) -{ - if (test_and_clear_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags)) { - atomic_dec(&lo->plh_refcount); - /* wake up waiters for LAYOUTRETURN as that is not needed */ - wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN); - } -} - -static inline bool pnfs_should_retry_layoutget(struct pnfs_layout_hdr *lo) -{ - return test_bit(NFS_LAYOUT_RETRY_LAYOUTGET, &lo->plh_flags); -} - static inline struct pnfs_layout_segment * pnfs_get_lseg(struct pnfs_layout_segment *lseg) { @@ -409,6 +391,12 @@ pnfs_get_lseg(struct pnfs_layout_segment *lseg) return lseg; } +static inline bool +pnfs_is_valid_lseg(struct pnfs_layout_segment *lseg) +{ + return test_bit(NFS_LSEG_VALID, &lseg->pls_flags) != 0; +} + /* Return true if a layout driver is being used for this mountpoint */ static inline int pnfs_enabled_sb(struct nfs_server *nfss) { @@ -556,6 +544,26 @@ pnfs_calc_offset_length(u64 offset, u64 end) return 1 + end - offset; } +/** + * pnfs_mark_layout_returned_if_empty - marks the layout as returned + * @lo: layout header + * + * Note: Caller must hold inode->i_lock + */ +static inline void +pnfs_mark_layout_returned_if_empty(struct pnfs_layout_hdr *lo) +{ + if (list_empty(&lo->plh_segs)) + set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); +} + +static inline void +pnfs_copy_range(struct pnfs_layout_range *dst, + const struct pnfs_layout_range *src) +{ + memcpy(dst, src, sizeof(*dst)); +} + extern unsigned int layoutstats_timer; #ifdef NFS_DEBUG diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 24655b807..81ac6480f 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -266,17 +266,14 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages, } else { nfs_retry_commit(mds_pages, NULL, cinfo, 0); pnfs_generic_retry_commit(cinfo, 0); - cinfo->completion_ops->error_cleanup(NFS_I(inode)); return -ENOMEM; } } nreq += pnfs_generic_alloc_ds_commits(cinfo, &list); - if (nreq == 0) { - cinfo->completion_ops->error_cleanup(NFS_I(inode)); + if (nreq == 0) goto out; - } atomic_add(nreq, &cinfo->mds->rpcs_out); @@ -871,6 +868,11 @@ pnfs_layout_mark_request_commit(struct nfs_page *req, buckets = cinfo->ds->buckets; list = &buckets[ds_commit_idx].written; if (list_empty(list)) { + if (!pnfs_is_valid_lseg(lseg)) { + spin_unlock(cinfo->lock); + cinfo->completion_ops->resched_write(cinfo, req); + return; + } /* Non-empty buckets hold a reference on the lseg. That ref * is normally transferred to the COMMIT call and released * there. It could also be released if the last req is pulled diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 0a5e33f33..eb31e23e7 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -85,6 +85,23 @@ void nfs_pageio_reset_read_mds(struct nfs_pageio_descriptor *pgio) } EXPORT_SYMBOL_GPL(nfs_pageio_reset_read_mds); +static void nfs_readpage_release(struct nfs_page *req) +{ + struct inode *inode = d_inode(req->wb_context->dentry); + + dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id, + (unsigned long long)NFS_FILEID(inode), req->wb_bytes, + (long long)req_offset(req)); + + if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) { + if (PageUptodate(req->wb_page)) + nfs_readpage_to_fscache(inode, req->wb_page, 0); + + unlock_page(req->wb_page); + } + nfs_release_request(req); +} + int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, struct page *page) { @@ -106,7 +123,10 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, nfs_pageio_init_read(&pgio, inode, false, &nfs_async_read_completion_ops); - nfs_pageio_add_request(&pgio, new); + if (!nfs_pageio_add_request(&pgio, new)) { + nfs_list_remove_request(new); + nfs_readpage_release(new); + } nfs_pageio_complete(&pgio); /* It doesn't make sense to do mirrored reads! */ @@ -115,24 +135,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, pgm = &pgio.pg_mirrors[0]; NFS_I(inode)->read_io += pgm->pg_bytes_written; - return 0; -} - -static void nfs_readpage_release(struct nfs_page *req) -{ - struct inode *inode = d_inode(req->wb_context->dentry); - - dprintk("NFS: read done (%s/%llu %d@%lld)\n", inode->i_sb->s_id, - (unsigned long long)NFS_FILEID(inode), req->wb_bytes, - (long long)req_offset(req)); - - if (nfs_page_group_sync_on_bit(req, PG_UNLOCKPAGE)) { - if (PageUptodate(req->wb_page)) - nfs_readpage_to_fscache(inode, req->wb_page, 0); - - unlock_page(req->wb_page); - } - nfs_release_request(req); + return pgio.pg_error < 0 ? pgio.pg_error : 0; } static void nfs_page_group_set_uptodate(struct nfs_page *req) @@ -361,6 +364,8 @@ readpage_async_filler(void *data, struct page *page) if (len < PAGE_CACHE_SIZE) zero_user_segment(page, len, PAGE_CACHE_SIZE); if (!nfs_pageio_add_request(desc->pgio, new)) { + nfs_list_remove_request(new); + nfs_readpage_release(new); error = desc->pgio->pg_error; goto out_unlock; } diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c index b6de433da..4fe3eead3 100644 --- a/fs/nfs/symlink.c +++ b/fs/nfs/symlink.c @@ -42,21 +42,35 @@ error: return -EIO; } -static const char *nfs_follow_link(struct dentry *dentry, void **cookie) +static const char *nfs_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { - struct inode *inode = d_inode(dentry); struct page *page; void *err; - err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping)); - if (err) - return err; - page = read_cache_page(&inode->i_data, 0, - (filler_t *)nfs_symlink_filler, inode); - if (IS_ERR(page)) - return ERR_CAST(page); - *cookie = page; - return kmap(page); + if (!dentry) { + err = ERR_PTR(nfs_revalidate_mapping_rcu(inode)); + if (err) + return err; + page = find_get_page(inode->i_mapping, 0); + if (!page) + return ERR_PTR(-ECHILD); + if (!PageUptodate(page)) { + put_page(page); + return ERR_PTR(-ECHILD); + } + } else { + err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping)); + if (err) + return err; + page = read_cache_page(&inode->i_data, 0, + (filler_t *)nfs_symlink_filler, inode); + if (IS_ERR(page)) + return ERR_CAST(page); + } + set_delayed_call(done, page_put_link, page); + return page_address(page); } /* @@ -64,8 +78,7 @@ static const char *nfs_follow_link(struct dentry *dentry, void **cookie) */ const struct inode_operations nfs_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = nfs_follow_link, - .put_link = page_put_link, + .get_link = nfs_get_link, .getattr = nfs_getattr, .setattr = nfs_setattr, }; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 7b9316406..5754835a2 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -21,6 +21,8 @@ #include <linux/nfs_page.h> #include <linux/backing-dev.h> #include <linux/export.h> +#include <linux/freezer.h> +#include <linux/wait.h> #include <asm/uaccess.h> @@ -244,11 +246,9 @@ static int wb_priority(struct writeback_control *wbc) { int ret = 0; if (wbc->for_reclaim) - return FLUSH_HIGHPRI | FLUSH_STABLE; + return FLUSH_HIGHPRI | FLUSH_COND_STABLE; if (wbc->sync_mode == WB_SYNC_ALL) ret = FLUSH_COND_STABLE; - if (wbc->for_kupdate || wbc->for_background) - ret |= FLUSH_LOWPRI; return ret; } @@ -545,12 +545,22 @@ try_again: return head; } +static void nfs_write_error_remove_page(struct nfs_page *req) +{ + nfs_unlock_request(req); + nfs_end_page_writeback(req); + nfs_release_request(req); + generic_error_remove_page(page_file_mapping(req->wb_page), + req->wb_page); +} + /* * Find an associated nfs write request, and prepare to flush it out * May return an error if the user signalled nfs_wait_on_request(). */ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, - struct page *page, bool nonblock) + struct page *page, bool nonblock, + bool launder) { struct nfs_page *req; int ret = 0; @@ -567,8 +577,21 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, ret = 0; if (!nfs_pageio_add_request(pgio, req)) { - nfs_redirty_request(req); ret = pgio->pg_error; + /* + * Remove the problematic req upon fatal errors + * in launder case, while other dirty pages can + * still be around until they get flushed. + */ + if (nfs_error_is_fatal(ret)) { + nfs_context_set_write_error(req->wb_context, ret); + if (launder) { + nfs_write_error_remove_page(req); + goto out; + } + } + nfs_redirty_request(req); + ret = -EAGAIN; } else nfs_add_stats(page_file_mapping(page)->host, NFSIOS_WRITEPAGES, 1); @@ -576,12 +599,14 @@ out: return ret; } -static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio) +static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, + struct nfs_pageio_descriptor *pgio, bool launder) { int ret; nfs_pageio_cond_complete(pgio, page_file_index(page)); - ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE); + ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE, + launder); if (ret == -EAGAIN) { redirty_page_for_writepage(wbc, page); ret = 0; @@ -592,7 +617,9 @@ static int nfs_do_writepage(struct page *page, struct writeback_control *wbc, st /* * Write an mmapped page to the server. */ -static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc) +static int nfs_writepage_locked(struct page *page, + struct writeback_control *wbc, + bool launder) { struct nfs_pageio_descriptor pgio; struct inode *inode = page_file_mapping(page)->host; @@ -601,7 +628,7 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); nfs_pageio_init_write(&pgio, inode, wb_priority(wbc), false, &nfs_async_write_completion_ops); - err = nfs_do_writepage(page, wbc, &pgio); + err = nfs_do_writepage(page, wbc, &pgio, launder); nfs_pageio_complete(&pgio); if (err < 0) return err; @@ -614,7 +641,7 @@ int nfs_writepage(struct page *page, struct writeback_control *wbc) { int ret; - ret = nfs_writepage_locked(page, wbc); + ret = nfs_writepage_locked(page, wbc, false); unlock_page(page); return ret; } @@ -623,7 +650,7 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control * { int ret; - ret = nfs_do_writepage(page, wbc, data); + ret = nfs_do_writepage(page, wbc, data, false); unlock_page(page); return ret; } @@ -803,11 +830,10 @@ EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked); * holding the nfs_page lock. */ void -nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst, - struct nfs_commit_info *cinfo) +nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo) { spin_lock(cinfo->lock); - nfs_request_add_commit_list_locked(req, dst, cinfo); + nfs_request_add_commit_list_locked(req, &cinfo->mds->list, cinfo); spin_unlock(cinfo->lock); nfs_mark_page_unstable(req->wb_page, cinfo); } @@ -865,7 +891,7 @@ nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, { if (pnfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx)) return; - nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo); + nfs_request_add_commit_list(req, cinfo); } static void @@ -1128,7 +1154,8 @@ int nfs_flush_incompatible(struct file *file, struct page *page) if (req == NULL) return 0; l_ctx = req->wb_lock_context; - do_flush = req->wb_page != page || req->wb_context != ctx; + do_flush = req->wb_page != page || + !nfs_match_open_context(req->wb_context, ctx); /* for now, flush if more than 1 request in page_group */ do_flush |= req->wb_this_page != req; if (l_ctx && flctx && @@ -1326,9 +1353,15 @@ static void nfs_async_write_error(struct list_head *head) } } +static void nfs_async_write_reschedule_io(struct nfs_pgio_header *hdr) +{ + nfs_async_write_error(&hdr->pages); +} + static const struct nfs_pgio_completion_ops nfs_async_write_completion_ops = { .error_cleanup = nfs_async_write_error, .completion = nfs_write_completion, + .reschedule_io = nfs_async_write_reschedule_io, }; void nfs_pageio_init_write(struct nfs_pageio_descriptor *pgio, @@ -1529,27 +1562,21 @@ static void nfs_writeback_result(struct rpc_task *task, } } - -static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait) +static int wait_on_commit(struct nfs_mds_commit_info *cinfo) { - int ret; + return wait_on_atomic_t(&cinfo->rpcs_out, + nfs_wait_atomic_killable, TASK_KILLABLE); +} - if (!test_and_set_bit(NFS_INO_COMMIT, &nfsi->flags)) - return 1; - if (!may_wait) - return 0; - ret = out_of_line_wait_on_bit_lock(&nfsi->flags, - NFS_INO_COMMIT, - nfs_wait_bit_killable, - TASK_KILLABLE); - return (ret < 0) ? ret : 1; +static void nfs_commit_begin(struct nfs_mds_commit_info *cinfo) +{ + atomic_inc(&cinfo->rpcs_out); } -static void nfs_commit_clear_lock(struct nfs_inode *nfsi) +static void nfs_commit_end(struct nfs_mds_commit_info *cinfo) { - clear_bit(NFS_INO_COMMIT, &nfsi->flags); - smp_mb__after_atomic(); - wake_up_bit(&nfsi->flags, NFS_INO_COMMIT); + if (atomic_dec_and_test(&cinfo->rpcs_out)) + wake_up_atomic_t(&cinfo->rpcs_out); } void nfs_commitdata_release(struct nfs_commit_data *data) @@ -1666,6 +1693,13 @@ void nfs_retry_commit(struct list_head *page_list, } EXPORT_SYMBOL_GPL(nfs_retry_commit); +static void +nfs_commit_resched_write(struct nfs_commit_info *cinfo, + struct nfs_page *req) +{ + __set_page_dirty_nobuffers(req->wb_page); +} + /* * Commit dirty pages */ @@ -1687,7 +1721,6 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how, data->mds_ops, how, 0); out_bad: nfs_retry_commit(head, NULL, cinfo, 0); - cinfo->completion_ops->error_cleanup(NFS_I(inode)); return -ENOMEM; } @@ -1749,8 +1782,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); nfs_init_cinfo(&cinfo, data->inode, data->dreq); - if (atomic_dec_and_test(&cinfo.mds->rpcs_out)) - nfs_commit_clear_lock(NFS_I(data->inode)); + nfs_commit_end(cinfo.mds); } static void nfs_commit_release(void *calldata) @@ -1769,7 +1801,7 @@ static const struct rpc_call_ops nfs_commit_ops = { static const struct nfs_commit_completion_ops nfs_commit_completion_ops = { .completion = nfs_commit_release_pages, - .error_cleanup = nfs_commit_clear_lock, + .resched_write = nfs_commit_resched_write, }; int nfs_generic_commit_list(struct inode *inode, struct list_head *head, @@ -1788,30 +1820,25 @@ int nfs_commit_inode(struct inode *inode, int how) LIST_HEAD(head); struct nfs_commit_info cinfo; int may_wait = how & FLUSH_SYNC; + int error = 0; int res; - res = nfs_commit_set_lock(NFS_I(inode), may_wait); - if (res <= 0) - goto out_mark_dirty; nfs_init_cinfo_from_inode(&cinfo, inode); + nfs_commit_begin(cinfo.mds); res = nfs_scan_commit(inode, &head, &cinfo); - if (res) { - int error; - + if (res) error = nfs_generic_commit_list(inode, &head, how, &cinfo); - if (error < 0) - return error; - if (!may_wait) - goto out_mark_dirty; - error = wait_on_bit_action(&NFS_I(inode)->flags, - NFS_INO_COMMIT, - nfs_wait_bit_killable, - TASK_KILLABLE); - if (error < 0) - return error; - } else - nfs_commit_clear_lock(NFS_I(inode)); + nfs_commit_end(cinfo.mds); + if (error < 0) + goto out_error; + if (!may_wait) + goto out_mark_dirty; + error = wait_on_commit(cinfo.mds); + if (error < 0) + return error; return res; +out_error: + res = error; /* Note: If we exit without ensuring that the commit is complete, * we must mark the inode as dirty. Otherwise, future calls to * sync_inode() with the WB_SYNC_ALL flag set will fail to ensure @@ -1821,6 +1848,7 @@ out_mark_dirty: __mark_inode_dirty(inode, I_DIRTY_DATASYNC); return res; } +EXPORT_SYMBOL_GPL(nfs_commit_inode); int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) { @@ -1911,7 +1939,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) /* * Write back all requests on one page - we do this before reading it. */ -int nfs_wb_page(struct inode *inode, struct page *page) +int nfs_wb_single_page(struct inode *inode, struct page *page, bool launder) { loff_t range_start = page_file_offset(page); loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1); @@ -1928,7 +1956,7 @@ int nfs_wb_page(struct inode *inode, struct page *page) for (;;) { wait_on_page_writeback(page); if (clear_page_dirty_for_io(page)) { - ret = nfs_writepage_locked(page, &wbc); + ret = nfs_writepage_locked(page, &wbc, launder); if (ret < 0) goto out_error; continue; diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c index 77e7a5cca..1a03bc305 100644 --- a/fs/nfsd/lockd.c +++ b/fs/nfsd/lockd.c @@ -58,7 +58,7 @@ nlm_fclose(struct file *filp) fput(filp); } -static struct nlmsvc_binding nfsd_nlm_ops = { +static const struct nlmsvc_binding nfsd_nlm_ops = { .fopen = nlm_fopen, /* open file for locking */ .fclose = nlm_fclose, /* close file */ }; diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index d8b16c256..5fbf3bbd0 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -92,7 +92,7 @@ struct nfsd_net { struct file *rec_file; bool in_grace; - struct nfsd4_client_tracking_ops *client_tracking_ops; + const struct nfsd4_client_tracking_ops *client_tracking_ops; time_t nfsd4_lease; time_t nfsd4_grace; diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index 00575d776..2246454de 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -823,7 +823,7 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp, } else dchild = dget(dparent); } else - dchild = lookup_one_len(name, dparent, namlen); + dchild = lookup_one_len_unlocked(name, dparent, namlen); if (IS_ERR(dchild)) return rv; if (d_mountpoint(dchild)) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index e7f50c408..7389cb1d7 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -792,12 +792,16 @@ static void warn_no_callback_path(struct nfs4_client *clp, int reason) static void nfsd4_mark_cb_down(struct nfs4_client *clp, int reason) { + if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags)) + return; clp->cl_cb_state = NFSD4_CB_DOWN; warn_no_callback_path(clp, reason); } static void nfsd4_mark_cb_fault(struct nfs4_client *clp, int reason) { + if (test_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags)) + return; clp->cl_cb_state = NFSD4_CB_FAULT; warn_no_callback_path(clp, reason); } @@ -1143,7 +1147,7 @@ nfsd4_run_cb_work(struct work_struct *work) } void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, - struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op) + const struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op) { cb->cb_clp = clp; cb->cb_msg.rpc_proc = &nfs4_cb_procedures[op]; diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index c9d6c715c..ce2d010d3 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -22,7 +22,7 @@ struct nfs4_layout { static struct kmem_cache *nfs4_layout_cache; static struct kmem_cache *nfs4_layout_stateid_cache; -static struct nfsd4_callback_ops nfsd4_cb_layout_ops; +static const struct nfsd4_callback_ops nfsd4_cb_layout_ops; static const struct lock_manager_operations nfsd4_layouts_lm_ops; const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] = { @@ -624,24 +624,39 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) { struct nfs4_layout_stateid *ls = container_of(cb, struct nfs4_layout_stateid, ls_recall); + struct nfsd_net *nn; + ktime_t now, cutoff; LIST_HEAD(reaplist); + switch (task->tk_status) { case 0: - return 1; + case -NFS4ERR_DELAY: + /* + * Anything left? If not, then call it done. Note that we don't + * take the spinlock since this is an optimization and nothing + * should get added until the cb counter goes to zero. + */ + if (list_empty(&ls->ls_layouts)) + return 1; + + /* Poll the client until it's done with the layout */ + now = ktime_get(); + nn = net_generic(ls->ls_stid.sc_client->net, nfsd_net_id); + + /* Client gets 2 lease periods to return it */ + cutoff = ktime_add_ns(task->tk_start, + nn->nfsd4_lease * NSEC_PER_SEC * 2); + + if (ktime_before(now, cutoff)) { + rpc_delay(task, HZ/100); /* 10 mili-seconds */ + return 0; + } + /* Fallthrough */ case -NFS4ERR_NOMATCHING_LAYOUT: trace_layout_recall_done(&ls->ls_stid.sc_stateid); task->tk_status = 0; return 1; - case -NFS4ERR_DELAY: - /* Poll the client until it's done with the layout */ - /* FIXME: cap number of retries. - * The pnfs standard states that we need to only expire - * the client after at-least "lease time" .eg lease-time * 2 - * when failing to communicate a recall - */ - rpc_delay(task, HZ/100); /* 10 mili-seconds */ - return 0; default: /* * Unknown error or non-responding client, we'll need to fence. @@ -665,7 +680,7 @@ nfsd4_cb_layout_release(struct nfsd4_callback *cb) nfs4_put_stid(&ls->ls_stid); } -static struct nfsd4_callback_ops nfsd4_cb_layout_ops = { +static const struct nfsd4_callback_ops nfsd4_cb_layout_ops = { .prepare = nfsd4_cb_layout_prepare, .done = nfsd4_cb_layout_done, .release = nfsd4_cb_layout_release, diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index a9f096c7e..4cba7865f 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -55,10 +55,10 @@ nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u struct inode *inode = d_inode(resfh->fh_dentry); int status; - mutex_lock(&inode->i_mutex); + inode_lock(inode); status = security_inode_setsecctx(resfh->fh_dentry, label->data, label->len); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (status) /* @@ -774,8 +774,9 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, clear_bit(RQ_SPLICE_OK, &rqstp->rq_flags); /* check stateid */ - status = nfs4_preprocess_stateid_op(rqstp, cstate, &read->rd_stateid, - RD_STATE, &read->rd_filp, &read->rd_tmp_file); + status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, + &read->rd_stateid, RD_STATE, + &read->rd_filp, &read->rd_tmp_file); if (status) { dprintk("NFSD: nfsd4_read: couldn't process stateid!\n"); goto out; @@ -921,7 +922,8 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (setattr->sa_iattr.ia_valid & ATTR_SIZE) { status = nfs4_preprocess_stateid_op(rqstp, cstate, - &setattr->sa_stateid, WR_STATE, NULL, NULL); + &cstate->current_fh, &setattr->sa_stateid, + WR_STATE, NULL, NULL); if (status) { dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n"); return status; @@ -985,8 +987,8 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (write->wr_offset >= OFFSET_MAX) return nfserr_inval; - status = nfs4_preprocess_stateid_op(rqstp, cstate, stateid, WR_STATE, - &filp, NULL); + status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, + stateid, WR_STATE, &filp, NULL); if (status) { dprintk("NFSD: nfsd4_write: couldn't process stateid!\n"); return status; @@ -1010,13 +1012,54 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } static __be32 +nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_clone *clone) +{ + struct file *src, *dst; + __be32 status; + + status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->save_fh, + &clone->cl_src_stateid, RD_STATE, + &src, NULL); + if (status) { + dprintk("NFSD: %s: couldn't process src stateid!\n", __func__); + goto out; + } + + status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, + &clone->cl_dst_stateid, WR_STATE, + &dst, NULL); + if (status) { + dprintk("NFSD: %s: couldn't process dst stateid!\n", __func__); + goto out_put_src; + } + + /* fix up for NFS-specific error code */ + if (!S_ISREG(file_inode(src)->i_mode) || + !S_ISREG(file_inode(dst)->i_mode)) { + status = nfserr_wrong_type; + goto out_put_dst; + } + + status = nfsd4_clone_file_range(src, clone->cl_src_pos, + dst, clone->cl_dst_pos, clone->cl_count); + +out_put_dst: + fput(dst); +out_put_src: + fput(src); +out: + return status; +} + +static __be32 nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_fallocate *fallocate, int flags) { __be32 status = nfserr_notsupp; struct file *file; - status = nfs4_preprocess_stateid_op(rqstp, cstate, + status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, &fallocate->falloc_stateid, WR_STATE, &file, NULL); if (status != nfs_ok) { @@ -1055,7 +1098,7 @@ nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, __be32 status; struct file *file; - status = nfs4_preprocess_stateid_op(rqstp, cstate, + status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, &seek->seek_stateid, RD_STATE, &file, NULL); if (status) { @@ -2279,6 +2322,12 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_name = "OP_DEALLOCATE", .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, + [OP_CLONE] = { + .op_func = (nfsd4op_func)nfsd4_clone, + .op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME, + .op_name = "OP_CLONE", + .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, + }, [OP_SEEK] = { .op_func = (nfsd4op_func)nfsd4_seek, .op_name = "OP_SEEK", diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index e3d47091b..dc8ebecf5 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -192,7 +192,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) dir = nn->rec_file->f_path.dentry; /* lock the parent */ - mutex_lock(&d_inode(dir)->i_mutex); + inode_lock(d_inode(dir)); dentry = lookup_one_len(dname, dir, HEXDIR_LEN-1); if (IS_ERR(dentry)) { @@ -213,7 +213,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) out_put: dput(dentry); out_unlock: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); if (status == 0) { if (nn->in_grace) { crp = nfs4_client_to_reclaim(dname, nn); @@ -286,7 +286,7 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn) } status = iterate_dir(nn->rec_file, &ctx.ctx); - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); list_for_each_entry_safe(entry, tmp, &ctx.names, list) { if (!status) { @@ -302,7 +302,7 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn) list_del(&entry->list); kfree(entry); } - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); nfs4_reset_creds(original_cred); list_for_each_entry_safe(entry, tmp, &ctx.names, list) { @@ -322,7 +322,7 @@ nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn) dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name); dir = nn->rec_file->f_path.dentry; - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); dentry = lookup_one_len(name, dir, namlen); if (IS_ERR(dentry)) { status = PTR_ERR(dentry); @@ -335,7 +335,7 @@ nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn) out: dput(dentry); out_unlock: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); return status; } @@ -631,7 +631,7 @@ nfsd4_check_legacy_client(struct nfs4_client *clp) return -ENOENT; } -static struct nfsd4_client_tracking_ops nfsd4_legacy_tracking_ops = { +static const struct nfsd4_client_tracking_ops nfsd4_legacy_tracking_ops = { .init = nfsd4_legacy_tracking_init, .exit = nfsd4_legacy_tracking_exit, .create = nfsd4_create_clid_dir, @@ -1050,7 +1050,7 @@ out_err: printk(KERN_ERR "NFSD: Unable to end grace period: %d\n", ret); } -static struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = { +static const struct nfsd4_client_tracking_ops nfsd4_cld_tracking_ops = { .init = nfsd4_init_cld_pipe, .exit = nfsd4_remove_cld_pipe, .create = nfsd4_cld_create, @@ -1394,7 +1394,7 @@ nfsd4_umh_cltrack_grace_done(struct nfsd_net *nn) kfree(legacy); } -static struct nfsd4_client_tracking_ops nfsd4_umh_tracking_ops = { +static const struct nfsd4_client_tracking_ops nfsd4_umh_tracking_ops = { .init = nfsd4_umh_cltrack_init, .exit = NULL, .create = nfsd4_umh_cltrack_create, diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 6b800b5b8..c484a2b6c 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -98,7 +98,7 @@ static struct kmem_cache *odstate_slab; static void free_session(struct nfsd4_session *); -static struct nfsd4_callback_ops nfsd4_cb_recall_ops; +static const struct nfsd4_callback_ops nfsd4_cb_recall_ops; static bool is_session_dead(struct nfsd4_session *ses) { @@ -1857,15 +1857,28 @@ static void copy_clid(struct nfs4_client *target, struct nfs4_client *source) target->cl_clientid.cl_id = source->cl_clientid.cl_id; } -static int copy_cred(struct svc_cred *target, struct svc_cred *source) +int strdup_if_nonnull(char **target, char *source) { - if (source->cr_principal) { - target->cr_principal = - kstrdup(source->cr_principal, GFP_KERNEL); - if (target->cr_principal == NULL) + if (source) { + *target = kstrdup(source, GFP_KERNEL); + if (!*target) return -ENOMEM; } else - target->cr_principal = NULL; + *target = NULL; + return 0; +} + +static int copy_cred(struct svc_cred *target, struct svc_cred *source) +{ + int ret; + + ret = strdup_if_nonnull(&target->cr_principal, source->cr_principal); + if (ret) + return ret; + ret = strdup_if_nonnull(&target->cr_raw_principal, + source->cr_raw_principal); + if (ret) + return ret; target->cr_flavor = source->cr_flavor; target->cr_uid = source->cr_uid; target->cr_gid = source->cr_gid; @@ -1969,6 +1982,9 @@ static bool mach_creds_match(struct nfs4_client *cl, struct svc_rqst *rqstp) return false; if (!svc_rqst_integrity_protected(rqstp)) return false; + if (cl->cl_cred.cr_raw_principal) + return 0 == strcmp(cl->cl_cred.cr_raw_principal, + cr->cr_raw_principal); if (!cr->cr_principal) return false; return 0 == strcmp(cl->cl_cred.cr_principal, cr->cr_principal); @@ -2240,7 +2256,8 @@ nfsd4_store_cache_entry(struct nfsd4_compoundres *resp) base = resp->cstate.data_offset; slot->sl_datalen = buf->len - base; if (read_bytes_from_xdr_buf(buf, base, slot->sl_data, slot->sl_datalen)) - WARN("%s: sessions DRC could not cache compound\n", __func__); + WARN(1, "%s: sessions DRC could not cache compound\n", + __func__); return; } @@ -2365,10 +2382,27 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, if (exid->flags & ~EXCHGID4_FLAG_MASK_A) return nfserr_inval; + new = create_client(exid->clname, rqstp, &verf); + if (new == NULL) + return nfserr_jukebox; + switch (exid->spa_how) { case SP4_MACH_CRED: - if (!svc_rqst_integrity_protected(rqstp)) - return nfserr_inval; + if (!svc_rqst_integrity_protected(rqstp)) { + status = nfserr_inval; + goto out_nolock; + } + /* + * Sometimes userspace doesn't give us a principal. + * Which is a bug, really. Anyway, we can't enforce + * MACH_CRED in that case, better to give up now: + */ + if (!new->cl_cred.cr_principal && + !new->cl_cred.cr_raw_principal) { + status = nfserr_serverfault; + goto out_nolock; + } + new->cl_mach_cred = true; case SP4_NONE: break; default: /* checked by xdr code */ @@ -2377,10 +2411,6 @@ nfsd4_exchange_id(struct svc_rqst *rqstp, return nfserr_encr_alg_unsupp; } - new = create_client(exid->clname, rqstp, &verf); - if (new == NULL) - return nfserr_jukebox; - /* Cases below refer to rfc 5661 section 18.35.4: */ spin_lock(&nn->client_lock); conf = find_confirmed_client_by_name(&exid->clname, nn); @@ -2442,7 +2472,6 @@ out_new: goto out; } new->cl_minorversion = cstate->minorversion; - new->cl_mach_cred = (exid->spa_how == SP4_MACH_CRED); gen_clid(new, nn); add_to_unconfirmed(new); @@ -2460,6 +2489,7 @@ out_copy: out: spin_unlock(&nn->client_lock); +out_nolock: if (new) expire_client(new); if (unconf) @@ -3648,7 +3678,7 @@ static void nfsd4_cb_recall_release(struct nfsd4_callback *cb) nfs4_put_stid(&dp->dl_stid); } -static struct nfsd4_callback_ops nfsd4_cb_recall_ops = { +static const struct nfsd4_callback_ops nfsd4_cb_recall_ops = { .prepare = nfsd4_cb_recall_prepare, .done = nfsd4_cb_recall_done, .release = nfsd4_cb_recall_release, @@ -4541,8 +4571,7 @@ static void laundromat_main(struct work_struct *laundry) { time_t t; - struct delayed_work *dwork = container_of(laundry, struct delayed_work, - work); + struct delayed_work *dwork = to_delayed_work(laundry); struct nfsd_net *nn = container_of(dwork, struct nfsd_net, laundromat_work); @@ -4797,10 +4826,9 @@ nfs4_check_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfs4_stid *s, */ __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, - struct nfsd4_compound_state *cstate, stateid_t *stateid, - int flags, struct file **filpp, bool *tmp_file) + struct nfsd4_compound_state *cstate, struct svc_fh *fhp, + stateid_t *stateid, int flags, struct file **filpp, bool *tmp_file) { - struct svc_fh *fhp = &cstate->current_fh; struct inode *ino = d_inode(fhp->fh_dentry); struct net *net = SVC_NET(rqstp); struct nfsd_net *nn = net_generic(net, nfsd_net_id); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 51c9e9ca3..d6ef0955a 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1675,6 +1675,25 @@ nfsd4_decode_fallocate(struct nfsd4_compoundargs *argp, } static __be32 +nfsd4_decode_clone(struct nfsd4_compoundargs *argp, struct nfsd4_clone *clone) +{ + DECODE_HEAD; + + status = nfsd4_decode_stateid(argp, &clone->cl_src_stateid); + if (status) + return status; + status = nfsd4_decode_stateid(argp, &clone->cl_dst_stateid); + if (status) + return status; + + READ_BUF(8 + 8 + 8); + p = xdr_decode_hyper(p, &clone->cl_src_pos); + p = xdr_decode_hyper(p, &clone->cl_dst_pos); + p = xdr_decode_hyper(p, &clone->cl_count); + DECODE_TAIL; +} + +static __be32 nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek) { DECODE_HEAD; @@ -1785,6 +1804,7 @@ static nfsd4_dec nfsd4_dec_ops[] = { [OP_READ_PLUS] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_SEEK] = (nfsd4_dec)nfsd4_decode_seek, [OP_WRITE_SAME] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_CLONE] = (nfsd4_dec)nfsd4_decode_clone, }; static inline bool @@ -2838,14 +2858,14 @@ nfsd4_encode_dirent_fattr(struct xdr_stream *xdr, struct nfsd4_readdir *cd, __be32 nfserr; int ignore_crossmnt = 0; - dentry = lookup_one_len(name, cd->rd_fhp->fh_dentry, namlen); + dentry = lookup_one_len_unlocked(name, cd->rd_fhp->fh_dentry, namlen); if (IS_ERR(dentry)) return nfserrno(PTR_ERR(dentry)); if (d_really_is_negative(dentry)) { /* - * nfsd_buffered_readdir drops the i_mutex between - * readdir and calling this callback, leaving a window - * where this directory entry could have gone away. + * we're not holding the i_mutex here, so there's + * a window where this directory entry could have gone + * away. */ dput(dentry); return nfserr_noent; @@ -4292,6 +4312,7 @@ static nfsd4_enc nfsd4_enc_ops[] = { [OP_READ_PLUS] = (nfsd4_enc)nfsd4_encode_noop, [OP_SEEK] = (nfsd4_enc)nfsd4_encode_seek, [OP_WRITE_SAME] = (nfsd4_enc)nfsd4_encode_noop, + [OP_CLONE] = (nfsd4_enc)nfsd4_encode_noop, }; /* diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 2087bae17..f84fe6bf9 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -7,6 +7,7 @@ #ifndef _LINUX_NFSD_NFSFH_H #define _LINUX_NFSD_NFSFH_H +#include <linux/crc32.h> #include <linux/sunrpc/svc.h> #include <uapi/linux/nfsd/nfsfh.h> @@ -205,6 +206,28 @@ static inline bool fh_fsid_match(struct knfsd_fh *fh1, struct knfsd_fh *fh2) return true; } +#ifdef CONFIG_CRC32 +/** + * knfsd_fh_hash - calculate the crc32 hash for the filehandle + * @fh - pointer to filehandle + * + * returns a crc32 hash for the filehandle that is compatible with + * the one displayed by "wireshark". + */ + +static inline u32 +knfsd_fh_hash(struct knfsd_fh *fh) +{ + return ~crc32_le(0xFFFFFFFF, (unsigned char *)&fh->fh_base, fh->fh_size); +} +#else +static inline u32 +knfsd_fh_hash(struct knfsd_fh *fh) +{ + return 0; +} +#endif + #ifdef CONFIG_NFSD_V3 /* * The wcc data stored in current_fh should be cleared @@ -265,7 +288,7 @@ fh_lock_nested(struct svc_fh *fhp, unsigned int subclass) } inode = d_inode(dentry); - mutex_lock_nested(&inode->i_mutex, subclass); + inode_lock_nested(inode, subclass); fill_pre_wcc(fhp); fhp->fh_locked = true; } @@ -284,7 +307,7 @@ fh_unlock(struct svc_fh *fhp) { if (fhp->fh_locked) { fill_post_wcc(fhp); - mutex_unlock(&d_inode(fhp->fh_dentry)->i_mutex); + inode_unlock(d_inode(fhp->fh_dentry)); fhp->fh_locked = false; } } diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index ad4e2377d..45007acaf 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -14,9 +14,13 @@ #include <linux/sunrpc/stats.h> #include <linux/sunrpc/svcsock.h> +#include <linux/sunrpc/svc_xprt.h> #include <linux/lockd/bind.h> #include <linux/nfsacl.h> #include <linux/seq_file.h> +#include <linux/inetdevice.h> +#include <net/addrconf.h> +#include <net/ipv6.h> #include <net/net_namespace.h> #include "nfsd.h" #include "cache.h" @@ -306,22 +310,81 @@ static void nfsd_shutdown_net(struct net *net) nfsd_shutdown_generic(); } +static int nfsd_inetaddr_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; + struct net_device *dev = ifa->ifa_dev->dev; + struct net *net = dev_net(dev); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct sockaddr_in sin; + + if (event != NETDEV_DOWN) + goto out; + + if (nn->nfsd_serv) { + dprintk("nfsd_inetaddr_event: removed %pI4\n", &ifa->ifa_local); + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = ifa->ifa_local; + svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin); + } + +out: + return NOTIFY_DONE; +} + +static struct notifier_block nfsd_inetaddr_notifier = { + .notifier_call = nfsd_inetaddr_event, +}; + +#if IS_ENABLED(CONFIG_IPV6) +static int nfsd_inet6addr_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct inet6_ifaddr *ifa = (struct inet6_ifaddr *)ptr; + struct net_device *dev = ifa->idev->dev; + struct net *net = dev_net(dev); + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + struct sockaddr_in6 sin6; + + if (event != NETDEV_DOWN) + goto out; + + if (nn->nfsd_serv) { + dprintk("nfsd_inet6addr_event: removed %pI6\n", &ifa->addr); + sin6.sin6_family = AF_INET6; + sin6.sin6_addr = ifa->addr; + svc_age_temp_xprts_now(nn->nfsd_serv, (struct sockaddr *)&sin6); + } + +out: + return NOTIFY_DONE; +} + +static struct notifier_block nfsd_inet6addr_notifier = { + .notifier_call = nfsd_inet6addr_event, +}; +#endif + static void nfsd_last_thread(struct svc_serv *serv, struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); + unregister_inetaddr_notifier(&nfsd_inetaddr_notifier); +#if IS_ENABLED(CONFIG_IPV6) + unregister_inet6addr_notifier(&nfsd_inet6addr_notifier); +#endif /* * write_ports can create the server without actually starting * any threads--if we get shut down before any threads are * started, then nfsd_last_thread will be run before any of this - * other initialization has been done. + * other initialization has been done except the rpcb information. */ + svc_rpcb_cleanup(serv, net); if (!nn->nfsd_net_up) return; - nfsd_shutdown_net(net); - - svc_rpcb_cleanup(serv, net); + nfsd_shutdown_net(net); printk(KERN_WARNING "nfsd: last server has exited, flushing export " "cache\n"); nfsd_export_flush(net); @@ -425,6 +488,10 @@ int nfsd_create_serv(struct net *net) } set_max_drc(); + register_inetaddr_notifier(&nfsd_inetaddr_notifier); +#if IS_ENABLED(CONFIG_IPV6) + register_inet6addr_notifier(&nfsd_inet6addr_notifier); +#endif do_gettimeofday(&nn->nfssvc_boot); /* record boot time */ return 0; } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 77fdf4de9..c050c5303 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -65,7 +65,7 @@ struct nfsd4_callback { struct nfs4_client *cb_clp; u32 cb_minorversion; struct rpc_message cb_msg; - struct nfsd4_callback_ops *cb_ops; + const struct nfsd4_callback_ops *cb_ops; struct work_struct cb_work; int cb_seq_status; int cb_status; @@ -578,8 +578,8 @@ struct nfsd4_compound_state; struct nfsd_net; extern __be32 nfs4_preprocess_stateid_op(struct svc_rqst *rqstp, - struct nfsd4_compound_state *cstate, stateid_t *stateid, - int flags, struct file **filp, bool *tmp_file); + struct nfsd4_compound_state *cstate, struct svc_fh *fhp, + stateid_t *stateid, int flags, struct file **filp, bool *tmp_file); __be32 nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, stateid_t *stateid, unsigned char typemask, struct nfs4_stid **s, struct nfsd_net *nn); @@ -599,7 +599,7 @@ extern void nfsd4_probe_callback(struct nfs4_client *clp); extern void nfsd4_probe_callback_sync(struct nfs4_client *clp); extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); extern void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, - struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op); + const struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op); extern void nfsd4_run_cb(struct nfsd4_callback *cb); extern int nfsd4_create_callback_queue(void); extern void nfsd4_destroy_callback_queue(void); diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index 0befe7627..328704190 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -8,6 +8,47 @@ #define _NFSD_TRACE_H #include <linux/tracepoint.h> +#include "nfsfh.h" + +DECLARE_EVENT_CLASS(nfsd_io_class, + TP_PROTO(struct svc_rqst *rqstp, + struct svc_fh *fhp, + loff_t offset, + int len), + TP_ARGS(rqstp, fhp, offset, len), + TP_STRUCT__entry( + __field(__be32, xid) + __field_struct(struct knfsd_fh, fh) + __field(loff_t, offset) + __field(int, len) + ), + TP_fast_assign( + __entry->xid = rqstp->rq_xid, + fh_copy_shallow(&__entry->fh, &fhp->fh_handle); + __entry->offset = offset; + __entry->len = len; + ), + TP_printk("xid=0x%x fh=0x%x offset=%lld len=%d", + __be32_to_cpu(__entry->xid), knfsd_fh_hash(&__entry->fh), + __entry->offset, __entry->len) +) + +#define DEFINE_NFSD_IO_EVENT(name) \ +DEFINE_EVENT(nfsd_io_class, name, \ + TP_PROTO(struct svc_rqst *rqstp, \ + struct svc_fh *fhp, \ + loff_t offset, \ + int len), \ + TP_ARGS(rqstp, fhp, offset, len)) + +DEFINE_NFSD_IO_EVENT(read_start); +DEFINE_NFSD_IO_EVENT(read_opened); +DEFINE_NFSD_IO_EVENT(read_io_done); +DEFINE_NFSD_IO_EVENT(read_done); +DEFINE_NFSD_IO_EVENT(write_start); +DEFINE_NFSD_IO_EVENT(write_opened); +DEFINE_NFSD_IO_EVENT(write_io_done); +DEFINE_NFSD_IO_EVENT(write_done); #include "state.h" diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 994d66fbb..5d2a57e4c 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -36,12 +36,14 @@ #endif /* CONFIG_NFSD_V3 */ #ifdef CONFIG_NFSD_V4 +#include "../internal.h" #include "acl.h" #include "idmap.h" #endif /* CONFIG_NFSD_V4 */ #include "nfsd.h" #include "vfs.h" +#include "trace.h" #define NFSDDBG_FACILITY NFSDDBG_FILEOP @@ -217,10 +219,16 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, host_err = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out_nfserr; - /* - * check if we have crossed a mount point ... - */ if (nfsd_mountpoint(dentry, exp)) { + /* + * We don't need the i_mutex after all. It's + * still possible we could open this (regular + * files can be mountpoints too), but the + * i_mutex is just there to prevent renames of + * something that we might be about to delegate, + * and a mountpoint won't be renamed: + */ + fh_unlock(fhp); if ((host_err = nfsd_cross_mnt(rqstp, &dentry, &exp))) { dput(dentry); goto out_nfserr; @@ -485,9 +493,9 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp, dentry = fhp->fh_dentry; - mutex_lock(&d_inode(dentry)->i_mutex); + inode_lock(d_inode(dentry)); host_error = security_inode_setsecctx(dentry, label->data, label->len); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); return nfserrno(host_error); } #else @@ -498,6 +506,13 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp, } #endif +__be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst, + u64 dst_pos, u64 count) +{ + return nfserrno(vfs_clone_file_range(src, src_pos, dst, dst_pos, + count)); +} + __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, loff_t len, int flags) @@ -983,16 +998,23 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct raparms *ra; __be32 err; + trace_read_start(rqstp, fhp, offset, vlen); err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); if (err) return err; ra = nfsd_init_raparms(file); + + trace_read_opened(rqstp, fhp, offset, vlen); err = nfsd_vfs_read(rqstp, file, offset, vec, vlen, count); + trace_read_io_done(rqstp, fhp, offset, vlen); + if (ra) nfsd_put_raparams(file, ra); fput(file); + trace_read_done(rqstp, fhp, offset, vlen); + return err; } @@ -1008,24 +1030,31 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, { __be32 err = 0; + trace_write_start(rqstp, fhp, offset, vlen); + if (file) { err = nfsd_permission(rqstp, fhp->fh_export, fhp->fh_dentry, NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE); if (err) goto out; + trace_write_opened(rqstp, fhp, offset, vlen); err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, stablep); + trace_write_io_done(rqstp, fhp, offset, vlen); } else { err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_WRITE, &file); if (err) goto out; + trace_write_opened(rqstp, fhp, offset, vlen); if (cnt) err = nfsd_vfs_write(rqstp, fhp, file, offset, vec, vlen, cnt, stablep); + trace_write_io_done(rqstp, fhp, offset, vlen); fput(file); } out: + trace_write_done(rqstp, fhp, offset, vlen); return err; } @@ -1809,7 +1838,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func, offset = *offsetp; while (1) { - struct inode *dir_inode = file_inode(file); unsigned int reclen; cdp->err = nfserr_eof; /* will be cleared on successful read */ @@ -1828,15 +1856,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func, if (!size) break; - /* - * Various filldir functions may end up calling back into - * lookup_one_len() and the file system's ->lookup() method. - * These expect i_mutex to be held, as it would within readdir. - */ - host_err = mutex_lock_killable(&dir_inode->i_mutex); - if (host_err) - break; - de = (struct buffered_dirent *)buf.dirent; while (size > 0) { offset = de->offset; @@ -1853,7 +1872,6 @@ static __be32 nfsd_buffered_readdir(struct file *file, nfsd_filldir_t func, size -= reclen; de = (struct buffered_dirent *)((char *)de + reclen); } - mutex_unlock(&dir_inode->i_mutex); if (size > 0) /* We bailed out early */ break; diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index fcfc48cbe..c11ba316f 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -56,6 +56,8 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *, struct svc_fh *, struct xdr_netobj *); __be32 nfsd4_vfs_fallocate(struct svc_rqst *, struct svc_fh *, struct file *, loff_t, loff_t, int); +__be32 nfsd4_clone_file_range(struct file *, u64, struct file *, + u64, u64); #endif /* CONFIG_NFSD_V4 */ __be32 nfsd_create(struct svc_rqst *, struct svc_fh *, char *name, int len, struct iattr *attrs, diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index ce7362c88..d9554813e 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -491,6 +491,15 @@ struct nfsd4_fallocate { u64 falloc_length; }; +struct nfsd4_clone { + /* request */ + stateid_t cl_src_stateid; + stateid_t cl_dst_stateid; + u64 cl_src_pos; + u64 cl_dst_pos; + u64 cl_count; +}; + struct nfsd4_seek { /* request */ stateid_t seek_stateid; @@ -555,6 +564,7 @@ struct nfsd4_op { /* NFSv4.2 */ struct nfsd4_fallocate allocate; struct nfsd4_fallocate deallocate; + struct nfsd4_clone clone; struct nfsd4_seek seek; } u; struct nfs4_replay * replay; diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index ac2f64943..21a1e2e0d 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -510,6 +510,7 @@ static int __nilfs_read_inode(struct super_block *sb, inode->i_mapping->a_ops = &nilfs_aops; } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &nilfs_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &nilfs_aops; } else { inode->i_op = &nilfs_special_inode_operations; @@ -1002,7 +1003,7 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); isize = i_size_read(inode); @@ -1112,6 +1113,6 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret == 1) ret = 0; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index aba43811d..e8fe24882 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -158,7 +158,7 @@ static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp, flags = nilfs_mask_flags(inode->i_mode, flags); - mutex_lock(&inode->i_mutex); + inode_lock(inode); oldflags = NILFS_I(inode)->i_flags; @@ -186,7 +186,7 @@ static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp, nilfs_mark_inode_dirty(inode); ret = nilfs_transaction_commit(inode->i_sb); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); mnt_drop_write_file(filp); return ret; } diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index c9a1a491a..7ccdb961e 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -161,6 +161,7 @@ static int nilfs_symlink(struct inode *dir, struct dentry *dentry, /* slow symlink */ inode->i_op = &nilfs_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &nilfs_aops; err = page_symlink(inode, symname, l); if (err) @@ -568,8 +569,7 @@ const struct inode_operations nilfs_special_inode_operations = { const struct inode_operations nilfs_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, + .get_link = page_get_link, .permission = nilfs_permission, }; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 354013ea2..7f5d3d9f1 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1316,13 +1316,11 @@ nilfs_mount(struct file_system_type *fs_type, int flags, } if (!s->s_root) { - char b[BDEVNAME_SIZE]; - - s_new = true; + s_new = true; /* New superblock instance created */ s->s_mode = mode; - strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id)); + snprintf(s->s_id, sizeof(s->s_id), "%pg", sd.bdev); sb_set_blocksize(s, block_size(sd.bdev)); err = nilfs_fill_super(s, data, flags & MS_SILENT ? 1 : 0); @@ -1418,7 +1416,8 @@ static int __init nilfs_init_cachep(void) { nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache", sizeof(struct nilfs_inode_info), 0, - SLAB_RECLAIM_ACCOUNT, nilfs_inode_init_once); + SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, + nilfs_inode_init_once); if (!nilfs_inode_cachep) goto fail; diff --git a/fs/notify/group.c b/fs/notify/group.c index 53e45b61d..d16b62cb2 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -22,7 +22,6 @@ #include <linux/srcu.h> #include <linux/rculist.h> #include <linux/wait.h> -#include <linux/module.h> #include <linux/fsnotify_backend.h> #include "fsnotify.h" @@ -73,7 +72,6 @@ void fsnotify_get_group(struct fsnotify_group *group) { atomic_inc(&group->refcnt); } -EXPORT_SYMBOL_GPL(fsnotify_get_group); /* * Drop a reference to a group. Free it if it's through. @@ -83,7 +81,6 @@ void fsnotify_put_group(struct fsnotify_group *group) if (atomic_dec_and_test(&group->refcnt)) fsnotify_final_destroy_group(group); } -EXPORT_SYMBOL_GPL(fsnotify_put_group); /* * Create a new fsnotify_group and hold a reference for the group returned. @@ -112,7 +109,6 @@ struct fsnotify_group *fsnotify_alloc_group(const struct fsnotify_ops *ops) return group; } -EXPORT_SYMBOL_GPL(fsnotify_alloc_group); int fsnotify_fasync(int fd, struct file *file, int on) { diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index e785fd954..741077dee 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -199,8 +199,7 @@ void fsnotify_unmount_inodes(struct super_block *sb) break; } spin_unlock(&next_i->i_lock); - next_i = list_entry(next_i->i_sb_list.next, - struct inode, i_sb_list); + next_i = list_next_entry(next_i, i_sb_list); } /* diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 8175f3cd4..7115c5d7d 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -91,10 +91,14 @@ #include <linux/fsnotify_backend.h> #include "fsnotify.h" +#define FSNOTIFY_REAPER_DELAY (1) /* 1 jiffy */ + struct srcu_struct fsnotify_mark_srcu; static DEFINE_SPINLOCK(destroy_lock); static LIST_HEAD(destroy_list); -static DECLARE_WAIT_QUEUE_HEAD(destroy_waitq); + +static void fsnotify_mark_destroy(struct work_struct *work); +static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy); void fsnotify_get_mark(struct fsnotify_mark *mark) { @@ -109,7 +113,6 @@ void fsnotify_put_mark(struct fsnotify_mark *mark) mark->free_mark(mark); } } -EXPORT_SYMBOL_GPL(fsnotify_put_mark); /* Calculate mask of events for a list of marks */ u32 fsnotify_recalc_mask(struct hlist_head *head) @@ -190,7 +193,8 @@ void fsnotify_free_mark(struct fsnotify_mark *mark) spin_lock(&destroy_lock); list_add(&mark->g_list, &destroy_list); spin_unlock(&destroy_lock); - wake_up(&destroy_waitq); + queue_delayed_work(system_unbound_wq, &reaper_work, + FSNOTIFY_REAPER_DELAY); /* * Some groups like to know that marks are being freed. This is a @@ -209,7 +213,6 @@ void fsnotify_destroy_mark(struct fsnotify_mark *mark, mutex_unlock(&group->mark_mutex); fsnotify_free_mark(mark); } -EXPORT_SYMBOL_GPL(fsnotify_destroy_mark); void fsnotify_destroy_marks(struct hlist_head *head, spinlock_t *lock) { @@ -390,11 +393,11 @@ err: spin_lock(&destroy_lock); list_add(&mark->g_list, &destroy_list); spin_unlock(&destroy_lock); - wake_up(&destroy_waitq); + queue_delayed_work(system_unbound_wq, &reaper_work, + FSNOTIFY_REAPER_DELAY); return ret; } -EXPORT_SYMBOL_GPL(fsnotify_add_mark); int fsnotify_add_mark(struct fsnotify_mark *mark, struct fsnotify_group *group, struct inode *inode, struct vfsmount *mnt, int allow_dups) @@ -495,41 +498,21 @@ void fsnotify_init_mark(struct fsnotify_mark *mark, atomic_set(&mark->refcnt, 1); mark->free_mark = free_mark; } -EXPORT_SYMBOL_GPL(fsnotify_init_mark); -static int fsnotify_mark_destroy(void *ignored) +static void fsnotify_mark_destroy(struct work_struct *work) { struct fsnotify_mark *mark, *next; struct list_head private_destroy_list; - for (;;) { - spin_lock(&destroy_lock); - /* exchange the list head */ - list_replace_init(&destroy_list, &private_destroy_list); - spin_unlock(&destroy_lock); - - synchronize_srcu(&fsnotify_mark_srcu); + spin_lock(&destroy_lock); + /* exchange the list head */ + list_replace_init(&destroy_list, &private_destroy_list); + spin_unlock(&destroy_lock); - list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) { - list_del_init(&mark->g_list); - fsnotify_put_mark(mark); - } + synchronize_srcu(&fsnotify_mark_srcu); - wait_event_interruptible(destroy_waitq, !list_empty(&destroy_list)); + list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) { + list_del_init(&mark->g_list); + fsnotify_put_mark(mark); } - - return 0; -} - -static int __init fsnotify_mark_init(void) -{ - struct task_struct *thread; - - thread = kthread_run(fsnotify_mark_destroy, NULL, - "fsnotify_mark"); - if (IS_ERR(thread)) - panic("unable to start fsnotify mark destruction thread."); - - return 0; } -device_initcall(fsnotify_mark_init); diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c index 9e38dafa3..b2eff5816 100644 --- a/fs/ntfs/dir.c +++ b/fs/ntfs/dir.c @@ -1509,7 +1509,7 @@ static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end, err = filemap_write_and_wait_range(vi->i_mapping, start, end); if (err) return err; - mutex_lock(&vi->i_mutex); + inode_lock(vi); BUG_ON(!S_ISDIR(vi->i_mode)); /* If the bitmap attribute inode is in memory sync it, too. */ @@ -1532,7 +1532,7 @@ static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end, else ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error " "%u.", datasync ? "data" : "", vi->i_ino, -ret); - mutex_unlock(&vi->i_mutex); + inode_unlock(vi); return ret; } diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 9d383e5ef..bed4d427d 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1944,14 +1944,14 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ssize_t written = 0; ssize_t err; - mutex_lock(&vi->i_mutex); + inode_lock(vi); /* We can write back this queue in page reclaim. */ current->backing_dev_info = inode_to_bdi(vi); err = ntfs_prepare_file_for_write(iocb, from); if (iov_iter_count(from) && !err) written = ntfs_perform_write(file, from, iocb->ki_pos); current->backing_dev_info = NULL; - mutex_unlock(&vi->i_mutex); + inode_unlock(vi); if (likely(written > 0)) { err = generic_write_sync(file, iocb->ki_pos, written); if (err < 0) @@ -1996,7 +1996,7 @@ static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end, err = filemap_write_and_wait_range(vi->i_mapping, start, end); if (err) return err; - mutex_lock(&vi->i_mutex); + inode_lock(vi); BUG_ON(S_ISDIR(vi->i_mode)); if (!datasync || !NInoNonResident(NTFS_I(vi))) @@ -2015,7 +2015,7 @@ static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end, else ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error " "%u.", datasync ? "data" : "", vi->i_ino, -ret); - mutex_unlock(&vi->i_mutex); + inode_unlock(vi); return ret; } diff --git a/fs/ntfs/quota.c b/fs/ntfs/quota.c index d80e3315c..9793e68ba 100644 --- a/fs/ntfs/quota.c +++ b/fs/ntfs/quota.c @@ -48,7 +48,7 @@ bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol) ntfs_error(vol->sb, "Quota inodes are not open."); return false; } - mutex_lock(&vol->quota_q_ino->i_mutex); + inode_lock(vol->quota_q_ino); ictx = ntfs_index_ctx_get(NTFS_I(vol->quota_q_ino)); if (!ictx) { ntfs_error(vol->sb, "Failed to get index context."); @@ -98,7 +98,7 @@ bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol) ntfs_index_entry_mark_dirty(ictx); set_done: ntfs_index_ctx_put(ictx); - mutex_unlock(&vol->quota_q_ino->i_mutex); + inode_unlock(vol->quota_q_ino); /* * We set the flag so we do not try to mark the quotas out of date * again on remount. @@ -110,7 +110,7 @@ done: err_out: if (ictx) ntfs_index_ctx_put(ictx); - mutex_unlock(&vol->quota_q_ino->i_mutex); + inode_unlock(vol->quota_q_ino); return false; } diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index d1a853585..1b38abdaa 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -1284,10 +1284,10 @@ static int check_windows_hibernation_status(ntfs_volume *vol) * Find the inode number for the hibernation file by looking up the * filename hiberfil.sys in the root directory. */ - mutex_lock(&vol->root_ino->i_mutex); + inode_lock(vol->root_ino); mref = ntfs_lookup_inode_by_name(NTFS_I(vol->root_ino), hiberfil, 12, &name); - mutex_unlock(&vol->root_ino->i_mutex); + inode_unlock(vol->root_ino); if (IS_ERR_MREF(mref)) { ret = MREF_ERR(mref); /* If the file does not exist, Windows is not hibernated. */ @@ -1377,10 +1377,10 @@ static bool load_and_init_quota(ntfs_volume *vol) * Find the inode number for the quota file by looking up the filename * $Quota in the extended system files directory $Extend. */ - mutex_lock(&vol->extend_ino->i_mutex); + inode_lock(vol->extend_ino); mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), Quota, 6, &name); - mutex_unlock(&vol->extend_ino->i_mutex); + inode_unlock(vol->extend_ino); if (IS_ERR_MREF(mref)) { /* * If the file does not exist, quotas are disabled and have @@ -1460,10 +1460,10 @@ static bool load_and_init_usnjrnl(ntfs_volume *vol) * Find the inode number for the transaction log file by looking up the * filename $UsnJrnl in the extended system files directory $Extend. */ - mutex_lock(&vol->extend_ino->i_mutex); + inode_lock(vol->extend_ino); mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), UsnJrnl, 8, &name); - mutex_unlock(&vol->extend_ino->i_mutex); + inode_unlock(vol->extend_ino); if (IS_ERR_MREF(mref)) { /* * If the file does not exist, transaction logging is disabled, @@ -3139,8 +3139,8 @@ static int __init init_ntfs_fs(void) ntfs_big_inode_cache = kmem_cache_create(ntfs_big_inode_cache_name, sizeof(big_ntfs_inode), 0, - SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, - ntfs_big_inode_init_once); + SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD| + SLAB_ACCOUNT, ntfs_big_inode_init_once); if (!ntfs_big_inode_cache) { pr_crit("Failed to create %s!\n", ntfs_big_inode_cache_name); goto big_inode_err_out; diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 86181d652..d002579c6 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -164,7 +164,7 @@ static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et, struct ocfs2_extent_rec *rec); static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et); static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et); -static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = { +static const struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = { .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk, .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk, .eo_update_clusters = ocfs2_dinode_update_clusters, @@ -286,7 +286,7 @@ static void ocfs2_xattr_value_update_clusters(struct ocfs2_extent_tree *et, le32_add_cpu(&vb->vb_xv->xr_clusters, clusters); } -static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = { +static const struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = { .eo_set_last_eb_blk = ocfs2_xattr_value_set_last_eb_blk, .eo_get_last_eb_blk = ocfs2_xattr_value_get_last_eb_blk, .eo_update_clusters = ocfs2_xattr_value_update_clusters, @@ -332,7 +332,7 @@ static void ocfs2_xattr_tree_update_clusters(struct ocfs2_extent_tree *et, le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters); } -static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = { +static const struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = { .eo_set_last_eb_blk = ocfs2_xattr_tree_set_last_eb_blk, .eo_get_last_eb_blk = ocfs2_xattr_tree_get_last_eb_blk, .eo_update_clusters = ocfs2_xattr_tree_update_clusters, @@ -379,7 +379,7 @@ static void ocfs2_dx_root_fill_root_el(struct ocfs2_extent_tree *et) et->et_root_el = &dx_root->dr_list; } -static struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = { +static const struct ocfs2_extent_tree_operations ocfs2_dx_root_et_ops = { .eo_set_last_eb_blk = ocfs2_dx_root_set_last_eb_blk, .eo_get_last_eb_blk = ocfs2_dx_root_get_last_eb_blk, .eo_update_clusters = ocfs2_dx_root_update_clusters, @@ -425,7 +425,7 @@ ocfs2_refcount_tree_extent_contig(struct ocfs2_extent_tree *et, return CONTIG_NONE; } -static struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = { +static const struct ocfs2_extent_tree_operations ocfs2_refcount_tree_et_ops = { .eo_set_last_eb_blk = ocfs2_refcount_tree_set_last_eb_blk, .eo_get_last_eb_blk = ocfs2_refcount_tree_get_last_eb_blk, .eo_update_clusters = ocfs2_refcount_tree_update_clusters, @@ -438,7 +438,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, struct buffer_head *bh, ocfs2_journal_access_func access, void *obj, - struct ocfs2_extent_tree_operations *ops) + const struct ocfs2_extent_tree_operations *ops) { et->et_ops = ops; et->et_root_bh = bh; @@ -5719,7 +5719,7 @@ int ocfs2_remove_btree_range(struct inode *inode, goto bail; } - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); if (ocfs2_truncate_log_needs_flush(osb)) { ret = __ocfs2_flush_truncate_log(osb); @@ -5776,7 +5776,7 @@ int ocfs2_remove_btree_range(struct inode *inode, out_commit: ocfs2_commit_trans(osb, handle); out: - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); bail: if (meta_ac) ocfs2_free_alloc_context(meta_ac); @@ -5832,7 +5832,7 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb, struct ocfs2_dinode *di; struct ocfs2_truncate_log *tl; - BUG_ON(mutex_trylock(&tl_inode->i_mutex)); + BUG_ON(inode_trylock(tl_inode)); start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk); @@ -5980,7 +5980,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) struct ocfs2_dinode *di; struct ocfs2_truncate_log *tl; - BUG_ON(mutex_trylock(&tl_inode->i_mutex)); + BUG_ON(inode_trylock(tl_inode)); di = (struct ocfs2_dinode *) tl_bh->b_data; @@ -6008,7 +6008,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) goto out; } - mutex_lock(&data_alloc_inode->i_mutex); + inode_lock(data_alloc_inode); status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1); if (status < 0) { @@ -6035,7 +6035,7 @@ out_unlock: ocfs2_inode_unlock(data_alloc_inode, 1); out_mutex: - mutex_unlock(&data_alloc_inode->i_mutex); + inode_unlock(data_alloc_inode); iput(data_alloc_inode); out: @@ -6047,9 +6047,9 @@ int ocfs2_flush_truncate_log(struct ocfs2_super *osb) int status; struct inode *tl_inode = osb->osb_tl_inode; - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); status = __ocfs2_flush_truncate_log(osb); - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); return status; } @@ -6174,8 +6174,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, } bail: - if (tl_inode) - iput(tl_inode); + iput(tl_inode); brelse(tl_bh); if (status < 0) { @@ -6209,7 +6208,7 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, (unsigned long long)le64_to_cpu(tl_copy->i_blkno), num_recs); - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); for(i = 0; i < num_recs; i++) { if (ocfs2_truncate_log_needs_flush(osb)) { status = __ocfs2_flush_truncate_log(osb); @@ -6240,7 +6239,7 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, } bail_up: - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); return status; } @@ -6347,7 +6346,7 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb, goto out; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = ocfs2_inode_lock(inode, &di_bh, 1); if (ret) { @@ -6396,7 +6395,7 @@ out_unlock: ocfs2_inode_unlock(inode, 1); brelse(di_bh); out_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); iput(inode); out: while(head) { @@ -6440,7 +6439,7 @@ static int ocfs2_free_cached_clusters(struct ocfs2_super *osb, handle_t *handle; int ret = 0; - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); while (head) { if (ocfs2_truncate_log_needs_flush(osb)) { @@ -6472,7 +6471,7 @@ static int ocfs2_free_cached_clusters(struct ocfs2_super *osb, } } - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); while (head) { /* Premature exit may have left some dangling items. */ @@ -7356,7 +7355,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) goto out; } - mutex_lock(&main_bm_inode->i_mutex); + inode_lock(main_bm_inode); ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0); if (ret < 0) { @@ -7423,7 +7422,7 @@ out_unlock: ocfs2_inode_unlock(main_bm_inode, 0); brelse(main_bm_bh); out_mutex: - mutex_unlock(&main_bm_inode->i_mutex); + inode_unlock(main_bm_inode); iput(main_bm_inode); out: return ret; diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index fb09b97db..f3dc1b0df 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -54,7 +54,7 @@ */ struct ocfs2_extent_tree_operations; struct ocfs2_extent_tree { - struct ocfs2_extent_tree_operations *et_ops; + const struct ocfs2_extent_tree_operations *et_ops; struct buffer_head *et_root_bh; struct ocfs2_extent_list *et_root_el; struct ocfs2_caching_info *et_ci; diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index e6795c7c7..cda0361e9 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2047,9 +2047,9 @@ static int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb, int ret = 0; unsigned int truncated_clusters; - mutex_lock(&osb->osb_tl_inode->i_mutex); + inode_lock(osb->osb_tl_inode); truncated_clusters = osb->truncated_clusters; - mutex_unlock(&osb->osb_tl_inode->i_mutex); + inode_unlock(osb->osb_tl_inode); /* * Check whether we can succeed in allocating if we free diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 709fbbd44..a76b9ea77 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1254,15 +1254,15 @@ static const struct file_operations o2hb_debug_fops = { void o2hb_exit(void) { - kfree(o2hb_db_livenodes); - kfree(o2hb_db_liveregions); - kfree(o2hb_db_quorumregions); - kfree(o2hb_db_failedregions); debugfs_remove(o2hb_debug_failedregions); debugfs_remove(o2hb_debug_quorumregions); debugfs_remove(o2hb_debug_liveregions); debugfs_remove(o2hb_debug_livenodes); debugfs_remove(o2hb_debug_dir); + kfree(o2hb_db_livenodes); + kfree(o2hb_db_liveregions); + kfree(o2hb_db_quorumregions); + kfree(o2hb_db_failedregions); } static struct dentry *o2hb_debug_create(const char *name, struct dentry *dir, @@ -1438,13 +1438,15 @@ static void o2hb_region_release(struct config_item *item) kfree(reg->hr_slots); - kfree(reg->hr_db_regnum); - kfree(reg->hr_db_livenodes); debugfs_remove(reg->hr_debug_livenodes); debugfs_remove(reg->hr_debug_regnum); debugfs_remove(reg->hr_debug_elapsed_time); debugfs_remove(reg->hr_debug_pinned); debugfs_remove(reg->hr_debug_dir); + kfree(reg->hr_db_livenodes); + kfree(reg->hr_db_regnum); + kfree(reg->hr_debug_elapsed_time); + kfree(reg->hr_debug_pinned); spin_lock(&o2hb_live_lock); list_del(®->hr_all_item); @@ -1780,8 +1782,8 @@ static ssize_t o2hb_region_dev_store(struct config_item *item, } ++live_threshold; atomic_set(®->hr_steady_iterations, live_threshold); - /* unsteady_iterations is double the steady_iterations */ - atomic_set(®->hr_unsteady_iterations, (live_threshold << 1)); + /* unsteady_iterations is triple the steady_iterations */ + atomic_set(®->hr_unsteady_iterations, (live_threshold * 3)); hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s", reg->hr_item.ci_name); diff --git a/fs/ocfs2/cluster/nodemanager.c b/fs/ocfs2/cluster/nodemanager.c index 72afdca3c..ebe543894 100644 --- a/fs/ocfs2/cluster/nodemanager.c +++ b/fs/ocfs2/cluster/nodemanager.c @@ -757,7 +757,7 @@ int o2nm_depend_item(struct config_item *item) void o2nm_undepend_item(struct config_item *item) { - configfs_undepend_item(&o2nm_cluster_group.cs_subsys, item); + configfs_undepend_item(item); } int o2nm_depend_this_node(void) diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index ffecf89c8..e1adf285f 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -4361,7 +4361,7 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir, mlog_errno(ret); goto out; } - mutex_lock(&dx_alloc_inode->i_mutex); + inode_lock(dx_alloc_inode); ret = ocfs2_inode_lock(dx_alloc_inode, &dx_alloc_bh, 1); if (ret) { @@ -4410,7 +4410,7 @@ out_unlock: ocfs2_inode_unlock(dx_alloc_inode, 1); out_mutex: - mutex_unlock(&dx_alloc_inode->i_mutex); + inode_unlock(dx_alloc_inode); brelse(dx_alloc_bh); out: iput(dx_alloc_inode); diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index e88ccf8c8..68c607e63 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -376,17 +376,6 @@ struct dlm_lock lksb_kernel_allocated:1; }; - -#define DLM_LKSB_UNUSED1 0x01 -#define DLM_LKSB_PUT_LVB 0x02 -#define DLM_LKSB_GET_LVB 0x04 -#define DLM_LKSB_UNUSED2 0x08 -#define DLM_LKSB_UNUSED3 0x10 -#define DLM_LKSB_UNUSED4 0x20 -#define DLM_LKSB_UNUSED5 0x40 -#define DLM_LKSB_UNUSED6 0x80 - - enum dlm_lockres_list { DLM_GRANTED_LIST = 0, DLM_CONVERTING_LIST = 1, diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 4e2162b35..9477d6e1d 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -2388,8 +2388,8 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) spin_lock(&res->spinlock); BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); + __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); if (test_bit(node, res->refmap)) { - __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); dlm_lockres_clear_refmap_bit(dlm, res, node); cleared = 1; } @@ -2549,7 +2549,7 @@ static int dlm_migrate_lockres(struct dlm_ctxt *dlm, } fail: - if (oldmle) { + if (ret != -EEXIST && oldmle) { /* master is known, detach if not already detached */ dlm_mle_detach_hb_events(dlm, oldmle); dlm_put_mle(oldmle); @@ -3045,7 +3045,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, int ret = 0; if (!dlm_grab(dlm)) - return -EINVAL; + return 0; name = migrate->name; namelen = migrate->namelen; @@ -3136,7 +3136,8 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm, mlog(0, "tried to migrate %.*s, but some " "process beat me to it\n", namelen, name); - ret = -EEXIST; + spin_unlock(&tmp->spinlock); + return -EEXIST; } else { /* bad. 2 NODES are trying to migrate! */ mlog(ML_ERROR, "migration error mle: " diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 42f0cae93..b94a425f0 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1373,6 +1373,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, char *buf = NULL; struct dlm_work_item *item = NULL; struct dlm_lock_resource *res = NULL; + unsigned int hash; if (!dlm_grab(dlm)) return -EINVAL; @@ -1400,7 +1401,10 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, /* lookup the lock to see if we have a secondary queue for this * already... just add the locks in and this will have its owner * and RECOVERY flag changed when it completes. */ - res = dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len); + hash = dlm_lockid_hash(mres->lockname, mres->lockname_len); + spin_lock(&dlm->spinlock); + res = __dlm_lookup_lockres(dlm, mres->lockname, mres->lockname_len, + hash); if (res) { /* this will get a ref on res */ /* mark it as recovering/migrating and hash it */ @@ -1421,13 +1425,16 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data, mres->lockname_len, mres->lockname); ret = -EFAULT; spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); dlm_lockres_put(res); goto leave; } res->state |= DLM_LOCK_RES_MIGRATING; } spin_unlock(&res->spinlock); + spin_unlock(&dlm->spinlock); } else { + spin_unlock(&dlm->spinlock); /* need to allocate, just like if it was * mastered here normally */ res = dlm_new_lockres(dlm, mres->lockname, mres->lockname_len); @@ -2452,11 +2459,7 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx) * perhaps later we can genericize this for other waiters. */ wake_up(&dlm->migration_wq); - if (test_bit(idx, dlm->recovery_map)) - mlog(0, "domain %s, node %u already added " - "to recovery map!\n", dlm->name, idx); - else - set_bit(idx, dlm->recovery_map); + set_bit(idx, dlm->recovery_map); } void dlm_hb_node_down_cb(struct o2nm_node *node, int idx, void *data) diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 2e3c9dbab..1082b2c30 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -421,7 +421,7 @@ int dlm_unlock_lock_handler(struct o2net_msg *msg, u32 len, void *data, } if (!dlm_grab(dlm)) - return DLM_REJECTED; + return DLM_FORWARD; mlog_bug_on_msg(!dlm_domain_fully_joined(dlm), "Domain %s not fully joined!\n", dlm->name); diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index b5cf27dcb..03768bb3a 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -638,7 +638,7 @@ static int __init init_dlmfs_fs(void) dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache", sizeof(struct dlmfs_inode_private), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), dlmfs_init_once); if (!dlmfs_inode_cache) { status = -ENOMEM; diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index b002acf50..474e57f83 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2438,12 +2438,6 @@ bail: * done this we have to return AOP_TRUNCATED_PAGE so the aop method * that called us can bubble that back up into the VFS who will then * immediately retry the aop call. - * - * We do a blocking lock and immediate unlock before returning, though, so that - * the lock has a great chance of being cached on this node by the time the VFS - * calls back to retry the aop. This has a potential to livelock as nodes - * ping locks back and forth, but that's a risk we're willing to take to avoid - * the lock inversion simply. */ int ocfs2_inode_lock_with_page(struct inode *inode, struct buffer_head **ret_bh, @@ -2455,8 +2449,6 @@ int ocfs2_inode_lock_with_page(struct inode *inode, ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK); if (ret == -EAGAIN) { unlock_page(page); - if (ocfs2_inode_lock(inode, ret_bh, ex) == 0) - ocfs2_inode_unlock(inode, ex); ret = AOP_TRUNCATED_PAGE; } diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 0e5b4515f..7cb38fdca 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1302,6 +1302,14 @@ int ocfs2_getattr(struct vfsmount *mnt, } generic_fillattr(inode, stat); + /* + * If there is inline data in the inode, the inode will normally not + * have data blocks allocated (it may have an external xattr block). + * Report at least one sector for such files, so tools like tar, rsync, + * others don't incorrectly think the file is completely sparse. + */ + if (unlikely(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) + stat->blocks += (stat->size + 511)>>9; /* We set the blksize from the cluster size for performance */ stat->blksize = osb->s_clustersize; @@ -1864,7 +1872,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) return -EROFS; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * This prevents concurrent writes on other nodes @@ -1983,7 +1991,7 @@ out_rw_unlock: ocfs2_rw_unlock(inode, 1); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -2291,7 +2299,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, appending = iocb->ki_flags & IOCB_APPEND ? 1 : 0; direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); relock: /* @@ -2427,7 +2435,7 @@ out: ocfs2_rw_unlock(inode, rw_level); out_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (written) ret = written; @@ -2539,7 +2547,7 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence) struct inode *inode = file->f_mapping->host; int ret = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); switch (whence) { case SEEK_SET: @@ -2577,7 +2585,7 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence) offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (ret) return ret; return offset; diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 8f87e05ee..36294446d 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -361,6 +361,7 @@ void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, break; case S_IFLNK: inode->i_op = &ocfs2_symlink_inode_operations; + inode_nohighmem(inode); i_size_write(inode, le64_to_cpu(fe->i_size)); break; default: @@ -629,10 +630,10 @@ static int ocfs2_remove_inode(struct inode *inode, goto bail; } - mutex_lock(&inode_alloc_inode->i_mutex); + inode_lock(inode_alloc_inode); status = ocfs2_inode_lock(inode_alloc_inode, &inode_alloc_bh, 1); if (status < 0) { - mutex_unlock(&inode_alloc_inode->i_mutex); + inode_unlock(inode_alloc_inode); mlog_errno(status); goto bail; @@ -679,7 +680,7 @@ bail_commit: ocfs2_commit_trans(osb, handle); bail_unlock: ocfs2_inode_unlock(inode_alloc_inode, 1); - mutex_unlock(&inode_alloc_inode->i_mutex); + inode_unlock(inode_alloc_inode); brelse(inode_alloc_bh); bail: iput(inode_alloc_inode); @@ -750,10 +751,10 @@ static int ocfs2_wipe_inode(struct inode *inode, /* Lock the orphan dir. The lock will be held for the entire * delete_inode operation. We do this now to avoid races with * recovery completion on other nodes. */ - mutex_lock(&orphan_dir_inode->i_mutex); + inode_lock(orphan_dir_inode); status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); if (status < 0) { - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); mlog_errno(status); goto bail; @@ -802,7 +803,7 @@ bail_unlock_dir: return status; ocfs2_inode_unlock(orphan_dir_inode, 1); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); brelse(orphan_dir_bh); bail: iput(orphan_dir_inode); diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 3cb097ccc..4506ec5ec 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -86,7 +86,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, unsigned oldflags; int status; - mutex_lock(&inode->i_mutex); + inode_lock(inode); status = ocfs2_inode_lock(inode, &bh, 1); if (status < 0) { @@ -135,7 +135,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, bail_unlock: ocfs2_inode_unlock(inode, 1); bail: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); brelse(bh); @@ -287,7 +287,7 @@ static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb, struct ocfs2_dinode *dinode_alloc = NULL; if (inode_alloc) - mutex_lock(&inode_alloc->i_mutex); + inode_lock(inode_alloc); if (o2info_coherent(&fi->ifi_req)) { status = ocfs2_inode_lock(inode_alloc, &bh, 0); @@ -317,7 +317,7 @@ bail: ocfs2_inode_unlock(inode_alloc, 0); if (inode_alloc) - mutex_unlock(&inode_alloc->i_mutex); + inode_unlock(inode_alloc); brelse(bh); @@ -547,7 +547,7 @@ static int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb, struct ocfs2_dinode *gb_dinode = NULL; if (gb_inode) - mutex_lock(&gb_inode->i_mutex); + inode_lock(gb_inode); if (o2info_coherent(&ffg->iff_req)) { status = ocfs2_inode_lock(gb_inode, &bh, 0); @@ -604,11 +604,9 @@ bail: ocfs2_inode_unlock(gb_inode, 0); if (gb_inode) - mutex_unlock(&gb_inode->i_mutex); - - if (gb_inode) - iput(gb_inode); + inode_unlock(gb_inode); + iput(gb_inode); brelse(bh); return status; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 13534f4fe..61b833b72 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1042,8 +1042,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) // up_write(&journal->j_trans_barrier); done: - if (inode) - iput(inode); + iput(inode); } static void ocfs2_clear_journal_error(struct super_block *sb, @@ -1687,9 +1686,7 @@ done: if (got_lock) ocfs2_inode_unlock(inode, 1); - if (inode) - iput(inode); - + iput(inode); brelse(bh); return status; @@ -1796,8 +1793,7 @@ static int ocfs2_trylock_journal(struct ocfs2_super *osb, ocfs2_inode_unlock(inode, 1); bail: - if (inode) - iput(inode); + iput(inode); return status; } @@ -2092,7 +2088,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb, return status; } - mutex_lock(&orphan_dir_inode->i_mutex); + inode_lock(orphan_dir_inode); status = ocfs2_inode_lock(orphan_dir_inode, NULL, 0); if (status < 0) { mlog_errno(status); @@ -2110,7 +2106,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb, out_cluster: ocfs2_inode_unlock(orphan_dir_inode, 0); out: - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); return status; } @@ -2200,7 +2196,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, oi->ip_next_orphan = NULL; if (oi->ip_flags & OCFS2_INODE_DIO_ORPHAN_ENTRY) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = ocfs2_rw_lock(inode, 1); if (ret < 0) { mlog_errno(ret); @@ -2239,7 +2235,7 @@ unlock_inode: unlock_rw: ocfs2_rw_unlock(inode, 1); unlock_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* clear dio flag in ocfs2_inode_info */ oi->ip_flags &= ~OCFS2_INODE_DIO_ORPHAN_ENTRY; diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 0a4457fb0..7d62c43a2 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -358,8 +358,7 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb) bail: if (status < 0) brelse(alloc_bh); - if (inode) - iput(inode); + iput(inode); trace_ocfs2_load_local_alloc(osb->local_alloc_bits); @@ -415,7 +414,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) goto out; } - mutex_lock(&main_bm_inode->i_mutex); + inode_lock(main_bm_inode); status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); if (status < 0) { @@ -469,12 +468,11 @@ out_unlock: ocfs2_inode_unlock(main_bm_inode, 1); out_mutex: - mutex_unlock(&main_bm_inode->i_mutex); + inode_unlock(main_bm_inode); iput(main_bm_inode); out: - if (local_alloc_inode) - iput(local_alloc_inode); + iput(local_alloc_inode); kfree(alloc_copy); } @@ -508,7 +506,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, goto bail; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); status = ocfs2_read_inode_block_full(inode, &alloc_bh, OCFS2_BH_IGNORE_CACHE); @@ -541,7 +539,7 @@ bail: brelse(alloc_bh); if (inode) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); iput(inode); } @@ -573,7 +571,7 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb, goto out; } - mutex_lock(&main_bm_inode->i_mutex); + inode_lock(main_bm_inode); status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); if (status < 0) { @@ -603,7 +601,7 @@ out_unlock: ocfs2_inode_unlock(main_bm_inode, 1); out_mutex: - mutex_unlock(&main_bm_inode->i_mutex); + inode_unlock(main_bm_inode); brelse(main_bm_bh); @@ -645,7 +643,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, goto bail; } - mutex_lock(&local_alloc_inode->i_mutex); + inode_lock(local_alloc_inode); /* * We must double check state and allocator bits because @@ -711,7 +709,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, status = 0; bail: if (status < 0 && local_alloc_inode) { - mutex_unlock(&local_alloc_inode->i_mutex); + inode_unlock(local_alloc_inode); iput(local_alloc_inode); } @@ -1327,9 +1325,7 @@ bail: brelse(main_bm_bh); - if (main_bm_inode) - iput(main_bm_inode); - + iput(main_bm_inode); kfree(alloc_copy); if (ac) diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 9581d190f..77ebc2bc1 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -147,6 +147,10 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = ocfs2_inode_lock(inode, &di_bh, 1); if (ret < 0) { mlog_errno(ret); + if (ret == -ENOMEM) + ret = VM_FAULT_OOM; + else + ret = VM_FAULT_SIGBUS; goto out; } diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 124471d26..e3d05d990 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -276,7 +276,7 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, * context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; */ - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); if (ocfs2_truncate_log_needs_flush(osb)) { ret = __ocfs2_flush_truncate_log(osb); @@ -338,7 +338,7 @@ out_commit: ocfs2_commit_trans(osb, handle); out_unlock_mutex: - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); if (context->data_ac) { ocfs2_free_alloc_context(context->data_ac); @@ -632,7 +632,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, goto out; } - mutex_lock(&gb_inode->i_mutex); + inode_lock(gb_inode); ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1); if (ret) { @@ -640,7 +640,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, goto out_unlock_gb_mutex; } - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { @@ -708,11 +708,11 @@ out_commit: brelse(gd_bh); out_unlock_tl_inode: - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); ocfs2_inode_unlock(gb_inode, 1); out_unlock_gb_mutex: - mutex_unlock(&gb_inode->i_mutex); + inode_unlock(gb_inode); brelse(gb_bh); iput(gb_inode); @@ -905,7 +905,7 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context) if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) return -EROFS; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * This prevents concurrent writes from other nodes @@ -969,7 +969,7 @@ out_inode_unlock: out_rw_unlock: ocfs2_rw_unlock(inode, 1); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return status; } diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 3123408da..6b3e87189 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1045,7 +1045,7 @@ leave: if (orphan_dir) { /* This was locked for us in ocfs2_prepare_orphan_dir() */ ocfs2_inode_unlock(orphan_dir, 1); - mutex_unlock(&orphan_dir->i_mutex); + inode_unlock(orphan_dir); iput(orphan_dir); } @@ -1664,7 +1664,7 @@ bail: if (orphan_dir) { /* This was locked for us in ocfs2_prepare_orphan_dir() */ ocfs2_inode_unlock(orphan_dir, 1); - mutex_unlock(&orphan_dir->i_mutex); + inode_unlock(orphan_dir); iput(orphan_dir); } @@ -1683,8 +1683,7 @@ bail: if (new_inode) sync_mapping_buffers(old_inode->i_mapping); - if (new_inode) - iput(new_inode); + iput(new_inode); ocfs2_free_dir_lookup_result(&target_lookup_res); ocfs2_free_dir_lookup_result(&old_entry_lookup); @@ -1958,6 +1957,7 @@ static int ocfs2_symlink(struct inode *dir, inode->i_rdev = 0; newsize = l - 1; inode->i_op = &ocfs2_symlink_inode_operations; + inode_nohighmem(inode); if (l > ocfs2_fast_symlink_chars(sb)) { u32 offset = 0; @@ -2121,11 +2121,11 @@ static int ocfs2_lookup_lock_orphan_dir(struct ocfs2_super *osb, return ret; } - mutex_lock(&orphan_dir_inode->i_mutex); + inode_lock(orphan_dir_inode); ret = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); if (ret < 0) { - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); mlog_errno(ret); @@ -2226,7 +2226,7 @@ out: if (ret) { ocfs2_inode_unlock(orphan_dir_inode, 1); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); } @@ -2372,6 +2372,15 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, (unsigned long long)OCFS2_I(orphan_dir_inode)->ip_blkno, name, strlen(name)); + status = ocfs2_journal_access_di(handle, + INODE_CACHE(orphan_dir_inode), + orphan_dir_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + /* find it's spot in the orphan directory */ status = ocfs2_find_entry(name, strlen(name), orphan_dir_inode, &lookup); @@ -2387,15 +2396,6 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, goto leave; } - status = ocfs2_journal_access_di(handle, - INODE_CACHE(orphan_dir_inode), - orphan_dir_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto leave; - } - /* do the i_nlink dance! :) */ orphan_fe = (struct ocfs2_dinode *) orphan_dir_bh->b_data; if (S_ISDIR(inode->i_mode)) @@ -2495,7 +2495,7 @@ out: ocfs2_free_alloc_context(inode_ac); /* Unroll orphan dir locking */ - mutex_unlock(&orphan_dir->i_mutex); + inode_unlock(orphan_dir); ocfs2_inode_unlock(orphan_dir, 1); iput(orphan_dir); } @@ -2602,7 +2602,7 @@ leave: if (orphan_dir) { /* This was locked for us in ocfs2_prepare_orphan_dir() */ ocfs2_inode_unlock(orphan_dir, 1); - mutex_unlock(&orphan_dir->i_mutex); + inode_unlock(orphan_dir); iput(orphan_dir); } @@ -2689,7 +2689,7 @@ int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, bail_unlock_orphan: ocfs2_inode_unlock(orphan_dir_inode, 1); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); ocfs2_free_dir_lookup_result(&orphan_insert); @@ -2721,10 +2721,10 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, goto bail; } - mutex_lock(&orphan_dir_inode->i_mutex); + inode_lock(orphan_dir_inode); status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); if (status < 0) { - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); mlog_errno(status); goto bail; @@ -2770,7 +2770,7 @@ bail_commit: bail_unlock_orphan: ocfs2_inode_unlock(orphan_dir_inode, 1); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); brelse(orphan_dir_bh); iput(orphan_dir_inode); @@ -2834,12 +2834,12 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, goto leave; } - mutex_lock(&orphan_dir_inode->i_mutex); + inode_lock(orphan_dir_inode); status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); if (status < 0) { mlog_errno(status); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); goto leave; } @@ -2901,7 +2901,7 @@ out_commit: ocfs2_commit_trans(osb, handle); orphan_unlock: ocfs2_inode_unlock(orphan_dir_inode, 1); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); leave: diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h index b6d51333a..d153e6e31 100644 --- a/fs/ocfs2/quota.h +++ b/fs/ocfs2/quota.h @@ -82,7 +82,7 @@ struct ocfs2_quota_chunk { extern struct kmem_cache *ocfs2_dquot_cachep; extern struct kmem_cache *ocfs2_qf_chunk_cachep; -extern struct qtree_fmt_operations ocfs2_global_ops; +extern const struct qtree_fmt_operations ocfs2_global_ops; struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery( struct ocfs2_super *osb, int slot_num); diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index c93d67220..9c9dd30bc 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -123,7 +123,7 @@ static int ocfs2_global_is_id(void *dp, struct dquot *dquot) dquot->dq_id); } -struct qtree_fmt_operations ocfs2_global_ops = { +const struct qtree_fmt_operations ocfs2_global_ops = { .mem2disk_dqblk = ocfs2_global_mem2diskdqb, .disk2mem_dqblk = ocfs2_global_disk2memdqb, .is_id = ocfs2_global_is_id, @@ -308,7 +308,7 @@ int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex) WARN_ON(bh != oinfo->dqi_gqi_bh); spin_unlock(&dq_data_lock); if (ex) { - mutex_lock(&oinfo->dqi_gqinode->i_mutex); + inode_lock(oinfo->dqi_gqinode); down_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); } else { down_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); @@ -320,7 +320,7 @@ void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex) { if (ex) { up_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); - mutex_unlock(&oinfo->dqi_gqinode->i_mutex); + inode_unlock(oinfo->dqi_gqinode); } else { up_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); } diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 252119860..3eff031aa 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -807,7 +807,7 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh) mlog_errno(ret); goto out; } - mutex_lock(&alloc_inode->i_mutex); + inode_lock(alloc_inode); ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1); if (ret) { @@ -867,7 +867,7 @@ out_unlock: } out_mutex: if (alloc_inode) { - mutex_unlock(&alloc_inode->i_mutex); + inode_unlock(alloc_inode); iput(alloc_inode); } out: @@ -4197,7 +4197,7 @@ static int __ocfs2_reflink(struct dentry *old_dentry, goto out; } - mutex_lock_nested(&new_inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(new_inode, I_MUTEX_CHILD); ret = ocfs2_inode_lock_nested(new_inode, &new_bh, 1, OI_LS_REFLINK_TARGET); if (ret) { @@ -4231,7 +4231,7 @@ inode_unlock: ocfs2_inode_unlock(new_inode, 1); brelse(new_bh); out_unlock: - mutex_unlock(&new_inode->i_mutex); + inode_unlock(new_inode); out: if (!ret) { ret = filemap_fdatawait(inode->i_mapping); @@ -4402,11 +4402,11 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, return error; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); error = dquot_initialize(dir); if (!error) error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!error) fsnotify_create(dir, new_dentry); return error; diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c index 79b802130..576b9a048 100644 --- a/fs/ocfs2/resize.c +++ b/fs/ocfs2/resize.c @@ -301,7 +301,7 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters) goto out; } - mutex_lock(&main_bm_inode->i_mutex); + inode_lock(main_bm_inode); ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); if (ret < 0) { @@ -375,7 +375,7 @@ out_unlock: ocfs2_inode_unlock(main_bm_inode, 1); out_mutex: - mutex_unlock(&main_bm_inode->i_mutex); + inode_unlock(main_bm_inode); iput(main_bm_inode); out: @@ -486,7 +486,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) goto out; } - mutex_lock(&main_bm_inode->i_mutex); + inode_lock(main_bm_inode); ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); if (ret < 0) { @@ -590,7 +590,7 @@ out_unlock: ocfs2_inode_unlock(main_bm_inode, 1); out_mutex: - mutex_unlock(&main_bm_inode->i_mutex); + inode_unlock(main_bm_inode); iput(main_bm_inode); out: diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index e78a203d4..1e0959214 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -322,8 +322,7 @@ static void __ocfs2_free_slot_info(struct ocfs2_slot_info *si) if (si == NULL) return; - if (si->si_inode) - iput(si->si_inode); + iput(si->si_inode); if (si->si_bh) { for (i = 0; i < si->si_blocks; i++) { if (si->si_bh[i]) { @@ -503,8 +502,17 @@ int ocfs2_find_slot(struct ocfs2_super *osb) trace_ocfs2_find_slot(osb->slot_num); status = ocfs2_update_disk_slot(osb, si, osb->slot_num); - if (status < 0) + if (status < 0) { mlog_errno(status); + /* + * if write block failed, invalidate slot to avoid overwrite + * slot during dismount in case another node rightly has mounted + */ + spin_lock(&osb->osb_lock); + ocfs2_invalidate_slot(si, osb->slot_num); + osb->slot_num = OCFS2_INVALID_SLOT; + spin_unlock(&osb->osb_lock); + } bail: return status; diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index fc6d25f6d..2f19aeec5 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -141,7 +141,7 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) if (ac->ac_which != OCFS2_AC_USE_LOCAL) ocfs2_inode_unlock(inode, 1); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); iput(inode); ac->ac_inode = NULL; @@ -797,11 +797,11 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, return -EINVAL; } - mutex_lock(&alloc_inode->i_mutex); + inode_lock(alloc_inode); status = ocfs2_inode_lock(alloc_inode, &bh, 1); if (status < 0) { - mutex_unlock(&alloc_inode->i_mutex); + inode_unlock(alloc_inode); iput(alloc_inode); mlog_errno(status); @@ -2875,10 +2875,10 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) goto bail; } - mutex_lock(&inode_alloc_inode->i_mutex); + inode_lock(inode_alloc_inode); status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0); if (status < 0) { - mutex_unlock(&inode_alloc_inode->i_mutex); + inode_unlock(inode_alloc_inode); iput(inode_alloc_inode); mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n", (u32)suballoc_slot, status); @@ -2891,7 +2891,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) mlog(ML_ERROR, "test suballoc bit failed %d\n", status); ocfs2_inode_unlock(inode_alloc_inode, 0); - mutex_unlock(&inode_alloc_inode->i_mutex); + inode_unlock(inode_alloc_inode); iput(inode_alloc_inode); brelse(alloc_bh); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 2de4c8a93..faa136509 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1280,6 +1280,8 @@ static int ocfs2_parse_options(struct super_block *sb, int status, user_stack = 0; char *p; u32 tmp; + int token, option; + substring_t args[MAX_OPT_ARGS]; trace_ocfs2_parse_options(is_remount, options ? options : "(none)"); @@ -1298,9 +1300,6 @@ static int ocfs2_parse_options(struct super_block *sb, } while ((p = strsep(&options, ",")) != NULL) { - int token, option; - substring_t args[MAX_OPT_ARGS]; - if (!*p) continue; @@ -1367,7 +1366,6 @@ static int ocfs2_parse_options(struct super_block *sb, mopt->atime_quantum = option; break; case Opt_slot: - option = 0; if (match_int(&args[0], &option)) { status = 0; goto bail; @@ -1376,7 +1374,6 @@ static int ocfs2_parse_options(struct super_block *sb, mopt->slot = (s16)option; break; case Opt_commit: - option = 0; if (match_int(&args[0], &option)) { status = 0; goto bail; @@ -1388,7 +1385,6 @@ static int ocfs2_parse_options(struct super_block *sb, mopt->commit_interval = HZ * option; break; case Opt_localalloc: - option = 0; if (match_int(&args[0], &option)) { status = 0; goto bail; @@ -1726,8 +1722,7 @@ static int ocfs2_statfs(struct dentry *dentry, struct kstatfs *buf) ocfs2_inode_unlock(inode, 0); status = 0; bail: - if (inode) - iput(inode); + iput(inode); if (status) mlog_errno(status); @@ -1771,7 +1766,7 @@ static int ocfs2_initialize_mem_caches(void) sizeof(struct ocfs2_inode_info), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), ocfs2_inode_init_once); ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache", sizeof(struct ocfs2_dquot), diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index 66edce7ec..6c2a3e3c5 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c @@ -88,8 +88,7 @@ const struct address_space_operations ocfs2_fast_symlink_aops = { const struct inode_operations ocfs2_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, + .get_link = page_get_link, .getattr = ocfs2_getattr, .setattr = ocfs2_setattr, .setxattr = generic_setxattr, diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index e9164f098..7d3d979f5 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -544,8 +544,7 @@ static inline const char *ocfs2_xattr_prefix(int name_index) if (name_index > 0 && name_index < OCFS2_XATTR_MAX) handler = ocfs2_xattr_handler_map[name_index]; - - return handler ? handler->prefix : NULL; + return handler ? xattr_prefix(handler) : NULL; } static u32 ocfs2_xattr_name_hash(struct inode *inode, @@ -884,14 +883,39 @@ static int ocfs2_xattr_value_truncate(struct inode *inode, return ret; } -static int ocfs2_xattr_list_entry(char *buffer, size_t size, - size_t *result, const char *prefix, +static int ocfs2_xattr_list_entry(struct super_block *sb, + char *buffer, size_t size, + size_t *result, int type, const char *name, int name_len) { char *p = buffer + *result; - int prefix_len = strlen(prefix); - int total_len = prefix_len + name_len + 1; + const char *prefix; + int prefix_len; + int total_len; + switch(type) { + case OCFS2_XATTR_INDEX_USER: + if (OCFS2_SB(sb)->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) + return 0; + break; + + case OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS: + case OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT: + if (!(sb->s_flags & MS_POSIXACL)) + return 0; + break; + + case OCFS2_XATTR_INDEX_TRUSTED: + if (!capable(CAP_SYS_ADMIN)) + return 0; + break; + } + + prefix = ocfs2_xattr_prefix(type); + if (!prefix) + return 0; + prefix_len = strlen(prefix); + total_len = prefix_len + name_len + 1; *result += total_len; /* we are just looking for how big our buffer needs to be */ @@ -914,23 +938,20 @@ static int ocfs2_xattr_list_entries(struct inode *inode, { size_t result = 0; int i, type, ret; - const char *prefix, *name; + const char *name; for (i = 0 ; i < le16_to_cpu(header->xh_count); i++) { struct ocfs2_xattr_entry *entry = &header->xh_entries[i]; type = ocfs2_xattr_get_type(entry); - prefix = ocfs2_xattr_prefix(type); - - if (prefix) { - name = (const char *)header + - le16_to_cpu(entry->xe_name_offset); + name = (const char *)header + + le16_to_cpu(entry->xe_name_offset); - ret = ocfs2_xattr_list_entry(buffer, buffer_size, - &result, prefix, name, - entry->xe_name_len); - if (ret) - return ret; - } + ret = ocfs2_xattr_list_entry(inode->i_sb, + buffer, buffer_size, + &result, type, name, + entry->xe_name_len); + if (ret) + return ret; } return result; @@ -2503,7 +2524,7 @@ static int ocfs2_xattr_free_block(struct inode *inode, mlog_errno(ret); goto out; } - mutex_lock(&xb_alloc_inode->i_mutex); + inode_lock(xb_alloc_inode); ret = ocfs2_inode_lock(xb_alloc_inode, &xb_alloc_bh, 1); if (ret < 0) { @@ -2528,7 +2549,7 @@ out_unlock: ocfs2_inode_unlock(xb_alloc_inode, 1); brelse(xb_alloc_bh); out_mutex: - mutex_unlock(&xb_alloc_inode->i_mutex); + inode_unlock(xb_alloc_inode); iput(xb_alloc_inode); out: brelse(blk_bh); @@ -3598,17 +3619,17 @@ int ocfs2_xattr_set(struct inode *inode, } } - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); if (ocfs2_truncate_log_needs_flush(osb)) { ret = __ocfs2_flush_truncate_log(osb); if (ret < 0) { - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); mlog_errno(ret); goto cleanup; } } - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis, &xbs, &ctxt, ref_meta, &credits); @@ -4033,32 +4054,30 @@ static int ocfs2_list_xattr_bucket(struct inode *inode, int ret = 0, type; struct ocfs2_xattr_tree_list *xl = (struct ocfs2_xattr_tree_list *)para; int i, block_off, new_offset; - const char *prefix, *name; + const char *name; for (i = 0 ; i < le16_to_cpu(bucket_xh(bucket)->xh_count); i++) { struct ocfs2_xattr_entry *entry = &bucket_xh(bucket)->xh_entries[i]; type = ocfs2_xattr_get_type(entry); - prefix = ocfs2_xattr_prefix(type); - if (prefix) { - ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, - bucket_xh(bucket), - i, - &block_off, - &new_offset); - if (ret) - break; + ret = ocfs2_xattr_bucket_get_name_value(inode->i_sb, + bucket_xh(bucket), + i, + &block_off, + &new_offset); + if (ret) + break; - name = (const char *)bucket_block(bucket, block_off) + - new_offset; - ret = ocfs2_xattr_list_entry(xl->buffer, - xl->buffer_size, - &xl->result, - prefix, name, - entry->xe_name_len); - if (ret) - break; - } + name = (const char *)bucket_block(bucket, block_off) + + new_offset; + ret = ocfs2_xattr_list_entry(inode->i_sb, + xl->buffer, + xl->buffer_size, + &xl->result, + type, name, + entry->xe_name_len); + if (ret) + break; } return ret; @@ -5441,7 +5460,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode, return ret; } - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); if (ocfs2_truncate_log_needs_flush(osb)) { ret = __ocfs2_flush_truncate_log(osb); @@ -5485,7 +5504,7 @@ out_commit: out: ocfs2_schedule_truncate_log_flush(osb, 1); - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); if (meta_ac) ocfs2_free_alloc_context(meta_ac); @@ -7226,31 +7245,14 @@ int ocfs2_init_security_and_acl(struct inode *dir, leave: return ret; } + /* * 'security' attributes support */ -static size_t ocfs2_xattr_security_list(const struct xattr_handler *handler, - struct dentry *dentry, char *list, - size_t list_size, const char *name, - size_t name_len) -{ - const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN; - const size_t total_len = prefix_len + name_len + 1; - - if (list && total_len <= list_size) { - memcpy(list, XATTR_SECURITY_PREFIX, prefix_len); - memcpy(list + prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - return total_len; -} - static int ocfs2_xattr_security_get(const struct xattr_handler *handler, struct dentry *dentry, const char *name, void *buffer, size_t size) { - if (strcmp(name, "") == 0) - return -EINVAL; return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_SECURITY, name, buffer, size); } @@ -7259,9 +7261,6 @@ static int ocfs2_xattr_security_set(const struct xattr_handler *handler, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - if (strcmp(name, "") == 0) - return -EINVAL; - return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_SECURITY, name, value, size, flags); } @@ -7314,7 +7313,6 @@ int ocfs2_init_security_set(handle_t *handle, const struct xattr_handler ocfs2_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, - .list = ocfs2_xattr_security_list, .get = ocfs2_xattr_security_get, .set = ocfs2_xattr_security_set, }; @@ -7322,31 +7320,10 @@ const struct xattr_handler ocfs2_xattr_security_handler = { /* * 'trusted' attributes support */ -static size_t ocfs2_xattr_trusted_list(const struct xattr_handler *handler, - struct dentry *dentry, char *list, - size_t list_size, const char *name, - size_t name_len) -{ - const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN; - const size_t total_len = prefix_len + name_len + 1; - - if (!capable(CAP_SYS_ADMIN)) - return 0; - - if (list && total_len <= list_size) { - memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); - memcpy(list + prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - return total_len; -} - static int ocfs2_xattr_trusted_get(const struct xattr_handler *handler, struct dentry *dentry, const char *name, void *buffer, size_t size) { - if (strcmp(name, "") == 0) - return -EINVAL; return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_TRUSTED, name, buffer, size); } @@ -7355,16 +7332,12 @@ static int ocfs2_xattr_trusted_set(const struct xattr_handler *handler, struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { - if (strcmp(name, "") == 0) - return -EINVAL; - return ocfs2_xattr_set(d_inode(dentry), OCFS2_XATTR_INDEX_TRUSTED, name, value, size, flags); } const struct xattr_handler ocfs2_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, - .list = ocfs2_xattr_trusted_list, .get = ocfs2_xattr_trusted_get, .set = ocfs2_xattr_trusted_set, }; @@ -7372,34 +7345,12 @@ const struct xattr_handler ocfs2_xattr_trusted_handler = { /* * 'user' attributes support */ -static size_t ocfs2_xattr_user_list(const struct xattr_handler *handler, - struct dentry *dentry, char *list, - size_t list_size, const char *name, - size_t name_len) -{ - const size_t prefix_len = XATTR_USER_PREFIX_LEN; - const size_t total_len = prefix_len + name_len + 1; - struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); - - if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) - return 0; - - if (list && total_len <= list_size) { - memcpy(list, XATTR_USER_PREFIX, prefix_len); - memcpy(list + prefix_len, name, name_len); - list[prefix_len + name_len] = '\0'; - } - return total_len; -} - static int ocfs2_xattr_user_get(const struct xattr_handler *handler, struct dentry *dentry, const char *name, void *buffer, size_t size) { struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); - if (strcmp(name, "") == 0) - return -EINVAL; if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) return -EOPNOTSUPP; return ocfs2_xattr_get(d_inode(dentry), OCFS2_XATTR_INDEX_USER, name, @@ -7412,8 +7363,6 @@ static int ocfs2_xattr_user_set(const struct xattr_handler *handler, { struct ocfs2_super *osb = OCFS2_SB(dentry->d_sb); - if (strcmp(name, "") == 0) - return -EINVAL; if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) return -EOPNOTSUPP; @@ -7423,7 +7372,6 @@ static int ocfs2_xattr_user_set(const struct xattr_handler *handler, const struct xattr_handler ocfs2_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, - .list = ocfs2_xattr_user_list, .get = ocfs2_xattr_user_get, .set = ocfs2_xattr_user_set, }; @@ -61,13 +61,12 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, if (ret) newattrs.ia_valid |= ret | ATTR_FORCE; - mutex_lock(&dentry->d_inode->i_mutex); + inode_lock(dentry->d_inode); /* Note any delegations or leases have already been broken: */ ret = notify_change(dentry, &newattrs, NULL); - mutex_unlock(&dentry->d_inode->i_mutex); + inode_unlock(dentry->d_inode); return ret; } -EXPORT_SYMBOL_GPL(do_truncate); long vfs_truncate(struct path *path, loff_t length) { @@ -514,7 +513,7 @@ static int chmod_common(struct path *path, umode_t mode) if (error) return error; retry_deleg: - mutex_lock(&inode->i_mutex); + inode_lock(inode); error = security_path_chmod(path, mode); if (error) goto out_unlock; @@ -522,7 +521,7 @@ retry_deleg: newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; error = notify_change(path->dentry, &newattrs, &delegated_inode); out_unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) @@ -597,11 +596,11 @@ retry_deleg: if (!S_ISDIR(inode->i_mode)) newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV; - mutex_lock(&inode->i_mutex); + inode_lock(inode); error = security_path_chown(path, uid, gid); if (!error) error = notify_change(path->dentry, &newattrs, &delegated_inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) @@ -682,7 +681,6 @@ int open_check_o_direct(struct file *f) } return 0; } -EXPORT_SYMBOL_GPL(open_check_o_direct); static int do_dentry_open(struct file *f, struct inode *inode, @@ -892,7 +890,7 @@ EXPORT_SYMBOL(dentry_open); static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op) { int lookup_flags = 0; - int acc_mode; + int acc_mode = ACC_MODE(flags); if (flags & (O_CREAT | __O_TMPFILE)) op->mode = (mode & S_IALLUGO) | S_IFREG; @@ -914,7 +912,6 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o if (flags & __O_TMPFILE) { if ((flags & O_TMPFILE_MASK) != O_TMPFILE) return -EINVAL; - acc_mode = MAY_OPEN | ACC_MODE(flags); if (!(acc_mode & MAY_WRITE)) return -EINVAL; } else if (flags & O_PATH) { @@ -924,8 +921,6 @@ static inline int build_open_flags(int flags, umode_t mode, struct open_flags *o */ flags &= O_DIRECTORY | O_NOFOLLOW | O_PATH; acc_mode = 0; - } else { - acc_mode = MAY_OPEN | ACC_MODE(flags); } op->open_flag = flags; diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index 15e4500cd..b61b883c8 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -443,7 +443,7 @@ static int __init init_openprom_fs(void) sizeof(struct op_inode_info), 0, (SLAB_RECLAIM_ACCOUNT | - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD | SLAB_ACCOUNT), op_inode_init_once); if (!op_inode_cachep) return -ENOMEM; diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index eff6319d5..d894e7cd9 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -248,9 +248,9 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, if (err) goto out_cleanup; - mutex_lock(&newdentry->d_inode->i_mutex); + inode_lock(newdentry->d_inode); err = ovl_set_attr(newdentry, stat); - mutex_unlock(&newdentry->d_inode->i_mutex); + inode_unlock(newdentry->d_inode); if (err) goto out_cleanup; diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 692ceda3b..52f6de5d4 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -167,7 +167,7 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode, struct dentry *newdentry; int err; - mutex_lock_nested(&udir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(udir, I_MUTEX_PARENT); newdentry = lookup_one_len(dentry->d_name.name, upperdir, dentry->d_name.len); err = PTR_ERR(newdentry); @@ -185,7 +185,7 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode, out_dput: dput(newdentry); out_unlock: - mutex_unlock(&udir->i_mutex); + inode_unlock(udir); return err; } @@ -258,9 +258,9 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, if (err) goto out_cleanup; - mutex_lock(&opaquedir->d_inode->i_mutex); + inode_lock(opaquedir->d_inode); err = ovl_set_attr(opaquedir, &stat); - mutex_unlock(&opaquedir->d_inode->i_mutex); + inode_unlock(opaquedir->d_inode); if (err) goto out_cleanup; @@ -599,7 +599,7 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir) struct dentry *upper = ovl_dentry_upper(dentry); int err; - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(dir, I_MUTEX_PARENT); err = -ESTALE; if (upper->d_parent == upperdir) { /* Don't let d_delete() think it can reset d_inode */ @@ -618,8 +618,9 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir) * sole user of this dentry. Too tricky... Just unhash for * now. */ - d_drop(dentry); - mutex_unlock(&dir->i_mutex); + if (!err) + d_drop(dentry); + inode_unlock(dir); return err; } @@ -903,6 +904,13 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, if (!overwrite && new_is_dir && !old_opaque && new_opaque) ovl_remove_opaque(newdentry); + /* + * Old dentry now lives in different location. Dentries in + * lowerstack are stale. We cannot drop them here because + * access to them is lockless. This could be only pure upper + * or opaque directory - numlower is zero. Or upper non-dir + * entry - its pureness is tracked by flag opaque. + */ if (old_opaque != new_opaque) { ovl_dentry_set_opaque(old, new_opaque); if (!overwrite) diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index b29036aa8..a4ff5d0d7 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -63,9 +63,11 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr) if (!err) { upperdentry = ovl_dentry_upper(dentry); - mutex_lock(&upperdentry->d_inode->i_mutex); + inode_lock(upperdentry->d_inode); err = notify_change(upperdentry, attr, NULL); - mutex_unlock(&upperdentry->d_inode->i_mutex); + if (!err) + ovl_copyattr(upperdentry->d_inode, dentry->d_inode); + inode_unlock(upperdentry->d_inode); } ovl_drop_write(dentry); out: @@ -108,6 +110,29 @@ int ovl_permission(struct inode *inode, int mask) realdentry = ovl_entry_real(oe, &is_upper); + if (ovl_is_default_permissions(inode)) { + struct kstat stat; + struct path realpath = { .dentry = realdentry }; + + if (mask & MAY_NOT_BLOCK) + return -ECHILD; + + realpath.mnt = ovl_entry_mnt_real(oe, inode, is_upper); + + err = vfs_getattr(&realpath, &stat); + if (err) + return err; + + if ((stat.mode ^ inode->i_mode) & S_IFMT) + return -ESTALE; + + inode->i_mode = stat.mode; + inode->i_uid = stat.uid; + inode->i_gid = stat.gid; + + return generic_permission(inode, mask); + } + /* Careful in RCU walk mode */ realinode = ACCESS_ONCE(realdentry->d_inode); if (!realinode) { @@ -144,57 +169,23 @@ out_dput: return err; } - -struct ovl_link_data { - struct dentry *realdentry; - void *cookie; -}; - -static const char *ovl_follow_link(struct dentry *dentry, void **cookie) +static const char *ovl_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { struct dentry *realdentry; struct inode *realinode; - struct ovl_link_data *data = NULL; - const char *ret; + + if (!dentry) + return ERR_PTR(-ECHILD); realdentry = ovl_dentry_real(dentry); realinode = realdentry->d_inode; - if (WARN_ON(!realinode->i_op->follow_link)) + if (WARN_ON(!realinode->i_op->get_link)) return ERR_PTR(-EPERM); - if (realinode->i_op->put_link) { - data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL); - if (!data) - return ERR_PTR(-ENOMEM); - data->realdentry = realdentry; - } - - ret = realinode->i_op->follow_link(realdentry, cookie); - if (IS_ERR_OR_NULL(ret)) { - kfree(data); - return ret; - } - - if (data) - data->cookie = *cookie; - - *cookie = data; - - return ret; -} - -static void ovl_put_link(struct inode *unused, void *c) -{ - struct inode *realinode; - struct ovl_link_data *data = c; - - if (!data) - return; - - realinode = data->realdentry->d_inode; - realinode->i_op->put_link(realinode, data->cookie); - kfree(data); + return realinode->i_op->get_link(realdentry, realinode, done); } static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz) @@ -391,8 +382,7 @@ static const struct inode_operations ovl_file_inode_operations = { static const struct inode_operations ovl_symlink_inode_operations = { .setattr = ovl_setattr, - .follow_link = ovl_follow_link, - .put_link = ovl_put_link, + .get_link = ovl_get_link, .readlink = ovl_readlink, .getattr = ovl_getattr, .setxattr = ovl_setxattr, diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index e17154aea..99b4168c3 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -142,7 +142,10 @@ struct dentry *ovl_dentry_upper(struct dentry *dentry); struct dentry *ovl_dentry_lower(struct dentry *dentry); struct dentry *ovl_dentry_real(struct dentry *dentry); struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper); +struct vfsmount *ovl_entry_mnt_real(struct ovl_entry *oe, struct inode *inode, + bool is_upper); struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry); +bool ovl_is_default_permissions(struct inode *inode); void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache); struct dentry *ovl_workdir(struct dentry *dentry); int ovl_want_write(struct dentry *dentry); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index adcb1398c..fdaf28f75 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -228,7 +228,7 @@ static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd) dput(dentry); } } - mutex_unlock(&dir->d_inode->i_mutex); + inode_unlock(dir->d_inode); } revert_creds(old_cred); put_cred(override_cred); @@ -399,7 +399,7 @@ static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin) loff_t res; struct ovl_dir_file *od = file->private_data; - mutex_lock(&file_inode(file)->i_mutex); + inode_lock(file_inode(file)); if (!file->f_pos) ovl_dir_reset(file); @@ -429,7 +429,7 @@ static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin) res = offset; } out_unlock: - mutex_unlock(&file_inode(file)->i_mutex); + inode_unlock(file_inode(file)); return res; } @@ -454,10 +454,10 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, ovl_path_upper(dentry, &upperpath); realfile = ovl_path_open(&upperpath, O_RDONLY); smp_mb__before_spinlock(); - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (!od->upperfile) { if (IS_ERR(realfile)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return PTR_ERR(realfile); } od->upperfile = realfile; @@ -467,7 +467,7 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, fput(realfile); realfile = od->upperfile; } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } } @@ -479,9 +479,9 @@ static int ovl_dir_release(struct inode *inode, struct file *file) struct ovl_dir_file *od = file->private_data; if (od->cache) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); ovl_cache_put(od, file->f_path.dentry); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } fput(od->realfile); if (od->upperfile) @@ -557,7 +557,7 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list) { struct ovl_cache_entry *p; - mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(upper->d_inode, I_MUTEX_CHILD); list_for_each_entry(p, list, l_node) { struct dentry *dentry; @@ -575,5 +575,5 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list) ovl_cleanup(upper->d_inode, dentry); dput(dentry); } - mutex_unlock(&upper->d_inode->i_mutex); + inode_unlock(upper->d_inode); } diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index f42c9407f..619ad4b01 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -16,6 +16,7 @@ #include <linux/slab.h> #include <linux/parser.h> #include <linux/module.h> +#include <linux/pagemap.h> #include <linux/sched.h> #include <linux/statfs.h> #include <linux/seq_file.h> @@ -25,12 +26,11 @@ MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); MODULE_DESCRIPTION("Overlay filesystem"); MODULE_LICENSE("GPL"); -#define OVERLAYFS_SUPER_MAGIC 0x794c7630 - struct ovl_config { char *lowerdir; char *upperdir; char *workdir; + bool default_permissions; }; /* private information held for overlayfs's superblock */ @@ -76,12 +76,14 @@ enum ovl_path_type ovl_path_type(struct dentry *dentry) if (oe->__upperdentry) { type = __OVL_PATH_UPPER; - if (oe->numlower) { - if (S_ISDIR(dentry->d_inode->i_mode)) - type |= __OVL_PATH_MERGE; - } else if (!oe->opaque) { + /* + * Non-dir dentry can hold lower dentry from previous + * location. Its purity depends only on opaque flag. + */ + if (oe->numlower && S_ISDIR(dentry->d_inode->i_mode)) + type |= __OVL_PATH_MERGE; + else if (!oe->opaque) type |= __OVL_PATH_PURE; - } } else { if (oe->numlower > 1) type |= __OVL_PATH_MERGE; @@ -155,6 +157,18 @@ struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper) return realdentry; } +struct vfsmount *ovl_entry_mnt_real(struct ovl_entry *oe, struct inode *inode, + bool is_upper) +{ + if (is_upper) { + struct ovl_fs *ofs = inode->i_sb->s_fs_info; + + return ofs->upper_mnt; + } else { + return oe->numlower ? oe->lowerstack[0].mnt : NULL; + } +} + struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry) { struct ovl_entry *oe = dentry->d_fsdata; @@ -162,6 +176,13 @@ struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry) return oe->cache; } +bool ovl_is_default_permissions(struct inode *inode) +{ + struct ovl_fs *ofs = inode->i_sb->s_fs_info; + + return ofs->config.default_permissions; +} + void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache) { struct ovl_entry *oe = dentry->d_fsdata; @@ -210,7 +231,7 @@ void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry) { struct ovl_entry *oe = dentry->d_fsdata; - WARN_ON(!mutex_is_locked(&upperdentry->d_parent->d_inode->i_mutex)); + WARN_ON(!inode_is_locked(upperdentry->d_parent->d_inode)); WARN_ON(oe->__upperdentry); BUG_ON(!upperdentry->d_inode); /* @@ -225,7 +246,7 @@ void ovl_dentry_version_inc(struct dentry *dentry) { struct ovl_entry *oe = dentry->d_fsdata; - WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); + WARN_ON(!inode_is_locked(dentry->d_inode)); oe->version++; } @@ -233,7 +254,7 @@ u64 ovl_dentry_version_get(struct dentry *dentry) { struct ovl_entry *oe = dentry->d_fsdata; - WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); + WARN_ON(!inode_is_locked(dentry->d_inode)); return oe->version; } @@ -322,6 +343,7 @@ static const struct dentry_operations ovl_dentry_operations = { static const struct dentry_operations ovl_reval_dentry_operations = { .d_release = ovl_dentry_release, + .d_select_inode = ovl_d_select_inode, .d_revalidate = ovl_dentry_revalidate, .d_weak_revalidate = ovl_dentry_weak_revalidate, }; @@ -356,9 +378,9 @@ static inline struct dentry *ovl_lookup_real(struct dentry *dir, { struct dentry *dentry; - mutex_lock(&dir->d_inode->i_mutex); + inode_lock(dir->d_inode); dentry = lookup_one_len(name->name, dir, name->len); - mutex_unlock(&dir->d_inode->i_mutex); + inode_unlock(dir->d_inode); if (IS_ERR(dentry)) { if (PTR_ERR(dentry) == -ENOENT) @@ -595,6 +617,8 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry) seq_show_option(m, "upperdir", ufs->config.upperdir); seq_show_option(m, "workdir", ufs->config.workdir); } + if (ufs->config.default_permissions) + seq_puts(m, ",default_permissions"); return 0; } @@ -619,6 +643,7 @@ enum { OPT_LOWERDIR, OPT_UPPERDIR, OPT_WORKDIR, + OPT_DEFAULT_PERMISSIONS, OPT_ERR, }; @@ -626,6 +651,7 @@ static const match_table_t ovl_tokens = { {OPT_LOWERDIR, "lowerdir=%s"}, {OPT_UPPERDIR, "upperdir=%s"}, {OPT_WORKDIR, "workdir=%s"}, + {OPT_DEFAULT_PERMISSIONS, "default_permissions"}, {OPT_ERR, NULL} }; @@ -686,6 +712,10 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) return -ENOMEM; break; + case OPT_DEFAULT_PERMISSIONS: + config->default_permissions = true; + break; + default: pr_err("overlayfs: unrecognized mount option \"%s\" or missing value\n", p); return -EINVAL; @@ -717,7 +747,7 @@ static struct dentry *ovl_workdir_create(struct vfsmount *mnt, if (err) return ERR_PTR(err); - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(dir, I_MUTEX_PARENT); retry: work = lookup_one_len(OVL_WORKDIR_NAME, dentry, strlen(OVL_WORKDIR_NAME)); @@ -743,7 +773,7 @@ retry: goto out_dput; } out_unlock: - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); mnt_drop_write(mnt); return work; @@ -38,6 +38,12 @@ unsigned int pipe_max_size = 1048576; */ unsigned int pipe_min_size = PAGE_SIZE; +/* Maximum allocatable pages per user. Hard limit is unset by default, soft + * matches default values. + */ +unsigned long pipe_user_pages_hard; +unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR; + /* * We use a start+len construction, which provides full use of the * allocated memory. @@ -583,20 +589,49 @@ pipe_fasync(int fd, struct file *filp, int on) return retval; } +static void account_pipe_buffers(struct pipe_inode_info *pipe, + unsigned long old, unsigned long new) +{ + atomic_long_add(new - old, &pipe->user->pipe_bufs); +} + +static bool too_many_pipe_buffers_soft(struct user_struct *user) +{ + return pipe_user_pages_soft && + atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_soft; +} + +static bool too_many_pipe_buffers_hard(struct user_struct *user) +{ + return pipe_user_pages_hard && + atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_hard; +} + struct pipe_inode_info *alloc_pipe_info(void) { struct pipe_inode_info *pipe; pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); if (pipe) { - pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL); + unsigned long pipe_bufs = PIPE_DEF_BUFFERS; + struct user_struct *user = get_current_user(); + + if (!too_many_pipe_buffers_hard(user)) { + if (too_many_pipe_buffers_soft(user)) + pipe_bufs = 1; + pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * pipe_bufs, GFP_KERNEL); + } + if (pipe->bufs) { init_waitqueue_head(&pipe->wait); pipe->r_counter = pipe->w_counter = 1; - pipe->buffers = PIPE_DEF_BUFFERS; + pipe->buffers = pipe_bufs; + pipe->user = user; + account_pipe_buffers(pipe, 0, pipe_bufs); mutex_init(&pipe->mutex); return pipe; } + free_uid(user); kfree(pipe); } @@ -607,6 +642,8 @@ void free_pipe_info(struct pipe_inode_info *pipe) { int i; + account_pipe_buffers(pipe, pipe->buffers, 0); + free_uid(pipe->user); for (i = 0; i < pipe->buffers; i++) { struct pipe_buffer *buf = pipe->bufs + i; if (buf->ops) @@ -998,6 +1035,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages) memcpy(bufs + head, pipe->bufs, tail * sizeof(struct pipe_buffer)); } + account_pipe_buffers(pipe, pipe->buffers, nr_pages); pipe->curbuf = 0; kfree(pipe->bufs); pipe->bufs = bufs; @@ -1069,6 +1107,11 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) { ret = -EPERM; goto out; + } else if ((too_many_pipe_buffers_hard(pipe->user) || + too_many_pipe_buffers_soft(pipe->user)) && + !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out; } ret = pipe_set_size(pipe, nr_pages); break; diff --git a/fs/pnode.c b/fs/pnode.c index 6367e1e43..c524fdddc 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -202,6 +202,11 @@ static struct mount *last_dest, *last_source, *dest_master; static struct mountpoint *mp; static struct hlist_head *list; +static inline bool peers(struct mount *m1, struct mount *m2) +{ + return m1->mnt_group_id == m2->mnt_group_id && m1->mnt_group_id; +} + static int propagate_one(struct mount *m) { struct mount *child; @@ -212,7 +217,7 @@ static int propagate_one(struct mount *m) /* skip if mountpoint isn't covered by it */ if (!is_subdir(mp->m_dentry, m->mnt.mnt_root)) return 0; - if (m->mnt_group_id == last_dest->mnt_group_id) { + if (peers(m, last_dest)) { type = CL_MAKE_SHARED; } else { struct mount *n, *p; @@ -223,7 +228,7 @@ static int propagate_one(struct mount *m) last_source = last_source->mnt_master; last_dest = last_source->mnt_parent; } - if (n->mnt_group_id != last_dest->mnt_group_id) { + if (!peers(n, last_dest)) { last_source = last_source->mnt_master; last_dest = last_source->mnt_parent; } diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 4adde1e2c..711dd5170 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -769,8 +769,6 @@ posix_acl_xattr_get(const struct xattr_handler *handler, struct posix_acl *acl; int error; - if (strcmp(name, "") != 0) - return -EINVAL; if (!IS_POSIXACL(d_backing_inode(dentry))) return -EOPNOTSUPP; if (d_is_symlink(dentry)) @@ -797,8 +795,6 @@ posix_acl_xattr_set(const struct xattr_handler *handler, struct posix_acl *acl = NULL; int ret; - if (strcmp(name, "") != 0) - return -EINVAL; if (!IS_POSIXACL(inode)) return -EOPNOTSUPP; if (!inode->i_op->set_acl) @@ -827,25 +823,14 @@ out: return ret; } -static size_t -posix_acl_xattr_list(const struct xattr_handler *handler, - struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len) +static bool +posix_acl_xattr_list(struct dentry *dentry) { - const char *xname = handler->prefix; - size_t size; - - if (!IS_POSIXACL(d_backing_inode(dentry))) - return 0; - - size = strlen(xname) + 1; - if (list && size <= list_size) - memcpy(list, xname, size); - return size; + return IS_POSIXACL(d_backing_inode(dentry)); } const struct xattr_handler posix_acl_access_xattr_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, + .name = XATTR_NAME_POSIX_ACL_ACCESS, .flags = ACL_TYPE_ACCESS, .list = posix_acl_xattr_list, .get = posix_acl_xattr_get, @@ -854,7 +839,7 @@ const struct xattr_handler posix_acl_access_xattr_handler = { EXPORT_SYMBOL_GPL(posix_acl_access_xattr_handler); const struct xattr_handler posix_acl_default_xattr_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, + .name = XATTR_NAME_POSIX_ACL_DEFAULT, .flags = ACL_TYPE_DEFAULT, .list = posix_acl_xattr_list, .get = posix_acl_xattr_get, diff --git a/fs/proc/base.c b/fs/proc/base.c index 8be0e4cd2..4f764c2ac 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -953,6 +953,7 @@ static ssize_t environ_read(struct file *file, char __user *buf, unsigned long src = *ppos; int ret = 0; struct mm_struct *mm = file->private_data; + unsigned long env_start, env_end; if (!mm) return 0; @@ -964,19 +965,25 @@ static ssize_t environ_read(struct file *file, char __user *buf, ret = 0; if (!atomic_inc_not_zero(&mm->mm_users)) goto free; + + down_read(&mm->mmap_sem); + env_start = mm->env_start; + env_end = mm->env_end; + up_read(&mm->mmap_sem); + while (count > 0) { size_t this_len, max_len; int retval; - if (src >= (mm->env_end - mm->env_start)) + if (src >= (env_end - env_start)) break; - this_len = mm->env_end - (mm->env_start + src); + this_len = env_end - (env_start + src); max_len = min_t(size_t, PAGE_SIZE, count); this_len = min(max_len, this_len); - retval = access_remote_vm(mm, (mm->env_start + src), + retval = access_remote_vm(mm, (env_start + src), page, this_len, 0); if (retval <= 0) { @@ -1565,12 +1572,16 @@ static int proc_exe_link(struct dentry *dentry, struct path *exe_path) return -ENOENT; } -static const char *proc_pid_follow_link(struct dentry *dentry, void **cookie) +static const char *proc_pid_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { - struct inode *inode = d_inode(dentry); struct path path; int error = -EACCES; + if (!dentry) + return ERR_PTR(-ECHILD); + /* Are we allowed to snoop on the tasks file descriptors? */ if (!proc_fd_access_allowed(inode)) goto out; @@ -1631,7 +1642,7 @@ out: const struct inode_operations proc_pid_link_inode_operations = { .readlink = proc_pid_readlink, - .follow_link = proc_pid_follow_link, + .get_link = proc_pid_get_link, .setattr = proc_setattr, }; @@ -1896,7 +1907,7 @@ static const struct dentry_operations tid_map_files_dentry_operations = { .d_delete = pid_delete_dentry, }; -static int proc_map_files_get_link(struct dentry *dentry, struct path *path) +static int map_files_get_link(struct dentry *dentry, struct path *path) { unsigned long vm_start, vm_end; struct vm_area_struct *vma; @@ -1922,7 +1933,7 @@ static int proc_map_files_get_link(struct dentry *dentry, struct path *path) down_read(&mm->mmap_sem); vma = find_exact_vma(mm, vm_start, vm_end); if (vma && vma->vm_file) { - *path = vma_pr_or_file(vma)->f_path; + *path = vma->vm_file->f_path; path_get(path); rc = 0; } @@ -1946,20 +1957,22 @@ struct map_files_info { * path to the file in question. */ static const char * -proc_map_files_follow_link(struct dentry *dentry, void **cookie) +proc_map_files_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { if (!capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); - return proc_pid_follow_link(dentry, NULL); + return proc_pid_get_link(dentry, inode, done); } /* - * Identical to proc_pid_link_inode_operations except for follow_link() + * Identical to proc_pid_link_inode_operations except for get_link() */ static const struct inode_operations proc_map_files_link_inode_operations = { .readlink = proc_pid_readlink, - .follow_link = proc_map_files_follow_link, + .get_link = proc_map_files_get_link, .setattr = proc_setattr, }; @@ -1976,7 +1989,7 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, return -ENOENT; ei = PROC_I(inode); - ei->op.proc_get_link = proc_map_files_get_link; + ei->op.proc_get_link = map_files_get_link; inode->i_op = &proc_map_files_link_inode_operations; inode->i_size = 64; @@ -2360,7 +2373,7 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, size_t count, loff_t *ppos) { struct inode * inode = file_inode(file); - char *page; + void *page; ssize_t length; struct task_struct *task = get_proc_task(inode); @@ -2375,14 +2388,11 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, if (*ppos != 0) goto out; - length = -ENOMEM; - page = (char*)__get_free_page(GFP_TEMPORARY); - if (!page) + page = memdup_user(buf, count); + if (IS_ERR(page)) { + length = PTR_ERR(page); goto out; - - length = -EFAULT; - if (copy_from_user(page, buf, count)) - goto out_free; + } /* Guard against adverse ptrace interaction */ length = mutex_lock_interruptible(&task->signal->cred_guard_mutex); @@ -2391,10 +2401,10 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, length = security_setprocattr(task, (char*)file->f_path.dentry->d_name.name, - (void*)page, count); + page, count); mutex_unlock(&task->signal->cred_guard_mutex); out_free: - free_page((unsigned long) page); + kfree(page); out: put_task_struct(task); out_no_task: diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 3c2a915c6..56afa5ef0 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -258,6 +258,7 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx, name, len, instantiate, p, (void *)(unsigned long)fd)) goto out_fd_loop; + cond_resched(); rcu_read_lock(); } rcu_read_unlock(); diff --git a/fs/proc/inode.c b/fs/proc/inode.c index bd95b9fde..42305ddcb 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -95,7 +95,8 @@ void __init proc_init_inodecache(void) proc_inode_cachep = kmem_cache_create("proc_inode_cache", sizeof(struct proc_inode), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD|SLAB_PANIC), + SLAB_MEM_SPREAD|SLAB_ACCOUNT| + SLAB_PANIC), init_once); } @@ -393,24 +394,25 @@ static const struct file_operations proc_reg_file_ops_no_compat = { }; #endif -static const char *proc_follow_link(struct dentry *dentry, void **cookie) +static void proc_put_link(void *p) { - struct proc_dir_entry *pde = PDE(d_inode(dentry)); - if (unlikely(!use_pde(pde))) - return ERR_PTR(-EINVAL); - *cookie = pde; - return pde->data; + unuse_pde(p); } -static void proc_put_link(struct inode *unused, void *p) +static const char *proc_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { - unuse_pde(p); + struct proc_dir_entry *pde = PDE(inode); + if (unlikely(!use_pde(pde))) + return ERR_PTR(-EINVAL); + set_delayed_call(done, proc_put_link, pde); + return pde->data; } const struct inode_operations proc_link_inode_operations = { .readlink = generic_readlink, - .follow_link = proc_follow_link, - .put_link = proc_put_link, + .get_link = proc_get_link, }; struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 92e6726f6..a939f5ed7 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -552,9 +552,9 @@ static int open_kcore(struct inode *inode, struct file *filp) if (kcore_need_update) kcore_update_ram(); if (i_size_read(inode) != proc_root_kcore->size) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); i_size_write(inode, proc_root_kcore->size); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } return 0; } diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index ae836fbb0..70b9f953b 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -57,11 +57,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) /* * Estimate the amount of memory available for userspace allocations, * without causing swapping. - * - * Free memory cannot be taken below the low watermark, before the - * system starts swapping. */ - available = i.freeram - wmark_low; + available = i.freeram - totalreserve_pages; /* * Not all the page cache can be freed, otherwise the system will diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 1b0ea4a5d..276f12431 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -30,14 +30,18 @@ static const struct proc_ns_operations *ns_entries[] = { &mntns_operations, }; -static const char *proc_ns_follow_link(struct dentry *dentry, void **cookie) +static const char *proc_ns_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { - struct inode *inode = d_inode(dentry); const struct proc_ns_operations *ns_ops = PROC_I(inode)->ns_ops; struct task_struct *task; struct path ns_path; void *error = ERR_PTR(-EACCES); + if (!dentry) + return ERR_PTR(-ECHILD); + task = get_proc_task(inode); if (!task) return error; @@ -74,7 +78,7 @@ static int proc_ns_readlink(struct dentry *dentry, char __user *buffer, int bufl static const struct inode_operations proc_ns_link_inode_operations = { .readlink = proc_ns_readlink, - .follow_link = proc_ns_follow_link, + .get_link = proc_ns_get_link, .setattr = proc_setattr, }; diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index cb8eda060..f8595e8b5 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -45,10 +45,7 @@ static int nommu_region_show(struct seq_file *m, struct vm_region *region) file = region->vm_file; if (file) { - struct inode *inode; - - file = vmr_pr_or_file(region); - inode = file_inode(file); + struct inode *inode = file_inode(region->vm_file); dev = inode->i_sb->s_dev; ino = inode->i_ino; } diff --git a/fs/proc/page.c b/fs/proc/page.c index 93484034a..b2855eea5 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -103,9 +103,9 @@ u64 stable_page_flags(struct page *page) * pseudo flags for the well known (anonymous) memory mapped pages * * Note that page->_mapcount is overloaded in SLOB/SLUB/SLQB, so the - * simple test in page_mapped() is not enough. + * simple test in page_mapcount() is not enough. */ - if (!PageSlab(page) && page_mapped(page)) + if (!PageSlab(page) && page_mapcount(page)) u |= 1 << KPF_MMAP; if (PageAnon(page)) u |= 1 << KPF_ANON; diff --git a/fs/proc/self.c b/fs/proc/self.c index 113b8d061..b6a8d3529 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -18,26 +18,28 @@ static int proc_self_readlink(struct dentry *dentry, char __user *buffer, return readlink_copy(buffer, buflen, tmp); } -static const char *proc_self_follow_link(struct dentry *dentry, void **cookie) +static const char *proc_self_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { - struct pid_namespace *ns = dentry->d_sb->s_fs_info; + struct pid_namespace *ns = inode->i_sb->s_fs_info; pid_t tgid = task_tgid_nr_ns(current, ns); char *name; if (!tgid) return ERR_PTR(-ENOENT); /* 11 for max length of signed int in decimal + NULL term */ - name = kmalloc(12, GFP_KERNEL); - if (!name) - return ERR_PTR(-ENOMEM); + name = kmalloc(12, dentry ? GFP_KERNEL : GFP_ATOMIC); + if (unlikely(!name)) + return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD); sprintf(name, "%d", tgid); - return *cookie = name; + set_delayed_call(done, kfree_link, name); + return name; } static const struct inode_operations proc_self_inode_operations = { .readlink = proc_self_readlink, - .follow_link = proc_self_follow_link, - .put_link = kfree_put_link, + .get_link = proc_self_get_link, }; static unsigned self_inum; @@ -48,7 +50,7 @@ int proc_setup_self(struct super_block *s) struct pid_namespace *ns = s->s_fs_info; struct dentry *self; - mutex_lock(&root_inode->i_mutex); + inode_lock(root_inode); self = d_alloc_name(s->s_root, "self"); if (self) { struct inode *inode = new_inode_pseudo(s); @@ -67,7 +69,7 @@ int proc_setup_self(struct super_block *s) } else { self = ERR_PTR(-ENOMEM); } - mutex_unlock(&root_inode->i_mutex); + inode_unlock(root_inode); if (IS_ERR(self)) { pr_err("proc_fill_super: can't allocate /proc/self\n"); return PTR_ERR(self); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index e4733feb0..fa95ab2d3 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -14,6 +14,7 @@ #include <linux/swapops.h> #include <linux/mmu_notifier.h> #include <linux/page_idle.h> +#include <linux/shmem_fs.h> #include <asm/elf.h> #include <asm/uaccess.h> @@ -22,9 +23,13 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) { - unsigned long data, text, lib, swap, ptes, pmds; + unsigned long text, lib, swap, ptes, pmds, anon, file, shmem; unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; + anon = get_mm_counter(mm, MM_ANONPAGES); + file = get_mm_counter(mm, MM_FILEPAGES); + shmem = get_mm_counter(mm, MM_SHMEMPAGES); + /* * Note: to minimize their overhead, mm maintains hiwater_vm and * hiwater_rss only when about to *lower* total_vm or rss. Any @@ -35,11 +40,10 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) hiwater_vm = total_vm = mm->total_vm; if (hiwater_vm < mm->hiwater_vm) hiwater_vm = mm->hiwater_vm; - hiwater_rss = total_rss = get_mm_rss(mm); + hiwater_rss = total_rss = anon + file + shmem; if (hiwater_rss < mm->hiwater_rss) hiwater_rss = mm->hiwater_rss; - data = mm->total_vm - mm->shared_vm - mm->stack_vm; text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; swap = get_mm_counter(mm, MM_SWAPENTS); @@ -52,6 +56,9 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) "VmPin:\t%8lu kB\n" "VmHWM:\t%8lu kB\n" "VmRSS:\t%8lu kB\n" + "RssAnon:\t%8lu kB\n" + "RssFile:\t%8lu kB\n" + "RssShmem:\t%8lu kB\n" "VmData:\t%8lu kB\n" "VmStk:\t%8lu kB\n" "VmExe:\t%8lu kB\n" @@ -65,7 +72,10 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) mm->pinned_vm << (PAGE_SHIFT-10), hiwater_rss << (PAGE_SHIFT-10), total_rss << (PAGE_SHIFT-10), - data << (PAGE_SHIFT-10), + anon << (PAGE_SHIFT-10), + file << (PAGE_SHIFT-10), + shmem << (PAGE_SHIFT-10), + mm->data_vm << (PAGE_SHIFT-10), mm->stack_vm << (PAGE_SHIFT-10), text, lib, ptes >> 10, pmds >> 10, @@ -82,10 +92,11 @@ unsigned long task_statm(struct mm_struct *mm, unsigned long *shared, unsigned long *text, unsigned long *data, unsigned long *resident) { - *shared = get_mm_counter(mm, MM_FILEPAGES); + *shared = get_mm_counter(mm, MM_FILEPAGES) + + get_mm_counter(mm, MM_SHMEMPAGES); *text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> PAGE_SHIFT; - *data = mm->total_vm - mm->shared_vm; + *data = mm->data_vm + mm->stack_vm; *resident = *shared + get_mm_counter(mm, MM_ANONPAGES); return mm->total_vm; } @@ -248,23 +259,29 @@ static int do_maps_open(struct inode *inode, struct file *file, sizeof(struct proc_maps_private)); } -static pid_t pid_of_stack(struct proc_maps_private *priv, - struct vm_area_struct *vma, bool is_pid) +/* + * Indicate if the VMA is a stack for the given task; for + * /proc/PID/maps that is the stack of the main task. + */ +static int is_stack(struct proc_maps_private *priv, + struct vm_area_struct *vma, int is_pid) { - struct inode *inode = priv->inode; - struct task_struct *task; - pid_t ret = 0; + int stack = 0; - rcu_read_lock(); - task = pid_task(proc_pid(inode), PIDTYPE_PID); - if (task) { - task = task_of_stack(task, vma, is_pid); + if (is_pid) { + stack = vma->vm_start <= vma->vm_mm->start_stack && + vma->vm_end >= vma->vm_mm->start_stack; + } else { + struct inode *inode = priv->inode; + struct task_struct *task; + + rcu_read_lock(); + task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) - ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info); + stack = vma_is_stack_for_task(vma, task); + rcu_read_unlock(); } - rcu_read_unlock(); - - return ret; + return stack; } static void @@ -281,10 +298,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) const char *name = NULL; if (file) { - struct inode *inode; - - file = vma_pr_or_file(vma); - inode = file_inode(file); + struct inode *inode = file_inode(vma->vm_file); dev = inode->i_sb->s_dev; ino = inode->i_ino; pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT; @@ -327,8 +341,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) name = arch_vma_name(vma); if (!name) { - pid_t tid; - if (!mm) { name = "[vdso]"; goto done; @@ -340,21 +352,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) goto done; } - tid = pid_of_stack(priv, vma, is_pid); - if (tid != 0) { - /* - * Thread stack in /proc/PID/task/TID/maps or - * the main process stack. - */ - if (!is_pid || (vma->vm_start <= mm->start_stack && - vma->vm_end >= mm->start_stack)) { - name = "[stack]"; - } else { - /* Thread stack in /proc/PID/maps */ - seq_pad(m, ' '); - seq_printf(m, "[stack:%d]", tid); - } - } + if (is_stack(priv, vma, is_pid)) + name = "[stack]"; } done: @@ -454,12 +453,14 @@ struct mem_size_stats { unsigned long private_hugetlb; u64 pss; u64 swap_pss; + bool check_shmem_swap; }; static void smaps_account(struct mem_size_stats *mss, struct page *page, - unsigned long size, bool young, bool dirty) + bool compound, bool young, bool dirty) { - int mapcount; + int i, nr = compound ? 1 << compound_order(page) : 1; + unsigned long size = nr * PAGE_SIZE; if (PageAnon(page)) mss->anonymous += size; @@ -468,26 +469,53 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, /* Accumulate the size in pages that have been accessed. */ if (young || page_is_young(page) || PageReferenced(page)) mss->referenced += size; - mapcount = page_mapcount(page); - if (mapcount >= 2) { - u64 pss_delta; - if (dirty || PageDirty(page)) - mss->shared_dirty += size; - else - mss->shared_clean += size; - pss_delta = (u64)size << PSS_SHIFT; - do_div(pss_delta, mapcount); - mss->pss += pss_delta; - } else { + /* + * page_count(page) == 1 guarantees the page is mapped exactly once. + * If any subpage of the compound page mapped with PTE it would elevate + * page_count(). + */ + if (page_count(page) == 1) { if (dirty || PageDirty(page)) mss->private_dirty += size; else mss->private_clean += size; mss->pss += (u64)size << PSS_SHIFT; + return; + } + + for (i = 0; i < nr; i++, page++) { + int mapcount = page_mapcount(page); + + if (mapcount >= 2) { + if (dirty || PageDirty(page)) + mss->shared_dirty += PAGE_SIZE; + else + mss->shared_clean += PAGE_SIZE; + mss->pss += (PAGE_SIZE << PSS_SHIFT) / mapcount; + } else { + if (dirty || PageDirty(page)) + mss->private_dirty += PAGE_SIZE; + else + mss->private_clean += PAGE_SIZE; + mss->pss += PAGE_SIZE << PSS_SHIFT; + } } } +#ifdef CONFIG_SHMEM +static int smaps_pte_hole(unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct mem_size_stats *mss = walk->private; + + mss->swap += shmem_partial_swap_usage( + walk->vma->vm_file->f_mapping, addr, end); + + return 0; +} +#endif + static void smaps_pte_entry(pte_t *pte, unsigned long addr, struct mm_walk *walk) { @@ -515,11 +543,25 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, } } else if (is_migration_entry(swpent)) page = migration_entry_to_page(swpent); + } else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap + && pte_none(*pte))) { + page = find_get_entry(vma->vm_file->f_mapping, + linear_page_index(vma, addr)); + if (!page) + return; + + if (radix_tree_exceptional_entry(page)) + mss->swap += PAGE_SIZE; + else + page_cache_release(page); + + return; } if (!page) return; - smaps_account(mss, page, PAGE_SIZE, pte_young(*pte), pte_dirty(*pte)); + + smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte)); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -535,8 +577,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, if (IS_ERR_OR_NULL(page)) return; mss->anonymous_thp += HPAGE_PMD_SIZE; - smaps_account(mss, page, HPAGE_PMD_SIZE, - pmd_young(*pmd), pmd_dirty(*pmd)); + smaps_account(mss, page, true, pmd_young(*pmd), pmd_dirty(*pmd)); } #else static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, @@ -552,7 +593,8 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pte_t *pte; spinlock_t *ptl; - if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { smaps_pmd_entry(pmd, addr, walk); spin_unlock(ptl); return 0; @@ -674,6 +716,31 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) }; memset(&mss, 0, sizeof mss); + +#ifdef CONFIG_SHMEM + if (vma->vm_file && shmem_mapping(vma->vm_file->f_mapping)) { + /* + * For shared or readonly shmem mappings we know that all + * swapped out pages belong to the shmem object, and we can + * obtain the swap value much more efficiently. For private + * writable mappings, we might have COW pages that are + * not affected by the parent swapped out pages of the shmem + * object, so we have to distinguish them during the page walk. + * Unless we know that the shmem object (or the part mapped by + * our VMA) has no swapped out pages at all. + */ + unsigned long shmem_swapped = shmem_swap_usage(vma); + + if (!shmem_swapped || (vma->vm_flags & VM_SHARED) || + !(vma->vm_flags & VM_WRITE)) { + mss.swap = shmem_swapped; + } else { + mss.check_shmem_swap = true; + smaps_walk.pte_hole = smaps_pte_hole; + } + } +#endif + /* mmap_sem is held in m_start */ walk_page_vma(vma, &smaps_walk); @@ -820,9 +887,6 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, pmd = pmd_wrprotect(pmd); pmd = pmd_clear_soft_dirty(pmd); - if (vma->vm_flags & VM_SOFTDIRTY) - vma->vm_flags &= ~VM_SOFTDIRTY; - set_pmd_at(vma->vm_mm, addr, pmdp, pmd); } #else @@ -841,7 +905,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, spinlock_t *ptl; struct page *page; - if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { if (cp->type == CLEAR_REFS_SOFT_DIRTY) { clear_soft_dirty_pmd(vma, addr, pmd); goto out; @@ -1115,7 +1180,8 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, int err = 0; #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (pmd_trans_huge_lock(pmdp, vma, &ptl) == 1) { + ptl = pmd_trans_huge_lock(pmdp, vma); + if (ptl) { u64 flags = 0, frame = 0; pmd_t pmd = *pmdp; @@ -1447,7 +1513,8 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, pte_t *orig_pte; pte_t *pte; - if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { pte_t huge_pte = *(pte_t *)pmd; struct page *page; @@ -1509,7 +1576,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) struct proc_maps_private *proc_priv = &numa_priv->proc_maps; struct vm_area_struct *vma = v; struct numa_maps *md = &numa_priv->md; - struct file *file = vma_pr_or_file(vma); + struct file *file = vma->vm_file; struct mm_struct *mm = vma->vm_mm; struct mm_walk walk = { .hugetlb_entry = gather_hugetlb_stats, @@ -1542,19 +1609,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) seq_file_path(m, file, "\n\t= "); } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { seq_puts(m, " heap"); - } else { - pid_t tid = pid_of_stack(proc_priv, vma, is_pid); - if (tid != 0) { - /* - * Thread stack in /proc/PID/task/TID/maps or - * the main process stack. - */ - if (!is_pid || (vma->vm_start <= mm->start_stack && - vma->vm_end >= mm->start_stack)) - seq_puts(m, " stack"); - else - seq_printf(m, " stack:%d", tid); - } + } else if (is_stack(proc_priv, vma, is_pid)) { + seq_puts(m, " stack"); } if (is_vm_hugetlb_page(vma)) diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 7aa92dbf9..faacb0c0d 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -123,23 +123,26 @@ unsigned long task_statm(struct mm_struct *mm, return size; } -static pid_t pid_of_stack(struct proc_maps_private *priv, - struct vm_area_struct *vma, bool is_pid) +static int is_stack(struct proc_maps_private *priv, + struct vm_area_struct *vma, int is_pid) { - struct inode *inode = priv->inode; - struct task_struct *task; - pid_t ret = 0; - - rcu_read_lock(); - task = pid_task(proc_pid(inode), PIDTYPE_PID); - if (task) { - task = task_of_stack(task, vma, is_pid); + struct mm_struct *mm = vma->vm_mm; + int stack = 0; + + if (is_pid) { + stack = vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack; + } else { + struct inode *inode = priv->inode; + struct task_struct *task; + + rcu_read_lock(); + task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) - ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info); + stack = vma_is_stack_for_task(vma, task); + rcu_read_unlock(); } - rcu_read_unlock(); - - return ret; + return stack; } /* @@ -160,10 +163,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, file = vma->vm_file; if (file) { - struct inode *inode; - - file = vma_pr_or_file(vma); - inode = file_inode(file); + struct inode *inode = file_inode(vma->vm_file); dev = inode->i_sb->s_dev; ino = inode->i_ino; pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT; @@ -184,21 +184,9 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, if (file) { seq_pad(m, ' '); seq_file_path(m, file, ""); - } else if (mm) { - pid_t tid = pid_of_stack(priv, vma, is_pid); - - if (tid != 0) { - seq_pad(m, ' '); - /* - * Thread stack in /proc/PID/task/TID/maps or - * the main process stack. - */ - if (!is_pid || (vma->vm_start <= mm->start_stack && - vma->vm_end >= mm->start_stack)) - seq_printf(m, "[stack]"); - else - seq_printf(m, "[stack:%d]", tid); - } + } else if (mm && is_stack(priv, vma, is_pid)) { + seq_pad(m, ' '); + seq_printf(m, "[stack]"); } seq_putc(m, '\n'); diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index 947b0f4fd..e58a31e8f 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -19,26 +19,29 @@ static int proc_thread_self_readlink(struct dentry *dentry, char __user *buffer, return readlink_copy(buffer, buflen, tmp); } -static const char *proc_thread_self_follow_link(struct dentry *dentry, void **cookie) +static const char *proc_thread_self_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { - struct pid_namespace *ns = dentry->d_sb->s_fs_info; + struct pid_namespace *ns = inode->i_sb->s_fs_info; pid_t tgid = task_tgid_nr_ns(current, ns); pid_t pid = task_pid_nr_ns(current, ns); char *name; if (!pid) return ERR_PTR(-ENOENT); - name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF, GFP_KERNEL); - if (!name) - return ERR_PTR(-ENOMEM); + name = kmalloc(PROC_NUMBUF + 6 + PROC_NUMBUF, + dentry ? GFP_KERNEL : GFP_ATOMIC); + if (unlikely(!name)) + return dentry ? ERR_PTR(-ENOMEM) : ERR_PTR(-ECHILD); sprintf(name, "%d/task/%d", tgid, pid); - return *cookie = name; + set_delayed_call(done, kfree_link, name); + return name; } static const struct inode_operations proc_thread_self_inode_operations = { .readlink = proc_thread_self_readlink, - .follow_link = proc_thread_self_follow_link, - .put_link = kfree_put_link, + .get_link = proc_thread_self_get_link, }; static unsigned thread_self_inum; @@ -49,7 +52,7 @@ int proc_setup_thread_self(struct super_block *s) struct pid_namespace *ns = s->s_fs_info; struct dentry *thread_self; - mutex_lock(&root_inode->i_mutex); + inode_lock(root_inode); thread_self = d_alloc_name(s->s_root, "thread-self"); if (thread_self) { struct inode *inode = new_inode_pseudo(s); @@ -68,7 +71,7 @@ int proc_setup_thread_self(struct super_block *s) } else { thread_self = ERR_PTR(-ENOMEM); } - mutex_unlock(&root_inode->i_mutex); + inode_unlock(root_inode); if (IS_ERR(thread_self)) { pr_err("proc_fill_super: can't allocate /proc/thread_self\n"); return PTR_ERR(thread_self); diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index 8ebd9a334..2256e7e23 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -95,9 +95,9 @@ static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt) { struct proc_mounts *p = m->private; struct mount *r = real_mount(mnt); - int err = 0; struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; struct super_block *sb = mnt_path.dentry->d_sb; + int err; if (sb->s_op->show_devname) { err = sb->s_op->show_devname(m, mnt_path.dentry); @@ -131,16 +131,17 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt) struct mount *r = real_mount(mnt); struct super_block *sb = mnt->mnt_sb; struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; - int err = 0; + int err; seq_printf(m, "%i %i %u:%u ", r->mnt_id, r->mnt_parent->mnt_id, MAJOR(sb->s_dev), MINOR(sb->s_dev)); - if (sb->s_op->show_path) + if (sb->s_op->show_path) { err = sb->s_op->show_path(m, mnt->mnt_root); - else + if (err) + goto out; + } else { seq_dentry(m, mnt->mnt_root, " \t\n\\"); - if (err) - goto out; + } seq_putc(m, ' '); /* mountpoints outside of chroot jail will give SEQ_SKIP on this */ @@ -168,12 +169,13 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt) seq_puts(m, " - "); show_type(m, sb); seq_putc(m, ' '); - if (sb->s_op->show_devname) + if (sb->s_op->show_devname) { err = sb->s_op->show_devname(m, mnt->mnt_root); - else + if (err) + goto out; + } else { mangle(m, r->mnt_devname ? r->mnt_devname : "none"); - if (err) - goto out; + } seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw"); err = show_sb_opts(m, sb); if (err) @@ -191,7 +193,7 @@ static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt) struct mount *r = real_mount(mnt); struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; struct super_block *sb = mnt_path.dentry->d_sb; - int err = 0; + int err; /* device */ if (sb->s_op->show_devname) { @@ -220,8 +222,7 @@ static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt) /* optional statistics */ if (sb->s_op->show_stats) { seq_putc(m, ' '); - if (!err) - err = sb->s_op->show_stats(m, mnt_path.dentry); + err = sb->s_op->show_stats(m, mnt_path.dentry); } seq_putc(m, '\n'); diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index d8c439d81..dc645b66c 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -377,7 +377,7 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, break; } - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); dentry = d_alloc_name(root, name); if (!dentry) @@ -397,12 +397,12 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, list_add(&private->list, &allpstore); spin_unlock_irqrestore(&allpstore_lock, flags); - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); return 0; fail_lockedalloc: - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); kfree(private); fail_alloc: iput(inode); diff --git a/fs/qnx4/inode.c b/fs/qnx4/inode.c index c4bcb7788..3a67cfb14 100644 --- a/fs/qnx4/inode.c +++ b/fs/qnx4/inode.c @@ -316,6 +316,7 @@ struct inode *qnx4_iget(struct super_block *sb, unsigned long ino) inode->i_fop = &qnx4_dir_operations; } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &qnx4_aops; qnx4_i(inode)->mmu_private = inode->i_size; } else { @@ -364,7 +365,7 @@ static int init_inodecache(void) qnx4_inode_cachep = kmem_cache_create("qnx4_inode_cache", sizeof(struct qnx4_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (qnx4_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/qnx6/inode.c b/fs/qnx6/inode.c index 32d2e1a97..47bb1de07 100644 --- a/fs/qnx6/inode.c +++ b/fs/qnx6/inode.c @@ -582,6 +582,7 @@ struct inode *qnx6_iget(struct super_block *sb, unsigned ino) inode->i_mapping->a_ops = &qnx6_aops; } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &qnx6_aops; } else init_special_inode(inode, inode->i_mode, 0); @@ -624,7 +625,7 @@ static int init_inodecache(void) qnx6_inode_cachep = kmem_cache_create("qnx6_inode_cache", sizeof(struct qnx6_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (!qnx6_inode_cachep) return -ENOMEM; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index ef0d64b2a..3c3b81bb6 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -682,9 +682,9 @@ int dquot_quota_sync(struct super_block *sb, int type) continue; if (!sb_has_quota_active(sb, cnt)) continue; - mutex_lock(&dqopt->files[cnt]->i_mutex); + inode_lock(dqopt->files[cnt]); truncate_inode_pages(&dqopt->files[cnt]->i_data, 0); - mutex_unlock(&dqopt->files[cnt]->i_mutex); + inode_unlock(dqopt->files[cnt]); } mutex_unlock(&dqopt->dqonoff_mutex); @@ -2162,12 +2162,12 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags) /* If quota was reenabled in the meantime, we have * nothing to do */ if (!sb_has_quota_loaded(sb, cnt)) { - mutex_lock(&toputinode[cnt]->i_mutex); + inode_lock(toputinode[cnt]); toputinode[cnt]->i_flags &= ~(S_IMMUTABLE | S_NOATIME | S_NOQUOTA); truncate_inode_pages(&toputinode[cnt]->i_data, 0); - mutex_unlock(&toputinode[cnt]->i_mutex); + inode_unlock(toputinode[cnt]); mark_inode_dirty_sync(toputinode[cnt]); } mutex_unlock(&dqopt->dqonoff_mutex); @@ -2258,11 +2258,11 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, /* We don't want quota and atime on quota files (deadlocks * possible) Also nobody should write to the file - we use * special IO operations which ignore the immutable bit. */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA); inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* * When S_NOQUOTA is set, remove dquot references as no more * references can be added @@ -2305,12 +2305,12 @@ out_file_init: iput(inode); out_lock: if (oldflags != -1) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* Set the flags back (in the case of accidental quotaon() * on a wrong file we don't want to mess up the flags) */ inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE); inode->i_flags |= oldflags; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } mutex_unlock(&dqopt->dqonoff_mutex); out_fmt: @@ -2430,9 +2430,9 @@ int dquot_quota_on_mount(struct super_block *sb, char *qf_name, struct dentry *dentry; int error; - mutex_lock(&d_inode(sb->s_root)->i_mutex); + inode_lock(d_inode(sb->s_root)); dentry = lookup_one_len(qf_name, sb->s_root, strlen(qf_name)); - mutex_unlock(&d_inode(sb->s_root)->i_mutex); + inode_unlock(d_inode(sb->s_root)); if (IS_ERR(dentry)) return PTR_ERR(dentry); @@ -2924,4 +2924,4 @@ static int __init dquot_init(void) return 0; } -module_init(dquot_init); +fs_initcall(dquot_init); diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c index bb2869f5d..d07a2f91d 100644 --- a/fs/quota/netlink.c +++ b/fs/quota/netlink.c @@ -1,7 +1,5 @@ - #include <linux/cred.h> #include <linux/init.h> -#include <linux/module.h> #include <linux/kernel.h> #include <linux/quotaops.h> #include <linux/sched.h> @@ -105,5 +103,4 @@ static int __init quota_init(void) "VFS: Failed to create quota netlink interface.\n"); return 0; }; - -module_init(quota_init); +fs_initcall(quota_init); diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c index 2aa012a68..ed85d4f35 100644 --- a/fs/quota/quota_v2.c +++ b/fs/quota/quota_v2.c @@ -30,13 +30,13 @@ static void v2r1_mem2diskdqb(void *dp, struct dquot *dquot); static void v2r1_disk2memdqb(struct dquot *dquot, void *dp); static int v2r1_is_id(void *dp, struct dquot *dquot); -static struct qtree_fmt_operations v2r0_qtree_ops = { +static const struct qtree_fmt_operations v2r0_qtree_ops = { .mem2disk_dqblk = v2r0_mem2diskdqb, .disk2mem_dqblk = v2r0_disk2memdqb, .is_id = v2r0_is_id, }; -static struct qtree_fmt_operations v2r1_qtree_ops = { +static const struct qtree_fmt_operations v2r1_qtree_ops = { .mem2disk_dqblk = v2r1_mem2diskdqb, .disk2mem_dqblk = v2r1_disk2memdqb, .is_id = v2r1_is_id, diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 889d558b4..38981b037 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -79,6 +79,7 @@ struct inode *ramfs_get_inode(struct super_block *sb, break; case S_IFLNK: inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); break; } } diff --git a/fs/read_write.c b/fs/read_write.c index 0a2893354..dadf24e5c 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -16,6 +16,8 @@ #include <linux/pagemap.h> #include <linux/splice.h> #include <linux/compat.h> +#include <linux/mount.h> +#include <linux/fs.h> #include "internal.h" #include <asm/uaccess.h> @@ -171,6 +173,45 @@ loff_t fixed_size_llseek(struct file *file, loff_t offset, int whence, loff_t si EXPORT_SYMBOL(fixed_size_llseek); /** + * no_seek_end_llseek - llseek implementation for fixed-sized devices + * @file: file structure to seek on + * @offset: file offset to seek to + * @whence: type of seek + * + */ +loff_t no_seek_end_llseek(struct file *file, loff_t offset, int whence) +{ + switch (whence) { + case SEEK_SET: case SEEK_CUR: + return generic_file_llseek_size(file, offset, whence, + OFFSET_MAX, 0); + default: + return -EINVAL; + } +} +EXPORT_SYMBOL(no_seek_end_llseek); + +/** + * no_seek_end_llseek_size - llseek implementation for fixed-sized devices + * @file: file structure to seek on + * @offset: file offset to seek to + * @whence: type of seek + * @size: maximal offset allowed + * + */ +loff_t no_seek_end_llseek_size(struct file *file, loff_t offset, int whence, loff_t size) +{ + switch (whence) { + case SEEK_SET: case SEEK_CUR: + return generic_file_llseek_size(file, offset, whence, + size, 0); + default: + return -EINVAL; + } +} +EXPORT_SYMBOL(no_seek_end_llseek_size); + +/** * noop_llseek - No Operation Performed llseek implementation * @file: file structure to seek on * @offset: file offset to seek to @@ -198,7 +239,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int whence) struct inode *inode = file_inode(file); loff_t retval; - mutex_lock(&inode->i_mutex); + inode_lock(inode); switch (whence) { case SEEK_END: offset += i_size_read(inode); @@ -243,7 +284,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int whence) retval = offset; } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return retval; } EXPORT_SYMBOL(default_llseek); @@ -395,9 +436,8 @@ int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t } if (unlikely(inode->i_flctx && mandatory_lock(inode))) { - retval = locks_mandatory_area( - read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE, - inode, file, pos, count); + retval = locks_mandatory_area(inode, file, pos, pos + count - 1, + read_write == READ ? F_RDLCK : F_WRLCK); if (retval < 0) return retval; } @@ -494,30 +534,6 @@ ssize_t __vfs_write(struct file *file, const char __user *p, size_t count, } EXPORT_SYMBOL(__vfs_write); -vfs_readf_t vfs_readf(struct file *file) -{ - const struct file_operations *fop = file->f_op; - - if (fop->read) - return fop->read; - if (fop->read_iter) - return new_sync_read; - return ERR_PTR(-ENOSYS); -} -EXPORT_SYMBOL_GPL(vfs_readf); - -vfs_writef_t vfs_writef(struct file *file) -{ - const struct file_operations *fop = file->f_op; - - if (fop->write) - return fop->write; - if (fop->write_iter) - return new_sync_write; - return ERR_PTR(-ENOSYS); -} -EXPORT_SYMBOL_GPL(vfs_writef); - ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t *pos) { mm_segment_t old_fs; @@ -1351,3 +1367,304 @@ COMPAT_SYSCALL_DEFINE4(sendfile64, int, out_fd, int, in_fd, return do_sendfile(out_fd, in_fd, NULL, count, 0); } #endif + +/* + * copy_file_range() differs from regular file read and write in that it + * specifically allows return partial success. When it does so is up to + * the copy_file_range method. + */ +ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, + size_t len, unsigned int flags) +{ + struct inode *inode_in = file_inode(file_in); + struct inode *inode_out = file_inode(file_out); + ssize_t ret; + + if (flags != 0) + return -EINVAL; + + /* copy_file_range allows full ssize_t len, ignoring MAX_RW_COUNT */ + ret = rw_verify_area(READ, file_in, &pos_in, len); + if (ret >= 0) + ret = rw_verify_area(WRITE, file_out, &pos_out, len); + if (ret < 0) + return ret; + + if (!(file_in->f_mode & FMODE_READ) || + !(file_out->f_mode & FMODE_WRITE) || + (file_out->f_flags & O_APPEND)) + return -EBADF; + + /* this could be relaxed once a method supports cross-fs copies */ + if (inode_in->i_sb != inode_out->i_sb) + return -EXDEV; + + if (len == 0) + return 0; + + ret = mnt_want_write_file(file_out); + if (ret) + return ret; + + ret = -EOPNOTSUPP; + if (file_out->f_op->copy_file_range) + ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out, + pos_out, len, flags); + if (ret == -EOPNOTSUPP) + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, + len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0); + + if (ret > 0) { + fsnotify_access(file_in); + add_rchar(current, ret); + fsnotify_modify(file_out); + add_wchar(current, ret); + } + inc_syscr(current); + inc_syscw(current); + + mnt_drop_write_file(file_out); + + return ret; +} +EXPORT_SYMBOL(vfs_copy_file_range); + +SYSCALL_DEFINE6(copy_file_range, int, fd_in, loff_t __user *, off_in, + int, fd_out, loff_t __user *, off_out, + size_t, len, unsigned int, flags) +{ + loff_t pos_in; + loff_t pos_out; + struct fd f_in; + struct fd f_out; + ssize_t ret = -EBADF; + + f_in = fdget(fd_in); + if (!f_in.file) + goto out2; + + f_out = fdget(fd_out); + if (!f_out.file) + goto out1; + + ret = -EFAULT; + if (off_in) { + if (copy_from_user(&pos_in, off_in, sizeof(loff_t))) + goto out; + } else { + pos_in = f_in.file->f_pos; + } + + if (off_out) { + if (copy_from_user(&pos_out, off_out, sizeof(loff_t))) + goto out; + } else { + pos_out = f_out.file->f_pos; + } + + ret = vfs_copy_file_range(f_in.file, pos_in, f_out.file, pos_out, len, + flags); + if (ret > 0) { + pos_in += ret; + pos_out += ret; + + if (off_in) { + if (copy_to_user(off_in, &pos_in, sizeof(loff_t))) + ret = -EFAULT; + } else { + f_in.file->f_pos = pos_in; + } + + if (off_out) { + if (copy_to_user(off_out, &pos_out, sizeof(loff_t))) + ret = -EFAULT; + } else { + f_out.file->f_pos = pos_out; + } + } + +out: + fdput(f_out); +out1: + fdput(f_in); +out2: + return ret; +} + +static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) +{ + struct inode *inode = file_inode(file); + + if (unlikely(pos < 0)) + return -EINVAL; + + if (unlikely((loff_t) (pos + len) < 0)) + return -EINVAL; + + if (unlikely(inode->i_flctx && mandatory_lock(inode))) { + loff_t end = len ? pos + len - 1 : OFFSET_MAX; + int retval; + + retval = locks_mandatory_area(inode, file, pos, end, + write ? F_WRLCK : F_RDLCK); + if (retval < 0) + return retval; + } + + return security_file_permission(file, write ? MAY_WRITE : MAY_READ); +} + +int vfs_clone_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, u64 len) +{ + struct inode *inode_in = file_inode(file_in); + struct inode *inode_out = file_inode(file_out); + int ret; + + if (inode_in->i_sb != inode_out->i_sb || + file_in->f_path.mnt != file_out->f_path.mnt) + return -EXDEV; + + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) + return -EISDIR; + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) + return -EINVAL; + + if (!(file_in->f_mode & FMODE_READ) || + !(file_out->f_mode & FMODE_WRITE) || + (file_out->f_flags & O_APPEND)) + return -EBADF; + + if (!file_in->f_op->clone_file_range) + return -EOPNOTSUPP; + + ret = clone_verify_area(file_in, pos_in, len, false); + if (ret) + return ret; + + ret = clone_verify_area(file_out, pos_out, len, true); + if (ret) + return ret; + + if (pos_in + len > i_size_read(inode_in)) + return -EINVAL; + + ret = mnt_want_write_file(file_out); + if (ret) + return ret; + + ret = file_in->f_op->clone_file_range(file_in, pos_in, + file_out, pos_out, len); + if (!ret) { + fsnotify_access(file_in); + fsnotify_modify(file_out); + } + + mnt_drop_write_file(file_out); + return ret; +} +EXPORT_SYMBOL(vfs_clone_file_range); + +int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) +{ + struct file_dedupe_range_info *info; + struct inode *src = file_inode(file); + u64 off; + u64 len; + int i; + int ret; + bool is_admin = capable(CAP_SYS_ADMIN); + u16 count = same->dest_count; + struct file *dst_file; + loff_t dst_off; + ssize_t deduped; + + if (!(file->f_mode & FMODE_READ)) + return -EINVAL; + + if (same->reserved1 || same->reserved2) + return -EINVAL; + + off = same->src_offset; + len = same->src_length; + + ret = -EISDIR; + if (S_ISDIR(src->i_mode)) + goto out; + + ret = -EINVAL; + if (!S_ISREG(src->i_mode)) + goto out; + + ret = clone_verify_area(file, off, len, false); + if (ret < 0) + goto out; + ret = 0; + + /* pre-format output fields to sane values */ + for (i = 0; i < count; i++) { + same->info[i].bytes_deduped = 0ULL; + same->info[i].status = FILE_DEDUPE_RANGE_SAME; + } + + for (i = 0, info = same->info; i < count; i++, info++) { + struct inode *dst; + struct fd dst_fd = fdget(info->dest_fd); + + dst_file = dst_fd.file; + if (!dst_file) { + info->status = -EBADF; + goto next_loop; + } + dst = file_inode(dst_file); + + ret = mnt_want_write_file(dst_file); + if (ret) { + info->status = ret; + goto next_loop; + } + + dst_off = info->dest_offset; + ret = clone_verify_area(dst_file, dst_off, len, true); + if (ret < 0) { + info->status = ret; + goto next_file; + } + ret = 0; + + if (info->reserved) { + info->status = -EINVAL; + } else if (!(is_admin || (dst_file->f_mode & FMODE_WRITE))) { + info->status = -EINVAL; + } else if (file->f_path.mnt != dst_file->f_path.mnt) { + info->status = -EXDEV; + } else if (S_ISDIR(dst->i_mode)) { + info->status = -EISDIR; + } else if (dst_file->f_op->dedupe_file_range == NULL) { + info->status = -EINVAL; + } else { + deduped = dst_file->f_op->dedupe_file_range(file, off, + len, dst_file, + info->dest_offset); + if (deduped == -EBADE) + info->status = FILE_DEDUPE_RANGE_DIFFERS; + else if (deduped < 0) + info->status = deduped; + else + info->bytes_deduped += deduped; + } + +next_file: + mnt_drop_write_file(dst_file); +next_loop: + fdput(dst_fd); + + if (fatal_signal_pending(current)) + goto out; + } + +out: + return ret; +} +EXPORT_SYMBOL(vfs_dedupe_file_range); diff --git a/fs/readdir.c b/fs/readdir.c index ced679179..e69ef3b79 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -44,7 +44,7 @@ int iterate_dir(struct file *file, struct dir_context *ctx) fsnotify_access(file); file_accessed(file); } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); out: return res; } diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index 4a024e2ce..3abd40041 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -38,11 +38,11 @@ static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end, if (err) return err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); reiserfs_write_lock(inode->i_sb); err = reiserfs_commit_for_inode(inode); reiserfs_write_unlock(inode->i_sb); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (err < 0) return err; return 0; diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 96a1bcf33..9424a4ba9 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -158,7 +158,7 @@ static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end, if (err) return err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); BUG_ON(!S_ISREG(inode->i_mode)); err = sync_mapping_buffers(inode->i_mapping); reiserfs_write_lock(inode->i_sb); @@ -166,7 +166,7 @@ static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end, reiserfs_write_unlock(inode->i_sb); if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb)) blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (barrier_done < 0) return barrier_done; return (err < 0) ? -EIO : 0; diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 3d8e7e671..ae9e5b308 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1361,6 +1361,7 @@ static void init_inode(struct inode *inode, struct treepath *path) inode->i_fop = &reiserfs_dir_operations; } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &reiserfs_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &reiserfs_address_space_operations; } else { inode->i_blocks = 0; diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 6ec8a30a0..036a1fc0a 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -224,7 +224,7 @@ out_unlock: page_cache_release(page); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); reiserfs_write_unlock(inode->i_sb); return retval; } diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 9d6486d41..44c2bdced 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -618,12 +618,10 @@ static void release_buffer_page(struct buffer_head *bh) static void reiserfs_end_buffer_io_sync(struct buffer_head *bh, int uptodate) { - char b[BDEVNAME_SIZE]; - if (buffer_journaled(bh)) { reiserfs_warning(NULL, "clm-2084", - "pinned buffer %lu:%s sent to disk", - bh->b_blocknr, bdevname(bh->b_bdev, b)); + "pinned buffer %lu:%pg sent to disk", + bh->b_blocknr, bh->b_bdev); } if (uptodate) set_buffer_uptodate(bh); @@ -2387,11 +2385,10 @@ static int journal_read(struct super_block *sb) int replay_count = 0; int continue_replay = 1; int ret; - char b[BDEVNAME_SIZE]; cur_dblock = SB_ONDISK_JOURNAL_1st_BLOCK(sb); - reiserfs_info(sb, "checking transaction log (%s)\n", - bdevname(journal->j_dev_bd, b)); + reiserfs_info(sb, "checking transaction log (%pg)\n", + journal->j_dev_bd); start = get_seconds(); /* @@ -2651,8 +2648,8 @@ static int journal_init_dev(struct super_block *super, set_blocksize(journal->j_dev_bd, super->s_blocksize); reiserfs_info(super, - "journal_init_dev: journal device: %s\n", - bdevname(journal->j_dev_bd, b)); + "journal_init_dev: journal device: %pg\n", + journal->j_dev_bd); return 0; } @@ -2724,7 +2721,6 @@ int journal_init(struct super_block *sb, const char *j_dev_name, struct reiserfs_journal_header *jh; struct reiserfs_journal *journal; struct reiserfs_journal_list *jl; - char b[BDEVNAME_SIZE]; int ret; journal = SB_JOURNAL(sb) = vzalloc(sizeof(struct reiserfs_journal)); @@ -2794,10 +2790,10 @@ int journal_init(struct super_block *sb, const char *j_dev_name, && (le32_to_cpu(jh->jh_journal.jp_journal_magic) != sb_jp_journal_magic(rs))) { reiserfs_warning(sb, "sh-460", - "journal header magic %x (device %s) does " + "journal header magic %x (device %pg) does " "not match to magic found in super block %x", jh->jh_journal.jp_journal_magic, - bdevname(journal->j_dev_bd, b), + journal->j_dev_bd, sb_jp_journal_magic(rs)); brelse(bhjh); goto free_and_return; @@ -2818,10 +2814,10 @@ int journal_init(struct super_block *sb, const char *j_dev_name, journal->j_max_trans_age = commit_max_age; } - reiserfs_info(sb, "journal params: device %s, size %u, " + reiserfs_info(sb, "journal params: device %pg, size %u, " "journal first block %u, max trans len %u, max batch %u, " "max commit age %u, max trans age %u\n", - bdevname(journal->j_dev_bd, b), + journal->j_dev_bd, SB_ONDISK_JOURNAL_SIZE(sb), SB_ONDISK_JOURNAL_1st_BLOCK(sb), journal->j_trans_max, diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 47f96988f..2a12d46d7 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -1170,6 +1170,7 @@ static int reiserfs_symlink(struct inode *parent_dir, reiserfs_update_inode_transaction(parent_dir); inode->i_op = &reiserfs_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &reiserfs_address_space_operations; retval = reiserfs_add_entry(&th, parent_dir, dentry->d_name.name, @@ -1664,8 +1665,7 @@ const struct inode_operations reiserfs_dir_inode_operations = { */ const struct inode_operations reiserfs_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, + .get_link = page_get_link, .setattr = reiserfs_setattr, .setxattr = reiserfs_setxattr, .getxattr = reiserfs_getxattr, diff --git a/fs/reiserfs/prints.c b/fs/reiserfs/prints.c index ae1dc841d..4f3f92807 100644 --- a/fs/reiserfs/prints.c +++ b/fs/reiserfs/prints.c @@ -139,11 +139,9 @@ static void sprintf_block_head(char *buf, struct buffer_head *bh) static void sprintf_buffer_head(char *buf, struct buffer_head *bh) { - char b[BDEVNAME_SIZE]; - sprintf(buf, - "dev %s, size %zd, blocknr %llu, count %d, state 0x%lx, page %p, (%s, %s, %s)", - bdevname(bh->b_bdev, b), bh->b_size, + "dev %pg, size %zd, blocknr %llu, count %d, state 0x%lx, page %p, (%s, %s, %s)", + bh->b_bdev, bh->b_size, (unsigned long long)bh->b_blocknr, atomic_read(&(bh->b_count)), bh->b_state, bh->b_page, buffer_uptodate(bh) ? "UPTODATE" : "!UPTODATE", @@ -530,7 +528,6 @@ static int print_super_block(struct buffer_head *bh) (struct reiserfs_super_block *)(bh->b_data); int skipped, data_blocks; char *version; - char b[BDEVNAME_SIZE]; if (is_reiserfs_3_5(rs)) { version = "3.5"; @@ -543,7 +540,7 @@ static int print_super_block(struct buffer_head *bh) return 1; } - printk("%s\'s super block is in block %llu\n", bdevname(bh->b_bdev, b), + printk("%pg\'s super block is in block %llu\n", bh->b_bdev, (unsigned long long)bh->b_blocknr); printk("Reiserfs version %s\n", version); printk("Block count %u\n", sb_block_count(rs)); diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index 621b9f381..fe999157d 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -303,11 +303,10 @@ static int show_journal(struct seq_file *m, void *unused) struct reiserfs_sb_info *r = REISERFS_SB(sb); struct reiserfs_super_block *rs = r->s_rs; struct journal_params *jp = &rs->s_v1.s_journal; - char b[BDEVNAME_SIZE]; seq_printf(m, /* on-disk fields */ "jp_journal_1st_block: \t%i\n" - "jp_journal_dev: \t%s[%x]\n" + "jp_journal_dev: \t%pg[%x]\n" "jp_journal_size: \t%i\n" "jp_journal_trans_max: \t%i\n" "jp_journal_magic: \t%i\n" @@ -348,7 +347,7 @@ static int show_journal(struct seq_file *m, void *unused) "prepare: \t%12lu\n" "prepare_retry: \t%12lu\n", DJP(jp_journal_1st_block), - bdevname(SB_JOURNAL(sb)->j_dev_bd, b), + SB_JOURNAL(sb)->j_dev_bd, DJP(jp_journal_dev), DJP(jp_journal_size), DJP(jp_journal_trans_max), diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 4a62fe8cc..c0306ec8e 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -288,7 +288,7 @@ static int finish_unfinished(struct super_block *s) pathrelse(&path); inode = reiserfs_iget(s, &obj_key); - if (!inode) { + if (IS_ERR_OR_NULL(inode)) { /* * the unlink almost completed, it just did not * manage to remove "save" link and release objectid @@ -626,7 +626,8 @@ static int __init init_inodecache(void) sizeof(struct reiserfs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD| + SLAB_ACCOUNT), init_once); if (reiserfs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 66b26fdff..57e0b2310 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -64,14 +64,14 @@ #ifdef CONFIG_REISERFS_FS_XATTR static int xattr_create(struct inode *dir, struct dentry *dentry, int mode) { - BUG_ON(!mutex_is_locked(&dir->i_mutex)); + BUG_ON(!inode_is_locked(dir)); return dir->i_op->create(dir, dentry, mode, true); } #endif static int xattr_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { - BUG_ON(!mutex_is_locked(&dir->i_mutex)); + BUG_ON(!inode_is_locked(dir)); return dir->i_op->mkdir(dir, dentry, mode); } @@ -85,11 +85,11 @@ static int xattr_unlink(struct inode *dir, struct dentry *dentry) { int error; - BUG_ON(!mutex_is_locked(&dir->i_mutex)); + BUG_ON(!inode_is_locked(dir)); - mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD); error = dir->i_op->unlink(dir, dentry); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); if (!error) d_delete(dentry); @@ -100,13 +100,13 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry) { int error; - BUG_ON(!mutex_is_locked(&dir->i_mutex)); + BUG_ON(!inode_is_locked(dir)); - mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD); error = dir->i_op->rmdir(dir, dentry); if (!error) d_inode(dentry)->i_flags |= S_DEAD; - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); if (!error) d_delete(dentry); @@ -123,7 +123,7 @@ static struct dentry *open_xa_root(struct super_block *sb, int flags) if (d_really_is_negative(privroot)) return ERR_PTR(-ENODATA); - mutex_lock_nested(&d_inode(privroot)->i_mutex, I_MUTEX_XATTR); + inode_lock_nested(d_inode(privroot), I_MUTEX_XATTR); xaroot = dget(REISERFS_SB(sb)->xattr_root); if (!xaroot) @@ -139,7 +139,7 @@ static struct dentry *open_xa_root(struct super_block *sb, int flags) } } - mutex_unlock(&d_inode(privroot)->i_mutex); + inode_unlock(d_inode(privroot)); return xaroot; } @@ -156,7 +156,7 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags) le32_to_cpu(INODE_PKEY(inode)->k_objectid), inode->i_generation); - mutex_lock_nested(&d_inode(xaroot)->i_mutex, I_MUTEX_XATTR); + inode_lock_nested(d_inode(xaroot), I_MUTEX_XATTR); xadir = lookup_one_len(namebuf, xaroot, strlen(namebuf)); if (!IS_ERR(xadir) && d_really_is_negative(xadir)) { @@ -170,7 +170,7 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags) } } - mutex_unlock(&d_inode(xaroot)->i_mutex); + inode_unlock(d_inode(xaroot)); dput(xaroot); return xadir; } @@ -195,7 +195,7 @@ fill_with_dentries(struct dir_context *ctx, const char *name, int namelen, container_of(ctx, struct reiserfs_dentry_buf, ctx); struct dentry *dentry; - WARN_ON_ONCE(!mutex_is_locked(&d_inode(dbuf->xadir)->i_mutex)); + WARN_ON_ONCE(!inode_is_locked(d_inode(dbuf->xadir))); if (dbuf->count == ARRAY_SIZE(dbuf->dentries)) return -ENOSPC; @@ -254,7 +254,7 @@ static int reiserfs_for_each_xattr(struct inode *inode, goto out_dir; } - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_XATTR); + inode_lock_nested(d_inode(dir), I_MUTEX_XATTR); buf.xadir = dir; while (1) { @@ -276,7 +276,7 @@ static int reiserfs_for_each_xattr(struct inode *inode, break; buf.count = 0; } - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); cleanup_dentry_buf(&buf); @@ -298,13 +298,13 @@ static int reiserfs_for_each_xattr(struct inode *inode, if (!err) { int jerror; - mutex_lock_nested(&d_inode(dir->d_parent)->i_mutex, + inode_lock_nested(d_inode(dir->d_parent), I_MUTEX_XATTR); err = action(dir, data); reiserfs_write_lock(inode->i_sb); jerror = journal_end(&th); reiserfs_write_unlock(inode->i_sb); - mutex_unlock(&d_inode(dir->d_parent)->i_mutex); + inode_unlock(d_inode(dir->d_parent)); err = jerror ?: err; } } @@ -384,7 +384,7 @@ static struct dentry *xattr_lookup(struct inode *inode, const char *name, if (IS_ERR(xadir)) return ERR_CAST(xadir); - mutex_lock_nested(&d_inode(xadir)->i_mutex, I_MUTEX_XATTR); + inode_lock_nested(d_inode(xadir), I_MUTEX_XATTR); xafile = lookup_one_len(name, xadir, strlen(name)); if (IS_ERR(xafile)) { err = PTR_ERR(xafile); @@ -404,7 +404,7 @@ static struct dentry *xattr_lookup(struct inode *inode, const char *name, if (err) dput(xafile); out: - mutex_unlock(&d_inode(xadir)->i_mutex); + inode_unlock(d_inode(xadir)); dput(xadir); if (err) return ERR_PTR(err); @@ -469,7 +469,7 @@ static int lookup_and_delete_xattr(struct inode *inode, const char *name) if (IS_ERR(xadir)) return PTR_ERR(xadir); - mutex_lock_nested(&d_inode(xadir)->i_mutex, I_MUTEX_XATTR); + inode_lock_nested(d_inode(xadir), I_MUTEX_XATTR); dentry = lookup_one_len(name, xadir, strlen(name)); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); @@ -483,7 +483,7 @@ static int lookup_and_delete_xattr(struct inode *inode, const char *name) dput(dentry); out_dput: - mutex_unlock(&d_inode(xadir)->i_mutex); + inode_unlock(d_inode(xadir)); dput(xadir); return err; } @@ -580,11 +580,11 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, .ia_valid = ATTR_SIZE | ATTR_CTIME, }; - mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_XATTR); + inode_lock_nested(d_inode(dentry), I_MUTEX_XATTR); inode_dio_wait(d_inode(dentry)); err = reiserfs_setattr(dentry, &newattrs); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); } else update_ctime(inode); out_unlock: @@ -756,7 +756,8 @@ find_xattr_handler_prefix(const struct xattr_handler **handlers, return NULL; for_each_xattr_handler(handlers, xah) { - if (strncmp(xah->prefix, name, strlen(xah->prefix)) == 0) + const char *prefix = xattr_prefix(xah); + if (strncmp(prefix, name, strlen(prefix)) == 0) break; } @@ -839,19 +840,16 @@ static int listxattr_filler(struct dir_context *ctx, const char *name, handler = find_xattr_handler_prefix(b->dentry->d_sb->s_xattr, name); - if (!handler) /* Unsupported xattr name */ + if (!handler /* Unsupported xattr name */ || + (handler->list && !handler->list(b->dentry))) return 0; + size = namelen + 1; if (b->buf) { - size = handler->list(handler, b->dentry, - b->buf + b->pos, b->size, name, - namelen); if (size > b->size) return -ERANGE; - } else { - size = handler->list(handler, b->dentry, - NULL, 0, name, namelen); + memcpy(b->buf + b->pos, name, namelen); + b->buf[b->pos + namelen] = 0; } - b->pos += size; } return 0; @@ -890,9 +888,9 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size) goto out; } - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_XATTR); + inode_lock_nested(d_inode(dir), I_MUTEX_XATTR); err = reiserfs_readdir_inode(d_inode(dir), &buf.ctx); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); if (!err) err = buf.pos; @@ -907,7 +905,7 @@ static int create_privroot(struct dentry *dentry) int err; struct inode *inode = d_inode(dentry->d_parent); - WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex)); + WARN_ON_ONCE(!inode_is_locked(inode)); err = xattr_mkdir(inode, dentry, 0700); if (err || d_really_is_negative(dentry)) { @@ -997,7 +995,7 @@ int reiserfs_lookup_privroot(struct super_block *s) int err = 0; /* If we don't have the privroot located yet - go find it */ - mutex_lock(&d_inode(s->s_root)->i_mutex); + inode_lock(d_inode(s->s_root)); dentry = lookup_one_len(PRIVROOT_NAME, s->s_root, strlen(PRIVROOT_NAME)); if (!IS_ERR(dentry)) { @@ -1007,7 +1005,7 @@ int reiserfs_lookup_privroot(struct super_block *s) d_inode(dentry)->i_flags |= S_PRIVATE; } else err = PTR_ERR(dentry); - mutex_unlock(&d_inode(s->s_root)->i_mutex); + inode_unlock(d_inode(s->s_root)); return err; } @@ -1027,14 +1025,14 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) goto error; if (d_really_is_negative(privroot) && !(mount_flags & MS_RDONLY)) { - mutex_lock(&d_inode(s->s_root)->i_mutex); + inode_lock(d_inode(s->s_root)); err = create_privroot(REISERFS_SB(s)->priv_root); - mutex_unlock(&d_inode(s->s_root)->i_mutex); + inode_unlock(d_inode(s->s_root)); } if (d_really_is_positive(privroot)) { s->s_xattr = reiserfs_xattr_handlers; - mutex_lock(&d_inode(privroot)->i_mutex); + inode_lock(d_inode(privroot)); if (!REISERFS_SB(s)->xattr_root) { struct dentry *dentry; @@ -1045,7 +1043,7 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) else err = PTR_ERR(dentry); } - mutex_unlock(&d_inode(privroot)->i_mutex); + inode_unlock(d_inode(privroot)); } error: diff --git a/fs/reiserfs/xattr_acl.c b/fs/reiserfs/xattr_acl.c index 4b34b9dc0..558a16bea 100644 --- a/fs/reiserfs/xattr_acl.c +++ b/fs/reiserfs/xattr_acl.c @@ -186,10 +186,10 @@ struct posix_acl *reiserfs_get_acl(struct inode *inode, int type) switch (type) { case ACL_TYPE_ACCESS: - name = POSIX_ACL_XATTR_ACCESS; + name = XATTR_NAME_POSIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: - name = POSIX_ACL_XATTR_DEFAULT; + name = XATTR_NAME_POSIX_ACL_DEFAULT; break; default: BUG(); @@ -244,7 +244,7 @@ __reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, switch (type) { case ACL_TYPE_ACCESS: - name = POSIX_ACL_XATTR_ACCESS; + name = XATTR_NAME_POSIX_ACL_ACCESS; if (acl) { error = posix_acl_equiv_mode(acl, &inode->i_mode); if (error < 0) @@ -256,7 +256,7 @@ __reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode, } break; case ACL_TYPE_DEFAULT: - name = POSIX_ACL_XATTR_DEFAULT; + name = XATTR_NAME_POSIX_ACL_DEFAULT; if (!S_ISDIR(inode->i_mode)) return acl ? -EACCES : 0; break; diff --git a/fs/reiserfs/xattr_security.c b/fs/reiserfs/xattr_security.c index ac659af43..ab0217d32 100644 --- a/fs/reiserfs/xattr_security.c +++ b/fs/reiserfs/xattr_security.c @@ -34,21 +34,9 @@ security_set(const struct xattr_handler *handler, struct dentry *dentry, return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags); } -static size_t security_list(const struct xattr_handler *handler, - struct dentry *dentry, char *list, size_t list_len, - const char *name, size_t namelen) +static bool security_list(struct dentry *dentry) { - const size_t len = namelen + 1; - - if (IS_PRIVATE(d_inode(dentry))) - return 0; - - if (list && len <= list_len) { - memcpy(list, name, namelen); - list[namelen] = '\0'; - } - - return len; + return !IS_PRIVATE(d_inode(dentry)); } /* Initializes the security context for a new inode and returns the number diff --git a/fs/reiserfs/xattr_trusted.c b/fs/reiserfs/xattr_trusted.c index a338adf1b..64b67aa64 100644 --- a/fs/reiserfs/xattr_trusted.c +++ b/fs/reiserfs/xattr_trusted.c @@ -33,20 +33,9 @@ trusted_set(const struct xattr_handler *handler, struct dentry *dentry, return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags); } -static size_t trusted_list(const struct xattr_handler *handler, - struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len) +static bool trusted_list(struct dentry *dentry) { - const size_t len = name_len + 1; - - if (!capable(CAP_SYS_ADMIN) || IS_PRIVATE(d_inode(dentry))) - return 0; - - if (list && len <= list_size) { - memcpy(list, name, name_len); - list[name_len] = '\0'; - } - return len; + return capable(CAP_SYS_ADMIN) && !IS_PRIVATE(d_inode(dentry)); } const struct xattr_handler reiserfs_xattr_trusted_handler = { diff --git a/fs/reiserfs/xattr_user.c b/fs/reiserfs/xattr_user.c index 39c966719..12e6306f5 100644 --- a/fs/reiserfs/xattr_user.c +++ b/fs/reiserfs/xattr_user.c @@ -30,19 +30,9 @@ user_set(const struct xattr_handler *handler, struct dentry *dentry, return reiserfs_xattr_set(d_inode(dentry), name, buffer, size, flags); } -static size_t user_list(const struct xattr_handler *handler, - struct dentry *dentry, char *list, size_t list_size, - const char *name, size_t name_len) +static bool user_list(struct dentry *dentry) { - const size_t len = name_len + 1; - - if (!reiserfs_xattrs_user(dentry->d_sb)) - return 0; - if (list && len <= list_size) { - memcpy(list, name, name_len); - list[name_len] = '\0'; - } - return len; + return reiserfs_xattrs_user(dentry->d_sb); } const struct xattr_handler reiserfs_xattr_user_handler = { diff --git a/fs/romfs/super.c b/fs/romfs/super.c index 268733cda..6b00ca357 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -360,6 +360,7 @@ static struct inode *romfs_iget(struct super_block *sb, unsigned long pos) break; case ROMFH_SYM: i->i_op = &page_symlink_inode_operations; + inode_nohighmem(i); i->i_data.a_ops = &romfs_aops; mode |= S_IRWXUGO; break; @@ -618,8 +619,8 @@ static int __init init_romfs_fs(void) romfs_inode_cachep = kmem_cache_create("romfs_i", sizeof(struct romfs_inode_info), 0, - SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, - romfs_i_init_once); + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | + SLAB_ACCOUNT, romfs_i_init_once); if (!romfs_inode_cachep) { pr_err("Failed to initialise inode cache\n"); diff --git a/fs/select.c b/fs/select.c index 015547330..79d0d4953 100644 --- a/fs/select.c +++ b/fs/select.c @@ -778,8 +778,8 @@ static inline unsigned int do_pollfd(struct pollfd *pollfd, poll_table *pwait, return mask; } -static int do_poll(unsigned int nfds, struct poll_list *list, - struct poll_wqueues *wait, struct timespec *end_time) +static int do_poll(struct poll_list *list, struct poll_wqueues *wait, + struct timespec *end_time) { poll_table* pt = &wait->pt; ktime_t expire, *to = NULL; @@ -908,7 +908,7 @@ int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, } poll_initwait(&table); - fdcount = do_poll(nfds, head, &table, end_time); + fdcount = do_poll(head, &table, end_time); poll_freewait(&table); for (walk = head; walk; walk = walk->next) { diff --git a/fs/splice.c b/fs/splice.c index b0ade1fd4..82bc0d64f 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -415,6 +415,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, */ if (!page->mapping) { unlock_page(page); +retry_lookup: page = find_or_create_page(mapping, index, mapping_gfp_mask(mapping)); @@ -439,13 +440,10 @@ __generic_file_splice_read(struct file *in, loff_t *ppos, error = mapping->a_ops->readpage(in, page); if (unlikely(error)) { /* - * We really should re-lookup the page here, - * but it complicates things a lot. Instead - * lets just do what we already stored, and - * we'll get it the next time we are called. + * Re-lookup the page */ if (error == AOP_TRUNCATED_PAGE) - error = 0; + goto retry_lookup; break; } @@ -1110,8 +1108,8 @@ EXPORT_SYMBOL(generic_splice_sendpage); /* * Attempt to initiate a splice from pipe to file. */ -long do_splice_from(struct pipe_inode_info *pipe, struct file *out, - loff_t *ppos, size_t len, unsigned int flags) +static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) { ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); @@ -1123,14 +1121,13 @@ long do_splice_from(struct pipe_inode_info *pipe, struct file *out, return splice_write(pipe, out, ppos, len, flags); } -EXPORT_SYMBOL_GPL(do_splice_from); /* * Attempt to initiate a splice from a file to a pipe. */ -long do_splice_to(struct file *in, loff_t *ppos, - struct pipe_inode_info *pipe, size_t len, - unsigned int flags) +static long do_splice_to(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); @@ -1150,7 +1147,6 @@ long do_splice_to(struct file *in, loff_t *ppos, return splice_read(in, ppos, pipe, len, flags); } -EXPORT_SYMBOL_GPL(do_splice_to); /** * splice_direct_to_actor - splices data directly between two non-pipes diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c index a1ce5ce60..0927b1e80 100644 --- a/fs/squashfs/inode.c +++ b/fs/squashfs/inode.c @@ -41,6 +41,7 @@ #include <linux/fs.h> #include <linux/vfs.h> #include <linux/xattr.h> +#include <linux/pagemap.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" @@ -291,6 +292,7 @@ int squashfs_read_inode(struct inode *inode, long long ino) set_nlink(inode, le32_to_cpu(sqsh_ino->nlink)); inode->i_size = le32_to_cpu(sqsh_ino->symlink_size); inode->i_op = &squashfs_symlink_inode_ops; + inode_nohighmem(inode); inode->i_data.a_ops = &squashfs_symlink_aops; inode->i_mode |= S_IFLNK; squashfs_i(inode)->start = block; diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 5056babe0..5e79bfa4f 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -80,7 +80,6 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) { struct squashfs_sb_info *msblk; struct squashfs_super_block *sblk = NULL; - char b[BDEVNAME_SIZE]; struct inode *root; long long root_inode; unsigned short flags; @@ -124,8 +123,8 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = le32_to_cpu(sblk->s_magic); if (sb->s_magic != SQUASHFS_MAGIC) { if (!silent) - ERROR("Can't find a SQUASHFS superblock on %s\n", - bdevname(sb->s_bdev, b)); + ERROR("Can't find a SQUASHFS superblock on %pg\n", + sb->s_bdev); goto failed_mount; } @@ -178,7 +177,7 @@ static int squashfs_fill_super(struct super_block *sb, void *data, int silent) msblk->inodes = le32_to_cpu(sblk->inodes); flags = le16_to_cpu(sblk->flags); - TRACE("Found valid superblock on %s\n", bdevname(sb->s_bdev, b)); + TRACE("Found valid superblock on %pg\n", sb->s_bdev); TRACE("Inodes are %scompressed\n", SQUASHFS_UNCOMPRESSED_INODES(flags) ? "un" : ""); TRACE("Data is %scompressed\n", SQUASHFS_UNCOMPRESSED_DATA(flags) @@ -420,7 +419,8 @@ static int __init init_inodecache(void) { squashfs_inode_cachep = kmem_cache_create("squashfs_inode_cache", sizeof(struct squashfs_inode_info), 0, - SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, init_once); + SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, + init_once); return squashfs_inode_cachep ? 0 : -ENOMEM; } diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c index 12806dffb..dbcc2f54b 100644 --- a/fs/squashfs/symlink.c +++ b/fs/squashfs/symlink.c @@ -119,8 +119,7 @@ const struct address_space_operations squashfs_symlink_aops = { const struct inode_operations squashfs_symlink_inode_ops = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, + .get_link = page_get_link, .getxattr = generic_getxattr, .listxattr = squashfs_listxattr }; diff --git a/fs/squashfs/xattr.c b/fs/squashfs/xattr.c index 6a4cc3440..1e9de9628 100644 --- a/fs/squashfs/xattr.c +++ b/fs/squashfs/xattr.c @@ -58,7 +58,7 @@ ssize_t squashfs_listxattr(struct dentry *d, char *buffer, struct squashfs_xattr_entry entry; struct squashfs_xattr_val val; const struct xattr_handler *handler; - int name_size, prefix_size = 0; + int name_size; err = squashfs_read_metadata(sb, &entry, &start, &offset, sizeof(entry)); @@ -67,15 +67,16 @@ ssize_t squashfs_listxattr(struct dentry *d, char *buffer, name_size = le16_to_cpu(entry.size); handler = squashfs_xattr_handler(le16_to_cpu(entry.type)); - if (handler) - prefix_size = handler->list(handler, d, buffer, rest, - NULL, name_size); - if (prefix_size) { + if (handler && (!handler->list || handler->list(d))) { + const char *prefix = handler->prefix ?: handler->name; + size_t prefix_size = strlen(prefix); + if (buffer) { if (prefix_size + name_size + 1 > rest) { err = -ERANGE; goto failed; } + memcpy(buffer, prefix, prefix_size); buffer += prefix_size; } err = squashfs_read_metadata(sb, buffer, &start, @@ -212,25 +213,10 @@ failed: } -static size_t squashfs_xattr_handler_list(const struct xattr_handler *handler, - struct dentry *d, char *list, - size_t list_size, const char *name, - size_t name_len) -{ - int len = strlen(handler->prefix); - - if (list && len <= list_size) - memcpy(list, handler->prefix, len); - return len; -} - static int squashfs_xattr_handler_get(const struct xattr_handler *handler, struct dentry *d, const char *name, void *buffer, size_t size) { - if (name[0] == '\0') - return -EINVAL; - return squashfs_xattr_get(d_inode(d), handler->flags, name, buffer, size); } @@ -241,22 +227,15 @@ static int squashfs_xattr_handler_get(const struct xattr_handler *handler, static const struct xattr_handler squashfs_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .flags = SQUASHFS_XATTR_USER, - .list = squashfs_xattr_handler_list, .get = squashfs_xattr_handler_get }; /* * Trusted namespace support */ -static size_t squashfs_trusted_xattr_handler_list(const struct xattr_handler *handler, - struct dentry *d, char *list, - size_t list_size, const char *name, - size_t name_len) +static bool squashfs_trusted_xattr_handler_list(struct dentry *d) { - if (!capable(CAP_SYS_ADMIN)) - return 0; - return squashfs_xattr_handler_list(handler, d, list, list_size, name, - name_len); + return capable(CAP_SYS_ADMIN); } static const struct xattr_handler squashfs_xattr_trusted_handler = { @@ -272,7 +251,6 @@ static const struct xattr_handler squashfs_xattr_trusted_handler = { static const struct xattr_handler squashfs_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .flags = SQUASHFS_XATTR_SECURITY, - .list = squashfs_xattr_handler_list, .get = squashfs_xattr_handler_get }; @@ -219,7 +219,7 @@ SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, stat # define choose_32_64(a,b) b #endif -#define valid_dev(x) choose_32_64(old_valid_dev,new_valid_dev)(x) +#define valid_dev(x) choose_32_64(old_valid_dev(x),true) #define encode_dev(x) choose_32_64(old_encode_dev,new_encode_dev)(x) #ifndef INIT_STRUCT_STAT_PADDING diff --git a/fs/super.c b/fs/super.c index 6746d6087..74914b1ba 100644 --- a/fs/super.c +++ b/fs/super.c @@ -36,7 +36,7 @@ #include "internal.h" -LIST_HEAD(super_blocks); +static LIST_HEAD(super_blocks); static DEFINE_SPINLOCK(sb_lock); static char *sb_writers_name[SB_FREEZE_LEVELS] = { @@ -1013,10 +1013,8 @@ struct dentry *mount_bdev(struct file_system_type *fs_type, blkdev_put(bdev, mode); down_write(&s->s_umount); } else { - char b[BDEVNAME_SIZE]; - s->s_mode = mode; - strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); + snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); sb_set_blocksize(s, block_size(bdev)); error = fill_super(s, data, flags & MS_SILENT ? 1 : 0); if (error) { @@ -1200,7 +1198,7 @@ int __sb_start_write(struct super_block *sb, int level, bool wait) else ret = percpu_down_read_trylock(sb->s_writers.rw_sem + level-1); - WARN_ON(force_trylock & !ret); + WARN_ON(force_trylock && !ret); return ret; } EXPORT_SYMBOL(__sb_start_write); diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index 02fa1dcc5..d62c423a5 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -146,8 +146,7 @@ static inline void write3byte(struct sysv_sb_info *sbi, static const struct inode_operations sysv_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, + .get_link = page_get_link, .getattr = sysv_getattr, }; @@ -163,6 +162,7 @@ void sysv_set_inode(struct inode *inode, dev_t rdev) inode->i_mapping->a_ops = &sysv_aops; } else if (S_ISLNK(inode->i_mode)) { inode->i_op = &sysv_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &sysv_aops; } else init_special_inode(inode, inode->i_mode, rdev); @@ -346,7 +346,7 @@ int __init sysv_init_icache(void) { sysv_inode_cachep = kmem_cache_create("sysv_inode_cache", sizeof(struct sysv_inode_info), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, + SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT, init_once); if (!sysv_inode_cachep) return -ENOMEM; diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index c66f2423e..4a0e48f92 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -84,9 +84,9 @@ static int tracefs_syscall_mkdir(struct inode *inode, struct dentry *dentry, umo * the files within the tracefs system. It is up to the individual * mkdir routine to handle races. */ - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); ret = tracefs_ops.mkdir(name); - mutex_lock(&inode->i_mutex); + inode_lock(inode); kfree(name); @@ -109,13 +109,13 @@ static int tracefs_syscall_rmdir(struct inode *inode, struct dentry *dentry) * This time we need to unlock not only the parent (inode) but * also the directory that is being deleted. */ - mutex_unlock(&inode->i_mutex); - mutex_unlock(&dentry->d_inode->i_mutex); + inode_unlock(inode); + inode_unlock(dentry->d_inode); ret = tracefs_ops.rmdir(name); - mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); - mutex_lock(&dentry->d_inode->i_mutex); + inode_lock_nested(inode, I_MUTEX_PARENT); + inode_lock(dentry->d_inode); kfree(name); @@ -334,7 +334,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) if (!parent) parent = tracefs_mount->mnt_root; - mutex_lock(&parent->d_inode->i_mutex); + inode_lock(parent->d_inode); dentry = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(dentry) && dentry->d_inode) { dput(dentry); @@ -342,7 +342,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) } if (IS_ERR(dentry)) { - mutex_unlock(&parent->d_inode->i_mutex); + inode_unlock(parent->d_inode); simple_release_fs(&tracefs_mount, &tracefs_mount_count); } @@ -351,7 +351,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) static struct dentry *failed_creating(struct dentry *dentry) { - mutex_unlock(&dentry->d_parent->d_inode->i_mutex); + inode_unlock(dentry->d_parent->d_inode); dput(dentry); simple_release_fs(&tracefs_mount, &tracefs_mount_count); return NULL; @@ -359,7 +359,7 @@ static struct dentry *failed_creating(struct dentry *dentry) static struct dentry *end_creating(struct dentry *dentry) { - mutex_unlock(&dentry->d_parent->d_inode->i_mutex); + inode_unlock(dentry->d_parent->d_inode); return dentry; } @@ -544,9 +544,9 @@ void tracefs_remove(struct dentry *dentry) if (!parent || !parent->d_inode) return; - mutex_lock(&parent->d_inode->i_mutex); + inode_lock(parent->d_inode); ret = __tracefs_remove(dentry, parent); - mutex_unlock(&parent->d_inode->i_mutex); + inode_unlock(parent->d_inode); if (!ret) simple_release_fs(&tracefs_mount, &tracefs_mount_count); } @@ -572,7 +572,7 @@ void tracefs_remove_recursive(struct dentry *dentry) parent = dentry; down: - mutex_lock(&parent->d_inode->i_mutex); + inode_lock(parent->d_inode); loop: /* * The parent->d_subdirs is protected by the d_lock. Outside that @@ -587,7 +587,7 @@ void tracefs_remove_recursive(struct dentry *dentry) /* perhaps simple_empty(child) makes more sense */ if (!list_empty(&child->d_subdirs)) { spin_unlock(&parent->d_lock); - mutex_unlock(&parent->d_inode->i_mutex); + inode_unlock(parent->d_inode); parent = child; goto down; } @@ -608,10 +608,10 @@ void tracefs_remove_recursive(struct dentry *dentry) } spin_unlock(&parent->d_lock); - mutex_unlock(&parent->d_inode->i_mutex); + inode_unlock(parent->d_inode); child = parent; parent = parent->d_parent; - mutex_lock(&parent->d_inode->i_mutex); + inode_lock(parent->d_inode); if (child != dentry) /* go up */ @@ -619,7 +619,7 @@ void tracefs_remove_recursive(struct dentry *dentry) if (!__tracefs_remove(child, parent)) simple_release_fs(&tracefs_mount, &tracefs_mount_count); - mutex_unlock(&parent->d_inode->i_mutex); + inode_unlock(parent->d_inode); } /** diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index e49bd2808..795992a83 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -515,8 +515,8 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, dbg_gen("dent '%pd' to ino %lu (nlink %d) in dir ino %lu", dentry, inode->i_ino, inode->i_nlink, dir->i_ino); - ubifs_assert(mutex_is_locked(&dir->i_mutex)); - ubifs_assert(mutex_is_locked(&inode->i_mutex)); + ubifs_assert(inode_is_locked(dir)); + ubifs_assert(inode_is_locked(inode)); err = dbg_check_synced_i_size(c, inode); if (err) @@ -572,8 +572,8 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) dbg_gen("dent '%pd' from ino %lu (nlink %d) in dir ino %lu", dentry, inode->i_ino, inode->i_nlink, dir->i_ino); - ubifs_assert(mutex_is_locked(&dir->i_mutex)); - ubifs_assert(mutex_is_locked(&inode->i_mutex)); + ubifs_assert(inode_is_locked(dir)); + ubifs_assert(inode_is_locked(inode)); err = dbg_check_synced_i_size(c, inode); if (err) return err; @@ -661,8 +661,8 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) dbg_gen("directory '%pd', ino %lu in dir ino %lu", dentry, inode->i_ino, dir->i_ino); - ubifs_assert(mutex_is_locked(&dir->i_mutex)); - ubifs_assert(mutex_is_locked(&inode->i_mutex)); + ubifs_assert(inode_is_locked(dir)); + ubifs_assert(inode_is_locked(inode)); err = check_dir_empty(c, d_inode(dentry)); if (err) return err; @@ -996,10 +996,10 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, dbg_gen("dent '%pd' ino %lu in dir ino %lu to dent '%pd' in dir ino %lu", old_dentry, old_inode->i_ino, old_dir->i_ino, new_dentry, new_dir->i_ino); - ubifs_assert(mutex_is_locked(&old_dir->i_mutex)); - ubifs_assert(mutex_is_locked(&new_dir->i_mutex)); + ubifs_assert(inode_is_locked(old_dir)); + ubifs_assert(inode_is_locked(new_dir)); if (unlink) - ubifs_assert(mutex_is_locked(&new_inode->i_mutex)); + ubifs_assert(inode_is_locked(new_inode)); if (unlink && is_dir) { diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 0edc12856..065c88f8e 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1317,7 +1317,7 @@ int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) err = filemap_write_and_wait_range(inode->i_mapping, start, end); if (err) return err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* Synchronize the inode unless this is a 'datasync()' call. */ if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) { @@ -1332,7 +1332,7 @@ int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) */ err = ubifs_sync_wbufs_by_inode(c, inode); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } @@ -1608,7 +1608,7 @@ const struct inode_operations ubifs_file_inode_operations = { const struct inode_operations ubifs_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = simple_follow_link, + .get_link = simple_get_link, .setattr = ubifs_setattr, .getattr = ubifs_getattr, .setxattr = ubifs_setxattr, diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h index 92a8491a8..c0a95e393 100644 --- a/fs/ubifs/key.h +++ b/fs/ubifs/key.h @@ -34,6 +34,12 @@ * node. We use "r5" hash borrowed from reiserfs. */ +/* + * Lot's of the key helpers require a struct ubifs_info *c as the first parameter. + * But we are not using it at all currently. That's designed for future extensions of + * different c->key_format. But right now, there is only one key type, UBIFS_SIMPLE_KEY_FMT. + */ + #ifndef __UBIFS_KEY_H__ #define __UBIFS_KEY_H__ diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 1fd90c079..a233ba913 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -2248,8 +2248,8 @@ static int __init ubifs_init(void) ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab", sizeof(struct ubifs_inode), 0, - SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT, - &inode_slab_ctor); + SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT | + SLAB_ACCOUNT, &inode_slab_ctor); if (!ubifs_inode_slab) return -ENOMEM; diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index e8b01b721..c7f4d434d 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -267,7 +267,7 @@ static int check_namespace(const struct qstr *nm) if (!strncmp(nm->name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) { - if (nm->name[sizeof(XATTR_TRUSTED_PREFIX) - 1] == '\0') + if (nm->name[XATTR_TRUSTED_PREFIX_LEN] == '\0') return -EINVAL; type = TRUSTED_XATTR; } else if (!strncmp(nm->name, XATTR_USER_PREFIX, @@ -277,7 +277,7 @@ static int check_namespace(const struct qstr *nm) type = USER_XATTR; } else if (!strncmp(nm->name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) { - if (nm->name[sizeof(XATTR_SECURITY_PREFIX) - 1] == '\0') + if (nm->name[XATTR_SECURITY_PREFIX_LEN] == '\0') return -EINVAL; type = SECURITY_XATTR; } else @@ -313,7 +313,7 @@ static int setxattr(struct inode *host, const char *name, const void *value, union ubifs_key key; int err, type; - ubifs_assert(mutex_is_locked(&host->i_mutex)); + ubifs_assert(inode_is_locked(host)); if (size > UBIFS_MAX_INO_DATA) return -ERANGE; @@ -550,7 +550,7 @@ int ubifs_removexattr(struct dentry *dentry, const char *name) dbg_gen("xattr '%s', ino %lu ('%pd')", name, host->i_ino, dentry); - ubifs_assert(mutex_is_locked(&host->i_mutex)); + ubifs_assert(inode_is_locked(host)); err = check_namespace(&nm); if (err < 0) diff --git a/fs/udf/balloc.c b/fs/udf/balloc.c index 6d6a96b4e..e0fd65fe7 100644 --- a/fs/udf/balloc.c +++ b/fs/udf/balloc.c @@ -447,9 +447,6 @@ static void udf_table_free_blocks(struct super_block *sb, */ int adsize; - struct short_ad *sad = NULL; - struct long_ad *lad = NULL; - struct allocExtDesc *aed; eloc.logicalBlockNum = start; elen = EXT_RECORDED_ALLOCATED | @@ -466,102 +463,17 @@ static void udf_table_free_blocks(struct super_block *sb, } if (epos.offset + (2 * adsize) > sb->s_blocksize) { - unsigned char *sptr, *dptr; - int loffset; - - brelse(oepos.bh); - oepos = epos; - /* Steal a block from the extent being free'd */ - epos.block.logicalBlockNum = eloc.logicalBlockNum; + udf_setup_indirect_aext(table, eloc.logicalBlockNum, + &epos); + eloc.logicalBlockNum++; elen -= sb->s_blocksize; - - epos.bh = udf_tread(sb, - udf_get_lb_pblock(sb, &epos.block, 0)); - if (!epos.bh) { - brelse(oepos.bh); - goto error_return; - } - aed = (struct allocExtDesc *)(epos.bh->b_data); - aed->previousAllocExtLocation = - cpu_to_le32(oepos.block.logicalBlockNum); - if (epos.offset + adsize > sb->s_blocksize) { - loffset = epos.offset; - aed->lengthAllocDescs = cpu_to_le32(adsize); - sptr = iinfo->i_ext.i_data + epos.offset - - adsize; - dptr = epos.bh->b_data + - sizeof(struct allocExtDesc); - memcpy(dptr, sptr, adsize); - epos.offset = sizeof(struct allocExtDesc) + - adsize; - } else { - loffset = epos.offset + adsize; - aed->lengthAllocDescs = cpu_to_le32(0); - if (oepos.bh) { - sptr = oepos.bh->b_data + epos.offset; - aed = (struct allocExtDesc *) - oepos.bh->b_data; - le32_add_cpu(&aed->lengthAllocDescs, - adsize); - } else { - sptr = iinfo->i_ext.i_data + - epos.offset; - iinfo->i_lenAlloc += adsize; - mark_inode_dirty(table); - } - epos.offset = sizeof(struct allocExtDesc); - } - if (sbi->s_udfrev >= 0x0200) - udf_new_tag(epos.bh->b_data, TAG_IDENT_AED, - 3, 1, epos.block.logicalBlockNum, - sizeof(struct tag)); - else - udf_new_tag(epos.bh->b_data, TAG_IDENT_AED, - 2, 1, epos.block.logicalBlockNum, - sizeof(struct tag)); - - switch (iinfo->i_alloc_type) { - case ICBTAG_FLAG_AD_SHORT: - sad = (struct short_ad *)sptr; - sad->extLength = cpu_to_le32( - EXT_NEXT_EXTENT_ALLOCDECS | - sb->s_blocksize); - sad->extPosition = - cpu_to_le32(epos.block.logicalBlockNum); - break; - case ICBTAG_FLAG_AD_LONG: - lad = (struct long_ad *)sptr; - lad->extLength = cpu_to_le32( - EXT_NEXT_EXTENT_ALLOCDECS | - sb->s_blocksize); - lad->extLocation = - cpu_to_lelb(epos.block); - break; - } - if (oepos.bh) { - udf_update_tag(oepos.bh->b_data, loffset); - mark_buffer_dirty(oepos.bh); - } else { - mark_inode_dirty(table); - } } /* It's possible that stealing the block emptied the extent */ - if (elen) { - udf_write_aext(table, &epos, &eloc, elen, 1); - - if (!epos.bh) { - iinfo->i_lenAlloc += adsize; - mark_inode_dirty(table); - } else { - aed = (struct allocExtDesc *)epos.bh->b_data; - le32_add_cpu(&aed->lengthAllocDescs, adsize); - udf_update_tag(epos.bh->b_data, epos.offset); - mark_buffer_dirty(epos.bh); - } - } + if (elen) + __udf_add_aext(table, &epos, &eloc, elen, 1); } brelse(epos.bh); diff --git a/fs/udf/file.c b/fs/udf/file.c index bddf3d071..1af98963d 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -122,7 +122,7 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct udf_inode_info *iinfo = UDF_I(inode); int err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); retval = generic_write_checks(iocb, from); if (retval <= 0) @@ -136,7 +136,7 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from) (udf_file_entry_alloc_offset(inode) + end)) { err = udf_expand_file_adinicb(inode); if (err) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); udf_debug("udf_expand_adinicb: err=%d\n", err); return err; } @@ -149,7 +149,7 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from) retval = __generic_file_write_iter(iocb, from); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (retval > 0) { mark_inode_dirty(inode); @@ -223,12 +223,12 @@ static int udf_release_file(struct inode *inode, struct file *filp) * Grab i_mutex to avoid races with writes changing i_size * while we are running. */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); down_write(&UDF_I(inode)->i_data_sem); udf_discard_prealloc(inode); udf_truncate_tail_extent(inode); up_write(&UDF_I(inode)->i_data_sem); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } return 0; } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 566df9b5a..166d3ed32 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -262,7 +262,7 @@ int udf_expand_file_adinicb(struct inode *inode) .nr_to_write = 1, }; - WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex)); + WARN_ON_ONCE(!inode_is_locked(inode)); if (!iinfo->i_lenAlloc) { if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; @@ -539,9 +539,18 @@ static int udf_do_extend_file(struct inode *inode, udf_add_aext(inode, last_pos, &last_ext->extLocation, last_ext->extLength, 1); count++; - } else + } else { + struct kernel_lb_addr tmploc; + uint32_t tmplen; + udf_write_aext(inode, last_pos, &last_ext->extLocation, last_ext->extLength, 1); + /* + * We've rewritten the last extent but there may be empty + * indirect extent after it - enter it. + */ + udf_next_aext(inode, last_pos, &tmploc, &tmplen, 0); + } /* Managed to do everything necessary? */ if (!blocks) @@ -1540,7 +1549,8 @@ reread: break; case ICBTAG_FILE_TYPE_SYMLINK: inode->i_data.a_ops = &udf_symlink_aops; - inode->i_op = &udf_symlink_inode_operations; + inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mode = S_IFLNK | S_IRWXUGO; break; case ICBTAG_FILE_TYPE_MAIN: @@ -1866,22 +1876,90 @@ struct inode *__udf_iget(struct super_block *sb, struct kernel_lb_addr *ino, return inode; } -int udf_add_aext(struct inode *inode, struct extent_position *epos, - struct kernel_lb_addr *eloc, uint32_t elen, int inc) +int udf_setup_indirect_aext(struct inode *inode, int block, + struct extent_position *epos) { - int adsize; - struct short_ad *sad = NULL; - struct long_ad *lad = NULL; + struct super_block *sb = inode->i_sb; + struct buffer_head *bh; struct allocExtDesc *aed; - uint8_t *ptr; - struct udf_inode_info *iinfo = UDF_I(inode); + struct extent_position nepos; + struct kernel_lb_addr neloc; + int ver, adsize; - if (!epos->bh) - ptr = iinfo->i_ext.i_data + epos->offset - - udf_file_entry_alloc_offset(inode) + - iinfo->i_lenEAttr; + if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + adsize = sizeof(struct short_ad); + else if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_LONG) + adsize = sizeof(struct long_ad); else - ptr = epos->bh->b_data + epos->offset; + return -EIO; + + neloc.logicalBlockNum = block; + neloc.partitionReferenceNum = epos->block.partitionReferenceNum; + + bh = udf_tgetblk(sb, udf_get_lb_pblock(sb, &neloc, 0)); + if (!bh) + return -EIO; + lock_buffer(bh); + memset(bh->b_data, 0x00, sb->s_blocksize); + set_buffer_uptodate(bh); + unlock_buffer(bh); + mark_buffer_dirty_inode(bh, inode); + + aed = (struct allocExtDesc *)(bh->b_data); + if (!UDF_QUERY_FLAG(sb, UDF_FLAG_STRICT)) { + aed->previousAllocExtLocation = + cpu_to_le32(epos->block.logicalBlockNum); + } + aed->lengthAllocDescs = cpu_to_le32(0); + if (UDF_SB(sb)->s_udfrev >= 0x0200) + ver = 3; + else + ver = 2; + udf_new_tag(bh->b_data, TAG_IDENT_AED, ver, 1, block, + sizeof(struct tag)); + + nepos.block = neloc; + nepos.offset = sizeof(struct allocExtDesc); + nepos.bh = bh; + + /* + * Do we have to copy current last extent to make space for indirect + * one? + */ + if (epos->offset + adsize > sb->s_blocksize) { + struct kernel_lb_addr cp_loc; + uint32_t cp_len; + int cp_type; + + epos->offset -= adsize; + cp_type = udf_current_aext(inode, epos, &cp_loc, &cp_len, 0); + cp_len |= ((uint32_t)cp_type) << 30; + + __udf_add_aext(inode, &nepos, &cp_loc, cp_len, 1); + udf_write_aext(inode, epos, &nepos.block, + sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDECS, 0); + } else { + __udf_add_aext(inode, epos, &nepos.block, + sb->s_blocksize | EXT_NEXT_EXTENT_ALLOCDECS, 0); + } + + brelse(epos->bh); + *epos = nepos; + + return 0; +} + +/* + * Append extent at the given position - should be the first free one in inode + * / indirect extent. This function assumes there is enough space in the inode + * or indirect extent. Use udf_add_aext() if you didn't check for this before. + */ +int __udf_add_aext(struct inode *inode, struct extent_position *epos, + struct kernel_lb_addr *eloc, uint32_t elen, int inc) +{ + struct udf_inode_info *iinfo = UDF_I(inode); + struct allocExtDesc *aed; + int adsize; if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_SHORT) adsize = sizeof(struct short_ad); @@ -1890,88 +1968,14 @@ int udf_add_aext(struct inode *inode, struct extent_position *epos, else return -EIO; - if (epos->offset + (2 * adsize) > inode->i_sb->s_blocksize) { - unsigned char *sptr, *dptr; - struct buffer_head *nbh; - int err, loffset; - struct kernel_lb_addr obloc = epos->block; - - epos->block.logicalBlockNum = udf_new_block(inode->i_sb, NULL, - obloc.partitionReferenceNum, - obloc.logicalBlockNum, &err); - if (!epos->block.logicalBlockNum) - return -ENOSPC; - nbh = udf_tgetblk(inode->i_sb, udf_get_lb_pblock(inode->i_sb, - &epos->block, - 0)); - if (!nbh) - return -EIO; - lock_buffer(nbh); - memset(nbh->b_data, 0x00, inode->i_sb->s_blocksize); - set_buffer_uptodate(nbh); - unlock_buffer(nbh); - mark_buffer_dirty_inode(nbh, inode); - - aed = (struct allocExtDesc *)(nbh->b_data); - if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT)) - aed->previousAllocExtLocation = - cpu_to_le32(obloc.logicalBlockNum); - if (epos->offset + adsize > inode->i_sb->s_blocksize) { - loffset = epos->offset; - aed->lengthAllocDescs = cpu_to_le32(adsize); - sptr = ptr - adsize; - dptr = nbh->b_data + sizeof(struct allocExtDesc); - memcpy(dptr, sptr, adsize); - epos->offset = sizeof(struct allocExtDesc) + adsize; - } else { - loffset = epos->offset + adsize; - aed->lengthAllocDescs = cpu_to_le32(0); - sptr = ptr; - epos->offset = sizeof(struct allocExtDesc); - - if (epos->bh) { - aed = (struct allocExtDesc *)epos->bh->b_data; - le32_add_cpu(&aed->lengthAllocDescs, adsize); - } else { - iinfo->i_lenAlloc += adsize; - mark_inode_dirty(inode); - } - } - if (UDF_SB(inode->i_sb)->s_udfrev >= 0x0200) - udf_new_tag(nbh->b_data, TAG_IDENT_AED, 3, 1, - epos->block.logicalBlockNum, sizeof(struct tag)); - else - udf_new_tag(nbh->b_data, TAG_IDENT_AED, 2, 1, - epos->block.logicalBlockNum, sizeof(struct tag)); - switch (iinfo->i_alloc_type) { - case ICBTAG_FLAG_AD_SHORT: - sad = (struct short_ad *)sptr; - sad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS | - inode->i_sb->s_blocksize); - sad->extPosition = - cpu_to_le32(epos->block.logicalBlockNum); - break; - case ICBTAG_FLAG_AD_LONG: - lad = (struct long_ad *)sptr; - lad->extLength = cpu_to_le32(EXT_NEXT_EXTENT_ALLOCDECS | - inode->i_sb->s_blocksize); - lad->extLocation = cpu_to_lelb(epos->block); - memset(lad->impUse, 0x00, sizeof(lad->impUse)); - break; - } - if (epos->bh) { - if (!UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_STRICT) || - UDF_SB(inode->i_sb)->s_udfrev >= 0x0201) - udf_update_tag(epos->bh->b_data, loffset); - else - udf_update_tag(epos->bh->b_data, - sizeof(struct allocExtDesc)); - mark_buffer_dirty_inode(epos->bh, inode); - brelse(epos->bh); - } else { - mark_inode_dirty(inode); - } - epos->bh = nbh; + if (!epos->bh) { + WARN_ON(iinfo->i_lenAlloc != + epos->offset - udf_file_entry_alloc_offset(inode)); + } else { + aed = (struct allocExtDesc *)epos->bh->b_data; + WARN_ON(le32_to_cpu(aed->lengthAllocDescs) != + epos->offset - sizeof(struct allocExtDesc)); + WARN_ON(epos->offset + adsize > inode->i_sb->s_blocksize); } udf_write_aext(inode, epos, eloc, elen, inc); @@ -1995,6 +1999,41 @@ int udf_add_aext(struct inode *inode, struct extent_position *epos, return 0; } +/* + * Append extent at given position - should be the first free one in inode + * / indirect extent. Takes care of allocating and linking indirect blocks. + */ +int udf_add_aext(struct inode *inode, struct extent_position *epos, + struct kernel_lb_addr *eloc, uint32_t elen, int inc) +{ + int adsize; + struct super_block *sb = inode->i_sb; + + if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_SHORT) + adsize = sizeof(struct short_ad); + else if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_LONG) + adsize = sizeof(struct long_ad); + else + return -EIO; + + if (epos->offset + (2 * adsize) > sb->s_blocksize) { + int err; + int new_block; + + new_block = udf_new_block(sb, NULL, + epos->block.partitionReferenceNum, + epos->block.logicalBlockNum, &err); + if (!new_block) + return -ENOSPC; + + err = udf_setup_indirect_aext(inode, new_block, epos); + if (err) + return err; + } + + return __udf_add_aext(inode, epos, eloc, elen, inc); +} + void udf_write_aext(struct inode *inode, struct extent_position *epos, struct kernel_lb_addr *eloc, uint32_t elen, int inc) { diff --git a/fs/udf/namei.c b/fs/udf/namei.c index c97b5a8d1..42eafb91f 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -921,7 +921,8 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, } inode->i_data.a_ops = &udf_symlink_aops; - inode->i_op = &udf_symlink_inode_operations; + inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { struct kernel_lb_addr eloc; @@ -1344,8 +1345,3 @@ const struct inode_operations udf_dir_inode_operations = { .rename = udf_rename, .tmpfile = udf_tmpfile, }; -const struct inode_operations udf_symlink_inode_operations = { - .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, -}; diff --git a/fs/udf/super.c b/fs/udf/super.c index 81155b9b4..a522c15a0 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -179,7 +179,8 @@ static int __init init_inodecache(void) udf_inode_cachep = kmem_cache_create("udf_inode_cache", sizeof(struct udf_inode_info), 0, (SLAB_RECLAIM_ACCOUNT | - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD | + SLAB_ACCOUNT), init_once); if (!udf_inode_cachep) return -ENOMEM; @@ -278,17 +279,12 @@ static void udf_sb_free_bitmap(struct udf_bitmap *bitmap) { int i; int nr_groups = bitmap->s_nr_groups; - int size = sizeof(struct udf_bitmap) + (sizeof(struct buffer_head *) * - nr_groups); for (i = 0; i < nr_groups; i++) if (bitmap->s_block_bitmap[i]) brelse(bitmap->s_block_bitmap[i]); - if (size <= PAGE_SIZE) - kfree(bitmap); - else - vfree(bitmap); + kvfree(bitmap); } static void udf_free_partition(struct udf_part_map *map) @@ -1586,6 +1582,13 @@ static void udf_load_logicalvolint(struct super_block *sb, struct kernel_extent_ } /* + * Maximum number of Terminating Descriptor redirections. The chosen number is + * arbitrary - just that we hopefully don't limit any real use of rewritten + * inode on write-once media but avoid looping for too long on corrupted media. + */ +#define UDF_MAX_TD_NESTING 64 + +/* * Process a main/reserve volume descriptor sequence. * @block First block of first extent of the sequence. * @lastblock Lastblock of first extent of the sequence. @@ -1609,6 +1612,7 @@ static noinline int udf_process_sequence( uint16_t ident; long next_s = 0, next_e = 0; int ret; + unsigned int indirections = 0; memset(vds, 0, sizeof(struct udf_vds_record) * VDS_POS_LENGTH); @@ -1679,6 +1683,12 @@ static noinline int udf_process_sequence( } break; case TAG_IDENT_TD: /* ISO 13346 3/10.9 */ + if (++indirections > UDF_MAX_TD_NESTING) { + udf_err(sb, "too many TDs (max %u supported)\n", UDF_MAX_TD_NESTING); + brelse(bh); + return -EIO; + } + vds[VDS_POS_TERMINATING_DESC].block = block; if (next_e) { block = next_s; diff --git a/fs/udf/symlink.c b/fs/udf/symlink.c index 862535b3b..8d6197730 100644 --- a/fs/udf/symlink.c +++ b/fs/udf/symlink.c @@ -107,7 +107,7 @@ static int udf_symlink_filler(struct file *file, struct page *page) struct buffer_head *bh = NULL; unsigned char *symlink; int err; - unsigned char *p = kmap(page); + unsigned char *p = page_address(page); struct udf_inode_info *iinfo; uint32_t pos; @@ -141,7 +141,6 @@ static int udf_symlink_filler(struct file *file, struct page *page) up_read(&iinfo->i_data_sem); SetPageUptodate(page); - kunmap(page); unlock_page(page); return 0; @@ -149,7 +148,6 @@ out_unlock_inode: up_read(&iinfo->i_data_sem); SetPageError(page); out_unmap: - kunmap(page); unlock_page(page); return err; } diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index 47bb3f5ca..fa0044b6b 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -85,7 +85,6 @@ extern const struct inode_operations udf_dir_inode_operations; extern const struct file_operations udf_dir_operations; extern const struct inode_operations udf_file_inode_operations; extern const struct file_operations udf_file_operations; -extern const struct inode_operations udf_symlink_inode_operations; extern const struct address_space_operations udf_aops; extern const struct address_space_operations udf_adinicb_aops; extern const struct address_space_operations udf_symlink_aops; @@ -159,6 +158,10 @@ extern int udf_write_inode(struct inode *, struct writeback_control *wbc); extern long udf_block_map(struct inode *, sector_t); extern int8_t inode_bmap(struct inode *, sector_t, struct extent_position *, struct kernel_lb_addr *, uint32_t *, sector_t *); +extern int udf_setup_indirect_aext(struct inode *inode, int block, + struct extent_position *epos); +extern int __udf_add_aext(struct inode *inode, struct extent_position *epos, + struct kernel_lb_addr *eloc, uint32_t elen, int inc); extern int udf_add_aext(struct inode *, struct extent_position *, struct kernel_lb_addr *, uint32_t, int); extern void udf_write_aext(struct inode *, struct extent_position *, diff --git a/fs/ufs/Makefile b/fs/ufs/Makefile index 392db25c0..ec4a6b49f 100644 --- a/fs/ufs/Makefile +++ b/fs/ufs/Makefile @@ -5,5 +5,5 @@ obj-$(CONFIG_UFS_FS) += ufs.o ufs-objs := balloc.o cylinder.o dir.o file.o ialloc.o inode.o \ - namei.o super.o symlink.o util.o + namei.o super.o util.o ccflags-$(CONFIG_UFS_DEBUG) += -DDEBUG diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index a064cf44b..d897e169a 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -528,11 +528,12 @@ static void ufs_set_inode_ops(struct inode *inode) inode->i_mapping->a_ops = &ufs_aops; } else if (S_ISLNK(inode->i_mode)) { if (!inode->i_blocks) { - inode->i_op = &ufs_fast_symlink_inode_operations; inode->i_link = (char *)UFS_I(inode)->i_u1.i_symlink; + inode->i_op = &simple_symlink_inode_operations; } else { - inode->i_op = &ufs_symlink_inode_operations; inode->i_mapping->a_ops = &ufs_aops; + inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); } } else init_special_inode(inode, inode->i_mode, diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 479665543..acf4a3b61 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -123,14 +123,15 @@ static int ufs_symlink (struct inode * dir, struct dentry * dentry, if (l > UFS_SB(sb)->s_uspi->s_maxsymlinklen) { /* slow symlink */ - inode->i_op = &ufs_symlink_inode_operations; + inode->i_op = &page_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &ufs_aops; err = page_symlink(inode, symname, l); if (err) goto out_fail; } else { /* fast symlink */ - inode->i_op = &ufs_fast_symlink_inode_operations; + inode->i_op = &simple_symlink_inode_operations; inode->i_link = (char *)UFS_I(inode)->i_u1.i_symlink; memcpy(inode->i_link, symname, l); inode->i_size = l-1; diff --git a/fs/ufs/super.c b/fs/ufs/super.c index f6390eec0..442fd52eb 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1427,7 +1427,7 @@ static int __init init_inodecache(void) ufs_inode_cachep = kmem_cache_create("ufs_inode_cache", sizeof(struct ufs_inode_info), 0, (SLAB_RECLAIM_ACCOUNT| - SLAB_MEM_SPREAD), + SLAB_MEM_SPREAD|SLAB_ACCOUNT), init_once); if (ufs_inode_cachep == NULL) return -ENOMEM; diff --git a/fs/ufs/symlink.c b/fs/ufs/symlink.c deleted file mode 100644 index 874480bb4..000000000 --- a/fs/ufs/symlink.c +++ /dev/null @@ -1,42 +0,0 @@ -/* - * linux/fs/ufs/symlink.c - * - * Only fast symlinks left here - the rest is done by generic code. AV, 1999 - * - * Copyright (C) 1998 - * Daniel Pirkl <daniel.pirkl@emai.cz> - * Charles University, Faculty of Mathematics and Physics - * - * from - * - * linux/fs/ext2/symlink.c - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * from - * - * linux/fs/minix/symlink.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * ext2 symlink handling code - */ - -#include "ufs_fs.h" -#include "ufs.h" - -const struct inode_operations ufs_fast_symlink_inode_operations = { - .readlink = generic_readlink, - .follow_link = simple_follow_link, - .setattr = ufs_setattr, -}; - -const struct inode_operations ufs_symlink_inode_operations = { - .readlink = generic_readlink, - .follow_link = page_follow_link_light, - .put_link = page_put_link, - .setattr = ufs_setattr, -}; diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index 7da4aca86..c87f4c3fa 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -136,10 +136,6 @@ extern __printf(3, 4) void ufs_panic(struct super_block *, const char *, const char *, ...); void ufs_mark_sb_dirty(struct super_block *sb); -/* symlink.c */ -extern const struct inode_operations ufs_fast_symlink_inode_operations; -extern const struct inode_operations ufs_symlink_inode_operations; - static inline struct ufs_sb_info *UFS_SB(struct super_block *sb) { return sb->s_fs_info; diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 503117031..66cdb4461 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -287,6 +287,12 @@ int handle_userfault(struct vm_area_struct *vma, unsigned long address, goto out; /* + * We don't do userfault handling for the final child pid update. + */ + if (current->flags & PF_EXITING) + goto out; + + /* * Check that we can return VM_FAULT_RETRY. * * NOTE: it should become possible to return VM_FAULT_RETRY diff --git a/fs/utimes.c b/fs/utimes.c index aa138d645..85c40f4f3 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -103,9 +103,9 @@ static int utimes_common(struct path *path, struct timespec *times) } } retry_deleg: - mutex_lock(&inode->i_mutex); + inode_lock(inode); error = notify_change(path->dentry, &newattrs, &delegated_inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) diff --git a/fs/xattr.c b/fs/xattr.c index 0c317c4fd..4861322e2 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -129,7 +129,7 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value, if (error) return error; - mutex_lock(&inode->i_mutex); + inode_lock(inode); error = security_inode_setxattr(dentry, name, value, size, flags); if (error) goto out; @@ -137,7 +137,7 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value, error = __vfs_setxattr_noperm(dentry, name, value, size, flags); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return error; } EXPORT_SYMBOL_GPL(vfs_setxattr); @@ -207,26 +207,6 @@ vfs_getxattr_alloc(struct dentry *dentry, const char *name, char **xattr_value, *xattr_value = value; return error; } -EXPORT_SYMBOL_GPL(vfs_getxattr_alloc); - -/* Compare an extended attribute value with the given value */ -int vfs_xattr_cmp(struct dentry *dentry, const char *xattr_name, - const char *value, size_t size, gfp_t flags) -{ - char *xattr_value = NULL; - int rc; - - rc = vfs_getxattr_alloc(dentry, xattr_name, &xattr_value, 0, flags); - if (rc < 0) - return rc; - - if ((rc != size) || (memcmp(xattr_value, value, rc) != 0)) - rc = -EINVAL; - else - rc = 0; - kfree(xattr_value); - return rc; -} ssize_t vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size) @@ -297,7 +277,7 @@ vfs_removexattr(struct dentry *dentry, const char *name) if (error) return error; - mutex_lock(&inode->i_mutex); + inode_lock(inode); error = security_inode_removexattr(dentry, name); if (error) goto out; @@ -310,7 +290,7 @@ vfs_removexattr(struct dentry *dentry, const char *name) } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return error; } EXPORT_SYMBOL_GPL(vfs_removexattr); @@ -325,7 +305,6 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value, { int error; void *kvalue = NULL; - void *vvalue = NULL; /* If non-NULL, we used vmalloc() */ char kname[XATTR_NAME_MAX + 1]; if (flags & ~(XATTR_CREATE|XATTR_REPLACE)) @@ -342,10 +321,9 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value, return -E2BIG; kvalue = kmalloc(size, GFP_KERNEL | __GFP_NOWARN); if (!kvalue) { - vvalue = vmalloc(size); - if (!vvalue) + kvalue = vmalloc(size); + if (!kvalue) return -ENOMEM; - kvalue = vvalue; } if (copy_from_user(kvalue, value, size)) { error = -EFAULT; @@ -358,10 +336,8 @@ setxattr(struct dentry *d, const char __user *name, const void __user *value, error = vfs_setxattr(d, kname, kvalue, size, flags); out: - if (vvalue) - vfree(vvalue); - else - kfree(kvalue); + kvfree(kvalue); + return error; } @@ -429,7 +405,6 @@ getxattr(struct dentry *d, const char __user *name, void __user *value, { ssize_t error; void *kvalue = NULL; - void *vvalue = NULL; char kname[XATTR_NAME_MAX + 1]; error = strncpy_from_user(kname, name, sizeof(kname)); @@ -443,10 +418,9 @@ getxattr(struct dentry *d, const char __user *name, void __user *value, size = XATTR_SIZE_MAX; kvalue = kzalloc(size, GFP_KERNEL | __GFP_NOWARN); if (!kvalue) { - vvalue = vmalloc(size); - if (!vvalue) + kvalue = vmalloc(size); + if (!kvalue) return -ENOMEM; - kvalue = vvalue; } } @@ -462,10 +436,9 @@ getxattr(struct dentry *d, const char __user *name, void __user *value, than XATTR_SIZE_MAX bytes. Not possible. */ error = -E2BIG; } - if (vvalue) - vfree(vvalue); - else - kfree(kvalue); + + kvfree(kvalue); + return error; } @@ -522,17 +495,15 @@ listxattr(struct dentry *d, char __user *list, size_t size) { ssize_t error; char *klist = NULL; - char *vlist = NULL; /* If non-NULL, we used vmalloc() */ if (size) { if (size > XATTR_LIST_MAX) size = XATTR_LIST_MAX; klist = kmalloc(size, __GFP_NOWARN | GFP_KERNEL); if (!klist) { - vlist = vmalloc(size); - if (!vlist) + klist = vmalloc(size); + if (!klist) return -ENOMEM; - klist = vlist; } } @@ -545,10 +516,9 @@ listxattr(struct dentry *d, char __user *list, size_t size) than XATTR_LIST_MAX bytes. Not possible. */ error = -E2BIG; } - if (vlist) - vfree(vlist); - else - kfree(klist); + + kvfree(klist); + return error; } @@ -701,13 +671,20 @@ xattr_resolve_name(const struct xattr_handler **handlers, const char **name) return NULL; for_each_xattr_handler(handlers, handler) { - const char *n = strcmp_prefix(*name, handler->prefix); + const char *n; + + n = strcmp_prefix(*name, xattr_prefix(handler)); if (n) { + if (!handler->prefix ^ !*n) { + if (*n) + continue; + return ERR_PTR(-EINVAL); + } *name = n; - break; + return handler; } } - return handler; + return ERR_PTR(-EOPNOTSUPP); } /* @@ -719,8 +696,8 @@ generic_getxattr(struct dentry *dentry, const char *name, void *buffer, size_t s const struct xattr_handler *handler; handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); - if (!handler) - return -EOPNOTSUPP; + if (IS_ERR(handler)) + return PTR_ERR(handler); return handler->get(handler, dentry, name, buffer, size); } @@ -736,19 +713,25 @@ generic_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) if (!buffer) { for_each_xattr_handler(handlers, handler) { - size += handler->list(handler, dentry, NULL, 0, - NULL, 0); + if (!handler->name || + (handler->list && !handler->list(dentry))) + continue; + size += strlen(handler->name) + 1; } } else { char *buf = buffer; + size_t len; for_each_xattr_handler(handlers, handler) { - size = handler->list(handler, dentry, buf, buffer_size, - NULL, 0); - if (size > buffer_size) + if (!handler->name || + (handler->list && !handler->list(dentry))) + continue; + len = strlen(handler->name); + if (len + 1 > buffer_size) return -ERANGE; - buf += size; - buffer_size -= size; + memcpy(buf, handler->name, len + 1); + buf += len + 1; + buffer_size -= len + 1; } size = buf - buffer; } @@ -766,8 +749,8 @@ generic_setxattr(struct dentry *dentry, const char *name, const void *value, siz if (size == 0) value = ""; /* empty EA, do not remove */ handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); - if (!handler) - return -EOPNOTSUPP; + if (IS_ERR(handler)) + return PTR_ERR(handler); return handler->set(handler, dentry, name, value, size, flags); } @@ -781,8 +764,8 @@ generic_removexattr(struct dentry *dentry, const char *name) const struct xattr_handler *handler; handler = xattr_resolve_name(dentry->d_sb->s_xattr, &name); - if (!handler) - return -EOPNOTSUPP; + if (IS_ERR(handler)) + return PTR_ERR(handler); return handler->set(handler, dentry, name, NULL, 0, XATTR_REPLACE); } @@ -809,7 +792,7 @@ EXPORT_SYMBOL(generic_removexattr); const char *xattr_full_name(const struct xattr_handler *handler, const char *name) { - size_t prefix_len = strlen(handler->prefix); + size_t prefix_len = strlen(xattr_prefix(handler)); return name - prefix_len; } @@ -864,8 +847,22 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name, return ret; } -static int __simple_xattr_set(struct simple_xattrs *xattrs, const char *name, - const void *value, size_t size, int flags) +/** + * simple_xattr_set - xattr SET operation for in-memory/pseudo filesystems + * @xattrs: target simple_xattr list + * @name: name of the extended attribute + * @value: value of the xattr. If %NULL, will remove the attribute. + * @size: size of the new xattr + * @flags: %XATTR_{CREATE|REPLACE} + * + * %XATTR_CREATE is set, the xattr shouldn't exist already; otherwise fails + * with -EEXIST. If %XATTR_REPLACE is set, the xattr should exist; + * otherwise, fails with -ENODATA. + * + * Returns 0 on success, -errno on failure. + */ +int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, + const void *value, size_t size, int flags) { struct simple_xattr *xattr; struct simple_xattr *new_xattr = NULL; @@ -915,73 +912,64 @@ out: } -/** - * simple_xattr_set - xattr SET operation for in-memory/pseudo filesystems - * @xattrs: target simple_xattr list - * @name: name of the new extended attribute - * @value: value of the new xattr. If %NULL, will remove the attribute - * @size: size of the new xattr - * @flags: %XATTR_{CREATE|REPLACE} - * - * %XATTR_CREATE is set, the xattr shouldn't exist already; otherwise fails - * with -EEXIST. If %XATTR_REPLACE is set, the xattr should exist; - * otherwise, fails with -ENODATA. - * - * Returns 0 on success, -errno on failure. - */ -int simple_xattr_set(struct simple_xattrs *xattrs, const char *name, - const void *value, size_t size, int flags) -{ - if (size == 0) - value = ""; /* empty EA, do not remove */ - return __simple_xattr_set(xattrs, name, value, size, flags); -} - -/* - * xattr REMOVE operation for in-memory/pseudo filesystems - */ -int simple_xattr_remove(struct simple_xattrs *xattrs, const char *name) +static bool xattr_is_trusted(const char *name) { - return __simple_xattr_set(xattrs, name, NULL, 0, XATTR_REPLACE); + return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); } -static bool xattr_is_trusted(const char *name) +static int xattr_list_one(char **buffer, ssize_t *remaining_size, + const char *name) { - return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); + size_t len = strlen(name) + 1; + if (*buffer) { + if (*remaining_size < len) + return -ERANGE; + memcpy(*buffer, name, len); + *buffer += len; + } + *remaining_size -= len; + return 0; } /* * xattr LIST operation for in-memory/pseudo filesystems */ -ssize_t simple_xattr_list(struct simple_xattrs *xattrs, char *buffer, - size_t size) +ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, + char *buffer, size_t size) { bool trusted = capable(CAP_SYS_ADMIN); struct simple_xattr *xattr; - size_t used = 0; + ssize_t remaining_size = size; + int err = 0; + +#ifdef CONFIG_FS_POSIX_ACL + if (inode->i_acl) { + err = xattr_list_one(&buffer, &remaining_size, + XATTR_NAME_POSIX_ACL_ACCESS); + if (err) + return err; + } + if (inode->i_default_acl) { + err = xattr_list_one(&buffer, &remaining_size, + XATTR_NAME_POSIX_ACL_DEFAULT); + if (err) + return err; + } +#endif spin_lock(&xattrs->lock); list_for_each_entry(xattr, &xattrs->head, list) { - size_t len; - /* skip "trusted." attributes for unprivileged callers */ if (!trusted && xattr_is_trusted(xattr->name)) continue; - len = strlen(xattr->name) + 1; - used += len; - if (buffer) { - if (size < used) { - used = -ERANGE; - break; - } - memcpy(buffer, xattr->name, len); - buffer += len; - } + err = xattr_list_one(&buffer, &remaining_size, xattr->name); + if (err) + break; } spin_unlock(&xattrs->lock); - return used; + return err ? err : size - remaining_size; } /* diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index cc6b768fc..d1c66e465 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -84,6 +84,7 @@ kmem_zalloc(size_t size, xfs_km_flags_t flags) #define KM_ZONE_HWALIGN SLAB_HWCACHE_ALIGN #define KM_ZONE_RECLAIM SLAB_RECLAIM_ACCOUNT #define KM_ZONE_SPREAD SLAB_MEM_SPREAD +#define KM_ZONE_ACCOUNT SLAB_ACCOUNT #define kmem_zone kmem_cache #define kmem_zone_t struct kmem_cache diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 3479294c1..a708e38b4 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -535,6 +535,7 @@ xfs_agfl_write_verify( } const struct xfs_buf_ops xfs_agfl_buf_ops = { + .name = "xfs_agfl", .verify_read = xfs_agfl_read_verify, .verify_write = xfs_agfl_write_verify, }; @@ -1926,7 +1927,7 @@ xfs_alloc_space_available( * Decide whether to use this allocation group for this allocation. * If so, fix up the btree freelist's size. */ -STATIC int /* error */ +int /* error */ xfs_alloc_fix_freelist( struct xfs_alloc_arg *args, /* allocation argument structure */ int flags) /* XFS_ALLOC_FLAG_... */ @@ -2339,6 +2340,7 @@ xfs_agf_write_verify( } const struct xfs_buf_ops xfs_agf_buf_ops = { + .name = "xfs_agf", .verify_read = xfs_agf_read_verify, .verify_write = xfs_agf_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 0ecde4d5c..135eb3d24 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -235,5 +235,6 @@ xfs_alloc_get_rec( int xfs_read_agf(struct xfs_mount *mp, struct xfs_trans *tp, xfs_agnumber_t agno, int flags, struct xfs_buf **bpp); +int xfs_alloc_fix_freelist(struct xfs_alloc_arg *args, int flags); #endif /* __XFS_ALLOC_H__ */ diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 90de071dd..444626ddb 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -293,14 +293,7 @@ xfs_allocbt_verify( level = be16_to_cpu(block->bb_level); switch (block->bb_magic) { case cpu_to_be32(XFS_ABTB_CRC_MAGIC): - if (!xfs_sb_version_hascrc(&mp->m_sb)) - return false; - if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) - return false; - if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) - return false; - if (pag && - be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + if (!xfs_btree_sblock_v5hdr_verify(bp)) return false; /* fall through */ case cpu_to_be32(XFS_ABTB_MAGIC): @@ -311,14 +304,7 @@ xfs_allocbt_verify( return false; break; case cpu_to_be32(XFS_ABTC_CRC_MAGIC): - if (!xfs_sb_version_hascrc(&mp->m_sb)) - return false; - if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) - return false; - if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) - return false; - if (pag && - be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + if (!xfs_btree_sblock_v5hdr_verify(bp)) return false; /* fall through */ case cpu_to_be32(XFS_ABTC_MAGIC): @@ -332,21 +318,7 @@ xfs_allocbt_verify( return false; } - /* numrecs verification */ - if (be16_to_cpu(block->bb_numrecs) > mp->m_alloc_mxr[level != 0]) - return false; - - /* sibling pointer verification */ - if (!block->bb_u.s.bb_leftsib || - (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks && - block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK))) - return false; - if (!block->bb_u.s.bb_rightsib || - (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks && - block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK))) - return false; - - return true; + return xfs_btree_sblock_verify(bp, mp->m_alloc_mxr[level != 0]); } static void @@ -379,6 +351,7 @@ xfs_allocbt_write_verify( } const struct xfs_buf_ops xfs_allocbt_buf_ops = { + .name = "xfs_allocbt", .verify_read = xfs_allocbt_read_verify, .verify_write = xfs_allocbt_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index f949818fa..fa3b948ef 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -207,7 +207,7 @@ xfs_attr_set( struct xfs_trans_res tres; xfs_fsblock_t firstblock; int rsvd = (flags & ATTR_ROOT) != 0; - int error, err2, committed, local; + int error, err2, local; XFS_STATS_INC(mp, xs_attr_set); @@ -334,25 +334,15 @@ xfs_attr_set( */ xfs_bmap_init(args.flist, args.firstblock); error = xfs_attr_shortform_to_leaf(&args); - if (!error) { - error = xfs_bmap_finish(&args.trans, args.flist, - &committed); - } + if (!error) + error = xfs_bmap_finish(&args.trans, args.flist, dp); if (error) { - ASSERT(committed); args.trans = NULL; xfs_bmap_cancel(&flist); goto out; } /* - * bmap_finish() may have committed the last trans and started - * a new one. We need the inode to be in all transactions. - */ - if (committed) - xfs_trans_ijoin(args.trans, dp, 0); - - /* * Commit the leaf transformation. We'll need another (linked) * transaction to add the new attribute to the leaf. */ @@ -568,7 +558,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) { xfs_inode_t *dp; struct xfs_buf *bp; - int retval, error, committed, forkoff; + int retval, error, forkoff; trace_xfs_attr_leaf_addname(args); @@ -628,25 +618,15 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) */ xfs_bmap_init(args->flist, args->firstblock); error = xfs_attr3_leaf_to_node(args); - if (!error) { - error = xfs_bmap_finish(&args->trans, args->flist, - &committed); - } + if (!error) + error = xfs_bmap_finish(&args->trans, args->flist, dp); if (error) { - ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); return error; } /* - * bmap_finish() may have committed the last trans and started - * a new one. We need the inode to be in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, dp, 0); - - /* * Commit the current trans (including the inode) and start * a new one. */ @@ -729,25 +709,14 @@ xfs_attr_leaf_addname(xfs_da_args_t *args) xfs_bmap_init(args->flist, args->firstblock); error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ - if (!error) { + if (!error) error = xfs_bmap_finish(&args->trans, - args->flist, - &committed); - } + args->flist, dp); if (error) { - ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); return error; } - - /* - * bmap_finish() may have committed the last trans - * and started a new one. We need the inode to be - * in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, dp, 0); } /* @@ -775,7 +744,7 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) { xfs_inode_t *dp; struct xfs_buf *bp; - int error, committed, forkoff; + int error, forkoff; trace_xfs_attr_leaf_removename(args); @@ -803,23 +772,13 @@ xfs_attr_leaf_removename(xfs_da_args_t *args) xfs_bmap_init(args->flist, args->firstblock); error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ - if (!error) { - error = xfs_bmap_finish(&args->trans, args->flist, - &committed); - } + if (!error) + error = xfs_bmap_finish(&args->trans, args->flist, dp); if (error) { - ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); return error; } - - /* - * bmap_finish() may have committed the last trans and started - * a new one. We need the inode to be in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, dp, 0); } return 0; } @@ -877,7 +836,7 @@ xfs_attr_node_addname(xfs_da_args_t *args) xfs_da_state_blk_t *blk; xfs_inode_t *dp; xfs_mount_t *mp; - int committed, retval, error; + int retval, error; trace_xfs_attr_node_addname(args); @@ -938,27 +897,16 @@ restart: state = NULL; xfs_bmap_init(args->flist, args->firstblock); error = xfs_attr3_leaf_to_node(args); - if (!error) { + if (!error) error = xfs_bmap_finish(&args->trans, - args->flist, - &committed); - } + args->flist, dp); if (error) { - ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); goto out; } /* - * bmap_finish() may have committed the last trans - * and started a new one. We need the inode to be - * in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, dp, 0); - - /* * Commit the node conversion and start the next * trans in the chain. */ @@ -977,23 +925,13 @@ restart: */ xfs_bmap_init(args->flist, args->firstblock); error = xfs_da3_split(state); - if (!error) { - error = xfs_bmap_finish(&args->trans, args->flist, - &committed); - } + if (!error) + error = xfs_bmap_finish(&args->trans, args->flist, dp); if (error) { - ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); goto out; } - - /* - * bmap_finish() may have committed the last trans and started - * a new one. We need the inode to be in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, dp, 0); } else { /* * Addition succeeded, update Btree hashvals. @@ -1086,25 +1024,14 @@ restart: if (retval && (state->path.active > 1)) { xfs_bmap_init(args->flist, args->firstblock); error = xfs_da3_join(state); - if (!error) { + if (!error) error = xfs_bmap_finish(&args->trans, - args->flist, - &committed); - } + args->flist, dp); if (error) { - ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); goto out; } - - /* - * bmap_finish() may have committed the last trans - * and started a new one. We need the inode to be - * in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, dp, 0); } /* @@ -1146,7 +1073,7 @@ xfs_attr_node_removename(xfs_da_args_t *args) xfs_da_state_blk_t *blk; xfs_inode_t *dp; struct xfs_buf *bp; - int retval, error, committed, forkoff; + int retval, error, forkoff; trace_xfs_attr_node_removename(args); @@ -1220,24 +1147,13 @@ xfs_attr_node_removename(xfs_da_args_t *args) if (retval && (state->path.active > 1)) { xfs_bmap_init(args->flist, args->firstblock); error = xfs_da3_join(state); - if (!error) { - error = xfs_bmap_finish(&args->trans, args->flist, - &committed); - } + if (!error) + error = xfs_bmap_finish(&args->trans, args->flist, dp); if (error) { - ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); goto out; } - - /* - * bmap_finish() may have committed the last trans and started - * a new one. We need the inode to be in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, dp, 0); - /* * Commit the Btree join operation and start a new trans. */ @@ -1265,25 +1181,14 @@ xfs_attr_node_removename(xfs_da_args_t *args) xfs_bmap_init(args->flist, args->firstblock); error = xfs_attr3_leaf_to_shortform(bp, args, forkoff); /* bp is gone due to xfs_da_shrink_inode */ - if (!error) { + if (!error) error = xfs_bmap_finish(&args->trans, - args->flist, - &committed); - } + args->flist, dp); if (error) { - ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); goto out; } - - /* - * bmap_finish() may have committed the last trans - * and started a new one. We need the inode to be - * in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, dp, 0); } else xfs_trans_brelse(args->trans, bp); } diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index aa187f7ba..01a5ecfed 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -328,6 +328,7 @@ xfs_attr3_leaf_read_verify( } const struct xfs_buf_ops xfs_attr3_leaf_buf_ops = { + .name = "xfs_attr3_leaf", .verify_read = xfs_attr3_leaf_read_verify, .verify_write = xfs_attr3_leaf_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_attr_remote.c b/fs/xfs/libxfs/xfs_attr_remote.c index 5ab95ffa4..a572532a5 100644 --- a/fs/xfs/libxfs/xfs_attr_remote.c +++ b/fs/xfs/libxfs/xfs_attr_remote.c @@ -201,6 +201,7 @@ xfs_attr3_rmt_write_verify( } const struct xfs_buf_ops xfs_attr3_rmt_buf_ops = { + .name = "xfs_attr3_rmt", .verify_read = xfs_attr3_rmt_read_verify, .verify_write = xfs_attr3_rmt_write_verify, }; @@ -447,8 +448,6 @@ xfs_attr_rmtval_set( * Roll through the "value", allocating blocks on disk as required. */ while (blkcnt > 0) { - int committed; - /* * Allocate a single extent, up to the size of the value. * @@ -466,24 +465,14 @@ xfs_attr_rmtval_set( error = xfs_bmapi_write(args->trans, dp, (xfs_fileoff_t)lblkno, blkcnt, XFS_BMAPI_ATTRFORK, args->firstblock, args->total, &map, &nmap, args->flist); - if (!error) { - error = xfs_bmap_finish(&args->trans, args->flist, - &committed); - } + if (!error) + error = xfs_bmap_finish(&args->trans, args->flist, dp); if (error) { - ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); return error; } - /* - * bmap_finish() may have committed the last trans and started - * a new one. We need the inode to be in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, dp, 0); - ASSERT(nmap == 1); ASSERT((map.br_startblock != DELAYSTARTBLOCK) && (map.br_startblock != HOLESTARTBLOCK)); @@ -614,31 +603,20 @@ xfs_attr_rmtval_remove( blkcnt = args->rmtblkcnt; done = 0; while (!done) { - int committed; - xfs_bmap_init(args->flist, args->firstblock); error = xfs_bunmapi(args->trans, args->dp, lblkno, blkcnt, XFS_BMAPI_ATTRFORK, 1, args->firstblock, args->flist, &done); - if (!error) { + if (!error) error = xfs_bmap_finish(&args->trans, args->flist, - &committed); - } + args->dp); if (error) { - ASSERT(committed); args->trans = NULL; xfs_bmap_cancel(args->flist); return error; } /* - * bmap_finish() may have committed the last trans and started - * a new one. We need the inode to be in all transactions. - */ - if (committed) - xfs_trans_ijoin(args->trans, args->dp, 0); - - /* * Close out trans and start the next one in the chain. */ error = xfs_trans_roll(&args->trans, args->dp); diff --git a/fs/xfs/libxfs/xfs_bit.c b/fs/xfs/libxfs/xfs_bit.c index 0e8885a59..0a94cce5e 100644 --- a/fs/xfs/libxfs/xfs_bit.c +++ b/fs/xfs/libxfs/xfs_bit.c @@ -32,13 +32,13 @@ int xfs_bitmap_empty(uint *map, uint size) { uint i; - uint ret = 0; for (i = 0; i < size; i++) { - ret |= map[i]; + if (map[i] != 0) + return 0; } - return (ret == 0); + return 1; } /* diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 119c2422a..ef00156f4 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -325,9 +325,11 @@ xfs_check_block( /* * Check that the extents for the inode ip are in the right order in all - * btree leaves. + * btree leaves. THis becomes prohibitively expensive for large extent count + * files, so don't bother with inodes that have more than 10,000 extents in + * them. The btree record ordering checks will still be done, so for such large + * bmapbt constructs that is going to catch most corruptions. */ - STATIC void xfs_bmap_check_leaf_extents( xfs_btree_cur_t *cur, /* btree cursor or null */ @@ -352,6 +354,10 @@ xfs_bmap_check_leaf_extents( return; } + /* skip large extent count inodes */ + if (ip->i_d.di_nextents > 10000) + return; + bno = NULLFSBLOCK; mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); @@ -1111,7 +1117,6 @@ xfs_bmap_add_attrfork( xfs_trans_t *tp; /* transaction pointer */ int blks; /* space reservation */ int version = 1; /* superblock attr version */ - int committed; /* xaction was committed */ int logflags; /* logging flags */ int error; /* error return value */ @@ -1214,7 +1219,7 @@ xfs_bmap_add_attrfork( xfs_log_sb(tp); } - error = xfs_bmap_finish(&tp, &flist, &committed); + error = xfs_bmap_finish(&tp, &flist, NULL); if (error) goto bmap_cancel; error = xfs_trans_commit(tp); @@ -1723,10 +1728,11 @@ xfs_bmap_add_extent_delay_real( xfs_filblks_t temp=0; /* value for da_new calculations */ xfs_filblks_t temp2=0;/* value for da_new calculations */ int tmp_rval; /* partial logging flags */ + int whichfork = XFS_DATA_FORK; struct xfs_mount *mp; - mp = bma->tp ? bma->tp->t_mountp : NULL; - ifp = XFS_IFORK_PTR(bma->ip, XFS_DATA_FORK); + mp = bma->ip->i_mount; + ifp = XFS_IFORK_PTR(bma->ip, whichfork); ASSERT(bma->idx >= 0); ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); @@ -1785,7 +1791,7 @@ xfs_bmap_add_extent_delay_real( * Don't set contiguous if the combined extent would be too large. * Also check for all-three-contiguous being too large. */ - if (bma->idx < bma->ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { + if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { state |= BMAP_RIGHT_VALID; xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx + 1), &RIGHT); @@ -2016,10 +2022,10 @@ xfs_bmap_add_extent_delay_real( XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } - if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { + if (xfs_bmap_needs_btree(bma->ip, whichfork)) { error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, bma->firstblock, bma->flist, - &bma->cur, 1, &tmp_rval, XFS_DATA_FORK); + &bma->cur, 1, &tmp_rval, whichfork); rval |= tmp_rval; if (error) goto done; @@ -2100,10 +2106,10 @@ xfs_bmap_add_extent_delay_real( XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } - if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { + if (xfs_bmap_needs_btree(bma->ip, whichfork)) { error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, bma->firstblock, bma->flist, &bma->cur, 1, - &tmp_rval, XFS_DATA_FORK); + &tmp_rval, whichfork); rval |= tmp_rval; if (error) goto done; @@ -2169,10 +2175,10 @@ xfs_bmap_add_extent_delay_real( XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done); } - if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { + if (xfs_bmap_needs_btree(bma->ip, whichfork)) { error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, bma->firstblock, bma->flist, &bma->cur, - 1, &tmp_rval, XFS_DATA_FORK); + 1, &tmp_rval, whichfork); rval |= tmp_rval; if (error) goto done; @@ -2215,13 +2221,13 @@ xfs_bmap_add_extent_delay_real( } /* convert to a btree if necessary */ - if (xfs_bmap_needs_btree(bma->ip, XFS_DATA_FORK)) { + if (xfs_bmap_needs_btree(bma->ip, whichfork)) { int tmp_logflags; /* partial log flag return val */ ASSERT(bma->cur == NULL); error = xfs_bmap_extents_to_btree(bma->tp, bma->ip, bma->firstblock, bma->flist, &bma->cur, - da_old > 0, &tmp_logflags, XFS_DATA_FORK); + da_old > 0, &tmp_logflags, whichfork); bma->logflags |= tmp_logflags; if (error) goto done; @@ -2242,7 +2248,7 @@ xfs_bmap_add_extent_delay_real( if (bma->cur) bma->cur->bc_private.b.allocated = 0; - xfs_bmap_check_leaf_extents(bma->cur, bma->ip, XFS_DATA_FORK); + xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork); done: bma->logflags |= rval; return error; @@ -2939,7 +2945,7 @@ xfs_bmap_add_extent_hole_real( int state; /* state bits, accessed thru macros */ struct xfs_mount *mp; - mp = bma->tp ? bma->tp->t_mountp : NULL; + mp = bma->ip->i_mount; ifp = XFS_IFORK_PTR(bma->ip, whichfork); ASSERT(bma->idx >= 0); @@ -5950,7 +5956,6 @@ xfs_bmap_split_extent( struct xfs_trans *tp; struct xfs_bmap_free free_list; xfs_fsblock_t firstfsb; - int committed; int error; tp = xfs_trans_alloc(mp, XFS_TRANS_DIOSTRAT); @@ -5971,7 +5976,7 @@ xfs_bmap_split_extent( if (error) goto out; - error = xfs_bmap_finish(&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, NULL); if (error) goto out; diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index a160f8a5a..423a34e83 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -195,7 +195,7 @@ void xfs_bmap_add_free(xfs_fsblock_t bno, xfs_filblks_t len, struct xfs_bmap_free *flist, struct xfs_mount *mp); void xfs_bmap_cancel(struct xfs_bmap_free *flist); int xfs_bmap_finish(struct xfs_trans **tp, struct xfs_bmap_free *flist, - int *committed); + struct xfs_inode *ip); void xfs_bmap_compute_maxlevels(struct xfs_mount *mp, int whichfork); int xfs_bmap_first_unused(struct xfs_trans *tp, struct xfs_inode *ip, xfs_extlen_t len, xfs_fileoff_t *unused, int whichfork); diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 6b0cf6546..1637c37bf 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -720,6 +720,7 @@ xfs_bmbt_write_verify( } const struct xfs_buf_ops xfs_bmbt_buf_ops = { + .name = "xfs_bmbt", .verify_read = xfs_bmbt_read_verify, .verify_write = xfs_bmbt_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index af1bbee55..a0eb18ce3 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -4080,3 +4080,61 @@ xfs_btree_change_owner( return 0; } + +/** + * xfs_btree_sblock_v5hdr_verify() -- verify the v5 fields of a short-format + * btree block + * + * @bp: buffer containing the btree block + * @max_recs: pointer to the m_*_mxr max records field in the xfs mount + * @pag_max_level: pointer to the per-ag max level field + */ +bool +xfs_btree_sblock_v5hdr_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + struct xfs_perag *pag = bp->b_pag; + + if (!xfs_sb_version_hascrc(&mp->m_sb)) + return false; + if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) + return false; + if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) + return false; + if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + return false; + return true; +} + +/** + * xfs_btree_sblock_verify() -- verify a short-format btree block + * + * @bp: buffer containing the btree block + * @max_recs: maximum records allowed in this btree node + */ +bool +xfs_btree_sblock_verify( + struct xfs_buf *bp, + unsigned int max_recs) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); + + /* numrecs verification */ + if (be16_to_cpu(block->bb_numrecs) > max_recs) + return false; + + /* sibling pointer verification */ + if (!block->bb_u.s.bb_leftsib || + (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks && + block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK))) + return false; + if (!block->bb_u.s.bb_rightsib || + (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks && + block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK))) + return false; + + return true; +} diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index 992dec063..2e874be70 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -472,4 +472,7 @@ static inline int xfs_btree_get_level(struct xfs_btree_block *block) #define XFS_BTREE_TRACE_ARGR(c, r) #define XFS_BTREE_TRACE_CURSOR(c, t) +bool xfs_btree_sblock_v5hdr_verify(struct xfs_buf *bp); +bool xfs_btree_sblock_verify(struct xfs_buf *bp, unsigned int max_recs); + #endif /* __XFS_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index e89a0f8f8..097bf7717 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -245,6 +245,7 @@ xfs_da3_node_read_verify( } const struct xfs_buf_ops xfs_da3_node_buf_ops = { + .name = "xfs_da3_node", .verify_read = xfs_da3_node_read_verify, .verify_write = xfs_da3_node_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_dir2_block.c b/fs/xfs/libxfs/xfs_dir2_block.c index 9c10e2b8c..aa17cb788 100644 --- a/fs/xfs/libxfs/xfs_dir2_block.c +++ b/fs/xfs/libxfs/xfs_dir2_block.c @@ -123,6 +123,7 @@ xfs_dir3_block_write_verify( } const struct xfs_buf_ops xfs_dir3_block_buf_ops = { + .name = "xfs_dir3_block", .verify_read = xfs_dir3_block_read_verify, .verify_write = xfs_dir3_block_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index af71a84f3..725fc7841 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -305,11 +305,13 @@ xfs_dir3_data_write_verify( } const struct xfs_buf_ops xfs_dir3_data_buf_ops = { + .name = "xfs_dir3_data", .verify_read = xfs_dir3_data_read_verify, .verify_write = xfs_dir3_data_write_verify, }; static const struct xfs_buf_ops xfs_dir3_data_reada_buf_ops = { + .name = "xfs_dir3_data_reada", .verify_read = xfs_dir3_data_reada_verify, .verify_write = xfs_dir3_data_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_dir2_leaf.c b/fs/xfs/libxfs/xfs_dir2_leaf.c index 3923e1f94..b887fb2a2 100644 --- a/fs/xfs/libxfs/xfs_dir2_leaf.c +++ b/fs/xfs/libxfs/xfs_dir2_leaf.c @@ -245,11 +245,13 @@ xfs_dir3_leafn_write_verify( } const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops = { + .name = "xfs_dir3_leaf1", .verify_read = xfs_dir3_leaf1_read_verify, .verify_write = xfs_dir3_leaf1_write_verify, }; const struct xfs_buf_ops xfs_dir3_leafn_buf_ops = { + .name = "xfs_dir3_leafn", .verify_read = xfs_dir3_leafn_read_verify, .verify_write = xfs_dir3_leafn_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_dir2_node.c b/fs/xfs/libxfs/xfs_dir2_node.c index 70b0cb2fd..63ee03db7 100644 --- a/fs/xfs/libxfs/xfs_dir2_node.c +++ b/fs/xfs/libxfs/xfs_dir2_node.c @@ -150,6 +150,7 @@ xfs_dir3_free_write_verify( } const struct xfs_buf_ops xfs_dir3_free_buf_ops = { + .name = "xfs_dir3_free", .verify_read = xfs_dir3_free_read_verify, .verify_write = xfs_dir3_free_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_dquot_buf.c b/fs/xfs/libxfs/xfs_dquot_buf.c index 5331b7f04..3cc3cf767 100644 --- a/fs/xfs/libxfs/xfs_dquot_buf.c +++ b/fs/xfs/libxfs/xfs_dquot_buf.c @@ -54,7 +54,7 @@ xfs_dqcheck( xfs_dqid_t id, uint type, /* used only when IO_dorepair is true */ uint flags, - char *str) + const char *str) { xfs_dqblk_t *d = (xfs_dqblk_t *)ddq; int errs = 0; @@ -207,7 +207,8 @@ xfs_dquot_buf_verify_crc( STATIC bool xfs_dquot_buf_verify( struct xfs_mount *mp, - struct xfs_buf *bp) + struct xfs_buf *bp, + int warn) { struct xfs_dqblk *d = (struct xfs_dqblk *)bp->b_addr; xfs_dqid_t id = 0; @@ -240,8 +241,7 @@ xfs_dquot_buf_verify( if (i == 0) id = be32_to_cpu(ddq->d_id); - error = xfs_dqcheck(mp, ddq, id + i, 0, XFS_QMOPT_DOWARN, - "xfs_dquot_buf_verify"); + error = xfs_dqcheck(mp, ddq, id + i, 0, warn, __func__); if (error) return false; } @@ -256,7 +256,7 @@ xfs_dquot_buf_read_verify( if (!xfs_dquot_buf_verify_crc(mp, bp)) xfs_buf_ioerror(bp, -EFSBADCRC); - else if (!xfs_dquot_buf_verify(mp, bp)) + else if (!xfs_dquot_buf_verify(mp, bp, XFS_QMOPT_DOWARN)) xfs_buf_ioerror(bp, -EFSCORRUPTED); if (bp->b_error) @@ -264,6 +264,25 @@ xfs_dquot_buf_read_verify( } /* + * readahead errors are silent and simply leave the buffer as !done so a real + * read will then be run with the xfs_dquot_buf_ops verifier. See + * xfs_inode_buf_verify() for why we use EIO and ~XBF_DONE here rather than + * reporting the failure. + */ +static void +xfs_dquot_buf_readahead_verify( + struct xfs_buf *bp) +{ + struct xfs_mount *mp = bp->b_target->bt_mount; + + if (!xfs_dquot_buf_verify_crc(mp, bp) || + !xfs_dquot_buf_verify(mp, bp, 0)) { + xfs_buf_ioerror(bp, -EIO); + bp->b_flags &= ~XBF_DONE; + } +} + +/* * we don't calculate the CRC here as that is done when the dquot is flushed to * the buffer after the update is done. This ensures that the dquot in the * buffer always has an up-to-date CRC value. @@ -274,7 +293,7 @@ xfs_dquot_buf_write_verify( { struct xfs_mount *mp = bp->b_target->bt_mount; - if (!xfs_dquot_buf_verify(mp, bp)) { + if (!xfs_dquot_buf_verify(mp, bp, XFS_QMOPT_DOWARN)) { xfs_buf_ioerror(bp, -EFSCORRUPTED); xfs_verifier_error(bp); return; @@ -282,7 +301,13 @@ xfs_dquot_buf_write_verify( } const struct xfs_buf_ops xfs_dquot_buf_ops = { + .name = "xfs_dquot", .verify_read = xfs_dquot_buf_read_verify, .verify_write = xfs_dquot_buf_write_verify, }; +const struct xfs_buf_ops xfs_dquot_buf_ra_ops = { + .name = "xfs_dquot_ra", + .verify_read = xfs_dquot_buf_readahead_verify, + .verify_write = xfs_dquot_buf_write_verify, +}; diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index e2536bb1c..dc97eb21a 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -984,8 +984,6 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) /* * Values for di_flags - * There should be a one-to-one correspondence between these flags and the - * XFS_XFLAG_s. */ #define XFS_DIFLAG_REALTIME_BIT 0 /* file's blocks come from rt area */ #define XFS_DIFLAG_PREALLOC_BIT 1 /* file space has been preallocated */ @@ -1026,6 +1024,15 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM) /* + * Values for di_flags2 These start by being exposed to userspace in the upper + * 16 bits of the XFS_XFLAG_s range. + */ +#define XFS_DIFLAG2_DAX_BIT 0 /* use DAX for this inode */ +#define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT) + +#define XFS_DIFLAG2_ANY (XFS_DIFLAG2_DAX) + +/* * Inode number format: * low inopblog bits - offset in block * next agblklog bits - block number in ag diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index b2b73a998..fffe3d01b 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -36,40 +36,6 @@ struct dioattr { #endif /* - * Structure for XFS_IOC_FSGETXATTR[A] and XFS_IOC_FSSETXATTR. - */ -#ifndef HAVE_FSXATTR -struct fsxattr { - __u32 fsx_xflags; /* xflags field value (get/set) */ - __u32 fsx_extsize; /* extsize field value (get/set)*/ - __u32 fsx_nextents; /* nextents field value (get) */ - __u32 fsx_projid; /* project identifier (get/set) */ - unsigned char fsx_pad[12]; -}; -#endif - -/* - * Flags for the bs_xflags/fsx_xflags field - * There should be a one-to-one correspondence between these flags and the - * XFS_DIFLAG_s. - */ -#define XFS_XFLAG_REALTIME 0x00000001 /* data in realtime volume */ -#define XFS_XFLAG_PREALLOC 0x00000002 /* preallocated file extents */ -#define XFS_XFLAG_IMMUTABLE 0x00000008 /* file cannot be modified */ -#define XFS_XFLAG_APPEND 0x00000010 /* all writes append */ -#define XFS_XFLAG_SYNC 0x00000020 /* all writes synchronous */ -#define XFS_XFLAG_NOATIME 0x00000040 /* do not update access time */ -#define XFS_XFLAG_NODUMP 0x00000080 /* do not include in backups */ -#define XFS_XFLAG_RTINHERIT 0x00000100 /* create with rt bit set */ -#define XFS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */ -#define XFS_XFLAG_NOSYMLINKS 0x00000400 /* disallow symlink creation */ -#define XFS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */ -#define XFS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */ -#define XFS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */ -#define XFS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */ -#define XFS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ - -/* * Structure for XFS_IOC_GETBMAP. * On input, fill in bmv_offset and bmv_length of the first structure * to indicate the area of interest in the file, and bmv_entries with @@ -514,8 +480,8 @@ typedef struct xfs_swapext #define XFS_IOC_ALLOCSP _IOW ('X', 10, struct xfs_flock64) #define XFS_IOC_FREESP _IOW ('X', 11, struct xfs_flock64) #define XFS_IOC_DIOINFO _IOR ('X', 30, struct dioattr) -#define XFS_IOC_FSGETXATTR _IOR ('X', 31, struct fsxattr) -#define XFS_IOC_FSSETXATTR _IOW ('X', 32, struct fsxattr) +#define XFS_IOC_FSGETXATTR FS_IOC_FSGETXATTR +#define XFS_IOC_FSSETXATTR FS_IOC_FSSETXATTR #define XFS_IOC_ALLOCSP64 _IOW ('X', 36, struct xfs_flock64) #define XFS_IOC_FREESP64 _IOW ('X', 37, struct xfs_flock64) #define XFS_IOC_GETBMAP _IOWR('X', 38, struct getbmap) diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 70c1db99f..66d702e6b 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2572,6 +2572,7 @@ xfs_agi_write_verify( } const struct xfs_buf_ops xfs_agi_buf_ops = { + .name = "xfs_agi", .verify_read = xfs_agi_read_verify, .verify_write = xfs_agi_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index f39b285be..c679f3c05 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -221,7 +221,6 @@ xfs_inobt_verify( { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_btree_block *block = XFS_BUF_TO_BLOCK(bp); - struct xfs_perag *pag = bp->b_pag; unsigned int level; /* @@ -237,14 +236,7 @@ xfs_inobt_verify( switch (block->bb_magic) { case cpu_to_be32(XFS_IBT_CRC_MAGIC): case cpu_to_be32(XFS_FIBT_CRC_MAGIC): - if (!xfs_sb_version_hascrc(&mp->m_sb)) - return false; - if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid)) - return false; - if (block->bb_u.s.bb_blkno != cpu_to_be64(bp->b_bn)) - return false; - if (pag && - be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno) + if (!xfs_btree_sblock_v5hdr_verify(bp)) return false; /* fall through */ case cpu_to_be32(XFS_IBT_MAGIC): @@ -254,24 +246,12 @@ xfs_inobt_verify( return 0; } - /* numrecs and level verification */ + /* level verification */ level = be16_to_cpu(block->bb_level); if (level >= mp->m_in_maxlevels) return false; - if (be16_to_cpu(block->bb_numrecs) > mp->m_inobt_mxr[level != 0]) - return false; - - /* sibling pointer verification */ - if (!block->bb_u.s.bb_leftsib || - (be32_to_cpu(block->bb_u.s.bb_leftsib) >= mp->m_sb.sb_agblocks && - block->bb_u.s.bb_leftsib != cpu_to_be32(NULLAGBLOCK))) - return false; - if (!block->bb_u.s.bb_rightsib || - (be32_to_cpu(block->bb_u.s.bb_rightsib) >= mp->m_sb.sb_agblocks && - block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK))) - return false; - return true; + return xfs_btree_sblock_verify(bp, mp->m_inobt_mxr[level != 0]); } static void @@ -304,6 +284,7 @@ xfs_inobt_write_verify( } const struct xfs_buf_ops xfs_inobt_buf_ops = { + .name = "xfs_inobt", .verify_read = xfs_inobt_read_verify, .verify_write = xfs_inobt_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 65485cfc4..1aabfda66 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -68,6 +68,8 @@ xfs_inobp_check( * recovery and we don't get unnecssary panics on debug kernels. We use EIO here * because all we want to do is say readahead failed; there is no-one to report * the error to, so this will distinguish it from a non-ra verifier failure. + * Changes to this readahead error behavour also need to be reflected in + * xfs_dquot_buf_readahead_verify(). */ static void xfs_inode_buf_verify( @@ -134,11 +136,13 @@ xfs_inode_buf_write_verify( } const struct xfs_buf_ops xfs_inode_buf_ops = { + .name = "xfs_inode", .verify_read = xfs_inode_buf_read_verify, .verify_write = xfs_inode_buf_write_verify, }; const struct xfs_buf_ops xfs_inode_buf_ra_ops = { + .name = "xxfs_inode_ra", .verify_read = xfs_inode_buf_readahead_verify, .verify_write = xfs_inode_buf_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 1c55ccbb3..8e385f91d 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -60,6 +60,7 @@ typedef struct xlog_recover { */ #define XLOG_BC_TABLE_SIZE 64 +#define XLOG_RECOVER_CRCPASS 0 #define XLOG_RECOVER_PASS1 1 #define XLOG_RECOVER_PASS2 2 diff --git a/fs/xfs/libxfs/xfs_quota_defs.h b/fs/xfs/libxfs/xfs_quota_defs.h index 1b0a08379..f51078f1e 100644 --- a/fs/xfs/libxfs/xfs_quota_defs.h +++ b/fs/xfs/libxfs/xfs_quota_defs.h @@ -153,7 +153,7 @@ typedef __uint16_t xfs_qwarncnt_t; #define XFS_QMOPT_RESBLK_MASK (XFS_QMOPT_RES_REGBLKS | XFS_QMOPT_RES_RTBLKS) extern int xfs_dqcheck(struct xfs_mount *mp, xfs_disk_dquot_t *ddq, - xfs_dqid_t id, uint type, uint flags, char *str); + xfs_dqid_t id, uint type, uint flags, const char *str); extern int xfs_calc_dquots_per_chunk(unsigned int nbblks); #endif /* __XFS_QUOTA_H__ */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index a0b071d88..8a53eaa34 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -679,11 +679,13 @@ xfs_sb_write_verify( } const struct xfs_buf_ops xfs_sb_buf_ops = { + .name = "xfs_sb", .verify_read = xfs_sb_read_verify, .verify_write = xfs_sb_write_verify, }; const struct xfs_buf_ops xfs_sb_quiet_buf_ops = { + .name = "xfs_sb_quiet", .verify_read = xfs_sb_quiet_read_verify, .verify_write = xfs_sb_write_verify, }; diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h index 5be529707..15c3ceb84 100644 --- a/fs/xfs/libxfs/xfs_shared.h +++ b/fs/xfs/libxfs/xfs_shared.h @@ -49,6 +49,7 @@ extern const struct xfs_buf_ops xfs_inobt_buf_ops; extern const struct xfs_buf_ops xfs_inode_buf_ops; extern const struct xfs_buf_ops xfs_inode_buf_ra_ops; extern const struct xfs_buf_ops xfs_dquot_buf_ops; +extern const struct xfs_buf_ops xfs_dquot_buf_ra_ops; extern const struct xfs_buf_ops xfs_sb_buf_ops; extern const struct xfs_buf_ops xfs_sb_quiet_buf_ops; extern const struct xfs_buf_ops xfs_symlink_buf_ops; diff --git a/fs/xfs/libxfs/xfs_symlink_remote.c b/fs/xfs/libxfs/xfs_symlink_remote.c index cb6fd20a4..2e2c6716b 100644 --- a/fs/xfs/libxfs/xfs_symlink_remote.c +++ b/fs/xfs/libxfs/xfs_symlink_remote.c @@ -168,6 +168,7 @@ xfs_symlink_write_verify( } const struct xfs_buf_ops xfs_symlink_buf_ops = { + .name = "xfs_symlink", .verify_read = xfs_symlink_read_verify, .verify_write = xfs_symlink_write_verify, }; diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 6bb470fbb..2d5df1f23 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -252,29 +252,6 @@ xfs_set_mode(struct inode *inode, umode_t mode) return error; } -static int -xfs_acl_exists(struct inode *inode, unsigned char *name) -{ - int len = XFS_ACL_MAX_SIZE(XFS_M(inode->i_sb)); - - return (xfs_attr_get(XFS_I(inode), name, NULL, &len, - ATTR_ROOT|ATTR_KERNOVAL) == 0); -} - -int -posix_acl_access_exists(struct inode *inode) -{ - return xfs_acl_exists(inode, SGI_ACL_FILE); -} - -int -posix_acl_default_exists(struct inode *inode) -{ - if (!S_ISDIR(inode->i_mode)) - return 0; - return xfs_acl_exists(inode, SGI_ACL_DEFAULT); -} - int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) { diff --git a/fs/xfs/xfs_acl.h b/fs/xfs/xfs_acl.h index 52f8255d6..286fa8921 100644 --- a/fs/xfs/xfs_acl.h +++ b/fs/xfs/xfs_acl.h @@ -24,16 +24,12 @@ struct posix_acl; #ifdef CONFIG_XFS_POSIX_ACL extern struct posix_acl *xfs_get_acl(struct inode *inode, int type); extern int xfs_set_acl(struct inode *inode, struct posix_acl *acl, int type); -extern int posix_acl_access_exists(struct inode *inode); -extern int posix_acl_default_exists(struct inode *inode); #else static inline struct posix_acl *xfs_get_acl(struct inode *inode, int type) { return NULL; } # define xfs_set_acl NULL -# define posix_acl_access_exists(inode) 0 -# define posix_acl_default_exists(inode) 0 #endif /* CONFIG_XFS_POSIX_ACL */ extern void xfs_forget_acl(struct inode *inode, const char *name, int xflags); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 29e7e5dd5..a9ebabfe7 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -55,7 +55,7 @@ xfs_count_page_state( } while ((bh = bh->b_this_page) != head); } -STATIC struct block_device * +struct block_device * xfs_find_bdev_for_inode( struct inode *inode) { @@ -1208,6 +1208,10 @@ xfs_vm_writepages( struct writeback_control *wbc) { xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); + if (dax_mapping(mapping)) + return dax_writeback_mapping_range(mapping, + xfs_find_bdev_for_inode(mapping->host), wbc); + return generic_writepages(mapping, wbc); } @@ -1917,6 +1921,7 @@ xfs_vm_readpage( struct file *unused, struct page *page) { + trace_xfs_vm_readpage(page->mapping->host, 1); return mpage_readpage(page, xfs_get_blocks); } @@ -1927,6 +1932,7 @@ xfs_vm_readpages( struct list_head *pages, unsigned nr_pages) { + trace_xfs_vm_readpages(mapping->host, nr_pages); return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks); } diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index f6ffc9ae5..a4343c63f 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -62,5 +62,6 @@ int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset, struct buffer_head *map_bh, int create); extern void xfs_count_page_state(struct page *, int *, int *); +extern struct block_device *xfs_find_bdev_for_inode(struct inode *); #endif /* __XFS_AOPS_H__ */ diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index dbae6490a..6c876012b 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -75,7 +75,8 @@ xfs_zero_extent( ssize_t size = XFS_FSB_TO_B(mp, count_fsb); if (IS_DAX(VFS_I(ip))) - return dax_clear_blocks(VFS_I(ip), block, size); + return dax_clear_sectors(xfs_find_bdev_for_inode(VFS_I(ip)), + sector, size); /* * let the block layer decide on the fastest method of @@ -91,32 +92,32 @@ xfs_zero_extent( * last due to locking considerations. We never free any extents in * the first transaction. * - * Return 1 if the given transaction was committed and a new one - * started, and 0 otherwise in the committed parameter. + * If an inode *ip is provided, rejoin it to the transaction if + * the transaction was committed. */ int /* error */ xfs_bmap_finish( struct xfs_trans **tp, /* transaction pointer addr */ struct xfs_bmap_free *flist, /* i/o: list extents to free */ - int *committed)/* xact committed or not */ + struct xfs_inode *ip) { struct xfs_efd_log_item *efd; /* extent free data */ struct xfs_efi_log_item *efi; /* extent free intention */ int error; /* error return value */ + int committed;/* xact committed or not */ struct xfs_bmap_free_item *free; /* free extent item */ struct xfs_bmap_free_item *next; /* next item on free list */ ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); - if (flist->xbf_count == 0) { - *committed = 0; + if (flist->xbf_count == 0) return 0; - } + efi = xfs_trans_get_efi(*tp, flist->xbf_count); for (free = flist->xbf_first; free; free = free->xbfi_next) xfs_trans_log_efi_extent(*tp, efi, free->xbfi_startblock, free->xbfi_blockcount); - error = __xfs_trans_roll(tp, NULL, committed); + error = __xfs_trans_roll(tp, ip, &committed); if (error) { /* * If the transaction was committed, drop the EFD reference @@ -128,16 +129,13 @@ xfs_bmap_finish( * transaction so we should return committed=1 even though we're * returning an error. */ - if (*committed) { + if (committed) { xfs_efi_release(efi); xfs_force_shutdown((*tp)->t_mountp, (error == -EFSCORRUPTED) ? SHUTDOWN_CORRUPT_INCORE : SHUTDOWN_META_IO_ERROR); - } else { - *committed = 1; } - return error; } @@ -969,7 +967,6 @@ xfs_alloc_file_space( xfs_bmbt_irec_t imaps[1], *imapp; xfs_bmap_free_t free_list; uint qblocks, resblks, resrtextents; - int committed; int error; trace_xfs_alloc_file_space(ip); @@ -1064,23 +1061,20 @@ xfs_alloc_file_space( error = xfs_bmapi_write(tp, ip, startoffset_fsb, allocatesize_fsb, alloc_type, &firstfsb, resblks, imapp, &nimaps, &free_list); - if (error) { + if (error) goto error0; - } /* * Complete the transaction */ - error = xfs_bmap_finish(&tp, &free_list, &committed); - if (error) { + error = xfs_bmap_finish(&tp, &free_list, NULL); + if (error) goto error0; - } error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); - if (error) { + if (error) break; - } allocated_fsb = imapp->br_blockcount; @@ -1206,7 +1200,6 @@ xfs_free_file_space( xfs_off_t offset, xfs_off_t len) { - int committed; int done; xfs_fileoff_t endoffset_fsb; int error; @@ -1346,17 +1339,15 @@ xfs_free_file_space( error = xfs_bunmapi(tp, ip, startoffset_fsb, endoffset_fsb - startoffset_fsb, 0, 2, &firstfsb, &free_list, &done); - if (error) { + if (error) goto error0; - } /* * complete the transaction */ - error = xfs_bmap_finish(&tp, &free_list, &committed); - if (error) { + error = xfs_bmap_finish(&tp, &free_list, NULL); + if (error) goto error0; - } error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); @@ -1434,7 +1425,6 @@ xfs_shift_file_space( int error; struct xfs_bmap_free free_list; xfs_fsblock_t first_block; - int committed; xfs_fileoff_t stop_fsb; xfs_fileoff_t next_fsb; xfs_fileoff_t shift_fsb; @@ -1526,7 +1516,7 @@ xfs_shift_file_space( if (error) goto out_bmap_cancel; - error = xfs_bmap_finish(&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, NULL); if (error) goto out_bmap_cancel; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 39090fc56..435c7de42 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1052,7 +1052,7 @@ xfs_buf_ioend_work( xfs_buf_ioend(bp); } -void +static void xfs_buf_ioend_async( struct xfs_buf *bp) { @@ -1649,13 +1649,9 @@ xfs_setsize_buftarg( btp->bt_meta_sectormask = sectorsize - 1; if (set_blocksize(btp->bt_bdev, sectorsize)) { - char name[BDEVNAME_SIZE]; - - bdevname(btp->bt_bdev, name); - xfs_warn(btp->bt_mount, - "Cannot set_blocksize to %u on device %s", - sectorsize, name); + "Cannot set_blocksize to %u on device %pg", + sectorsize, btp->bt_bdev); return -EINVAL; } diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index c79b717d9..c75721acd 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -132,6 +132,7 @@ struct xfs_buf_map { struct xfs_buf_map (map) = { .bm_bn = (blkno), .bm_len = (numblk) }; struct xfs_buf_ops { + char *name; void (*verify_read)(struct xfs_buf *); void (*verify_write)(struct xfs_buf *); }; diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 7ac6c5c58..9c44d38dc 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -306,7 +306,7 @@ xfs_qm_dqalloc( xfs_fsblock_t firstblock; xfs_bmap_free_t flist; xfs_bmbt_irec_t map; - int nmaps, error, committed; + int nmaps, error; xfs_buf_t *bp; xfs_trans_t *tp = *tpp; @@ -379,11 +379,12 @@ xfs_qm_dqalloc( xfs_trans_bhold(tp, bp); - if ((error = xfs_bmap_finish(tpp, &flist, &committed))) { + error = xfs_bmap_finish(tpp, &flist, NULL); + if (error) goto error1; - } - if (committed) { + /* Transaction was committed? */ + if (*tpp != tp) { tp = *tpp; xfs_trans_bjoin(tp, bp); } else { @@ -393,9 +394,9 @@ xfs_qm_dqalloc( *O_bpp = bp; return 0; - error1: +error1: xfs_bmap_cancel(&flist); - error0: +error0: xfs_iunlock(quotip, XFS_ILOCK_EXCL); return error; diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c index 74d0e5966..88693a98f 100644 --- a/fs/xfs/xfs_error.c +++ b/fs/xfs/xfs_error.c @@ -164,9 +164,9 @@ xfs_verifier_error( { struct xfs_mount *mp = bp->b_target->bt_mount; - xfs_alert(mp, "Metadata %s detected at %pF, block 0x%llx", + xfs_alert(mp, "Metadata %s detected at %pF, %s block 0x%llx", bp->b_error == -EFSBADCRC ? "CRC error" : "corruption", - __return_address, bp->b_bn); + __return_address, bp->b_ops->name, bp->b_bn); xfs_alert(mp, "Unmount and run xfs_repair"); diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f5392ab2d..52883ac3c 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -55,7 +55,7 @@ xfs_rw_ilock( int type) { if (type & XFS_IOLOCK_EXCL) - mutex_lock(&VFS_I(ip)->i_mutex); + inode_lock(VFS_I(ip)); xfs_ilock(ip, type); } @@ -66,7 +66,7 @@ xfs_rw_iunlock( { xfs_iunlock(ip, type); if (type & XFS_IOLOCK_EXCL) - mutex_unlock(&VFS_I(ip)->i_mutex); + inode_unlock(VFS_I(ip)); } static inline void @@ -76,7 +76,7 @@ xfs_rw_ilock_demote( { xfs_ilock_demote(ip, type); if (type & XFS_IOLOCK_EXCL) - mutex_unlock(&VFS_I(ip)->i_mutex); + inode_unlock(VFS_I(ip)); } /* @@ -402,19 +402,26 @@ xfs_file_splice_read( if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return -EIO; - xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); - trace_xfs_file_splice_read(ip, count, *ppos, ioflags); - /* for dax, we need to avoid the page cache */ - if (IS_DAX(VFS_I(ip))) - ret = default_file_splice_read(infilp, ppos, pipe, count, flags); - else - ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); - if (ret > 0) - XFS_STATS_ADD(ip->i_mount, xs_read_bytes, ret); + /* + * DAX inodes cannot ues the page cache for splice, so we have to push + * them through the VFS IO path. This means it goes through + * ->read_iter, which for us takes the XFS_IOLOCK_SHARED. Hence we + * cannot lock the splice operation at this level for DAX inodes. + */ + if (IS_DAX(VFS_I(ip))) { + ret = default_file_splice_read(infilp, ppos, pipe, count, + flags); + goto out; + } + xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); + ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); +out: + if (ret > 0) + XFS_STATS_ADD(ip->i_mount, xs_read_bytes, ret); return ret; } @@ -1603,9 +1610,8 @@ xfs_filemap_pmd_fault( /* * pfn_mkwrite was originally inteneded to ensure we capture time stamp * updates on write faults. In reality, it's need to serialise against - * truncate similar to page_mkwrite. Hence we open-code dax_pfn_mkwrite() - * here and cycle the XFS_MMAPLOCK_SHARED to ensure we serialise the fault - * barrier in place. + * truncate similar to page_mkwrite. Hence we cycle the XFS_MMAPLOCK_SHARED + * to ensure we serialise the fault barrier in place. */ static int xfs_filemap_pfn_mkwrite( @@ -1628,6 +1634,8 @@ xfs_filemap_pfn_mkwrite( size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (vmf->pgoff >= size) ret = VM_FAULT_SIGBUS; + else if (IS_DAX(inode)) + ret = dax_pfn_mkwrite(vma, vmf); xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); sb_end_pagefault(inode->i_sb); return ret; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 8ee393996..ceba1a83c 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -610,60 +610,69 @@ __xfs_iflock( STATIC uint _xfs_dic2xflags( - __uint16_t di_flags) + __uint16_t di_flags, + uint64_t di_flags2, + bool has_attr) { uint flags = 0; if (di_flags & XFS_DIFLAG_ANY) { if (di_flags & XFS_DIFLAG_REALTIME) - flags |= XFS_XFLAG_REALTIME; + flags |= FS_XFLAG_REALTIME; if (di_flags & XFS_DIFLAG_PREALLOC) - flags |= XFS_XFLAG_PREALLOC; + flags |= FS_XFLAG_PREALLOC; if (di_flags & XFS_DIFLAG_IMMUTABLE) - flags |= XFS_XFLAG_IMMUTABLE; + flags |= FS_XFLAG_IMMUTABLE; if (di_flags & XFS_DIFLAG_APPEND) - flags |= XFS_XFLAG_APPEND; + flags |= FS_XFLAG_APPEND; if (di_flags & XFS_DIFLAG_SYNC) - flags |= XFS_XFLAG_SYNC; + flags |= FS_XFLAG_SYNC; if (di_flags & XFS_DIFLAG_NOATIME) - flags |= XFS_XFLAG_NOATIME; + flags |= FS_XFLAG_NOATIME; if (di_flags & XFS_DIFLAG_NODUMP) - flags |= XFS_XFLAG_NODUMP; + flags |= FS_XFLAG_NODUMP; if (di_flags & XFS_DIFLAG_RTINHERIT) - flags |= XFS_XFLAG_RTINHERIT; + flags |= FS_XFLAG_RTINHERIT; if (di_flags & XFS_DIFLAG_PROJINHERIT) - flags |= XFS_XFLAG_PROJINHERIT; + flags |= FS_XFLAG_PROJINHERIT; if (di_flags & XFS_DIFLAG_NOSYMLINKS) - flags |= XFS_XFLAG_NOSYMLINKS; + flags |= FS_XFLAG_NOSYMLINKS; if (di_flags & XFS_DIFLAG_EXTSIZE) - flags |= XFS_XFLAG_EXTSIZE; + flags |= FS_XFLAG_EXTSIZE; if (di_flags & XFS_DIFLAG_EXTSZINHERIT) - flags |= XFS_XFLAG_EXTSZINHERIT; + flags |= FS_XFLAG_EXTSZINHERIT; if (di_flags & XFS_DIFLAG_NODEFRAG) - flags |= XFS_XFLAG_NODEFRAG; + flags |= FS_XFLAG_NODEFRAG; if (di_flags & XFS_DIFLAG_FILESTREAM) - flags |= XFS_XFLAG_FILESTREAM; + flags |= FS_XFLAG_FILESTREAM; } + if (di_flags2 & XFS_DIFLAG2_ANY) { + if (di_flags2 & XFS_DIFLAG2_DAX) + flags |= FS_XFLAG_DAX; + } + + if (has_attr) + flags |= FS_XFLAG_HASATTR; + return flags; } uint xfs_ip2xflags( - xfs_inode_t *ip) + struct xfs_inode *ip) { - xfs_icdinode_t *dic = &ip->i_d; + struct xfs_icdinode *dic = &ip->i_d; - return _xfs_dic2xflags(dic->di_flags) | - (XFS_IFORK_Q(ip) ? XFS_XFLAG_HASATTR : 0); + return _xfs_dic2xflags(dic->di_flags, dic->di_flags2, XFS_IFORK_Q(ip)); } uint xfs_dic2xflags( - xfs_dinode_t *dip) + struct xfs_dinode *dip) { - return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) | - (XFS_DFORK_Q(dip) ? XFS_XFLAG_HASATTR : 0); + return _xfs_dic2xflags(be16_to_cpu(dip->di_flags), + be64_to_cpu(dip->di_flags2), XFS_DFORK_Q(dip)); } /* @@ -862,7 +871,8 @@ xfs_ialloc( case S_IFREG: case S_IFDIR: if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { - uint di_flags = 0; + uint64_t di_flags2 = 0; + uint di_flags = 0; if (S_ISDIR(mode)) { if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) @@ -898,7 +908,11 @@ xfs_ialloc( di_flags |= XFS_DIFLAG_NODEFRAG; if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM) di_flags |= XFS_DIFLAG_FILESTREAM; + if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) + di_flags2 |= XFS_DIFLAG2_DAX; + ip->i_d.di_flags |= di_flags; + ip->i_d.di_flags2 |= di_flags2; } /* FALLTHROUGH */ case S_IFLNK: @@ -1143,7 +1157,6 @@ xfs_create( xfs_bmap_free_t free_list; xfs_fsblock_t first_block; bool unlock_dp_on_error = false; - int committed; prid_t prid; struct xfs_dquot *udqp = NULL; struct xfs_dquot *gdqp = NULL; @@ -1226,7 +1239,7 @@ xfs_create( * pointing to itself. */ error = xfs_dir_ialloc(&tp, dp, mode, is_dir ? 2 : 1, rdev, - prid, resblks > 0, &ip, &committed); + prid, resblks > 0, &ip, NULL); if (error) goto out_trans_cancel; @@ -1275,7 +1288,7 @@ xfs_create( */ xfs_qm_vop_create_dqattach(tp, ip, udqp, gdqp, pdqp); - error = xfs_bmap_finish(&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, NULL); if (error) goto out_bmap_cancel; @@ -1427,7 +1440,6 @@ xfs_link( int error; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; - int committed; int resblks; trace_xfs_link(tdp, target_name); @@ -1502,11 +1514,10 @@ xfs_link( * link transaction goes to disk before returning to * the user. */ - if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) { + if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) xfs_trans_set_sync(tp); - } - error = xfs_bmap_finish (&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, NULL); if (error) { xfs_bmap_cancel(&free_list); goto error_return; @@ -1555,7 +1566,6 @@ xfs_itruncate_extents( xfs_fileoff_t first_unmap_block; xfs_fileoff_t last_block; xfs_filblks_t unmap_len; - int committed; int error = 0; int done = 0; @@ -1601,9 +1611,7 @@ xfs_itruncate_extents( * Duplicate the transaction that has the permanent * reservation and commit the old transaction. */ - error = xfs_bmap_finish(&tp, &free_list, &committed); - if (committed) - xfs_trans_ijoin(tp, ip, 0); + error = xfs_bmap_finish(&tp, &free_list, ip); if (error) goto out_bmap_cancel; @@ -1774,7 +1782,6 @@ xfs_inactive_ifree( { xfs_bmap_free_t free_list; xfs_fsblock_t first_block; - int committed; struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; int error; @@ -1841,7 +1848,7 @@ xfs_inactive_ifree( * Just ignore errors at this point. There is nothing we can do except * to try to keep going. Make sure it's not a silent error. */ - error = xfs_bmap_finish(&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, NULL); if (error) { xfs_notice(mp, "%s: xfs_bmap_finish returned error %d", __func__, error); @@ -2523,7 +2530,6 @@ xfs_remove( int error = 0; xfs_bmap_free_t free_list; xfs_fsblock_t first_block; - int committed; uint resblks; trace_xfs_remove(dp, name); @@ -2624,7 +2630,7 @@ xfs_remove( if (mp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) xfs_trans_set_sync(tp); - error = xfs_bmap_finish(&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, NULL); if (error) goto out_bmap_cancel; @@ -2701,7 +2707,6 @@ xfs_finish_rename( struct xfs_trans *tp, struct xfs_bmap_free *free_list) { - int committed = 0; int error; /* @@ -2711,7 +2716,7 @@ xfs_finish_rename( if (tp->t_mountp->m_flags & (XFS_MOUNT_WSYNC|XFS_MOUNT_DIRSYNC)) xfs_trans_set_sync(tp); - error = xfs_bmap_finish(&tp, free_list, &committed); + error = xfs_bmap_finish(&tp, free_list, NULL); if (error) { xfs_bmap_cancel(free_list); xfs_trans_cancel(tp); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index d42738dee..478d04e07 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -859,25 +859,25 @@ xfs_merge_ioc_xflags( unsigned int xflags = start; if (flags & FS_IMMUTABLE_FL) - xflags |= XFS_XFLAG_IMMUTABLE; + xflags |= FS_XFLAG_IMMUTABLE; else - xflags &= ~XFS_XFLAG_IMMUTABLE; + xflags &= ~FS_XFLAG_IMMUTABLE; if (flags & FS_APPEND_FL) - xflags |= XFS_XFLAG_APPEND; + xflags |= FS_XFLAG_APPEND; else - xflags &= ~XFS_XFLAG_APPEND; + xflags &= ~FS_XFLAG_APPEND; if (flags & FS_SYNC_FL) - xflags |= XFS_XFLAG_SYNC; + xflags |= FS_XFLAG_SYNC; else - xflags &= ~XFS_XFLAG_SYNC; + xflags &= ~FS_XFLAG_SYNC; if (flags & FS_NOATIME_FL) - xflags |= XFS_XFLAG_NOATIME; + xflags |= FS_XFLAG_NOATIME; else - xflags &= ~XFS_XFLAG_NOATIME; + xflags &= ~FS_XFLAG_NOATIME; if (flags & FS_NODUMP_FL) - xflags |= XFS_XFLAG_NODUMP; + xflags |= FS_XFLAG_NODUMP; else - xflags &= ~XFS_XFLAG_NODUMP; + xflags &= ~FS_XFLAG_NODUMP; return xflags; } @@ -945,40 +945,51 @@ xfs_set_diflags( unsigned int xflags) { unsigned int di_flags; + uint64_t di_flags2; /* can't set PREALLOC this way, just preserve it */ di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); - if (xflags & XFS_XFLAG_IMMUTABLE) + if (xflags & FS_XFLAG_IMMUTABLE) di_flags |= XFS_DIFLAG_IMMUTABLE; - if (xflags & XFS_XFLAG_APPEND) + if (xflags & FS_XFLAG_APPEND) di_flags |= XFS_DIFLAG_APPEND; - if (xflags & XFS_XFLAG_SYNC) + if (xflags & FS_XFLAG_SYNC) di_flags |= XFS_DIFLAG_SYNC; - if (xflags & XFS_XFLAG_NOATIME) + if (xflags & FS_XFLAG_NOATIME) di_flags |= XFS_DIFLAG_NOATIME; - if (xflags & XFS_XFLAG_NODUMP) + if (xflags & FS_XFLAG_NODUMP) di_flags |= XFS_DIFLAG_NODUMP; - if (xflags & XFS_XFLAG_NODEFRAG) + if (xflags & FS_XFLAG_NODEFRAG) di_flags |= XFS_DIFLAG_NODEFRAG; - if (xflags & XFS_XFLAG_FILESTREAM) + if (xflags & FS_XFLAG_FILESTREAM) di_flags |= XFS_DIFLAG_FILESTREAM; if (S_ISDIR(ip->i_d.di_mode)) { - if (xflags & XFS_XFLAG_RTINHERIT) + if (xflags & FS_XFLAG_RTINHERIT) di_flags |= XFS_DIFLAG_RTINHERIT; - if (xflags & XFS_XFLAG_NOSYMLINKS) + if (xflags & FS_XFLAG_NOSYMLINKS) di_flags |= XFS_DIFLAG_NOSYMLINKS; - if (xflags & XFS_XFLAG_EXTSZINHERIT) + if (xflags & FS_XFLAG_EXTSZINHERIT) di_flags |= XFS_DIFLAG_EXTSZINHERIT; - if (xflags & XFS_XFLAG_PROJINHERIT) + if (xflags & FS_XFLAG_PROJINHERIT) di_flags |= XFS_DIFLAG_PROJINHERIT; } else if (S_ISREG(ip->i_d.di_mode)) { - if (xflags & XFS_XFLAG_REALTIME) + if (xflags & FS_XFLAG_REALTIME) di_flags |= XFS_DIFLAG_REALTIME; - if (xflags & XFS_XFLAG_EXTSIZE) + if (xflags & FS_XFLAG_EXTSIZE) di_flags |= XFS_DIFLAG_EXTSIZE; } - ip->i_d.di_flags = di_flags; + + /* diflags2 only valid for v3 inodes. */ + if (ip->i_d.di_version < 3) + return; + + di_flags2 = 0; + if (xflags & FS_XFLAG_DAX) + di_flags2 |= XFS_DIFLAG2_DAX; + + ip->i_d.di_flags2 = di_flags2; + } STATIC void @@ -988,22 +999,27 @@ xfs_diflags_to_linux( struct inode *inode = VFS_I(ip); unsigned int xflags = xfs_ip2xflags(ip); - if (xflags & XFS_XFLAG_IMMUTABLE) + if (xflags & FS_XFLAG_IMMUTABLE) inode->i_flags |= S_IMMUTABLE; else inode->i_flags &= ~S_IMMUTABLE; - if (xflags & XFS_XFLAG_APPEND) + if (xflags & FS_XFLAG_APPEND) inode->i_flags |= S_APPEND; else inode->i_flags &= ~S_APPEND; - if (xflags & XFS_XFLAG_SYNC) + if (xflags & FS_XFLAG_SYNC) inode->i_flags |= S_SYNC; else inode->i_flags &= ~S_SYNC; - if (xflags & XFS_XFLAG_NOATIME) + if (xflags & FS_XFLAG_NOATIME) inode->i_flags |= S_NOATIME; else inode->i_flags &= ~S_NOATIME; + if (xflags & FS_XFLAG_DAX) + inode->i_flags |= S_DAX; + else + inode->i_flags &= ~S_DAX; + } static int @@ -1016,11 +1032,11 @@ xfs_ioctl_setattr_xflags( /* Can't change realtime flag if any extents are allocated. */ if ((ip->i_d.di_nextents || ip->i_delayed_blks) && - XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & XFS_XFLAG_REALTIME)) + XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & FS_XFLAG_REALTIME)) return -EINVAL; /* If realtime flag is set then must have realtime device */ - if (fa->fsx_xflags & XFS_XFLAG_REALTIME) { + if (fa->fsx_xflags & FS_XFLAG_REALTIME) { if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 || (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) return -EINVAL; @@ -1031,7 +1047,7 @@ xfs_ioctl_setattr_xflags( * we have appropriate permission. */ if (((ip->i_d.di_flags & (XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND)) || - (fa->fsx_xflags & (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) && + (fa->fsx_xflags & (FS_XFLAG_IMMUTABLE | FS_XFLAG_APPEND))) && !capable(CAP_LINUX_IMMUTABLE)) return -EPERM; @@ -1095,8 +1111,8 @@ out_cancel: * extent size hint validation is somewhat cumbersome. Rules are: * * 1. extent size hint is only valid for directories and regular files - * 2. XFS_XFLAG_EXTSIZE is only valid for regular files - * 3. XFS_XFLAG_EXTSZINHERIT is only valid for directories. + * 2. FS_XFLAG_EXTSIZE is only valid for regular files + * 3. FS_XFLAG_EXTSZINHERIT is only valid for directories. * 4. can only be changed on regular files if no extents are allocated * 5. can be changed on directories at any time * 6. extsize hint of 0 turns off hints, clears inode flags. @@ -1112,10 +1128,10 @@ xfs_ioctl_setattr_check_extsize( { struct xfs_mount *mp = ip->i_mount; - if ((fa->fsx_xflags & XFS_XFLAG_EXTSIZE) && !S_ISREG(ip->i_d.di_mode)) + if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(ip->i_d.di_mode)) return -EINVAL; - if ((fa->fsx_xflags & XFS_XFLAG_EXTSZINHERIT) && + if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) && !S_ISDIR(ip->i_d.di_mode)) return -EINVAL; @@ -1132,7 +1148,7 @@ xfs_ioctl_setattr_check_extsize( return -EINVAL; if (XFS_IS_REALTIME_INODE(ip) || - (fa->fsx_xflags & XFS_XFLAG_REALTIME)) { + (fa->fsx_xflags & FS_XFLAG_REALTIME)) { size = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog; } else { size = mp->m_sb.sb_blocksize; @@ -1143,7 +1159,7 @@ xfs_ioctl_setattr_check_extsize( if (fa->fsx_extsize % size) return -EINVAL; } else - fa->fsx_xflags &= ~(XFS_XFLAG_EXTSIZE | XFS_XFLAG_EXTSZINHERIT); + fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT); return 0; } @@ -1168,7 +1184,7 @@ xfs_ioctl_setattr_check_projid( if (xfs_get_projid(ip) != fa->fsx_projid) return -EINVAL; - if ((fa->fsx_xflags & XFS_XFLAG_PROJINHERIT) != + if ((fa->fsx_xflags & FS_XFLAG_PROJINHERIT) != (ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)) return -EINVAL; diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index f4f5b43cf..d81bdc080 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -129,7 +129,6 @@ xfs_iomap_write_direct( xfs_trans_t *tp; xfs_bmap_free_t free_list; uint qblocks, resblks, resrtextents; - int committed; int error; int lockmode; int bmapi_flags = XFS_BMAPI_PREALLOC; @@ -203,15 +202,20 @@ xfs_iomap_write_direct( * this outside the transaction context, but if we commit and then crash * we may not have zeroed the blocks and this will be exposed on * recovery of the allocation. Hence we must zero before commit. + * * Further, if we are mapping unwritten extents here, we need to zero * and convert them to written so that we don't need an unwritten extent * callback for DAX. This also means that we need to be able to dip into - * the reserve block pool if there is no space left but we need to do - * unwritten extent conversion. + * the reserve block pool for bmbt block allocation if there is no space + * left but we need to do unwritten extent conversion. */ + if (IS_DAX(VFS_I(ip))) { bmapi_flags = XFS_BMAPI_CONVERT | XFS_BMAPI_ZERO; - tp->t_flags |= XFS_TRANS_RESERVE; + if (ISUNWRITTEN(imap)) { + tp->t_flags |= XFS_TRANS_RESERVE; + resblks = XFS_DIOSTRAT_SPACE_RES(mp, 0) << 1; + } } error = xfs_trans_reserve(tp, &M_RES(mp)->tr_write, resblks, resrtextents); @@ -247,7 +251,7 @@ xfs_iomap_write_direct( /* * Complete the transaction */ - error = xfs_bmap_finish(&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, NULL); if (error) goto out_bmap_cancel; @@ -693,7 +697,7 @@ xfs_iomap_write_allocate( xfs_bmap_free_t free_list; xfs_filblks_t count_fsb; xfs_trans_t *tp; - int nimaps, committed; + int nimaps; int error = 0; int nres; @@ -794,7 +798,7 @@ xfs_iomap_write_allocate( if (error) goto trans_cancel; - error = xfs_bmap_finish(&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, NULL); if (error) goto trans_cancel; @@ -852,7 +856,6 @@ xfs_iomap_write_unwritten( xfs_bmap_free_t free_list; xfs_fsize_t i_size; uint resblks; - int committed; int error; trace_xfs_unwritten_convert(ip, offset, count); @@ -924,7 +927,7 @@ xfs_iomap_write_unwritten( xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); } - error = xfs_bmap_finish(&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, NULL); if (error) goto error_on_bmapi_transaction; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 245268a0c..76b71a1c6 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -414,13 +414,17 @@ xfs_vn_rename( * uio is kmalloced for this reason... */ STATIC const char * -xfs_vn_follow_link( +xfs_vn_get_link( struct dentry *dentry, - void **cookie) + struct inode *inode, + struct delayed_call *done) { char *link; int error = -ENOMEM; + if (!dentry) + return ERR_PTR(-ECHILD); + link = kmalloc(MAXPATHLEN+1, GFP_KERNEL); if (!link) goto out_err; @@ -429,7 +433,8 @@ xfs_vn_follow_link( if (unlikely(error)) goto out_kfree; - return *cookie = link; + set_delayed_call(done, kfree_link, link); + return link; out_kfree: kfree(link); @@ -1172,8 +1177,7 @@ static const struct inode_operations xfs_dir_ci_inode_operations = { static const struct inode_operations xfs_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = xfs_vn_follow_link, - .put_link = kfree_put_link, + .get_link = xfs_vn_get_link, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, .setxattr = generic_setxattr, @@ -1201,8 +1205,8 @@ xfs_diflags_to_iflags( inode->i_flags |= S_SYNC; if (flags & XFS_DIFLAG_NOATIME) inode->i_flags |= S_NOATIME; - /* XXX: Also needs an on-disk per inode flag! */ - if (ip->i_mount->m_flags & XFS_MOUNT_DAX) + if (ip->i_mount->m_flags & XFS_MOUNT_DAX || + ip->i_d.di_flags2 & XFS_DIFLAG2_DAX) inode->i_flags |= S_DAX; } diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index f52c72a1a..9c9a1c9bc 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1188,10 +1188,16 @@ xlog_iodone(xfs_buf_t *bp) int aborted = 0; /* - * Race to shutdown the filesystem if we see an error. + * Race to shutdown the filesystem if we see an error or the iclog is in + * IOABORT state. The IOABORT state is only set in DEBUG mode to inject + * CRC errors into log recovery. */ - if (XFS_TEST_ERROR(bp->b_error, l->l_mp, - XFS_ERRTAG_IODONE_IOERR, XFS_RANDOM_IODONE_IOERR)) { + if (XFS_TEST_ERROR(bp->b_error, l->l_mp, XFS_ERRTAG_IODONE_IOERR, + XFS_RANDOM_IODONE_IOERR) || + iclog->ic_state & XLOG_STATE_IOABORT) { + if (iclog->ic_state & XLOG_STATE_IOABORT) + iclog->ic_state &= ~XLOG_STATE_IOABORT; + xfs_buf_ioerror_alert(bp, __func__); xfs_buf_stale(bp); xfs_force_shutdown(l->l_mp, SHUTDOWN_LOG_IO_ERROR); @@ -1838,6 +1844,23 @@ xlog_sync( /* calculcate the checksum */ iclog->ic_header.h_crc = xlog_cksum(log, &iclog->ic_header, iclog->ic_datap, size); +#ifdef DEBUG + /* + * Intentionally corrupt the log record CRC based on the error injection + * frequency, if defined. This facilitates testing log recovery in the + * event of torn writes. Hence, set the IOABORT state to abort the log + * write on I/O completion and shutdown the fs. The subsequent mount + * detects the bad CRC and attempts to recover. + */ + if (log->l_badcrc_factor && + (prandom_u32() % log->l_badcrc_factor == 0)) { + iclog->ic_header.h_crc &= 0xAAAAAAAA; + iclog->ic_state |= XLOG_STATE_IOABORT; + xfs_warn(log->l_mp, + "Intentionally corrupted log record at LSN 0x%llx. Shutdown imminent.", + be64_to_cpu(iclog->ic_header.h_lsn)); + } +#endif bp->b_io_length = BTOBB(count); bp->b_fspriv = iclog; @@ -2045,12 +2068,14 @@ xlog_print_tic_res( "QM_DQCLUSTER", "QM_QINOCREATE", "QM_QUOTAOFF_END", - "SB_UNIT", "FSYNC_TS", "GROWFSRT_ALLOC", "GROWFSRT_ZERO", "GROWFSRT_FREE", - "SWAPEXT" + "SWAPEXT", + "CHECKPOINT", + "ICREATE", + "CREATE_TMPFILE" }; xfs_warn(mp, "xlog_write: reservation summary:"); @@ -2791,11 +2816,19 @@ xlog_state_do_callback( } } while (!ioerrors && loopdidcallbacks); +#ifdef DEBUG /* - * make one last gasp attempt to see if iclogs are being left in - * limbo.. + * Make one last gasp attempt to see if iclogs are being left in limbo. + * If the above loop finds an iclog earlier than the current iclog and + * in one of the syncing states, the current iclog is put into + * DO_CALLBACK and the callbacks are deferred to the completion of the + * earlier iclog. Walk the iclogs in order and make sure that no iclog + * is in DO_CALLBACK unless an earlier iclog is in one of the syncing + * states. + * + * Note that SYNCING|IOABORT is a valid state so we cannot just check + * for ic_state == SYNCING. */ -#ifdef DEBUG if (funcdidcallbacks) { first_iclog = iclog = log->l_iclog; do { @@ -2810,7 +2843,7 @@ xlog_state_do_callback( * IOERROR - give up hope all ye who enter here */ if (iclog->ic_state == XLOG_STATE_WANT_SYNC || - iclog->ic_state == XLOG_STATE_SYNCING || + iclog->ic_state & XLOG_STATE_SYNCING || iclog->ic_state == XLOG_STATE_DONE_SYNC || iclog->ic_state == XLOG_STATE_IOERROR ) break; diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index 8daba7491..ed8896310 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -62,6 +62,7 @@ static inline uint xlog_get_client_id(__be32 i) #define XLOG_STATE_CALLBACK 0x0020 /* Callback functions now */ #define XLOG_STATE_DIRTY 0x0040 /* Dirty IC log, not ready for ACTIVE status*/ #define XLOG_STATE_IOERROR 0x0080 /* IO error happened in sync'ing log */ +#define XLOG_STATE_IOABORT 0x0100 /* force abort on I/O completion (debug) */ #define XLOG_STATE_ALL 0x7FFF /* All possible valid flags */ #define XLOG_STATE_NOTUSED 0x8000 /* This IC log not being used */ @@ -410,6 +411,8 @@ struct xlog { /* The following field are used for debugging; need to hold icloglock */ #ifdef DEBUG void *l_iclog_bak[XLOG_MAX_ICLOGS]; + /* log record crc error injection factor */ + uint32_t l_badcrc_factor; #endif }; diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index c5ecaacdd..be5568839 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -61,6 +61,9 @@ xlog_recover_check_summary( #else #define xlog_recover_check_summary(log) #endif +STATIC int +xlog_do_recovery_pass( + struct xlog *, xfs_daddr_t, xfs_daddr_t, int, xfs_daddr_t *); /* * This structure is used during recovery to record the buf log items which @@ -868,136 +871,365 @@ validate_head: } /* - * Find the sync block number or the tail of the log. - * - * This will be the block number of the last record to have its - * associated buffers synced to disk. Every log record header has - * a sync lsn embedded in it. LSNs hold block numbers, so it is easy - * to get a sync block number. The only concern is to figure out which - * log record header to believe. - * - * The following algorithm uses the log record header with the largest - * lsn. The entire log record does not need to be valid. We only care - * that the header is valid. + * Seek backwards in the log for log record headers. * - * We could speed up search by using current head_blk buffer, but it is not - * available. + * Given a starting log block, walk backwards until we find the provided number + * of records or hit the provided tail block. The return value is the number of + * records encountered or a negative error code. The log block and buffer + * pointer of the last record seen are returned in rblk and rhead respectively. */ STATIC int -xlog_find_tail( +xlog_rseek_logrec_hdr( struct xlog *log, - xfs_daddr_t *head_blk, - xfs_daddr_t *tail_blk) + xfs_daddr_t head_blk, + xfs_daddr_t tail_blk, + int count, + struct xfs_buf *bp, + xfs_daddr_t *rblk, + struct xlog_rec_header **rhead, + bool *wrapped) { - xlog_rec_header_t *rhead; - xlog_op_header_t *op_head; + int i; + int error; + int found = 0; char *offset = NULL; - xfs_buf_t *bp; - int error, i, found; - xfs_daddr_t umount_data_blk; - xfs_daddr_t after_umount_blk; - xfs_lsn_t tail_lsn; - int hblks; + xfs_daddr_t end_blk; - found = 0; + *wrapped = false; /* - * Find previous log record + * Walk backwards from the head block until we hit the tail or the first + * block in the log. */ - if ((error = xlog_find_head(log, head_blk))) - return error; - - bp = xlog_get_bp(log, 1); - if (!bp) - return -ENOMEM; - if (*head_blk == 0) { /* special case */ - error = xlog_bread(log, 0, 1, bp, &offset); + end_blk = head_blk > tail_blk ? tail_blk : 0; + for (i = (int) head_blk - 1; i >= end_blk; i--) { + error = xlog_bread(log, i, 1, bp, &offset); if (error) - goto done; + goto out_error; - if (xlog_get_cycle(offset) == 0) { - *tail_blk = 0; - /* leave all other log inited values alone */ - goto done; + if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { + *rblk = i; + *rhead = (struct xlog_rec_header *) offset; + if (++found == count) + break; } } /* - * Search backwards looking for log record header block + * If we haven't hit the tail block or the log record header count, + * start looking again from the end of the physical log. Note that + * callers can pass head == tail if the tail is not yet known. */ - ASSERT(*head_blk < INT_MAX); - for (i = (int)(*head_blk) - 1; i >= 0; i--) { + if (tail_blk >= head_blk && found != count) { + for (i = log->l_logBBsize - 1; i >= (int) tail_blk; i--) { + error = xlog_bread(log, i, 1, bp, &offset); + if (error) + goto out_error; + + if (*(__be32 *)offset == + cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { + *wrapped = true; + *rblk = i; + *rhead = (struct xlog_rec_header *) offset; + if (++found == count) + break; + } + } + } + + return found; + +out_error: + return error; +} + +/* + * Seek forward in the log for log record headers. + * + * Given head and tail blocks, walk forward from the tail block until we find + * the provided number of records or hit the head block. The return value is the + * number of records encountered or a negative error code. The log block and + * buffer pointer of the last record seen are returned in rblk and rhead + * respectively. + */ +STATIC int +xlog_seek_logrec_hdr( + struct xlog *log, + xfs_daddr_t head_blk, + xfs_daddr_t tail_blk, + int count, + struct xfs_buf *bp, + xfs_daddr_t *rblk, + struct xlog_rec_header **rhead, + bool *wrapped) +{ + int i; + int error; + int found = 0; + char *offset = NULL; + xfs_daddr_t end_blk; + + *wrapped = false; + + /* + * Walk forward from the tail block until we hit the head or the last + * block in the log. + */ + end_blk = head_blk > tail_blk ? head_blk : log->l_logBBsize - 1; + for (i = (int) tail_blk; i <= end_blk; i++) { error = xlog_bread(log, i, 1, bp, &offset); if (error) - goto done; + goto out_error; - if (*(__be32 *)offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { - found = 1; - break; + if (*(__be32 *) offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { + *rblk = i; + *rhead = (struct xlog_rec_header *) offset; + if (++found == count) + break; } } + /* - * If we haven't found the log record header block, start looking - * again from the end of the physical log. XXXmiken: There should be - * a check here to make sure we didn't search more than N blocks in - * the previous code. + * If we haven't hit the head block or the log record header count, + * start looking again from the start of the physical log. */ - if (!found) { - for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) { + if (tail_blk > head_blk && found != count) { + for (i = 0; i < (int) head_blk; i++) { error = xlog_bread(log, i, 1, bp, &offset); if (error) - goto done; + goto out_error; if (*(__be32 *)offset == cpu_to_be32(XLOG_HEADER_MAGIC_NUM)) { - found = 2; - break; + *wrapped = true; + *rblk = i; + *rhead = (struct xlog_rec_header *) offset; + if (++found == count) + break; } } } - if (!found) { - xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__); - xlog_put_bp(bp); - ASSERT(0); - return -EIO; + + return found; + +out_error: + return error; +} + +/* + * Check the log tail for torn writes. This is required when torn writes are + * detected at the head and the head had to be walked back to a previous record. + * The tail of the previous record must now be verified to ensure the torn + * writes didn't corrupt the previous tail. + * + * Return an error if CRC verification fails as recovery cannot proceed. + */ +STATIC int +xlog_verify_tail( + struct xlog *log, + xfs_daddr_t head_blk, + xfs_daddr_t tail_blk) +{ + struct xlog_rec_header *thead; + struct xfs_buf *bp; + xfs_daddr_t first_bad; + int count; + int error = 0; + bool wrapped; + xfs_daddr_t tmp_head; + + bp = xlog_get_bp(log, 1); + if (!bp) + return -ENOMEM; + + /* + * Seek XLOG_MAX_ICLOGS + 1 records past the current tail record to get + * a temporary head block that points after the last possible + * concurrently written record of the tail. + */ + count = xlog_seek_logrec_hdr(log, head_blk, tail_blk, + XLOG_MAX_ICLOGS + 1, bp, &tmp_head, &thead, + &wrapped); + if (count < 0) { + error = count; + goto out; } - /* find blk_no of tail of log */ - rhead = (xlog_rec_header_t *)offset; - *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn)); + /* + * If the call above didn't find XLOG_MAX_ICLOGS + 1 records, we ran + * into the actual log head. tmp_head points to the start of the record + * so update it to the actual head block. + */ + if (count < XLOG_MAX_ICLOGS + 1) + tmp_head = head_blk; /* - * Reset log values according to the state of the log when we - * crashed. In the case where head_blk == 0, we bump curr_cycle - * one because the next write starts a new cycle rather than - * continuing the cycle of the last good log record. At this - * point we have guaranteed that all partial log records have been - * accounted for. Therefore, we know that the last good log record - * written was complete and ended exactly on the end boundary - * of the physical log. + * We now have a tail and temporary head block that covers at least + * XLOG_MAX_ICLOGS records from the tail. We need to verify that these + * records were completely written. Run a CRC verification pass from + * tail to head and return the result. */ - log->l_prev_block = i; - log->l_curr_block = (int)*head_blk; - log->l_curr_cycle = be32_to_cpu(rhead->h_cycle); - if (found == 2) - log->l_curr_cycle++; - atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn)); - atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn)); - xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle, - BBTOB(log->l_curr_block)); - xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle, - BBTOB(log->l_curr_block)); + error = xlog_do_recovery_pass(log, tmp_head, tail_blk, + XLOG_RECOVER_CRCPASS, &first_bad); + +out: + xlog_put_bp(bp); + return error; +} + +/* + * Detect and trim torn writes from the head of the log. + * + * Storage without sector atomicity guarantees can result in torn writes in the + * log in the event of a crash. Our only means to detect this scenario is via + * CRC verification. While we can't always be certain that CRC verification + * failure is due to a torn write vs. an unrelated corruption, we do know that + * only a certain number (XLOG_MAX_ICLOGS) of log records can be written out at + * one time. Therefore, CRC verify up to XLOG_MAX_ICLOGS records at the head of + * the log and treat failures in this range as torn writes as a matter of + * policy. In the event of CRC failure, the head is walked back to the last good + * record in the log and the tail is updated from that record and verified. + */ +STATIC int +xlog_verify_head( + struct xlog *log, + xfs_daddr_t *head_blk, /* in/out: unverified head */ + xfs_daddr_t *tail_blk, /* out: tail block */ + struct xfs_buf *bp, + xfs_daddr_t *rhead_blk, /* start blk of last record */ + struct xlog_rec_header **rhead, /* ptr to last record */ + bool *wrapped) /* last rec. wraps phys. log */ +{ + struct xlog_rec_header *tmp_rhead; + struct xfs_buf *tmp_bp; + xfs_daddr_t first_bad; + xfs_daddr_t tmp_rhead_blk; + int found; + int error; + bool tmp_wrapped; /* - * Look for unmount record. If we find it, then we know there - * was a clean unmount. Since 'i' could be the last block in - * the physical log, we convert to a log block before comparing - * to the head_blk. + * Check the head of the log for torn writes. Search backwards from the + * head until we hit the tail or the maximum number of log record I/Os + * that could have been in flight at one time. Use a temporary buffer so + * we don't trash the rhead/bp pointers from the caller. + */ + tmp_bp = xlog_get_bp(log, 1); + if (!tmp_bp) + return -ENOMEM; + error = xlog_rseek_logrec_hdr(log, *head_blk, *tail_blk, + XLOG_MAX_ICLOGS, tmp_bp, &tmp_rhead_blk, + &tmp_rhead, &tmp_wrapped); + xlog_put_bp(tmp_bp); + if (error < 0) + return error; + + /* + * Now run a CRC verification pass over the records starting at the + * block found above to the current head. If a CRC failure occurs, the + * log block of the first bad record is saved in first_bad. + */ + error = xlog_do_recovery_pass(log, *head_blk, tmp_rhead_blk, + XLOG_RECOVER_CRCPASS, &first_bad); + if (error == -EFSBADCRC) { + /* + * We've hit a potential torn write. Reset the error and warn + * about it. + */ + error = 0; + xfs_warn(log->l_mp, +"Torn write (CRC failure) detected at log block 0x%llx. Truncating head block from 0x%llx.", + first_bad, *head_blk); + + /* + * Get the header block and buffer pointer for the last good + * record before the bad record. + * + * Note that xlog_find_tail() clears the blocks at the new head + * (i.e., the records with invalid CRC) if the cycle number + * matches the the current cycle. + */ + found = xlog_rseek_logrec_hdr(log, first_bad, *tail_blk, 1, bp, + rhead_blk, rhead, wrapped); + if (found < 0) + return found; + if (found == 0) /* XXX: right thing to do here? */ + return -EIO; + + /* + * Reset the head block to the starting block of the first bad + * log record and set the tail block based on the last good + * record. + * + * Bail out if the updated head/tail match as this indicates + * possible corruption outside of the acceptable + * (XLOG_MAX_ICLOGS) range. This is a job for xfs_repair... + */ + *head_blk = first_bad; + *tail_blk = BLOCK_LSN(be64_to_cpu((*rhead)->h_tail_lsn)); + if (*head_blk == *tail_blk) { + ASSERT(0); + return 0; + } + + /* + * Now verify the tail based on the updated head. This is + * required because the torn writes trimmed from the head could + * have been written over the tail of a previous record. Return + * any errors since recovery cannot proceed if the tail is + * corrupt. + * + * XXX: This leaves a gap in truly robust protection from torn + * writes in the log. If the head is behind the tail, the tail + * pushes forward to create some space and then a crash occurs + * causing the writes into the previous record's tail region to + * tear, log recovery isn't able to recover. + * + * How likely is this to occur? If possible, can we do something + * more intelligent here? Is it safe to push the tail forward if + * we can determine that the tail is within the range of the + * torn write (e.g., the kernel can only overwrite the tail if + * it has actually been pushed forward)? Alternatively, could we + * somehow prevent this condition at runtime? + */ + error = xlog_verify_tail(log, *head_blk, *tail_blk); + } + + return error; +} + +/* + * Check whether the head of the log points to an unmount record. In other + * words, determine whether the log is clean. If so, update the in-core state + * appropriately. + */ +static int +xlog_check_unmount_rec( + struct xlog *log, + xfs_daddr_t *head_blk, + xfs_daddr_t *tail_blk, + struct xlog_rec_header *rhead, + xfs_daddr_t rhead_blk, + struct xfs_buf *bp, + bool *clean) +{ + struct xlog_op_header *op_head; + xfs_daddr_t umount_data_blk; + xfs_daddr_t after_umount_blk; + int hblks; + int error; + char *offset; + + *clean = false; + + /* + * Look for unmount record. If we find it, then we know there was a + * clean unmount. Since 'i' could be the last block in the physical + * log, we convert to a log block before comparing to the head_blk. * - * Save the current tail lsn to use to pass to - * xlog_clear_stale_blocks() below. We won't want to clear the - * unmount record if there is one, so we pass the lsn of the - * unmount record rather than the block after it. + * Save the current tail lsn to use to pass to xlog_clear_stale_blocks() + * below. We won't want to clear the unmount record if there is one, so + * we pass the lsn of the unmount record rather than the block after it. */ if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { int h_size = be32_to_cpu(rhead->h_size); @@ -1014,22 +1246,22 @@ xlog_find_tail( } else { hblks = 1; } - after_umount_blk = (i + hblks + (int) - BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize; - tail_lsn = atomic64_read(&log->l_tail_lsn); + after_umount_blk = rhead_blk + hblks + BTOBB(be32_to_cpu(rhead->h_len)); + after_umount_blk = do_mod(after_umount_blk, log->l_logBBsize); if (*head_blk == after_umount_blk && be32_to_cpu(rhead->h_num_logops) == 1) { - umount_data_blk = (i + hblks) % log->l_logBBsize; + umount_data_blk = rhead_blk + hblks; + umount_data_blk = do_mod(umount_data_blk, log->l_logBBsize); error = xlog_bread(log, umount_data_blk, 1, bp, &offset); if (error) - goto done; + return error; - op_head = (xlog_op_header_t *)offset; + op_head = (struct xlog_op_header *)offset; if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) { /* - * Set tail and last sync so that newly written - * log records will point recovery to after the - * current unmount record. + * Set tail and last sync so that newly written log + * records will point recovery to after the current + * unmount record. */ xlog_assign_atomic_lsn(&log->l_tail_lsn, log->l_curr_cycle, after_umount_blk); @@ -1037,16 +1269,166 @@ xlog_find_tail( log->l_curr_cycle, after_umount_blk); *tail_blk = after_umount_blk; - /* - * Note that the unmount was clean. If the unmount - * was not clean, we need to know this to rebuild the - * superblock counters from the perag headers if we - * have a filesystem using non-persistent counters. - */ - log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN; + *clean = true; } } + return 0; +} + +static void +xlog_set_state( + struct xlog *log, + xfs_daddr_t head_blk, + struct xlog_rec_header *rhead, + xfs_daddr_t rhead_blk, + bool bump_cycle) +{ + /* + * Reset log values according to the state of the log when we + * crashed. In the case where head_blk == 0, we bump curr_cycle + * one because the next write starts a new cycle rather than + * continuing the cycle of the last good log record. At this + * point we have guaranteed that all partial log records have been + * accounted for. Therefore, we know that the last good log record + * written was complete and ended exactly on the end boundary + * of the physical log. + */ + log->l_prev_block = rhead_blk; + log->l_curr_block = (int)head_blk; + log->l_curr_cycle = be32_to_cpu(rhead->h_cycle); + if (bump_cycle) + log->l_curr_cycle++; + atomic64_set(&log->l_tail_lsn, be64_to_cpu(rhead->h_tail_lsn)); + atomic64_set(&log->l_last_sync_lsn, be64_to_cpu(rhead->h_lsn)); + xlog_assign_grant_head(&log->l_reserve_head.grant, log->l_curr_cycle, + BBTOB(log->l_curr_block)); + xlog_assign_grant_head(&log->l_write_head.grant, log->l_curr_cycle, + BBTOB(log->l_curr_block)); +} + +/* + * Find the sync block number or the tail of the log. + * + * This will be the block number of the last record to have its + * associated buffers synced to disk. Every log record header has + * a sync lsn embedded in it. LSNs hold block numbers, so it is easy + * to get a sync block number. The only concern is to figure out which + * log record header to believe. + * + * The following algorithm uses the log record header with the largest + * lsn. The entire log record does not need to be valid. We only care + * that the header is valid. + * + * We could speed up search by using current head_blk buffer, but it is not + * available. + */ +STATIC int +xlog_find_tail( + struct xlog *log, + xfs_daddr_t *head_blk, + xfs_daddr_t *tail_blk) +{ + xlog_rec_header_t *rhead; + char *offset = NULL; + xfs_buf_t *bp; + int error; + xfs_daddr_t rhead_blk; + xfs_lsn_t tail_lsn; + bool wrapped = false; + bool clean = false; + + /* + * Find previous log record + */ + if ((error = xlog_find_head(log, head_blk))) + return error; + ASSERT(*head_blk < INT_MAX); + + bp = xlog_get_bp(log, 1); + if (!bp) + return -ENOMEM; + if (*head_blk == 0) { /* special case */ + error = xlog_bread(log, 0, 1, bp, &offset); + if (error) + goto done; + + if (xlog_get_cycle(offset) == 0) { + *tail_blk = 0; + /* leave all other log inited values alone */ + goto done; + } + } + + /* + * Search backwards through the log looking for the log record header + * block. This wraps all the way back around to the head so something is + * seriously wrong if we can't find it. + */ + error = xlog_rseek_logrec_hdr(log, *head_blk, *head_blk, 1, bp, + &rhead_blk, &rhead, &wrapped); + if (error < 0) + return error; + if (!error) { + xfs_warn(log->l_mp, "%s: couldn't find sync record", __func__); + return -EIO; + } + *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn)); + + /* + * Set the log state based on the current head record. + */ + xlog_set_state(log, *head_blk, rhead, rhead_blk, wrapped); + tail_lsn = atomic64_read(&log->l_tail_lsn); + + /* + * Look for an unmount record at the head of the log. This sets the log + * state to determine whether recovery is necessary. + */ + error = xlog_check_unmount_rec(log, head_blk, tail_blk, rhead, + rhead_blk, bp, &clean); + if (error) + goto done; + + /* + * Verify the log head if the log is not clean (e.g., we have anything + * but an unmount record at the head). This uses CRC verification to + * detect and trim torn writes. If discovered, CRC failures are + * considered torn writes and the log head is trimmed accordingly. + * + * Note that we can only run CRC verification when the log is dirty + * because there's no guarantee that the log data behind an unmount + * record is compatible with the current architecture. + */ + if (!clean) { + xfs_daddr_t orig_head = *head_blk; + + error = xlog_verify_head(log, head_blk, tail_blk, bp, + &rhead_blk, &rhead, &wrapped); + if (error) + goto done; + + /* update in-core state again if the head changed */ + if (*head_blk != orig_head) { + xlog_set_state(log, *head_blk, rhead, rhead_blk, + wrapped); + tail_lsn = atomic64_read(&log->l_tail_lsn); + error = xlog_check_unmount_rec(log, head_blk, tail_blk, + rhead, rhead_blk, bp, + &clean); + if (error) + goto done; + } + } + + /* + * Note that the unmount was clean. If the unmount was not clean, we + * need to know this to rebuild the superblock counters from the perag + * headers if we have a filesystem using non-persistent counters. + */ + if (clean) + log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN; + /* * Make sure that there are no blocks in front of the head * with the same cycle number as the head. This can happen @@ -3204,6 +3586,7 @@ xlog_recover_dquot_ra_pass2( struct xfs_disk_dquot *recddq; struct xfs_dq_logformat *dq_f; uint type; + int len; if (mp->m_qflags == 0) @@ -3224,8 +3607,12 @@ xlog_recover_dquot_ra_pass2( ASSERT(dq_f); ASSERT(dq_f->qlf_len == 1); - xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno, - XFS_FSB_TO_BB(mp, dq_f->qlf_len), NULL); + len = XFS_FSB_TO_BB(mp, dq_f->qlf_len); + if (xlog_peek_buffer_cancelled(log, dq_f->qlf_blkno, len, 0)) + return; + + xfs_buf_readahead(mp->m_ddev_targp, dq_f->qlf_blkno, len, + &xfs_dquot_buf_ra_ops); } STATIC void @@ -4118,25 +4505,68 @@ xlog_recover_process_iunlinks( mp->m_dmevmask = mp_dmevmask; } +STATIC int +xlog_unpack_data( + struct xlog_rec_header *rhead, + char *dp, + struct xlog *log) +{ + int i, j, k; + + for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && + i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { + *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i]; + dp += BBSIZE; + } + + if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { + xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead; + for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) { + j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); + *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k]; + dp += BBSIZE; + } + } + + return 0; +} + /* - * Upack the log buffer data and crc check it. If the check fails, issue a - * warning if and only if the CRC in the header is non-zero. This makes the - * check an advisory warning, and the zero CRC check will prevent failure - * warnings from being emitted when upgrading the kernel from one that does not - * add CRCs by default. - * - * When filesystems are CRC enabled, this CRC mismatch becomes a fatal log - * corruption failure + * CRC check, unpack and process a log record. */ STATIC int -xlog_unpack_data_crc( +xlog_recover_process( + struct xlog *log, + struct hlist_head rhash[], struct xlog_rec_header *rhead, char *dp, - struct xlog *log) + int pass) { + int error; __le32 crc; crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len)); + + /* + * Nothing else to do if this is a CRC verification pass. Just return + * if this a record with a non-zero crc. Unfortunately, mkfs always + * sets h_crc to 0 so we must consider this valid even on v5 supers. + * Otherwise, return EFSBADCRC on failure so the callers up the stack + * know precisely what failed. + */ + if (pass == XLOG_RECOVER_CRCPASS) { + if (rhead->h_crc && crc != rhead->h_crc) + return -EFSBADCRC; + return 0; + } + + /* + * We're in the normal recovery path. Issue a warning if and only if the + * CRC in the header is non-zero. This is an advisory warning and the + * zero CRC check prevents warnings from being emitted when upgrading + * the kernel from one that does not add CRCs by default. + */ if (crc != rhead->h_crc) { if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) { xfs_alert(log->l_mp, @@ -4147,47 +4577,18 @@ xlog_unpack_data_crc( } /* - * If we've detected a log record corruption, then we can't - * recover past this point. Abort recovery if we are enforcing - * CRC protection by punting an error back up the stack. + * If the filesystem is CRC enabled, this mismatch becomes a + * fatal log corruption failure. */ if (xfs_sb_version_hascrc(&log->l_mp->m_sb)) return -EFSCORRUPTED; } - return 0; -} - -STATIC int -xlog_unpack_data( - struct xlog_rec_header *rhead, - char *dp, - struct xlog *log) -{ - int i, j, k; - int error; - - error = xlog_unpack_data_crc(rhead, dp, log); + error = xlog_unpack_data(rhead, dp, log); if (error) return error; - for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) && - i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) { - *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i]; - dp += BBSIZE; - } - - if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) { - xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead; - for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) { - j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE); - *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k]; - dp += BBSIZE; - } - } - - return 0; + return xlog_recover_process_data(log, rhash, rhead, dp, pass); } STATIC int @@ -4239,18 +4640,21 @@ xlog_do_recovery_pass( struct xlog *log, xfs_daddr_t head_blk, xfs_daddr_t tail_blk, - int pass) + int pass, + xfs_daddr_t *first_bad) /* out: first bad log rec */ { xlog_rec_header_t *rhead; xfs_daddr_t blk_no; + xfs_daddr_t rhead_blk; char *offset; xfs_buf_t *hbp, *dbp; - int error = 0, h_size; + int error = 0, h_size, h_len; int bblks, split_bblks; int hblks, split_hblks, wrapped_hblks; struct hlist_head rhash[XLOG_RHASH_SIZE]; ASSERT(head_blk != tail_blk); + rhead_blk = 0; /* * Read the header of the tail block and get the iclog buffer size from @@ -4274,7 +4678,31 @@ xlog_do_recovery_pass( error = xlog_valid_rec_header(log, rhead, tail_blk); if (error) goto bread_err1; + + /* + * xfsprogs has a bug where record length is based on lsunit but + * h_size (iclog size) is hardcoded to 32k. Now that we + * unconditionally CRC verify the unmount record, this means the + * log buffer can be too small for the record and cause an + * overrun. + * + * Detect this condition here. Use lsunit for the buffer size as + * long as this looks like the mkfs case. Otherwise, return an + * error to avoid a buffer overrun. + */ h_size = be32_to_cpu(rhead->h_size); + h_len = be32_to_cpu(rhead->h_len); + if (h_len > h_size) { + if (h_len <= log->l_mp->m_logbsize && + be32_to_cpu(rhead->h_num_logops) == 1) { + xfs_warn(log->l_mp, + "invalid iclog size (%d bytes), using lsunit (%d bytes)", + h_size, log->l_mp->m_logbsize); + h_size = log->l_mp->m_logbsize; + } else + return -EFSCORRUPTED; + } + if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) && (h_size > XLOG_HEADER_CYCLE_SIZE)) { hblks = h_size / XLOG_HEADER_CYCLE_SIZE; @@ -4301,7 +4729,7 @@ xlog_do_recovery_pass( } memset(rhash, 0, sizeof(rhash)); - blk_no = tail_blk; + blk_no = rhead_blk = tail_blk; if (tail_blk > head_blk) { /* * Perform recovery around the end of the physical log. @@ -4408,19 +4836,18 @@ xlog_do_recovery_pass( goto bread_err2; } - error = xlog_unpack_data(rhead, offset, log); + error = xlog_recover_process(log, rhash, rhead, offset, + pass); if (error) goto bread_err2; - error = xlog_recover_process_data(log, rhash, - rhead, offset, pass); - if (error) - goto bread_err2; blk_no += bblks; + rhead_blk = blk_no; } ASSERT(blk_no >= log->l_logBBsize); blk_no -= log->l_logBBsize; + rhead_blk = blk_no; } /* read first part of physical log */ @@ -4441,21 +4868,22 @@ xlog_do_recovery_pass( if (error) goto bread_err2; - error = xlog_unpack_data(rhead, offset, log); + error = xlog_recover_process(log, rhash, rhead, offset, pass); if (error) goto bread_err2; - error = xlog_recover_process_data(log, rhash, - rhead, offset, pass); - if (error) - goto bread_err2; blk_no += bblks + hblks; + rhead_blk = blk_no; } bread_err2: xlog_put_bp(dbp); bread_err1: xlog_put_bp(hbp); + + if (error && first_bad) + *first_bad = rhead_blk; + return error; } @@ -4493,7 +4921,7 @@ xlog_do_log_recovery( INIT_LIST_HEAD(&log->l_buf_cancel_table[i]); error = xlog_do_recovery_pass(log, head_blk, tail_blk, - XLOG_RECOVER_PASS1); + XLOG_RECOVER_PASS1, NULL); if (error != 0) { kmem_free(log->l_buf_cancel_table); log->l_buf_cancel_table = NULL; @@ -4504,7 +4932,7 @@ xlog_do_log_recovery( * When it is complete free the table of buf cancel items. */ error = xlog_do_recovery_pass(log, head_blk, tail_blk, - XLOG_RECOVER_PASS2); + XLOG_RECOVER_PASS2, NULL); #ifdef DEBUG if (!error) { int i; diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index dc6221942..ade236e90 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -42,11 +42,11 @@ xfs_break_layouts( while ((error = break_layout(inode, false) == -EWOULDBLOCK)) { xfs_iunlock(ip, *iolock); if (with_imutex && (*iolock & XFS_IOLOCK_EXCL)) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); error = break_layout(inode, true); *iolock = XFS_IOLOCK_EXCL; if (with_imutex) - mutex_lock(&inode->i_mutex); + inode_lock(inode); xfs_ilock(ip, *iolock); } diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index ab1bac6a3..be02a68b2 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -766,7 +766,6 @@ xfs_growfs_rt_alloc( { xfs_fileoff_t bno; /* block number in file */ struct xfs_buf *bp; /* temporary buffer for zeroing */ - int committed; /* transaction committed flag */ xfs_daddr_t d; /* disk block address */ int error; /* error return value */ xfs_fsblock_t firstblock;/* first block allocated in xaction */ @@ -811,7 +810,7 @@ xfs_growfs_rt_alloc( /* * Free any blocks freed up in the transaction, then commit. */ - error = xfs_bmap_finish(&tp, &flist, &committed); + error = xfs_bmap_finish(&tp, &flist, NULL); if (error) goto out_bmap_cancel; error = xfs_trans_commit(tp); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 36bd8825b..59c9b7bd9 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -137,7 +137,7 @@ static const match_table_t tokens = { }; -STATIC unsigned long +STATIC int suffix_kstrtoint(char *s, unsigned int base, int *res) { int last, shift_left_factor = 0, _res; @@ -1714,8 +1714,8 @@ xfs_init_zones(void) xfs_inode_zone = kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode", - KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD, - xfs_fs_inode_init_once); + KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD | + KM_ZONE_ACCOUNT, xfs_fs_inode_init_once); if (!xfs_inode_zone) goto out_destroy_efi_zone; diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 996481eeb..b44284c1a 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -178,7 +178,6 @@ xfs_symlink( struct xfs_bmap_free free_list; xfs_fsblock_t first_block; bool unlock_dp_on_error = false; - int committed; xfs_fileoff_t first_fsb; xfs_filblks_t fs_blocks; int nmaps; @@ -387,7 +386,7 @@ xfs_symlink( xfs_trans_set_sync(tp); } - error = xfs_bmap_finish(&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, NULL); if (error) goto out_bmap_cancel; @@ -434,7 +433,6 @@ xfs_inactive_symlink_rmt( struct xfs_inode *ip) { xfs_buf_t *bp; - int committed; int done; int error; xfs_fsblock_t first_block; @@ -510,16 +508,10 @@ xfs_inactive_symlink_rmt( /* * Commit the first transaction. This logs the EFI and the inode. */ - error = xfs_bmap_finish(&tp, &free_list, &committed); + error = xfs_bmap_finish(&tp, &free_list, ip); if (error) goto error_bmap_cancel; /* - * The transaction must have been committed, since there were - * actually extents freed by xfs_bunmapi. See xfs_bmap_finish. - * The new tp has the extent freeing and EFDs. - */ - ASSERT(committed); - /* * The first xact was committed, so add the inode to the new one. * Mark it dirty so it will be logged and moved forward in the log as * part of every commit. diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index ee70f5dec..641d625eb 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -255,11 +255,47 @@ write_grant_head_show( } XFS_SYSFS_ATTR_RO(write_grant_head); +#ifdef DEBUG +STATIC ssize_t +log_badcrc_factor_store( + struct kobject *kobject, + const char *buf, + size_t count) +{ + struct xlog *log = to_xlog(kobject); + int ret; + uint32_t val; + + ret = kstrtouint(buf, 0, &val); + if (ret) + return ret; + + log->l_badcrc_factor = val; + + return count; +} + +STATIC ssize_t +log_badcrc_factor_show( + struct kobject *kobject, + char *buf) +{ + struct xlog *log = to_xlog(kobject); + + return snprintf(buf, PAGE_SIZE, "%d\n", log->l_badcrc_factor); +} + +XFS_SYSFS_ATTR_RW(log_badcrc_factor); +#endif /* DEBUG */ + static struct attribute *xfs_log_attrs[] = { ATTR_LIST(log_head_lsn), ATTR_LIST(log_tail_lsn), ATTR_LIST(reserve_grant_head), ATTR_LIST(write_grant_head), +#ifdef DEBUG + ATTR_LIST(log_badcrc_factor), +#endif NULL, }; diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 877079eb0..391d797cb 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -1222,6 +1222,32 @@ DEFINE_PAGE_EVENT(xfs_writepage); DEFINE_PAGE_EVENT(xfs_releasepage); DEFINE_PAGE_EVENT(xfs_invalidatepage); +DECLARE_EVENT_CLASS(xfs_readpage_class, + TP_PROTO(struct inode *inode, int nr_pages), + TP_ARGS(inode, nr_pages), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(int, nr_pages) + ), + TP_fast_assign( + __entry->dev = inode->i_sb->s_dev; + __entry->ino = inode->i_ino; + __entry->nr_pages = nr_pages; + ), + TP_printk("dev %d:%d ino 0x%llx nr_pages %d", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->nr_pages) +) + +#define DEFINE_READPAGE_EVENT(name) \ +DEFINE_EVENT(xfs_readpage_class, name, \ + TP_PROTO(struct inode *inode, int nr_pages), \ + TP_ARGS(inode, nr_pages)) +DEFINE_READPAGE_EVENT(xfs_vm_readpage); +DEFINE_READPAGE_EVENT(xfs_vm_readpages); + DECLARE_EVENT_CLASS(xfs_imap_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, int type, struct xfs_bmbt_irec *irec), diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c index ce78534a0..995170194 100644 --- a/fs/xfs/xfs_trans_dquot.c +++ b/fs/xfs/xfs_trans_dquot.c @@ -572,12 +572,16 @@ xfs_quota_warn( struct xfs_dquot *dqp, int type) { - /* no warnings for project quotas - we just return ENOSPC later */ + enum quota_type qtype; + if (dqp->dq_flags & XFS_DQ_PROJ) - return; - quota_send_warning(make_kqid(&init_user_ns, - (dqp->dq_flags & XFS_DQ_USER) ? - USRQUOTA : GRPQUOTA, + qtype = PRJQUOTA; + else if (dqp->dq_flags & XFS_DQ_USER) + qtype = USRQUOTA; + else + qtype = GRPQUOTA; + + quota_send_warning(make_kqid(&init_user_ns, qtype, be32_to_cpu(dqp->q_core.d_id)), mp->m_super->s_dev, type); } diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c index 839b35ca2..110f1d7d8 100644 --- a/fs/xfs/xfs_xattr.c +++ b/fs/xfs/xfs_xattr.c @@ -39,9 +39,6 @@ xfs_xattr_get(const struct xattr_handler *handler, struct dentry *dentry, struct xfs_inode *ip = XFS_I(d_inode(dentry)); int error, asize = size; - if (strcmp(name, "") == 0) - return -EINVAL; - /* Convert Linux syscall to XFS internal ATTR flags */ if (!size) { xflags |= ATTR_KERNOVAL; @@ -84,9 +81,6 @@ xfs_xattr_set(const struct xattr_handler *handler, struct dentry *dentry, struct xfs_inode *ip = XFS_I(d_inode(dentry)); int error; - if (strcmp(name, "") == 0) - return -EINVAL; - /* Convert Linux syscall to XFS internal ATTR flags */ if (flags & XATTR_CREATE) xflags |= ATTR_CREATE; @@ -135,47 +129,19 @@ const struct xattr_handler *xfs_xattr_handlers[] = { NULL }; -static unsigned int xfs_xattr_prefix_len(int flags) -{ - if (flags & XFS_ATTR_SECURE) - return sizeof("security"); - else if (flags & XFS_ATTR_ROOT) - return sizeof("trusted"); - else - return sizeof("user"); -} - -static const char *xfs_xattr_prefix(int flags) -{ - if (flags & XFS_ATTR_SECURE) - return xfs_xattr_security_handler.prefix; - else if (flags & XFS_ATTR_ROOT) - return xfs_xattr_trusted_handler.prefix; - else - return xfs_xattr_user_handler.prefix; -} - static int -xfs_xattr_put_listent( +__xfs_xattr_put_listent( struct xfs_attr_list_context *context, - int flags, - unsigned char *name, - int namelen, - int valuelen, - unsigned char *value) + char *prefix, + int prefix_len, + unsigned char *name, + int namelen) { - unsigned int prefix_len = xfs_xattr_prefix_len(flags); char *offset; int arraytop; - ASSERT(context->count >= 0); - - /* - * Only show root namespace entries if we are actually allowed to - * see them. - */ - if ((flags & XFS_ATTR_ROOT) && !capable(CAP_SYS_ADMIN)) - return 0; + if (!context->alist) + goto compute_size; arraytop = context->count + prefix_len + namelen + 1; if (arraytop > context->firstu) { @@ -183,17 +149,19 @@ xfs_xattr_put_listent( return 1; } offset = (char *)context->alist + context->count; - strncpy(offset, xfs_xattr_prefix(flags), prefix_len); + strncpy(offset, prefix, prefix_len); offset += prefix_len; strncpy(offset, (char *)name, namelen); /* real name */ offset += namelen; *offset = '\0'; + +compute_size: context->count += prefix_len + namelen + 1; return 0; } static int -xfs_xattr_put_listent_sizes( +xfs_xattr_put_listent( struct xfs_attr_list_context *context, int flags, unsigned char *name, @@ -201,24 +169,55 @@ xfs_xattr_put_listent_sizes( int valuelen, unsigned char *value) { - context->count += xfs_xattr_prefix_len(flags) + namelen + 1; - return 0; -} + char *prefix; + int prefix_len; -static int -list_one_attr(const char *name, const size_t len, void *data, - size_t size, ssize_t *result) -{ - char *p = data + *result; + ASSERT(context->count >= 0); - *result += len; - if (!size) - return 0; - if (*result > size) - return -ERANGE; + if (flags & XFS_ATTR_ROOT) { +#ifdef CONFIG_XFS_POSIX_ACL + if (namelen == SGI_ACL_FILE_SIZE && + strncmp(name, SGI_ACL_FILE, + SGI_ACL_FILE_SIZE) == 0) { + int ret = __xfs_xattr_put_listent( + context, XATTR_SYSTEM_PREFIX, + XATTR_SYSTEM_PREFIX_LEN, + XATTR_POSIX_ACL_ACCESS, + strlen(XATTR_POSIX_ACL_ACCESS)); + if (ret) + return ret; + } else if (namelen == SGI_ACL_DEFAULT_SIZE && + strncmp(name, SGI_ACL_DEFAULT, + SGI_ACL_DEFAULT_SIZE) == 0) { + int ret = __xfs_xattr_put_listent( + context, XATTR_SYSTEM_PREFIX, + XATTR_SYSTEM_PREFIX_LEN, + XATTR_POSIX_ACL_DEFAULT, + strlen(XATTR_POSIX_ACL_DEFAULT)); + if (ret) + return ret; + } +#endif - strcpy(p, name); - return 0; + /* + * Only show root namespace entries if we are actually allowed to + * see them. + */ + if (!capable(CAP_SYS_ADMIN)) + return 0; + + prefix = XATTR_TRUSTED_PREFIX; + prefix_len = XATTR_TRUSTED_PREFIX_LEN; + } else if (flags & XFS_ATTR_SECURE) { + prefix = XATTR_SECURITY_PREFIX; + prefix_len = XATTR_SECURITY_PREFIX_LEN; + } else { + prefix = XATTR_USER_PREFIX; + prefix_len = XATTR_USER_PREFIX_LEN; + } + + return __xfs_xattr_put_listent(context, prefix, prefix_len, name, + namelen); } ssize_t @@ -227,7 +226,6 @@ xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size) struct xfs_attr_list_context context; struct attrlist_cursor_kern cursor = { 0 }; struct inode *inode = d_inode(dentry); - int error; /* * First read the regular on-disk attributes. @@ -236,37 +234,14 @@ xfs_vn_listxattr(struct dentry *dentry, char *data, size_t size) context.dp = XFS_I(inode); context.cursor = &cursor; context.resynch = 1; - context.alist = data; + context.alist = size ? data : NULL; context.bufsize = size; context.firstu = context.bufsize; - - if (size) - context.put_listent = xfs_xattr_put_listent; - else - context.put_listent = xfs_xattr_put_listent_sizes; + context.put_listent = xfs_xattr_put_listent; xfs_attr_list_int(&context); if (context.count < 0) return -ERANGE; - /* - * Then add the two synthetic ACL attributes. - */ - if (posix_acl_access_exists(inode)) { - error = list_one_attr(POSIX_ACL_XATTR_ACCESS, - strlen(POSIX_ACL_XATTR_ACCESS) + 1, - data, size, &context.count); - if (error) - return error; - } - - if (posix_acl_default_exists(inode)) { - error = list_one_attr(POSIX_ACL_XATTR_DEFAULT, - strlen(POSIX_ACL_XATTR_DEFAULT) + 1, - data, size, &context.count); - if (error) - return error; - } - return context.count; } |