summaryrefslogtreecommitdiff
path: root/fs/fuse
diff options
context:
space:
mode:
Diffstat (limited to 'fs/fuse')
-rw-r--r--fs/fuse/Kconfig27
-rw-r--r--fs/fuse/Makefile8
-rw-r--r--fs/fuse/control.c354
-rw-r--r--fs/fuse/cuse.c636
-rw-r--r--fs/fuse/dev.c2273
-rw-r--r--fs/fuse/dir.c1952
-rw-r--r--fs/fuse/file.c2994
-rw-r--r--fs/fuse/fuse_i.h912
-rw-r--r--fs/fuse/inode.c1320
9 files changed, 10476 insertions, 0 deletions
diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig
new file mode 100644
index 000000000..1b2f6c2c3
--- /dev/null
+++ b/fs/fuse/Kconfig
@@ -0,0 +1,27 @@
+config FUSE_FS
+ tristate "FUSE (Filesystem in Userspace) support"
+ help
+ With FUSE it is possible to implement a fully functional filesystem
+ in a userspace program.
+
+ There's also a companion library: libfuse2. This library is available
+ from the FUSE homepage:
+ <http://fuse.sourceforge.net/>
+ although chances are your distribution already has that library
+ installed if you've installed the "fuse" package itself.
+
+ See <file:Documentation/filesystems/fuse.txt> for more information.
+ See <file:Documentation/Changes> for needed library/utility version.
+
+ If you want to develop a userspace FS, or if you want to use
+ a filesystem based on FUSE, answer Y or M.
+
+config CUSE
+ tristate "Character device in Userspace support"
+ depends on FUSE_FS
+ help
+ This FUSE extension allows character devices to be
+ implemented in userspace.
+
+ If you want to develop or use a userspace character device
+ based on CUSE, answer Y or M.
diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile
new file mode 100644
index 000000000..e95eeb445
--- /dev/null
+++ b/fs/fuse/Makefile
@@ -0,0 +1,8 @@
+#
+# Makefile for the FUSE filesystem.
+#
+
+obj-$(CONFIG_FUSE_FS) += fuse.o
+obj-$(CONFIG_CUSE) += cuse.o
+
+fuse-objs := dev.o dir.o file.o inode.o control.o
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
new file mode 100644
index 000000000..f863ac664
--- /dev/null
+++ b/fs/fuse/control.c
@@ -0,0 +1,354 @@
+/*
+ FUSE: Filesystem in Userspace
+ Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
+
+ This program can be distributed under the terms of the GNU GPL.
+ See the file COPYING.
+*/
+
+#include "fuse_i.h"
+
+#include <linux/init.h>
+#include <linux/module.h>
+
+#define FUSE_CTL_SUPER_MAGIC 0x65735543
+
+/*
+ * This is non-NULL when the single instance of the control filesystem
+ * exists. Protected by fuse_mutex
+ */
+static struct super_block *fuse_control_sb;
+
+static struct fuse_conn *fuse_ctl_file_conn_get(struct file *file)
+{
+ struct fuse_conn *fc;
+ mutex_lock(&fuse_mutex);
+ fc = file_inode(file)->i_private;
+ if (fc)
+ fc = fuse_conn_get(fc);
+ mutex_unlock(&fuse_mutex);
+ return fc;
+}
+
+static ssize_t fuse_conn_abort_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
+ if (fc) {
+ fuse_abort_conn(fc);
+ fuse_conn_put(fc);
+ }
+ return count;
+}
+
+static ssize_t fuse_conn_waiting_read(struct file *file, char __user *buf,
+ size_t len, loff_t *ppos)
+{
+ char tmp[32];
+ size_t size;
+
+ if (!*ppos) {
+ long value;
+ struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
+ if (!fc)
+ return 0;
+
+ value = atomic_read(&fc->num_waiting);
+ file->private_data = (void *)value;
+ fuse_conn_put(fc);
+ }
+ size = sprintf(tmp, "%ld\n", (long)file->private_data);
+ return simple_read_from_buffer(buf, len, ppos, tmp, size);
+}
+
+static ssize_t fuse_conn_limit_read(struct file *file, char __user *buf,
+ size_t len, loff_t *ppos, unsigned val)
+{
+ char tmp[32];
+ size_t size = sprintf(tmp, "%u\n", val);
+
+ return simple_read_from_buffer(buf, len, ppos, tmp, size);
+}
+
+static ssize_t fuse_conn_limit_write(struct file *file, const char __user *buf,
+ size_t count, loff_t *ppos, unsigned *val,
+ unsigned global_limit)
+{
+ unsigned long t;
+ unsigned limit = (1 << 16) - 1;
+ int err;
+
+ if (*ppos)
+ return -EINVAL;
+
+ err = kstrtoul_from_user(buf, count, 0, &t);
+ if (err)
+ return err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ limit = min(limit, global_limit);
+
+ if (t > limit)
+ return -EINVAL;
+
+ *val = t;
+
+ return count;
+}
+
+static ssize_t fuse_conn_max_background_read(struct file *file,
+ char __user *buf, size_t len,
+ loff_t *ppos)
+{
+ struct fuse_conn *fc;
+ unsigned val;
+
+ fc = fuse_ctl_file_conn_get(file);
+ if (!fc)
+ return 0;
+
+ val = fc->max_background;
+ fuse_conn_put(fc);
+
+ return fuse_conn_limit_read(file, buf, len, ppos, val);
+}
+
+static ssize_t fuse_conn_max_background_write(struct file *file,
+ const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ unsigned uninitialized_var(val);
+ ssize_t ret;
+
+ ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
+ max_user_bgreq);
+ if (ret > 0) {
+ struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
+ if (fc) {
+ fc->max_background = val;
+ fuse_conn_put(fc);
+ }
+ }
+
+ return ret;
+}
+
+static ssize_t fuse_conn_congestion_threshold_read(struct file *file,
+ char __user *buf, size_t len,
+ loff_t *ppos)
+{
+ struct fuse_conn *fc;
+ unsigned val;
+
+ fc = fuse_ctl_file_conn_get(file);
+ if (!fc)
+ return 0;
+
+ val = fc->congestion_threshold;
+ fuse_conn_put(fc);
+
+ return fuse_conn_limit_read(file, buf, len, ppos, val);
+}
+
+static ssize_t fuse_conn_congestion_threshold_write(struct file *file,
+ const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ unsigned uninitialized_var(val);
+ ssize_t ret;
+
+ ret = fuse_conn_limit_write(file, buf, count, ppos, &val,
+ max_user_congthresh);
+ if (ret > 0) {
+ struct fuse_conn *fc = fuse_ctl_file_conn_get(file);
+ if (fc) {
+ fc->congestion_threshold = val;
+ fuse_conn_put(fc);
+ }
+ }
+
+ return ret;
+}
+
+static const struct file_operations fuse_ctl_abort_ops = {
+ .open = nonseekable_open,
+ .write = fuse_conn_abort_write,
+ .llseek = no_llseek,
+};
+
+static const struct file_operations fuse_ctl_waiting_ops = {
+ .open = nonseekable_open,
+ .read = fuse_conn_waiting_read,
+ .llseek = no_llseek,
+};
+
+static const struct file_operations fuse_conn_max_background_ops = {
+ .open = nonseekable_open,
+ .read = fuse_conn_max_background_read,
+ .write = fuse_conn_max_background_write,
+ .llseek = no_llseek,
+};
+
+static const struct file_operations fuse_conn_congestion_threshold_ops = {
+ .open = nonseekable_open,
+ .read = fuse_conn_congestion_threshold_read,
+ .write = fuse_conn_congestion_threshold_write,
+ .llseek = no_llseek,
+};
+
+static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
+ struct fuse_conn *fc,
+ const char *name,
+ int mode, int nlink,
+ const struct inode_operations *iop,
+ const struct file_operations *fop)
+{
+ struct dentry *dentry;
+ struct inode *inode;
+
+ BUG_ON(fc->ctl_ndents >= FUSE_CTL_NUM_DENTRIES);
+ dentry = d_alloc_name(parent, name);
+ if (!dentry)
+ return NULL;
+
+ fc->ctl_dentry[fc->ctl_ndents++] = dentry;
+ inode = new_inode(fuse_control_sb);
+ if (!inode)
+ return NULL;
+
+ inode->i_ino = get_next_ino();
+ inode->i_mode = mode;
+ inode->i_uid = fc->user_id;
+ inode->i_gid = fc->group_id;
+ inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+ /* setting ->i_op to NULL is not allowed */
+ if (iop)
+ inode->i_op = iop;
+ inode->i_fop = fop;
+ set_nlink(inode, nlink);
+ inode->i_private = fc;
+ d_add(dentry, inode);
+ return dentry;
+}
+
+/*
+ * Add a connection to the control filesystem (if it exists). Caller
+ * must hold fuse_mutex
+ */
+int fuse_ctl_add_conn(struct fuse_conn *fc)
+{
+ struct dentry *parent;
+ char name[32];
+
+ if (!fuse_control_sb)
+ return 0;
+
+ parent = fuse_control_sb->s_root;
+ inc_nlink(d_inode(parent));
+ sprintf(name, "%u", fc->dev);
+ parent = fuse_ctl_add_dentry(parent, fc, name, S_IFDIR | 0500, 2,
+ &simple_dir_inode_operations,
+ &simple_dir_operations);
+ if (!parent)
+ goto err;
+
+ if (!fuse_ctl_add_dentry(parent, fc, "waiting", S_IFREG | 0400, 1,
+ NULL, &fuse_ctl_waiting_ops) ||
+ !fuse_ctl_add_dentry(parent, fc, "abort", S_IFREG | 0200, 1,
+ NULL, &fuse_ctl_abort_ops) ||
+ !fuse_ctl_add_dentry(parent, fc, "max_background", S_IFREG | 0600,
+ 1, NULL, &fuse_conn_max_background_ops) ||
+ !fuse_ctl_add_dentry(parent, fc, "congestion_threshold",
+ S_IFREG | 0600, 1, NULL,
+ &fuse_conn_congestion_threshold_ops))
+ goto err;
+
+ return 0;
+
+ err:
+ fuse_ctl_remove_conn(fc);
+ return -ENOMEM;
+}
+
+/*
+ * Remove a connection from the control filesystem (if it exists).
+ * Caller must hold fuse_mutex
+ */
+void fuse_ctl_remove_conn(struct fuse_conn *fc)
+{
+ int i;
+
+ if (!fuse_control_sb)
+ return;
+
+ for (i = fc->ctl_ndents - 1; i >= 0; i--) {
+ struct dentry *dentry = fc->ctl_dentry[i];
+ d_inode(dentry)->i_private = NULL;
+ d_drop(dentry);
+ dput(dentry);
+ }
+ drop_nlink(d_inode(fuse_control_sb->s_root));
+}
+
+static int fuse_ctl_fill_super(struct super_block *sb, void *data, int silent)
+{
+ struct tree_descr empty_descr = {""};
+ struct fuse_conn *fc;
+ int err;
+
+ err = simple_fill_super(sb, FUSE_CTL_SUPER_MAGIC, &empty_descr);
+ if (err)
+ return err;
+
+ mutex_lock(&fuse_mutex);
+ BUG_ON(fuse_control_sb);
+ fuse_control_sb = sb;
+ list_for_each_entry(fc, &fuse_conn_list, entry) {
+ err = fuse_ctl_add_conn(fc);
+ if (err) {
+ fuse_control_sb = NULL;
+ mutex_unlock(&fuse_mutex);
+ return err;
+ }
+ }
+ mutex_unlock(&fuse_mutex);
+
+ return 0;
+}
+
+static struct dentry *fuse_ctl_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *raw_data)
+{
+ return mount_single(fs_type, flags, raw_data, fuse_ctl_fill_super);
+}
+
+static void fuse_ctl_kill_sb(struct super_block *sb)
+{
+ struct fuse_conn *fc;
+
+ mutex_lock(&fuse_mutex);
+ fuse_control_sb = NULL;
+ list_for_each_entry(fc, &fuse_conn_list, entry)
+ fc->ctl_ndents = 0;
+ mutex_unlock(&fuse_mutex);
+
+ kill_litter_super(sb);
+}
+
+static struct file_system_type fuse_ctl_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "fusectl",
+ .mount = fuse_ctl_mount,
+ .kill_sb = fuse_ctl_kill_sb,
+};
+MODULE_ALIAS_FS("fusectl");
+
+int __init fuse_ctl_init(void)
+{
+ return register_filesystem(&fuse_ctl_fs_type);
+}
+
+void __exit fuse_ctl_cleanup(void)
+{
+ unregister_filesystem(&fuse_ctl_fs_type);
+}
diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c
new file mode 100644
index 000000000..e5bbf748b
--- /dev/null
+++ b/fs/fuse/cuse.c
@@ -0,0 +1,636 @@
+/*
+ * CUSE: Character device in Userspace
+ *
+ * Copyright (C) 2008-2009 SUSE Linux Products GmbH
+ * Copyright (C) 2008-2009 Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ *
+ * CUSE enables character devices to be implemented from userland much
+ * like FUSE allows filesystems. On initialization /dev/cuse is
+ * created. By opening the file and replying to the CUSE_INIT request
+ * userland CUSE server can create a character device. After that the
+ * operation is very similar to FUSE.
+ *
+ * A CUSE instance involves the following objects.
+ *
+ * cuse_conn : contains fuse_conn and serves as bonding structure
+ * channel : file handle connected to the userland CUSE server
+ * cdev : the implemented character device
+ * dev : generic device for cdev
+ *
+ * Note that 'channel' is what 'dev' is in FUSE. As CUSE deals with
+ * devices, it's called 'channel' to reduce confusion.
+ *
+ * channel determines when the character device dies. When channel is
+ * closed, everything begins to destruct. The cuse_conn is taken off
+ * the lookup table preventing further access from cdev, cdev and
+ * generic device are removed and the base reference of cuse_conn is
+ * put.
+ *
+ * On each open, the matching cuse_conn is looked up and if found an
+ * additional reference is taken which is released when the file is
+ * closed.
+ */
+
+#include <linux/fuse.h>
+#include <linux/cdev.h>
+#include <linux/device.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/kdev_t.h>
+#include <linux/kthread.h>
+#include <linux/list.h>
+#include <linux/magic.h>
+#include <linux/miscdevice.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/module.h>
+#include <linux/uio.h>
+
+#include "fuse_i.h"
+
+#define CUSE_CONNTBL_LEN 64
+
+struct cuse_conn {
+ struct list_head list; /* linked on cuse_conntbl */
+ struct fuse_conn fc; /* fuse connection */
+ struct cdev *cdev; /* associated character device */
+ struct device *dev; /* device representing @cdev */
+
+ /* init parameters, set once during initialization */
+ bool unrestricted_ioctl;
+};
+
+static DEFINE_MUTEX(cuse_lock); /* protects registration */
+static struct list_head cuse_conntbl[CUSE_CONNTBL_LEN];
+static struct class *cuse_class;
+
+static struct cuse_conn *fc_to_cc(struct fuse_conn *fc)
+{
+ return container_of(fc, struct cuse_conn, fc);
+}
+
+static struct list_head *cuse_conntbl_head(dev_t devt)
+{
+ return &cuse_conntbl[(MAJOR(devt) + MINOR(devt)) % CUSE_CONNTBL_LEN];
+}
+
+
+/**************************************************************************
+ * CUSE frontend operations
+ *
+ * These are file operations for the character device.
+ *
+ * On open, CUSE opens a file from the FUSE mnt and stores it to
+ * private_data of the open file. All other ops call FUSE ops on the
+ * FUSE file.
+ */
+
+static ssize_t cuse_read_iter(struct kiocb *kiocb, struct iov_iter *to)
+{
+ struct fuse_io_priv io = { .async = 0, .file = kiocb->ki_filp };
+ loff_t pos = 0;
+
+ return fuse_direct_io(&io, to, &pos, FUSE_DIO_CUSE);
+}
+
+static ssize_t cuse_write_iter(struct kiocb *kiocb, struct iov_iter *from)
+{
+ struct fuse_io_priv io = { .async = 0, .file = kiocb->ki_filp };
+ loff_t pos = 0;
+ /*
+ * No locking or generic_write_checks(), the server is
+ * responsible for locking and sanity checks.
+ */
+ return fuse_direct_io(&io, from, &pos,
+ FUSE_DIO_WRITE | FUSE_DIO_CUSE);
+}
+
+static int cuse_open(struct inode *inode, struct file *file)
+{
+ dev_t devt = inode->i_cdev->dev;
+ struct cuse_conn *cc = NULL, *pos;
+ int rc;
+
+ /* look up and get the connection */
+ mutex_lock(&cuse_lock);
+ list_for_each_entry(pos, cuse_conntbl_head(devt), list)
+ if (pos->dev->devt == devt) {
+ fuse_conn_get(&pos->fc);
+ cc = pos;
+ break;
+ }
+ mutex_unlock(&cuse_lock);
+
+ /* dead? */
+ if (!cc)
+ return -ENODEV;
+
+ /*
+ * Generic permission check is already done against the chrdev
+ * file, proceed to open.
+ */
+ rc = fuse_do_open(&cc->fc, 0, file, 0);
+ if (rc)
+ fuse_conn_put(&cc->fc);
+ return rc;
+}
+
+static int cuse_release(struct inode *inode, struct file *file)
+{
+ struct fuse_file *ff = file->private_data;
+ struct fuse_conn *fc = ff->fc;
+
+ fuse_sync_release(ff, file->f_flags);
+ fuse_conn_put(fc);
+
+ return 0;
+}
+
+static long cuse_file_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct fuse_file *ff = file->private_data;
+ struct cuse_conn *cc = fc_to_cc(ff->fc);
+ unsigned int flags = 0;
+
+ if (cc->unrestricted_ioctl)
+ flags |= FUSE_IOCTL_UNRESTRICTED;
+
+ return fuse_do_ioctl(file, cmd, arg, flags);
+}
+
+static long cuse_file_compat_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct fuse_file *ff = file->private_data;
+ struct cuse_conn *cc = fc_to_cc(ff->fc);
+ unsigned int flags = FUSE_IOCTL_COMPAT;
+
+ if (cc->unrestricted_ioctl)
+ flags |= FUSE_IOCTL_UNRESTRICTED;
+
+ return fuse_do_ioctl(file, cmd, arg, flags);
+}
+
+static const struct file_operations cuse_frontend_fops = {
+ .owner = THIS_MODULE,
+ .read_iter = cuse_read_iter,
+ .write_iter = cuse_write_iter,
+ .open = cuse_open,
+ .release = cuse_release,
+ .unlocked_ioctl = cuse_file_ioctl,
+ .compat_ioctl = cuse_file_compat_ioctl,
+ .poll = fuse_file_poll,
+ .llseek = noop_llseek,
+};
+
+
+/**************************************************************************
+ * CUSE channel initialization and destruction
+ */
+
+struct cuse_devinfo {
+ const char *name;
+};
+
+/**
+ * cuse_parse_one - parse one key=value pair
+ * @pp: i/o parameter for the current position
+ * @end: points to one past the end of the packed string
+ * @keyp: out parameter for key
+ * @valp: out parameter for value
+ *
+ * *@pp points to packed strings - "key0=val0\0key1=val1\0" which ends
+ * at @end - 1. This function parses one pair and set *@keyp to the
+ * start of the key and *@valp to the start of the value. Note that
+ * the original string is modified such that the key string is
+ * terminated with '\0'. *@pp is updated to point to the next string.
+ *
+ * RETURNS:
+ * 1 on successful parse, 0 on EOF, -errno on failure.
+ */
+static int cuse_parse_one(char **pp, char *end, char **keyp, char **valp)
+{
+ char *p = *pp;
+ char *key, *val;
+
+ while (p < end && *p == '\0')
+ p++;
+ if (p == end)
+ return 0;
+
+ if (end[-1] != '\0') {
+ printk(KERN_ERR "CUSE: info not properly terminated\n");
+ return -EINVAL;
+ }
+
+ key = val = p;
+ p += strlen(p);
+
+ if (valp) {
+ strsep(&val, "=");
+ if (!val)
+ val = key + strlen(key);
+ key = strstrip(key);
+ val = strstrip(val);
+ } else
+ key = strstrip(key);
+
+ if (!strlen(key)) {
+ printk(KERN_ERR "CUSE: zero length info key specified\n");
+ return -EINVAL;
+ }
+
+ *pp = p;
+ *keyp = key;
+ if (valp)
+ *valp = val;
+
+ return 1;
+}
+
+/**
+ * cuse_parse_dev_info - parse device info
+ * @p: device info string
+ * @len: length of device info string
+ * @devinfo: out parameter for parsed device info
+ *
+ * Parse @p to extract device info and store it into @devinfo. String
+ * pointed to by @p is modified by parsing and @devinfo points into
+ * them, so @p shouldn't be freed while @devinfo is in use.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+static int cuse_parse_devinfo(char *p, size_t len, struct cuse_devinfo *devinfo)
+{
+ char *end = p + len;
+ char *uninitialized_var(key), *uninitialized_var(val);
+ int rc;
+
+ while (true) {
+ rc = cuse_parse_one(&p, end, &key, &val);
+ if (rc < 0)
+ return rc;
+ if (!rc)
+ break;
+ if (strcmp(key, "DEVNAME") == 0)
+ devinfo->name = val;
+ else
+ printk(KERN_WARNING "CUSE: unknown device info \"%s\"\n",
+ key);
+ }
+
+ if (!devinfo->name || !strlen(devinfo->name)) {
+ printk(KERN_ERR "CUSE: DEVNAME unspecified\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static void cuse_gendev_release(struct device *dev)
+{
+ kfree(dev);
+}
+
+/**
+ * cuse_process_init_reply - finish initializing CUSE channel
+ *
+ * This function creates the character device and sets up all the
+ * required data structures for it. Please read the comment at the
+ * top of this file for high level overview.
+ */
+static void cuse_process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
+{
+ struct cuse_conn *cc = fc_to_cc(fc), *pos;
+ struct cuse_init_out *arg = req->out.args[0].value;
+ struct page *page = req->pages[0];
+ struct cuse_devinfo devinfo = { };
+ struct device *dev;
+ struct cdev *cdev;
+ dev_t devt;
+ int rc, i;
+
+ if (req->out.h.error ||
+ arg->major != FUSE_KERNEL_VERSION || arg->minor < 11) {
+ goto err;
+ }
+
+ fc->minor = arg->minor;
+ fc->max_read = max_t(unsigned, arg->max_read, 4096);
+ fc->max_write = max_t(unsigned, arg->max_write, 4096);
+
+ /* parse init reply */
+ cc->unrestricted_ioctl = arg->flags & CUSE_UNRESTRICTED_IOCTL;
+
+ rc = cuse_parse_devinfo(page_address(page), req->out.args[1].size,
+ &devinfo);
+ if (rc)
+ goto err;
+
+ /* determine and reserve devt */
+ devt = MKDEV(arg->dev_major, arg->dev_minor);
+ if (!MAJOR(devt))
+ rc = alloc_chrdev_region(&devt, MINOR(devt), 1, devinfo.name);
+ else
+ rc = register_chrdev_region(devt, 1, devinfo.name);
+ if (rc) {
+ printk(KERN_ERR "CUSE: failed to register chrdev region\n");
+ goto err;
+ }
+
+ /* devt determined, create device */
+ rc = -ENOMEM;
+ dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+ if (!dev)
+ goto err_region;
+
+ device_initialize(dev);
+ dev_set_uevent_suppress(dev, 1);
+ dev->class = cuse_class;
+ dev->devt = devt;
+ dev->release = cuse_gendev_release;
+ dev_set_drvdata(dev, cc);
+ dev_set_name(dev, "%s", devinfo.name);
+
+ mutex_lock(&cuse_lock);
+
+ /* make sure the device-name is unique */
+ for (i = 0; i < CUSE_CONNTBL_LEN; ++i) {
+ list_for_each_entry(pos, &cuse_conntbl[i], list)
+ if (!strcmp(dev_name(pos->dev), dev_name(dev)))
+ goto err_unlock;
+ }
+
+ rc = device_add(dev);
+ if (rc)
+ goto err_unlock;
+
+ /* register cdev */
+ rc = -ENOMEM;
+ cdev = cdev_alloc();
+ if (!cdev)
+ goto err_unlock;
+
+ cdev->owner = THIS_MODULE;
+ cdev->ops = &cuse_frontend_fops;
+
+ rc = cdev_add(cdev, devt, 1);
+ if (rc)
+ goto err_cdev;
+
+ cc->dev = dev;
+ cc->cdev = cdev;
+
+ /* make the device available */
+ list_add(&cc->list, cuse_conntbl_head(devt));
+ mutex_unlock(&cuse_lock);
+
+ /* announce device availability */
+ dev_set_uevent_suppress(dev, 0);
+ kobject_uevent(&dev->kobj, KOBJ_ADD);
+out:
+ kfree(arg);
+ __free_page(page);
+ return;
+
+err_cdev:
+ cdev_del(cdev);
+err_unlock:
+ mutex_unlock(&cuse_lock);
+ put_device(dev);
+err_region:
+ unregister_chrdev_region(devt, 1);
+err:
+ fuse_abort_conn(fc);
+ goto out;
+}
+
+static int cuse_send_init(struct cuse_conn *cc)
+{
+ int rc;
+ struct fuse_req *req;
+ struct page *page;
+ struct fuse_conn *fc = &cc->fc;
+ struct cuse_init_in *arg;
+ void *outarg;
+
+ BUILD_BUG_ON(CUSE_INIT_INFO_MAX > PAGE_SIZE);
+
+ req = fuse_get_req_for_background(fc, 1);
+ if (IS_ERR(req)) {
+ rc = PTR_ERR(req);
+ goto err;
+ }
+
+ rc = -ENOMEM;
+ page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!page)
+ goto err_put_req;
+
+ outarg = kzalloc(sizeof(struct cuse_init_out), GFP_KERNEL);
+ if (!outarg)
+ goto err_free_page;
+
+ arg = &req->misc.cuse_init_in;
+ arg->major = FUSE_KERNEL_VERSION;
+ arg->minor = FUSE_KERNEL_MINOR_VERSION;
+ arg->flags |= CUSE_UNRESTRICTED_IOCTL;
+ req->in.h.opcode = CUSE_INIT;
+ req->in.numargs = 1;
+ req->in.args[0].size = sizeof(struct cuse_init_in);
+ req->in.args[0].value = arg;
+ req->out.numargs = 2;
+ req->out.args[0].size = sizeof(struct cuse_init_out);
+ req->out.args[0].value = outarg;
+ req->out.args[1].size = CUSE_INIT_INFO_MAX;
+ req->out.argvar = 1;
+ req->out.argpages = 1;
+ req->pages[0] = page;
+ req->page_descs[0].length = req->out.args[1].size;
+ req->num_pages = 1;
+ req->end = cuse_process_init_reply;
+ fuse_request_send_background(fc, req);
+
+ return 0;
+
+err_free_page:
+ __free_page(page);
+err_put_req:
+ fuse_put_request(fc, req);
+err:
+ return rc;
+}
+
+static void cuse_fc_release(struct fuse_conn *fc)
+{
+ struct cuse_conn *cc = fc_to_cc(fc);
+ kfree_rcu(cc, fc.rcu);
+}
+
+/**
+ * cuse_channel_open - open method for /dev/cuse
+ * @inode: inode for /dev/cuse
+ * @file: file struct being opened
+ *
+ * Userland CUSE server can create a CUSE device by opening /dev/cuse
+ * and replying to the initialization request kernel sends. This
+ * function is responsible for handling CUSE device initialization.
+ * Because the fd opened by this function is used during
+ * initialization, this function only creates cuse_conn and sends
+ * init. The rest is delegated to a kthread.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+static int cuse_channel_open(struct inode *inode, struct file *file)
+{
+ struct cuse_conn *cc;
+ int rc;
+
+ /* set up cuse_conn */
+ cc = kzalloc(sizeof(*cc), GFP_KERNEL);
+ if (!cc)
+ return -ENOMEM;
+
+ fuse_conn_init(&cc->fc);
+
+ INIT_LIST_HEAD(&cc->list);
+ cc->fc.release = cuse_fc_release;
+
+ cc->fc.connected = 1;
+ cc->fc.initialized = 1;
+ rc = cuse_send_init(cc);
+ if (rc) {
+ fuse_conn_put(&cc->fc);
+ return rc;
+ }
+ file->private_data = &cc->fc; /* channel owns base reference to cc */
+
+ return 0;
+}
+
+/**
+ * cuse_channel_release - release method for /dev/cuse
+ * @inode: inode for /dev/cuse
+ * @file: file struct being closed
+ *
+ * Disconnect the channel, deregister CUSE device and initiate
+ * destruction by putting the default reference.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+static int cuse_channel_release(struct inode *inode, struct file *file)
+{
+ struct cuse_conn *cc = fc_to_cc(file->private_data);
+ int rc;
+
+ /* remove from the conntbl, no more access from this point on */
+ mutex_lock(&cuse_lock);
+ list_del_init(&cc->list);
+ mutex_unlock(&cuse_lock);
+
+ /* remove device */
+ if (cc->dev)
+ device_unregister(cc->dev);
+ if (cc->cdev) {
+ unregister_chrdev_region(cc->cdev->dev, 1);
+ cdev_del(cc->cdev);
+ }
+
+ rc = fuse_dev_release(inode, file); /* puts the base reference */
+
+ return rc;
+}
+
+static struct file_operations cuse_channel_fops; /* initialized during init */
+
+
+/**************************************************************************
+ * Misc stuff and module initializatiion
+ *
+ * CUSE exports the same set of attributes to sysfs as fusectl.
+ */
+
+static ssize_t cuse_class_waiting_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct cuse_conn *cc = dev_get_drvdata(dev);
+
+ return sprintf(buf, "%d\n", atomic_read(&cc->fc.num_waiting));
+}
+static DEVICE_ATTR(waiting, 0400, cuse_class_waiting_show, NULL);
+
+static ssize_t cuse_class_abort_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct cuse_conn *cc = dev_get_drvdata(dev);
+
+ fuse_abort_conn(&cc->fc);
+ return count;
+}
+static DEVICE_ATTR(abort, 0200, NULL, cuse_class_abort_store);
+
+static struct attribute *cuse_class_dev_attrs[] = {
+ &dev_attr_waiting.attr,
+ &dev_attr_abort.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(cuse_class_dev);
+
+static struct miscdevice cuse_miscdev = {
+ .minor = CUSE_MINOR,
+ .name = "cuse",
+ .fops = &cuse_channel_fops,
+};
+
+MODULE_ALIAS_MISCDEV(CUSE_MINOR);
+MODULE_ALIAS("devname:cuse");
+
+static int __init cuse_init(void)
+{
+ int i, rc;
+
+ /* init conntbl */
+ for (i = 0; i < CUSE_CONNTBL_LEN; i++)
+ INIT_LIST_HEAD(&cuse_conntbl[i]);
+
+ /* inherit and extend fuse_dev_operations */
+ cuse_channel_fops = fuse_dev_operations;
+ cuse_channel_fops.owner = THIS_MODULE;
+ cuse_channel_fops.open = cuse_channel_open;
+ cuse_channel_fops.release = cuse_channel_release;
+
+ cuse_class = class_create(THIS_MODULE, "cuse");
+ if (IS_ERR(cuse_class))
+ return PTR_ERR(cuse_class);
+
+ cuse_class->dev_groups = cuse_class_dev_groups;
+
+ rc = misc_register(&cuse_miscdev);
+ if (rc) {
+ class_destroy(cuse_class);
+ return rc;
+ }
+
+ return 0;
+}
+
+static void __exit cuse_exit(void)
+{
+ misc_deregister(&cuse_miscdev);
+ class_destroy(cuse_class);
+}
+
+module_init(cuse_init);
+module_exit(cuse_exit);
+
+MODULE_AUTHOR("Tejun Heo <tj@kernel.org>");
+MODULE_DESCRIPTION("Character device in Userspace");
+MODULE_LICENSE("GPL");
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
new file mode 100644
index 000000000..c8b68ab2e
--- /dev/null
+++ b/fs/fuse/dev.c
@@ -0,0 +1,2273 @@
+/*
+ FUSE: Filesystem in Userspace
+ Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
+
+ This program can be distributed under the terms of the GNU GPL.
+ See the file COPYING.
+*/
+
+#include "fuse_i.h"
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/poll.h>
+#include <linux/uio.h>
+#include <linux/miscdevice.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/pipe_fs_i.h>
+#include <linux/swap.h>
+#include <linux/splice.h>
+
+MODULE_ALIAS_MISCDEV(FUSE_MINOR);
+MODULE_ALIAS("devname:fuse");
+
+static struct kmem_cache *fuse_req_cachep;
+
+static struct fuse_conn *fuse_get_conn(struct file *file)
+{
+ /*
+ * Lockless access is OK, because file->private data is set
+ * once during mount and is valid until the file is released.
+ */
+ return file->private_data;
+}
+
+static void fuse_request_init(struct fuse_req *req, struct page **pages,
+ struct fuse_page_desc *page_descs,
+ unsigned npages)
+{
+ memset(req, 0, sizeof(*req));
+ memset(pages, 0, sizeof(*pages) * npages);
+ memset(page_descs, 0, sizeof(*page_descs) * npages);
+ INIT_LIST_HEAD(&req->list);
+ INIT_LIST_HEAD(&req->intr_entry);
+ init_waitqueue_head(&req->waitq);
+ atomic_set(&req->count, 1);
+ req->pages = pages;
+ req->page_descs = page_descs;
+ req->max_pages = npages;
+}
+
+static struct fuse_req *__fuse_request_alloc(unsigned npages, gfp_t flags)
+{
+ struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, flags);
+ if (req) {
+ struct page **pages;
+ struct fuse_page_desc *page_descs;
+
+ if (npages <= FUSE_REQ_INLINE_PAGES) {
+ pages = req->inline_pages;
+ page_descs = req->inline_page_descs;
+ } else {
+ pages = kmalloc(sizeof(struct page *) * npages, flags);
+ page_descs = kmalloc(sizeof(struct fuse_page_desc) *
+ npages, flags);
+ }
+
+ if (!pages || !page_descs) {
+ kfree(pages);
+ kfree(page_descs);
+ kmem_cache_free(fuse_req_cachep, req);
+ return NULL;
+ }
+
+ fuse_request_init(req, pages, page_descs, npages);
+ }
+ return req;
+}
+
+struct fuse_req *fuse_request_alloc(unsigned npages)
+{
+ return __fuse_request_alloc(npages, GFP_KERNEL);
+}
+EXPORT_SYMBOL_GPL(fuse_request_alloc);
+
+struct fuse_req *fuse_request_alloc_nofs(unsigned npages)
+{
+ return __fuse_request_alloc(npages, GFP_NOFS);
+}
+
+void fuse_request_free(struct fuse_req *req)
+{
+ if (req->pages != req->inline_pages) {
+ kfree(req->pages);
+ kfree(req->page_descs);
+ }
+ kmem_cache_free(fuse_req_cachep, req);
+}
+
+static void block_sigs(sigset_t *oldset)
+{
+ sigset_t mask;
+
+ siginitsetinv(&mask, sigmask(SIGKILL));
+ sigprocmask(SIG_BLOCK, &mask, oldset);
+}
+
+static void restore_sigs(sigset_t *oldset)
+{
+ sigprocmask(SIG_SETMASK, oldset, NULL);
+}
+
+void __fuse_get_request(struct fuse_req *req)
+{
+ atomic_inc(&req->count);
+}
+
+/* Must be called with > 1 refcount */
+static void __fuse_put_request(struct fuse_req *req)
+{
+ BUG_ON(atomic_read(&req->count) < 2);
+ atomic_dec(&req->count);
+}
+
+static void fuse_req_init_context(struct fuse_req *req)
+{
+ req->in.h.uid = from_kuid_munged(&init_user_ns, current_fsuid());
+ req->in.h.gid = from_kgid_munged(&init_user_ns, current_fsgid());
+ req->in.h.pid = current->pid;
+}
+
+void fuse_set_initialized(struct fuse_conn *fc)
+{
+ /* Make sure stores before this are seen on another CPU */
+ smp_wmb();
+ fc->initialized = 1;
+}
+
+static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background)
+{
+ return !fc->initialized || (for_background && fc->blocked);
+}
+
+static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages,
+ bool for_background)
+{
+ struct fuse_req *req;
+ int err;
+ atomic_inc(&fc->num_waiting);
+
+ if (fuse_block_alloc(fc, for_background)) {
+ sigset_t oldset;
+ int intr;
+
+ block_sigs(&oldset);
+ intr = wait_event_interruptible_exclusive(fc->blocked_waitq,
+ !fuse_block_alloc(fc, for_background));
+ restore_sigs(&oldset);
+ err = -EINTR;
+ if (intr)
+ goto out;
+ }
+ /* Matches smp_wmb() in fuse_set_initialized() */
+ smp_rmb();
+
+ err = -ENOTCONN;
+ if (!fc->connected)
+ goto out;
+
+ req = fuse_request_alloc(npages);
+ err = -ENOMEM;
+ if (!req) {
+ if (for_background)
+ wake_up(&fc->blocked_waitq);
+ goto out;
+ }
+
+ fuse_req_init_context(req);
+ req->waiting = 1;
+ req->background = for_background;
+ return req;
+
+ out:
+ atomic_dec(&fc->num_waiting);
+ return ERR_PTR(err);
+}
+
+struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages)
+{
+ return __fuse_get_req(fc, npages, false);
+}
+EXPORT_SYMBOL_GPL(fuse_get_req);
+
+struct fuse_req *fuse_get_req_for_background(struct fuse_conn *fc,
+ unsigned npages)
+{
+ return __fuse_get_req(fc, npages, true);
+}
+EXPORT_SYMBOL_GPL(fuse_get_req_for_background);
+
+/*
+ * Return request in fuse_file->reserved_req. However that may
+ * currently be in use. If that is the case, wait for it to become
+ * available.
+ */
+static struct fuse_req *get_reserved_req(struct fuse_conn *fc,
+ struct file *file)
+{
+ struct fuse_req *req = NULL;
+ struct fuse_file *ff = file->private_data;
+
+ do {
+ wait_event(fc->reserved_req_waitq, ff->reserved_req);
+ spin_lock(&fc->lock);
+ if (ff->reserved_req) {
+ req = ff->reserved_req;
+ ff->reserved_req = NULL;
+ req->stolen_file = get_file(file);
+ }
+ spin_unlock(&fc->lock);
+ } while (!req);
+
+ return req;
+}
+
+/*
+ * Put stolen request back into fuse_file->reserved_req
+ */
+static void put_reserved_req(struct fuse_conn *fc, struct fuse_req *req)
+{
+ struct file *file = req->stolen_file;
+ struct fuse_file *ff = file->private_data;
+
+ spin_lock(&fc->lock);
+ fuse_request_init(req, req->pages, req->page_descs, req->max_pages);
+ BUG_ON(ff->reserved_req);
+ ff->reserved_req = req;
+ wake_up_all(&fc->reserved_req_waitq);
+ spin_unlock(&fc->lock);
+ fput(file);
+}
+
+/*
+ * Gets a requests for a file operation, always succeeds
+ *
+ * This is used for sending the FLUSH request, which must get to
+ * userspace, due to POSIX locks which may need to be unlocked.
+ *
+ * If allocation fails due to OOM, use the reserved request in
+ * fuse_file.
+ *
+ * This is very unlikely to deadlock accidentally, since the
+ * filesystem should not have it's own file open. If deadlock is
+ * intentional, it can still be broken by "aborting" the filesystem.
+ */
+struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc,
+ struct file *file)
+{
+ struct fuse_req *req;
+
+ atomic_inc(&fc->num_waiting);
+ wait_event(fc->blocked_waitq, fc->initialized);
+ /* Matches smp_wmb() in fuse_set_initialized() */
+ smp_rmb();
+ req = fuse_request_alloc(0);
+ if (!req)
+ req = get_reserved_req(fc, file);
+
+ fuse_req_init_context(req);
+ req->waiting = 1;
+ req->background = 0;
+ return req;
+}
+
+void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
+{
+ if (atomic_dec_and_test(&req->count)) {
+ if (unlikely(req->background)) {
+ /*
+ * We get here in the unlikely case that a background
+ * request was allocated but not sent
+ */
+ spin_lock(&fc->lock);
+ if (!fc->blocked)
+ wake_up(&fc->blocked_waitq);
+ spin_unlock(&fc->lock);
+ }
+
+ if (req->waiting)
+ atomic_dec(&fc->num_waiting);
+
+ if (req->stolen_file)
+ put_reserved_req(fc, req);
+ else
+ fuse_request_free(req);
+ }
+}
+EXPORT_SYMBOL_GPL(fuse_put_request);
+
+static unsigned len_args(unsigned numargs, struct fuse_arg *args)
+{
+ unsigned nbytes = 0;
+ unsigned i;
+
+ for (i = 0; i < numargs; i++)
+ nbytes += args[i].size;
+
+ return nbytes;
+}
+
+static u64 fuse_get_unique(struct fuse_conn *fc)
+{
+ fc->reqctr++;
+ /* zero is special */
+ if (fc->reqctr == 0)
+ fc->reqctr = 1;
+
+ return fc->reqctr;
+}
+
+static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
+{
+ req->in.h.len = sizeof(struct fuse_in_header) +
+ len_args(req->in.numargs, (struct fuse_arg *) req->in.args);
+ list_add_tail(&req->list, &fc->pending);
+ req->state = FUSE_REQ_PENDING;
+ if (!req->waiting) {
+ req->waiting = 1;
+ atomic_inc(&fc->num_waiting);
+ }
+ wake_up(&fc->waitq);
+ kill_fasync(&fc->fasync, SIGIO, POLL_IN);
+}
+
+void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
+ u64 nodeid, u64 nlookup)
+{
+ forget->forget_one.nodeid = nodeid;
+ forget->forget_one.nlookup = nlookup;
+
+ spin_lock(&fc->lock);
+ if (fc->connected) {
+ fc->forget_list_tail->next = forget;
+ fc->forget_list_tail = forget;
+ wake_up(&fc->waitq);
+ kill_fasync(&fc->fasync, SIGIO, POLL_IN);
+ } else {
+ kfree(forget);
+ }
+ spin_unlock(&fc->lock);
+}
+
+static void flush_bg_queue(struct fuse_conn *fc)
+{
+ while (fc->active_background < fc->max_background &&
+ !list_empty(&fc->bg_queue)) {
+ struct fuse_req *req;
+
+ req = list_entry(fc->bg_queue.next, struct fuse_req, list);
+ list_del(&req->list);
+ fc->active_background++;
+ req->in.h.unique = fuse_get_unique(fc);
+ queue_request(fc, req);
+ }
+}
+
+/*
+ * This function is called when a request is finished. Either a reply
+ * has arrived or it was aborted (and not yet sent) or some error
+ * occurred during communication with userspace, or the device file
+ * was closed. The requester thread is woken up (if still waiting),
+ * the 'end' callback is called if given, else the reference to the
+ * request is released
+ *
+ * Called with fc->lock, unlocks it
+ */
+static void request_end(struct fuse_conn *fc, struct fuse_req *req)
+__releases(fc->lock)
+{
+ void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
+ req->end = NULL;
+ list_del(&req->list);
+ list_del(&req->intr_entry);
+ req->state = FUSE_REQ_FINISHED;
+ if (req->background) {
+ req->background = 0;
+
+ if (fc->num_background == fc->max_background)
+ fc->blocked = 0;
+
+ /* Wake up next waiter, if any */
+ if (!fc->blocked && waitqueue_active(&fc->blocked_waitq))
+ wake_up(&fc->blocked_waitq);
+
+ if (fc->num_background == fc->congestion_threshold &&
+ fc->connected && fc->bdi_initialized) {
+ clear_bdi_congested(&fc->bdi, BLK_RW_SYNC);
+ clear_bdi_congested(&fc->bdi, BLK_RW_ASYNC);
+ }
+ fc->num_background--;
+ fc->active_background--;
+ flush_bg_queue(fc);
+ }
+ spin_unlock(&fc->lock);
+ wake_up(&req->waitq);
+ if (end)
+ end(fc, req);
+ fuse_put_request(fc, req);
+}
+
+static void wait_answer_interruptible(struct fuse_conn *fc,
+ struct fuse_req *req)
+__releases(fc->lock)
+__acquires(fc->lock)
+{
+ if (signal_pending(current))
+ return;
+
+ spin_unlock(&fc->lock);
+ wait_event_interruptible(req->waitq, req->state == FUSE_REQ_FINISHED);
+ spin_lock(&fc->lock);
+}
+
+static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req)
+{
+ list_add_tail(&req->intr_entry, &fc->interrupts);
+ wake_up(&fc->waitq);
+ kill_fasync(&fc->fasync, SIGIO, POLL_IN);
+}
+
+static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
+__releases(fc->lock)
+__acquires(fc->lock)
+{
+ if (!fc->no_interrupt) {
+ /* Any signal may interrupt this */
+ wait_answer_interruptible(fc, req);
+
+ if (req->aborted)
+ goto aborted;
+ if (req->state == FUSE_REQ_FINISHED)
+ return;
+
+ req->interrupted = 1;
+ if (req->state == FUSE_REQ_SENT)
+ queue_interrupt(fc, req);
+ }
+
+ if (!req->force) {
+ sigset_t oldset;
+
+ /* Only fatal signals may interrupt this */
+ block_sigs(&oldset);
+ wait_answer_interruptible(fc, req);
+ restore_sigs(&oldset);
+
+ if (req->aborted)
+ goto aborted;
+ if (req->state == FUSE_REQ_FINISHED)
+ return;
+
+ /* Request is not yet in userspace, bail out */
+ if (req->state == FUSE_REQ_PENDING) {
+ list_del(&req->list);
+ __fuse_put_request(req);
+ req->out.h.error = -EINTR;
+ return;
+ }
+ }
+
+ /*
+ * Either request is already in userspace, or it was forced.
+ * Wait it out.
+ */
+ spin_unlock(&fc->lock);
+ wait_event(req->waitq, req->state == FUSE_REQ_FINISHED);
+ spin_lock(&fc->lock);
+
+ if (!req->aborted)
+ return;
+
+ aborted:
+ BUG_ON(req->state != FUSE_REQ_FINISHED);
+ if (req->locked) {
+ /* This is uninterruptible sleep, because data is
+ being copied to/from the buffers of req. During
+ locked state, there mustn't be any filesystem
+ operation (e.g. page fault), since that could lead
+ to deadlock */
+ spin_unlock(&fc->lock);
+ wait_event(req->waitq, !req->locked);
+ spin_lock(&fc->lock);
+ }
+}
+
+static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
+{
+ BUG_ON(req->background);
+ spin_lock(&fc->lock);
+ if (!fc->connected)
+ req->out.h.error = -ENOTCONN;
+ else if (fc->conn_error)
+ req->out.h.error = -ECONNREFUSED;
+ else {
+ req->in.h.unique = fuse_get_unique(fc);
+ queue_request(fc, req);
+ /* acquire extra reference, since request is still needed
+ after request_end() */
+ __fuse_get_request(req);
+
+ request_wait_answer(fc, req);
+ }
+ spin_unlock(&fc->lock);
+}
+
+void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
+{
+ req->isreply = 1;
+ __fuse_request_send(fc, req);
+}
+EXPORT_SYMBOL_GPL(fuse_request_send);
+
+static void fuse_adjust_compat(struct fuse_conn *fc, struct fuse_args *args)
+{
+ if (fc->minor < 4 && args->in.h.opcode == FUSE_STATFS)
+ args->out.args[0].size = FUSE_COMPAT_STATFS_SIZE;
+
+ if (fc->minor < 9) {
+ switch (args->in.h.opcode) {
+ case FUSE_LOOKUP:
+ case FUSE_CREATE:
+ case FUSE_MKNOD:
+ case FUSE_MKDIR:
+ case FUSE_SYMLINK:
+ case FUSE_LINK:
+ args->out.args[0].size = FUSE_COMPAT_ENTRY_OUT_SIZE;
+ break;
+ case FUSE_GETATTR:
+ case FUSE_SETATTR:
+ args->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE;
+ break;
+ }
+ }
+ if (fc->minor < 12) {
+ switch (args->in.h.opcode) {
+ case FUSE_CREATE:
+ args->in.args[0].size = sizeof(struct fuse_open_in);
+ break;
+ case FUSE_MKNOD:
+ args->in.args[0].size = FUSE_COMPAT_MKNOD_IN_SIZE;
+ break;
+ }
+ }
+}
+
+ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args)
+{
+ struct fuse_req *req;
+ ssize_t ret;
+
+ req = fuse_get_req(fc, 0);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ /* Needs to be done after fuse_get_req() so that fc->minor is valid */
+ fuse_adjust_compat(fc, args);
+
+ req->in.h.opcode = args->in.h.opcode;
+ req->in.h.nodeid = args->in.h.nodeid;
+ req->in.numargs = args->in.numargs;
+ memcpy(req->in.args, args->in.args,
+ args->in.numargs * sizeof(struct fuse_in_arg));
+ req->out.argvar = args->out.argvar;
+ req->out.numargs = args->out.numargs;
+ memcpy(req->out.args, args->out.args,
+ args->out.numargs * sizeof(struct fuse_arg));
+ fuse_request_send(fc, req);
+ ret = req->out.h.error;
+ if (!ret && args->out.argvar) {
+ BUG_ON(args->out.numargs != 1);
+ ret = req->out.args[0].size;
+ }
+ fuse_put_request(fc, req);
+
+ return ret;
+}
+
+static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
+ struct fuse_req *req)
+{
+ BUG_ON(!req->background);
+ fc->num_background++;
+ if (fc->num_background == fc->max_background)
+ fc->blocked = 1;
+ if (fc->num_background == fc->congestion_threshold &&
+ fc->bdi_initialized) {
+ set_bdi_congested(&fc->bdi, BLK_RW_SYNC);
+ set_bdi_congested(&fc->bdi, BLK_RW_ASYNC);
+ }
+ list_add_tail(&req->list, &fc->bg_queue);
+ flush_bg_queue(fc);
+}
+
+static void fuse_request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
+{
+ spin_lock(&fc->lock);
+ if (fc->connected) {
+ fuse_request_send_nowait_locked(fc, req);
+ spin_unlock(&fc->lock);
+ } else {
+ req->out.h.error = -ENOTCONN;
+ request_end(fc, req);
+ }
+}
+
+void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req)
+{
+ req->isreply = 1;
+ fuse_request_send_nowait(fc, req);
+}
+EXPORT_SYMBOL_GPL(fuse_request_send_background);
+
+static int fuse_request_send_notify_reply(struct fuse_conn *fc,
+ struct fuse_req *req, u64 unique)
+{
+ int err = -ENODEV;
+
+ req->isreply = 0;
+ req->in.h.unique = unique;
+ spin_lock(&fc->lock);
+ if (fc->connected) {
+ queue_request(fc, req);
+ err = 0;
+ }
+ spin_unlock(&fc->lock);
+
+ return err;
+}
+
+/*
+ * Called under fc->lock
+ *
+ * fc->connected must have been checked previously
+ */
+void fuse_request_send_background_locked(struct fuse_conn *fc,
+ struct fuse_req *req)
+{
+ req->isreply = 1;
+ fuse_request_send_nowait_locked(fc, req);
+}
+
+void fuse_force_forget(struct file *file, u64 nodeid)
+{
+ struct inode *inode = file_inode(file);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_req *req;
+ struct fuse_forget_in inarg;
+
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.nlookup = 1;
+ req = fuse_get_req_nofail_nopages(fc, file);
+ req->in.h.opcode = FUSE_FORGET;
+ req->in.h.nodeid = nodeid;
+ req->in.numargs = 1;
+ req->in.args[0].size = sizeof(inarg);
+ req->in.args[0].value = &inarg;
+ req->isreply = 0;
+ __fuse_request_send(fc, req);
+ /* ignore errors */
+ fuse_put_request(fc, req);
+}
+
+/*
+ * Lock the request. Up to the next unlock_request() there mustn't be
+ * anything that could cause a page-fault. If the request was already
+ * aborted bail out.
+ */
+static int lock_request(struct fuse_conn *fc, struct fuse_req *req)
+{
+ int err = 0;
+ if (req) {
+ spin_lock(&fc->lock);
+ if (req->aborted)
+ err = -ENOENT;
+ else
+ req->locked = 1;
+ spin_unlock(&fc->lock);
+ }
+ return err;
+}
+
+/*
+ * Unlock request. If it was aborted during being locked, the
+ * requester thread is currently waiting for it to be unlocked, so
+ * wake it up.
+ */
+static void unlock_request(struct fuse_conn *fc, struct fuse_req *req)
+{
+ if (req) {
+ spin_lock(&fc->lock);
+ req->locked = 0;
+ if (req->aborted)
+ wake_up(&req->waitq);
+ spin_unlock(&fc->lock);
+ }
+}
+
+struct fuse_copy_state {
+ struct fuse_conn *fc;
+ int write;
+ struct fuse_req *req;
+ struct iov_iter *iter;
+ struct pipe_buffer *pipebufs;
+ struct pipe_buffer *currbuf;
+ struct pipe_inode_info *pipe;
+ unsigned long nr_segs;
+ struct page *pg;
+ unsigned len;
+ unsigned offset;
+ unsigned move_pages:1;
+};
+
+static void fuse_copy_init(struct fuse_copy_state *cs,
+ struct fuse_conn *fc,
+ int write,
+ struct iov_iter *iter)
+{
+ memset(cs, 0, sizeof(*cs));
+ cs->fc = fc;
+ cs->write = write;
+ cs->iter = iter;
+}
+
+/* Unmap and put previous page of userspace buffer */
+static void fuse_copy_finish(struct fuse_copy_state *cs)
+{
+ if (cs->currbuf) {
+ struct pipe_buffer *buf = cs->currbuf;
+
+ if (cs->write)
+ buf->len = PAGE_SIZE - cs->len;
+ cs->currbuf = NULL;
+ } else if (cs->pg) {
+ if (cs->write) {
+ flush_dcache_page(cs->pg);
+ set_page_dirty_lock(cs->pg);
+ }
+ put_page(cs->pg);
+ }
+ cs->pg = NULL;
+}
+
+/*
+ * Get another pagefull of userspace buffer, and map it to kernel
+ * address space, and lock request
+ */
+static int fuse_copy_fill(struct fuse_copy_state *cs)
+{
+ struct page *page;
+ int err;
+
+ unlock_request(cs->fc, cs->req);
+ fuse_copy_finish(cs);
+ if (cs->pipebufs) {
+ struct pipe_buffer *buf = cs->pipebufs;
+
+ if (!cs->write) {
+ err = buf->ops->confirm(cs->pipe, buf);
+ if (err)
+ return err;
+
+ BUG_ON(!cs->nr_segs);
+ cs->currbuf = buf;
+ cs->pg = buf->page;
+ cs->offset = buf->offset;
+ cs->len = buf->len;
+ cs->pipebufs++;
+ cs->nr_segs--;
+ } else {
+ if (cs->nr_segs == cs->pipe->buffers)
+ return -EIO;
+
+ page = alloc_page(GFP_HIGHUSER);
+ if (!page)
+ return -ENOMEM;
+
+ buf->page = page;
+ buf->offset = 0;
+ buf->len = 0;
+
+ cs->currbuf = buf;
+ cs->pg = page;
+ cs->offset = 0;
+ cs->len = PAGE_SIZE;
+ cs->pipebufs++;
+ cs->nr_segs++;
+ }
+ } else {
+ size_t off;
+ err = iov_iter_get_pages(cs->iter, &page, PAGE_SIZE, 1, &off);
+ if (err < 0)
+ return err;
+ BUG_ON(!err);
+ cs->len = err;
+ cs->offset = off;
+ cs->pg = page;
+ cs->offset = off;
+ iov_iter_advance(cs->iter, err);
+ }
+
+ return lock_request(cs->fc, cs->req);
+}
+
+/* Do as much copy to/from userspace buffer as we can */
+static int fuse_copy_do(struct fuse_copy_state *cs, void **val, unsigned *size)
+{
+ unsigned ncpy = min(*size, cs->len);
+ if (val) {
+ void *pgaddr = kmap_atomic(cs->pg);
+ void *buf = pgaddr + cs->offset;
+
+ if (cs->write)
+ memcpy(buf, *val, ncpy);
+ else
+ memcpy(*val, buf, ncpy);
+
+ kunmap_atomic(pgaddr);
+ *val += ncpy;
+ }
+ *size -= ncpy;
+ cs->len -= ncpy;
+ cs->offset += ncpy;
+ return ncpy;
+}
+
+static int fuse_check_page(struct page *page)
+{
+ if (page_mapcount(page) ||
+ page->mapping != NULL ||
+ page_count(page) != 1 ||
+ (page->flags & PAGE_FLAGS_CHECK_AT_PREP &
+ ~(1 << PG_locked |
+ 1 << PG_referenced |
+ 1 << PG_uptodate |
+ 1 << PG_lru |
+ 1 << PG_active |
+ 1 << PG_reclaim))) {
+ printk(KERN_WARNING "fuse: trying to steal weird page\n");
+ printk(KERN_WARNING " page=%p index=%li flags=%08lx, count=%i, mapcount=%i, mapping=%p\n", page, page->index, page->flags, page_count(page), page_mapcount(page), page->mapping);
+ return 1;
+ }
+ return 0;
+}
+
+static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep)
+{
+ int err;
+ struct page *oldpage = *pagep;
+ struct page *newpage;
+ struct pipe_buffer *buf = cs->pipebufs;
+
+ unlock_request(cs->fc, cs->req);
+ fuse_copy_finish(cs);
+
+ err = buf->ops->confirm(cs->pipe, buf);
+ if (err)
+ return err;
+
+ BUG_ON(!cs->nr_segs);
+ cs->currbuf = buf;
+ cs->len = buf->len;
+ cs->pipebufs++;
+ cs->nr_segs--;
+
+ if (cs->len != PAGE_SIZE)
+ goto out_fallback;
+
+ if (buf->ops->steal(cs->pipe, buf) != 0)
+ goto out_fallback;
+
+ newpage = buf->page;
+
+ if (!PageUptodate(newpage))
+ SetPageUptodate(newpage);
+
+ ClearPageMappedToDisk(newpage);
+
+ if (fuse_check_page(newpage) != 0)
+ goto out_fallback_unlock;
+
+ /*
+ * This is a new and locked page, it shouldn't be mapped or
+ * have any special flags on it
+ */
+ if (WARN_ON(page_mapped(oldpage)))
+ goto out_fallback_unlock;
+ if (WARN_ON(page_has_private(oldpage)))
+ goto out_fallback_unlock;
+ if (WARN_ON(PageDirty(oldpage) || PageWriteback(oldpage)))
+ goto out_fallback_unlock;
+ if (WARN_ON(PageMlocked(oldpage)))
+ goto out_fallback_unlock;
+
+ err = replace_page_cache_page(oldpage, newpage, GFP_KERNEL);
+ if (err) {
+ unlock_page(newpage);
+ return err;
+ }
+
+ page_cache_get(newpage);
+
+ if (!(buf->flags & PIPE_BUF_FLAG_LRU))
+ lru_cache_add_file(newpage);
+
+ err = 0;
+ spin_lock(&cs->fc->lock);
+ if (cs->req->aborted)
+ err = -ENOENT;
+ else
+ *pagep = newpage;
+ spin_unlock(&cs->fc->lock);
+
+ if (err) {
+ unlock_page(newpage);
+ page_cache_release(newpage);
+ return err;
+ }
+
+ unlock_page(oldpage);
+ page_cache_release(oldpage);
+ cs->len = 0;
+
+ return 0;
+
+out_fallback_unlock:
+ unlock_page(newpage);
+out_fallback:
+ cs->pg = buf->page;
+ cs->offset = buf->offset;
+
+ err = lock_request(cs->fc, cs->req);
+ if (err)
+ return err;
+
+ return 1;
+}
+
+static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
+ unsigned offset, unsigned count)
+{
+ struct pipe_buffer *buf;
+
+ if (cs->nr_segs == cs->pipe->buffers)
+ return -EIO;
+
+ unlock_request(cs->fc, cs->req);
+ fuse_copy_finish(cs);
+
+ buf = cs->pipebufs;
+ page_cache_get(page);
+ buf->page = page;
+ buf->offset = offset;
+ buf->len = count;
+
+ cs->pipebufs++;
+ cs->nr_segs++;
+ cs->len = 0;
+
+ return 0;
+}
+
+/*
+ * Copy a page in the request to/from the userspace buffer. Must be
+ * done atomically
+ */
+static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
+ unsigned offset, unsigned count, int zeroing)
+{
+ int err;
+ struct page *page = *pagep;
+
+ if (page && zeroing && count < PAGE_SIZE)
+ clear_highpage(page);
+
+ while (count) {
+ if (cs->write && cs->pipebufs && page) {
+ return fuse_ref_page(cs, page, offset, count);
+ } else if (!cs->len) {
+ if (cs->move_pages && page &&
+ offset == 0 && count == PAGE_SIZE) {
+ err = fuse_try_move_page(cs, pagep);
+ if (err <= 0)
+ return err;
+ } else {
+ err = fuse_copy_fill(cs);
+ if (err)
+ return err;
+ }
+ }
+ if (page) {
+ void *mapaddr = kmap_atomic(page);
+ void *buf = mapaddr + offset;
+ offset += fuse_copy_do(cs, &buf, &count);
+ kunmap_atomic(mapaddr);
+ } else
+ offset += fuse_copy_do(cs, NULL, &count);
+ }
+ if (page && !cs->write)
+ flush_dcache_page(page);
+ return 0;
+}
+
+/* Copy pages in the request to/from userspace buffer */
+static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
+ int zeroing)
+{
+ unsigned i;
+ struct fuse_req *req = cs->req;
+
+ for (i = 0; i < req->num_pages && (nbytes || zeroing); i++) {
+ int err;
+ unsigned offset = req->page_descs[i].offset;
+ unsigned count = min(nbytes, req->page_descs[i].length);
+
+ err = fuse_copy_page(cs, &req->pages[i], offset, count,
+ zeroing);
+ if (err)
+ return err;
+
+ nbytes -= count;
+ }
+ return 0;
+}
+
+/* Copy a single argument in the request to/from userspace buffer */
+static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size)
+{
+ while (size) {
+ if (!cs->len) {
+ int err = fuse_copy_fill(cs);
+ if (err)
+ return err;
+ }
+ fuse_copy_do(cs, &val, &size);
+ }
+ return 0;
+}
+
+/* Copy request arguments to/from userspace buffer */
+static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
+ unsigned argpages, struct fuse_arg *args,
+ int zeroing)
+{
+ int err = 0;
+ unsigned i;
+
+ for (i = 0; !err && i < numargs; i++) {
+ struct fuse_arg *arg = &args[i];
+ if (i == numargs - 1 && argpages)
+ err = fuse_copy_pages(cs, arg->size, zeroing);
+ else
+ err = fuse_copy_one(cs, arg->value, arg->size);
+ }
+ return err;
+}
+
+static int forget_pending(struct fuse_conn *fc)
+{
+ return fc->forget_list_head.next != NULL;
+}
+
+static int request_pending(struct fuse_conn *fc)
+{
+ return !list_empty(&fc->pending) || !list_empty(&fc->interrupts) ||
+ forget_pending(fc);
+}
+
+/* Wait until a request is available on the pending list */
+static void request_wait(struct fuse_conn *fc)
+__releases(fc->lock)
+__acquires(fc->lock)
+{
+ DECLARE_WAITQUEUE(wait, current);
+
+ add_wait_queue_exclusive(&fc->waitq, &wait);
+ while (fc->connected && !request_pending(fc)) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ if (signal_pending(current))
+ break;
+
+ spin_unlock(&fc->lock);
+ schedule();
+ spin_lock(&fc->lock);
+ }
+ set_current_state(TASK_RUNNING);
+ remove_wait_queue(&fc->waitq, &wait);
+}
+
+/*
+ * Transfer an interrupt request to userspace
+ *
+ * Unlike other requests this is assembled on demand, without a need
+ * to allocate a separate fuse_req structure.
+ *
+ * Called with fc->lock held, releases it
+ */
+static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_copy_state *cs,
+ size_t nbytes, struct fuse_req *req)
+__releases(fc->lock)
+{
+ struct fuse_in_header ih;
+ struct fuse_interrupt_in arg;
+ unsigned reqsize = sizeof(ih) + sizeof(arg);
+ int err;
+
+ list_del_init(&req->intr_entry);
+ req->intr_unique = fuse_get_unique(fc);
+ memset(&ih, 0, sizeof(ih));
+ memset(&arg, 0, sizeof(arg));
+ ih.len = reqsize;
+ ih.opcode = FUSE_INTERRUPT;
+ ih.unique = req->intr_unique;
+ arg.unique = req->in.h.unique;
+
+ spin_unlock(&fc->lock);
+ if (nbytes < reqsize)
+ return -EINVAL;
+
+ err = fuse_copy_one(cs, &ih, sizeof(ih));
+ if (!err)
+ err = fuse_copy_one(cs, &arg, sizeof(arg));
+ fuse_copy_finish(cs);
+
+ return err ? err : reqsize;
+}
+
+static struct fuse_forget_link *dequeue_forget(struct fuse_conn *fc,
+ unsigned max,
+ unsigned *countp)
+{
+ struct fuse_forget_link *head = fc->forget_list_head.next;
+ struct fuse_forget_link **newhead = &head;
+ unsigned count;
+
+ for (count = 0; *newhead != NULL && count < max; count++)
+ newhead = &(*newhead)->next;
+
+ fc->forget_list_head.next = *newhead;
+ *newhead = NULL;
+ if (fc->forget_list_head.next == NULL)
+ fc->forget_list_tail = &fc->forget_list_head;
+
+ if (countp != NULL)
+ *countp = count;
+
+ return head;
+}
+
+static int fuse_read_single_forget(struct fuse_conn *fc,
+ struct fuse_copy_state *cs,
+ size_t nbytes)
+__releases(fc->lock)
+{
+ int err;
+ struct fuse_forget_link *forget = dequeue_forget(fc, 1, NULL);
+ struct fuse_forget_in arg = {
+ .nlookup = forget->forget_one.nlookup,
+ };
+ struct fuse_in_header ih = {
+ .opcode = FUSE_FORGET,
+ .nodeid = forget->forget_one.nodeid,
+ .unique = fuse_get_unique(fc),
+ .len = sizeof(ih) + sizeof(arg),
+ };
+
+ spin_unlock(&fc->lock);
+ kfree(forget);
+ if (nbytes < ih.len)
+ return -EINVAL;
+
+ err = fuse_copy_one(cs, &ih, sizeof(ih));
+ if (!err)
+ err = fuse_copy_one(cs, &arg, sizeof(arg));
+ fuse_copy_finish(cs);
+
+ if (err)
+ return err;
+
+ return ih.len;
+}
+
+static int fuse_read_batch_forget(struct fuse_conn *fc,
+ struct fuse_copy_state *cs, size_t nbytes)
+__releases(fc->lock)
+{
+ int err;
+ unsigned max_forgets;
+ unsigned count;
+ struct fuse_forget_link *head;
+ struct fuse_batch_forget_in arg = { .count = 0 };
+ struct fuse_in_header ih = {
+ .opcode = FUSE_BATCH_FORGET,
+ .unique = fuse_get_unique(fc),
+ .len = sizeof(ih) + sizeof(arg),
+ };
+
+ if (nbytes < ih.len) {
+ spin_unlock(&fc->lock);
+ return -EINVAL;
+ }
+
+ max_forgets = (nbytes - ih.len) / sizeof(struct fuse_forget_one);
+ head = dequeue_forget(fc, max_forgets, &count);
+ spin_unlock(&fc->lock);
+
+ arg.count = count;
+ ih.len += count * sizeof(struct fuse_forget_one);
+ err = fuse_copy_one(cs, &ih, sizeof(ih));
+ if (!err)
+ err = fuse_copy_one(cs, &arg, sizeof(arg));
+
+ while (head) {
+ struct fuse_forget_link *forget = head;
+
+ if (!err) {
+ err = fuse_copy_one(cs, &forget->forget_one,
+ sizeof(forget->forget_one));
+ }
+ head = forget->next;
+ kfree(forget);
+ }
+
+ fuse_copy_finish(cs);
+
+ if (err)
+ return err;
+
+ return ih.len;
+}
+
+static int fuse_read_forget(struct fuse_conn *fc, struct fuse_copy_state *cs,
+ size_t nbytes)
+__releases(fc->lock)
+{
+ if (fc->minor < 16 || fc->forget_list_head.next->next == NULL)
+ return fuse_read_single_forget(fc, cs, nbytes);
+ else
+ return fuse_read_batch_forget(fc, cs, nbytes);
+}
+
+/*
+ * Read a single request into the userspace filesystem's buffer. This
+ * function waits until a request is available, then removes it from
+ * the pending list and copies request data to userspace buffer. If
+ * no reply is needed (FORGET) or request has been aborted or there
+ * was an error during the copying then it's finished by calling
+ * request_end(). Otherwise add it to the processing list, and set
+ * the 'sent' flag.
+ */
+static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file,
+ struct fuse_copy_state *cs, size_t nbytes)
+{
+ int err;
+ struct fuse_req *req;
+ struct fuse_in *in;
+ unsigned reqsize;
+
+ restart:
+ spin_lock(&fc->lock);
+ err = -EAGAIN;
+ if ((file->f_flags & O_NONBLOCK) && fc->connected &&
+ !request_pending(fc))
+ goto err_unlock;
+
+ request_wait(fc);
+ err = -ENODEV;
+ if (!fc->connected)
+ goto err_unlock;
+ err = -ERESTARTSYS;
+ if (!request_pending(fc))
+ goto err_unlock;
+
+ if (!list_empty(&fc->interrupts)) {
+ req = list_entry(fc->interrupts.next, struct fuse_req,
+ intr_entry);
+ return fuse_read_interrupt(fc, cs, nbytes, req);
+ }
+
+ if (forget_pending(fc)) {
+ if (list_empty(&fc->pending) || fc->forget_batch-- > 0)
+ return fuse_read_forget(fc, cs, nbytes);
+
+ if (fc->forget_batch <= -8)
+ fc->forget_batch = 16;
+ }
+
+ req = list_entry(fc->pending.next, struct fuse_req, list);
+ req->state = FUSE_REQ_READING;
+ list_move(&req->list, &fc->io);
+
+ in = &req->in;
+ reqsize = in->h.len;
+ /* If request is too large, reply with an error and restart the read */
+ if (nbytes < reqsize) {
+ req->out.h.error = -EIO;
+ /* SETXATTR is special, since it may contain too large data */
+ if (in->h.opcode == FUSE_SETXATTR)
+ req->out.h.error = -E2BIG;
+ request_end(fc, req);
+ goto restart;
+ }
+ spin_unlock(&fc->lock);
+ cs->req = req;
+ err = fuse_copy_one(cs, &in->h, sizeof(in->h));
+ if (!err)
+ err = fuse_copy_args(cs, in->numargs, in->argpages,
+ (struct fuse_arg *) in->args, 0);
+ fuse_copy_finish(cs);
+ spin_lock(&fc->lock);
+ req->locked = 0;
+ if (req->aborted) {
+ request_end(fc, req);
+ return -ENODEV;
+ }
+ if (err) {
+ req->out.h.error = -EIO;
+ request_end(fc, req);
+ return err;
+ }
+ if (!req->isreply)
+ request_end(fc, req);
+ else {
+ req->state = FUSE_REQ_SENT;
+ list_move_tail(&req->list, &fc->processing);
+ if (req->interrupted)
+ queue_interrupt(fc, req);
+ spin_unlock(&fc->lock);
+ }
+ return reqsize;
+
+ err_unlock:
+ spin_unlock(&fc->lock);
+ return err;
+}
+
+static int fuse_dev_open(struct inode *inode, struct file *file)
+{
+ /*
+ * The fuse device's file's private_data is used to hold
+ * the fuse_conn(ection) when it is mounted, and is used to
+ * keep track of whether the file has been mounted already.
+ */
+ file->private_data = NULL;
+ return 0;
+}
+
+static ssize_t fuse_dev_read(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct fuse_copy_state cs;
+ struct file *file = iocb->ki_filp;
+ struct fuse_conn *fc = fuse_get_conn(file);
+ if (!fc)
+ return -EPERM;
+
+ if (!iter_is_iovec(to))
+ return -EINVAL;
+
+ fuse_copy_init(&cs, fc, 1, to);
+
+ return fuse_dev_do_read(fc, file, &cs, iov_iter_count(to));
+}
+
+static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos,
+ struct pipe_inode_info *pipe,
+ size_t len, unsigned int flags)
+{
+ int ret;
+ int page_nr = 0;
+ int do_wakeup = 0;
+ struct pipe_buffer *bufs;
+ struct fuse_copy_state cs;
+ struct fuse_conn *fc = fuse_get_conn(in);
+ if (!fc)
+ return -EPERM;
+
+ bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL);
+ if (!bufs)
+ return -ENOMEM;
+
+ fuse_copy_init(&cs, fc, 1, NULL);
+ cs.pipebufs = bufs;
+ cs.pipe = pipe;
+ ret = fuse_dev_do_read(fc, in, &cs, len);
+ if (ret < 0)
+ goto out;
+
+ ret = 0;
+ pipe_lock(pipe);
+
+ if (!pipe->readers) {
+ send_sig(SIGPIPE, current, 0);
+ if (!ret)
+ ret = -EPIPE;
+ goto out_unlock;
+ }
+
+ if (pipe->nrbufs + cs.nr_segs > pipe->buffers) {
+ ret = -EIO;
+ goto out_unlock;
+ }
+
+ while (page_nr < cs.nr_segs) {
+ int newbuf = (pipe->curbuf + pipe->nrbufs) & (pipe->buffers - 1);
+ struct pipe_buffer *buf = pipe->bufs + newbuf;
+
+ buf->page = bufs[page_nr].page;
+ buf->offset = bufs[page_nr].offset;
+ buf->len = bufs[page_nr].len;
+ /*
+ * Need to be careful about this. Having buf->ops in module
+ * code can Oops if the buffer persists after module unload.
+ */
+ buf->ops = &nosteal_pipe_buf_ops;
+
+ pipe->nrbufs++;
+ page_nr++;
+ ret += buf->len;
+
+ if (pipe->files)
+ do_wakeup = 1;
+ }
+
+out_unlock:
+ pipe_unlock(pipe);
+
+ if (do_wakeup) {
+ smp_mb();
+ if (waitqueue_active(&pipe->wait))
+ wake_up_interruptible(&pipe->wait);
+ kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
+ }
+
+out:
+ for (; page_nr < cs.nr_segs; page_nr++)
+ page_cache_release(bufs[page_nr].page);
+
+ kfree(bufs);
+ return ret;
+}
+
+static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size,
+ struct fuse_copy_state *cs)
+{
+ struct fuse_notify_poll_wakeup_out outarg;
+ int err = -EINVAL;
+
+ if (size != sizeof(outarg))
+ goto err;
+
+ err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+ if (err)
+ goto err;
+
+ fuse_copy_finish(cs);
+ return fuse_notify_poll_wakeup(fc, &outarg);
+
+err:
+ fuse_copy_finish(cs);
+ return err;
+}
+
+static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size,
+ struct fuse_copy_state *cs)
+{
+ struct fuse_notify_inval_inode_out outarg;
+ int err = -EINVAL;
+
+ if (size != sizeof(outarg))
+ goto err;
+
+ err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+ if (err)
+ goto err;
+ fuse_copy_finish(cs);
+
+ down_read(&fc->killsb);
+ err = -ENOENT;
+ if (fc->sb) {
+ err = fuse_reverse_inval_inode(fc->sb, outarg.ino,
+ outarg.off, outarg.len);
+ }
+ up_read(&fc->killsb);
+ return err;
+
+err:
+ fuse_copy_finish(cs);
+ return err;
+}
+
+static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
+ struct fuse_copy_state *cs)
+{
+ struct fuse_notify_inval_entry_out outarg;
+ int err = -ENOMEM;
+ char *buf;
+ struct qstr name;
+
+ buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL);
+ if (!buf)
+ goto err;
+
+ err = -EINVAL;
+ if (size < sizeof(outarg))
+ goto err;
+
+ err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+ if (err)
+ goto err;
+
+ err = -ENAMETOOLONG;
+ if (outarg.namelen > FUSE_NAME_MAX)
+ goto err;
+
+ err = -EINVAL;
+ if (size != sizeof(outarg) + outarg.namelen + 1)
+ goto err;
+
+ name.name = buf;
+ name.len = outarg.namelen;
+ err = fuse_copy_one(cs, buf, outarg.namelen + 1);
+ if (err)
+ goto err;
+ fuse_copy_finish(cs);
+ buf[outarg.namelen] = 0;
+ name.hash = full_name_hash(name.name, name.len);
+
+ down_read(&fc->killsb);
+ err = -ENOENT;
+ if (fc->sb)
+ err = fuse_reverse_inval_entry(fc->sb, outarg.parent, 0, &name);
+ up_read(&fc->killsb);
+ kfree(buf);
+ return err;
+
+err:
+ kfree(buf);
+ fuse_copy_finish(cs);
+ return err;
+}
+
+static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size,
+ struct fuse_copy_state *cs)
+{
+ struct fuse_notify_delete_out outarg;
+ int err = -ENOMEM;
+ char *buf;
+ struct qstr name;
+
+ buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL);
+ if (!buf)
+ goto err;
+
+ err = -EINVAL;
+ if (size < sizeof(outarg))
+ goto err;
+
+ err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+ if (err)
+ goto err;
+
+ err = -ENAMETOOLONG;
+ if (outarg.namelen > FUSE_NAME_MAX)
+ goto err;
+
+ err = -EINVAL;
+ if (size != sizeof(outarg) + outarg.namelen + 1)
+ goto err;
+
+ name.name = buf;
+ name.len = outarg.namelen;
+ err = fuse_copy_one(cs, buf, outarg.namelen + 1);
+ if (err)
+ goto err;
+ fuse_copy_finish(cs);
+ buf[outarg.namelen] = 0;
+ name.hash = full_name_hash(name.name, name.len);
+
+ down_read(&fc->killsb);
+ err = -ENOENT;
+ if (fc->sb)
+ err = fuse_reverse_inval_entry(fc->sb, outarg.parent,
+ outarg.child, &name);
+ up_read(&fc->killsb);
+ kfree(buf);
+ return err;
+
+err:
+ kfree(buf);
+ fuse_copy_finish(cs);
+ return err;
+}
+
+static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
+ struct fuse_copy_state *cs)
+{
+ struct fuse_notify_store_out outarg;
+ struct inode *inode;
+ struct address_space *mapping;
+ u64 nodeid;
+ int err;
+ pgoff_t index;
+ unsigned int offset;
+ unsigned int num;
+ loff_t file_size;
+ loff_t end;
+
+ err = -EINVAL;
+ if (size < sizeof(outarg))
+ goto out_finish;
+
+ err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+ if (err)
+ goto out_finish;
+
+ err = -EINVAL;
+ if (size - sizeof(outarg) != outarg.size)
+ goto out_finish;
+
+ nodeid = outarg.nodeid;
+
+ down_read(&fc->killsb);
+
+ err = -ENOENT;
+ if (!fc->sb)
+ goto out_up_killsb;
+
+ inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid);
+ if (!inode)
+ goto out_up_killsb;
+
+ mapping = inode->i_mapping;
+ index = outarg.offset >> PAGE_CACHE_SHIFT;
+ offset = outarg.offset & ~PAGE_CACHE_MASK;
+ file_size = i_size_read(inode);
+ end = outarg.offset + outarg.size;
+ if (end > file_size) {
+ file_size = end;
+ fuse_write_update_size(inode, file_size);
+ }
+
+ num = outarg.size;
+ while (num) {
+ struct page *page;
+ unsigned int this_num;
+
+ err = -ENOMEM;
+ page = find_or_create_page(mapping, index,
+ mapping_gfp_mask(mapping));
+ if (!page)
+ goto out_iput;
+
+ this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset);
+ err = fuse_copy_page(cs, &page, offset, this_num, 0);
+ if (!err && offset == 0 &&
+ (this_num == PAGE_CACHE_SIZE || file_size == end))
+ SetPageUptodate(page);
+ unlock_page(page);
+ page_cache_release(page);
+
+ if (err)
+ goto out_iput;
+
+ num -= this_num;
+ offset = 0;
+ index++;
+ }
+
+ err = 0;
+
+out_iput:
+ iput(inode);
+out_up_killsb:
+ up_read(&fc->killsb);
+out_finish:
+ fuse_copy_finish(cs);
+ return err;
+}
+
+static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req)
+{
+ release_pages(req->pages, req->num_pages, false);
+}
+
+static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode,
+ struct fuse_notify_retrieve_out *outarg)
+{
+ int err;
+ struct address_space *mapping = inode->i_mapping;
+ struct fuse_req *req;
+ pgoff_t index;
+ loff_t file_size;
+ unsigned int num;
+ unsigned int offset;
+ size_t total_len = 0;
+ int num_pages;
+
+ offset = outarg->offset & ~PAGE_CACHE_MASK;
+ file_size = i_size_read(inode);
+
+ num = outarg->size;
+ if (outarg->offset > file_size)
+ num = 0;
+ else if (outarg->offset + num > file_size)
+ num = file_size - outarg->offset;
+
+ num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ num_pages = min(num_pages, FUSE_MAX_PAGES_PER_REQ);
+
+ req = fuse_get_req(fc, num_pages);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ req->in.h.opcode = FUSE_NOTIFY_REPLY;
+ req->in.h.nodeid = outarg->nodeid;
+ req->in.numargs = 2;
+ req->in.argpages = 1;
+ req->page_descs[0].offset = offset;
+ req->end = fuse_retrieve_end;
+
+ index = outarg->offset >> PAGE_CACHE_SHIFT;
+
+ while (num && req->num_pages < num_pages) {
+ struct page *page;
+ unsigned int this_num;
+
+ page = find_get_page(mapping, index);
+ if (!page)
+ break;
+
+ this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset);
+ req->pages[req->num_pages] = page;
+ req->page_descs[req->num_pages].length = this_num;
+ req->num_pages++;
+
+ offset = 0;
+ num -= this_num;
+ total_len += this_num;
+ index++;
+ }
+ req->misc.retrieve_in.offset = outarg->offset;
+ req->misc.retrieve_in.size = total_len;
+ req->in.args[0].size = sizeof(req->misc.retrieve_in);
+ req->in.args[0].value = &req->misc.retrieve_in;
+ req->in.args[1].size = total_len;
+
+ err = fuse_request_send_notify_reply(fc, req, outarg->notify_unique);
+ if (err)
+ fuse_retrieve_end(fc, req);
+
+ return err;
+}
+
+static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size,
+ struct fuse_copy_state *cs)
+{
+ struct fuse_notify_retrieve_out outarg;
+ struct inode *inode;
+ int err;
+
+ err = -EINVAL;
+ if (size != sizeof(outarg))
+ goto copy_finish;
+
+ err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+ if (err)
+ goto copy_finish;
+
+ fuse_copy_finish(cs);
+
+ down_read(&fc->killsb);
+ err = -ENOENT;
+ if (fc->sb) {
+ u64 nodeid = outarg.nodeid;
+
+ inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid);
+ if (inode) {
+ err = fuse_retrieve(fc, inode, &outarg);
+ iput(inode);
+ }
+ }
+ up_read(&fc->killsb);
+
+ return err;
+
+copy_finish:
+ fuse_copy_finish(cs);
+ return err;
+}
+
+static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
+ unsigned int size, struct fuse_copy_state *cs)
+{
+ /* Don't try to move pages (yet) */
+ cs->move_pages = 0;
+
+ switch (code) {
+ case FUSE_NOTIFY_POLL:
+ return fuse_notify_poll(fc, size, cs);
+
+ case FUSE_NOTIFY_INVAL_INODE:
+ return fuse_notify_inval_inode(fc, size, cs);
+
+ case FUSE_NOTIFY_INVAL_ENTRY:
+ return fuse_notify_inval_entry(fc, size, cs);
+
+ case FUSE_NOTIFY_STORE:
+ return fuse_notify_store(fc, size, cs);
+
+ case FUSE_NOTIFY_RETRIEVE:
+ return fuse_notify_retrieve(fc, size, cs);
+
+ case FUSE_NOTIFY_DELETE:
+ return fuse_notify_delete(fc, size, cs);
+
+ default:
+ fuse_copy_finish(cs);
+ return -EINVAL;
+ }
+}
+
+/* Look up request on processing list by unique ID */
+static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique)
+{
+ struct fuse_req *req;
+
+ list_for_each_entry(req, &fc->processing, list) {
+ if (req->in.h.unique == unique || req->intr_unique == unique)
+ return req;
+ }
+ return NULL;
+}
+
+static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out,
+ unsigned nbytes)
+{
+ unsigned reqsize = sizeof(struct fuse_out_header);
+
+ if (out->h.error)
+ return nbytes != reqsize ? -EINVAL : 0;
+
+ reqsize += len_args(out->numargs, out->args);
+
+ if (reqsize < nbytes || (reqsize > nbytes && !out->argvar))
+ return -EINVAL;
+ else if (reqsize > nbytes) {
+ struct fuse_arg *lastarg = &out->args[out->numargs-1];
+ unsigned diffsize = reqsize - nbytes;
+ if (diffsize > lastarg->size)
+ return -EINVAL;
+ lastarg->size -= diffsize;
+ }
+ return fuse_copy_args(cs, out->numargs, out->argpages, out->args,
+ out->page_zeroing);
+}
+
+/*
+ * Write a single reply to a request. First the header is copied from
+ * the write buffer. The request is then searched on the processing
+ * list by the unique ID found in the header. If found, then remove
+ * it from the list and copy the rest of the buffer to the request.
+ * The request is finished by calling request_end()
+ */
+static ssize_t fuse_dev_do_write(struct fuse_conn *fc,
+ struct fuse_copy_state *cs, size_t nbytes)
+{
+ int err;
+ struct fuse_req *req;
+ struct fuse_out_header oh;
+
+ if (nbytes < sizeof(struct fuse_out_header))
+ return -EINVAL;
+
+ err = fuse_copy_one(cs, &oh, sizeof(oh));
+ if (err)
+ goto err_finish;
+
+ err = -EINVAL;
+ if (oh.len != nbytes)
+ goto err_finish;
+
+ /*
+ * Zero oh.unique indicates unsolicited notification message
+ * and error contains notification code.
+ */
+ if (!oh.unique) {
+ err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs);
+ return err ? err : nbytes;
+ }
+
+ err = -EINVAL;
+ if (oh.error <= -1000 || oh.error > 0)
+ goto err_finish;
+
+ spin_lock(&fc->lock);
+ err = -ENOENT;
+ if (!fc->connected)
+ goto err_unlock;
+
+ req = request_find(fc, oh.unique);
+ if (!req)
+ goto err_unlock;
+
+ if (req->aborted) {
+ spin_unlock(&fc->lock);
+ fuse_copy_finish(cs);
+ spin_lock(&fc->lock);
+ request_end(fc, req);
+ return -ENOENT;
+ }
+ /* Is it an interrupt reply? */
+ if (req->intr_unique == oh.unique) {
+ err = -EINVAL;
+ if (nbytes != sizeof(struct fuse_out_header))
+ goto err_unlock;
+
+ if (oh.error == -ENOSYS)
+ fc->no_interrupt = 1;
+ else if (oh.error == -EAGAIN)
+ queue_interrupt(fc, req);
+
+ spin_unlock(&fc->lock);
+ fuse_copy_finish(cs);
+ return nbytes;
+ }
+
+ req->state = FUSE_REQ_WRITING;
+ list_move(&req->list, &fc->io);
+ req->out.h = oh;
+ req->locked = 1;
+ cs->req = req;
+ if (!req->out.page_replace)
+ cs->move_pages = 0;
+ spin_unlock(&fc->lock);
+
+ err = copy_out_args(cs, &req->out, nbytes);
+ fuse_copy_finish(cs);
+
+ spin_lock(&fc->lock);
+ req->locked = 0;
+ if (!err) {
+ if (req->aborted)
+ err = -ENOENT;
+ } else if (!req->aborted)
+ req->out.h.error = -EIO;
+ request_end(fc, req);
+
+ return err ? err : nbytes;
+
+ err_unlock:
+ spin_unlock(&fc->lock);
+ err_finish:
+ fuse_copy_finish(cs);
+ return err;
+}
+
+static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct fuse_copy_state cs;
+ struct fuse_conn *fc = fuse_get_conn(iocb->ki_filp);
+ if (!fc)
+ return -EPERM;
+
+ if (!iter_is_iovec(from))
+ return -EINVAL;
+
+ fuse_copy_init(&cs, fc, 0, from);
+
+ return fuse_dev_do_write(fc, &cs, iov_iter_count(from));
+}
+
+static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe,
+ struct file *out, loff_t *ppos,
+ size_t len, unsigned int flags)
+{
+ unsigned nbuf;
+ unsigned idx;
+ struct pipe_buffer *bufs;
+ struct fuse_copy_state cs;
+ struct fuse_conn *fc;
+ size_t rem;
+ ssize_t ret;
+
+ fc = fuse_get_conn(out);
+ if (!fc)
+ return -EPERM;
+
+ bufs = kmalloc(pipe->buffers * sizeof(struct pipe_buffer), GFP_KERNEL);
+ if (!bufs)
+ return -ENOMEM;
+
+ pipe_lock(pipe);
+ nbuf = 0;
+ rem = 0;
+ for (idx = 0; idx < pipe->nrbufs && rem < len; idx++)
+ rem += pipe->bufs[(pipe->curbuf + idx) & (pipe->buffers - 1)].len;
+
+ ret = -EINVAL;
+ if (rem < len) {
+ pipe_unlock(pipe);
+ goto out;
+ }
+
+ rem = len;
+ while (rem) {
+ struct pipe_buffer *ibuf;
+ struct pipe_buffer *obuf;
+
+ BUG_ON(nbuf >= pipe->buffers);
+ BUG_ON(!pipe->nrbufs);
+ ibuf = &pipe->bufs[pipe->curbuf];
+ obuf = &bufs[nbuf];
+
+ if (rem >= ibuf->len) {
+ *obuf = *ibuf;
+ ibuf->ops = NULL;
+ pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
+ pipe->nrbufs--;
+ } else {
+ ibuf->ops->get(pipe, ibuf);
+ *obuf = *ibuf;
+ obuf->flags &= ~PIPE_BUF_FLAG_GIFT;
+ obuf->len = rem;
+ ibuf->offset += obuf->len;
+ ibuf->len -= obuf->len;
+ }
+ nbuf++;
+ rem -= obuf->len;
+ }
+ pipe_unlock(pipe);
+
+ fuse_copy_init(&cs, fc, 0, NULL);
+ cs.pipebufs = bufs;
+ cs.nr_segs = nbuf;
+ cs.pipe = pipe;
+
+ if (flags & SPLICE_F_MOVE)
+ cs.move_pages = 1;
+
+ ret = fuse_dev_do_write(fc, &cs, len);
+
+ for (idx = 0; idx < nbuf; idx++) {
+ struct pipe_buffer *buf = &bufs[idx];
+ buf->ops->release(pipe, buf);
+ }
+out:
+ kfree(bufs);
+ return ret;
+}
+
+static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
+{
+ unsigned mask = POLLOUT | POLLWRNORM;
+ struct fuse_conn *fc = fuse_get_conn(file);
+ if (!fc)
+ return POLLERR;
+
+ poll_wait(file, &fc->waitq, wait);
+
+ spin_lock(&fc->lock);
+ if (!fc->connected)
+ mask = POLLERR;
+ else if (request_pending(fc))
+ mask |= POLLIN | POLLRDNORM;
+ spin_unlock(&fc->lock);
+
+ return mask;
+}
+
+/*
+ * Abort all requests on the given list (pending or processing)
+ *
+ * This function releases and reacquires fc->lock
+ */
+static void end_requests(struct fuse_conn *fc, struct list_head *head)
+__releases(fc->lock)
+__acquires(fc->lock)
+{
+ while (!list_empty(head)) {
+ struct fuse_req *req;
+ req = list_entry(head->next, struct fuse_req, list);
+ req->out.h.error = -ECONNABORTED;
+ request_end(fc, req);
+ spin_lock(&fc->lock);
+ }
+}
+
+/*
+ * Abort requests under I/O
+ *
+ * The requests are set to aborted and finished, and the request
+ * waiter is woken up. This will make request_wait_answer() wait
+ * until the request is unlocked and then return.
+ *
+ * If the request is asynchronous, then the end function needs to be
+ * called after waiting for the request to be unlocked (if it was
+ * locked).
+ */
+static void end_io_requests(struct fuse_conn *fc)
+__releases(fc->lock)
+__acquires(fc->lock)
+{
+ while (!list_empty(&fc->io)) {
+ struct fuse_req *req =
+ list_entry(fc->io.next, struct fuse_req, list);
+ void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
+
+ req->aborted = 1;
+ req->out.h.error = -ECONNABORTED;
+ req->state = FUSE_REQ_FINISHED;
+ list_del_init(&req->list);
+ wake_up(&req->waitq);
+ if (end) {
+ req->end = NULL;
+ __fuse_get_request(req);
+ spin_unlock(&fc->lock);
+ wait_event(req->waitq, !req->locked);
+ end(fc, req);
+ fuse_put_request(fc, req);
+ spin_lock(&fc->lock);
+ }
+ }
+}
+
+static void end_queued_requests(struct fuse_conn *fc)
+__releases(fc->lock)
+__acquires(fc->lock)
+{
+ fc->max_background = UINT_MAX;
+ flush_bg_queue(fc);
+ end_requests(fc, &fc->pending);
+ end_requests(fc, &fc->processing);
+ while (forget_pending(fc))
+ kfree(dequeue_forget(fc, 1, NULL));
+}
+
+static void end_polls(struct fuse_conn *fc)
+{
+ struct rb_node *p;
+
+ p = rb_first(&fc->polled_files);
+
+ while (p) {
+ struct fuse_file *ff;
+ ff = rb_entry(p, struct fuse_file, polled_node);
+ wake_up_interruptible_all(&ff->poll_wait);
+
+ p = rb_next(p);
+ }
+}
+
+/*
+ * Abort all requests.
+ *
+ * Emergency exit in case of a malicious or accidental deadlock, or
+ * just a hung filesystem.
+ *
+ * The same effect is usually achievable through killing the
+ * filesystem daemon and all users of the filesystem. The exception
+ * is the combination of an asynchronous request and the tricky
+ * deadlock (see Documentation/filesystems/fuse.txt).
+ *
+ * During the aborting, progression of requests from the pending and
+ * processing lists onto the io list, and progression of new requests
+ * onto the pending list is prevented by req->connected being false.
+ *
+ * Progression of requests under I/O to the processing list is
+ * prevented by the req->aborted flag being true for these requests.
+ * For this reason requests on the io list must be aborted first.
+ */
+void fuse_abort_conn(struct fuse_conn *fc)
+{
+ spin_lock(&fc->lock);
+ if (fc->connected) {
+ fc->connected = 0;
+ fc->blocked = 0;
+ fuse_set_initialized(fc);
+ end_io_requests(fc);
+ end_queued_requests(fc);
+ end_polls(fc);
+ wake_up_all(&fc->waitq);
+ wake_up_all(&fc->blocked_waitq);
+ kill_fasync(&fc->fasync, SIGIO, POLL_IN);
+ }
+ spin_unlock(&fc->lock);
+}
+EXPORT_SYMBOL_GPL(fuse_abort_conn);
+
+int fuse_dev_release(struct inode *inode, struct file *file)
+{
+ struct fuse_conn *fc = fuse_get_conn(file);
+ if (fc) {
+ spin_lock(&fc->lock);
+ fc->connected = 0;
+ fc->blocked = 0;
+ fuse_set_initialized(fc);
+ end_queued_requests(fc);
+ end_polls(fc);
+ wake_up_all(&fc->blocked_waitq);
+ spin_unlock(&fc->lock);
+ fuse_conn_put(fc);
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(fuse_dev_release);
+
+static int fuse_dev_fasync(int fd, struct file *file, int on)
+{
+ struct fuse_conn *fc = fuse_get_conn(file);
+ if (!fc)
+ return -EPERM;
+
+ /* No locking - fasync_helper does its own locking */
+ return fasync_helper(fd, file, on, &fc->fasync);
+}
+
+const struct file_operations fuse_dev_operations = {
+ .owner = THIS_MODULE,
+ .open = fuse_dev_open,
+ .llseek = no_llseek,
+ .read_iter = fuse_dev_read,
+ .splice_read = fuse_dev_splice_read,
+ .write_iter = fuse_dev_write,
+ .splice_write = fuse_dev_splice_write,
+ .poll = fuse_dev_poll,
+ .release = fuse_dev_release,
+ .fasync = fuse_dev_fasync,
+};
+EXPORT_SYMBOL_GPL(fuse_dev_operations);
+
+static struct miscdevice fuse_miscdevice = {
+ .minor = FUSE_MINOR,
+ .name = "fuse",
+ .fops = &fuse_dev_operations,
+};
+
+int __init fuse_dev_init(void)
+{
+ int err = -ENOMEM;
+ fuse_req_cachep = kmem_cache_create("fuse_request",
+ sizeof(struct fuse_req),
+ 0, 0, NULL);
+ if (!fuse_req_cachep)
+ goto out;
+
+ err = misc_register(&fuse_miscdevice);
+ if (err)
+ goto out_cache_clean;
+
+ return 0;
+
+ out_cache_clean:
+ kmem_cache_destroy(fuse_req_cachep);
+ out:
+ return err;
+}
+
+void fuse_dev_cleanup(void)
+{
+ misc_deregister(&fuse_miscdevice);
+ kmem_cache_destroy(fuse_req_cachep);
+}
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
new file mode 100644
index 000000000..0572bca49
--- /dev/null
+++ b/fs/fuse/dir.c
@@ -0,0 +1,1952 @@
+/*
+ FUSE: Filesystem in Userspace
+ Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
+
+ This program can be distributed under the terms of the GNU GPL.
+ See the file COPYING.
+*/
+
+#include "fuse_i.h"
+
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/sched.h>
+#include <linux/namei.h>
+#include <linux/slab.h>
+
+static bool fuse_use_readdirplus(struct inode *dir, struct dir_context *ctx)
+{
+ struct fuse_conn *fc = get_fuse_conn(dir);
+ struct fuse_inode *fi = get_fuse_inode(dir);
+
+ if (!fc->do_readdirplus)
+ return false;
+ if (!fc->readdirplus_auto)
+ return true;
+ if (test_and_clear_bit(FUSE_I_ADVISE_RDPLUS, &fi->state))
+ return true;
+ if (ctx->pos == 0)
+ return true;
+ return false;
+}
+
+static void fuse_advise_use_readdirplus(struct inode *dir)
+{
+ struct fuse_inode *fi = get_fuse_inode(dir);
+
+ set_bit(FUSE_I_ADVISE_RDPLUS, &fi->state);
+}
+
+#if BITS_PER_LONG >= 64
+static inline void fuse_dentry_settime(struct dentry *entry, u64 time)
+{
+ entry->d_time = time;
+}
+
+static inline u64 fuse_dentry_time(struct dentry *entry)
+{
+ return entry->d_time;
+}
+#else
+/*
+ * On 32 bit archs store the high 32 bits of time in d_fsdata
+ */
+static void fuse_dentry_settime(struct dentry *entry, u64 time)
+{
+ entry->d_time = time;
+ entry->d_fsdata = (void *) (unsigned long) (time >> 32);
+}
+
+static u64 fuse_dentry_time(struct dentry *entry)
+{
+ return (u64) entry->d_time +
+ ((u64) (unsigned long) entry->d_fsdata << 32);
+}
+#endif
+
+/*
+ * FUSE caches dentries and attributes with separate timeout. The
+ * time in jiffies until the dentry/attributes are valid is stored in
+ * dentry->d_time and fuse_inode->i_time respectively.
+ */
+
+/*
+ * Calculate the time in jiffies until a dentry/attributes are valid
+ */
+static u64 time_to_jiffies(unsigned long sec, unsigned long nsec)
+{
+ if (sec || nsec) {
+ struct timespec ts = {sec, nsec};
+ return get_jiffies_64() + timespec_to_jiffies(&ts);
+ } else
+ return 0;
+}
+
+/*
+ * Set dentry and possibly attribute timeouts from the lookup/mk*
+ * replies
+ */
+static void fuse_change_entry_timeout(struct dentry *entry,
+ struct fuse_entry_out *o)
+{
+ fuse_dentry_settime(entry,
+ time_to_jiffies(o->entry_valid, o->entry_valid_nsec));
+}
+
+static u64 attr_timeout(struct fuse_attr_out *o)
+{
+ return time_to_jiffies(o->attr_valid, o->attr_valid_nsec);
+}
+
+static u64 entry_attr_timeout(struct fuse_entry_out *o)
+{
+ return time_to_jiffies(o->attr_valid, o->attr_valid_nsec);
+}
+
+/*
+ * Mark the attributes as stale, so that at the next call to
+ * ->getattr() they will be fetched from userspace
+ */
+void fuse_invalidate_attr(struct inode *inode)
+{
+ get_fuse_inode(inode)->i_time = 0;
+}
+
+/**
+ * Mark the attributes as stale due to an atime change. Avoid the invalidate if
+ * atime is not used.
+ */
+void fuse_invalidate_atime(struct inode *inode)
+{
+ if (!IS_RDONLY(inode))
+ fuse_invalidate_attr(inode);
+}
+
+/*
+ * Just mark the entry as stale, so that a next attempt to look it up
+ * will result in a new lookup call to userspace
+ *
+ * This is called when a dentry is about to become negative and the
+ * timeout is unknown (unlink, rmdir, rename and in some cases
+ * lookup)
+ */
+void fuse_invalidate_entry_cache(struct dentry *entry)
+{
+ fuse_dentry_settime(entry, 0);
+}
+
+/*
+ * Same as fuse_invalidate_entry_cache(), but also try to remove the
+ * dentry from the hash
+ */
+static void fuse_invalidate_entry(struct dentry *entry)
+{
+ d_invalidate(entry);
+ fuse_invalidate_entry_cache(entry);
+}
+
+static void fuse_lookup_init(struct fuse_conn *fc, struct fuse_args *args,
+ u64 nodeid, struct qstr *name,
+ struct fuse_entry_out *outarg)
+{
+ memset(outarg, 0, sizeof(struct fuse_entry_out));
+ args->in.h.opcode = FUSE_LOOKUP;
+ args->in.h.nodeid = nodeid;
+ args->in.numargs = 1;
+ args->in.args[0].size = name->len + 1;
+ args->in.args[0].value = name->name;
+ args->out.numargs = 1;
+ args->out.args[0].size = sizeof(struct fuse_entry_out);
+ args->out.args[0].value = outarg;
+}
+
+u64 fuse_get_attr_version(struct fuse_conn *fc)
+{
+ u64 curr_version;
+
+ /*
+ * The spin lock isn't actually needed on 64bit archs, but we
+ * don't yet care too much about such optimizations.
+ */
+ spin_lock(&fc->lock);
+ curr_version = fc->attr_version;
+ spin_unlock(&fc->lock);
+
+ return curr_version;
+}
+
+/*
+ * Check whether the dentry is still valid
+ *
+ * If the entry validity timeout has expired and the dentry is
+ * positive, try to redo the lookup. If the lookup results in a
+ * different inode, then let the VFS invalidate the dentry and redo
+ * the lookup once more. If the lookup results in the same inode,
+ * then refresh the attributes, timeouts and mark the dentry valid.
+ */
+static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
+{
+ struct inode *inode;
+ struct dentry *parent;
+ struct fuse_conn *fc;
+ struct fuse_inode *fi;
+ int ret;
+
+ inode = d_inode_rcu(entry);
+ if (inode && is_bad_inode(inode))
+ goto invalid;
+ else if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) ||
+ (flags & LOOKUP_REVAL)) {
+ struct fuse_entry_out outarg;
+ FUSE_ARGS(args);
+ struct fuse_forget_link *forget;
+ u64 attr_version;
+
+ /* For negative dentries, always do a fresh lookup */
+ if (!inode)
+ goto invalid;
+
+ ret = -ECHILD;
+ if (flags & LOOKUP_RCU)
+ goto out;
+
+ fc = get_fuse_conn(inode);
+
+ forget = fuse_alloc_forget();
+ ret = -ENOMEM;
+ if (!forget)
+ goto out;
+
+ attr_version = fuse_get_attr_version(fc);
+
+ parent = dget_parent(entry);
+ fuse_lookup_init(fc, &args, get_node_id(d_inode(parent)),
+ &entry->d_name, &outarg);
+ ret = fuse_simple_request(fc, &args);
+ dput(parent);
+ /* Zero nodeid is same as -ENOENT */
+ if (!ret && !outarg.nodeid)
+ ret = -ENOENT;
+ if (!ret) {
+ fi = get_fuse_inode(inode);
+ if (outarg.nodeid != get_node_id(inode)) {
+ fuse_queue_forget(fc, forget, outarg.nodeid, 1);
+ goto invalid;
+ }
+ spin_lock(&fc->lock);
+ fi->nlookup++;
+ spin_unlock(&fc->lock);
+ }
+ kfree(forget);
+ if (ret == -ENOMEM)
+ goto out;
+ if (ret || (outarg.attr.mode ^ inode->i_mode) & S_IFMT)
+ goto invalid;
+
+ fuse_change_attributes(inode, &outarg.attr,
+ entry_attr_timeout(&outarg),
+ attr_version);
+ fuse_change_entry_timeout(entry, &outarg);
+ } else if (inode) {
+ fi = get_fuse_inode(inode);
+ if (flags & LOOKUP_RCU) {
+ if (test_bit(FUSE_I_INIT_RDPLUS, &fi->state))
+ return -ECHILD;
+ } else if (test_and_clear_bit(FUSE_I_INIT_RDPLUS, &fi->state)) {
+ parent = dget_parent(entry);
+ fuse_advise_use_readdirplus(d_inode(parent));
+ dput(parent);
+ }
+ }
+ ret = 1;
+out:
+ return ret;
+
+invalid:
+ ret = 0;
+ goto out;
+}
+
+static int invalid_nodeid(u64 nodeid)
+{
+ return !nodeid || nodeid == FUSE_ROOT_ID;
+}
+
+const struct dentry_operations fuse_dentry_operations = {
+ .d_revalidate = fuse_dentry_revalidate,
+};
+
+int fuse_valid_type(int m)
+{
+ return S_ISREG(m) || S_ISDIR(m) || S_ISLNK(m) || S_ISCHR(m) ||
+ S_ISBLK(m) || S_ISFIFO(m) || S_ISSOCK(m);
+}
+
+int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
+ struct fuse_entry_out *outarg, struct inode **inode)
+{
+ struct fuse_conn *fc = get_fuse_conn_super(sb);
+ FUSE_ARGS(args);
+ struct fuse_forget_link *forget;
+ u64 attr_version;
+ int err;
+
+ *inode = NULL;
+ err = -ENAMETOOLONG;
+ if (name->len > FUSE_NAME_MAX)
+ goto out;
+
+
+ forget = fuse_alloc_forget();
+ err = -ENOMEM;
+ if (!forget)
+ goto out;
+
+ attr_version = fuse_get_attr_version(fc);
+
+ fuse_lookup_init(fc, &args, nodeid, name, outarg);
+ err = fuse_simple_request(fc, &args);
+ /* Zero nodeid is same as -ENOENT, but with valid timeout */
+ if (err || !outarg->nodeid)
+ goto out_put_forget;
+
+ err = -EIO;
+ if (!outarg->nodeid)
+ goto out_put_forget;
+ if (!fuse_valid_type(outarg->attr.mode))
+ goto out_put_forget;
+
+ *inode = fuse_iget(sb, outarg->nodeid, outarg->generation,
+ &outarg->attr, entry_attr_timeout(outarg),
+ attr_version);
+ err = -ENOMEM;
+ if (!*inode) {
+ fuse_queue_forget(fc, forget, outarg->nodeid, 1);
+ goto out;
+ }
+ err = 0;
+
+ out_put_forget:
+ kfree(forget);
+ out:
+ return err;
+}
+
+static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
+ unsigned int flags)
+{
+ int err;
+ struct fuse_entry_out outarg;
+ struct inode *inode;
+ struct dentry *newent;
+ bool outarg_valid = true;
+
+ err = fuse_lookup_name(dir->i_sb, get_node_id(dir), &entry->d_name,
+ &outarg, &inode);
+ if (err == -ENOENT) {
+ outarg_valid = false;
+ err = 0;
+ }
+ if (err)
+ goto out_err;
+
+ err = -EIO;
+ if (inode && get_node_id(inode) == FUSE_ROOT_ID)
+ goto out_iput;
+
+ newent = d_splice_alias(inode, entry);
+ err = PTR_ERR(newent);
+ if (IS_ERR(newent))
+ goto out_err;
+
+ entry = newent ? newent : entry;
+ if (outarg_valid)
+ fuse_change_entry_timeout(entry, &outarg);
+ else
+ fuse_invalidate_entry_cache(entry);
+
+ fuse_advise_use_readdirplus(dir);
+ return newent;
+
+ out_iput:
+ iput(inode);
+ out_err:
+ return ERR_PTR(err);
+}
+
+/*
+ * Atomic create+open operation
+ *
+ * If the filesystem doesn't support this, then fall back to separate
+ * 'mknod' + 'open' requests.
+ */
+static int fuse_create_open(struct inode *dir, struct dentry *entry,
+ struct file *file, unsigned flags,
+ umode_t mode, int *opened)
+{
+ int err;
+ struct inode *inode;
+ struct fuse_conn *fc = get_fuse_conn(dir);
+ FUSE_ARGS(args);
+ struct fuse_forget_link *forget;
+ struct fuse_create_in inarg;
+ struct fuse_open_out outopen;
+ struct fuse_entry_out outentry;
+ struct fuse_file *ff;
+
+ /* Userspace expects S_IFREG in create mode */
+ BUG_ON((mode & S_IFMT) != S_IFREG);
+
+ forget = fuse_alloc_forget();
+ err = -ENOMEM;
+ if (!forget)
+ goto out_err;
+
+ err = -ENOMEM;
+ ff = fuse_file_alloc(fc);
+ if (!ff)
+ goto out_put_forget_req;
+
+ if (!fc->dont_mask)
+ mode &= ~current_umask();
+
+ flags &= ~O_NOCTTY;
+ memset(&inarg, 0, sizeof(inarg));
+ memset(&outentry, 0, sizeof(outentry));
+ inarg.flags = flags;
+ inarg.mode = mode;
+ inarg.umask = current_umask();
+ args.in.h.opcode = FUSE_CREATE;
+ args.in.h.nodeid = get_node_id(dir);
+ args.in.numargs = 2;
+ args.in.args[0].size = sizeof(inarg);
+ args.in.args[0].value = &inarg;
+ args.in.args[1].size = entry->d_name.len + 1;
+ args.in.args[1].value = entry->d_name.name;
+ args.out.numargs = 2;
+ args.out.args[0].size = sizeof(outentry);
+ args.out.args[0].value = &outentry;
+ args.out.args[1].size = sizeof(outopen);
+ args.out.args[1].value = &outopen;
+ err = fuse_simple_request(fc, &args);
+ if (err)
+ goto out_free_ff;
+
+ err = -EIO;
+ if (!S_ISREG(outentry.attr.mode) || invalid_nodeid(outentry.nodeid))
+ goto out_free_ff;
+
+ ff->fh = outopen.fh;
+ ff->nodeid = outentry.nodeid;
+ ff->open_flags = outopen.open_flags;
+ inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation,
+ &outentry.attr, entry_attr_timeout(&outentry), 0);
+ if (!inode) {
+ flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
+ fuse_sync_release(ff, flags);
+ fuse_queue_forget(fc, forget, outentry.nodeid, 1);
+ err = -ENOMEM;
+ goto out_err;
+ }
+ kfree(forget);
+ d_instantiate(entry, inode);
+ fuse_change_entry_timeout(entry, &outentry);
+ fuse_invalidate_attr(dir);
+ err = finish_open(file, entry, generic_file_open, opened);
+ if (err) {
+ fuse_sync_release(ff, flags);
+ } else {
+ file->private_data = fuse_file_get(ff);
+ fuse_finish_open(inode, file);
+ }
+ return err;
+
+out_free_ff:
+ fuse_file_free(ff);
+out_put_forget_req:
+ kfree(forget);
+out_err:
+ return err;
+}
+
+static int fuse_mknod(struct inode *, struct dentry *, umode_t, dev_t);
+static int fuse_atomic_open(struct inode *dir, struct dentry *entry,
+ struct file *file, unsigned flags,
+ umode_t mode, int *opened)
+{
+ int err;
+ struct fuse_conn *fc = get_fuse_conn(dir);
+ struct dentry *res = NULL;
+
+ if (d_unhashed(entry)) {
+ res = fuse_lookup(dir, entry, 0);
+ if (IS_ERR(res))
+ return PTR_ERR(res);
+
+ if (res)
+ entry = res;
+ }
+
+ if (!(flags & O_CREAT) || d_really_is_positive(entry))
+ goto no_open;
+
+ /* Only creates */
+ *opened |= FILE_CREATED;
+
+ if (fc->no_create)
+ goto mknod;
+
+ err = fuse_create_open(dir, entry, file, flags, mode, opened);
+ if (err == -ENOSYS) {
+ fc->no_create = 1;
+ goto mknod;
+ }
+out_dput:
+ dput(res);
+ return err;
+
+mknod:
+ err = fuse_mknod(dir, entry, mode, 0);
+ if (err)
+ goto out_dput;
+no_open:
+ return finish_no_open(file, res);
+}
+
+/*
+ * Code shared between mknod, mkdir, symlink and link
+ */
+static int create_new_entry(struct fuse_conn *fc, struct fuse_args *args,
+ struct inode *dir, struct dentry *entry,
+ umode_t mode)
+{
+ struct fuse_entry_out outarg;
+ struct inode *inode;
+ int err;
+ struct fuse_forget_link *forget;
+
+ forget = fuse_alloc_forget();
+ if (!forget)
+ return -ENOMEM;
+
+ memset(&outarg, 0, sizeof(outarg));
+ args->in.h.nodeid = get_node_id(dir);
+ args->out.numargs = 1;
+ args->out.args[0].size = sizeof(outarg);
+ args->out.args[0].value = &outarg;
+ err = fuse_simple_request(fc, args);
+ if (err)
+ goto out_put_forget_req;
+
+ err = -EIO;
+ if (invalid_nodeid(outarg.nodeid))
+ goto out_put_forget_req;
+
+ if ((outarg.attr.mode ^ mode) & S_IFMT)
+ goto out_put_forget_req;
+
+ inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
+ &outarg.attr, entry_attr_timeout(&outarg), 0);
+ if (!inode) {
+ fuse_queue_forget(fc, forget, outarg.nodeid, 1);
+ return -ENOMEM;
+ }
+ kfree(forget);
+
+ err = d_instantiate_no_diralias(entry, inode);
+ if (err)
+ return err;
+
+ fuse_change_entry_timeout(entry, &outarg);
+ fuse_invalidate_attr(dir);
+ return 0;
+
+ out_put_forget_req:
+ kfree(forget);
+ return err;
+}
+
+static int fuse_mknod(struct inode *dir, struct dentry *entry, umode_t mode,
+ dev_t rdev)
+{
+ struct fuse_mknod_in inarg;
+ struct fuse_conn *fc = get_fuse_conn(dir);
+ FUSE_ARGS(args);
+
+ if (!fc->dont_mask)
+ mode &= ~current_umask();
+
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.mode = mode;
+ inarg.rdev = new_encode_dev(rdev);
+ inarg.umask = current_umask();
+ args.in.h.opcode = FUSE_MKNOD;
+ args.in.numargs = 2;
+ args.in.args[0].size = sizeof(inarg);
+ args.in.args[0].value = &inarg;
+ args.in.args[1].size = entry->d_name.len + 1;
+ args.in.args[1].value = entry->d_name.name;
+ return create_new_entry(fc, &args, dir, entry, mode);
+}
+
+static int fuse_create(struct inode *dir, struct dentry *entry, umode_t mode,
+ bool excl)
+{
+ return fuse_mknod(dir, entry, mode, 0);
+}
+
+static int fuse_mkdir(struct inode *dir, struct dentry *entry, umode_t mode)
+{
+ struct fuse_mkdir_in inarg;
+ struct fuse_conn *fc = get_fuse_conn(dir);
+ FUSE_ARGS(args);
+
+ if (!fc->dont_mask)
+ mode &= ~current_umask();
+
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.mode = mode;
+ inarg.umask = current_umask();
+ args.in.h.opcode = FUSE_MKDIR;
+ args.in.numargs = 2;
+ args.in.args[0].size = sizeof(inarg);
+ args.in.args[0].value = &inarg;
+ args.in.args[1].size = entry->d_name.len + 1;
+ args.in.args[1].value = entry->d_name.name;
+ return create_new_entry(fc, &args, dir, entry, S_IFDIR);
+}
+
+static int fuse_symlink(struct inode *dir, struct dentry *entry,
+ const char *link)
+{
+ struct fuse_conn *fc = get_fuse_conn(dir);
+ unsigned len = strlen(link) + 1;
+ FUSE_ARGS(args);
+
+ args.in.h.opcode = FUSE_SYMLINK;
+ args.in.numargs = 2;
+ args.in.args[0].size = entry->d_name.len + 1;
+ args.in.args[0].value = entry->d_name.name;
+ args.in.args[1].size = len;
+ args.in.args[1].value = link;
+ return create_new_entry(fc, &args, dir, entry, S_IFLNK);
+}
+
+static inline void fuse_update_ctime(struct inode *inode)
+{
+ if (!IS_NOCMTIME(inode)) {
+ inode->i_ctime = current_fs_time(inode->i_sb);
+ mark_inode_dirty_sync(inode);
+ }
+}
+
+static int fuse_unlink(struct inode *dir, struct dentry *entry)
+{
+ int err;
+ struct fuse_conn *fc = get_fuse_conn(dir);
+ FUSE_ARGS(args);
+
+ args.in.h.opcode = FUSE_UNLINK;
+ args.in.h.nodeid = get_node_id(dir);
+ args.in.numargs = 1;
+ args.in.args[0].size = entry->d_name.len + 1;
+ args.in.args[0].value = entry->d_name.name;
+ err = fuse_simple_request(fc, &args);
+ if (!err) {
+ struct inode *inode = d_inode(entry);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ spin_lock(&fc->lock);
+ fi->attr_version = ++fc->attr_version;
+ /*
+ * If i_nlink == 0 then unlink doesn't make sense, yet this can
+ * happen if userspace filesystem is careless. It would be
+ * difficult to enforce correct nlink usage so just ignore this
+ * condition here
+ */
+ if (inode->i_nlink > 0)
+ drop_nlink(inode);
+ spin_unlock(&fc->lock);
+ fuse_invalidate_attr(inode);
+ fuse_invalidate_attr(dir);
+ fuse_invalidate_entry_cache(entry);
+ fuse_update_ctime(inode);
+ } else if (err == -EINTR)
+ fuse_invalidate_entry(entry);
+ return err;
+}
+
+static int fuse_rmdir(struct inode *dir, struct dentry *entry)
+{
+ int err;
+ struct fuse_conn *fc = get_fuse_conn(dir);
+ FUSE_ARGS(args);
+
+ args.in.h.opcode = FUSE_RMDIR;
+ args.in.h.nodeid = get_node_id(dir);
+ args.in.numargs = 1;
+ args.in.args[0].size = entry->d_name.len + 1;
+ args.in.args[0].value = entry->d_name.name;
+ err = fuse_simple_request(fc, &args);
+ if (!err) {
+ clear_nlink(d_inode(entry));
+ fuse_invalidate_attr(dir);
+ fuse_invalidate_entry_cache(entry);
+ } else if (err == -EINTR)
+ fuse_invalidate_entry(entry);
+ return err;
+}
+
+static int fuse_rename_common(struct inode *olddir, struct dentry *oldent,
+ struct inode *newdir, struct dentry *newent,
+ unsigned int flags, int opcode, size_t argsize)
+{
+ int err;
+ struct fuse_rename2_in inarg;
+ struct fuse_conn *fc = get_fuse_conn(olddir);
+ FUSE_ARGS(args);
+
+ memset(&inarg, 0, argsize);
+ inarg.newdir = get_node_id(newdir);
+ inarg.flags = flags;
+ args.in.h.opcode = opcode;
+ args.in.h.nodeid = get_node_id(olddir);
+ args.in.numargs = 3;
+ args.in.args[0].size = argsize;
+ args.in.args[0].value = &inarg;
+ args.in.args[1].size = oldent->d_name.len + 1;
+ args.in.args[1].value = oldent->d_name.name;
+ args.in.args[2].size = newent->d_name.len + 1;
+ args.in.args[2].value = newent->d_name.name;
+ err = fuse_simple_request(fc, &args);
+ if (!err) {
+ /* ctime changes */
+ fuse_invalidate_attr(d_inode(oldent));
+ fuse_update_ctime(d_inode(oldent));
+
+ if (flags & RENAME_EXCHANGE) {
+ fuse_invalidate_attr(d_inode(newent));
+ fuse_update_ctime(d_inode(newent));
+ }
+
+ fuse_invalidate_attr(olddir);
+ if (olddir != newdir)
+ fuse_invalidate_attr(newdir);
+
+ /* newent will end up negative */
+ if (!(flags & RENAME_EXCHANGE) && d_really_is_positive(newent)) {
+ fuse_invalidate_attr(d_inode(newent));
+ fuse_invalidate_entry_cache(newent);
+ fuse_update_ctime(d_inode(newent));
+ }
+ } else if (err == -EINTR) {
+ /* If request was interrupted, DEITY only knows if the
+ rename actually took place. If the invalidation
+ fails (e.g. some process has CWD under the renamed
+ directory), then there can be inconsistency between
+ the dcache and the real filesystem. Tough luck. */
+ fuse_invalidate_entry(oldent);
+ if (d_really_is_positive(newent))
+ fuse_invalidate_entry(newent);
+ }
+
+ return err;
+}
+
+static int fuse_rename2(struct inode *olddir, struct dentry *oldent,
+ struct inode *newdir, struct dentry *newent,
+ unsigned int flags)
+{
+ struct fuse_conn *fc = get_fuse_conn(olddir);
+ int err;
+
+ if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+ return -EINVAL;
+
+ if (flags) {
+ if (fc->no_rename2 || fc->minor < 23)
+ return -EINVAL;
+
+ err = fuse_rename_common(olddir, oldent, newdir, newent, flags,
+ FUSE_RENAME2,
+ sizeof(struct fuse_rename2_in));
+ if (err == -ENOSYS) {
+ fc->no_rename2 = 1;
+ err = -EINVAL;
+ }
+ } else {
+ err = fuse_rename_common(olddir, oldent, newdir, newent, 0,
+ FUSE_RENAME,
+ sizeof(struct fuse_rename_in));
+ }
+
+ return err;
+}
+
+static int fuse_link(struct dentry *entry, struct inode *newdir,
+ struct dentry *newent)
+{
+ int err;
+ struct fuse_link_in inarg;
+ struct inode *inode = d_inode(entry);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ FUSE_ARGS(args);
+
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.oldnodeid = get_node_id(inode);
+ args.in.h.opcode = FUSE_LINK;
+ args.in.numargs = 2;
+ args.in.args[0].size = sizeof(inarg);
+ args.in.args[0].value = &inarg;
+ args.in.args[1].size = newent->d_name.len + 1;
+ args.in.args[1].value = newent->d_name.name;
+ err = create_new_entry(fc, &args, newdir, newent, inode->i_mode);
+ /* Contrary to "normal" filesystems it can happen that link
+ makes two "logical" inodes point to the same "physical"
+ inode. We invalidate the attributes of the old one, so it
+ will reflect changes in the backing inode (link count,
+ etc.)
+ */
+ if (!err) {
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ spin_lock(&fc->lock);
+ fi->attr_version = ++fc->attr_version;
+ inc_nlink(inode);
+ spin_unlock(&fc->lock);
+ fuse_invalidate_attr(inode);
+ fuse_update_ctime(inode);
+ } else if (err == -EINTR) {
+ fuse_invalidate_attr(inode);
+ }
+ return err;
+}
+
+static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr,
+ struct kstat *stat)
+{
+ unsigned int blkbits;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ /* see the comment in fuse_change_attributes() */
+ if (fc->writeback_cache && S_ISREG(inode->i_mode)) {
+ attr->size = i_size_read(inode);
+ attr->mtime = inode->i_mtime.tv_sec;
+ attr->mtimensec = inode->i_mtime.tv_nsec;
+ attr->ctime = inode->i_ctime.tv_sec;
+ attr->ctimensec = inode->i_ctime.tv_nsec;
+ }
+
+ stat->dev = inode->i_sb->s_dev;
+ stat->ino = attr->ino;
+ stat->mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
+ stat->nlink = attr->nlink;
+ stat->uid = make_kuid(&init_user_ns, attr->uid);
+ stat->gid = make_kgid(&init_user_ns, attr->gid);
+ stat->rdev = inode->i_rdev;
+ stat->atime.tv_sec = attr->atime;
+ stat->atime.tv_nsec = attr->atimensec;
+ stat->mtime.tv_sec = attr->mtime;
+ stat->mtime.tv_nsec = attr->mtimensec;
+ stat->ctime.tv_sec = attr->ctime;
+ stat->ctime.tv_nsec = attr->ctimensec;
+ stat->size = attr->size;
+ stat->blocks = attr->blocks;
+
+ if (attr->blksize != 0)
+ blkbits = ilog2(attr->blksize);
+ else
+ blkbits = inode->i_sb->s_blocksize_bits;
+
+ stat->blksize = 1 << blkbits;
+}
+
+static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
+ struct file *file)
+{
+ int err;
+ struct fuse_getattr_in inarg;
+ struct fuse_attr_out outarg;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ FUSE_ARGS(args);
+ u64 attr_version;
+
+ attr_version = fuse_get_attr_version(fc);
+
+ memset(&inarg, 0, sizeof(inarg));
+ memset(&outarg, 0, sizeof(outarg));
+ /* Directories have separate file-handle space */
+ if (file && S_ISREG(inode->i_mode)) {
+ struct fuse_file *ff = file->private_data;
+
+ inarg.getattr_flags |= FUSE_GETATTR_FH;
+ inarg.fh = ff->fh;
+ }
+ args.in.h.opcode = FUSE_GETATTR;
+ args.in.h.nodeid = get_node_id(inode);
+ args.in.numargs = 1;
+ args.in.args[0].size = sizeof(inarg);
+ args.in.args[0].value = &inarg;
+ args.out.numargs = 1;
+ args.out.args[0].size = sizeof(outarg);
+ args.out.args[0].value = &outarg;
+ err = fuse_simple_request(fc, &args);
+ if (!err) {
+ if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) {
+ make_bad_inode(inode);
+ err = -EIO;
+ } else {
+ fuse_change_attributes(inode, &outarg.attr,
+ attr_timeout(&outarg),
+ attr_version);
+ if (stat)
+ fuse_fillattr(inode, &outarg.attr, stat);
+ }
+ }
+ return err;
+}
+
+int fuse_update_attributes(struct inode *inode, struct kstat *stat,
+ struct file *file, bool *refreshed)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ int err;
+ bool r;
+
+ if (time_before64(fi->i_time, get_jiffies_64())) {
+ r = true;
+ err = fuse_do_getattr(inode, stat, file);
+ } else {
+ r = false;
+ err = 0;
+ if (stat) {
+ generic_fillattr(inode, stat);
+ stat->mode = fi->orig_i_mode;
+ stat->ino = fi->orig_ino;
+ }
+ }
+
+ if (refreshed != NULL)
+ *refreshed = r;
+
+ return err;
+}
+
+int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
+ u64 child_nodeid, struct qstr *name)
+{
+ int err = -ENOTDIR;
+ struct inode *parent;
+ struct dentry *dir;
+ struct dentry *entry;
+
+ parent = ilookup5(sb, parent_nodeid, fuse_inode_eq, &parent_nodeid);
+ if (!parent)
+ return -ENOENT;
+
+ mutex_lock(&parent->i_mutex);
+ if (!S_ISDIR(parent->i_mode))
+ goto unlock;
+
+ err = -ENOENT;
+ dir = d_find_alias(parent);
+ if (!dir)
+ goto unlock;
+
+ entry = d_lookup(dir, name);
+ dput(dir);
+ if (!entry)
+ goto unlock;
+
+ fuse_invalidate_attr(parent);
+ fuse_invalidate_entry(entry);
+
+ if (child_nodeid != 0 && d_really_is_positive(entry)) {
+ mutex_lock(&d_inode(entry)->i_mutex);
+ if (get_node_id(d_inode(entry)) != child_nodeid) {
+ err = -ENOENT;
+ goto badentry;
+ }
+ if (d_mountpoint(entry)) {
+ err = -EBUSY;
+ goto badentry;
+ }
+ if (d_is_dir(entry)) {
+ shrink_dcache_parent(entry);
+ if (!simple_empty(entry)) {
+ err = -ENOTEMPTY;
+ goto badentry;
+ }
+ d_inode(entry)->i_flags |= S_DEAD;
+ }
+ dont_mount(entry);
+ clear_nlink(d_inode(entry));
+ err = 0;
+ badentry:
+ mutex_unlock(&d_inode(entry)->i_mutex);
+ if (!err)
+ d_delete(entry);
+ } else {
+ err = 0;
+ }
+ dput(entry);
+
+ unlock:
+ mutex_unlock(&parent->i_mutex);
+ iput(parent);
+ return err;
+}
+
+/*
+ * Calling into a user-controlled filesystem gives the filesystem
+ * daemon ptrace-like capabilities over the current process. This
+ * means, that the filesystem daemon is able to record the exact
+ * filesystem operations performed, and can also control the behavior
+ * of the requester process in otherwise impossible ways. For example
+ * it can delay the operation for arbitrary length of time allowing
+ * DoS against the requester.
+ *
+ * For this reason only those processes can call into the filesystem,
+ * for which the owner of the mount has ptrace privilege. This
+ * excludes processes started by other users, suid or sgid processes.
+ */
+int fuse_allow_current_process(struct fuse_conn *fc)
+{
+ const struct cred *cred;
+
+ if (fc->flags & FUSE_ALLOW_OTHER)
+ return 1;
+
+ cred = current_cred();
+ if (uid_eq(cred->euid, fc->user_id) &&
+ uid_eq(cred->suid, fc->user_id) &&
+ uid_eq(cred->uid, fc->user_id) &&
+ gid_eq(cred->egid, fc->group_id) &&
+ gid_eq(cred->sgid, fc->group_id) &&
+ gid_eq(cred->gid, fc->group_id))
+ return 1;
+
+ return 0;
+}
+
+static int fuse_access(struct inode *inode, int mask)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ FUSE_ARGS(args);
+ struct fuse_access_in inarg;
+ int err;
+
+ BUG_ON(mask & MAY_NOT_BLOCK);
+
+ if (fc->no_access)
+ return 0;
+
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.mask = mask & (MAY_READ | MAY_WRITE | MAY_EXEC);
+ args.in.h.opcode = FUSE_ACCESS;
+ args.in.h.nodeid = get_node_id(inode);
+ args.in.numargs = 1;
+ args.in.args[0].size = sizeof(inarg);
+ args.in.args[0].value = &inarg;
+ err = fuse_simple_request(fc, &args);
+ if (err == -ENOSYS) {
+ fc->no_access = 1;
+ err = 0;
+ }
+ return err;
+}
+
+static int fuse_perm_getattr(struct inode *inode, int mask)
+{
+ if (mask & MAY_NOT_BLOCK)
+ return -ECHILD;
+
+ return fuse_do_getattr(inode, NULL, NULL);
+}
+
+/*
+ * Check permission. The two basic access models of FUSE are:
+ *
+ * 1) Local access checking ('default_permissions' mount option) based
+ * on file mode. This is the plain old disk filesystem permission
+ * modell.
+ *
+ * 2) "Remote" access checking, where server is responsible for
+ * checking permission in each inode operation. An exception to this
+ * is if ->permission() was invoked from sys_access() in which case an
+ * access request is sent. Execute permission is still checked
+ * locally based on file mode.
+ */
+static int fuse_permission(struct inode *inode, int mask)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ bool refreshed = false;
+ int err = 0;
+
+ if (!fuse_allow_current_process(fc))
+ return -EACCES;
+
+ /*
+ * If attributes are needed, refresh them before proceeding
+ */
+ if ((fc->flags & FUSE_DEFAULT_PERMISSIONS) ||
+ ((mask & MAY_EXEC) && S_ISREG(inode->i_mode))) {
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ if (time_before64(fi->i_time, get_jiffies_64())) {
+ refreshed = true;
+
+ err = fuse_perm_getattr(inode, mask);
+ if (err)
+ return err;
+ }
+ }
+
+ if (fc->flags & FUSE_DEFAULT_PERMISSIONS) {
+ err = generic_permission(inode, mask);
+
+ /* If permission is denied, try to refresh file
+ attributes. This is also needed, because the root
+ node will at first have no permissions */
+ if (err == -EACCES && !refreshed) {
+ err = fuse_perm_getattr(inode, mask);
+ if (!err)
+ err = generic_permission(inode, mask);
+ }
+
+ /* Note: the opposite of the above test does not
+ exist. So if permissions are revoked this won't be
+ noticed immediately, only after the attribute
+ timeout has expired */
+ } else if (mask & (MAY_ACCESS | MAY_CHDIR)) {
+ err = fuse_access(inode, mask);
+ } else if ((mask & MAY_EXEC) && S_ISREG(inode->i_mode)) {
+ if (!(inode->i_mode & S_IXUGO)) {
+ if (refreshed)
+ return -EACCES;
+
+ err = fuse_perm_getattr(inode, mask);
+ if (!err && !(inode->i_mode & S_IXUGO))
+ return -EACCES;
+ }
+ }
+ return err;
+}
+
+static int parse_dirfile(char *buf, size_t nbytes, struct file *file,
+ struct dir_context *ctx)
+{
+ while (nbytes >= FUSE_NAME_OFFSET) {
+ struct fuse_dirent *dirent = (struct fuse_dirent *) buf;
+ size_t reclen = FUSE_DIRENT_SIZE(dirent);
+ if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX)
+ return -EIO;
+ if (reclen > nbytes)
+ break;
+ if (memchr(dirent->name, '/', dirent->namelen) != NULL)
+ return -EIO;
+
+ if (!dir_emit(ctx, dirent->name, dirent->namelen,
+ dirent->ino, dirent->type))
+ break;
+
+ buf += reclen;
+ nbytes -= reclen;
+ ctx->pos = dirent->off;
+ }
+
+ return 0;
+}
+
+static int fuse_direntplus_link(struct file *file,
+ struct fuse_direntplus *direntplus,
+ u64 attr_version)
+{
+ int err;
+ struct fuse_entry_out *o = &direntplus->entry_out;
+ struct fuse_dirent *dirent = &direntplus->dirent;
+ struct dentry *parent = file->f_path.dentry;
+ struct qstr name = QSTR_INIT(dirent->name, dirent->namelen);
+ struct dentry *dentry;
+ struct dentry *alias;
+ struct inode *dir = d_inode(parent);
+ struct fuse_conn *fc;
+ struct inode *inode;
+
+ if (!o->nodeid) {
+ /*
+ * Unlike in the case of fuse_lookup, zero nodeid does not mean
+ * ENOENT. Instead, it only means the userspace filesystem did
+ * not want to return attributes/handle for this entry.
+ *
+ * So do nothing.
+ */
+ return 0;
+ }
+
+ if (name.name[0] == '.') {
+ /*
+ * We could potentially refresh the attributes of the directory
+ * and its parent?
+ */
+ if (name.len == 1)
+ return 0;
+ if (name.name[1] == '.' && name.len == 2)
+ return 0;
+ }
+
+ if (invalid_nodeid(o->nodeid))
+ return -EIO;
+ if (!fuse_valid_type(o->attr.mode))
+ return -EIO;
+
+ fc = get_fuse_conn(dir);
+
+ name.hash = full_name_hash(name.name, name.len);
+ dentry = d_lookup(parent, &name);
+ if (dentry) {
+ inode = d_inode(dentry);
+ if (!inode) {
+ d_drop(dentry);
+ } else if (get_node_id(inode) != o->nodeid ||
+ ((o->attr.mode ^ inode->i_mode) & S_IFMT)) {
+ d_invalidate(dentry);
+ } else if (is_bad_inode(inode)) {
+ err = -EIO;
+ goto out;
+ } else {
+ struct fuse_inode *fi;
+ fi = get_fuse_inode(inode);
+ spin_lock(&fc->lock);
+ fi->nlookup++;
+ spin_unlock(&fc->lock);
+
+ fuse_change_attributes(inode, &o->attr,
+ entry_attr_timeout(o),
+ attr_version);
+
+ /*
+ * The other branch to 'found' comes via fuse_iget()
+ * which bumps nlookup inside
+ */
+ goto found;
+ }
+ dput(dentry);
+ }
+
+ dentry = d_alloc(parent, &name);
+ err = -ENOMEM;
+ if (!dentry)
+ goto out;
+
+ inode = fuse_iget(dir->i_sb, o->nodeid, o->generation,
+ &o->attr, entry_attr_timeout(o), attr_version);
+ if (!inode)
+ goto out;
+
+ alias = d_splice_alias(inode, dentry);
+ err = PTR_ERR(alias);
+ if (IS_ERR(alias))
+ goto out;
+
+ if (alias) {
+ dput(dentry);
+ dentry = alias;
+ }
+
+found:
+ if (fc->readdirplus_auto)
+ set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state);
+ fuse_change_entry_timeout(dentry, o);
+
+ err = 0;
+out:
+ dput(dentry);
+ return err;
+}
+
+static int parse_dirplusfile(char *buf, size_t nbytes, struct file *file,
+ struct dir_context *ctx, u64 attr_version)
+{
+ struct fuse_direntplus *direntplus;
+ struct fuse_dirent *dirent;
+ size_t reclen;
+ int over = 0;
+ int ret;
+
+ while (nbytes >= FUSE_NAME_OFFSET_DIRENTPLUS) {
+ direntplus = (struct fuse_direntplus *) buf;
+ dirent = &direntplus->dirent;
+ reclen = FUSE_DIRENTPLUS_SIZE(direntplus);
+
+ if (!dirent->namelen || dirent->namelen > FUSE_NAME_MAX)
+ return -EIO;
+ if (reclen > nbytes)
+ break;
+ if (memchr(dirent->name, '/', dirent->namelen) != NULL)
+ return -EIO;
+
+ if (!over) {
+ /* We fill entries into dstbuf only as much as
+ it can hold. But we still continue iterating
+ over remaining entries to link them. If not,
+ we need to send a FORGET for each of those
+ which we did not link.
+ */
+ over = !dir_emit(ctx, dirent->name, dirent->namelen,
+ dirent->ino, dirent->type);
+ ctx->pos = dirent->off;
+ }
+
+ buf += reclen;
+ nbytes -= reclen;
+
+ ret = fuse_direntplus_link(file, direntplus, attr_version);
+ if (ret)
+ fuse_force_forget(file, direntplus->entry_out.nodeid);
+ }
+
+ return 0;
+}
+
+static int fuse_readdir(struct file *file, struct dir_context *ctx)
+{
+ int plus, err;
+ size_t nbytes;
+ struct page *page;
+ struct inode *inode = file_inode(file);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_req *req;
+ u64 attr_version = 0;
+
+ if (is_bad_inode(inode))
+ return -EIO;
+
+ req = fuse_get_req(fc, 1);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ page = alloc_page(GFP_KERNEL);
+ if (!page) {
+ fuse_put_request(fc, req);
+ return -ENOMEM;
+ }
+
+ plus = fuse_use_readdirplus(inode, ctx);
+ req->out.argpages = 1;
+ req->num_pages = 1;
+ req->pages[0] = page;
+ req->page_descs[0].length = PAGE_SIZE;
+ if (plus) {
+ attr_version = fuse_get_attr_version(fc);
+ fuse_read_fill(req, file, ctx->pos, PAGE_SIZE,
+ FUSE_READDIRPLUS);
+ } else {
+ fuse_read_fill(req, file, ctx->pos, PAGE_SIZE,
+ FUSE_READDIR);
+ }
+ fuse_request_send(fc, req);
+ nbytes = req->out.args[0].size;
+ err = req->out.h.error;
+ fuse_put_request(fc, req);
+ if (!err) {
+ if (plus) {
+ err = parse_dirplusfile(page_address(page), nbytes,
+ file, ctx,
+ attr_version);
+ } else {
+ err = parse_dirfile(page_address(page), nbytes, file,
+ ctx);
+ }
+ }
+
+ __free_page(page);
+ fuse_invalidate_atime(inode);
+ return err;
+}
+
+static char *read_link(struct dentry *dentry)
+{
+ struct inode *inode = d_inode(dentry);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ FUSE_ARGS(args);
+ char *link;
+ ssize_t ret;
+
+ link = (char *) __get_free_page(GFP_KERNEL);
+ if (!link)
+ return ERR_PTR(-ENOMEM);
+
+ args.in.h.opcode = FUSE_READLINK;
+ args.in.h.nodeid = get_node_id(inode);
+ args.out.argvar = 1;
+ args.out.numargs = 1;
+ args.out.args[0].size = PAGE_SIZE - 1;
+ args.out.args[0].value = link;
+ ret = fuse_simple_request(fc, &args);
+ if (ret < 0) {
+ free_page((unsigned long) link);
+ link = ERR_PTR(ret);
+ } else {
+ link[ret] = '\0';
+ }
+ fuse_invalidate_atime(inode);
+ return link;
+}
+
+static void free_link(char *link)
+{
+ if (!IS_ERR(link))
+ free_page((unsigned long) link);
+}
+
+static void *fuse_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+ nd_set_link(nd, read_link(dentry));
+ return NULL;
+}
+
+static void fuse_put_link(struct dentry *dentry, struct nameidata *nd, void *c)
+{
+ free_link(nd_get_link(nd));
+}
+
+static int fuse_dir_open(struct inode *inode, struct file *file)
+{
+ return fuse_open_common(inode, file, true);
+}
+
+static int fuse_dir_release(struct inode *inode, struct file *file)
+{
+ fuse_release_common(file, FUSE_RELEASEDIR);
+
+ return 0;
+}
+
+static int fuse_dir_fsync(struct file *file, loff_t start, loff_t end,
+ int datasync)
+{
+ return fuse_fsync_common(file, start, end, datasync, 1);
+}
+
+static long fuse_dir_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct fuse_conn *fc = get_fuse_conn(file->f_mapping->host);
+
+ /* FUSE_IOCTL_DIR only supported for API version >= 7.18 */
+ if (fc->minor < 18)
+ return -ENOTTY;
+
+ return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_DIR);
+}
+
+static long fuse_dir_compat_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ struct fuse_conn *fc = get_fuse_conn(file->f_mapping->host);
+
+ if (fc->minor < 18)
+ return -ENOTTY;
+
+ return fuse_ioctl_common(file, cmd, arg,
+ FUSE_IOCTL_COMPAT | FUSE_IOCTL_DIR);
+}
+
+static bool update_mtime(unsigned ivalid, bool trust_local_mtime)
+{
+ /* Always update if mtime is explicitly set */
+ if (ivalid & ATTR_MTIME_SET)
+ return true;
+
+ /* Or if kernel i_mtime is the official one */
+ if (trust_local_mtime)
+ return true;
+
+ /* If it's an open(O_TRUNC) or an ftruncate(), don't update */
+ if ((ivalid & ATTR_SIZE) && (ivalid & (ATTR_OPEN | ATTR_FILE)))
+ return false;
+
+ /* In all other cases update */
+ return true;
+}
+
+static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg,
+ bool trust_local_cmtime)
+{
+ unsigned ivalid = iattr->ia_valid;
+
+ if (ivalid & ATTR_MODE)
+ arg->valid |= FATTR_MODE, arg->mode = iattr->ia_mode;
+ if (ivalid & ATTR_UID)
+ arg->valid |= FATTR_UID, arg->uid = from_kuid(&init_user_ns, iattr->ia_uid);
+ if (ivalid & ATTR_GID)
+ arg->valid |= FATTR_GID, arg->gid = from_kgid(&init_user_ns, iattr->ia_gid);
+ if (ivalid & ATTR_SIZE)
+ arg->valid |= FATTR_SIZE, arg->size = iattr->ia_size;
+ if (ivalid & ATTR_ATIME) {
+ arg->valid |= FATTR_ATIME;
+ arg->atime = iattr->ia_atime.tv_sec;
+ arg->atimensec = iattr->ia_atime.tv_nsec;
+ if (!(ivalid & ATTR_ATIME_SET))
+ arg->valid |= FATTR_ATIME_NOW;
+ }
+ if ((ivalid & ATTR_MTIME) && update_mtime(ivalid, trust_local_cmtime)) {
+ arg->valid |= FATTR_MTIME;
+ arg->mtime = iattr->ia_mtime.tv_sec;
+ arg->mtimensec = iattr->ia_mtime.tv_nsec;
+ if (!(ivalid & ATTR_MTIME_SET) && !trust_local_cmtime)
+ arg->valid |= FATTR_MTIME_NOW;
+ }
+ if ((ivalid & ATTR_CTIME) && trust_local_cmtime) {
+ arg->valid |= FATTR_CTIME;
+ arg->ctime = iattr->ia_ctime.tv_sec;
+ arg->ctimensec = iattr->ia_ctime.tv_nsec;
+ }
+}
+
+/*
+ * Prevent concurrent writepages on inode
+ *
+ * This is done by adding a negative bias to the inode write counter
+ * and waiting for all pending writes to finish.
+ */
+void fuse_set_nowrite(struct inode *inode)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ BUG_ON(!mutex_is_locked(&inode->i_mutex));
+
+ spin_lock(&fc->lock);
+ BUG_ON(fi->writectr < 0);
+ fi->writectr += FUSE_NOWRITE;
+ spin_unlock(&fc->lock);
+ wait_event(fi->page_waitq, fi->writectr == FUSE_NOWRITE);
+}
+
+/*
+ * Allow writepages on inode
+ *
+ * Remove the bias from the writecounter and send any queued
+ * writepages.
+ */
+static void __fuse_release_nowrite(struct inode *inode)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ BUG_ON(fi->writectr != FUSE_NOWRITE);
+ fi->writectr = 0;
+ fuse_flush_writepages(inode);
+}
+
+void fuse_release_nowrite(struct inode *inode)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ spin_lock(&fc->lock);
+ __fuse_release_nowrite(inode);
+ spin_unlock(&fc->lock);
+}
+
+static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_args *args,
+ struct inode *inode,
+ struct fuse_setattr_in *inarg_p,
+ struct fuse_attr_out *outarg_p)
+{
+ args->in.h.opcode = FUSE_SETATTR;
+ args->in.h.nodeid = get_node_id(inode);
+ args->in.numargs = 1;
+ args->in.args[0].size = sizeof(*inarg_p);
+ args->in.args[0].value = inarg_p;
+ args->out.numargs = 1;
+ args->out.args[0].size = sizeof(*outarg_p);
+ args->out.args[0].value = outarg_p;
+}
+
+/*
+ * Flush inode->i_mtime to the server
+ */
+int fuse_flush_times(struct inode *inode, struct fuse_file *ff)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ FUSE_ARGS(args);
+ struct fuse_setattr_in inarg;
+ struct fuse_attr_out outarg;
+
+ memset(&inarg, 0, sizeof(inarg));
+ memset(&outarg, 0, sizeof(outarg));
+
+ inarg.valid = FATTR_MTIME;
+ inarg.mtime = inode->i_mtime.tv_sec;
+ inarg.mtimensec = inode->i_mtime.tv_nsec;
+ if (fc->minor >= 23) {
+ inarg.valid |= FATTR_CTIME;
+ inarg.ctime = inode->i_ctime.tv_sec;
+ inarg.ctimensec = inode->i_ctime.tv_nsec;
+ }
+ if (ff) {
+ inarg.valid |= FATTR_FH;
+ inarg.fh = ff->fh;
+ }
+ fuse_setattr_fill(fc, &args, inode, &inarg, &outarg);
+
+ return fuse_simple_request(fc, &args);
+}
+
+/*
+ * Set attributes, and at the same time refresh them.
+ *
+ * Truncation is slightly complicated, because the 'truncate' request
+ * may fail, in which case we don't want to touch the mapping.
+ * vmtruncate() doesn't allow for this case, so do the rlimit checking
+ * and the actual truncation by hand.
+ */
+int fuse_do_setattr(struct inode *inode, struct iattr *attr,
+ struct file *file)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ FUSE_ARGS(args);
+ struct fuse_setattr_in inarg;
+ struct fuse_attr_out outarg;
+ bool is_truncate = false;
+ bool is_wb = fc->writeback_cache;
+ loff_t oldsize;
+ int err;
+ bool trust_local_cmtime = is_wb && S_ISREG(inode->i_mode);
+
+ if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS))
+ attr->ia_valid |= ATTR_FORCE;
+
+ err = inode_change_ok(inode, attr);
+ if (err)
+ return err;
+
+ if (attr->ia_valid & ATTR_OPEN) {
+ if (fc->atomic_o_trunc)
+ return 0;
+ file = NULL;
+ }
+
+ if (attr->ia_valid & ATTR_SIZE)
+ is_truncate = true;
+
+ if (is_truncate) {
+ fuse_set_nowrite(inode);
+ set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+ if (trust_local_cmtime && attr->ia_size != inode->i_size)
+ attr->ia_valid |= ATTR_MTIME | ATTR_CTIME;
+ }
+
+ memset(&inarg, 0, sizeof(inarg));
+ memset(&outarg, 0, sizeof(outarg));
+ iattr_to_fattr(attr, &inarg, trust_local_cmtime);
+ if (file) {
+ struct fuse_file *ff = file->private_data;
+ inarg.valid |= FATTR_FH;
+ inarg.fh = ff->fh;
+ }
+ if (attr->ia_valid & ATTR_SIZE) {
+ /* For mandatory locking in truncate */
+ inarg.valid |= FATTR_LOCKOWNER;
+ inarg.lock_owner = fuse_lock_owner_id(fc, current->files);
+ }
+ fuse_setattr_fill(fc, &args, inode, &inarg, &outarg);
+ err = fuse_simple_request(fc, &args);
+ if (err) {
+ if (err == -EINTR)
+ fuse_invalidate_attr(inode);
+ goto error;
+ }
+
+ if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) {
+ make_bad_inode(inode);
+ err = -EIO;
+ goto error;
+ }
+
+ spin_lock(&fc->lock);
+ /* the kernel maintains i_mtime locally */
+ if (trust_local_cmtime) {
+ if (attr->ia_valid & ATTR_MTIME)
+ inode->i_mtime = attr->ia_mtime;
+ if (attr->ia_valid & ATTR_CTIME)
+ inode->i_ctime = attr->ia_ctime;
+ /* FIXME: clear I_DIRTY_SYNC? */
+ }
+
+ fuse_change_attributes_common(inode, &outarg.attr,
+ attr_timeout(&outarg));
+ oldsize = inode->i_size;
+ /* see the comment in fuse_change_attributes() */
+ if (!is_wb || is_truncate || !S_ISREG(inode->i_mode))
+ i_size_write(inode, outarg.attr.size);
+
+ if (is_truncate) {
+ /* NOTE: this may release/reacquire fc->lock */
+ __fuse_release_nowrite(inode);
+ }
+ spin_unlock(&fc->lock);
+
+ /*
+ * Only call invalidate_inode_pages2() after removing
+ * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock.
+ */
+ if ((is_truncate || !is_wb) &&
+ S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
+ truncate_pagecache(inode, outarg.attr.size);
+ invalidate_inode_pages2(inode->i_mapping);
+ }
+
+ clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+ return 0;
+
+error:
+ if (is_truncate)
+ fuse_release_nowrite(inode);
+
+ clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+ return err;
+}
+
+static int fuse_setattr(struct dentry *entry, struct iattr *attr)
+{
+ struct inode *inode = d_inode(entry);
+
+ if (!fuse_allow_current_process(get_fuse_conn(inode)))
+ return -EACCES;
+
+ if (attr->ia_valid & ATTR_FILE)
+ return fuse_do_setattr(inode, attr, attr->ia_file);
+ else
+ return fuse_do_setattr(inode, attr, NULL);
+}
+
+static int fuse_getattr(struct vfsmount *mnt, struct dentry *entry,
+ struct kstat *stat)
+{
+ struct inode *inode = d_inode(entry);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ if (!fuse_allow_current_process(fc))
+ return -EACCES;
+
+ return fuse_update_attributes(inode, stat, NULL, NULL);
+}
+
+static int fuse_setxattr(struct dentry *entry, const char *name,
+ const void *value, size_t size, int flags)
+{
+ struct inode *inode = d_inode(entry);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ FUSE_ARGS(args);
+ struct fuse_setxattr_in inarg;
+ int err;
+
+ if (fc->no_setxattr)
+ return -EOPNOTSUPP;
+
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.size = size;
+ inarg.flags = flags;
+ args.in.h.opcode = FUSE_SETXATTR;
+ args.in.h.nodeid = get_node_id(inode);
+ args.in.numargs = 3;
+ args.in.args[0].size = sizeof(inarg);
+ args.in.args[0].value = &inarg;
+ args.in.args[1].size = strlen(name) + 1;
+ args.in.args[1].value = name;
+ args.in.args[2].size = size;
+ args.in.args[2].value = value;
+ err = fuse_simple_request(fc, &args);
+ if (err == -ENOSYS) {
+ fc->no_setxattr = 1;
+ err = -EOPNOTSUPP;
+ }
+ if (!err) {
+ fuse_invalidate_attr(inode);
+ fuse_update_ctime(inode);
+ }
+ return err;
+}
+
+static ssize_t fuse_getxattr(struct dentry *entry, const char *name,
+ void *value, size_t size)
+{
+ struct inode *inode = d_inode(entry);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ FUSE_ARGS(args);
+ struct fuse_getxattr_in inarg;
+ struct fuse_getxattr_out outarg;
+ ssize_t ret;
+
+ if (fc->no_getxattr)
+ return -EOPNOTSUPP;
+
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.size = size;
+ args.in.h.opcode = FUSE_GETXATTR;
+ args.in.h.nodeid = get_node_id(inode);
+ args.in.numargs = 2;
+ args.in.args[0].size = sizeof(inarg);
+ args.in.args[0].value = &inarg;
+ args.in.args[1].size = strlen(name) + 1;
+ args.in.args[1].value = name;
+ /* This is really two different operations rolled into one */
+ args.out.numargs = 1;
+ if (size) {
+ args.out.argvar = 1;
+ args.out.args[0].size = size;
+ args.out.args[0].value = value;
+ } else {
+ args.out.args[0].size = sizeof(outarg);
+ args.out.args[0].value = &outarg;
+ }
+ ret = fuse_simple_request(fc, &args);
+ if (!ret && !size)
+ ret = outarg.size;
+ if (ret == -ENOSYS) {
+ fc->no_getxattr = 1;
+ ret = -EOPNOTSUPP;
+ }
+ return ret;
+}
+
+static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
+{
+ struct inode *inode = d_inode(entry);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ FUSE_ARGS(args);
+ struct fuse_getxattr_in inarg;
+ struct fuse_getxattr_out outarg;
+ ssize_t ret;
+
+ if (!fuse_allow_current_process(fc))
+ return -EACCES;
+
+ if (fc->no_listxattr)
+ return -EOPNOTSUPP;
+
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.size = size;
+ args.in.h.opcode = FUSE_LISTXATTR;
+ args.in.h.nodeid = get_node_id(inode);
+ args.in.numargs = 1;
+ args.in.args[0].size = sizeof(inarg);
+ args.in.args[0].value = &inarg;
+ /* This is really two different operations rolled into one */
+ args.out.numargs = 1;
+ if (size) {
+ args.out.argvar = 1;
+ args.out.args[0].size = size;
+ args.out.args[0].value = list;
+ } else {
+ args.out.args[0].size = sizeof(outarg);
+ args.out.args[0].value = &outarg;
+ }
+ ret = fuse_simple_request(fc, &args);
+ if (!ret && !size)
+ ret = outarg.size;
+ if (ret == -ENOSYS) {
+ fc->no_listxattr = 1;
+ ret = -EOPNOTSUPP;
+ }
+ return ret;
+}
+
+static int fuse_removexattr(struct dentry *entry, const char *name)
+{
+ struct inode *inode = d_inode(entry);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ FUSE_ARGS(args);
+ int err;
+
+ if (fc->no_removexattr)
+ return -EOPNOTSUPP;
+
+ args.in.h.opcode = FUSE_REMOVEXATTR;
+ args.in.h.nodeid = get_node_id(inode);
+ args.in.numargs = 1;
+ args.in.args[0].size = strlen(name) + 1;
+ args.in.args[0].value = name;
+ err = fuse_simple_request(fc, &args);
+ if (err == -ENOSYS) {
+ fc->no_removexattr = 1;
+ err = -EOPNOTSUPP;
+ }
+ if (!err) {
+ fuse_invalidate_attr(inode);
+ fuse_update_ctime(inode);
+ }
+ return err;
+}
+
+static const struct inode_operations fuse_dir_inode_operations = {
+ .lookup = fuse_lookup,
+ .mkdir = fuse_mkdir,
+ .symlink = fuse_symlink,
+ .unlink = fuse_unlink,
+ .rmdir = fuse_rmdir,
+ .rename2 = fuse_rename2,
+ .link = fuse_link,
+ .setattr = fuse_setattr,
+ .create = fuse_create,
+ .atomic_open = fuse_atomic_open,
+ .mknod = fuse_mknod,
+ .permission = fuse_permission,
+ .getattr = fuse_getattr,
+ .setxattr = fuse_setxattr,
+ .getxattr = fuse_getxattr,
+ .listxattr = fuse_listxattr,
+ .removexattr = fuse_removexattr,
+};
+
+static const struct file_operations fuse_dir_operations = {
+ .llseek = generic_file_llseek,
+ .read = generic_read_dir,
+ .iterate = fuse_readdir,
+ .open = fuse_dir_open,
+ .release = fuse_dir_release,
+ .fsync = fuse_dir_fsync,
+ .unlocked_ioctl = fuse_dir_ioctl,
+ .compat_ioctl = fuse_dir_compat_ioctl,
+};
+
+static const struct inode_operations fuse_common_inode_operations = {
+ .setattr = fuse_setattr,
+ .permission = fuse_permission,
+ .getattr = fuse_getattr,
+ .setxattr = fuse_setxattr,
+ .getxattr = fuse_getxattr,
+ .listxattr = fuse_listxattr,
+ .removexattr = fuse_removexattr,
+};
+
+static const struct inode_operations fuse_symlink_inode_operations = {
+ .setattr = fuse_setattr,
+ .follow_link = fuse_follow_link,
+ .put_link = fuse_put_link,
+ .readlink = generic_readlink,
+ .getattr = fuse_getattr,
+ .setxattr = fuse_setxattr,
+ .getxattr = fuse_getxattr,
+ .listxattr = fuse_listxattr,
+ .removexattr = fuse_removexattr,
+};
+
+void fuse_init_common(struct inode *inode)
+{
+ inode->i_op = &fuse_common_inode_operations;
+}
+
+void fuse_init_dir(struct inode *inode)
+{
+ inode->i_op = &fuse_dir_inode_operations;
+ inode->i_fop = &fuse_dir_operations;
+}
+
+void fuse_init_symlink(struct inode *inode)
+{
+ inode->i_op = &fuse_symlink_inode_operations;
+}
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
new file mode 100644
index 000000000..5ef05b5c4
--- /dev/null
+++ b/fs/fuse/file.c
@@ -0,0 +1,2994 @@
+/*
+ FUSE: Filesystem in Userspace
+ Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
+
+ This program can be distributed under the terms of the GNU GPL.
+ See the file COPYING.
+*/
+
+#include "fuse_i.h"
+
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/compat.h>
+#include <linux/swap.h>
+#include <linux/falloc.h>
+#include <linux/uio.h>
+
+static const struct file_operations fuse_direct_io_file_operations;
+
+static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
+ int opcode, struct fuse_open_out *outargp)
+{
+ struct fuse_open_in inarg;
+ FUSE_ARGS(args);
+
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY);
+ if (!fc->atomic_o_trunc)
+ inarg.flags &= ~O_TRUNC;
+ args.in.h.opcode = opcode;
+ args.in.h.nodeid = nodeid;
+ args.in.numargs = 1;
+ args.in.args[0].size = sizeof(inarg);
+ args.in.args[0].value = &inarg;
+ args.out.numargs = 1;
+ args.out.args[0].size = sizeof(*outargp);
+ args.out.args[0].value = outargp;
+
+ return fuse_simple_request(fc, &args);
+}
+
+struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
+{
+ struct fuse_file *ff;
+
+ ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL);
+ if (unlikely(!ff))
+ return NULL;
+
+ ff->fc = fc;
+ ff->reserved_req = fuse_request_alloc(0);
+ if (unlikely(!ff->reserved_req)) {
+ kfree(ff);
+ return NULL;
+ }
+
+ INIT_LIST_HEAD(&ff->write_entry);
+ atomic_set(&ff->count, 0);
+ RB_CLEAR_NODE(&ff->polled_node);
+ init_waitqueue_head(&ff->poll_wait);
+
+ spin_lock(&fc->lock);
+ ff->kh = ++fc->khctr;
+ spin_unlock(&fc->lock);
+
+ return ff;
+}
+
+void fuse_file_free(struct fuse_file *ff)
+{
+ fuse_request_free(ff->reserved_req);
+ kfree(ff);
+}
+
+struct fuse_file *fuse_file_get(struct fuse_file *ff)
+{
+ atomic_inc(&ff->count);
+ return ff;
+}
+
+static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req)
+{
+ iput(req->misc.release.inode);
+}
+
+static void fuse_file_put(struct fuse_file *ff, bool sync)
+{
+ if (atomic_dec_and_test(&ff->count)) {
+ struct fuse_req *req = ff->reserved_req;
+
+ if (ff->fc->no_open) {
+ /*
+ * Drop the release request when client does not
+ * implement 'open'
+ */
+ req->background = 0;
+ iput(req->misc.release.inode);
+ fuse_put_request(ff->fc, req);
+ } else if (sync) {
+ req->background = 0;
+ fuse_request_send(ff->fc, req);
+ iput(req->misc.release.inode);
+ fuse_put_request(ff->fc, req);
+ } else {
+ req->end = fuse_release_end;
+ req->background = 1;
+ fuse_request_send_background(ff->fc, req);
+ }
+ kfree(ff);
+ }
+}
+
+int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
+ bool isdir)
+{
+ struct fuse_file *ff;
+ int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN;
+
+ ff = fuse_file_alloc(fc);
+ if (!ff)
+ return -ENOMEM;
+
+ ff->fh = 0;
+ ff->open_flags = FOPEN_KEEP_CACHE; /* Default for no-open */
+ if (!fc->no_open || isdir) {
+ struct fuse_open_out outarg;
+ int err;
+
+ err = fuse_send_open(fc, nodeid, file, opcode, &outarg);
+ if (!err) {
+ ff->fh = outarg.fh;
+ ff->open_flags = outarg.open_flags;
+
+ } else if (err != -ENOSYS || isdir) {
+ fuse_file_free(ff);
+ return err;
+ } else {
+ fc->no_open = 1;
+ }
+ }
+
+ if (isdir)
+ ff->open_flags &= ~FOPEN_DIRECT_IO;
+
+ ff->nodeid = nodeid;
+ file->private_data = fuse_file_get(ff);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(fuse_do_open);
+
+static void fuse_link_write_file(struct file *file)
+{
+ struct inode *inode = file_inode(file);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_file *ff = file->private_data;
+ /*
+ * file may be written through mmap, so chain it onto the
+ * inodes's write_file list
+ */
+ spin_lock(&fc->lock);
+ if (list_empty(&ff->write_entry))
+ list_add(&ff->write_entry, &fi->write_files);
+ spin_unlock(&fc->lock);
+}
+
+void fuse_finish_open(struct inode *inode, struct file *file)
+{
+ struct fuse_file *ff = file->private_data;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ if (ff->open_flags & FOPEN_DIRECT_IO)
+ file->f_op = &fuse_direct_io_file_operations;
+ if (!(ff->open_flags & FOPEN_KEEP_CACHE))
+ invalidate_inode_pages2(inode->i_mapping);
+ if (ff->open_flags & FOPEN_NONSEEKABLE)
+ nonseekable_open(inode, file);
+ if (fc->atomic_o_trunc && (file->f_flags & O_TRUNC)) {
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ spin_lock(&fc->lock);
+ fi->attr_version = ++fc->attr_version;
+ i_size_write(inode, 0);
+ spin_unlock(&fc->lock);
+ fuse_invalidate_attr(inode);
+ if (fc->writeback_cache)
+ file_update_time(file);
+ }
+ if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
+ fuse_link_write_file(file);
+}
+
+int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ int err;
+ bool lock_inode = (file->f_flags & O_TRUNC) &&
+ fc->atomic_o_trunc &&
+ fc->writeback_cache;
+
+ err = generic_file_open(inode, file);
+ if (err)
+ return err;
+
+ if (lock_inode)
+ mutex_lock(&inode->i_mutex);
+
+ err = fuse_do_open(fc, get_node_id(inode), file, isdir);
+
+ if (!err)
+ fuse_finish_open(inode, file);
+
+ if (lock_inode)
+ mutex_unlock(&inode->i_mutex);
+
+ return err;
+}
+
+static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode)
+{
+ struct fuse_conn *fc = ff->fc;
+ struct fuse_req *req = ff->reserved_req;
+ struct fuse_release_in *inarg = &req->misc.release.in;
+
+ spin_lock(&fc->lock);
+ list_del(&ff->write_entry);
+ if (!RB_EMPTY_NODE(&ff->polled_node))
+ rb_erase(&ff->polled_node, &fc->polled_files);
+ spin_unlock(&fc->lock);
+
+ wake_up_interruptible_all(&ff->poll_wait);
+
+ inarg->fh = ff->fh;
+ inarg->flags = flags;
+ req->in.h.opcode = opcode;
+ req->in.h.nodeid = ff->nodeid;
+ req->in.numargs = 1;
+ req->in.args[0].size = sizeof(struct fuse_release_in);
+ req->in.args[0].value = inarg;
+}
+
+void fuse_release_common(struct file *file, int opcode)
+{
+ struct fuse_file *ff;
+ struct fuse_req *req;
+
+ ff = file->private_data;
+ if (unlikely(!ff))
+ return;
+
+ req = ff->reserved_req;
+ fuse_prepare_release(ff, file->f_flags, opcode);
+
+ if (ff->flock) {
+ struct fuse_release_in *inarg = &req->misc.release.in;
+ inarg->release_flags |= FUSE_RELEASE_FLOCK_UNLOCK;
+ inarg->lock_owner = fuse_lock_owner_id(ff->fc,
+ (fl_owner_t) file);
+ }
+ /* Hold inode until release is finished */
+ req->misc.release.inode = igrab(file_inode(file));
+
+ /*
+ * Normally this will send the RELEASE request, however if
+ * some asynchronous READ or WRITE requests are outstanding,
+ * the sending will be delayed.
+ *
+ * Make the release synchronous if this is a fuseblk mount,
+ * synchronous RELEASE is allowed (and desirable) in this case
+ * because the server can be trusted not to screw up.
+ */
+ fuse_file_put(ff, ff->fc->destroy_req != NULL);
+}
+
+static int fuse_open(struct inode *inode, struct file *file)
+{
+ return fuse_open_common(inode, file, false);
+}
+
+static int fuse_release(struct inode *inode, struct file *file)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ /* see fuse_vma_close() for !writeback_cache case */
+ if (fc->writeback_cache)
+ write_inode_now(inode, 1);
+
+ fuse_release_common(file, FUSE_RELEASE);
+
+ /* return value is ignored by VFS */
+ return 0;
+}
+
+void fuse_sync_release(struct fuse_file *ff, int flags)
+{
+ WARN_ON(atomic_read(&ff->count) > 1);
+ fuse_prepare_release(ff, flags, FUSE_RELEASE);
+ ff->reserved_req->force = 1;
+ ff->reserved_req->background = 0;
+ fuse_request_send(ff->fc, ff->reserved_req);
+ fuse_put_request(ff->fc, ff->reserved_req);
+ kfree(ff);
+}
+EXPORT_SYMBOL_GPL(fuse_sync_release);
+
+/*
+ * Scramble the ID space with XTEA, so that the value of the files_struct
+ * pointer is not exposed to userspace.
+ */
+u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
+{
+ u32 *k = fc->scramble_key;
+ u64 v = (unsigned long) id;
+ u32 v0 = v;
+ u32 v1 = v >> 32;
+ u32 sum = 0;
+ int i;
+
+ for (i = 0; i < 32; i++) {
+ v0 += ((v1 << 4 ^ v1 >> 5) + v1) ^ (sum + k[sum & 3]);
+ sum += 0x9E3779B9;
+ v1 += ((v0 << 4 ^ v0 >> 5) + v0) ^ (sum + k[sum>>11 & 3]);
+ }
+
+ return (u64) v0 + ((u64) v1 << 32);
+}
+
+/*
+ * Check if any page in a range is under writeback
+ *
+ * This is currently done by walking the list of writepage requests
+ * for the inode, which can be pretty inefficient.
+ */
+static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
+ pgoff_t idx_to)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_req *req;
+ bool found = false;
+
+ spin_lock(&fc->lock);
+ list_for_each_entry(req, &fi->writepages, writepages_entry) {
+ pgoff_t curr_index;
+
+ BUG_ON(req->inode != inode);
+ curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT;
+ if (idx_from < curr_index + req->num_pages &&
+ curr_index <= idx_to) {
+ found = true;
+ break;
+ }
+ }
+ spin_unlock(&fc->lock);
+
+ return found;
+}
+
+static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
+{
+ return fuse_range_is_writeback(inode, index, index);
+}
+
+/*
+ * Wait for page writeback to be completed.
+ *
+ * Since fuse doesn't rely on the VM writeback tracking, this has to
+ * use some other means.
+ */
+static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index));
+ return 0;
+}
+
+/*
+ * Wait for all pending writepages on the inode to finish.
+ *
+ * This is currently done by blocking further writes with FUSE_NOWRITE
+ * and waiting for all sent writes to complete.
+ *
+ * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage
+ * could conflict with truncation.
+ */
+static void fuse_sync_writes(struct inode *inode)
+{
+ fuse_set_nowrite(inode);
+ fuse_release_nowrite(inode);
+}
+
+static int fuse_flush(struct file *file, fl_owner_t id)
+{
+ struct inode *inode = file_inode(file);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_file *ff = file->private_data;
+ struct fuse_req *req;
+ struct fuse_flush_in inarg;
+ int err;
+
+ if (is_bad_inode(inode))
+ return -EIO;
+
+ if (fc->no_flush)
+ return 0;
+
+ err = write_inode_now(inode, 1);
+ if (err)
+ return err;
+
+ mutex_lock(&inode->i_mutex);
+ fuse_sync_writes(inode);
+ mutex_unlock(&inode->i_mutex);
+
+ req = fuse_get_req_nofail_nopages(fc, file);
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.fh = ff->fh;
+ inarg.lock_owner = fuse_lock_owner_id(fc, id);
+ req->in.h.opcode = FUSE_FLUSH;
+ req->in.h.nodeid = get_node_id(inode);
+ req->in.numargs = 1;
+ req->in.args[0].size = sizeof(inarg);
+ req->in.args[0].value = &inarg;
+ req->force = 1;
+ fuse_request_send(fc, req);
+ err = req->out.h.error;
+ fuse_put_request(fc, req);
+ if (err == -ENOSYS) {
+ fc->no_flush = 1;
+ err = 0;
+ }
+ return err;
+}
+
+int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
+ int datasync, int isdir)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_file *ff = file->private_data;
+ FUSE_ARGS(args);
+ struct fuse_fsync_in inarg;
+ int err;
+
+ if (is_bad_inode(inode))
+ return -EIO;
+
+ mutex_lock(&inode->i_mutex);
+
+ /*
+ * Start writeback against all dirty pages of the inode, then
+ * wait for all outstanding writes, before sending the FSYNC
+ * request.
+ */
+ err = filemap_write_and_wait_range(inode->i_mapping, start, end);
+ if (err)
+ goto out;
+
+ fuse_sync_writes(inode);
+ err = sync_inode_metadata(inode, 1);
+ if (err)
+ goto out;
+
+ if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir))
+ goto out;
+
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.fh = ff->fh;
+ inarg.fsync_flags = datasync ? 1 : 0;
+ args.in.h.opcode = isdir ? FUSE_FSYNCDIR : FUSE_FSYNC;
+ args.in.h.nodeid = get_node_id(inode);
+ args.in.numargs = 1;
+ args.in.args[0].size = sizeof(inarg);
+ args.in.args[0].value = &inarg;
+ err = fuse_simple_request(fc, &args);
+ if (err == -ENOSYS) {
+ if (isdir)
+ fc->no_fsyncdir = 1;
+ else
+ fc->no_fsync = 1;
+ err = 0;
+ }
+out:
+ mutex_unlock(&inode->i_mutex);
+ return err;
+}
+
+static int fuse_fsync(struct file *file, loff_t start, loff_t end,
+ int datasync)
+{
+ return fuse_fsync_common(file, start, end, datasync, 0);
+}
+
+void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos,
+ size_t count, int opcode)
+{
+ struct fuse_read_in *inarg = &req->misc.read.in;
+ struct fuse_file *ff = file->private_data;
+
+ inarg->fh = ff->fh;
+ inarg->offset = pos;
+ inarg->size = count;
+ inarg->flags = file->f_flags;
+ req->in.h.opcode = opcode;
+ req->in.h.nodeid = ff->nodeid;
+ req->in.numargs = 1;
+ req->in.args[0].size = sizeof(struct fuse_read_in);
+ req->in.args[0].value = inarg;
+ req->out.argvar = 1;
+ req->out.numargs = 1;
+ req->out.args[0].size = count;
+}
+
+static void fuse_release_user_pages(struct fuse_req *req, int write)
+{
+ unsigned i;
+
+ for (i = 0; i < req->num_pages; i++) {
+ struct page *page = req->pages[i];
+ if (write)
+ set_page_dirty_lock(page);
+ put_page(page);
+ }
+}
+
+static ssize_t fuse_get_res_by_io(struct fuse_io_priv *io)
+{
+ if (io->err)
+ return io->err;
+
+ if (io->bytes >= 0 && io->write)
+ return -EIO;
+
+ return io->bytes < 0 ? io->size : io->bytes;
+}
+
+/**
+ * In case of short read, the caller sets 'pos' to the position of
+ * actual end of fuse request in IO request. Otherwise, if bytes_requested
+ * == bytes_transferred or rw == WRITE, the caller sets 'pos' to -1.
+ *
+ * An example:
+ * User requested DIO read of 64K. It was splitted into two 32K fuse requests,
+ * both submitted asynchronously. The first of them was ACKed by userspace as
+ * fully completed (req->out.args[0].size == 32K) resulting in pos == -1. The
+ * second request was ACKed as short, e.g. only 1K was read, resulting in
+ * pos == 33K.
+ *
+ * Thus, when all fuse requests are completed, the minimal non-negative 'pos'
+ * will be equal to the length of the longest contiguous fragment of
+ * transferred data starting from the beginning of IO request.
+ */
+static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
+{
+ bool is_sync = is_sync_kiocb(io->iocb);
+ int left;
+
+ spin_lock(&io->lock);
+ if (err)
+ io->err = io->err ? : err;
+ else if (pos >= 0 && (io->bytes < 0 || pos < io->bytes))
+ io->bytes = pos;
+
+ left = --io->reqs;
+ if (!left && is_sync)
+ complete(io->done);
+ spin_unlock(&io->lock);
+
+ if (!left && !is_sync) {
+ ssize_t res = fuse_get_res_by_io(io);
+
+ if (res >= 0) {
+ struct inode *inode = file_inode(io->iocb->ki_filp);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ spin_lock(&fc->lock);
+ fi->attr_version = ++fc->attr_version;
+ spin_unlock(&fc->lock);
+ }
+
+ io->iocb->ki_complete(io->iocb, res, 0);
+ kfree(io);
+ }
+}
+
+static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_req *req)
+{
+ struct fuse_io_priv *io = req->io;
+ ssize_t pos = -1;
+
+ fuse_release_user_pages(req, !io->write);
+
+ if (io->write) {
+ if (req->misc.write.in.size != req->misc.write.out.size)
+ pos = req->misc.write.in.offset - io->offset +
+ req->misc.write.out.size;
+ } else {
+ if (req->misc.read.in.size != req->out.args[0].size)
+ pos = req->misc.read.in.offset - io->offset +
+ req->out.args[0].size;
+ }
+
+ fuse_aio_complete(io, req->out.h.error, pos);
+}
+
+static size_t fuse_async_req_send(struct fuse_conn *fc, struct fuse_req *req,
+ size_t num_bytes, struct fuse_io_priv *io)
+{
+ spin_lock(&io->lock);
+ io->size += num_bytes;
+ io->reqs++;
+ spin_unlock(&io->lock);
+
+ req->io = io;
+ req->end = fuse_aio_complete_req;
+
+ __fuse_get_request(req);
+ fuse_request_send_background(fc, req);
+
+ return num_bytes;
+}
+
+static size_t fuse_send_read(struct fuse_req *req, struct fuse_io_priv *io,
+ loff_t pos, size_t count, fl_owner_t owner)
+{
+ struct file *file = io->file;
+ struct fuse_file *ff = file->private_data;
+ struct fuse_conn *fc = ff->fc;
+
+ fuse_read_fill(req, file, pos, count, FUSE_READ);
+ if (owner != NULL) {
+ struct fuse_read_in *inarg = &req->misc.read.in;
+
+ inarg->read_flags |= FUSE_READ_LOCKOWNER;
+ inarg->lock_owner = fuse_lock_owner_id(fc, owner);
+ }
+
+ if (io->async)
+ return fuse_async_req_send(fc, req, count, io);
+
+ fuse_request_send(fc, req);
+ return req->out.args[0].size;
+}
+
+static void fuse_read_update_size(struct inode *inode, loff_t size,
+ u64 attr_ver)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ spin_lock(&fc->lock);
+ if (attr_ver == fi->attr_version && size < inode->i_size &&
+ !test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
+ fi->attr_version = ++fc->attr_version;
+ i_size_write(inode, size);
+ }
+ spin_unlock(&fc->lock);
+}
+
+static void fuse_short_read(struct fuse_req *req, struct inode *inode,
+ u64 attr_ver)
+{
+ size_t num_read = req->out.args[0].size;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ if (fc->writeback_cache) {
+ /*
+ * A hole in a file. Some data after the hole are in page cache,
+ * but have not reached the client fs yet. So, the hole is not
+ * present there.
+ */
+ int i;
+ int start_idx = num_read >> PAGE_CACHE_SHIFT;
+ size_t off = num_read & (PAGE_CACHE_SIZE - 1);
+
+ for (i = start_idx; i < req->num_pages; i++) {
+ zero_user_segment(req->pages[i], off, PAGE_CACHE_SIZE);
+ off = 0;
+ }
+ } else {
+ loff_t pos = page_offset(req->pages[0]) + num_read;
+ fuse_read_update_size(inode, pos, attr_ver);
+ }
+}
+
+static int fuse_do_readpage(struct file *file, struct page *page)
+{
+ struct fuse_io_priv io = { .async = 0, .file = file };
+ struct inode *inode = page->mapping->host;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_req *req;
+ size_t num_read;
+ loff_t pos = page_offset(page);
+ size_t count = PAGE_CACHE_SIZE;
+ u64 attr_ver;
+ int err;
+
+ /*
+ * Page writeback can extend beyond the lifetime of the
+ * page-cache page, so make sure we read a properly synced
+ * page.
+ */
+ fuse_wait_on_page_writeback(inode, page->index);
+
+ req = fuse_get_req(fc, 1);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ attr_ver = fuse_get_attr_version(fc);
+
+ req->out.page_zeroing = 1;
+ req->out.argpages = 1;
+ req->num_pages = 1;
+ req->pages[0] = page;
+ req->page_descs[0].length = count;
+ num_read = fuse_send_read(req, &io, pos, count, NULL);
+ err = req->out.h.error;
+
+ if (!err) {
+ /*
+ * Short read means EOF. If file size is larger, truncate it
+ */
+ if (num_read < count)
+ fuse_short_read(req, inode, attr_ver);
+
+ SetPageUptodate(page);
+ }
+
+ fuse_put_request(fc, req);
+
+ return err;
+}
+
+static int fuse_readpage(struct file *file, struct page *page)
+{
+ struct inode *inode = page->mapping->host;
+ int err;
+
+ err = -EIO;
+ if (is_bad_inode(inode))
+ goto out;
+
+ err = fuse_do_readpage(file, page);
+ fuse_invalidate_atime(inode);
+ out:
+ unlock_page(page);
+ return err;
+}
+
+static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
+{
+ int i;
+ size_t count = req->misc.read.in.size;
+ size_t num_read = req->out.args[0].size;
+ struct address_space *mapping = NULL;
+
+ for (i = 0; mapping == NULL && i < req->num_pages; i++)
+ mapping = req->pages[i]->mapping;
+
+ if (mapping) {
+ struct inode *inode = mapping->host;
+
+ /*
+ * Short read means EOF. If file size is larger, truncate it
+ */
+ if (!req->out.h.error && num_read < count)
+ fuse_short_read(req, inode, req->misc.read.attr_ver);
+
+ fuse_invalidate_atime(inode);
+ }
+
+ for (i = 0; i < req->num_pages; i++) {
+ struct page *page = req->pages[i];
+ if (!req->out.h.error)
+ SetPageUptodate(page);
+ else
+ SetPageError(page);
+ unlock_page(page);
+ page_cache_release(page);
+ }
+ if (req->ff)
+ fuse_file_put(req->ff, false);
+}
+
+static void fuse_send_readpages(struct fuse_req *req, struct file *file)
+{
+ struct fuse_file *ff = file->private_data;
+ struct fuse_conn *fc = ff->fc;
+ loff_t pos = page_offset(req->pages[0]);
+ size_t count = req->num_pages << PAGE_CACHE_SHIFT;
+
+ req->out.argpages = 1;
+ req->out.page_zeroing = 1;
+ req->out.page_replace = 1;
+ fuse_read_fill(req, file, pos, count, FUSE_READ);
+ req->misc.read.attr_ver = fuse_get_attr_version(fc);
+ if (fc->async_read) {
+ req->ff = fuse_file_get(ff);
+ req->end = fuse_readpages_end;
+ fuse_request_send_background(fc, req);
+ } else {
+ fuse_request_send(fc, req);
+ fuse_readpages_end(fc, req);
+ fuse_put_request(fc, req);
+ }
+}
+
+struct fuse_fill_data {
+ struct fuse_req *req;
+ struct file *file;
+ struct inode *inode;
+ unsigned nr_pages;
+};
+
+static int fuse_readpages_fill(void *_data, struct page *page)
+{
+ struct fuse_fill_data *data = _data;
+ struct fuse_req *req = data->req;
+ struct inode *inode = data->inode;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ fuse_wait_on_page_writeback(inode, page->index);
+
+ if (req->num_pages &&
+ (req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
+ (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read ||
+ req->pages[req->num_pages - 1]->index + 1 != page->index)) {
+ int nr_alloc = min_t(unsigned, data->nr_pages,
+ FUSE_MAX_PAGES_PER_REQ);
+ fuse_send_readpages(req, data->file);
+ if (fc->async_read)
+ req = fuse_get_req_for_background(fc, nr_alloc);
+ else
+ req = fuse_get_req(fc, nr_alloc);
+
+ data->req = req;
+ if (IS_ERR(req)) {
+ unlock_page(page);
+ return PTR_ERR(req);
+ }
+ }
+
+ if (WARN_ON(req->num_pages >= req->max_pages)) {
+ fuse_put_request(fc, req);
+ return -EIO;
+ }
+
+ page_cache_get(page);
+ req->pages[req->num_pages] = page;
+ req->page_descs[req->num_pages].length = PAGE_SIZE;
+ req->num_pages++;
+ data->nr_pages--;
+ return 0;
+}
+
+static int fuse_readpages(struct file *file, struct address_space *mapping,
+ struct list_head *pages, unsigned nr_pages)
+{
+ struct inode *inode = mapping->host;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_fill_data data;
+ int err;
+ int nr_alloc = min_t(unsigned, nr_pages, FUSE_MAX_PAGES_PER_REQ);
+
+ err = -EIO;
+ if (is_bad_inode(inode))
+ goto out;
+
+ data.file = file;
+ data.inode = inode;
+ if (fc->async_read)
+ data.req = fuse_get_req_for_background(fc, nr_alloc);
+ else
+ data.req = fuse_get_req(fc, nr_alloc);
+ data.nr_pages = nr_pages;
+ err = PTR_ERR(data.req);
+ if (IS_ERR(data.req))
+ goto out;
+
+ err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data);
+ if (!err) {
+ if (data.req->num_pages)
+ fuse_send_readpages(data.req, file);
+ else
+ fuse_put_request(fc, data.req);
+ }
+out:
+ return err;
+}
+
+static ssize_t fuse_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct inode *inode = iocb->ki_filp->f_mapping->host;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ /*
+ * In auto invalidate mode, always update attributes on read.
+ * Otherwise, only update if we attempt to read past EOF (to ensure
+ * i_size is up to date).
+ */
+ if (fc->auto_inval_data ||
+ (iocb->ki_pos + iov_iter_count(to) > i_size_read(inode))) {
+ int err;
+ err = fuse_update_attributes(inode, NULL, iocb->ki_filp, NULL);
+ if (err)
+ return err;
+ }
+
+ return generic_file_read_iter(iocb, to);
+}
+
+static void fuse_write_fill(struct fuse_req *req, struct fuse_file *ff,
+ loff_t pos, size_t count)
+{
+ struct fuse_write_in *inarg = &req->misc.write.in;
+ struct fuse_write_out *outarg = &req->misc.write.out;
+
+ inarg->fh = ff->fh;
+ inarg->offset = pos;
+ inarg->size = count;
+ req->in.h.opcode = FUSE_WRITE;
+ req->in.h.nodeid = ff->nodeid;
+ req->in.numargs = 2;
+ if (ff->fc->minor < 9)
+ req->in.args[0].size = FUSE_COMPAT_WRITE_IN_SIZE;
+ else
+ req->in.args[0].size = sizeof(struct fuse_write_in);
+ req->in.args[0].value = inarg;
+ req->in.args[1].size = count;
+ req->out.numargs = 1;
+ req->out.args[0].size = sizeof(struct fuse_write_out);
+ req->out.args[0].value = outarg;
+}
+
+static size_t fuse_send_write(struct fuse_req *req, struct fuse_io_priv *io,
+ loff_t pos, size_t count, fl_owner_t owner)
+{
+ struct file *file = io->file;
+ struct fuse_file *ff = file->private_data;
+ struct fuse_conn *fc = ff->fc;
+ struct fuse_write_in *inarg = &req->misc.write.in;
+
+ fuse_write_fill(req, ff, pos, count);
+ inarg->flags = file->f_flags;
+ if (owner != NULL) {
+ inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
+ inarg->lock_owner = fuse_lock_owner_id(fc, owner);
+ }
+
+ if (io->async)
+ return fuse_async_req_send(fc, req, count, io);
+
+ fuse_request_send(fc, req);
+ return req->misc.write.out.size;
+}
+
+bool fuse_write_update_size(struct inode *inode, loff_t pos)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ bool ret = false;
+
+ spin_lock(&fc->lock);
+ fi->attr_version = ++fc->attr_version;
+ if (pos > inode->i_size) {
+ i_size_write(inode, pos);
+ ret = true;
+ }
+ spin_unlock(&fc->lock);
+
+ return ret;
+}
+
+static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file,
+ struct inode *inode, loff_t pos,
+ size_t count)
+{
+ size_t res;
+ unsigned offset;
+ unsigned i;
+ struct fuse_io_priv io = { .async = 0, .file = file };
+
+ for (i = 0; i < req->num_pages; i++)
+ fuse_wait_on_page_writeback(inode, req->pages[i]->index);
+
+ res = fuse_send_write(req, &io, pos, count, NULL);
+
+ offset = req->page_descs[0].offset;
+ count = res;
+ for (i = 0; i < req->num_pages; i++) {
+ struct page *page = req->pages[i];
+
+ if (!req->out.h.error && !offset && count >= PAGE_CACHE_SIZE)
+ SetPageUptodate(page);
+
+ if (count > PAGE_CACHE_SIZE - offset)
+ count -= PAGE_CACHE_SIZE - offset;
+ else
+ count = 0;
+ offset = 0;
+
+ unlock_page(page);
+ page_cache_release(page);
+ }
+
+ return res;
+}
+
+static ssize_t fuse_fill_write_pages(struct fuse_req *req,
+ struct address_space *mapping,
+ struct iov_iter *ii, loff_t pos)
+{
+ struct fuse_conn *fc = get_fuse_conn(mapping->host);
+ unsigned offset = pos & (PAGE_CACHE_SIZE - 1);
+ size_t count = 0;
+ int err;
+
+ req->in.argpages = 1;
+ req->page_descs[0].offset = offset;
+
+ do {
+ size_t tmp;
+ struct page *page;
+ pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ size_t bytes = min_t(size_t, PAGE_CACHE_SIZE - offset,
+ iov_iter_count(ii));
+
+ bytes = min_t(size_t, bytes, fc->max_write - count);
+
+ again:
+ err = -EFAULT;
+ if (iov_iter_fault_in_readable(ii, bytes))
+ break;
+
+ err = -ENOMEM;
+ page = grab_cache_page_write_begin(mapping, index, 0);
+ if (!page)
+ break;
+
+ if (mapping_writably_mapped(mapping))
+ flush_dcache_page(page);
+
+ tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes);
+ flush_dcache_page(page);
+
+ if (!tmp) {
+ unlock_page(page);
+ page_cache_release(page);
+ bytes = min(bytes, iov_iter_single_seg_count(ii));
+ goto again;
+ }
+
+ err = 0;
+ req->pages[req->num_pages] = page;
+ req->page_descs[req->num_pages].length = tmp;
+ req->num_pages++;
+
+ iov_iter_advance(ii, tmp);
+ count += tmp;
+ pos += tmp;
+ offset += tmp;
+ if (offset == PAGE_CACHE_SIZE)
+ offset = 0;
+
+ if (!fc->big_writes)
+ break;
+ } while (iov_iter_count(ii) && count < fc->max_write &&
+ req->num_pages < req->max_pages && offset == 0);
+
+ return count > 0 ? count : err;
+}
+
+static inline unsigned fuse_wr_pages(loff_t pos, size_t len)
+{
+ return min_t(unsigned,
+ ((pos + len - 1) >> PAGE_CACHE_SHIFT) -
+ (pos >> PAGE_CACHE_SHIFT) + 1,
+ FUSE_MAX_PAGES_PER_REQ);
+}
+
+static ssize_t fuse_perform_write(struct file *file,
+ struct address_space *mapping,
+ struct iov_iter *ii, loff_t pos)
+{
+ struct inode *inode = mapping->host;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ int err = 0;
+ ssize_t res = 0;
+
+ if (is_bad_inode(inode))
+ return -EIO;
+
+ if (inode->i_size < pos + iov_iter_count(ii))
+ set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+
+ do {
+ struct fuse_req *req;
+ ssize_t count;
+ unsigned nr_pages = fuse_wr_pages(pos, iov_iter_count(ii));
+
+ req = fuse_get_req(fc, nr_pages);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ break;
+ }
+
+ count = fuse_fill_write_pages(req, mapping, ii, pos);
+ if (count <= 0) {
+ err = count;
+ } else {
+ size_t num_written;
+
+ num_written = fuse_send_write_pages(req, file, inode,
+ pos, count);
+ err = req->out.h.error;
+ if (!err) {
+ res += num_written;
+ pos += num_written;
+
+ /* break out of the loop on short write */
+ if (num_written != count)
+ err = -EIO;
+ }
+ }
+ fuse_put_request(fc, req);
+ } while (!err && iov_iter_count(ii));
+
+ if (res > 0)
+ fuse_write_update_size(inode, pos);
+
+ clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+ fuse_invalidate_attr(inode);
+
+ return res > 0 ? res : err;
+}
+
+static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct address_space *mapping = file->f_mapping;
+ ssize_t written = 0;
+ ssize_t written_buffered = 0;
+ struct inode *inode = mapping->host;
+ ssize_t err;
+ loff_t endbyte = 0;
+
+ if (get_fuse_conn(inode)->writeback_cache) {
+ /* Update size (EOF optimization) and mode (SUID clearing) */
+ err = fuse_update_attributes(mapping->host, NULL, file, NULL);
+ if (err)
+ return err;
+
+ return generic_file_write_iter(iocb, from);
+ }
+
+ mutex_lock(&inode->i_mutex);
+
+ /* We can write back this queue in page reclaim */
+ current->backing_dev_info = inode_to_bdi(inode);
+
+ err = generic_write_checks(iocb, from);
+ if (err <= 0)
+ goto out;
+
+ err = file_remove_suid(file);
+ if (err)
+ goto out;
+
+ err = file_update_time(file);
+ if (err)
+ goto out;
+
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ loff_t pos = iocb->ki_pos;
+ written = generic_file_direct_write(iocb, from, pos);
+ if (written < 0 || !iov_iter_count(from))
+ goto out;
+
+ pos += written;
+
+ written_buffered = fuse_perform_write(file, mapping, from, pos);
+ if (written_buffered < 0) {
+ err = written_buffered;
+ goto out;
+ }
+ endbyte = pos + written_buffered - 1;
+
+ err = filemap_write_and_wait_range(file->f_mapping, pos,
+ endbyte);
+ if (err)
+ goto out;
+
+ invalidate_mapping_pages(file->f_mapping,
+ pos >> PAGE_CACHE_SHIFT,
+ endbyte >> PAGE_CACHE_SHIFT);
+
+ written += written_buffered;
+ iocb->ki_pos = pos + written_buffered;
+ } else {
+ written = fuse_perform_write(file, mapping, from, iocb->ki_pos);
+ if (written >= 0)
+ iocb->ki_pos += written;
+ }
+out:
+ current->backing_dev_info = NULL;
+ mutex_unlock(&inode->i_mutex);
+
+ return written ? written : err;
+}
+
+static inline void fuse_page_descs_length_init(struct fuse_req *req,
+ unsigned index, unsigned nr_pages)
+{
+ int i;
+
+ for (i = index; i < index + nr_pages; i++)
+ req->page_descs[i].length = PAGE_SIZE -
+ req->page_descs[i].offset;
+}
+
+static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii)
+{
+ return (unsigned long)ii->iov->iov_base + ii->iov_offset;
+}
+
+static inline size_t fuse_get_frag_size(const struct iov_iter *ii,
+ size_t max_size)
+{
+ return min(iov_iter_single_seg_count(ii), max_size);
+}
+
+static int fuse_get_user_pages(struct fuse_req *req, struct iov_iter *ii,
+ size_t *nbytesp, int write)
+{
+ size_t nbytes = 0; /* # bytes already packed in req */
+
+ /* Special case for kernel I/O: can copy directly into the buffer */
+ if (ii->type & ITER_KVEC) {
+ unsigned long user_addr = fuse_get_user_addr(ii);
+ size_t frag_size = fuse_get_frag_size(ii, *nbytesp);
+
+ if (write)
+ req->in.args[1].value = (void *) user_addr;
+ else
+ req->out.args[0].value = (void *) user_addr;
+
+ iov_iter_advance(ii, frag_size);
+ *nbytesp = frag_size;
+ return 0;
+ }
+
+ while (nbytes < *nbytesp && req->num_pages < req->max_pages) {
+ unsigned npages;
+ size_t start;
+ ssize_t ret = iov_iter_get_pages(ii,
+ &req->pages[req->num_pages],
+ *nbytesp - nbytes,
+ req->max_pages - req->num_pages,
+ &start);
+ if (ret < 0)
+ return ret;
+
+ iov_iter_advance(ii, ret);
+ nbytes += ret;
+
+ ret += start;
+ npages = (ret + PAGE_SIZE - 1) / PAGE_SIZE;
+
+ req->page_descs[req->num_pages].offset = start;
+ fuse_page_descs_length_init(req, req->num_pages, npages);
+
+ req->num_pages += npages;
+ req->page_descs[req->num_pages - 1].length -=
+ (PAGE_SIZE - ret) & (PAGE_SIZE - 1);
+ }
+
+ if (write)
+ req->in.argpages = 1;
+ else
+ req->out.argpages = 1;
+
+ *nbytesp = nbytes;
+
+ return 0;
+}
+
+static inline int fuse_iter_npages(const struct iov_iter *ii_p)
+{
+ return iov_iter_npages(ii_p, FUSE_MAX_PAGES_PER_REQ);
+}
+
+ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
+ loff_t *ppos, int flags)
+{
+ int write = flags & FUSE_DIO_WRITE;
+ int cuse = flags & FUSE_DIO_CUSE;
+ struct file *file = io->file;
+ struct inode *inode = file->f_mapping->host;
+ struct fuse_file *ff = file->private_data;
+ struct fuse_conn *fc = ff->fc;
+ size_t nmax = write ? fc->max_write : fc->max_read;
+ loff_t pos = *ppos;
+ size_t count = iov_iter_count(iter);
+ pgoff_t idx_from = pos >> PAGE_CACHE_SHIFT;
+ pgoff_t idx_to = (pos + count - 1) >> PAGE_CACHE_SHIFT;
+ ssize_t res = 0;
+ struct fuse_req *req;
+
+ if (io->async)
+ req = fuse_get_req_for_background(fc, fuse_iter_npages(iter));
+ else
+ req = fuse_get_req(fc, fuse_iter_npages(iter));
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+
+ if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) {
+ if (!write)
+ mutex_lock(&inode->i_mutex);
+ fuse_sync_writes(inode);
+ if (!write)
+ mutex_unlock(&inode->i_mutex);
+ }
+
+ while (count) {
+ size_t nres;
+ fl_owner_t owner = current->files;
+ size_t nbytes = min(count, nmax);
+ int err = fuse_get_user_pages(req, iter, &nbytes, write);
+ if (err) {
+ res = err;
+ break;
+ }
+
+ if (write)
+ nres = fuse_send_write(req, io, pos, nbytes, owner);
+ else
+ nres = fuse_send_read(req, io, pos, nbytes, owner);
+
+ if (!io->async)
+ fuse_release_user_pages(req, !write);
+ if (req->out.h.error) {
+ if (!res)
+ res = req->out.h.error;
+ break;
+ } else if (nres > nbytes) {
+ res = -EIO;
+ break;
+ }
+ count -= nres;
+ res += nres;
+ pos += nres;
+ if (nres != nbytes)
+ break;
+ if (count) {
+ fuse_put_request(fc, req);
+ if (io->async)
+ req = fuse_get_req_for_background(fc,
+ fuse_iter_npages(iter));
+ else
+ req = fuse_get_req(fc, fuse_iter_npages(iter));
+ if (IS_ERR(req))
+ break;
+ }
+ }
+ if (!IS_ERR(req))
+ fuse_put_request(fc, req);
+ if (res > 0)
+ *ppos = pos;
+
+ return res;
+}
+EXPORT_SYMBOL_GPL(fuse_direct_io);
+
+static ssize_t __fuse_direct_read(struct fuse_io_priv *io,
+ struct iov_iter *iter,
+ loff_t *ppos)
+{
+ ssize_t res;
+ struct file *file = io->file;
+ struct inode *inode = file_inode(file);
+
+ if (is_bad_inode(inode))
+ return -EIO;
+
+ res = fuse_direct_io(io, iter, ppos, 0);
+
+ fuse_invalidate_attr(inode);
+
+ return res;
+}
+
+static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct fuse_io_priv io = { .async = 0, .file = iocb->ki_filp };
+ return __fuse_direct_read(&io, to, &iocb->ki_pos);
+}
+
+static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct inode *inode = file_inode(file);
+ struct fuse_io_priv io = { .async = 0, .file = file };
+ ssize_t res;
+
+ if (is_bad_inode(inode))
+ return -EIO;
+
+ /* Don't allow parallel writes to the same file */
+ mutex_lock(&inode->i_mutex);
+ res = generic_write_checks(iocb, from);
+ if (res > 0)
+ res = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE);
+ fuse_invalidate_attr(inode);
+ if (res > 0)
+ fuse_write_update_size(inode, iocb->ki_pos);
+ mutex_unlock(&inode->i_mutex);
+
+ return res;
+}
+
+static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req)
+{
+ int i;
+
+ for (i = 0; i < req->num_pages; i++)
+ __free_page(req->pages[i]);
+
+ if (req->ff)
+ fuse_file_put(req->ff, false);
+}
+
+static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
+{
+ struct inode *inode = req->inode;
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct backing_dev_info *bdi = inode_to_bdi(inode);
+ int i;
+
+ list_del(&req->writepages_entry);
+ for (i = 0; i < req->num_pages; i++) {
+ dec_bdi_stat(bdi, BDI_WRITEBACK);
+ dec_zone_page_state(req->pages[i], NR_WRITEBACK_TEMP);
+ bdi_writeout_inc(bdi);
+ }
+ wake_up(&fi->page_waitq);
+}
+
+/* Called under fc->lock, may release and reacquire it */
+static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req,
+ loff_t size)
+__releases(fc->lock)
+__acquires(fc->lock)
+{
+ struct fuse_inode *fi = get_fuse_inode(req->inode);
+ struct fuse_write_in *inarg = &req->misc.write.in;
+ __u64 data_size = req->num_pages * PAGE_CACHE_SIZE;
+
+ if (!fc->connected)
+ goto out_free;
+
+ if (inarg->offset + data_size <= size) {
+ inarg->size = data_size;
+ } else if (inarg->offset < size) {
+ inarg->size = size - inarg->offset;
+ } else {
+ /* Got truncated off completely */
+ goto out_free;
+ }
+
+ req->in.args[1].size = inarg->size;
+ fi->writectr++;
+ fuse_request_send_background_locked(fc, req);
+ return;
+
+ out_free:
+ fuse_writepage_finish(fc, req);
+ spin_unlock(&fc->lock);
+ fuse_writepage_free(fc, req);
+ fuse_put_request(fc, req);
+ spin_lock(&fc->lock);
+}
+
+/*
+ * If fi->writectr is positive (no truncate or fsync going on) send
+ * all queued writepage requests.
+ *
+ * Called with fc->lock
+ */
+void fuse_flush_writepages(struct inode *inode)
+__releases(fc->lock)
+__acquires(fc->lock)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ size_t crop = i_size_read(inode);
+ struct fuse_req *req;
+
+ while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) {
+ req = list_entry(fi->queued_writes.next, struct fuse_req, list);
+ list_del_init(&req->list);
+ fuse_send_writepage(fc, req, crop);
+ }
+}
+
+static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req)
+{
+ struct inode *inode = req->inode;
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ mapping_set_error(inode->i_mapping, req->out.h.error);
+ spin_lock(&fc->lock);
+ while (req->misc.write.next) {
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_write_in *inarg = &req->misc.write.in;
+ struct fuse_req *next = req->misc.write.next;
+ req->misc.write.next = next->misc.write.next;
+ next->misc.write.next = NULL;
+ next->ff = fuse_file_get(req->ff);
+ list_add(&next->writepages_entry, &fi->writepages);
+
+ /*
+ * Skip fuse_flush_writepages() to make it easy to crop requests
+ * based on primary request size.
+ *
+ * 1st case (trivial): there are no concurrent activities using
+ * fuse_set/release_nowrite. Then we're on safe side because
+ * fuse_flush_writepages() would call fuse_send_writepage()
+ * anyway.
+ *
+ * 2nd case: someone called fuse_set_nowrite and it is waiting
+ * now for completion of all in-flight requests. This happens
+ * rarely and no more than once per page, so this should be
+ * okay.
+ *
+ * 3rd case: someone (e.g. fuse_do_setattr()) is in the middle
+ * of fuse_set_nowrite..fuse_release_nowrite section. The fact
+ * that fuse_set_nowrite returned implies that all in-flight
+ * requests were completed along with all of their secondary
+ * requests. Further primary requests are blocked by negative
+ * writectr. Hence there cannot be any in-flight requests and
+ * no invocations of fuse_writepage_end() while we're in
+ * fuse_set_nowrite..fuse_release_nowrite section.
+ */
+ fuse_send_writepage(fc, next, inarg->offset + inarg->size);
+ }
+ fi->writectr--;
+ fuse_writepage_finish(fc, req);
+ spin_unlock(&fc->lock);
+ fuse_writepage_free(fc, req);
+}
+
+static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc,
+ struct fuse_inode *fi)
+{
+ struct fuse_file *ff = NULL;
+
+ spin_lock(&fc->lock);
+ if (!list_empty(&fi->write_files)) {
+ ff = list_entry(fi->write_files.next, struct fuse_file,
+ write_entry);
+ fuse_file_get(ff);
+ }
+ spin_unlock(&fc->lock);
+
+ return ff;
+}
+
+static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc,
+ struct fuse_inode *fi)
+{
+ struct fuse_file *ff = __fuse_write_file_get(fc, fi);
+ WARN_ON(!ff);
+ return ff;
+}
+
+int fuse_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_file *ff;
+ int err;
+
+ ff = __fuse_write_file_get(fc, fi);
+ err = fuse_flush_times(inode, ff);
+ if (ff)
+ fuse_file_put(ff, 0);
+
+ return err;
+}
+
+static int fuse_writepage_locked(struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+ struct inode *inode = mapping->host;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_req *req;
+ struct page *tmp_page;
+ int error = -ENOMEM;
+
+ set_page_writeback(page);
+
+ req = fuse_request_alloc_nofs(1);
+ if (!req)
+ goto err;
+
+ req->background = 1; /* writeback always goes to bg_queue */
+ tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ if (!tmp_page)
+ goto err_free;
+
+ error = -EIO;
+ req->ff = fuse_write_file_get(fc, fi);
+ if (!req->ff)
+ goto err_nofile;
+
+ fuse_write_fill(req, req->ff, page_offset(page), 0);
+
+ copy_highpage(tmp_page, page);
+ req->misc.write.in.write_flags |= FUSE_WRITE_CACHE;
+ req->misc.write.next = NULL;
+ req->in.argpages = 1;
+ req->num_pages = 1;
+ req->pages[0] = tmp_page;
+ req->page_descs[0].offset = 0;
+ req->page_descs[0].length = PAGE_SIZE;
+ req->end = fuse_writepage_end;
+ req->inode = inode;
+
+ inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK);
+ inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
+
+ spin_lock(&fc->lock);
+ list_add(&req->writepages_entry, &fi->writepages);
+ list_add_tail(&req->list, &fi->queued_writes);
+ fuse_flush_writepages(inode);
+ spin_unlock(&fc->lock);
+
+ end_page_writeback(page);
+
+ return 0;
+
+err_nofile:
+ __free_page(tmp_page);
+err_free:
+ fuse_request_free(req);
+err:
+ end_page_writeback(page);
+ return error;
+}
+
+static int fuse_writepage(struct page *page, struct writeback_control *wbc)
+{
+ int err;
+
+ if (fuse_page_is_writeback(page->mapping->host, page->index)) {
+ /*
+ * ->writepages() should be called for sync() and friends. We
+ * should only get here on direct reclaim and then we are
+ * allowed to skip a page which is already in flight
+ */
+ WARN_ON(wbc->sync_mode == WB_SYNC_ALL);
+
+ redirty_page_for_writepage(wbc, page);
+ return 0;
+ }
+
+ err = fuse_writepage_locked(page);
+ unlock_page(page);
+
+ return err;
+}
+
+struct fuse_fill_wb_data {
+ struct fuse_req *req;
+ struct fuse_file *ff;
+ struct inode *inode;
+ struct page **orig_pages;
+};
+
+static void fuse_writepages_send(struct fuse_fill_wb_data *data)
+{
+ struct fuse_req *req = data->req;
+ struct inode *inode = data->inode;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ int num_pages = req->num_pages;
+ int i;
+
+ req->ff = fuse_file_get(data->ff);
+ spin_lock(&fc->lock);
+ list_add_tail(&req->list, &fi->queued_writes);
+ fuse_flush_writepages(inode);
+ spin_unlock(&fc->lock);
+
+ for (i = 0; i < num_pages; i++)
+ end_page_writeback(data->orig_pages[i]);
+}
+
+static bool fuse_writepage_in_flight(struct fuse_req *new_req,
+ struct page *page)
+{
+ struct fuse_conn *fc = get_fuse_conn(new_req->inode);
+ struct fuse_inode *fi = get_fuse_inode(new_req->inode);
+ struct fuse_req *tmp;
+ struct fuse_req *old_req;
+ bool found = false;
+ pgoff_t curr_index;
+
+ BUG_ON(new_req->num_pages != 0);
+
+ spin_lock(&fc->lock);
+ list_del(&new_req->writepages_entry);
+ list_for_each_entry(old_req, &fi->writepages, writepages_entry) {
+ BUG_ON(old_req->inode != new_req->inode);
+ curr_index = old_req->misc.write.in.offset >> PAGE_CACHE_SHIFT;
+ if (curr_index <= page->index &&
+ page->index < curr_index + old_req->num_pages) {
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ list_add(&new_req->writepages_entry, &fi->writepages);
+ goto out_unlock;
+ }
+
+ new_req->num_pages = 1;
+ for (tmp = old_req; tmp != NULL; tmp = tmp->misc.write.next) {
+ BUG_ON(tmp->inode != new_req->inode);
+ curr_index = tmp->misc.write.in.offset >> PAGE_CACHE_SHIFT;
+ if (tmp->num_pages == 1 &&
+ curr_index == page->index) {
+ old_req = tmp;
+ }
+ }
+
+ if (old_req->num_pages == 1 && (old_req->state == FUSE_REQ_INIT ||
+ old_req->state == FUSE_REQ_PENDING)) {
+ struct backing_dev_info *bdi = inode_to_bdi(page->mapping->host);
+
+ copy_highpage(old_req->pages[0], page);
+ spin_unlock(&fc->lock);
+
+ dec_bdi_stat(bdi, BDI_WRITEBACK);
+ dec_zone_page_state(page, NR_WRITEBACK_TEMP);
+ bdi_writeout_inc(bdi);
+ fuse_writepage_free(fc, new_req);
+ fuse_request_free(new_req);
+ goto out;
+ } else {
+ new_req->misc.write.next = old_req->misc.write.next;
+ old_req->misc.write.next = new_req;
+ }
+out_unlock:
+ spin_unlock(&fc->lock);
+out:
+ return found;
+}
+
+static int fuse_writepages_fill(struct page *page,
+ struct writeback_control *wbc, void *_data)
+{
+ struct fuse_fill_wb_data *data = _data;
+ struct fuse_req *req = data->req;
+ struct inode *inode = data->inode;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct page *tmp_page;
+ bool is_writeback;
+ int err;
+
+ if (!data->ff) {
+ err = -EIO;
+ data->ff = fuse_write_file_get(fc, get_fuse_inode(inode));
+ if (!data->ff)
+ goto out_unlock;
+ }
+
+ /*
+ * Being under writeback is unlikely but possible. For example direct
+ * read to an mmaped fuse file will set the page dirty twice; once when
+ * the pages are faulted with get_user_pages(), and then after the read
+ * completed.
+ */
+ is_writeback = fuse_page_is_writeback(inode, page->index);
+
+ if (req && req->num_pages &&
+ (is_writeback || req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
+ (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_write ||
+ data->orig_pages[req->num_pages - 1]->index + 1 != page->index)) {
+ fuse_writepages_send(data);
+ data->req = NULL;
+ }
+ err = -ENOMEM;
+ tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+ if (!tmp_page)
+ goto out_unlock;
+
+ /*
+ * The page must not be redirtied until the writeout is completed
+ * (i.e. userspace has sent a reply to the write request). Otherwise
+ * there could be more than one temporary page instance for each real
+ * page.
+ *
+ * This is ensured by holding the page lock in page_mkwrite() while
+ * checking fuse_page_is_writeback(). We already hold the page lock
+ * since clear_page_dirty_for_io() and keep it held until we add the
+ * request to the fi->writepages list and increment req->num_pages.
+ * After this fuse_page_is_writeback() will indicate that the page is
+ * under writeback, so we can release the page lock.
+ */
+ if (data->req == NULL) {
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ err = -ENOMEM;
+ req = fuse_request_alloc_nofs(FUSE_MAX_PAGES_PER_REQ);
+ if (!req) {
+ __free_page(tmp_page);
+ goto out_unlock;
+ }
+
+ fuse_write_fill(req, data->ff, page_offset(page), 0);
+ req->misc.write.in.write_flags |= FUSE_WRITE_CACHE;
+ req->misc.write.next = NULL;
+ req->in.argpages = 1;
+ req->background = 1;
+ req->num_pages = 0;
+ req->end = fuse_writepage_end;
+ req->inode = inode;
+
+ spin_lock(&fc->lock);
+ list_add(&req->writepages_entry, &fi->writepages);
+ spin_unlock(&fc->lock);
+
+ data->req = req;
+ }
+ set_page_writeback(page);
+
+ copy_highpage(tmp_page, page);
+ req->pages[req->num_pages] = tmp_page;
+ req->page_descs[req->num_pages].offset = 0;
+ req->page_descs[req->num_pages].length = PAGE_SIZE;
+
+ inc_bdi_stat(inode_to_bdi(inode), BDI_WRITEBACK);
+ inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
+
+ err = 0;
+ if (is_writeback && fuse_writepage_in_flight(req, page)) {
+ end_page_writeback(page);
+ data->req = NULL;
+ goto out_unlock;
+ }
+ data->orig_pages[req->num_pages] = page;
+
+ /*
+ * Protected by fc->lock against concurrent access by
+ * fuse_page_is_writeback().
+ */
+ spin_lock(&fc->lock);
+ req->num_pages++;
+ spin_unlock(&fc->lock);
+
+out_unlock:
+ unlock_page(page);
+
+ return err;
+}
+
+static int fuse_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = mapping->host;
+ struct fuse_fill_wb_data data;
+ int err;
+
+ err = -EIO;
+ if (is_bad_inode(inode))
+ goto out;
+
+ data.inode = inode;
+ data.req = NULL;
+ data.ff = NULL;
+
+ err = -ENOMEM;
+ data.orig_pages = kcalloc(FUSE_MAX_PAGES_PER_REQ,
+ sizeof(struct page *),
+ GFP_NOFS);
+ if (!data.orig_pages)
+ goto out;
+
+ err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data);
+ if (data.req) {
+ /* Ignore errors if we can write at least one page */
+ BUG_ON(!data.req->num_pages);
+ fuse_writepages_send(&data);
+ err = 0;
+ }
+ if (data.ff)
+ fuse_file_put(data.ff, false);
+
+ kfree(data.orig_pages);
+out:
+ return err;
+}
+
+/*
+ * It's worthy to make sure that space is reserved on disk for the write,
+ * but how to implement it without killing performance need more thinking.
+ */
+static int fuse_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
+{
+ pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ struct fuse_conn *fc = get_fuse_conn(file_inode(file));
+ struct page *page;
+ loff_t fsize;
+ int err = -ENOMEM;
+
+ WARN_ON(!fc->writeback_cache);
+
+ page = grab_cache_page_write_begin(mapping, index, flags);
+ if (!page)
+ goto error;
+
+ fuse_wait_on_page_writeback(mapping->host, page->index);
+
+ if (PageUptodate(page) || len == PAGE_CACHE_SIZE)
+ goto success;
+ /*
+ * Check if the start this page comes after the end of file, in which
+ * case the readpage can be optimized away.
+ */
+ fsize = i_size_read(mapping->host);
+ if (fsize <= (pos & PAGE_CACHE_MASK)) {
+ size_t off = pos & ~PAGE_CACHE_MASK;
+ if (off)
+ zero_user_segment(page, 0, off);
+ goto success;
+ }
+ err = fuse_do_readpage(file, page);
+ if (err)
+ goto cleanup;
+success:
+ *pagep = page;
+ return 0;
+
+cleanup:
+ unlock_page(page);
+ page_cache_release(page);
+error:
+ return err;
+}
+
+static int fuse_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ struct inode *inode = page->mapping->host;
+
+ if (!PageUptodate(page)) {
+ /* Zero any unwritten bytes at the end of the page */
+ size_t endoff = (pos + copied) & ~PAGE_CACHE_MASK;
+ if (endoff)
+ zero_user_segment(page, endoff, PAGE_CACHE_SIZE);
+ SetPageUptodate(page);
+ }
+
+ fuse_write_update_size(inode, pos + copied);
+ set_page_dirty(page);
+ unlock_page(page);
+ page_cache_release(page);
+
+ return copied;
+}
+
+static int fuse_launder_page(struct page *page)
+{
+ int err = 0;
+ if (clear_page_dirty_for_io(page)) {
+ struct inode *inode = page->mapping->host;
+ err = fuse_writepage_locked(page);
+ if (!err)
+ fuse_wait_on_page_writeback(inode, page->index);
+ }
+ return err;
+}
+
+/*
+ * Write back dirty pages now, because there may not be any suitable
+ * open files later
+ */
+static void fuse_vma_close(struct vm_area_struct *vma)
+{
+ filemap_write_and_wait(vma->vm_file->f_mapping);
+}
+
+/*
+ * Wait for writeback against this page to complete before allowing it
+ * to be marked dirty again, and hence written back again, possibly
+ * before the previous writepage completed.
+ *
+ * Block here, instead of in ->writepage(), so that the userspace fs
+ * can only block processes actually operating on the filesystem.
+ *
+ * Otherwise unprivileged userspace fs would be able to block
+ * unrelated:
+ *
+ * - page migration
+ * - sync(2)
+ * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER
+ */
+static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct page *page = vmf->page;
+ struct inode *inode = file_inode(vma->vm_file);
+
+ file_update_time(vma->vm_file);
+ lock_page(page);
+ if (page->mapping != inode->i_mapping) {
+ unlock_page(page);
+ return VM_FAULT_NOPAGE;
+ }
+
+ fuse_wait_on_page_writeback(inode, page->index);
+ return VM_FAULT_LOCKED;
+}
+
+static const struct vm_operations_struct fuse_file_vm_ops = {
+ .close = fuse_vma_close,
+ .fault = filemap_fault,
+ .map_pages = filemap_map_pages,
+ .page_mkwrite = fuse_page_mkwrite,
+};
+
+static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
+ fuse_link_write_file(file);
+
+ file_accessed(file);
+ vma->vm_ops = &fuse_file_vm_ops;
+ return 0;
+}
+
+static int fuse_direct_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ /* Can't provide the coherency needed for MAP_SHARED */
+ if (vma->vm_flags & VM_MAYSHARE)
+ return -ENODEV;
+
+ invalidate_inode_pages2(file->f_mapping);
+
+ return generic_file_mmap(file, vma);
+}
+
+static int convert_fuse_file_lock(const struct fuse_file_lock *ffl,
+ struct file_lock *fl)
+{
+ switch (ffl->type) {
+ case F_UNLCK:
+ break;
+
+ case F_RDLCK:
+ case F_WRLCK:
+ if (ffl->start > OFFSET_MAX || ffl->end > OFFSET_MAX ||
+ ffl->end < ffl->start)
+ return -EIO;
+
+ fl->fl_start = ffl->start;
+ fl->fl_end = ffl->end;
+ fl->fl_pid = ffl->pid;
+ break;
+
+ default:
+ return -EIO;
+ }
+ fl->fl_type = ffl->type;
+ return 0;
+}
+
+static void fuse_lk_fill(struct fuse_args *args, struct file *file,
+ const struct file_lock *fl, int opcode, pid_t pid,
+ int flock, struct fuse_lk_in *inarg)
+{
+ struct inode *inode = file_inode(file);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_file *ff = file->private_data;
+
+ memset(inarg, 0, sizeof(*inarg));
+ inarg->fh = ff->fh;
+ inarg->owner = fuse_lock_owner_id(fc, fl->fl_owner);
+ inarg->lk.start = fl->fl_start;
+ inarg->lk.end = fl->fl_end;
+ inarg->lk.type = fl->fl_type;
+ inarg->lk.pid = pid;
+ if (flock)
+ inarg->lk_flags |= FUSE_LK_FLOCK;
+ args->in.h.opcode = opcode;
+ args->in.h.nodeid = get_node_id(inode);
+ args->in.numargs = 1;
+ args->in.args[0].size = sizeof(*inarg);
+ args->in.args[0].value = inarg;
+}
+
+static int fuse_getlk(struct file *file, struct file_lock *fl)
+{
+ struct inode *inode = file_inode(file);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ FUSE_ARGS(args);
+ struct fuse_lk_in inarg;
+ struct fuse_lk_out outarg;
+ int err;
+
+ fuse_lk_fill(&args, file, fl, FUSE_GETLK, 0, 0, &inarg);
+ args.out.numargs = 1;
+ args.out.args[0].size = sizeof(outarg);
+ args.out.args[0].value = &outarg;
+ err = fuse_simple_request(fc, &args);
+ if (!err)
+ err = convert_fuse_file_lock(&outarg.lk, fl);
+
+ return err;
+}
+
+static int fuse_setlk(struct file *file, struct file_lock *fl, int flock)
+{
+ struct inode *inode = file_inode(file);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ FUSE_ARGS(args);
+ struct fuse_lk_in inarg;
+ int opcode = (fl->fl_flags & FL_SLEEP) ? FUSE_SETLKW : FUSE_SETLK;
+ pid_t pid = fl->fl_type != F_UNLCK ? current->tgid : 0;
+ int err;
+
+ if (fl->fl_lmops && fl->fl_lmops->lm_grant) {
+ /* NLM needs asynchronous locks, which we don't support yet */
+ return -ENOLCK;
+ }
+
+ /* Unlock on close is handled by the flush method */
+ if (fl->fl_flags & FL_CLOSE)
+ return 0;
+
+ fuse_lk_fill(&args, file, fl, opcode, pid, flock, &inarg);
+ err = fuse_simple_request(fc, &args);
+
+ /* locking is restartable */
+ if (err == -EINTR)
+ err = -ERESTARTSYS;
+
+ return err;
+}
+
+static int fuse_file_lock(struct file *file, int cmd, struct file_lock *fl)
+{
+ struct inode *inode = file_inode(file);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ int err;
+
+ if (cmd == F_CANCELLK) {
+ err = 0;
+ } else if (cmd == F_GETLK) {
+ if (fc->no_lock) {
+ posix_test_lock(file, fl);
+ err = 0;
+ } else
+ err = fuse_getlk(file, fl);
+ } else {
+ if (fc->no_lock)
+ err = posix_lock_file(file, fl, NULL);
+ else
+ err = fuse_setlk(file, fl, 0);
+ }
+ return err;
+}
+
+static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl)
+{
+ struct inode *inode = file_inode(file);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ int err;
+
+ if (fc->no_flock) {
+ err = flock_lock_file_wait(file, fl);
+ } else {
+ struct fuse_file *ff = file->private_data;
+
+ /* emulate flock with POSIX locks */
+ ff->flock = true;
+ err = fuse_setlk(file, fl, 1);
+ }
+
+ return err;
+}
+
+static sector_t fuse_bmap(struct address_space *mapping, sector_t block)
+{
+ struct inode *inode = mapping->host;
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ FUSE_ARGS(args);
+ struct fuse_bmap_in inarg;
+ struct fuse_bmap_out outarg;
+ int err;
+
+ if (!inode->i_sb->s_bdev || fc->no_bmap)
+ return 0;
+
+ memset(&inarg, 0, sizeof(inarg));
+ inarg.block = block;
+ inarg.blocksize = inode->i_sb->s_blocksize;
+ args.in.h.opcode = FUSE_BMAP;
+ args.in.h.nodeid = get_node_id(inode);
+ args.in.numargs = 1;
+ args.in.args[0].size = sizeof(inarg);
+ args.in.args[0].value = &inarg;
+ args.out.numargs = 1;
+ args.out.args[0].size = sizeof(outarg);
+ args.out.args[0].value = &outarg;
+ err = fuse_simple_request(fc, &args);
+ if (err == -ENOSYS)
+ fc->no_bmap = 1;
+
+ return err ? 0 : outarg.block;
+}
+
+static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence)
+{
+ loff_t retval;
+ struct inode *inode = file_inode(file);
+
+ /* No i_mutex protection necessary for SEEK_CUR and SEEK_SET */
+ if (whence == SEEK_CUR || whence == SEEK_SET)
+ return generic_file_llseek(file, offset, whence);
+
+ mutex_lock(&inode->i_mutex);
+ retval = fuse_update_attributes(inode, NULL, file, NULL);
+ if (!retval)
+ retval = generic_file_llseek(file, offset, whence);
+ mutex_unlock(&inode->i_mutex);
+
+ return retval;
+}
+
+static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
+ unsigned int nr_segs, size_t bytes, bool to_user)
+{
+ struct iov_iter ii;
+ int page_idx = 0;
+
+ if (!bytes)
+ return 0;
+
+ iov_iter_init(&ii, to_user ? READ : WRITE, iov, nr_segs, bytes);
+
+ while (iov_iter_count(&ii)) {
+ struct page *page = pages[page_idx++];
+ size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii));
+ void *kaddr;
+
+ kaddr = kmap(page);
+
+ while (todo) {
+ char __user *uaddr = ii.iov->iov_base + ii.iov_offset;
+ size_t iov_len = ii.iov->iov_len - ii.iov_offset;
+ size_t copy = min(todo, iov_len);
+ size_t left;
+
+ if (!to_user)
+ left = copy_from_user(kaddr, uaddr, copy);
+ else
+ left = copy_to_user(uaddr, kaddr, copy);
+
+ if (unlikely(left))
+ return -EFAULT;
+
+ iov_iter_advance(&ii, copy);
+ todo -= copy;
+ kaddr += copy;
+ }
+
+ kunmap(page);
+ }
+
+ return 0;
+}
+
+/*
+ * CUSE servers compiled on 32bit broke on 64bit kernels because the
+ * ABI was defined to be 'struct iovec' which is different on 32bit
+ * and 64bit. Fortunately we can determine which structure the server
+ * used from the size of the reply.
+ */
+static int fuse_copy_ioctl_iovec_old(struct iovec *dst, void *src,
+ size_t transferred, unsigned count,
+ bool is_compat)
+{
+#ifdef CONFIG_COMPAT
+ if (count * sizeof(struct compat_iovec) == transferred) {
+ struct compat_iovec *ciov = src;
+ unsigned i;
+
+ /*
+ * With this interface a 32bit server cannot support
+ * non-compat (i.e. ones coming from 64bit apps) ioctl
+ * requests
+ */
+ if (!is_compat)
+ return -EINVAL;
+
+ for (i = 0; i < count; i++) {
+ dst[i].iov_base = compat_ptr(ciov[i].iov_base);
+ dst[i].iov_len = ciov[i].iov_len;
+ }
+ return 0;
+ }
+#endif
+
+ if (count * sizeof(struct iovec) != transferred)
+ return -EIO;
+
+ memcpy(dst, src, transferred);
+ return 0;
+}
+
+/* Make sure iov_length() won't overflow */
+static int fuse_verify_ioctl_iov(struct iovec *iov, size_t count)
+{
+ size_t n;
+ u32 max = FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT;
+
+ for (n = 0; n < count; n++, iov++) {
+ if (iov->iov_len > (size_t) max)
+ return -ENOMEM;
+ max -= iov->iov_len;
+ }
+ return 0;
+}
+
+static int fuse_copy_ioctl_iovec(struct fuse_conn *fc, struct iovec *dst,
+ void *src, size_t transferred, unsigned count,
+ bool is_compat)
+{
+ unsigned i;
+ struct fuse_ioctl_iovec *fiov = src;
+
+ if (fc->minor < 16) {
+ return fuse_copy_ioctl_iovec_old(dst, src, transferred,
+ count, is_compat);
+ }
+
+ if (count * sizeof(struct fuse_ioctl_iovec) != transferred)
+ return -EIO;
+
+ for (i = 0; i < count; i++) {
+ /* Did the server supply an inappropriate value? */
+ if (fiov[i].base != (unsigned long) fiov[i].base ||
+ fiov[i].len != (unsigned long) fiov[i].len)
+ return -EIO;
+
+ dst[i].iov_base = (void __user *) (unsigned long) fiov[i].base;
+ dst[i].iov_len = (size_t) fiov[i].len;
+
+#ifdef CONFIG_COMPAT
+ if (is_compat &&
+ (ptr_to_compat(dst[i].iov_base) != fiov[i].base ||
+ (compat_size_t) dst[i].iov_len != fiov[i].len))
+ return -EIO;
+#endif
+ }
+
+ return 0;
+}
+
+
+/*
+ * For ioctls, there is no generic way to determine how much memory
+ * needs to be read and/or written. Furthermore, ioctls are allowed
+ * to dereference the passed pointer, so the parameter requires deep
+ * copying but FUSE has no idea whatsoever about what to copy in or
+ * out.
+ *
+ * This is solved by allowing FUSE server to retry ioctl with
+ * necessary in/out iovecs. Let's assume the ioctl implementation
+ * needs to read in the following structure.
+ *
+ * struct a {
+ * char *buf;
+ * size_t buflen;
+ * }
+ *
+ * On the first callout to FUSE server, inarg->in_size and
+ * inarg->out_size will be NULL; then, the server completes the ioctl
+ * with FUSE_IOCTL_RETRY set in out->flags, out->in_iovs set to 1 and
+ * the actual iov array to
+ *
+ * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) } }
+ *
+ * which tells FUSE to copy in the requested area and retry the ioctl.
+ * On the second round, the server has access to the structure and
+ * from that it can tell what to look for next, so on the invocation,
+ * it sets FUSE_IOCTL_RETRY, out->in_iovs to 2 and iov array to
+ *
+ * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) },
+ * { .iov_base = a.buf, .iov_len = a.buflen } }
+ *
+ * FUSE will copy both struct a and the pointed buffer from the
+ * process doing the ioctl and retry ioctl with both struct a and the
+ * buffer.
+ *
+ * This time, FUSE server has everything it needs and completes ioctl
+ * without FUSE_IOCTL_RETRY which finishes the ioctl call.
+ *
+ * Copying data out works the same way.
+ *
+ * Note that if FUSE_IOCTL_UNRESTRICTED is clear, the kernel
+ * automatically initializes in and out iovs by decoding @cmd with
+ * _IOC_* macros and the server is not allowed to request RETRY. This
+ * limits ioctl data transfers to well-formed ioctls and is the forced
+ * behavior for all FUSE servers.
+ */
+long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
+ unsigned int flags)
+{
+ struct fuse_file *ff = file->private_data;
+ struct fuse_conn *fc = ff->fc;
+ struct fuse_ioctl_in inarg = {
+ .fh = ff->fh,
+ .cmd = cmd,
+ .arg = arg,
+ .flags = flags
+ };
+ struct fuse_ioctl_out outarg;
+ struct fuse_req *req = NULL;
+ struct page **pages = NULL;
+ struct iovec *iov_page = NULL;
+ struct iovec *in_iov = NULL, *out_iov = NULL;
+ unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages;
+ size_t in_size, out_size, transferred;
+ int err;
+
+#if BITS_PER_LONG == 32
+ inarg.flags |= FUSE_IOCTL_32BIT;
+#else
+ if (flags & FUSE_IOCTL_COMPAT)
+ inarg.flags |= FUSE_IOCTL_32BIT;
+#endif
+
+ /* assume all the iovs returned by client always fits in a page */
+ BUILD_BUG_ON(sizeof(struct fuse_ioctl_iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE);
+
+ err = -ENOMEM;
+ pages = kcalloc(FUSE_MAX_PAGES_PER_REQ, sizeof(pages[0]), GFP_KERNEL);
+ iov_page = (struct iovec *) __get_free_page(GFP_KERNEL);
+ if (!pages || !iov_page)
+ goto out;
+
+ /*
+ * If restricted, initialize IO parameters as encoded in @cmd.
+ * RETRY from server is not allowed.
+ */
+ if (!(flags & FUSE_IOCTL_UNRESTRICTED)) {
+ struct iovec *iov = iov_page;
+
+ iov->iov_base = (void __user *)arg;
+ iov->iov_len = _IOC_SIZE(cmd);
+
+ if (_IOC_DIR(cmd) & _IOC_WRITE) {
+ in_iov = iov;
+ in_iovs = 1;
+ }
+
+ if (_IOC_DIR(cmd) & _IOC_READ) {
+ out_iov = iov;
+ out_iovs = 1;
+ }
+ }
+
+ retry:
+ inarg.in_size = in_size = iov_length(in_iov, in_iovs);
+ inarg.out_size = out_size = iov_length(out_iov, out_iovs);
+
+ /*
+ * Out data can be used either for actual out data or iovs,
+ * make sure there always is at least one page.
+ */
+ out_size = max_t(size_t, out_size, PAGE_SIZE);
+ max_pages = DIV_ROUND_UP(max(in_size, out_size), PAGE_SIZE);
+
+ /* make sure there are enough buffer pages and init request with them */
+ err = -ENOMEM;
+ if (max_pages > FUSE_MAX_PAGES_PER_REQ)
+ goto out;
+ while (num_pages < max_pages) {
+ pages[num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM);
+ if (!pages[num_pages])
+ goto out;
+ num_pages++;
+ }
+
+ req = fuse_get_req(fc, num_pages);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ req = NULL;
+ goto out;
+ }
+ memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages);
+ req->num_pages = num_pages;
+ fuse_page_descs_length_init(req, 0, req->num_pages);
+
+ /* okay, let's send it to the client */
+ req->in.h.opcode = FUSE_IOCTL;
+ req->in.h.nodeid = ff->nodeid;
+ req->in.numargs = 1;
+ req->in.args[0].size = sizeof(inarg);
+ req->in.args[0].value = &inarg;
+ if (in_size) {
+ req->in.numargs++;
+ req->in.args[1].size = in_size;
+ req->in.argpages = 1;
+
+ err = fuse_ioctl_copy_user(pages, in_iov, in_iovs, in_size,
+ false);
+ if (err)
+ goto out;
+ }
+
+ req->out.numargs = 2;
+ req->out.args[0].size = sizeof(outarg);
+ req->out.args[0].value = &outarg;
+ req->out.args[1].size = out_size;
+ req->out.argpages = 1;
+ req->out.argvar = 1;
+
+ fuse_request_send(fc, req);
+ err = req->out.h.error;
+ transferred = req->out.args[1].size;
+ fuse_put_request(fc, req);
+ req = NULL;
+ if (err)
+ goto out;
+
+ /* did it ask for retry? */
+ if (outarg.flags & FUSE_IOCTL_RETRY) {
+ void *vaddr;
+
+ /* no retry if in restricted mode */
+ err = -EIO;
+ if (!(flags & FUSE_IOCTL_UNRESTRICTED))
+ goto out;
+
+ in_iovs = outarg.in_iovs;
+ out_iovs = outarg.out_iovs;
+
+ /*
+ * Make sure things are in boundary, separate checks
+ * are to protect against overflow.
+ */
+ err = -ENOMEM;
+ if (in_iovs > FUSE_IOCTL_MAX_IOV ||
+ out_iovs > FUSE_IOCTL_MAX_IOV ||
+ in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV)
+ goto out;
+
+ vaddr = kmap_atomic(pages[0]);
+ err = fuse_copy_ioctl_iovec(fc, iov_page, vaddr,
+ transferred, in_iovs + out_iovs,
+ (flags & FUSE_IOCTL_COMPAT) != 0);
+ kunmap_atomic(vaddr);
+ if (err)
+ goto out;
+
+ in_iov = iov_page;
+ out_iov = in_iov + in_iovs;
+
+ err = fuse_verify_ioctl_iov(in_iov, in_iovs);
+ if (err)
+ goto out;
+
+ err = fuse_verify_ioctl_iov(out_iov, out_iovs);
+ if (err)
+ goto out;
+
+ goto retry;
+ }
+
+ err = -EIO;
+ if (transferred > inarg.out_size)
+ goto out;
+
+ err = fuse_ioctl_copy_user(pages, out_iov, out_iovs, transferred, true);
+ out:
+ if (req)
+ fuse_put_request(fc, req);
+ free_page((unsigned long) iov_page);
+ while (num_pages)
+ __free_page(pages[--num_pages]);
+ kfree(pages);
+
+ return err ? err : outarg.result;
+}
+EXPORT_SYMBOL_GPL(fuse_do_ioctl);
+
+long fuse_ioctl_common(struct file *file, unsigned int cmd,
+ unsigned long arg, unsigned int flags)
+{
+ struct inode *inode = file_inode(file);
+ struct fuse_conn *fc = get_fuse_conn(inode);
+
+ if (!fuse_allow_current_process(fc))
+ return -EACCES;
+
+ if (is_bad_inode(inode))
+ return -EIO;
+
+ return fuse_do_ioctl(file, cmd, arg, flags);
+}
+
+static long fuse_file_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ return fuse_ioctl_common(file, cmd, arg, 0);
+}
+
+static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd,
+ unsigned long arg)
+{
+ return fuse_ioctl_common(file, cmd, arg, FUSE_IOCTL_COMPAT);
+}
+
+/*
+ * All files which have been polled are linked to RB tree
+ * fuse_conn->polled_files which is indexed by kh. Walk the tree and
+ * find the matching one.
+ */
+static struct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh,
+ struct rb_node **parent_out)
+{
+ struct rb_node **link = &fc->polled_files.rb_node;
+ struct rb_node *last = NULL;
+
+ while (*link) {
+ struct fuse_file *ff;
+
+ last = *link;
+ ff = rb_entry(last, struct fuse_file, polled_node);
+
+ if (kh < ff->kh)
+ link = &last->rb_left;
+ else if (kh > ff->kh)
+ link = &last->rb_right;
+ else
+ return link;
+ }
+
+ if (parent_out)
+ *parent_out = last;
+ return link;
+}
+
+/*
+ * The file is about to be polled. Make sure it's on the polled_files
+ * RB tree. Note that files once added to the polled_files tree are
+ * not removed before the file is released. This is because a file
+ * polled once is likely to be polled again.
+ */
+static void fuse_register_polled_file(struct fuse_conn *fc,
+ struct fuse_file *ff)
+{
+ spin_lock(&fc->lock);
+ if (RB_EMPTY_NODE(&ff->polled_node)) {
+ struct rb_node **link, *uninitialized_var(parent);
+
+ link = fuse_find_polled_node(fc, ff->kh, &parent);
+ BUG_ON(*link);
+ rb_link_node(&ff->polled_node, parent, link);
+ rb_insert_color(&ff->polled_node, &fc->polled_files);
+ }
+ spin_unlock(&fc->lock);
+}
+
+unsigned fuse_file_poll(struct file *file, poll_table *wait)
+{
+ struct fuse_file *ff = file->private_data;
+ struct fuse_conn *fc = ff->fc;
+ struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh };
+ struct fuse_poll_out outarg;
+ FUSE_ARGS(args);
+ int err;
+
+ if (fc->no_poll)
+ return DEFAULT_POLLMASK;
+
+ poll_wait(file, &ff->poll_wait, wait);
+ inarg.events = (__u32)poll_requested_events(wait);
+
+ /*
+ * Ask for notification iff there's someone waiting for it.
+ * The client may ignore the flag and always notify.
+ */
+ if (waitqueue_active(&ff->poll_wait)) {
+ inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY;
+ fuse_register_polled_file(fc, ff);
+ }
+
+ args.in.h.opcode = FUSE_POLL;
+ args.in.h.nodeid = ff->nodeid;
+ args.in.numargs = 1;
+ args.in.args[0].size = sizeof(inarg);
+ args.in.args[0].value = &inarg;
+ args.out.numargs = 1;
+ args.out.args[0].size = sizeof(outarg);
+ args.out.args[0].value = &outarg;
+ err = fuse_simple_request(fc, &args);
+
+ if (!err)
+ return outarg.revents;
+ if (err == -ENOSYS) {
+ fc->no_poll = 1;
+ return DEFAULT_POLLMASK;
+ }
+ return POLLERR;
+}
+EXPORT_SYMBOL_GPL(fuse_file_poll);
+
+/*
+ * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and
+ * wakes up the poll waiters.
+ */
+int fuse_notify_poll_wakeup(struct fuse_conn *fc,
+ struct fuse_notify_poll_wakeup_out *outarg)
+{
+ u64 kh = outarg->kh;
+ struct rb_node **link;
+
+ spin_lock(&fc->lock);
+
+ link = fuse_find_polled_node(fc, kh, NULL);
+ if (*link) {
+ struct fuse_file *ff;
+
+ ff = rb_entry(*link, struct fuse_file, polled_node);
+ wake_up_interruptible_sync(&ff->poll_wait);
+ }
+
+ spin_unlock(&fc->lock);
+ return 0;
+}
+
+static void fuse_do_truncate(struct file *file)
+{
+ struct inode *inode = file->f_mapping->host;
+ struct iattr attr;
+
+ attr.ia_valid = ATTR_SIZE;
+ attr.ia_size = i_size_read(inode);
+
+ attr.ia_file = file;
+ attr.ia_valid |= ATTR_FILE;
+
+ fuse_do_setattr(inode, &attr, file);
+}
+
+static inline loff_t fuse_round_up(loff_t off)
+{
+ return round_up(off, FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT);
+}
+
+static ssize_t
+fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
+{
+ DECLARE_COMPLETION_ONSTACK(wait);
+ ssize_t ret = 0;
+ struct file *file = iocb->ki_filp;
+ struct fuse_file *ff = file->private_data;
+ bool async_dio = ff->fc->async_dio;
+ loff_t pos = 0;
+ struct inode *inode;
+ loff_t i_size;
+ size_t count = iov_iter_count(iter);
+ struct fuse_io_priv *io;
+
+ pos = offset;
+ inode = file->f_mapping->host;
+ i_size = i_size_read(inode);
+
+ if ((iov_iter_rw(iter) == READ) && (offset > i_size))
+ return 0;
+
+ /* optimization for short read */
+ if (async_dio && iov_iter_rw(iter) != WRITE && offset + count > i_size) {
+ if (offset >= i_size)
+ return 0;
+ iov_iter_truncate(iter, fuse_round_up(i_size - offset));
+ count = iov_iter_count(iter);
+ }
+
+ io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL);
+ if (!io)
+ return -ENOMEM;
+ spin_lock_init(&io->lock);
+ io->reqs = 1;
+ io->bytes = -1;
+ io->size = 0;
+ io->offset = offset;
+ io->write = (iov_iter_rw(iter) == WRITE);
+ io->err = 0;
+ io->file = file;
+ /*
+ * By default, we want to optimize all I/Os with async request
+ * submission to the client filesystem if supported.
+ */
+ io->async = async_dio;
+ io->iocb = iocb;
+
+ /*
+ * We cannot asynchronously extend the size of a file. We have no method
+ * to wait on real async I/O requests, so we must submit this request
+ * synchronously.
+ */
+ if (!is_sync_kiocb(iocb) && (offset + count > i_size) &&
+ iov_iter_rw(iter) == WRITE)
+ io->async = false;
+
+ if (io->async && is_sync_kiocb(iocb))
+ io->done = &wait;
+
+ if (iov_iter_rw(iter) == WRITE) {
+ ret = fuse_direct_io(io, iter, &pos, FUSE_DIO_WRITE);
+ fuse_invalidate_attr(inode);
+ } else {
+ ret = __fuse_direct_read(io, iter, &pos);
+ }
+
+ if (io->async) {
+ fuse_aio_complete(io, ret < 0 ? ret : 0, -1);
+
+ /* we have a non-extending, async request, so return */
+ if (!is_sync_kiocb(iocb))
+ return -EIOCBQUEUED;
+
+ wait_for_completion(&wait);
+ ret = fuse_get_res_by_io(io);
+ }
+
+ kfree(io);
+
+ if (iov_iter_rw(iter) == WRITE) {
+ if (ret > 0)
+ fuse_write_update_size(inode, pos);
+ else if (ret < 0 && offset + count > i_size)
+ fuse_do_truncate(file);
+ }
+
+ return ret;
+}
+
+static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
+ loff_t length)
+{
+ struct fuse_file *ff = file->private_data;
+ struct inode *inode = file_inode(file);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ struct fuse_conn *fc = ff->fc;
+ FUSE_ARGS(args);
+ struct fuse_fallocate_in inarg = {
+ .fh = ff->fh,
+ .offset = offset,
+ .length = length,
+ .mode = mode
+ };
+ int err;
+ bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) ||
+ (mode & FALLOC_FL_PUNCH_HOLE);
+
+ if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+ return -EOPNOTSUPP;
+
+ if (fc->no_fallocate)
+ return -EOPNOTSUPP;
+
+ if (lock_inode) {
+ mutex_lock(&inode->i_mutex);
+ if (mode & FALLOC_FL_PUNCH_HOLE) {
+ loff_t endbyte = offset + length - 1;
+ err = filemap_write_and_wait_range(inode->i_mapping,
+ offset, endbyte);
+ if (err)
+ goto out;
+
+ fuse_sync_writes(inode);
+ }
+ }
+
+ if (!(mode & FALLOC_FL_KEEP_SIZE))
+ set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+
+ args.in.h.opcode = FUSE_FALLOCATE;
+ args.in.h.nodeid = ff->nodeid;
+ args.in.numargs = 1;
+ args.in.args[0].size = sizeof(inarg);
+ args.in.args[0].value = &inarg;
+ err = fuse_simple_request(fc, &args);
+ if (err == -ENOSYS) {
+ fc->no_fallocate = 1;
+ err = -EOPNOTSUPP;
+ }
+ if (err)
+ goto out;
+
+ /* we could have extended the file */
+ if (!(mode & FALLOC_FL_KEEP_SIZE)) {
+ bool changed = fuse_write_update_size(inode, offset + length);
+
+ if (changed && fc->writeback_cache)
+ file_update_time(file);
+ }
+
+ if (mode & FALLOC_FL_PUNCH_HOLE)
+ truncate_pagecache_range(inode, offset, offset + length - 1);
+
+ fuse_invalidate_attr(inode);
+
+out:
+ if (!(mode & FALLOC_FL_KEEP_SIZE))
+ clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+
+ if (lock_inode)
+ mutex_unlock(&inode->i_mutex);
+
+ return err;
+}
+
+static const struct file_operations fuse_file_operations = {
+ .llseek = fuse_file_llseek,
+ .read_iter = fuse_file_read_iter,
+ .write_iter = fuse_file_write_iter,
+ .mmap = fuse_file_mmap,
+ .open = fuse_open,
+ .flush = fuse_flush,
+ .release = fuse_release,
+ .fsync = fuse_fsync,
+ .lock = fuse_file_lock,
+ .flock = fuse_file_flock,
+ .splice_read = generic_file_splice_read,
+ .unlocked_ioctl = fuse_file_ioctl,
+ .compat_ioctl = fuse_file_compat_ioctl,
+ .poll = fuse_file_poll,
+ .fallocate = fuse_file_fallocate,
+};
+
+static const struct file_operations fuse_direct_io_file_operations = {
+ .llseek = fuse_file_llseek,
+ .read_iter = fuse_direct_read_iter,
+ .write_iter = fuse_direct_write_iter,
+ .mmap = fuse_direct_mmap,
+ .open = fuse_open,
+ .flush = fuse_flush,
+ .release = fuse_release,
+ .fsync = fuse_fsync,
+ .lock = fuse_file_lock,
+ .flock = fuse_file_flock,
+ .unlocked_ioctl = fuse_file_ioctl,
+ .compat_ioctl = fuse_file_compat_ioctl,
+ .poll = fuse_file_poll,
+ .fallocate = fuse_file_fallocate,
+ /* no splice_read */
+};
+
+static const struct address_space_operations fuse_file_aops = {
+ .readpage = fuse_readpage,
+ .writepage = fuse_writepage,
+ .writepages = fuse_writepages,
+ .launder_page = fuse_launder_page,
+ .readpages = fuse_readpages,
+ .set_page_dirty = __set_page_dirty_nobuffers,
+ .bmap = fuse_bmap,
+ .direct_IO = fuse_direct_IO,
+ .write_begin = fuse_write_begin,
+ .write_end = fuse_write_end,
+};
+
+void fuse_init_file_inode(struct inode *inode)
+{
+ inode->i_fop = &fuse_file_operations;
+ inode->i_data.a_ops = &fuse_file_aops;
+}
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
new file mode 100644
index 000000000..7354dc142
--- /dev/null
+++ b/fs/fuse/fuse_i.h
@@ -0,0 +1,912 @@
+/*
+ FUSE: Filesystem in Userspace
+ Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
+
+ This program can be distributed under the terms of the GNU GPL.
+ See the file COPYING.
+*/
+
+#ifndef _FS_FUSE_I_H
+#define _FS_FUSE_I_H
+
+#include <linux/fuse.h>
+#include <linux/fs.h>
+#include <linux/mount.h>
+#include <linux/wait.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/mm.h>
+#include <linux/backing-dev.h>
+#include <linux/mutex.h>
+#include <linux/rwsem.h>
+#include <linux/rbtree.h>
+#include <linux/poll.h>
+#include <linux/workqueue.h>
+
+/** Max number of pages that can be used in a single read request */
+#define FUSE_MAX_PAGES_PER_REQ 32
+
+/** Bias for fi->writectr, meaning new writepages must not be sent */
+#define FUSE_NOWRITE INT_MIN
+
+/** It could be as large as PATH_MAX, but would that have any uses? */
+#define FUSE_NAME_MAX 1024
+
+/** Number of dentries for each connection in the control filesystem */
+#define FUSE_CTL_NUM_DENTRIES 5
+
+/** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem
+ module will check permissions based on the file mode. Otherwise no
+ permission checking is done in the kernel */
+#define FUSE_DEFAULT_PERMISSIONS (1 << 0)
+
+/** If the FUSE_ALLOW_OTHER flag is given, then not only the user
+ doing the mount will be allowed to access the filesystem */
+#define FUSE_ALLOW_OTHER (1 << 1)
+
+/** Number of page pointers embedded in fuse_req */
+#define FUSE_REQ_INLINE_PAGES 1
+
+/** List of active connections */
+extern struct list_head fuse_conn_list;
+
+/** Global mutex protecting fuse_conn_list and the control filesystem */
+extern struct mutex fuse_mutex;
+
+/** Module parameters */
+extern unsigned max_user_bgreq;
+extern unsigned max_user_congthresh;
+
+/* One forget request */
+struct fuse_forget_link {
+ struct fuse_forget_one forget_one;
+ struct fuse_forget_link *next;
+};
+
+/** FUSE inode */
+struct fuse_inode {
+ /** Inode data */
+ struct inode inode;
+
+ /** Unique ID, which identifies the inode between userspace
+ * and kernel */
+ u64 nodeid;
+
+ /** Number of lookups on this inode */
+ u64 nlookup;
+
+ /** The request used for sending the FORGET message */
+ struct fuse_forget_link *forget;
+
+ /** Time in jiffies until the file attributes are valid */
+ u64 i_time;
+
+ /** The sticky bit in inode->i_mode may have been removed, so
+ preserve the original mode */
+ umode_t orig_i_mode;
+
+ /** 64 bit inode number */
+ u64 orig_ino;
+
+ /** Version of last attribute change */
+ u64 attr_version;
+
+ /** Files usable in writepage. Protected by fc->lock */
+ struct list_head write_files;
+
+ /** Writepages pending on truncate or fsync */
+ struct list_head queued_writes;
+
+ /** Number of sent writes, a negative bias (FUSE_NOWRITE)
+ * means more writes are blocked */
+ int writectr;
+
+ /** Waitq for writepage completion */
+ wait_queue_head_t page_waitq;
+
+ /** List of writepage requestst (pending or sent) */
+ struct list_head writepages;
+
+ /** Miscellaneous bits describing inode state */
+ unsigned long state;
+};
+
+/** FUSE inode state bits */
+enum {
+ /** Advise readdirplus */
+ FUSE_I_ADVISE_RDPLUS,
+ /** Initialized with readdirplus */
+ FUSE_I_INIT_RDPLUS,
+ /** An operation changing file size is in progress */
+ FUSE_I_SIZE_UNSTABLE,
+};
+
+struct fuse_conn;
+
+/** FUSE specific file data */
+struct fuse_file {
+ /** Fuse connection for this file */
+ struct fuse_conn *fc;
+
+ /** Request reserved for flush and release */
+ struct fuse_req *reserved_req;
+
+ /** Kernel file handle guaranteed to be unique */
+ u64 kh;
+
+ /** File handle used by userspace */
+ u64 fh;
+
+ /** Node id of this file */
+ u64 nodeid;
+
+ /** Refcount */
+ atomic_t count;
+
+ /** FOPEN_* flags returned by open */
+ u32 open_flags;
+
+ /** Entry on inode's write_files list */
+ struct list_head write_entry;
+
+ /** RB node to be linked on fuse_conn->polled_files */
+ struct rb_node polled_node;
+
+ /** Wait queue head for poll */
+ wait_queue_head_t poll_wait;
+
+ /** Has flock been performed on this file? */
+ bool flock:1;
+};
+
+/** One input argument of a request */
+struct fuse_in_arg {
+ unsigned size;
+ const void *value;
+};
+
+/** The request input */
+struct fuse_in {
+ /** The request header */
+ struct fuse_in_header h;
+
+ /** True if the data for the last argument is in req->pages */
+ unsigned argpages:1;
+
+ /** Number of arguments */
+ unsigned numargs;
+
+ /** Array of arguments */
+ struct fuse_in_arg args[3];
+};
+
+/** One output argument of a request */
+struct fuse_arg {
+ unsigned size;
+ void *value;
+};
+
+/** The request output */
+struct fuse_out {
+ /** Header returned from userspace */
+ struct fuse_out_header h;
+
+ /*
+ * The following bitfields are not changed during the request
+ * processing
+ */
+
+ /** Last argument is variable length (can be shorter than
+ arg->size) */
+ unsigned argvar:1;
+
+ /** Last argument is a list of pages to copy data to */
+ unsigned argpages:1;
+
+ /** Zero partially or not copied pages */
+ unsigned page_zeroing:1;
+
+ /** Pages may be replaced with new ones */
+ unsigned page_replace:1;
+
+ /** Number or arguments */
+ unsigned numargs;
+
+ /** Array of arguments */
+ struct fuse_arg args[2];
+};
+
+/** FUSE page descriptor */
+struct fuse_page_desc {
+ unsigned int length;
+ unsigned int offset;
+};
+
+struct fuse_args {
+ struct {
+ struct {
+ uint32_t opcode;
+ uint64_t nodeid;
+ } h;
+ unsigned numargs;
+ struct fuse_in_arg args[3];
+
+ } in;
+ struct {
+ unsigned argvar:1;
+ unsigned numargs;
+ struct fuse_arg args[2];
+ } out;
+};
+
+#define FUSE_ARGS(args) struct fuse_args args = {}
+
+/** The request state */
+enum fuse_req_state {
+ FUSE_REQ_INIT = 0,
+ FUSE_REQ_PENDING,
+ FUSE_REQ_READING,
+ FUSE_REQ_SENT,
+ FUSE_REQ_WRITING,
+ FUSE_REQ_FINISHED
+};
+
+/** The request IO state (for asynchronous processing) */
+struct fuse_io_priv {
+ int async;
+ spinlock_t lock;
+ unsigned reqs;
+ ssize_t bytes;
+ size_t size;
+ __u64 offset;
+ bool write;
+ int err;
+ struct kiocb *iocb;
+ struct file *file;
+ struct completion *done;
+};
+
+/**
+ * A request to the client
+ */
+struct fuse_req {
+ /** This can be on either pending processing or io lists in
+ fuse_conn */
+ struct list_head list;
+
+ /** Entry on the interrupts list */
+ struct list_head intr_entry;
+
+ /** refcount */
+ atomic_t count;
+
+ /** Unique ID for the interrupt request */
+ u64 intr_unique;
+
+ /*
+ * The following bitfields are either set once before the
+ * request is queued or setting/clearing them is protected by
+ * fuse_conn->lock
+ */
+
+ /** True if the request has reply */
+ unsigned isreply:1;
+
+ /** Force sending of the request even if interrupted */
+ unsigned force:1;
+
+ /** The request was aborted */
+ unsigned aborted:1;
+
+ /** Request is sent in the background */
+ unsigned background:1;
+
+ /** The request has been interrupted */
+ unsigned interrupted:1;
+
+ /** Data is being copied to/from the request */
+ unsigned locked:1;
+
+ /** Request is counted as "waiting" */
+ unsigned waiting:1;
+
+ /** State of the request */
+ enum fuse_req_state state;
+
+ /** The request input */
+ struct fuse_in in;
+
+ /** The request output */
+ struct fuse_out out;
+
+ /** Used to wake up the task waiting for completion of request*/
+ wait_queue_head_t waitq;
+
+ /** Data for asynchronous requests */
+ union {
+ struct {
+ struct fuse_release_in in;
+ struct inode *inode;
+ } release;
+ struct fuse_init_in init_in;
+ struct fuse_init_out init_out;
+ struct cuse_init_in cuse_init_in;
+ struct {
+ struct fuse_read_in in;
+ u64 attr_ver;
+ } read;
+ struct {
+ struct fuse_write_in in;
+ struct fuse_write_out out;
+ struct fuse_req *next;
+ } write;
+ struct fuse_notify_retrieve_in retrieve_in;
+ } misc;
+
+ /** page vector */
+ struct page **pages;
+
+ /** page-descriptor vector */
+ struct fuse_page_desc *page_descs;
+
+ /** size of the 'pages' array */
+ unsigned max_pages;
+
+ /** inline page vector */
+ struct page *inline_pages[FUSE_REQ_INLINE_PAGES];
+
+ /** inline page-descriptor vector */
+ struct fuse_page_desc inline_page_descs[FUSE_REQ_INLINE_PAGES];
+
+ /** number of pages in vector */
+ unsigned num_pages;
+
+ /** File used in the request (or NULL) */
+ struct fuse_file *ff;
+
+ /** Inode used in the request or NULL */
+ struct inode *inode;
+
+ /** AIO control block */
+ struct fuse_io_priv *io;
+
+ /** Link on fi->writepages */
+ struct list_head writepages_entry;
+
+ /** Request completion callback */
+ void (*end)(struct fuse_conn *, struct fuse_req *);
+
+ /** Request is stolen from fuse_file->reserved_req */
+ struct file *stolen_file;
+};
+
+/**
+ * A Fuse connection.
+ *
+ * This structure is created, when the filesystem is mounted, and is
+ * destroyed, when the client device is closed and the filesystem is
+ * unmounted.
+ */
+struct fuse_conn {
+ /** Lock protecting accessess to members of this structure */
+ spinlock_t lock;
+
+ /** Refcount */
+ atomic_t count;
+
+ struct rcu_head rcu;
+
+ /** The user id for this mount */
+ kuid_t user_id;
+
+ /** The group id for this mount */
+ kgid_t group_id;
+
+ /** The fuse mount flags for this mount */
+ unsigned flags;
+
+ /** Maximum read size */
+ unsigned max_read;
+
+ /** Maximum write size */
+ unsigned max_write;
+
+ /** Readers of the connection are waiting on this */
+ wait_queue_head_t waitq;
+
+ /** The list of pending requests */
+ struct list_head pending;
+
+ /** The list of requests being processed */
+ struct list_head processing;
+
+ /** The list of requests under I/O */
+ struct list_head io;
+
+ /** The next unique kernel file handle */
+ u64 khctr;
+
+ /** rbtree of fuse_files waiting for poll events indexed by ph */
+ struct rb_root polled_files;
+
+ /** Maximum number of outstanding background requests */
+ unsigned max_background;
+
+ /** Number of background requests at which congestion starts */
+ unsigned congestion_threshold;
+
+ /** Number of requests currently in the background */
+ unsigned num_background;
+
+ /** Number of background requests currently queued for userspace */
+ unsigned active_background;
+
+ /** The list of background requests set aside for later queuing */
+ struct list_head bg_queue;
+
+ /** Pending interrupts */
+ struct list_head interrupts;
+
+ /** Queue of pending forgets */
+ struct fuse_forget_link forget_list_head;
+ struct fuse_forget_link *forget_list_tail;
+
+ /** Batching of FORGET requests (positive indicates FORGET batch) */
+ int forget_batch;
+
+ /** Flag indicating that INIT reply has been received. Allocating
+ * any fuse request will be suspended until the flag is set */
+ int initialized;
+
+ /** Flag indicating if connection is blocked. This will be
+ the case before the INIT reply is received, and if there
+ are too many outstading backgrounds requests */
+ int blocked;
+
+ /** waitq for blocked connection */
+ wait_queue_head_t blocked_waitq;
+
+ /** waitq for reserved requests */
+ wait_queue_head_t reserved_req_waitq;
+
+ /** The next unique request id */
+ u64 reqctr;
+
+ /** Connection established, cleared on umount, connection
+ abort and device release */
+ unsigned connected;
+
+ /** Connection failed (version mismatch). Cannot race with
+ setting other bitfields since it is only set once in INIT
+ reply, before any other request, and never cleared */
+ unsigned conn_error:1;
+
+ /** Connection successful. Only set in INIT */
+ unsigned conn_init:1;
+
+ /** Do readpages asynchronously? Only set in INIT */
+ unsigned async_read:1;
+
+ /** Do not send separate SETATTR request before open(O_TRUNC) */
+ unsigned atomic_o_trunc:1;
+
+ /** Filesystem supports NFS exporting. Only set in INIT */
+ unsigned export_support:1;
+
+ /** Set if bdi is valid */
+ unsigned bdi_initialized:1;
+
+ /** write-back cache policy (default is write-through) */
+ unsigned writeback_cache:1;
+
+ /*
+ * The following bitfields are only for optimization purposes
+ * and hence races in setting them will not cause malfunction
+ */
+
+ /** Is open/release not implemented by fs? */
+ unsigned no_open:1;
+
+ /** Is fsync not implemented by fs? */
+ unsigned no_fsync:1;
+
+ /** Is fsyncdir not implemented by fs? */
+ unsigned no_fsyncdir:1;
+
+ /** Is flush not implemented by fs? */
+ unsigned no_flush:1;
+
+ /** Is setxattr not implemented by fs? */
+ unsigned no_setxattr:1;
+
+ /** Is getxattr not implemented by fs? */
+ unsigned no_getxattr:1;
+
+ /** Is listxattr not implemented by fs? */
+ unsigned no_listxattr:1;
+
+ /** Is removexattr not implemented by fs? */
+ unsigned no_removexattr:1;
+
+ /** Are posix file locking primitives not implemented by fs? */
+ unsigned no_lock:1;
+
+ /** Is access not implemented by fs? */
+ unsigned no_access:1;
+
+ /** Is create not implemented by fs? */
+ unsigned no_create:1;
+
+ /** Is interrupt not implemented by fs? */
+ unsigned no_interrupt:1;
+
+ /** Is bmap not implemented by fs? */
+ unsigned no_bmap:1;
+
+ /** Is poll not implemented by fs? */
+ unsigned no_poll:1;
+
+ /** Do multi-page cached writes */
+ unsigned big_writes:1;
+
+ /** Don't apply umask to creation modes */
+ unsigned dont_mask:1;
+
+ /** Are BSD file locking primitives not implemented by fs? */
+ unsigned no_flock:1;
+
+ /** Is fallocate not implemented by fs? */
+ unsigned no_fallocate:1;
+
+ /** Is rename with flags implemented by fs? */
+ unsigned no_rename2:1;
+
+ /** Use enhanced/automatic page cache invalidation. */
+ unsigned auto_inval_data:1;
+
+ /** Does the filesystem support readdirplus? */
+ unsigned do_readdirplus:1;
+
+ /** Does the filesystem want adaptive readdirplus? */
+ unsigned readdirplus_auto:1;
+
+ /** Does the filesystem support asynchronous direct-IO submission? */
+ unsigned async_dio:1;
+
+ /** The number of requests waiting for completion */
+ atomic_t num_waiting;
+
+ /** Negotiated minor version */
+ unsigned minor;
+
+ /** Backing dev info */
+ struct backing_dev_info bdi;
+
+ /** Entry on the fuse_conn_list */
+ struct list_head entry;
+
+ /** Device ID from super block */
+ dev_t dev;
+
+ /** Dentries in the control filesystem */
+ struct dentry *ctl_dentry[FUSE_CTL_NUM_DENTRIES];
+
+ /** number of dentries used in the above array */
+ int ctl_ndents;
+
+ /** O_ASYNC requests */
+ struct fasync_struct *fasync;
+
+ /** Key for lock owner ID scrambling */
+ u32 scramble_key[4];
+
+ /** Reserved request for the DESTROY message */
+ struct fuse_req *destroy_req;
+
+ /** Version counter for attribute changes */
+ u64 attr_version;
+
+ /** Called on final put */
+ void (*release)(struct fuse_conn *);
+
+ /** Super block for this connection. */
+ struct super_block *sb;
+
+ /** Read/write semaphore to hold when accessing sb. */
+ struct rw_semaphore killsb;
+};
+
+static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
+{
+ return sb->s_fs_info;
+}
+
+static inline struct fuse_conn *get_fuse_conn(struct inode *inode)
+{
+ return get_fuse_conn_super(inode->i_sb);
+}
+
+static inline struct fuse_inode *get_fuse_inode(struct inode *inode)
+{
+ return container_of(inode, struct fuse_inode, inode);
+}
+
+static inline u64 get_node_id(struct inode *inode)
+{
+ return get_fuse_inode(inode)->nodeid;
+}
+
+/** Device operations */
+extern const struct file_operations fuse_dev_operations;
+
+extern const struct dentry_operations fuse_dentry_operations;
+
+/**
+ * Inode to nodeid comparison.
+ */
+int fuse_inode_eq(struct inode *inode, void *_nodeidp);
+
+/**
+ * Get a filled in inode
+ */
+struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
+ int generation, struct fuse_attr *attr,
+ u64 attr_valid, u64 attr_version);
+
+int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
+ struct fuse_entry_out *outarg, struct inode **inode);
+
+/**
+ * Send FORGET command
+ */
+void fuse_queue_forget(struct fuse_conn *fc, struct fuse_forget_link *forget,
+ u64 nodeid, u64 nlookup);
+
+struct fuse_forget_link *fuse_alloc_forget(void);
+
+/* Used by READDIRPLUS */
+void fuse_force_forget(struct file *file, u64 nodeid);
+
+/**
+ * Initialize READ or READDIR request
+ */
+void fuse_read_fill(struct fuse_req *req, struct file *file,
+ loff_t pos, size_t count, int opcode);
+
+/**
+ * Send OPEN or OPENDIR request
+ */
+int fuse_open_common(struct inode *inode, struct file *file, bool isdir);
+
+struct fuse_file *fuse_file_alloc(struct fuse_conn *fc);
+struct fuse_file *fuse_file_get(struct fuse_file *ff);
+void fuse_file_free(struct fuse_file *ff);
+void fuse_finish_open(struct inode *inode, struct file *file);
+
+void fuse_sync_release(struct fuse_file *ff, int flags);
+
+/**
+ * Send RELEASE or RELEASEDIR request
+ */
+void fuse_release_common(struct file *file, int opcode);
+
+/**
+ * Send FSYNC or FSYNCDIR request
+ */
+int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
+ int datasync, int isdir);
+
+/**
+ * Notify poll wakeup
+ */
+int fuse_notify_poll_wakeup(struct fuse_conn *fc,
+ struct fuse_notify_poll_wakeup_out *outarg);
+
+/**
+ * Initialize file operations on a regular file
+ */
+void fuse_init_file_inode(struct inode *inode);
+
+/**
+ * Initialize inode operations on regular files and special files
+ */
+void fuse_init_common(struct inode *inode);
+
+/**
+ * Initialize inode and file operations on a directory
+ */
+void fuse_init_dir(struct inode *inode);
+
+/**
+ * Initialize inode operations on a symlink
+ */
+void fuse_init_symlink(struct inode *inode);
+
+/**
+ * Change attributes of an inode
+ */
+void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
+ u64 attr_valid, u64 attr_version);
+
+void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
+ u64 attr_valid);
+
+/**
+ * Initialize the client device
+ */
+int fuse_dev_init(void);
+
+/**
+ * Cleanup the client device
+ */
+void fuse_dev_cleanup(void);
+
+int fuse_ctl_init(void);
+void __exit fuse_ctl_cleanup(void);
+
+/**
+ * Allocate a request
+ */
+struct fuse_req *fuse_request_alloc(unsigned npages);
+
+struct fuse_req *fuse_request_alloc_nofs(unsigned npages);
+
+/**
+ * Free a request
+ */
+void fuse_request_free(struct fuse_req *req);
+
+/**
+ * Get a request, may fail with -ENOMEM,
+ * caller should specify # elements in req->pages[] explicitly
+ */
+struct fuse_req *fuse_get_req(struct fuse_conn *fc, unsigned npages);
+struct fuse_req *fuse_get_req_for_background(struct fuse_conn *fc,
+ unsigned npages);
+
+/*
+ * Increment reference count on request
+ */
+void __fuse_get_request(struct fuse_req *req);
+
+/**
+ * Gets a requests for a file operation, always succeeds
+ */
+struct fuse_req *fuse_get_req_nofail_nopages(struct fuse_conn *fc,
+ struct file *file);
+
+/**
+ * Decrement reference count of a request. If count goes to zero free
+ * the request.
+ */
+void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req);
+
+/**
+ * Send a request (synchronous)
+ */
+void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req);
+
+/**
+ * Simple request sending that does request allocation and freeing
+ */
+ssize_t fuse_simple_request(struct fuse_conn *fc, struct fuse_args *args);
+
+/**
+ * Send a request in the background
+ */
+void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req);
+
+void fuse_request_send_background_locked(struct fuse_conn *fc,
+ struct fuse_req *req);
+
+/* Abort all requests */
+void fuse_abort_conn(struct fuse_conn *fc);
+
+/**
+ * Invalidate inode attributes
+ */
+void fuse_invalidate_attr(struct inode *inode);
+
+void fuse_invalidate_entry_cache(struct dentry *entry);
+
+void fuse_invalidate_atime(struct inode *inode);
+
+/**
+ * Acquire reference to fuse_conn
+ */
+struct fuse_conn *fuse_conn_get(struct fuse_conn *fc);
+
+/**
+ * Initialize fuse_conn
+ */
+void fuse_conn_init(struct fuse_conn *fc);
+
+/**
+ * Release reference to fuse_conn
+ */
+void fuse_conn_put(struct fuse_conn *fc);
+
+/**
+ * Add connection to control filesystem
+ */
+int fuse_ctl_add_conn(struct fuse_conn *fc);
+
+/**
+ * Remove connection from control filesystem
+ */
+void fuse_ctl_remove_conn(struct fuse_conn *fc);
+
+/**
+ * Is file type valid?
+ */
+int fuse_valid_type(int m);
+
+/**
+ * Is current process allowed to perform filesystem operation?
+ */
+int fuse_allow_current_process(struct fuse_conn *fc);
+
+u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id);
+
+int fuse_update_attributes(struct inode *inode, struct kstat *stat,
+ struct file *file, bool *refreshed);
+
+void fuse_flush_writepages(struct inode *inode);
+
+void fuse_set_nowrite(struct inode *inode);
+void fuse_release_nowrite(struct inode *inode);
+
+u64 fuse_get_attr_version(struct fuse_conn *fc);
+
+/**
+ * File-system tells the kernel to invalidate cache for the given node id.
+ */
+int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
+ loff_t offset, loff_t len);
+
+/**
+ * File-system tells the kernel to invalidate parent attributes and
+ * the dentry matching parent/name.
+ *
+ * If the child_nodeid is non-zero and:
+ * - matches the inode number for the dentry matching parent/name,
+ * - is not a mount point
+ * - is a file or oan empty directory
+ * then the dentry is unhashed (d_delete()).
+ */
+int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
+ u64 child_nodeid, struct qstr *name);
+
+int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
+ bool isdir);
+
+/**
+ * fuse_direct_io() flags
+ */
+
+/** If set, it is WRITE; otherwise - READ */
+#define FUSE_DIO_WRITE (1 << 0)
+
+/** CUSE pass fuse_direct_io() a file which f_mapping->host is not from FUSE */
+#define FUSE_DIO_CUSE (1 << 1)
+
+ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter,
+ loff_t *ppos, int flags);
+long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
+ unsigned int flags);
+long fuse_ioctl_common(struct file *file, unsigned int cmd,
+ unsigned long arg, unsigned int flags);
+unsigned fuse_file_poll(struct file *file, poll_table *wait);
+int fuse_dev_release(struct inode *inode, struct file *file);
+
+bool fuse_write_update_size(struct inode *inode, loff_t pos);
+
+int fuse_flush_times(struct inode *inode, struct fuse_file *ff);
+int fuse_write_inode(struct inode *inode, struct writeback_control *wbc);
+
+int fuse_do_setattr(struct inode *inode, struct iattr *attr,
+ struct file *file);
+
+void fuse_set_initialized(struct fuse_conn *fc);
+
+#endif /* _FS_FUSE_I_H */
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
new file mode 100644
index 000000000..708d69711
--- /dev/null
+++ b/fs/fuse/inode.c
@@ -0,0 +1,1320 @@
+/*
+ FUSE: Filesystem in Userspace
+ Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
+
+ This program can be distributed under the terms of the GNU GPL.
+ See the file COPYING.
+*/
+
+#include "fuse_i.h"
+
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/parser.h>
+#include <linux/statfs.h>
+#include <linux/random.h>
+#include <linux/sched.h>
+#include <linux/exportfs.h>
+
+MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
+MODULE_DESCRIPTION("Filesystem in Userspace");
+MODULE_LICENSE("GPL");
+
+static struct kmem_cache *fuse_inode_cachep;
+struct list_head fuse_conn_list;
+DEFINE_MUTEX(fuse_mutex);
+
+static int set_global_limit(const char *val, struct kernel_param *kp);
+
+unsigned max_user_bgreq;
+module_param_call(max_user_bgreq, set_global_limit, param_get_uint,
+ &max_user_bgreq, 0644);
+__MODULE_PARM_TYPE(max_user_bgreq, "uint");
+MODULE_PARM_DESC(max_user_bgreq,
+ "Global limit for the maximum number of backgrounded requests an "
+ "unprivileged user can set");
+
+unsigned max_user_congthresh;
+module_param_call(max_user_congthresh, set_global_limit, param_get_uint,
+ &max_user_congthresh, 0644);
+__MODULE_PARM_TYPE(max_user_congthresh, "uint");
+MODULE_PARM_DESC(max_user_congthresh,
+ "Global limit for the maximum congestion threshold an "
+ "unprivileged user can set");
+
+#define FUSE_SUPER_MAGIC 0x65735546
+
+#define FUSE_DEFAULT_BLKSIZE 512
+
+/** Maximum number of outstanding background requests */
+#define FUSE_DEFAULT_MAX_BACKGROUND 12
+
+/** Congestion starts at 75% of maximum */
+#define FUSE_DEFAULT_CONGESTION_THRESHOLD (FUSE_DEFAULT_MAX_BACKGROUND * 3 / 4)
+
+struct fuse_mount_data {
+ int fd;
+ unsigned rootmode;
+ kuid_t user_id;
+ kgid_t group_id;
+ unsigned fd_present:1;
+ unsigned rootmode_present:1;
+ unsigned user_id_present:1;
+ unsigned group_id_present:1;
+ unsigned flags;
+ unsigned max_read;
+ unsigned blksize;
+};
+
+struct fuse_forget_link *fuse_alloc_forget(void)
+{
+ return kzalloc(sizeof(struct fuse_forget_link), GFP_KERNEL);
+}
+
+static struct inode *fuse_alloc_inode(struct super_block *sb)
+{
+ struct inode *inode;
+ struct fuse_inode *fi;
+
+ inode = kmem_cache_alloc(fuse_inode_cachep, GFP_KERNEL);
+ if (!inode)
+ return NULL;
+
+ fi = get_fuse_inode(inode);
+ fi->i_time = 0;
+ fi->nodeid = 0;
+ fi->nlookup = 0;
+ fi->attr_version = 0;
+ fi->writectr = 0;
+ fi->orig_ino = 0;
+ fi->state = 0;
+ INIT_LIST_HEAD(&fi->write_files);
+ INIT_LIST_HEAD(&fi->queued_writes);
+ INIT_LIST_HEAD(&fi->writepages);
+ init_waitqueue_head(&fi->page_waitq);
+ fi->forget = fuse_alloc_forget();
+ if (!fi->forget) {
+ kmem_cache_free(fuse_inode_cachep, inode);
+ return NULL;
+ }
+
+ return inode;
+}
+
+static void fuse_i_callback(struct rcu_head *head)
+{
+ struct inode *inode = container_of(head, struct inode, i_rcu);
+ kmem_cache_free(fuse_inode_cachep, inode);
+}
+
+static void fuse_destroy_inode(struct inode *inode)
+{
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ BUG_ON(!list_empty(&fi->write_files));
+ BUG_ON(!list_empty(&fi->queued_writes));
+ kfree(fi->forget);
+ call_rcu(&inode->i_rcu, fuse_i_callback);
+}
+
+static void fuse_evict_inode(struct inode *inode)
+{
+ truncate_inode_pages_final(&inode->i_data);
+ clear_inode(inode);
+ if (inode->i_sb->s_flags & MS_ACTIVE) {
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ fuse_queue_forget(fc, fi->forget, fi->nodeid, fi->nlookup);
+ fi->forget = NULL;
+ }
+}
+
+static int fuse_remount_fs(struct super_block *sb, int *flags, char *data)
+{
+ sync_filesystem(sb);
+ if (*flags & MS_MANDLOCK)
+ return -EINVAL;
+
+ return 0;
+}
+
+/*
+ * ino_t is 32-bits on 32-bit arch. We have to squash the 64-bit value down
+ * so that it will fit.
+ */
+static ino_t fuse_squash_ino(u64 ino64)
+{
+ ino_t ino = (ino_t) ino64;
+ if (sizeof(ino_t) < sizeof(u64))
+ ino ^= ino64 >> (sizeof(u64) - sizeof(ino_t)) * 8;
+ return ino;
+}
+
+void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
+ u64 attr_valid)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+
+ fi->attr_version = ++fc->attr_version;
+ fi->i_time = attr_valid;
+
+ inode->i_ino = fuse_squash_ino(attr->ino);
+ inode->i_mode = (inode->i_mode & S_IFMT) | (attr->mode & 07777);
+ set_nlink(inode, attr->nlink);
+ inode->i_uid = make_kuid(&init_user_ns, attr->uid);
+ inode->i_gid = make_kgid(&init_user_ns, attr->gid);
+ inode->i_blocks = attr->blocks;
+ inode->i_atime.tv_sec = attr->atime;
+ inode->i_atime.tv_nsec = attr->atimensec;
+ /* mtime from server may be stale due to local buffered write */
+ if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) {
+ inode->i_mtime.tv_sec = attr->mtime;
+ inode->i_mtime.tv_nsec = attr->mtimensec;
+ inode->i_ctime.tv_sec = attr->ctime;
+ inode->i_ctime.tv_nsec = attr->ctimensec;
+ }
+
+ if (attr->blksize != 0)
+ inode->i_blkbits = ilog2(attr->blksize);
+ else
+ inode->i_blkbits = inode->i_sb->s_blocksize_bits;
+
+ /*
+ * Don't set the sticky bit in i_mode, unless we want the VFS
+ * to check permissions. This prevents failures due to the
+ * check in may_delete().
+ */
+ fi->orig_i_mode = inode->i_mode;
+ if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS))
+ inode->i_mode &= ~S_ISVTX;
+
+ fi->orig_ino = attr->ino;
+}
+
+void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
+ u64 attr_valid, u64 attr_version)
+{
+ struct fuse_conn *fc = get_fuse_conn(inode);
+ struct fuse_inode *fi = get_fuse_inode(inode);
+ bool is_wb = fc->writeback_cache;
+ loff_t oldsize;
+ struct timespec old_mtime;
+
+ spin_lock(&fc->lock);
+ if ((attr_version != 0 && fi->attr_version > attr_version) ||
+ test_bit(FUSE_I_SIZE_UNSTABLE, &fi->state)) {
+ spin_unlock(&fc->lock);
+ return;
+ }
+
+ old_mtime = inode->i_mtime;
+ fuse_change_attributes_common(inode, attr, attr_valid);
+
+ oldsize = inode->i_size;
+ /*
+ * In case of writeback_cache enabled, the cached writes beyond EOF
+ * extend local i_size without keeping userspace server in sync. So,
+ * attr->size coming from server can be stale. We cannot trust it.
+ */
+ if (!is_wb || !S_ISREG(inode->i_mode))
+ i_size_write(inode, attr->size);
+ spin_unlock(&fc->lock);
+
+ if (!is_wb && S_ISREG(inode->i_mode)) {
+ bool inval = false;
+
+ if (oldsize != attr->size) {
+ truncate_pagecache(inode, attr->size);
+ inval = true;
+ } else if (fc->auto_inval_data) {
+ struct timespec new_mtime = {
+ .tv_sec = attr->mtime,
+ .tv_nsec = attr->mtimensec,
+ };
+
+ /*
+ * Auto inval mode also checks and invalidates if mtime
+ * has changed.
+ */
+ if (!timespec_equal(&old_mtime, &new_mtime))
+ inval = true;
+ }
+
+ if (inval)
+ invalidate_inode_pages2(inode->i_mapping);
+ }
+}
+
+static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
+{
+ inode->i_mode = attr->mode & S_IFMT;
+ inode->i_size = attr->size;
+ inode->i_mtime.tv_sec = attr->mtime;
+ inode->i_mtime.tv_nsec = attr->mtimensec;
+ inode->i_ctime.tv_sec = attr->ctime;
+ inode->i_ctime.tv_nsec = attr->ctimensec;
+ if (S_ISREG(inode->i_mode)) {
+ fuse_init_common(inode);
+ fuse_init_file_inode(inode);
+ } else if (S_ISDIR(inode->i_mode))
+ fuse_init_dir(inode);
+ else if (S_ISLNK(inode->i_mode))
+ fuse_init_symlink(inode);
+ else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
+ S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
+ fuse_init_common(inode);
+ init_special_inode(inode, inode->i_mode,
+ new_decode_dev(attr->rdev));
+ } else
+ BUG();
+}
+
+int fuse_inode_eq(struct inode *inode, void *_nodeidp)
+{
+ u64 nodeid = *(u64 *) _nodeidp;
+ if (get_node_id(inode) == nodeid)
+ return 1;
+ else
+ return 0;
+}
+
+static int fuse_inode_set(struct inode *inode, void *_nodeidp)
+{
+ u64 nodeid = *(u64 *) _nodeidp;
+ get_fuse_inode(inode)->nodeid = nodeid;
+ return 0;
+}
+
+struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
+ int generation, struct fuse_attr *attr,
+ u64 attr_valid, u64 attr_version)
+{
+ struct inode *inode;
+ struct fuse_inode *fi;
+ struct fuse_conn *fc = get_fuse_conn_super(sb);
+
+ retry:
+ inode = iget5_locked(sb, nodeid, fuse_inode_eq, fuse_inode_set, &nodeid);
+ if (!inode)
+ return NULL;
+
+ if ((inode->i_state & I_NEW)) {
+ inode->i_flags |= S_NOATIME;
+ if (!fc->writeback_cache || !S_ISREG(attr->mode))
+ inode->i_flags |= S_NOCMTIME;
+ inode->i_generation = generation;
+ fuse_init_inode(inode, attr);
+ unlock_new_inode(inode);
+ } else if ((inode->i_mode ^ attr->mode) & S_IFMT) {
+ /* Inode has changed type, any I/O on the old should fail */
+ make_bad_inode(inode);
+ iput(inode);
+ goto retry;
+ }
+
+ fi = get_fuse_inode(inode);
+ spin_lock(&fc->lock);
+ fi->nlookup++;
+ spin_unlock(&fc->lock);
+ fuse_change_attributes(inode, attr, attr_valid, attr_version);
+
+ return inode;
+}
+
+int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
+ loff_t offset, loff_t len)
+{
+ struct inode *inode;
+ pgoff_t pg_start;
+ pgoff_t pg_end;
+
+ inode = ilookup5(sb, nodeid, fuse_inode_eq, &nodeid);
+ if (!inode)
+ return -ENOENT;
+
+ fuse_invalidate_attr(inode);
+ if (offset >= 0) {
+ pg_start = offset >> PAGE_CACHE_SHIFT;
+ if (len <= 0)
+ pg_end = -1;
+ else
+ pg_end = (offset + len - 1) >> PAGE_CACHE_SHIFT;
+ invalidate_inode_pages2_range(inode->i_mapping,
+ pg_start, pg_end);
+ }
+ iput(inode);
+ return 0;
+}
+
+static void fuse_umount_begin(struct super_block *sb)
+{
+ fuse_abort_conn(get_fuse_conn_super(sb));
+}
+
+static void fuse_send_destroy(struct fuse_conn *fc)
+{
+ struct fuse_req *req = fc->destroy_req;
+ if (req && fc->conn_init) {
+ fc->destroy_req = NULL;
+ req->in.h.opcode = FUSE_DESTROY;
+ req->force = 1;
+ req->background = 0;
+ fuse_request_send(fc, req);
+ fuse_put_request(fc, req);
+ }
+}
+
+static void fuse_bdi_destroy(struct fuse_conn *fc)
+{
+ if (fc->bdi_initialized)
+ bdi_destroy(&fc->bdi);
+}
+
+static void fuse_put_super(struct super_block *sb)
+{
+ struct fuse_conn *fc = get_fuse_conn_super(sb);
+
+ fuse_send_destroy(fc);
+
+ fuse_abort_conn(fc);
+ mutex_lock(&fuse_mutex);
+ list_del(&fc->entry);
+ fuse_ctl_remove_conn(fc);
+ mutex_unlock(&fuse_mutex);
+ fuse_bdi_destroy(fc);
+
+ fuse_conn_put(fc);
+}
+
+static void convert_fuse_statfs(struct kstatfs *stbuf, struct fuse_kstatfs *attr)
+{
+ stbuf->f_type = FUSE_SUPER_MAGIC;
+ stbuf->f_bsize = attr->bsize;
+ stbuf->f_frsize = attr->frsize;
+ stbuf->f_blocks = attr->blocks;
+ stbuf->f_bfree = attr->bfree;
+ stbuf->f_bavail = attr->bavail;
+ stbuf->f_files = attr->files;
+ stbuf->f_ffree = attr->ffree;
+ stbuf->f_namelen = attr->namelen;
+ /* fsid is left zero */
+}
+
+static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ struct super_block *sb = dentry->d_sb;
+ struct fuse_conn *fc = get_fuse_conn_super(sb);
+ FUSE_ARGS(args);
+ struct fuse_statfs_out outarg;
+ int err;
+
+ if (!fuse_allow_current_process(fc)) {
+ buf->f_type = FUSE_SUPER_MAGIC;
+ return 0;
+ }
+
+ memset(&outarg, 0, sizeof(outarg));
+ args.in.numargs = 0;
+ args.in.h.opcode = FUSE_STATFS;
+ args.in.h.nodeid = get_node_id(d_inode(dentry));
+ args.out.numargs = 1;
+ args.out.args[0].size = sizeof(outarg);
+ args.out.args[0].value = &outarg;
+ err = fuse_simple_request(fc, &args);
+ if (!err)
+ convert_fuse_statfs(buf, &outarg.st);
+ return err;
+}
+
+enum {
+ OPT_FD,
+ OPT_ROOTMODE,
+ OPT_USER_ID,
+ OPT_GROUP_ID,
+ OPT_DEFAULT_PERMISSIONS,
+ OPT_ALLOW_OTHER,
+ OPT_MAX_READ,
+ OPT_BLKSIZE,
+ OPT_ERR
+};
+
+static const match_table_t tokens = {
+ {OPT_FD, "fd=%u"},
+ {OPT_ROOTMODE, "rootmode=%o"},
+ {OPT_USER_ID, "user_id=%u"},
+ {OPT_GROUP_ID, "group_id=%u"},
+ {OPT_DEFAULT_PERMISSIONS, "default_permissions"},
+ {OPT_ALLOW_OTHER, "allow_other"},
+ {OPT_MAX_READ, "max_read=%u"},
+ {OPT_BLKSIZE, "blksize=%u"},
+ {OPT_ERR, NULL}
+};
+
+static int fuse_match_uint(substring_t *s, unsigned int *res)
+{
+ int err = -ENOMEM;
+ char *buf = match_strdup(s);
+ if (buf) {
+ err = kstrtouint(buf, 10, res);
+ kfree(buf);
+ }
+ return err;
+}
+
+static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)
+{
+ char *p;
+ memset(d, 0, sizeof(struct fuse_mount_data));
+ d->max_read = ~0;
+ d->blksize = FUSE_DEFAULT_BLKSIZE;
+
+ while ((p = strsep(&opt, ",")) != NULL) {
+ int token;
+ int value;
+ unsigned uv;
+ substring_t args[MAX_OPT_ARGS];
+ if (!*p)
+ continue;
+
+ token = match_token(p, tokens, args);
+ switch (token) {
+ case OPT_FD:
+ if (match_int(&args[0], &value))
+ return 0;
+ d->fd = value;
+ d->fd_present = 1;
+ break;
+
+ case OPT_ROOTMODE:
+ if (match_octal(&args[0], &value))
+ return 0;
+ if (!fuse_valid_type(value))
+ return 0;
+ d->rootmode = value;
+ d->rootmode_present = 1;
+ break;
+
+ case OPT_USER_ID:
+ if (fuse_match_uint(&args[0], &uv))
+ return 0;
+ d->user_id = make_kuid(current_user_ns(), uv);
+ if (!uid_valid(d->user_id))
+ return 0;
+ d->user_id_present = 1;
+ break;
+
+ case OPT_GROUP_ID:
+ if (fuse_match_uint(&args[0], &uv))
+ return 0;
+ d->group_id = make_kgid(current_user_ns(), uv);
+ if (!gid_valid(d->group_id))
+ return 0;
+ d->group_id_present = 1;
+ break;
+
+ case OPT_DEFAULT_PERMISSIONS:
+ d->flags |= FUSE_DEFAULT_PERMISSIONS;
+ break;
+
+ case OPT_ALLOW_OTHER:
+ d->flags |= FUSE_ALLOW_OTHER;
+ break;
+
+ case OPT_MAX_READ:
+ if (match_int(&args[0], &value))
+ return 0;
+ d->max_read = value;
+ break;
+
+ case OPT_BLKSIZE:
+ if (!is_bdev || match_int(&args[0], &value))
+ return 0;
+ d->blksize = value;
+ break;
+
+ default:
+ return 0;
+ }
+ }
+
+ if (!d->fd_present || !d->rootmode_present ||
+ !d->user_id_present || !d->group_id_present)
+ return 0;
+
+ return 1;
+}
+
+static int fuse_show_options(struct seq_file *m, struct dentry *root)
+{
+ struct super_block *sb = root->d_sb;
+ struct fuse_conn *fc = get_fuse_conn_super(sb);
+
+ seq_printf(m, ",user_id=%u", from_kuid_munged(&init_user_ns, fc->user_id));
+ seq_printf(m, ",group_id=%u", from_kgid_munged(&init_user_ns, fc->group_id));
+ if (fc->flags & FUSE_DEFAULT_PERMISSIONS)
+ seq_puts(m, ",default_permissions");
+ if (fc->flags & FUSE_ALLOW_OTHER)
+ seq_puts(m, ",allow_other");
+ if (fc->max_read != ~0)
+ seq_printf(m, ",max_read=%u", fc->max_read);
+ if (sb->s_bdev && sb->s_blocksize != FUSE_DEFAULT_BLKSIZE)
+ seq_printf(m, ",blksize=%lu", sb->s_blocksize);
+ return 0;
+}
+
+void fuse_conn_init(struct fuse_conn *fc)
+{
+ memset(fc, 0, sizeof(*fc));
+ spin_lock_init(&fc->lock);
+ init_rwsem(&fc->killsb);
+ atomic_set(&fc->count, 1);
+ init_waitqueue_head(&fc->waitq);
+ init_waitqueue_head(&fc->blocked_waitq);
+ init_waitqueue_head(&fc->reserved_req_waitq);
+ INIT_LIST_HEAD(&fc->pending);
+ INIT_LIST_HEAD(&fc->processing);
+ INIT_LIST_HEAD(&fc->io);
+ INIT_LIST_HEAD(&fc->interrupts);
+ INIT_LIST_HEAD(&fc->bg_queue);
+ INIT_LIST_HEAD(&fc->entry);
+ fc->forget_list_tail = &fc->forget_list_head;
+ atomic_set(&fc->num_waiting, 0);
+ fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND;
+ fc->congestion_threshold = FUSE_DEFAULT_CONGESTION_THRESHOLD;
+ fc->khctr = 0;
+ fc->polled_files = RB_ROOT;
+ fc->reqctr = 0;
+ fc->blocked = 0;
+ fc->initialized = 0;
+ fc->attr_version = 1;
+ get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key));
+}
+EXPORT_SYMBOL_GPL(fuse_conn_init);
+
+void fuse_conn_put(struct fuse_conn *fc)
+{
+ if (atomic_dec_and_test(&fc->count)) {
+ if (fc->destroy_req)
+ fuse_request_free(fc->destroy_req);
+ fc->release(fc);
+ }
+}
+EXPORT_SYMBOL_GPL(fuse_conn_put);
+
+struct fuse_conn *fuse_conn_get(struct fuse_conn *fc)
+{
+ atomic_inc(&fc->count);
+ return fc;
+}
+EXPORT_SYMBOL_GPL(fuse_conn_get);
+
+static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode)
+{
+ struct fuse_attr attr;
+ memset(&attr, 0, sizeof(attr));
+
+ attr.mode = mode;
+ attr.ino = FUSE_ROOT_ID;
+ attr.nlink = 1;
+ return fuse_iget(sb, 1, 0, &attr, 0, 0);
+}
+
+struct fuse_inode_handle {
+ u64 nodeid;
+ u32 generation;
+};
+
+static struct dentry *fuse_get_dentry(struct super_block *sb,
+ struct fuse_inode_handle *handle)
+{
+ struct fuse_conn *fc = get_fuse_conn_super(sb);
+ struct inode *inode;
+ struct dentry *entry;
+ int err = -ESTALE;
+
+ if (handle->nodeid == 0)
+ goto out_err;
+
+ inode = ilookup5(sb, handle->nodeid, fuse_inode_eq, &handle->nodeid);
+ if (!inode) {
+ struct fuse_entry_out outarg;
+ struct qstr name;
+
+ if (!fc->export_support)
+ goto out_err;
+
+ name.len = 1;
+ name.name = ".";
+ err = fuse_lookup_name(sb, handle->nodeid, &name, &outarg,
+ &inode);
+ if (err && err != -ENOENT)
+ goto out_err;
+ if (err || !inode) {
+ err = -ESTALE;
+ goto out_err;
+ }
+ err = -EIO;
+ if (get_node_id(inode) != handle->nodeid)
+ goto out_iput;
+ }
+ err = -ESTALE;
+ if (inode->i_generation != handle->generation)
+ goto out_iput;
+
+ entry = d_obtain_alias(inode);
+ if (!IS_ERR(entry) && get_node_id(inode) != FUSE_ROOT_ID)
+ fuse_invalidate_entry_cache(entry);
+
+ return entry;
+
+ out_iput:
+ iput(inode);
+ out_err:
+ return ERR_PTR(err);
+}
+
+static int fuse_encode_fh(struct inode *inode, u32 *fh, int *max_len,
+ struct inode *parent)
+{
+ int len = parent ? 6 : 3;
+ u64 nodeid;
+ u32 generation;
+
+ if (*max_len < len) {
+ *max_len = len;
+ return FILEID_INVALID;
+ }
+
+ nodeid = get_fuse_inode(inode)->nodeid;
+ generation = inode->i_generation;
+
+ fh[0] = (u32)(nodeid >> 32);
+ fh[1] = (u32)(nodeid & 0xffffffff);
+ fh[2] = generation;
+
+ if (parent) {
+ nodeid = get_fuse_inode(parent)->nodeid;
+ generation = parent->i_generation;
+
+ fh[3] = (u32)(nodeid >> 32);
+ fh[4] = (u32)(nodeid & 0xffffffff);
+ fh[5] = generation;
+ }
+
+ *max_len = len;
+ return parent ? 0x82 : 0x81;
+}
+
+static struct dentry *fuse_fh_to_dentry(struct super_block *sb,
+ struct fid *fid, int fh_len, int fh_type)
+{
+ struct fuse_inode_handle handle;
+
+ if ((fh_type != 0x81 && fh_type != 0x82) || fh_len < 3)
+ return NULL;
+
+ handle.nodeid = (u64) fid->raw[0] << 32;
+ handle.nodeid |= (u64) fid->raw[1];
+ handle.generation = fid->raw[2];
+ return fuse_get_dentry(sb, &handle);
+}
+
+static struct dentry *fuse_fh_to_parent(struct super_block *sb,
+ struct fid *fid, int fh_len, int fh_type)
+{
+ struct fuse_inode_handle parent;
+
+ if (fh_type != 0x82 || fh_len < 6)
+ return NULL;
+
+ parent.nodeid = (u64) fid->raw[3] << 32;
+ parent.nodeid |= (u64) fid->raw[4];
+ parent.generation = fid->raw[5];
+ return fuse_get_dentry(sb, &parent);
+}
+
+static struct dentry *fuse_get_parent(struct dentry *child)
+{
+ struct inode *child_inode = d_inode(child);
+ struct fuse_conn *fc = get_fuse_conn(child_inode);
+ struct inode *inode;
+ struct dentry *parent;
+ struct fuse_entry_out outarg;
+ struct qstr name;
+ int err;
+
+ if (!fc->export_support)
+ return ERR_PTR(-ESTALE);
+
+ name.len = 2;
+ name.name = "..";
+ err = fuse_lookup_name(child_inode->i_sb, get_node_id(child_inode),
+ &name, &outarg, &inode);
+ if (err) {
+ if (err == -ENOENT)
+ return ERR_PTR(-ESTALE);
+ return ERR_PTR(err);
+ }
+
+ parent = d_obtain_alias(inode);
+ if (!IS_ERR(parent) && get_node_id(inode) != FUSE_ROOT_ID)
+ fuse_invalidate_entry_cache(parent);
+
+ return parent;
+}
+
+static const struct export_operations fuse_export_operations = {
+ .fh_to_dentry = fuse_fh_to_dentry,
+ .fh_to_parent = fuse_fh_to_parent,
+ .encode_fh = fuse_encode_fh,
+ .get_parent = fuse_get_parent,
+};
+
+static const struct super_operations fuse_super_operations = {
+ .alloc_inode = fuse_alloc_inode,
+ .destroy_inode = fuse_destroy_inode,
+ .evict_inode = fuse_evict_inode,
+ .write_inode = fuse_write_inode,
+ .drop_inode = generic_delete_inode,
+ .remount_fs = fuse_remount_fs,
+ .put_super = fuse_put_super,
+ .umount_begin = fuse_umount_begin,
+ .statfs = fuse_statfs,
+ .show_options = fuse_show_options,
+};
+
+static void sanitize_global_limit(unsigned *limit)
+{
+ if (*limit == 0)
+ *limit = ((totalram_pages << PAGE_SHIFT) >> 13) /
+ sizeof(struct fuse_req);
+
+ if (*limit >= 1 << 16)
+ *limit = (1 << 16) - 1;
+}
+
+static int set_global_limit(const char *val, struct kernel_param *kp)
+{
+ int rv;
+
+ rv = param_set_uint(val, kp);
+ if (rv)
+ return rv;
+
+ sanitize_global_limit((unsigned *)kp->arg);
+
+ return 0;
+}
+
+static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg)
+{
+ int cap_sys_admin = capable(CAP_SYS_ADMIN);
+
+ if (arg->minor < 13)
+ return;
+
+ sanitize_global_limit(&max_user_bgreq);
+ sanitize_global_limit(&max_user_congthresh);
+
+ if (arg->max_background) {
+ fc->max_background = arg->max_background;
+
+ if (!cap_sys_admin && fc->max_background > max_user_bgreq)
+ fc->max_background = max_user_bgreq;
+ }
+ if (arg->congestion_threshold) {
+ fc->congestion_threshold = arg->congestion_threshold;
+
+ if (!cap_sys_admin &&
+ fc->congestion_threshold > max_user_congthresh)
+ fc->congestion_threshold = max_user_congthresh;
+ }
+}
+
+static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
+{
+ struct fuse_init_out *arg = &req->misc.init_out;
+
+ if (req->out.h.error || arg->major != FUSE_KERNEL_VERSION)
+ fc->conn_error = 1;
+ else {
+ unsigned long ra_pages;
+
+ process_init_limits(fc, arg);
+
+ if (arg->minor >= 6) {
+ ra_pages = arg->max_readahead / PAGE_CACHE_SIZE;
+ if (arg->flags & FUSE_ASYNC_READ)
+ fc->async_read = 1;
+ if (!(arg->flags & FUSE_POSIX_LOCKS))
+ fc->no_lock = 1;
+ if (arg->minor >= 17) {
+ if (!(arg->flags & FUSE_FLOCK_LOCKS))
+ fc->no_flock = 1;
+ } else {
+ if (!(arg->flags & FUSE_POSIX_LOCKS))
+ fc->no_flock = 1;
+ }
+ if (arg->flags & FUSE_ATOMIC_O_TRUNC)
+ fc->atomic_o_trunc = 1;
+ if (arg->minor >= 9) {
+ /* LOOKUP has dependency on proto version */
+ if (arg->flags & FUSE_EXPORT_SUPPORT)
+ fc->export_support = 1;
+ }
+ if (arg->flags & FUSE_BIG_WRITES)
+ fc->big_writes = 1;
+ if (arg->flags & FUSE_DONT_MASK)
+ fc->dont_mask = 1;
+ if (arg->flags & FUSE_AUTO_INVAL_DATA)
+ fc->auto_inval_data = 1;
+ if (arg->flags & FUSE_DO_READDIRPLUS) {
+ fc->do_readdirplus = 1;
+ if (arg->flags & FUSE_READDIRPLUS_AUTO)
+ fc->readdirplus_auto = 1;
+ }
+ if (arg->flags & FUSE_ASYNC_DIO)
+ fc->async_dio = 1;
+ if (arg->flags & FUSE_WRITEBACK_CACHE)
+ fc->writeback_cache = 1;
+ if (arg->time_gran && arg->time_gran <= 1000000000)
+ fc->sb->s_time_gran = arg->time_gran;
+ } else {
+ ra_pages = fc->max_read / PAGE_CACHE_SIZE;
+ fc->no_lock = 1;
+ fc->no_flock = 1;
+ }
+
+ fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages);
+ fc->minor = arg->minor;
+ fc->max_write = arg->minor < 5 ? 4096 : arg->max_write;
+ fc->max_write = max_t(unsigned, 4096, fc->max_write);
+ fc->conn_init = 1;
+ }
+ fuse_set_initialized(fc);
+ wake_up_all(&fc->blocked_waitq);
+}
+
+static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
+{
+ struct fuse_init_in *arg = &req->misc.init_in;
+
+ arg->major = FUSE_KERNEL_VERSION;
+ arg->minor = FUSE_KERNEL_MINOR_VERSION;
+ arg->max_readahead = fc->bdi.ra_pages * PAGE_CACHE_SIZE;
+ arg->flags |= FUSE_ASYNC_READ | FUSE_POSIX_LOCKS | FUSE_ATOMIC_O_TRUNC |
+ FUSE_EXPORT_SUPPORT | FUSE_BIG_WRITES | FUSE_DONT_MASK |
+ FUSE_SPLICE_WRITE | FUSE_SPLICE_MOVE | FUSE_SPLICE_READ |
+ FUSE_FLOCK_LOCKS | FUSE_IOCTL_DIR | FUSE_AUTO_INVAL_DATA |
+ FUSE_DO_READDIRPLUS | FUSE_READDIRPLUS_AUTO | FUSE_ASYNC_DIO |
+ FUSE_WRITEBACK_CACHE | FUSE_NO_OPEN_SUPPORT;
+ req->in.h.opcode = FUSE_INIT;
+ req->in.numargs = 1;
+ req->in.args[0].size = sizeof(*arg);
+ req->in.args[0].value = arg;
+ req->out.numargs = 1;
+ /* Variable length argument used for backward compatibility
+ with interface version < 7.5. Rest of init_out is zeroed
+ by do_get_request(), so a short reply is not a problem */
+ req->out.argvar = 1;
+ req->out.args[0].size = sizeof(struct fuse_init_out);
+ req->out.args[0].value = &req->misc.init_out;
+ req->end = process_init_reply;
+ fuse_request_send_background(fc, req);
+}
+
+static void fuse_free_conn(struct fuse_conn *fc)
+{
+ kfree_rcu(fc, rcu);
+}
+
+static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
+{
+ int err;
+
+ fc->bdi.name = "fuse";
+ fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
+ /* fuse does it's own writeback accounting */
+ fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB | BDI_CAP_STRICTLIMIT;
+
+ err = bdi_init(&fc->bdi);
+ if (err)
+ return err;
+
+ fc->bdi_initialized = 1;
+
+ if (sb->s_bdev) {
+ err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk",
+ MAJOR(fc->dev), MINOR(fc->dev));
+ } else {
+ err = bdi_register_dev(&fc->bdi, fc->dev);
+ }
+
+ if (err)
+ return err;
+
+ /*
+ * For a single fuse filesystem use max 1% of dirty +
+ * writeback threshold.
+ *
+ * This gives about 1M of write buffer for memory maps on a
+ * machine with 1G and 10% dirty_ratio, which should be more
+ * than enough.
+ *
+ * Privileged users can raise it by writing to
+ *
+ * /sys/class/bdi/<bdi>/max_ratio
+ */
+ bdi_set_max_ratio(&fc->bdi, 1);
+
+ return 0;
+}
+
+static int fuse_fill_super(struct super_block *sb, void *data, int silent)
+{
+ struct fuse_conn *fc;
+ struct inode *root;
+ struct fuse_mount_data d;
+ struct file *file;
+ struct dentry *root_dentry;
+ struct fuse_req *init_req;
+ int err;
+ int is_bdev = sb->s_bdev != NULL;
+
+ err = -EINVAL;
+ if (sb->s_flags & MS_MANDLOCK)
+ goto err;
+
+ sb->s_flags &= ~(MS_NOSEC | MS_I_VERSION);
+
+ if (!parse_fuse_opt(data, &d, is_bdev))
+ goto err;
+
+ if (is_bdev) {
+#ifdef CONFIG_BLOCK
+ err = -EINVAL;
+ if (!sb_set_blocksize(sb, d.blksize))
+ goto err;
+#endif
+ } else {
+ sb->s_blocksize = PAGE_CACHE_SIZE;
+ sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+ }
+ sb->s_magic = FUSE_SUPER_MAGIC;
+ sb->s_op = &fuse_super_operations;
+ sb->s_maxbytes = MAX_LFS_FILESIZE;
+ sb->s_time_gran = 1;
+ sb->s_export_op = &fuse_export_operations;
+
+ file = fget(d.fd);
+ err = -EINVAL;
+ if (!file)
+ goto err;
+
+ if ((file->f_op != &fuse_dev_operations) ||
+ (file->f_cred->user_ns != &init_user_ns))
+ goto err_fput;
+
+ fc = kmalloc(sizeof(*fc), GFP_KERNEL);
+ err = -ENOMEM;
+ if (!fc)
+ goto err_fput;
+
+ fuse_conn_init(fc);
+ fc->release = fuse_free_conn;
+
+ fc->dev = sb->s_dev;
+ fc->sb = sb;
+ err = fuse_bdi_init(fc, sb);
+ if (err)
+ goto err_put_conn;
+
+ sb->s_bdi = &fc->bdi;
+
+ /* Handle umasking inside the fuse code */
+ if (sb->s_flags & MS_POSIXACL)
+ fc->dont_mask = 1;
+ sb->s_flags |= MS_POSIXACL;
+
+ fc->flags = d.flags;
+ fc->user_id = d.user_id;
+ fc->group_id = d.group_id;
+ fc->max_read = max_t(unsigned, 4096, d.max_read);
+
+ /* Used by get_root_inode() */
+ sb->s_fs_info = fc;
+
+ err = -ENOMEM;
+ root = fuse_get_root_inode(sb, d.rootmode);
+ root_dentry = d_make_root(root);
+ if (!root_dentry)
+ goto err_put_conn;
+ /* only now - we want root dentry with NULL ->d_op */
+ sb->s_d_op = &fuse_dentry_operations;
+
+ init_req = fuse_request_alloc(0);
+ if (!init_req)
+ goto err_put_root;
+ init_req->background = 1;
+
+ if (is_bdev) {
+ fc->destroy_req = fuse_request_alloc(0);
+ if (!fc->destroy_req)
+ goto err_free_init_req;
+ }
+
+ mutex_lock(&fuse_mutex);
+ err = -EINVAL;
+ if (file->private_data)
+ goto err_unlock;
+
+ err = fuse_ctl_add_conn(fc);
+ if (err)
+ goto err_unlock;
+
+ list_add_tail(&fc->entry, &fuse_conn_list);
+ sb->s_root = root_dentry;
+ fc->connected = 1;
+ file->private_data = fuse_conn_get(fc);
+ mutex_unlock(&fuse_mutex);
+ /*
+ * atomic_dec_and_test() in fput() provides the necessary
+ * memory barrier for file->private_data to be visible on all
+ * CPUs after this
+ */
+ fput(file);
+
+ fuse_send_init(fc, init_req);
+
+ return 0;
+
+ err_unlock:
+ mutex_unlock(&fuse_mutex);
+ err_free_init_req:
+ fuse_request_free(init_req);
+ err_put_root:
+ dput(root_dentry);
+ err_put_conn:
+ fuse_bdi_destroy(fc);
+ fuse_conn_put(fc);
+ err_fput:
+ fput(file);
+ err:
+ return err;
+}
+
+static struct dentry *fuse_mount(struct file_system_type *fs_type,
+ int flags, const char *dev_name,
+ void *raw_data)
+{
+ return mount_nodev(fs_type, flags, raw_data, fuse_fill_super);
+}
+
+static void fuse_kill_sb_anon(struct super_block *sb)
+{
+ struct fuse_conn *fc = get_fuse_conn_super(sb);
+
+ if (fc) {
+ down_write(&fc->killsb);
+ fc->sb = NULL;
+ up_write(&fc->killsb);
+ }
+
+ kill_anon_super(sb);
+}
+
+static struct file_system_type fuse_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "fuse",
+ .fs_flags = FS_HAS_SUBTYPE,
+ .mount = fuse_mount,
+ .kill_sb = fuse_kill_sb_anon,
+};
+MODULE_ALIAS_FS("fuse");
+
+#ifdef CONFIG_BLOCK
+static struct dentry *fuse_mount_blk(struct file_system_type *fs_type,
+ int flags, const char *dev_name,
+ void *raw_data)
+{
+ return mount_bdev(fs_type, flags, dev_name, raw_data, fuse_fill_super);
+}
+
+static void fuse_kill_sb_blk(struct super_block *sb)
+{
+ struct fuse_conn *fc = get_fuse_conn_super(sb);
+
+ if (fc) {
+ down_write(&fc->killsb);
+ fc->sb = NULL;
+ up_write(&fc->killsb);
+ }
+
+ kill_block_super(sb);
+}
+
+static struct file_system_type fuseblk_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "fuseblk",
+ .mount = fuse_mount_blk,
+ .kill_sb = fuse_kill_sb_blk,
+ .fs_flags = FS_REQUIRES_DEV | FS_HAS_SUBTYPE,
+};
+MODULE_ALIAS_FS("fuseblk");
+
+static inline int register_fuseblk(void)
+{
+ return register_filesystem(&fuseblk_fs_type);
+}
+
+static inline void unregister_fuseblk(void)
+{
+ unregister_filesystem(&fuseblk_fs_type);
+}
+#else
+static inline int register_fuseblk(void)
+{
+ return 0;
+}
+
+static inline void unregister_fuseblk(void)
+{
+}
+#endif
+
+static void fuse_inode_init_once(void *foo)
+{
+ struct inode *inode = foo;
+
+ inode_init_once(inode);
+}
+
+static int __init fuse_fs_init(void)
+{
+ int err;
+
+ fuse_inode_cachep = kmem_cache_create("fuse_inode",
+ sizeof(struct fuse_inode),
+ 0, SLAB_HWCACHE_ALIGN,
+ fuse_inode_init_once);
+ err = -ENOMEM;
+ if (!fuse_inode_cachep)
+ goto out;
+
+ err = register_fuseblk();
+ if (err)
+ goto out2;
+
+ err = register_filesystem(&fuse_fs_type);
+ if (err)
+ goto out3;
+
+ return 0;
+
+ out3:
+ unregister_fuseblk();
+ out2:
+ kmem_cache_destroy(fuse_inode_cachep);
+ out:
+ return err;
+}
+
+static void fuse_fs_cleanup(void)
+{
+ unregister_filesystem(&fuse_fs_type);
+ unregister_fuseblk();
+
+ /*
+ * Make sure all delayed rcu free inodes are flushed before we
+ * destroy cache.
+ */
+ rcu_barrier();
+ kmem_cache_destroy(fuse_inode_cachep);
+}
+
+static struct kobject *fuse_kobj;
+
+static int fuse_sysfs_init(void)
+{
+ int err;
+
+ fuse_kobj = kobject_create_and_add("fuse", fs_kobj);
+ if (!fuse_kobj) {
+ err = -ENOMEM;
+ goto out_err;
+ }
+
+ err = sysfs_create_mount_point(fuse_kobj, "connections");
+ if (err)
+ goto out_fuse_unregister;
+
+ return 0;
+
+ out_fuse_unregister:
+ kobject_put(fuse_kobj);
+ out_err:
+ return err;
+}
+
+static void fuse_sysfs_cleanup(void)
+{
+ sysfs_remove_mount_point(fuse_kobj, "connections");
+ kobject_put(fuse_kobj);
+}
+
+static int __init fuse_init(void)
+{
+ int res;
+
+ printk(KERN_INFO "fuse init (API version %i.%i)\n",
+ FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION);
+
+ INIT_LIST_HEAD(&fuse_conn_list);
+ res = fuse_fs_init();
+ if (res)
+ goto err;
+
+ res = fuse_dev_init();
+ if (res)
+ goto err_fs_cleanup;
+
+ res = fuse_sysfs_init();
+ if (res)
+ goto err_dev_cleanup;
+
+ res = fuse_ctl_init();
+ if (res)
+ goto err_sysfs_cleanup;
+
+ sanitize_global_limit(&max_user_bgreq);
+ sanitize_global_limit(&max_user_congthresh);
+
+ return 0;
+
+ err_sysfs_cleanup:
+ fuse_sysfs_cleanup();
+ err_dev_cleanup:
+ fuse_dev_cleanup();
+ err_fs_cleanup:
+ fuse_fs_cleanup();
+ err:
+ return res;
+}
+
+static void __exit fuse_exit(void)
+{
+ printk(KERN_DEBUG "fuse exit\n");
+
+ fuse_ctl_cleanup();
+ fuse_sysfs_cleanup();
+ fuse_fs_cleanup();
+ fuse_dev_cleanup();
+}
+
+module_init(fuse_init);
+module_exit(fuse_exit);