/*
 * Copyright (C) 2005-2016 Junjiro R. Okajima
 */

/*
 * external inode number translation table and bitmap
 */

#include <linux/seq_file.h>
#include <linux/statfs.h>
#include "aufs.h"

/* todo: unnecessary to support mmap_sem since kernel-space? */
/*
 * read @size bytes at *@pos from @file into the kernel buffer @kbuf via @func.
 * the address limit is switched to KERNEL_DS around the call, and the read is
 * repeated for as long as @func returns -EAGAIN or -EINTR.
 * returns the final value of @func.
 */
ssize_t xino_fread(vfs_readf_t func, struct file *file, void *kbuf, size_t size,
		   loff_t *pos)
{
	ssize_t nread;
	mm_segment_t saved_fs;
	/* lets the kernel pointer be handed over as a __user pointer */
	union {
		void *k;
		char __user *u;
	} ubuf;

	ubuf.k = kbuf;
	saved_fs = get_fs();
	set_fs(KERNEL_DS);
	for (;;) {
		/* todo: signal_pending? */
		nread = func(file, ubuf.u, size, pos);
		if (nread != -EAGAIN && nread != -EINTR)
			break;
	}
	set_fs(saved_fs);

#if 0 /* reserved for future use */
	if (nread > 0)
		fsnotify_access(file->f_path.dentry);
#endif

	return nread;
}

/* ---------------------------------------------------------------------- */

static ssize_t xino_fwrite_wkq(vfs_writef_t func, struct file *file, void *buf,
			       size_t size, loff_t *pos);

/*
 * write @size bytes from the kernel buffer @kbuf to @file at *@pos via @func,
 * with the address limit switched to KERNEL_DS around the call.
 * -EAGAIN and -EINTR are retried, but only a bounded number of times
 * (prevent_endless).
 * returns the last value produced by @func or by xino_fwrite_wkq().
 */
static ssize_t do_xino_fwrite(vfs_writef_t func, struct file *file, void *kbuf,
			      size_t size, loff_t *pos)
{
	ssize_t err;
	mm_segment_t oldfs;
	/* lets the kernel pointer be handed over as a __user pointer */
	union {
		void *k;
		const char __user *u;
	} buf;
	int i;
	const int prevent_endless = 10;

	i = 0;
	buf.k = kbuf;
	oldfs = get_fs();
	set_fs(KERNEL_DS);
	do {
		err = func(file, buf.u, size, pos);
		if (err == -EINTR
		    && !au_wkq_test()
		    && fatal_signal_pending(current)) {
			/*
			 * interrupted while a fatal signal is pending and
			 * au_wkq_test() is false: put the address limit back
			 * and redo the write through xino_fwrite_wkq(), then
			 * switch to KERNEL_DS again for the rest of the loop.
			 */
			set_fs(oldfs);
			err = xino_fwrite_wkq(func, file, kbuf, size, pos);
			BUG_ON(err == -EINTR);
			oldfs = get_fs();
			set_fs(KERNEL_DS);
		}
	} while (i++ < prevent_endless
		 && (err == -EAGAIN || err == -EINTR));
	set_fs(oldfs);

#if 0 /* reserved for future use */
	if (err > 0)
		fsnotify_modify(file->f_path.dentry);
#endif

	return err;
}

/* arguments of do_xino_fwrite(), bundled for call_do_xino_fwrite() */
struct do_xino_fwrite_args {
	ssize_t *errp;		/* where the result of do_xino_fwrite() goes */
	vfs_writef_t func;
	struct file *file;
	void *buf;
	size_t size;
	loff_t *pos;
};

static void call_do_xino_fwrite(void *args)
{
	struct do_xino_fwrite_args *a = args;
	*a->errp = do_xino_fwrite(a->func, a->file, a->buf, a->size, a->pos);
}

/*
 * run do_xino_fwrite() through au_wkq_wait().
 * returns the error of au_wkq_wait() when it is non-zero, otherwise the
 * result stored by call_do_xino_fwrite().
 */
static ssize_t xino_fwrite_wkq(vfs_writef_t func, struct file *file, void *buf,
			       size_t size, loff_t *pos)
{
	ssize_t ret;
	int rc;
	struct do_xino_fwrite_args wargs;

	wargs.errp = &ret;
	wargs.func = func;
	wargs.file = file;
	wargs.buf = buf;
	wargs.size = size;
	wargs.pos = pos;

	/*
	 * it breaks RLIMIT_FSIZE and normal user's limit,
	 * users should care about quota and real 'filesystem full.'
	 */
	rc = au_wkq_wait(call_do_xino_fwrite, &wargs);
	if (unlikely(rc))
		return rc;

	return ret;
}

/*
 * write to a xino file.
 * when the caller's RLIMIT_FSIZE is not RLIM_INFINITY the write is delegated
 * to xino_fwrite_wkq(); otherwise do_xino_fwrite() is called directly, with
 * lockdep switched off around the call.
 */
ssize_t xino_fwrite(vfs_writef_t func, struct file *file, void *buf,
		    size_t size, loff_t *pos)
{
	ssize_t ret;

	if (rlimit(RLIMIT_FSIZE) != RLIM_INFINITY)
		return xino_fwrite_wkq(func, file, buf, size, pos);

	lockdep_off();
	ret = do_xino_fwrite(func, file, buf, size, pos);
	lockdep_on();

	return ret;
}

/* ---------------------------------------------------------------------- */

/*
 * create a new xinofile at the same place/path as @base_file.
 */
/*
 * @base_file: gives the parent dir, the name and the vfsmount of the new file.
 * @copy_src: when non-NULL, its current contents are copied to the new file.
 * the caller must hold the lock of the parent dir inode (see IMustLock()).
 * the new file is unlinked right after being opened.
 * returns the opened file, or ERR_PTR() on failure.
 */
struct file *au_xino_create2(struct file *base_file, struct file *copy_src)
{
	struct file *file;
	struct dentry *base, *parent;
	struct inode *dir, *delegated;
	struct qstr *name;
	struct path path;
	int err;

	base = base_file->f_path.dentry;
	parent = base->d_parent; /* dir inode is locked */
	dir = d_inode(parent);
	IMustLock(dir);

	/* look up the name of @base_file under its parent */
	file = ERR_PTR(-EINVAL);
	name = &base->d_name;
	path.dentry = vfsub_lookup_one_len(name->name, parent, name->len);
	if (IS_ERR(path.dentry)) {
		file = (void *)path.dentry;
		pr_err("%pd lookup err %ld\n",
		       base, PTR_ERR(path.dentry));
		goto out;
	}

	/* no need to mnt_want_write() since we call dentry_open() later */
	err = vfs_create(dir, path.dentry, S_IRUGO | S_IWUGO, NULL);
	if (unlikely(err)) {
		file = ERR_PTR(err);
		pr_err("%pd create err %d\n", base, err);
		goto out_dput;
	}

	path.mnt = base_file->f_path.mnt;
	file = vfsub_dentry_open(&path,
				 O_RDWR | O_CREAT | O_EXCL | O_LARGEFILE
				 /* | __FMODE_NONOTIFY */);
	if (IS_ERR(file)) {
		pr_err("%pd open err %ld\n", base, PTR_ERR(file));
		goto out_dput;
	}

	/* unlink the name immediately; the open file stays usable */
	delegated = NULL;
	err = vfsub_unlink(dir, &file->f_path, &delegated, /*force*/0);
	if (unlikely(err == -EWOULDBLOCK)) {
		pr_warn("cannot retry for NFSv4 delegation"
			" for an internal unlink\n");
		iput(delegated);
	}
	if (unlikely(err)) {
		pr_err("%pd unlink err %d\n", base, err);
		goto out_fput;
	}

	if (copy_src) {
		/* no one can touch copy_src xino */
		err = au_copy_file(file, copy_src, vfsub_f_size_read(copy_src));
		if (unlikely(err)) {
			pr_err("%pd copy err %d\n", base, err);
			goto out_fput;
		}
	}
	goto out_dput; /* success */

out_fput:
	/* failure after open: drop the file and report err instead */
	fput(file);
	file = ERR_PTR(err);
out_dput:
	/* the dentry from the lookup is released on success and failure */
	dput(path.dentry);
out:
	return file;
}

/*
 * what au_xino_lock_dir() locked, kept for au_xino_unlock_dir().
 * either @hdir is set, or @hdir is NULL and @parent/@mtx are used.
 */
struct au_xino_lock_dir {
	struct au_hinode *hdir;		/* locked branch root dir, or NULL */
	struct dentry *parent;		/* referenced parent of the xino file */
	struct mutex *mtx;		/* i_mutex of the inode of @parent */
};

/*
 * lock the dir where the xino file lives and record in @ldir what was locked.
 * when au_xino_brid() names a branch that is still found by au_br_index(),
 * the au_hinode of the aufs root for that branch is locked; otherwise the
 * i_mutex of the parent of @xino is taken, with a reference on the parent.
 */
static void au_xino_lock_dir(struct super_block *sb, struct file *xino,
			     struct au_xino_lock_dir *ldir)
{
	aufs_bindex_t brid, bindex;

	ldir->hdir = NULL;
	brid = au_xino_brid(sb);
	bindex = -1;
	if (brid >= 0)
		bindex = au_br_index(sb, brid);

	if (bindex < 0) {
		ldir->parent = dget_parent(xino->f_path.dentry);
		ldir->mtx = &d_inode(ldir->parent)->i_mutex;
		mutex_lock_nested(ldir->mtx, AuLsc_I_PARENT);
		return;
	}

	ldir->hdir = au_hi(d_inode(sb->s_root), bindex);
	au_hn_imtx_lock_nested(ldir->hdir, AuLsc_I_PARENT);
}

/* undo au_xino_lock_dir() according to what @ldir recorded */
static void au_xino_unlock_dir(struct au_xino_lock_dir *ldir)
{
	if (!ldir->hdir) {
		/* the parent's i_mutex was taken, with a dentry reference */
		mutex_unlock(ldir->mtx);
		dput(ldir->parent);
		return;
	}

	au_hn_imtx_unlock(ldir->hdir);
}

/* ---------------------------------------------------------------------- */

/* truncate xino files asynchronously */

/*
 * replace the xino file of branch @bindex with a freshly created copy of
 * itself (au_xino_create2() with the old file as the copy source), and make
 * every other branch on the same filesystem share the new file.
 * returns -ENOMEM, -EINVAL (bad @bindex or no xino file), the error of
 * au_xino_create2(), or the result of the final vfs_statfs().
 */
int au_xino_trunc(struct super_block *sb, aufs_bindex_t bindex)
{
	int err;
	unsigned long jiffy;
	blkcnt_t blocks;
	aufs_bindex_t bi, bend;
	struct kstatfs *st;
	struct au_branch *br;
	struct file *new_xino, *file;
	struct super_block *h_sb;
	struct au_xino_lock_dir ldir;

	err = -ENOMEM;
	st = kmalloc(sizeof(*st), GFP_NOFS);
	if (unlikely(!st))
		goto out;

	err = -EINVAL;
	bend = au_sbend(sb);
	if (unlikely(bindex < 0 || bend < bindex))
		goto out_st;
	br = au_sbr(sb, bindex);
	file = br->br_xino.xi_file;
	if (!file)
		goto out_st;

	/*
	 * NOTE(review): when this statfs fails, *st is still printed below;
	 * confirm that vfs_statfs() always fills it.
	 */
	err = vfs_statfs(&file->f_path, st);
	if (unlikely(err))
		AuErr1("statfs err %d, ignored\n", err);
	/* remember the start time and the block count before truncation */
	jiffy = jiffies;
	blocks = file_inode(file)->i_blocks;
	pr_info("begin truncating xino(b%d), ib%llu, %llu/%llu free blks\n",
		bindex, (u64)blocks, st->f_bfree, st->f_blocks);

	au_xino_lock_dir(sb, file, &ldir);
	/* mnt_want_write() is unnecessary here */
	new_xino = au_xino_create2(file, file);
	au_xino_unlock_dir(&ldir);
	err = PTR_ERR(new_xino);
	if (IS_ERR(new_xino)) {
		pr_err("err %d, ignored\n", err);
		goto out_st;
	}
	err = 0;
	/* the branch now owns the reference returned by au_xino_create2() */
	fput(file);
	br->br_xino.xi_file = new_xino;

	/* switch the other branches on the same filesystem to the new file */
	h_sb = au_br_sb(br);
	for (bi = 0; bi <= bend; bi++) {
		if (unlikely(bi == bindex))
			continue;
		br = au_sbr(sb, bi);
		if (au_br_sb(br) != h_sb)
			continue;

		fput(br->br_xino.xi_file);
		br->br_xino.xi_file = new_xino;
		get_file(new_xino);
	}

	err = vfs_statfs(&new_xino->f_path, st);
	if (!err) {
		pr_info("end truncating xino(b%d), ib%llu, %llu/%llu free blks\n",
			bindex, (u64)file_inode(new_xino)->i_blocks,
			st->f_bfree, st->f_blocks);
		/* record the time only when the file actually got smaller */
		if (file_inode(new_xino)->i_blocks < blocks)
			au_sbi(sb)->si_xino_jiffy = jiffy;
	} else
		AuErr1("statfs err %d, ignored\n", err);

out_st:
	kfree(st);
out:
	return err;
}

/* arguments of xino_do_trunc(); allocated by xino_try_trunc() */
struct xino_do_trunc_args {
	struct super_block *sb;		/* aufs superblock */
	struct au_branch *br;		/* branch whose xino file is truncated */
};

/*
 * the function queued by xino_try_trunc() through au_wkq_nowait().
 * truncates the xino file of the branch in @_args under the superblock write
 * lock, then drops the br_xino_running and br_count counts raised by
 * xino_try_trunc() and frees @_args.
 */
static void xino_do_trunc(void *_args)
{
	struct xino_do_trunc_args *args = _args;
	struct super_block *sb;
	struct au_branch *br;
	struct inode *dir;
	int err;
	aufs_bindex_t bindex;

	err = 0;
	sb = args->sb;
	dir = d_inode(sb->s_root);
	br = args->br;

	si_noflush_write_lock(sb);
	ii_read_lock_parent(dir);
	/* the index is looked up again from the branch id, under the lock */
	bindex = au_br_index(sb, br->br_id);
	err = au_xino_trunc(sb, bindex);
	ii_read_unlock(dir);
	if (unlikely(err))
		pr_warn("err b%d, (%d)\n", bindex, err);
	atomic_dec(&br->br_xino_running);
	atomic_dec(&br->br_count);
	si_write_unlock(sb);
	au_nwt_done(&au_sbi(sb)->si_nowait);
	kfree(args);
}

/*
 * test whether the xino file of @br should be truncated now.
 * returns 1 when si_xino_expire has passed since si_xino_jiffy and the
 * percentage of free blocks on the filesystem holding the xino file is below
 * AUFS_XINO_DEF_TRUNC; returns 0 otherwise, including when statfs fails or
 * reports no blocks at all.
 */
static int xino_trunc_test(struct super_block *sb, struct au_branch *br)
{
	int err;
	struct kstatfs st;
	struct au_sbinfo *sbinfo;

	/* todo: si_xino_expire and the ratio should be customizable */
	sbinfo = au_sbi(sb);
	if (time_before(jiffies,
			sbinfo->si_xino_jiffy + sbinfo->si_xino_expire))
		return 0;

	/* truncation border */
	err = vfs_statfs(&br->br_xino.xi_file->f_path, &st);
	if (unlikely(err)) {
		AuErr1("statfs err %d, ignored\n", err);
		return 0;
	}

	/*
	 * a filesystem may report zero total blocks; there is no ratio to
	 * compute then, and div64_u64() must never be given a zero divisor.
	 */
	if (unlikely(!st.f_blocks))
		return 0;

	if (div64_u64(st.f_bfree * 100, st.f_blocks) >= AUFS_XINO_DEF_TRUNC)
		return 0;

	return 1;
}

/*
 * queue an asynchronous truncation of the xino file of @br when
 * xino_trunc_test() says so and none is already running for this branch.
 * failures are only logged.
 */
static void xino_try_trunc(struct super_block *sb, struct au_branch *br)
{
	struct xino_do_trunc_args *args;
	int wkq_err;

	if (!xino_trunc_test(sb, br))
		return;

	/* br_xino_running allows a single truncation per branch at a time */
	if (atomic_inc_return(&br->br_xino_running) > 1)
		goto out;

	/* locking and kfree() of args are done in xino_do_trunc() */
	args = kmalloc(sizeof(*args), GFP_NOFS);
	if (unlikely(!args)) {
		AuErr1("no memory\n");
		goto out_args;
	}

	/* br_count is decremented again by xino_do_trunc() */
	atomic_inc(&br->br_count);
	args->sb = sb;
	args->br = br;
	wkq_err = au_wkq_nowait(xino_do_trunc, args, sb, /*flags*/0);
	if (!wkq_err)
		return; /* success */

	pr_err("wkq %d\n", wkq_err);
	atomic_dec(&br->br_count);

out_args:
	/* args is NULL when the allocation failed; kfree(NULL) is a no-op */
	kfree(args);
out:
	atomic_dec(&br->br_xino_running);
}

/* ---------------------------------------------------------------------- */

/*
 * store @ino in the xino @file at the offset @h_ino * sizeof(ino).
 * returns zero on success, -EFBIG when the offset would not fit, and -EIO
 * when the write did not transfer exactly sizeof(ino) bytes.
 */
static int au_xino_do_write(vfs_writef_t write, struct file *file,
			    ino_t h_ino, ino_t ino)
{
	loff_t offset;
	ssize_t written;

	offset = h_ino;
	if (unlikely(au_loff_max / sizeof(ino) - 1 < offset)) {
		AuIOErr1("too large hi%lu\n", (unsigned long)h_ino);
		return -EFBIG;
	}

	offset *= sizeof(ino);
	written = xino_fwrite(write, file, &ino, sizeof(ino), &offset);
	if (written != sizeof(ino)) {
		AuIOErr("write failed (%zd)\n", written);
		return -EIO;
	}

	return 0; /* success */
}

/*
 * write @ino to the xinofile for the specified branch{@sb, @bindex}
 * at the position of @h_ino.
 * even if @ino is zero, it is written to the xinofile and means no entry.
 * if the size of the xino file on a specific filesystem exceeds the watermark,
 * try truncating it.
 */
int au_xino_write(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino,
		  ino_t ino)
{
	int err;
	unsigned int mnt_flags;
	struct au_branch *br;

	BUILD_BUG_ON(sizeof(long long) != sizeof(au_loff_max)
		     || ((loff_t)-1) > 0);
	SiMustAnyLock(sb);

	/* nothing to record when xino is disabled */
	mnt_flags = au_mntflags(sb);
	if (!au_opt_test(mnt_flags, XINO))
		return 0;

	br = au_sbr(sb, bindex);
	err = au_xino_do_write(au_sbi(sb)->si_xwrite, br->br_xino.xi_file,
			       h_ino, ino);
	if (err) {
		AuIOErr("write failed (%d)\n", err);
		return -EIO;
	}

	/* the write succeeded, see whether the file should be truncated */
	if (au_opt_test(mnt_flags, TRUNC_XINO)
	    && au_test_fs_trunc_xino(au_br_sb(br)))
		xino_try_trunc(sb, br);

	return 0; /* success */
}

/* ---------------------------------------------------------------------- */

/* aufs inode number bitmap */

/* the number of bits in one page (PAGE_SIZE bytes) of the bitmap */
static const int page_bits = (int)PAGE_SIZE * BITS_PER_BYTE;
/* the inode number represented by @bit of the bitmap page @pindex */
static ino_t xib_calc_ino(unsigned long pindex, int bit)
{
	AuDebugOn(bit < 0 || page_bits <= bit);

	return AUFS_FIRST_INO + pindex * page_bits + bit;
}

/*
 * the reverse of xib_calc_ino(): split @ino into the bitmap page index
 * (*@pindex) and the bit within that page (*@bit).
 */
static void xib_calc_bit(ino_t ino, unsigned long *pindex, int *bit)
{
	ino_t offset;

	AuDebugOn(ino < AUFS_FIRST_INO);

	offset = ino - AUFS_FIRST_INO;
	*pindex = offset / page_bits;
	*bit = offset % page_bits;
}

/*
 * make @pindex the bitmap page held in si_xib_buf.
 * the page currently in the buffer (si_xib_last_pindex) is written back to
 * the xib file first; then the requested page is read, or, when the file does
 * not reach that far, the buffer is zeroed and written out as a new page.
 * si_xib_mtx must be held.
 * returns zero on success, a negative error from the I/O, or -EIO for a
 * short transfer.
 */
static int xib_pindex(struct super_block *sb, unsigned long pindex)
{
	int err;
	loff_t pos;
	ssize_t sz;
	struct au_sbinfo *sbinfo;
	struct file *xib;
	unsigned long *p;

	sbinfo = au_sbi(sb);
	MtxMustLock(&sbinfo->si_xib_mtx);
	AuDebugOn(pindex > ULONG_MAX / PAGE_SIZE
		  || !au_opt_test(sbinfo->si_mntflags, XINO));

	/* the requested page is already in the buffer */
	if (pindex == sbinfo->si_xib_last_pindex)
		return 0;

	/* write back the page which is in the buffer now */
	xib = sbinfo->si_xib;
	p = sbinfo->si_xib_buf;
	pos = sbinfo->si_xib_last_pindex;
	pos *= PAGE_SIZE;
	sz = xino_fwrite(sbinfo->si_xwrite, xib, p, PAGE_SIZE, &pos);
	if (unlikely(sz != PAGE_SIZE))
		goto out;

	/* load the requested page, or create it as an all-zero page */
	pos = pindex;
	pos *= PAGE_SIZE;
	if (vfsub_f_size_read(xib) >= pos + PAGE_SIZE)
		sz = xino_fread(sbinfo->si_xread, xib, p, PAGE_SIZE, &pos);
	else {
		memset(p, 0, PAGE_SIZE);
		sz = xino_fwrite(sbinfo->si_xwrite, xib, p, PAGE_SIZE, &pos);
	}
	if (sz == PAGE_SIZE) {
		sbinfo->si_xib_last_pindex = pindex;
		return 0; /* success */
	}

out:
	/* reached for failed reads as well as failed writes */
	AuIOErr1("write failed (%zd)\n", sz);
	err = sz;
	if (sz >= 0)
		err = -EIO;
	return err;
}

/* ---------------------------------------------------------------------- */

/*
 * clear the bit of @inode's number in the inode number bitmap and remember
 * it in si_xib_next_bit. a failure to load the bitmap page is silently
 * ignored.
 */
static void au_xib_clear_bit(struct inode *inode)
{
	int bit;
	unsigned long pindex;
	struct super_block *sb;
	struct au_sbinfo *sbinfo;

	AuDebugOn(inode->i_nlink);

	sb = inode->i_sb;
	xib_calc_bit(inode->i_ino, &pindex, &bit);
	AuDebugOn(page_bits <= bit);

	sbinfo = au_sbi(sb);
	mutex_lock(&sbinfo->si_xib_mtx);
	if (!xib_pindex(sb, pindex)) {
		clear_bit(bit, sbinfo->si_xib_buf);
		sbinfo->si_xib_next_bit = bit;
	}
	mutex_unlock(&sbinfo->si_xib_mtx);
}

/* for s_op->delete_inode() */
/*
 * forget the xino entries of @inode.
 * when @unlinked is set, the inode number is also released in the bitmap.
 * then zero is written to the xino file of each branch where @inode has a
 * lower inode; when @unlinked is not set, only lower inodes whose i_nlink is
 * zero are handled.
 * does nothing when xino is disabled or for the aufs root inode.
 */
void au_xino_delete_inode(struct inode *inode, const int unlinked)
{
	int err;
	unsigned int mnt_flags;
	aufs_bindex_t bindex, bend, bi;
	unsigned char try_trunc;
	struct au_iinfo *iinfo;
	struct super_block *sb;
	struct au_hinode *hi;
	struct inode *h_inode;
	struct au_branch *br;
	vfs_writef_t xwrite;

	sb = inode->i_sb;
	mnt_flags = au_mntflags(sb);
	if (!au_opt_test(mnt_flags, XINO)
	    || inode->i_ino == AUFS_ROOT_INO)
		return;

	if (unlinked) {
		au_xigen_inc(inode);
		au_xib_clear_bit(inode);
	}

	iinfo = au_ii(inode);
	if (!iinfo)
		return;

	bindex = iinfo->ii_bstart;
	if (bindex < 0)
		return;

	xwrite = au_sbi(sb)->si_xwrite;
	try_trunc = !!au_opt_test(mnt_flags, TRUNC_XINO);
	hi = iinfo->ii_hinode + bindex;
	bend = iinfo->ii_bend;
	for (; bindex <= bend; bindex++, hi++) {
		h_inode = hi->hi_inode;
		if (!h_inode
		    || (!unlinked && h_inode->i_nlink))
			continue;

		/* inode may not be revalidated */
		/* so the branch is found by its id instead of by bindex */
		bi = au_br_index(sb, hi->hi_id);
		if (bi < 0)
			continue;

		br = au_sbr(sb, bi);
		/* a write error only suppresses the truncation attempt */
		err = au_xino_do_write(xwrite, br->br_xino.xi_file,
				       h_inode->i_ino, /*ino*/0);
		if (!err && try_trunc
		    && au_test_fs_trunc_xino(au_br_sb(br)))
			xino_try_trunc(sb, br);
	}
}

/* get an unused inode number from bitmap */
/*
 * without the xino option, iunique() supplies the number.
 * otherwise a zero bit is searched under si_xib_mtx: first si_xib_next_bit
 * and the rest of the page in the buffer, then the pages below the current
 * one, then the pages above it up to the size of the xib file.
 * returns the new inode number, or zero when loading a bitmap page failed.
 */
ino_t au_xino_new_ino(struct super_block *sb)
{
	ino_t ino;
	unsigned long *p, pindex, ul, pend;
	struct au_sbinfo *sbinfo;
	struct file *file;
	int free_bit, err;

	if (!au_opt_test(au_mntflags(sb), XINO))
		return iunique(sb, AUFS_FIRST_INO);

	sbinfo = au_sbi(sb);
	mutex_lock(&sbinfo->si_xib_mtx);
	p = sbinfo->si_xib_buf;
	/* fast path: the hinted bit, then any zero bit in this page */
	free_bit = sbinfo->si_xib_next_bit;
	if (free_bit < page_bits && !test_bit(free_bit, p))
		goto out; /* success */
	free_bit = find_first_zero_bit(p, page_bits);
	if (free_bit < page_bits)
		goto out; /* success */

	pindex = sbinfo->si_xib_last_pindex;
	/*
	 * walk down to page 0; ul is unsigned, so the loop ends when it
	 * wraps around to ULONG_MAX (immediately if pindex is zero).
	 */
	for (ul = pindex - 1; ul < ULONG_MAX; ul--) {
		err = xib_pindex(sb, ul);
		if (unlikely(err))
			goto out_err;
		free_bit = find_first_zero_bit(p, page_bits);
		if (free_bit < page_bits)
			goto out; /* success */
	}

	/* walk up; the page at pend is the one just past the file's end */
	file = sbinfo->si_xib;
	pend = vfsub_f_size_read(file) / PAGE_SIZE;
	for (ul = pindex + 1; ul <= pend; ul++) {
		err = xib_pindex(sb, ul);
		if (unlikely(err))
			goto out_err;
		free_bit = find_first_zero_bit(p, page_bits);
		if (free_bit < page_bits)
			goto out; /* success */
	}
	BUG();

out:
	set_bit(free_bit, p);
	sbinfo->si_xib_next_bit = free_bit + 1;
	/* xib_pindex() may have changed the page in the buffer */
	pindex = sbinfo->si_xib_last_pindex;
	mutex_unlock(&sbinfo->si_xib_mtx);
	ino = xib_calc_ino(pindex, free_bit);
	AuDbg("i%lu\n", (unsigned long)ino);
	return ino;
out_err:
	mutex_unlock(&sbinfo->si_xib_mtx);
	AuDbg("i0\n");
	return 0;
}

/*
 * read @ino from xinofile for the specified branch{@sb, @bindex}
 * at the position of @h_ino.
 * if xino is disabled or the xinofile has no entry at that position,
 * *@ino is set to zero and zero is returned.
 */
int au_xino_read(struct super_block *sb, aufs_bindex_t bindex, ino_t h_ino,
		 ino_t *ino)
{
	ssize_t nread;
	loff_t offset;
	struct file *xi_file;
	struct au_sbinfo *sbinfo;

	*ino = 0;
	if (!au_opt_test(au_mntflags(sb), XINO))
		return 0; /* no xino */

	sbinfo = au_sbi(sb);
	offset = h_ino;
	if (unlikely(au_loff_max / sizeof(*ino) - 1 < offset)) {
		AuIOErr1("too large hi%lu\n", (unsigned long)h_ino);
		return -EFBIG;
	}
	offset *= sizeof(*ino);

	/* a file which ends before this entry has no number for @h_ino */
	xi_file = au_sbr(sb, bindex)->br_xino.xi_file;
	if (vfsub_f_size_read(xi_file) < offset + sizeof(*ino))
		return 0; /* no ino */

	nread = xino_fread(sbinfo->si_xread, xi_file, ino, sizeof(*ino),
			   &offset);
	if (nread == sizeof(*ino))
		return 0; /* success */

	/* a negative value is passed through, a short read becomes -EIO */
	if (nread < 0)
		return nread;

	AuIOErr("xino read error (%zd)\n", nread);
	return -EIO;
}

/* ---------------------------------------------------------------------- */

/* create and set a new xino file */

/*
 * create and open the xino file at the path @fname, unlink it right away,
 * and verify that it is outside of this aufs mount (@sb) and on a filesystem
 * accepted by au_test_fs_bad_xino().
 * error messages are suppressed when @silent is set.
 * returns the opened file, or ERR_PTR() on failure.
 */
struct file *au_xino_create(struct super_block *sb, char *fname, int silent)
{
	struct file *file;
	struct dentry *h_parent, *d;
	struct inode *h_dir, *inode;
	int err;

	/*
	 * at mount-time, and the xino file is the default path,
	 * hnotify is disabled so we have no notify events to ignore.
	 * when a user specified the xino, we cannot get au_hdir to be ignored.
	 */
	file = vfsub_filp_open(fname, O_RDWR | O_CREAT | O_EXCL | O_LARGEFILE
			       /* | __FMODE_NONOTIFY */,
			       S_IRUGO | S_IWUGO);
	if (IS_ERR(file)) {
		if (!silent)
			pr_err("open %s(%ld)\n", fname, PTR_ERR(file));
		return file;
	}

	/* keep file count */
	err = 0;
	inode = file_inode(file);
	h_parent = dget_parent(file->f_path.dentry);
	h_dir = d_inode(h_parent);
	mutex_lock_nested(&h_dir->i_mutex, AuLsc_I_PARENT);
	/* mnt_want_write() is unnecessary here */
	/* no delegation since it is just created */
	/* unlink only while the inode still has a link */
	if (inode->i_nlink)
		err = vfsub_unlink(h_dir, &file->f_path, /*delegated*/NULL,
				   /*force*/0);
	mutex_unlock(&h_dir->i_mutex);
	dput(h_parent);
	if (unlikely(err)) {
		if (!silent)
			pr_err("unlink %s(%d)\n", fname, err);
		goto out;
	}

	err = -EINVAL;
	d = file->f_path.dentry;
	/* the xino file must not live inside this aufs mount */
	if (unlikely(sb == d->d_sb)) {
		if (!silent)
			pr_err("%s must be outside\n", fname);
		goto out;
	}
	if (unlikely(au_test_fs_bad_xino(d->d_sb))) {
		if (!silent)
			pr_err("xino doesn't support %s(%s)\n",
			       fname, au_sbtype(d->d_sb));
		goto out;
	}
	return file; /* success */

out:
	fput(file);
	file = ERR_PTR(err);
	return file;
}

/*
 * find another branch who is on the same filesystem of the specified
 * branch{@btgt}. search until @bend.
 */
/*
 * returns the index of the first branch other than @btgt, among those below
 * @btgt and then those up to @bend, whose superblock is the one of @btgt.
 * returns -1 when there is none.
 */
static int is_sb_shared(struct super_block *sb, aufs_bindex_t btgt,
			aufs_bindex_t bend)
{
	aufs_bindex_t bindex;
	struct super_block *tgt_sb = au_sbr_sb(sb, btgt);

	/* the branches before @btgt */
	for (bindex = 0; bindex < btgt; bindex++)
		if (unlikely(tgt_sb == au_sbr_sb(sb, bindex)))
			return bindex;
	/*
	 * the first loop left bindex at @btgt (for @btgt >= 0), so the
	 * increment skips @btgt itself. nothing is scanned when @bend is
	 * below @btgt.
	 */
	for (bindex++; bindex <= bend; bindex++)
		if (unlikely(tgt_sb == au_sbr_sb(sb, bindex)))
			return bindex;
	return -1;
}

/* ---------------------------------------------------------------------- */

/*
 * initialize the xinofile for the specified branch @br
 * at the place/path where @base_file indicates.
 * test whether another branch is on the same filesystem or not,
 * if @do_test is true.
 */
int au_xino_br(struct super_block *sb, struct au_branch *br, ino_t h_ino,
	       struct file *base_file, int do_test)
{
	int err;
	aufs_bindex_t bend, bindex;
	struct au_branch *shared_br, *b;
	struct file *file;
	struct super_block *tgt_sb;
	struct au_xino_lock_dir ldir;

	/* find the first branch on the same filesystem, when asked to */
	shared_br = NULL;
	bend = au_sbend(sb);
	if (do_test) {
		tgt_sb = au_br_sb(br);
		for (bindex = 0; !shared_br && bindex <= bend; bindex++) {
			b = au_sbr(sb, bindex);
			if (au_br_sb(b) == tgt_sb)
				shared_br = b;
		}
	}

	if (shared_br && shared_br->br_xino.xi_file) {
		/* share the xino file of the found branch */
		br->br_xino.xi_file = shared_br->br_xino.xi_file;
		get_file(br->br_xino.xi_file);
	} else {
		/* nothing to share, create a new file next to @base_file */
		au_xino_lock_dir(sb, base_file, &ldir);
		/* mnt_want_write() is unnecessary here */
		file = au_xino_create2(base_file, NULL);
		au_xino_unlock_dir(&ldir);
		if (IS_ERR(file))
			return PTR_ERR(file);
		br->br_xino.xi_file = file;
	}

	/* map @h_ino, the branch root, to the aufs root inode number */
	err = au_xino_do_write(au_sbi(sb)->si_xwrite, br->br_xino.xi_file,
			       h_ino, AUFS_ROOT_INO);
	if (unlikely(err)) {
		fput(br->br_xino.xi_file);
		br->br_xino.xi_file = NULL;
	}

	return err;
}

/* ---------------------------------------------------------------------- */

/* truncate a xino bitmap file */

/* todo: slow */
/*
 * scan the whole xino @file and set, in the inode number bitmap, the bit of
 * every inode number found in it which is AUFS_FIRST_INO or larger.
 * @page is a caller-supplied PAGE_SIZE buffer used to read the file.
 * si_xib_mtx must be held.
 * returns zero on success, or the error of the read or of xib_pindex().
 */
static int do_xib_restore(struct super_block *sb, struct file *file, void *page)
{
	int err, bit;
	ssize_t sz;
	unsigned long pindex;
	loff_t pos, pend;
	struct au_sbinfo *sbinfo;
	vfs_readf_t func;
	ino_t *ino;
	unsigned long *p;

	err = 0;
	sbinfo = au_sbi(sb);
	MtxMustLock(&sbinfo->si_xib_mtx);
	p = sbinfo->si_xib_buf;
	func = sbinfo->si_xread;
	pend = vfsub_f_size_read(file);
	pos = 0;
	while (pos < pend) {
		sz = xino_fread(func, file, page, PAGE_SIZE, &pos);
		err = sz;
		if (unlikely(sz <= 0))
			goto out;

		err = 0;
		/*
		 * consume one table entry per step: the stride is the size of
		 * an entry (*ino), not the size of the pointer walking them.
		 */
		for (ino = page; sz > 0; ino++, sz -= sizeof(*ino)) {
			/* zero and other small values are not aufs numbers */
			if (unlikely(*ino < AUFS_FIRST_INO))
				continue;

			xib_calc_bit(*ino, &pindex, &bit);
			AuDebugOn(page_bits <= bit);
			err = xib_pindex(sb, pindex);
			if (!err)
				set_bit(bit, p);
			else
				goto out;
		}
	}

out:
	return err;
}

/*
 * rebuild the inode number bitmap from the xino files of all branches.
 * a xino file shared with a lower-indexed branch is scanned only once.
 * returns zero, -ENOMEM, or the first error of do_xib_restore().
 */
static int xib_restore(struct super_block *sb)
{
	int err;
	aufs_bindex_t bindex, bend;
	void *page;

	page = (void *)__get_free_page(GFP_NOFS);
	if (unlikely(!page))
		return -ENOMEM;

	err = 0;
	bend = au_sbend(sb);
	for (bindex = 0; bindex <= bend; bindex++) {
		if (bindex && is_sb_shared(sb, bindex, bindex - 1) >= 0) {
			/* already handled through an earlier branch */
			AuDbg("b%d\n", bindex);
			continue;
		}

		err = do_xib_restore(sb, au_sbr(sb, bindex)->br_xino.xi_file,
				     page);
		if (err)
			break;
	}
	free_page((unsigned long)page);

	return err;
}

/*
 * replace the xino bitmap file with a new empty one and rebuild the bitmap
 * from the xino files of the branches.
 * nothing is done when xino is disabled or the bitmap file is not larger than
 * one page.
 * the superblock must be write-locked.
 */
int au_xib_trunc(struct super_block *sb)
{
	int err;
	ssize_t sz;
	loff_t pos;
	struct au_xino_lock_dir ldir;
	struct au_sbinfo *sbinfo;
	unsigned long *p;
	struct file *file;

	SiMustWriteLock(sb);

	err = 0;
	sbinfo = au_sbi(sb);
	if (!au_opt_test(sbinfo->si_mntflags, XINO))
		goto out;

	file = sbinfo->si_xib;
	if (vfsub_f_size_read(file) <= PAGE_SIZE)
		goto out;

	au_xino_lock_dir(sb, file, &ldir);
	/* mnt_want_write() is unnecessary here */
	file = au_xino_create2(sbinfo->si_xib, NULL);
	au_xino_unlock_dir(&ldir);
	err = PTR_ERR(file);
	if (IS_ERR(file))
		goto out;
	/* drop the old bitmap file and switch to the new, empty one */
	fput(sbinfo->si_xib);
	sbinfo->si_xib = file;

	/* start from an all-zero first page */
	p = sbinfo->si_xib_buf;
	memset(p, 0, PAGE_SIZE);
	pos = 0;
	sz = xino_fwrite(sbinfo->si_xwrite, sbinfo->si_xib, p, PAGE_SIZE, &pos);
	if (unlikely(sz != PAGE_SIZE)) {
		err = sz;
		AuIOErr("err %d\n", err);
		if (sz >= 0)
			err = -EIO;
		goto out;
	}

	mutex_lock(&sbinfo->si_xib_mtx);
	/* mnt_want_write() is unnecessary here */
	err = xib_restore(sb);
	mutex_unlock(&sbinfo->si_xib_mtx);

out:
	return err;
}

/* ---------------------------------------------------------------------- */

/*
 * xino mount option handlers
 */

/* xino bitmap */
/*
 * forget the xino read/write functions, and release the bitmap file and the
 * bitmap page buffer. the superblock must be write-locked.
 */
static void xino_clear_xib(struct super_block *sb)
{
	struct au_sbinfo *sbinfo;
	struct file *xib;

	SiMustWriteLock(sb);

	sbinfo = au_sbi(sb);
	sbinfo->si_xread = NULL;
	sbinfo->si_xwrite = NULL;

	xib = sbinfo->si_xib;
	if (xib)
		fput(xib);
	sbinfo->si_xib = NULL;

	free_page((unsigned long)sbinfo->si_xib_buf);
	sbinfo->si_xib_buf = NULL;
}

/*
 * set up the inode number bitmap file next to @base.
 * the contents of the current bitmap file, if there is one, are copied into
 * the new file. the xino read/write functions are taken from the new file,
 * the page buffer is allocated when missing, and the first page is written
 * when the file is shorter than one page.
 * the superblock must be write-locked.
 * returns zero on success or a negative error.
 */
static int au_xino_set_xib(struct super_block *sb, struct file *base)
{
	int err;
	loff_t pos;
	struct au_sbinfo *sbinfo;
	struct file *file;

	SiMustWriteLock(sb);

	sbinfo = au_sbi(sb);
	file = au_xino_create2(base, sbinfo->si_xib);
	err = PTR_ERR(file);
	if (IS_ERR(file))
		goto out;
	if (sbinfo->si_xib)
		fput(sbinfo->si_xib);
	sbinfo->si_xib = file;
	sbinfo->si_xread = vfs_readf(file);
	sbinfo->si_xwrite = vfs_writef(file);

	err = -ENOMEM;
	if (!sbinfo->si_xib_buf)
		sbinfo->si_xib_buf = (void *)get_zeroed_page(GFP_NOFS);
	if (unlikely(!sbinfo->si_xib_buf))
		goto out_unset;

	sbinfo->si_xib_last_pindex = 0;
	sbinfo->si_xib_next_bit = 0;
	if (vfsub_f_size_read(file) < PAGE_SIZE) {
		pos = 0;
		/* err temporarily holds the number of bytes written */
		err = xino_fwrite(sbinfo->si_xwrite, file, sbinfo->si_xib_buf,
				  PAGE_SIZE, &pos);
		if (unlikely(err != PAGE_SIZE))
			goto out_free;
	}
	err = 0;
	goto out; /* success */

out_free:
	free_page((unsigned long)sbinfo->si_xib_buf);
	sbinfo->si_xib_buf = NULL;
	/* a short write is not an errno yet */
	if (err >= 0)
		err = -EIO;
out_unset:
	fput(sbinfo->si_xib);
	sbinfo->si_xib = NULL;
	sbinfo->si_xread = NULL;
	sbinfo->si_xwrite = NULL;
out:
	return err;
}

/* xino for each branch */
/* release the xino file of every branch which has one */
static void xino_clear_br(struct super_block *sb)
{
	aufs_bindex_t bindex, bend;
	struct au_branch *br;

	bend = au_sbend(sb);
	for (bindex = 0; bindex <= bend; bindex++) {
		br = au_sbr(sb, bindex);
		if (br && br->br_xino.xi_file) {
			fput(br->br_xino.xi_file);
			br->br_xino.xi_file = NULL;
		}
	}
}

/*
 * give every branch a xino file placed next to @base.
 * the new files are prepared first, in a temporary array with one old/new
 * pair per branch, and installed in the branches only after all of them were
 * set up; on failure the branches keep their current files.
 * a branch on the same filesystem as a lower-indexed branch shares that
 * branch's new file.
 * the superblock must be write-locked.
 * returns zero on success or a negative error.
 */
static int au_xino_set_br(struct super_block *sb, struct file *base)
{
	int err;
	ino_t ino;
	aufs_bindex_t bindex, bend, bshared;
	struct {
		struct file *old, *new;
	} *fpair, *p;
	struct au_branch *br;
	struct inode *inode;
	vfs_writef_t writef;

	SiMustWriteLock(sb);

	err = -ENOMEM;
	bend = au_sbend(sb);
	/* zero-filled, so ->new is NULL until a file is assigned */
	fpair = kcalloc(bend + 1, sizeof(*fpair), GFP_NOFS);
	if (unlikely(!fpair))
		goto out;

	inode = d_inode(sb->s_root);
	ino = AUFS_ROOT_INO;
	writef = au_sbi(sb)->si_xwrite;
	for (bindex = 0, p = fpair; bindex <= bend; bindex++, p++) {
		br = au_sbr(sb, bindex);
		bshared = is_sb_shared(sb, bindex, bindex - 1);
		if (bshared >= 0) {
			/* shared xino */
			*p = fpair[bshared];
			get_file(p->new);
		}

		if (!p->new) {
			/* new xino */
			p->old = br->br_xino.xi_file;
			p->new = au_xino_create2(base, br->br_xino.xi_file);
			err = PTR_ERR(p->new);
			if (IS_ERR(p->new)) {
				p->new = NULL;
				goto out_pair;
			}
		}

		/* map the root of this branch to the aufs root inode number */
		err = au_xino_do_write(writef, p->new,
				       au_h_iptr(inode, bindex)->i_ino, ino);
		if (unlikely(err))
			goto out_pair;
	}

	/* everything is ready: install the new files, each with its own ref */
	for (bindex = 0, p = fpair; bindex <= bend; bindex++, p++) {
		br = au_sbr(sb, bindex);
		if (br->br_xino.xi_file)
			fput(br->br_xino.xi_file);
		get_file(p->new);
		br->br_xino.xi_file = p->new;
	}

out_pair:
	/*
	 * drop the references held by the array, on success and on failure;
	 * the entries after the first NULL one were never filled.
	 */
	for (bindex = 0, p = fpair; bindex <= bend; bindex++, p++)
		if (p->new)
			fput(p->new);
		else
			break;
	kfree(fpair);
out:
	return err;
}

/*
 * disable xino entirely: clear xigen, the bitmap and the xino file of every
 * branch, then clear the XINO mount flag.
 */
void au_xino_clr(struct super_block *sb)
{
	struct au_sbinfo *sbinfo = au_sbi(sb);

	au_xigen_clr(sb);
	xino_clear_xib(sb);
	xino_clear_br(sb);

	/* lvalue, do not call au_mntflags() */
	au_opt_clr(sbinfo->si_mntflags, XINO);
}

/*
 * handle the xino mount option: set the XINO flag and create the bitmap, the
 * xigen file and the xino file of every branch next to @xino->file.
 * on @remount nothing is done when @xino->file has the same parent and the
 * same name as the current bitmap file.
 * the superblock must be write-locked.
 * returns zero on success or a negative error; on failure xigen and the
 * bitmap are cleared again.
 */
int au_xino_set(struct super_block *sb, struct au_opt_xino *xino, int remount)
{
	int err, skip;
	struct dentry *parent, *cur_parent;
	struct qstr *dname, *cur_name;
	struct file *cur_xino;
	struct inode *dir;
	struct au_sbinfo *sbinfo;

	SiMustWriteLock(sb);

	err = 0;
	sbinfo = au_sbi(sb);
	parent = dget_parent(xino->file->f_path.dentry);
	if (remount) {
		/* skip when the requested path is the one already in use */
		skip = 0;
		dname = &xino->file->f_path.dentry->d_name;
		cur_xino = sbinfo->si_xib;
		if (cur_xino) {
			cur_parent = dget_parent(cur_xino->f_path.dentry);
			cur_name = &cur_xino->f_path.dentry->d_name;
			skip = (cur_parent == parent
				&& au_qstreq(dname, cur_name));
			dput(cur_parent);
		}
		if (skip)
			goto out;
	}

	au_opt_set(sbinfo->si_mntflags, XINO);
	dir = d_inode(parent);
	/* au_xino_create2() requires the parent dir inode to be locked */
	mutex_lock_nested(&dir->i_mutex, AuLsc_I_PARENT);
	/* mnt_want_write() is unnecessary here */
	err = au_xino_set_xib(sb, xino->file);
	if (!err)
		err = au_xigen_set(sb, xino->file);
	if (!err)
		err = au_xino_set_br(sb, xino->file);
	mutex_unlock(&dir->i_mutex);
	if (!err)
		goto out; /* success */

	/* reset all */
	AuIOErr("failed creating xino(%d).\n", err);
	au_xigen_clr(sb);
	xino_clear_xib(sb);

out:
	dput(parent);
	return err;
}

/* ---------------------------------------------------------------------- */

/*
 * create a xinofile at the default place/path.
 */
/*
 * the file is placed at AUFS_XINO_FNAME under the root of the first writable
 * branch whose filesystem passes au_test_fs_bad_xino(); when there is no such
 * branch, AUFS_XINO_DEFPATH is used.
 * on success the id of the chosen branch (or -1 for AUFS_XINO_DEFPATH) is
 * recorded by au_xino_brid_set().
 * returns the opened file, or ERR_PTR() on failure.
 */
struct file *au_xino_def(struct super_block *sb)
{
	struct file *file;
	char *page, *p;
	struct au_branch *br;
	struct super_block *h_sb;
	struct path path;
	aufs_bindex_t bend, bindex, bwr;

	br = NULL;
	bend = au_sbend(sb);
	bwr = -1;
	/* find the first branch which can hold the xino file */
	for (bindex = 0; bindex <= bend; bindex++) {
		br = au_sbr(sb, bindex);
		if (au_br_writable(br->br_perm)
		    && !au_test_fs_bad_xino(au_br_sb(br))) {
			bwr = bindex;
			break;
		}
	}

	if (bwr >= 0) {
		file = ERR_PTR(-ENOMEM);
		page = (void *)__get_free_page(GFP_NOFS);
		if (unlikely(!page))
			goto out;
		path.mnt = au_br_mnt(br);
		path.dentry = au_h_dptr(sb->s_root, bwr);
		/* the reduced length leaves room for the name appended below */
		p = d_path(&path, page, PATH_MAX - sizeof(AUFS_XINO_FNAME));
		file = (void *)p;
		if (!IS_ERR(p)) {
			strcat(p, "/" AUFS_XINO_FNAME);
			AuDbg("%s\n", p);
			file = au_xino_create(sb, p, /*silent*/0);
			if (!IS_ERR(file))
				au_xino_brid_set(sb, br->br_id);
		}
		free_page((unsigned long)page);
	} else {
		file = au_xino_create(sb, AUFS_XINO_DEFPATH, /*silent*/0);
		if (IS_ERR(file))
			goto out;
		h_sb = file->f_path.dentry->d_sb;
		if (unlikely(au_test_fs_bad_xino(h_sb))) {
			pr_err("xino doesn't support %s(%s)\n",
			       AUFS_XINO_DEFPATH, au_sbtype(h_sb));
			fput(file);
			file = ERR_PTR(-EINVAL);
		}
		if (!IS_ERR(file))
			au_xino_brid_set(sb, -1);
	}

out:
	return file;
}

/* ---------------------------------------------------------------------- */

/*
 * print the path of the xino @file to @seq via au_seq_path(), then cut the
 * escaped " (deleted)" suffix off the end of the seq buffer.
 * returns the result of au_seq_path().
 */
int au_xino_path(struct seq_file *seq, struct file *file)
{
	int err;

	err = au_seq_path(seq, &file->f_path);
	if (unlikely(err))
		return err;

#define Deleted "\\040(deleted)"
	seq->count -= sizeof(Deleted) - 1;
	AuDebugOn(memcmp(seq->buf + seq->count, Deleted,
			 sizeof(Deleted) - 1));
#undef Deleted

	return err;
}